diff --git a/setup.py b/setup.py
index 54103cd..f872be5 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import find_packages, setup
 
 install_requires = [
     "tqdm",
-    "ujson",
+    "orjson",
     "nest_asyncio",
     "aiohttp",
     "requests",
@@ -13,7 +13,7 @@ install_requires = [
 
 setup(
     name="twitter-api-client",
-    version="0.5.4",
+    version="0.5.5",
     python_requires=">=3.9.7",
     description="Twitter API",
     long_description=dedent('''
diff --git a/twitter/account.py b/twitter/account.py
index cbeb195..e97a3c1 100644
--- a/twitter/account.py
+++ b/twitter/account.py
@@ -13,7 +13,7 @@ from pathlib import Path
 from urllib.parse import urlencode
 from uuid import uuid1, getnode
 
-import ujson
+import orjson
 from requests import Response
 from tqdm import tqdm
 
@@ -22,7 +22,7 @@ from .config.operations import operations
 from .config.settings import *
 from .constants import *
 from .login import login
-from .utils import get_headers, build_query
+from .utils import get_headers, build_query, find_key
 
 try:
     if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
@@ -98,6 +98,7 @@ class Account:
         payload['variables'] |= variables
         url = f"{self.GRAPHQL_URL}/{qid}/{name}"
         r = self.session.post(url, headers=get_headers(self.session), json=payload)
+        self.check_response(r)
         return r
 
     def api(self, path: str, settings: dict) -> Response:
@@ -105,6 +106,7 @@ class Account:
         headers['content-type'] = 'application/x-www-form-urlencoded'
         url = f'{self.V1_URL}/{path}'
         r = self.session.post(url, headers=headers, data=urlencode(settings))
+        self.check_response(r)
         return r
 
     @log(info=['json'])
@@ -200,7 +202,7 @@ class Account:
         headers = get_headers(self.session)
         headers['content-type'] = 'application/x-www-form-urlencoded'
         url = 'https://caps.twitter.com/v2/cards/create.json'
-        r = self.session.post(url, headers=headers, params={'card_data': ujson.dumps(options)})
+        r = self.session.post(url, headers=headers, params={'card_data': orjson.dumps(options).decode()})
         card_uri = r.json()['card_uri']
         r = self.tweet(text, poll_params={'card_uri': card_uri})
         return r
@@ -547,3 +549,10 @@ class Account:
         url = 'https://twitter.com/i/api/account/self.sessions/revoke_all'
         r = self.session.post(url, headers=headers)
         return r
+
+    @staticmethod
+    def check_response(r):
+        if r.status_code == 429:
+            raise Exception(f'rate limit exceeded: {r.url}')
+        if find_key(data := r.json(), 'errors'):
+            logger.debug(f'[{WARN}ERROR{RESET}]: {data}')
diff --git a/twitter/scraper.py b/twitter/scraper.py
index 2ec6600..24e5dac 100644
--- a/twitter/scraper.py
+++ b/twitter/scraper.py
@@ -8,15 +8,16 @@ from copy import deepcopy
 from pathlib import Path
 from urllib.parse import urlsplit
 
-import ujson
+import orjson
 from aiohttp import ClientSession, TCPConnector
 from tqdm import tqdm
 
 from .config.log import log_config
 from .config.operations import operations
+from .config.settings import trending_params
 from .constants import *
 from .login import login
-from .utils import find_key, build_query, get_headers
+from .utils import find_key, build_query, get_headers, set_qs
 
 try:
     if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
@@ -215,10 +216,15 @@ class Scraper:
             try:
                 r = await fn()
                 data = await r.json()
+                if r.status == 429:
+                    logger.debug(f'rate limit exceeded: {r.url}')
+                    return r, {}
+                if find_key(data, 'errors'):
+                    logger.debug(f'[{WARN}ERROR{RESET}]: {data}')
                 return r, data
             except Exception as e:
                 if i == retries:
-                    logger.debug(f'{WARN}Max retries exceeded{RESET}\n{e}')
+                    logger.debug(f'{WARN} Max retries exceeded{RESET}\n{e}')
                     return
                 t = 2 ** i + random.random()
                 logger.debug(f'retrying in {f"{t:.2f}"} seconds\t\t{e}')
@@ -229,8 +235,9 @@
             for d in data:
                 path = Path(f'data/raw/{d[ID]}')
                 path.mkdir(parents=True, exist_ok=True)
-                with open(path / f'{time.time_ns()}_{name}.json', 'w') as fp:
-                    ujson.dump(d, fp, indent=4)
+                (path / f'{time.time_ns()}_{name}.json').write_text(
+                    orjson.dumps(d, option=orjson.OPT_INDENT_2).decode())
+
         except KeyError as e:
             logger.debug(f'failed to save data: {e}')
 
@@ -248,11 +255,6 @@
         name = urlsplit(post_url).path.replace('/', '_')[1:]
         ext = urlsplit(cdn_url).path.split('/')[-1]
         try:
-            # with open(f'{path}/{name}_{ext}', 'wb') as fp:
-            #     r = self.session.get(cdn_url, stream=True)
-            #     for chunk in r.iter_content(chunk_size=chunk_size):
-            #         fp.write(chunk)
-
             r = self.session.get(cdn_url, stream=True)
             total_bytes = int(r.headers.get('Content-Length', 0))
             desc = f'downloading: {name}'
@@ -282,3 +284,25 @@
         # logger.debug(f'{hq_videos = }')
         if hq_videos:
             [self.download(url, video) for video in hq_videos]
+
+    def trends(self) -> dict:
+        """Get trends for all UTC offsets"""
+        url = set_qs('https://twitter.com/i/api/2/guide.json', trending_params)
+        headers = get_headers(self.session)
+        offsets = [f"{str(i).zfill(3)}00" if i < 0 else f"+{str(i).zfill(2)}00" for i in range(-12, 15)]
+        res = []
+        for offset in offsets:
+            headers['x-twitter-utcoffset'] = offset
+            r = self.session.get(url, headers=headers)
+            res.append(r.json())
+            logger.debug(f'getting trends for: {offset = }')
+        all_trends = {}
+        for data in res:
+            trends = find_key(data, 'item')
+            for t in trends:
+                all_trends |= {t['content']['trend']['name']: t}
+        path = Path(f'data/raw/trends')
+        path.mkdir(parents=True, exist_ok=True)
+        (path / f'{time.time_ns()}.json').write_text(
+            orjson.dumps(all_trends, option=orjson.OPT_INDENT_2).decode())
+        return all_trends
diff --git a/twitter/search.py b/twitter/search.py
index 02d4e51..aa0ae0e 100644
--- a/twitter/search.py
+++ b/twitter/search.py
@@ -1,6 +1,6 @@
 import asyncio
 import atexit
-import json
+import orjson
 import logging.config
 import platform
 import random
@@ -64,7 +64,7 @@ async def paginate(query: str, session: aiohttp.ClientSession, config: dict, out
             config['cursor'] = next_cursor
         data, next_cursor = await backoff(lambda: get(session, api, config), query)
         data['query'] = query
-        (out / f'raw/{time.time_ns()}.json').write_text(json.dumps(data, indent=4))
+        (out / f'raw/{time.time_ns()}.json').write_text(orjson.dumps(data, option=orjson.OPT_INDENT_2).decode())
         all_data.append(data)
     return all_data
 
@@ -136,10 +136,10 @@ def make_output_dirs(path: str) -> Path:
 
 @atexit.register
 def combine_results(in_path: Path = IN_PATH, out_path: Path = OUT_PATH):
     try:
-        out_path.write_text(json.dumps({
+        out_path.write_text(orjson.dumps({
             k: v for p in in_path.iterdir() if p.suffix == '.json'
-            for k, v in json.loads(p.read_text())['globalObjects']['tweets'].items()
-        }, indent=2))
+            for k, v in orjson.loads(p.read_text())['globalObjects']['tweets'].items()
+        }, option=orjson.OPT_INDENT_2).decode())
     except Exception as e:
         logger.debug(f'FAILED to combine search results, {e}')
diff --git a/twitter/utils.py b/twitter/utils.py
index ca51550..2729f0f 100644
--- a/twitter/utils.py
+++ b/twitter/utils.py
@@ -1,11 +1,12 @@
 from urllib.parse import urlsplit, urlencode, urlunsplit, parse_qs, quote
 
-import ujson
+import orjson
 
 
 def set_qs(url: str, qs: dict, update=False, **kwargs) -> str:
     *_, q, f = urlsplit(url)
-    return urlunsplit((*_, urlencode(qs | parse_qs(q) if update else qs, doseq=True, quote_via=quote, safe=kwargs.get('safe', '')), f))
+    return urlunsplit((*_, urlencode(qs | parse_qs(q) if update else qs, doseq=True, quote_via=quote,
+                                     safe=kwargs.get('safe', '')), f))
 
 
 def find_key(obj: any, key: str) -> list:
@@ -64,4 +65,4 @@ def get_headers(session) -> dict:
 
 
 def build_query(params):
-    return '&'.join(f'{k}={ujson.dumps(v)}' for k, v in params.items())
+    return '&'.join(f'{k}={orjson.dumps(v).decode()}' for k, v in params.items())
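
A note on the ujson → orjson swap that runs through this diff: orjson.dumps returns bytes rather than str, which is why every serialization call site gains a `.decode()`, and orjson only supports 2-space indentation, so the old `indent=4` becomes `option=orjson.OPT_INDENT_2`. A minimal sketch of the equivalence:

```python
import orjson

payload = {'choice1': 'yes', 'choice2': 'no'}

# orjson.dumps returns bytes; .decode() yields the str that ujson.dumps used to return
as_str = orjson.dumps(payload).decode()

# pretty-printing: orjson offers only 2-space indentation, hence indent=4 -> OPT_INDENT_2
pretty = orjson.dumps(payload, option=orjson.OPT_INDENT_2).decode()

# orjson.loads accepts bytes or str, so the read paths need no change
assert orjson.loads(as_str) == payload
```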
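
The new `Account.check_response` splits failure handling: HTTP 429 raises immediately, while GraphQL-level errors (an `errors` key anywhere in the body) are only logged at debug level and the `Response` is still returned. Callers who want retries on 429 could layer on the same `2 ** i + random.random()` backoff the Scraper uses; a hedged sketch (`with_backoff` and the `account` usage below are illustrative, not part of this diff):

```python
import random
import time

def with_backoff(fn, retries: int = 3):
    # retry fn when it raises (e.g. Account.check_response on HTTP 429),
    # sleeping 2**i plus jitter between attempts, mirroring Scraper's backoff
    for i in range(retries + 1):
        try:
            return fn()
        except Exception:
            if i == retries:
                raise
            t = 2 ** i + random.random()  # ~1-2s, ~2-3s, ~4-5s, ...
            time.sleep(t)

# hypothetical usage: r = with_backoff(lambda: account.tweet('hello world'))
```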
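
The offsets comprehension in `Scraper.trends` generates an `x-twitter-utcoffset` header value for every whole-hour UTC offset from -12:00 through +14:00 (27 values), and `all_trends |= {...}` then de-duplicates trends by name across offsets, with later offsets winning. A quick check of what the comprehension actually emits:

```python
offsets = [f"{str(i).zfill(3)}00" if i < 0 else f"+{str(i).zfill(2)}00" for i in range(-12, 15)]
print(len(offsets))      # 27
print(offsets[:3])       # ['-1200', '-1100', '-1000']
print(offsets[11:14])    # ['-0100', '+0000', '+0100']
print(offsets[-1])       # +1400
```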
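
The reflowed `return` in `set_qs` is purely cosmetic, but the expression deserves a note: the conditional binds looser than `|`, so it reads `(qs | parse_qs(q)) if update else qs`, i.e. `update=True` merges the new params with the URL's existing query string, and the existing values win on key conflicts because `parse_qs(q)` is the right-hand operand of the merge. A standalone check (`set_qs` is copied from this diff; the example URL is made up):

```python
from urllib.parse import urlsplit, urlencode, urlunsplit, parse_qs, quote

def set_qs(url: str, qs: dict, update=False, **kwargs) -> str:
    *_, q, f = urlsplit(url)
    return urlunsplit((*_, urlencode(qs | parse_qs(q) if update else qs, doseq=True, quote_via=quote,
                                     safe=kwargs.get('safe', '')), f))

print(set_qs('https://x.test/path?a=1', {'b': 2}))               # https://x.test/path?b=2
print(set_qs('https://x.test/path?a=1', {'b': 2}, update=True))  # https://x.test/path?b=2&a=1
```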
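
Likewise `build_query` now JSON-encodes each value with orjson before joining the pairs with `&`; since orjson's compact output (no spaces after `:` or `,`) matches ujson's default, the generated query strings should be byte-for-byte unchanged. A small sketch (the parameter names are illustrative):

```python
import orjson

def build_query(params):
    return '&'.join(f'{k}={orjson.dumps(v).decode()}' for k, v in params.items())

print(build_query({'variables': {'userId': 123}, 'features': {'responsive_web': True}}))
# variables={"userId":123}&features={"responsive_web":true}
```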