import asyncio
import logging.config
import math
import platform
import random
import time
from copy import deepcopy
from pathlib import Path
from typing import Any
from urllib.parse import urlsplit

import orjson
from aiohttp import ClientSession, TCPConnector
from tqdm import tqdm

from .config.log import log_config
from .config.operations import operations
from .config.settings import trending_params
from .constants import *
from .login import login
from .utils import find_key, build_query, get_headers, set_qs

try:
    # patch the event loop when running inside a Jupyter/IPython kernel
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        import nest_asyncio

        nest_asyncio.apply()
except Exception:
    ...

if platform.system() != 'Windows':
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
else:
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

ID = 'ID'
DUP_LIMIT = 5

logging.config.dictConfig(log_config)
logger = logging.getLogger(__name__)


class Scraper:
    GRAPHQL_URL = 'https://api.twitter.com/graphql'

    def __init__(self, email: str, username: str, password: str):
        self.session = login(email, username, password)

    def tweets(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.UserTweets, limit)

    def tweets_and_replies(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.UserTweetsAndReplies, limit)

    def likes(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.Likes, limit)

    def media(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.UserMedia, limit)

    def followers(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.Followers, limit)

    def following(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.Following, limit)

    def favoriters(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.Favoriters, limit)

    def retweeters(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.Retweeters, limit)

    def tweets_details(self, ids: list[int], limit=math.inf):
        return self.run(ids, Operation.Data.TweetDetail, limit)

    # no pagination needed
    def tweet_by_rest_id(self, ids: list[int]):
        return self.run(ids, Operation.Data.TweetResultByRestId)

    # no pagination needed
    def user_by_screen_name(self, ids: list[str]):
        return self.run(ids, Operation.Data.UserByScreenName)

    # no pagination needed
    def user_by_rest_id(self, ids: list[int]):
        return self.run(ids, Operation.Data.UserByRestId)

    # no pagination needed (special batch query)
    def users_by_rest_ids(self, ids: list[int]):
        name, key = Operation.Data.UsersByRestIds
        params = deepcopy(operations[name])
        qid = params['queryId']
        params['variables']['userIds'] = ids
        q = build_query(params)
        url = f"{self.GRAPHQL_URL}/{qid}/{name}?{q}"
        headers = get_headers(self.session)
        headers['content-type'] = "application/json"
        users = self.session.get(url, headers=headers).json()
        return users

    def run(self, ids: list, operation: tuple, limit=None):
        res = self.query(ids, operation)
        if limit is None:
            return res
        return asyncio.run(self.pagination(res, operation, limit)) + res

    def query(self, ids: list[Any], operation: tuple) -> list:
        name, key = operation
        params = deepcopy(operations[name])
        qid = params['queryId']
        urls = []
        for _id in ids:
            params['variables'][key] = _id
            q = build_query(params)
            urls.append((_id, f"{self.GRAPHQL_URL}/{qid}/{name}?{q}"))
        headers = get_headers(self.session)
        headers['content-type'] = "application/json"
        res = asyncio.run(self.process(urls, headers))
        self.save_data(res, name)
        return res
    async def process(self, urls: list, headers: dict) -> list:
        conn = TCPConnector(limit=100, ssl=False, ttl_dns_cache=69)
        async with ClientSession(headers=headers, connector=conn) as s:
            # add cookies from logged-in session
            s.cookie_jar.update_cookies(self.session.cookies)
            return await asyncio.gather(*(self.get(s, u) for u in urls))

    async def get(self, session: ClientSession, url: tuple) -> dict:
        identifier, api_url = url
        logger.debug(f'processing: {url}')
        try:
            r = await session.get(api_url)
            limits = {k: v for k, v in r.headers.items() if 'x-rate-limit' in k}
            logger.debug(f'{limits = }')
            if r.status == 429:
                logger.debug(f'rate limit exceeded: {url}')
                return {}
            data = await r.json()
            return {ID: identifier, **data}
        except Exception as e:
            logger.debug(f'[{ERROR}error{RESET}] failed to download {url}: {e}')

    async def pagination(self, res: list, operation: tuple, limit: int) -> list:
        conn = TCPConnector(limit=100, ssl=False, ttl_dns_cache=69)
        headers = get_headers(self.session)
        headers['content-type'] = "application/json"
        async with ClientSession(headers=headers, connector=conn) as s:
            # add cookies from logged-in session
            s.cookie_jar.update_cookies(self.session.cookies)
            return await asyncio.gather(*(self.paginate(s, data, operation, limit) for data in res))

    async def paginate(self, session: ClientSession, data: dict, operation: tuple, limit: int):
        def get_cursor(data):
            # inefficient, but need to deal with arbitrary schema
            entries = find_key(data, 'entries')
            if entries:
                for entry in entries.pop():
                    entry_id = entry.get('entryId', '')
                    if ('cursor-bottom' in entry_id) or ('cursor-showmorethreads' in entry_id):
                        content = entry['content']
                        if itemContent := content.get('itemContent'):
                            return itemContent['value']  # v2 cursor
                        return content['value']  # v1 cursor

        all_data = []
        try:
            name, key = operation
            params = deepcopy(operations[name])
            qid = params['queryId']
            ids = set()
            counts = []
            params['variables'][key] = data[ID]
            cursor = get_cursor(data)
            while True:
                params['variables']['cursor'] = cursor
                query = build_query(params)
                url = f"{self.GRAPHQL_URL}/{qid}/{name}?{query}"
                # code [353]: "This request requires a matching csrf cookie and header."
                r, _data = await self.backoff(lambda: session.get(url))
                if csrf := r.cookies.get("ct0"):
                    session.headers.update({"x-csrf-token": csrf.value})
                session.cookie_jar.update_cookies(r.cookies)
                tagged_data = _data | {ID: data[ID]}
                self.save_data([tagged_data], name)
                all_data.append(tagged_data)
                cursor = get_cursor(_data)
                logger.debug(f'{cursor = }')
                ids |= set(find_key(tagged_data, 'rest_id'))
                logger.debug(f'({data[ID]})\t{len(ids)} unique results')
                counts.append(len(ids))
                success_message = f'[{SUCCESS}success{RESET}] done pagination'
                # followers/following cursors terminate with "0|"
                if not cursor or cursor.startswith('0|'):
                    logger.debug(f'{success_message}\tlast cursor: {cursor}')
                    break
                if len(ids) >= limit:
                    logger.debug(f'{success_message}\tsurpassed limit of {limit} results')
                    break
                # did the last DUP_LIMIT requests return duplicate data?
                if len(counts) > DUP_LIMIT and len(set(counts[-DUP_LIMIT:])) == 1:
                    logger.debug(f'{success_message}\tpast {DUP_LIMIT} requests returned duplicate data')
                    break
        except Exception as e:
            logger.debug(f'[{ERROR}error{RESET}] paginate failed: {e}')
        # save_data(all_data, name)
        return all_data

    async def backoff(self, fn, retries=12):
        for i in range(retries + 1):
            try:
                r = await fn()
                data = await r.json()
                if r.status == 429:
                    logger.debug(f'[{ERROR}error{RESET}] rate limit exceeded: {r.url}')
                    return r, {}
                if find_key(data, 'errors'):
                    logger.debug(f'[{ERROR}error{RESET}] twitter errors: {data}')
                return r, data
            except Exception as e:
                if i == retries:
                    logger.debug(f'[{ERROR}error{RESET}] max retries exceeded\n{e}')
                    return
                t = 2 ** i + random.random()
                logger.debug(f'{WARN}retrying in {t:.2f} seconds{RESET}\t\t{e}')
                # non-blocking sleep; time.sleep() here would stall the event loop
                await asyncio.sleep(t)

    def save_data(self, data: list, name: str = ''):
        try:
            for d in data:
                path = Path(f'data/raw/{d[ID]}')
                path.mkdir(parents=True, exist_ok=True)
                (path / f'{time.time_ns()}_{name}.json').write_text(
                    orjson.dumps(d, option=orjson.OPT_INDENT_2).decode(),
                    encoding='utf-8'
                )
        except KeyError as e:
            logger.debug(f'[{ERROR}error{RESET}] failed to save data: {e}')

    def download(self, post_url: str, cdn_url: str, path: str = 'media', chunk_size: int = 4096) -> None:
        """
        Download file

        @param post_url: the actual post url
        @param cdn_url: the cdn url
        @param path: path to save media
        @param chunk_size: chunk size in bytes
        @return: None
        """
        Path(path).mkdir(parents=True, exist_ok=True)
        name = urlsplit(post_url).path.replace('/', '_')[1:]
        ext = urlsplit(cdn_url).path.split('/')[-1]
        try:
            r = self.session.get(cdn_url, stream=True)
            total_bytes = int(r.headers.get('Content-Length', 0))
            desc = f'downloading: {name}'
            with tqdm(total=total_bytes, desc=desc, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                with open(f'{path}/{name}_{ext}', 'wb') as f:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        pbar.update(f.tell() - pbar.n)
        except Exception as e:
            logger.debug(f'[{ERROR}error{RESET}] failed to download media: {post_url} {e}')

    def download_media(self, ids: list[int], photos: bool = True, videos: bool = True) -> None:
        res = self.tweet_by_rest_id(ids)
        for r in res:
            user_id = find_key(r, 'user_results')[0]['result']['rest_id']
            url = f'https://twitter.com/{user_id}/status/{r[ID]}'  # evaluates to username in browser
            media = [y for x in find_key(r, 'media') for y in x]  # in case of arbitrary schema
            if photos:
                # skip video thumbnails; dedupe photo urls
                photo_urls = {u for m in media if 'ext_tw_video_thumb' not in (u := m['media_url_https'])}
                for photo in photo_urls:
                    self.download(url, photo)
            if videos:
                # keep only the highest-bitrate variant of each video
                variants = [x['variants'] for m in media if (x := m.get('video_info'))]
                hq_videos = {sorted(v, key=lambda d: d.get('bitrate', 0))[-1]['url'] for v in variants}
                for video in hq_videos:
                    self.download(url, video)

    def trends(self) -> dict:
        """Get trends for all UTC offsets"""
        url = set_qs('https://twitter.com/i/api/2/guide.json', trending_params)
        headers = get_headers(self.session)
        offsets = [f"{str(i).zfill(3)}00" if i < 0 else f"+{str(i).zfill(2)}00" for i in range(-12, 15)]
        res = []
        for offset in offsets:
            headers['x-twitter-utcoffset'] = offset
            r = self.session.get(url, headers=headers)
            res.append(r.json())
            logger.debug(f'getting trends for: {offset = }')
        all_trends = {}
        for data in res:
            trends = find_key(data, 'item')
            for t in trends:
                all_trends |= {t['content']['trend']['name']: t}
        path = Path('data/raw/trends')
        path.mkdir(parents=True, exist_ok=True)
        (path / f'{time.time_ns()}.json').write_text(
            orjson.dumps(all_trends, option=orjson.OPT_INDENT_2).decode(),
            encoding='utf-8'
        )
        return all_trends
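

# Minimal usage sketch (assumptions: this module is imported as part of its
# package, e.g. `from twitter.scraper import Scraper`, adjusted to your layout;
# the credentials and numeric IDs below are placeholders, not real accounts or tweets):
#
#   scraper = Scraper(email='email@domain.com', username='username', password='password')
#   tweets = scraper.tweets([123456789], limit=100)   # paginated user timeline
#   users = scraper.users_by_rest_ids([123456789])    # single batched lookup
#   scraper.download_media([123456789])               # photos + highest-bitrate videos
#   trends = scraper.trends()                         # trends across all UTC offsets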