From 7002b140aaf7cca5a180e99b7c7349201cb02fc3 Mon Sep 17 00:00:00 2001
From: Trevor Hobenshield
Date: Fri, 5 Apr 2024 13:54:46 -0700
Subject: [PATCH] update discovery script

---
 scripts/update.py      | 268 ++++++++++++++++++++++++++++++++++++++---
 twitter/__version__.py |   2 +-
 2 files changed, 254 insertions(+), 16 deletions(-)
 mode change 100644 => 100755 scripts/update.py

diff --git a/scripts/update.py b/scripts/update.py
old mode 100644
new mode 100755
index f803575..d734b04
--- a/scripts/update.py
+++ b/scripts/update.py
@@ -1,37 +1,275 @@
+import asyncio
+import logging.config
+import platform
+import random
 import re
+import subprocess
+from asyncio import Semaphore
+from functools import partial
+from logging import getLogger, Logger
 from pathlib import Path
+from typing import Awaitable, Callable, Generator
 
+import aiofiles
+import chompjs
 import orjson
-from httpx import Client
+from httpx import AsyncClient, Response, Limits, Client
+from selectolax.lexbor import LexborHTMLParser
+from tqdm.asyncio import tqdm_asyncio
 
-PATH_OPS = Path('ops.json')
-PATH_MAIN = Path('main.js')
+try:
+    get_ipython()
+    import nest_asyncio
+
+    nest_asyncio.apply()
+except Exception:  # best-effort: only applies inside IPython; don't swallow KeyboardInterrupt/SystemExit
+    ...
+
+if platform.system() != 'Windows':
+    try:
+        import uvloop
+
+        uvloop.install()
+    except Exception:  # best-effort: uvloop is optional; don't swallow KeyboardInterrupt/SystemExit
+        ...
+
+dump_json = partial(orjson.dumps, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
 
 
+def mkdir(path: str | Path) -> Path:
+    """Create `path` (including missing parents) if needed and return it as a `Path`."""
+    p = Path(path)
+    p.mkdir(exist_ok=True, parents=True)
+    return p
 
 
-def _get_ops(client: Client) -> None:
-    r1 = client.get('https://twitter.com')
-    m = re.findall('href="(https\:\/\/abs\.twimg\.com\/responsive-web\/client-web\/main\.\w+\.js)"', r1.text)
-    r2 = client.get(m[0])
-    PATH_MAIN.write_text(r2.text)
-    expr = r'\{[^{}]*queryId:\s?"([^"]+)",\s*operationName:\s?"([^"]+)",\s*operationType:\s?"([^"]+)",\s*metadata:\s?\{\s*featureSwitches:\s?\[(.*?)\],\s*fieldToggles:\s?\[(.*?)\]\s*\}\s*\}'
-    matches = re.findall(expr, r2.text, flags=re.A)
-    D = {}
+logging.config.dictConfig({
+    'version': 1,
+    'disable_existing_loggers': False,
+    'formatters': {
+        'standard': {
+            'format': '%(asctime)s.%(msecs)03d [%(levelname)s] :: %(message)s',
+            'datefmt': '%Y-%m-%d %H:%M:%S'
+        }
+    },
+    'handlers': {
+        'file': {
+            'class': 'logging.FileHandler',
+            'level': 'DEBUG',
+            'formatter': 'standard',
+            'filename': 'log.log',
+            'mode': 'a'
+        },
+        'console_warning': {
+            'class': 'logging.StreamHandler',
+            'level': 'WARNING',
+            'formatter': 'standard'
+        },
+        'console_info': {
+            'class': 'logging.StreamHandler',
+            'level': 'INFO',
+            'formatter': 'standard',
+            'filters': ['info_only']
+        }
+    },
+    'filters': {
+        'info_only': {
+            '()': lambda: lambda record: record.levelno == logging.INFO
+        }
+    },
+    'loggers': {
+        'my_logger': {
+            'handlers': ['file', 'console_warning', 'console_info'],
+            'level': 'DEBUG'
+        }
+    }
+})
+logger = getLogger('my_logger')
+
+PATH_DATA = mkdir('data')
+
+PATH_HOMEPAGE = PATH_DATA / 'x.html'
+PATH_INITIAL_STATE = PATH_DATA / 'initial_state.json'
+PATH_FEATURES = PATH_DATA / 'features.json'
+PATH_LIMITS = PATH_DATA / 'limits.json'
+PATH_OPS = PATH_DATA / 'ops.json'
+PATH_MAIN = PATH_DATA / 'main.js'
+PATH_URLS = PATH_DATA / 'csp.txt'
+STRINGS = PATH_DATA / 'strings.txt'
+PATHS = PATH_DATA / 'paths.txt'
+JS_FILES_MAP = PATH_DATA / 'js.json'
+JS_FILES = mkdir(PATH_DATA / 'js')
+OPERATIONS = PATH_DATA / 'operations'
+
+USER_AGENTS = [
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3.1 Safari/605.1.1',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.3',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.',
+]
+
+_a = 'a.js'
+_base = 'https://abs.twimg.com/responsive-web/client-web'
+
+
+async def backoff(fn: Callable[..., Awaitable[Response]], sem: Semaphore, *args, m: int = 20, b: int = 2, max_retries: int = 8, **kwargs) -> Response | None:
+    """
+    Call `fn(*args, **kwargs)` while holding `sem`, retrying on any exception.
+
+    A response whose status is in the `ignore_status_codes` kwarg is returned without `raise_for_status()`.
+    The sleep before the next attempt is `min(random() * b ** i, m)` seconds (i = attempt index).
+    Returns the response, or None after `max_retries` retries have failed (a warning is logged).
+    """
+    ignore_status_codes = kwargs.pop('ignore_status_codes', [])
+    for i in range(max_retries + 1):
+        try:
+            async with sem:
+                r = await fn(*args, **kwargs)
+                if r.status_code in ignore_status_codes:
+                    return r
+                r.raise_for_status()
+                return r
+        except Exception as e:
+            if i == max_retries:
+                logger.warning(f'Max retries exceeded\n{e}')
+                return
+            t = min(random.random() * (b ** i), m)
+            logger.info(f'Retrying in {f"{t:.2f}"} seconds\n{e}')
+            await asyncio.sleep(t)
+
+
+def download(urls: list[str], out: str = 'tmp', sz: int | None = None, fname_fn: partial = None, **kwargs) -> Generator:
+    """
+    Build one task factory per URL; each streams the response body to `<out>/<fname>`.
+
+    `fname` is the last URL path segment unless `fname_fn(url)` is given; `sz` is the `aiter_bytes` chunk size; extra kwargs go to `backoff`.
+    """
+    async def get(client: AsyncClient, sem: Semaphore, url: str):
+        fname = url.split('/')[-1] if not fname_fn else fname_fn(url)
+        r = await backoff(client.get, sem, url, **kwargs)
+        if r is None:
+            # every retry failed: skip this URL instead of crashing on `None.aiter_bytes` and leaving an empty file
+            return None
+        async with aiofiles.open(f'{_out}/{fname}', 'wb') as fp:
+            async for chunk in r.aiter_bytes(sz):
+                await fp.write(chunk)
+        return r
+
+    _out = mkdir(out)
+    return (partial(get, url=u) for u in urls)
+
+
+def send(cfgs: list[dict], **kwargs) -> Generator:
+    """Build one task factory per config dict; each calls `client.request(**cfg, **kwargs)` through `backoff`."""
+    async def f(client: AsyncClient, sem: Semaphore, cfg: dict) -> Response:
+        return await backoff(client.request, sem, **cfg, **kwargs)
+
+    return (partial(f, cfg=cfg) for cfg in cfgs)
+
+
+async def process(fns: Generator, max_connections: int = 2000, **kwargs):
+    """
+    Run the task factories produced by `download`/`send` concurrently on one shared `AsyncClient`.
+
+    cookies, headers, timeout, verify, http2, follow_redirects and limits may be overridden via kwargs;
+    `desc` switches on a tqdm progress bar. Remaining kwargs go to `AsyncClient`. Returns the gathered results.
+    """
+    client_defaults = {
+        'cookies': kwargs.pop('cookies', None),
+        'headers': {'user-agent': random.choice(USER_AGENTS)} | kwargs.pop('headers', {}),
+        'timeout': kwargs.pop('timeout', 30.0),
+        'verify': kwargs.pop('verify', False),  # NOTE(review): TLS verification is off unless the caller passes verify=True
+        'http2': kwargs.pop('http2', True),
+        'follow_redirects': kwargs.pop('follow_redirects', True),
+        'limits': kwargs.pop('limits', Limits(
+            max_connections=max_connections,
+            max_keepalive_connections=None,
+            keepalive_expiry=5.0,
+        ))
+    }
+    # tqdm
+    desc = kwargs.pop('desc', None)
+    sem = Semaphore(max_connections)
+    async with AsyncClient(**client_defaults, **kwargs) as client:
+        tasks = (fn(client=client, sem=sem) for fn in fns)
+        if desc:
+            return await tqdm_asyncio.gather(*tasks, desc=desc)
+        return await asyncio.gather(*tasks)
+
+
+def _get_endpoints(res: Response, out: Path = JS_FILES_MAP) -> dict:
+    """Parse the JS object literal that precedes `[e]+"a.js"` in `res.text` and optionally write it to `out`."""
+    # raw string avoids invalid-escape warnings; re.escape keeps the '.' in `_a` a literal dot
+    temp = re.findall(r'\+"\."\+(\{.*\})\[e\]\+?"' + re.escape(_a) + '"', res.text)[0]
+    endpoints = orjson.loads(temp.replace('vendor:', '"vendor":').replace('api:', '"api":'))
+    if out:
+        out.write_bytes(dump_json(endpoints))
+    return endpoints
+
+
+def get_js_files(r: Response, out: Path = JS_FILES) -> None:
+    """Write the https:// sources of the response's CSP header to `PATH_URLS` and download the endpoint JS files to `out`."""
+    endpoints = _get_endpoints(r)
+    csp = sorted({x.strip(';') for x in r.headers.get("content-security-policy", "").split() if x.startswith("https://")})
+    PATH_URLS.write_text('\n'.join(csp))
+    urls = [
+        f'{_base}/{k}.{v}{_a}'
+        for k, v in endpoints.items()
+        if not re.search(r'participantreaction|\.countries-|emojipicker|i18n|icons\/', k, flags=re.I)
+    ]
+    asyncio.run(process(download(urls, out=out), desc='Downloading JS files'))
+
+
+def parse_matches(matches: list[tuple]) -> dict:
+    """Turn regex match tuples (queryId, name, type, featureSwitches, fieldToggles) into a dict keyed by operation name."""
+    d = {}
     for m in matches:
-        D[m[1]] = {
+        d[m[1]] = {
             "queryId": m[0],
             "operationName": m[1],
             "operationType": m[2],
             "featureSwitches": sorted(re.sub(r'[\s"\']', '', x) for x in (m[3].split(',') if m[3] else [])),
             "fieldToggles": sorted(re.sub(r'[\s"\']', '', x) for x in (m[4].split(',') if m[4] else []))
         }
-    PATH_OPS.write_bytes(orjson.dumps(D, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))
+    return d
 
 
 def main():
-    c = Client(headers={'user-agent': 'Chrome/110.0.0.0'}, follow_redirects=True)
-    _get_ops(c)
+    """Fetch x.com, download its JS files, and write the GraphQL operations, initial state, features and limits under `data/`."""
+    with Client(headers={'user-agent': random.choice(USER_AGENTS)}, follow_redirects=True, http2=True) as client:
+        r1 = client.get('https://x.com')
+        PATH_HOMEPAGE.write_text(r1.text, encoding='utf-8')
+
+        try:
+            get_js_files(r1)
+        except Exception as e:
+            logger.warning(f'Failed to get js files\t\t{e}')
+
+        main_js = re.findall(r'href="(https\:\/\/abs\.twimg\.com\/responsive-web\/client-web\/main\.\w+\.js)"', r1.text)[0]
+        r2 = client.get(main_js)
+    PATH_MAIN.write_text(r2.text, encoding='utf-8')
+
+    expr = r'\{[^{}]*queryId:\s?"([^"]+)",\s*operationName:\s?"([^"]+)",\s*operationType:\s?"([^"]+)",\s*metadata:\s?\{\s*featureSwitches:\s?\[(.*?)\],\s*fieldToggles:\s?\[(.*?)\]\s*\}\s*\}'
+
+    matches = re.findall(expr, r2.text, flags=re.A)
+    ops = parse_matches(matches)
+
+    # search all js files for more GraphQL operation definitions
+    for p in JS_FILES.iterdir():
+        matches = re.findall(expr, p.read_text(encoding='utf-8'), flags=re.A)
+        ops |= parse_matches(matches)
+
+    PATH_OPS.write_bytes(dump_json(ops))
+    html = LexborHTMLParser(PATH_HOMEPAGE.read_text(encoding='utf-8'))
+    k = 'window.__INITIAL_STATE__='
+    PATH_INITIAL_STATE.write_bytes(dump_json(chompjs.parse_js_object([x for x in html.css('script') if k in x.text()][0].text().replace(k, '').strip(';'))))
+
+    data = orjson.loads(PATH_INITIAL_STATE.read_bytes())
+    config = data['featureSwitch']['defaultConfig'] | data['featureSwitch']['user']['config']
+    features = {k: v.get('value') for k, v in config.items() if isinstance(v.get('value'), bool)}
+    numeric = {k: v.get('value') for k, v in config.items() if isinstance(v.get('value'), int) and not isinstance(v.get('value'), bool)}
+    PATH_FEATURES.write_bytes(dump_json(features))
+    PATH_LIMITS.write_bytes(dump_json(numeric))
 
 
 if __name__ == '__main__':
diff --git a/twitter/__version__.py b/twitter/__version__.py
index 3020f58..7cd8a36 100644
--- a/twitter/__version__.py
+++ b/twitter/__version__.py
@@ -1,5 +1,5 @@
 __title__ = "twitter-api-client"
 __description__ = "Implementation of X/Twitter v1, v2, and GraphQL APIs."
-__version__ = "0.10.19"
+__version__ = "0.10.20"
 __author__ = "Trevor Hobenshield"
 __license__ = "MIT"
\ No newline at end of file