Files
twitter-api-client/info/parse_endpoints.py
2023-03-25 15:26:54 -07:00

79 lines
2.3 KiB
Python

import asyncio
import logging.config
import re
from pathlib import Path
import aiohttp
import nest_asyncio
import orjson
import uvloop
from twitter.config.log import log_config
# Module-level logger, configured from the project's dictConfig-style settings.
logger = logging.getLogger(__name__)
logging.config.dictConfig(log_config)
# Patch asyncio via nest_asyncio (presumably so asyncio.run works inside an
# already-running loop, e.g. a notebook) and switch to uvloop's event loop policy.
nest_asyncio.apply()
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
# ANSI escape sequences used to decorate log output.
BOLD = '\u001b[1m'  # bold text
SUCCESS = '\u001b[32m'  # green foreground
WARN = '\u001b[31m'  # red foreground
RESET = '\u001b[0m'  # reset all attributes
# Output file: find_paths() writes the sorted, newline-separated paths here.
PATHS = Path('paths.txt')
# Input file: main() reads a JSON object from here and builds script URLs from its items.
ENDPOINTS = Path('endpoints.json')
# Directory that main() writes the downloaded scripts into; created at import time.
OUT_PATH = Path("endpoints")
OUT_PATH.mkdir(exist_ok=True, parents=True)
def get_headers(filename: str = 'headers.txt') -> dict:
    """Load HTTP request headers from a ``name: value`` text file.

    Each line of the form ``<name>:<whitespace char><value>`` (name made of
    word characters and hyphens) contributes one entry. Lines that do not
    have that shape -- blank lines, ``:pseudo`` headers, stray text -- are
    skipped instead of raising. If a name occurs more than once, the last
    occurrence wins.

    Args:
        filename: Path of the headers file.

    Returns:
        The parsed headers when the file exists (possibly an empty dict),
        otherwise a dict holding only a default ``user-agent``.
    """
    path = Path(filename)
    if not path.exists():
        # default
        return {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    # One anchored pattern captures both parts, so the header name never has
    # to be spliced (unescaped) into a second regex.
    header_line = re.compile(r'([\w-]+):\s(.*)')
    headers = {}
    for line in path.read_text().splitlines():
        if found := header_line.match(line):
            headers[found.group(1)] = found.group(2)
    return headers
async def process(fn: callable, headers: dict, urls: any, **kwargs) -> tuple:
    """Run ``fn`` concurrently for every URL over one shared HTTP session.

    Args:
        fn: Coroutine function called as ``fn(session, url, **kwargs)``.
        headers: Default headers attached to the session.
        urls: Iterable of URLs to hand to ``fn``.
        **kwargs: Forwarded unchanged to every ``fn`` call.

    Returns:
        The gathered results of all ``fn`` calls, in the order of ``urls``.
    """
    # Connector settings: ssl=False (aiohttp: skip certificate verification),
    # at most 420 simultaneous connections, DNS answers cached for 69 seconds.
    connector = aiohttp.TCPConnector(ssl=False, limit=420, ttl_dns_cache=69)
    async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
        pending = [fn(session, url, **kwargs) for url in urls]
        return await asyncio.gather(*pending)
async def get(session: aiohttp.ClientSession, url: str, **kwargs) -> tuple[str, dict]:
    """Fetch ``url`` with ``session`` and return it together with its body.

    Args:
        session: Open aiohttp client session used for the request.
        url: Address to request.
        **kwargs: ``res`` names the response method used to read the body
            (default ``'text'``); other keys are ignored.

    Returns:
        ``(url, data)`` on success. On any failure the error is logged at
        debug level and ``None`` is returned, so callers must be prepared to
        skip ``None`` results.
    """
    try:
        logger.debug(f"GET {url}")
        # The context manager releases the connection back to the pool even
        # when reading the body raises.
        async with session.get(url) as res:
            data = await getattr(res, kwargs.get('res', 'text'))()
        return url, data
    except Exception as e:
        # Best-effort download: report the failure and let the batch continue.
        logger.debug(f"[{WARN}FAILED{RESET}]: {url}\n{e}")
        return None
def find_paths():
    """Collect quoted strings containing ``/`` from the downloaded scripts.

    Reads every entry in ``OUT_PATH``, pulls out all double-quoted,
    backtick-quoted and single-quoted literals, keeps those that contain a
    slash (with the surrounding quotes stripped), and writes the unique
    values, sorted and newline-separated, to ``PATHS``.
    """
    quoted_literal = re.compile('"[^"]*"|`[^`]*`|\'[^\']*\'')
    found = set()
    for script in OUT_PATH.iterdir():
        for literal in quoted_literal.findall(script.read_text()):
            if '/' in literal:
                # Drop the opening and closing quote characters.
                found.add(literal[1:-1])
    PATHS.write_text('\n'.join(sorted(found)))
def main():
    """Download the endpoint scripts listed in ``ENDPOINTS`` and extract paths.

    Builds one script URL per ``ENDPOINTS`` item whose key contains
    ``'endpoint'``, fetches them all concurrently, writes each successful
    response into ``OUT_PATH`` under the URL's final path segment, then runs
    ``find_paths()`` over the saved files.
    """
    endpoints = orjson.loads(ENDPOINTS.read_text())
    urls = [
        f'https://abs.twimg.com/responsive-web/client-web/{k}.{v}a.js'
        for k, v in endpoints.items() if 'endpoint' in k
    ]
    headers = get_headers()
    results = asyncio.run(process(get, headers, urls))

    skipped = 0
    for result in results:
        # get() returns None for a failed download; unpacking it would raise
        # TypeError and abort the run before find_paths() is reached.
        if result is None:
            skipped += 1
            continue
        url, body = result
        (OUT_PATH / url.split('/')[-1]).write_text(body)
    if skipped:
        logger.warning('%d of %d downloads failed and were skipped', skipped, len(results))

    find_paths()
# Run the download-and-extract pipeline only when executed as a script.
if __name__ == '__main__':
    main()