Files
twitter-api-client/twitter/search.py
2023-04-16 17:41:01 -07:00

145 lines
4.9 KiB
Python

import asyncio
import atexit
import math
import orjson
import logging.config
import platform
import random
import re
import time
from pathlib import Path
import aiohttp
import requests
from .config.log import log_config
from .config.settings import search_config
from .constants import *
from .utils import set_qs
# Default locations for raw input and processed output data.
IN_PATH = Path('~/data/raw').expanduser()
OUT_PATH = Path(f'~/data/processed/search_results_{time.time_ns()}.json').expanduser()

# ANSI escape codes: one color per concurrent query in log output.
reset = '\u001b[0m'
colors = [f'\u001b[{i}m' for i in range(30, 38)]

logging.config.dictConfig(log_config)
logger = logging.getLogger(__name__)

# Allow re-entrant event loops when running inside a Jupyter/IPython kernel.
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        import nest_asyncio
        nest_asyncio.apply()
except (NameError, ImportError):
    # get_ipython is undefined outside IPython; nest_asyncio may be absent.
    # (Narrowed from a bare `except:` that also swallowed KeyboardInterrupt.)
    ...

if platform.system() != 'Windows':
    try:
        # uvloop is an optional drop-in speedup; fall back silently if missing.
        import uvloop
        asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    except ImportError:
        ...
else:
    # The proactor loop (Windows default) lacks features aiohttp relies on.
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
def search(*args, out: str = 'data', **kwargs):
    """Run one or more search queries concurrently; return a list of page-lists.

    Each positional argument is a query string. `out` names the directory
    (under the user's home) where raw/processed/final results are written.
    """
    results_dir = make_output_dirs(out)
    return asyncio.run(process(args, search_config, results_dir, **kwargs))
async def process(queries: tuple, config: dict, out: Path, **kwargs) -> list:
    """Fan out one paginate() task per query over a shared HTTP session."""
    # One connection slot per query; Twitter's cert chain is skipped (ssl=False).
    connector = aiohttp.TCPConnector(limit=len(queries), ssl=False)
    async with aiohttp.ClientSession(headers=__get_headers(), connector=connector) as session:
        tasks = [paginate(query, session, config, out, **kwargs) for query in queries]
        return await asyncio.gather(*tasks)
async def paginate(query: str, session: aiohttp.ClientSession, config: dict, out: Path, **kwargs) -> list[dict]:
    """Page through search results for one query, writing each raw page to disk.

    Returns the list of page payloads collected so far; stops early when
    kwargs['limit'] unique tweet ids are seen or when backoff() gives up.
    """
    api = 'https://api.twitter.com/2/search/adaptive.json?'
    # FIX: work on a private copy. The same config dict is shared by every
    # concurrently-gathered paginate() coroutine; mutating 'q'/'cursor' in
    # place raced between queries.
    config = {**config, 'q': query}
    res = await backoff(lambda: get(session, api, config), query)
    if res is None:
        # FIX: backoff() returns None after max retries; the original
        # unconditionally unpacked it and raised TypeError here.
        return []
    data, next_cursor = res
    all_data = [data]
    c = colors.pop() if colors else ''  # per-query log color (shared palette)
    ids = set()
    while next_cursor:
        ids |= set(data['globalObjects']['tweets'])
        if len(ids) >= kwargs.get('limit', math.inf):
            logger.debug(f'[{SUCCESS}success{RESET}] returned {len(ids)} search results for {c}{query}{reset}')
            return all_data
        logger.debug(f'{c}{query}{reset}')
        config['cursor'] = next_cursor
        res = await backoff(lambda: get(session, api, config), query)
        if res is None:
            return all_data
        data, next_cursor = res
        data['query'] = query
        # Persist every raw page immediately so a crash loses at most one page.
        (out / f'raw/{time.time_ns()}.json').write_text(
            orjson.dumps(data, option=orjson.OPT_INDENT_2).decode(),
            encoding='utf-8'
        )
        all_data.append(data)
    return all_data
async def backoff(fn, info, retries=12):
    """Await fn() with exponential backoff until it yields tweet data.

    fn is a zero-arg callable returning an awaitable that resolves to
    (data, next_cursor). Returns that tuple on success, or None once
    `retries` attempts are exhausted. `info` is only used in log messages.
    """
    for i in range(retries + 1):
        try:
            data, next_cursor = await fn()
            # An empty/missing tweet payload counts as a failure and is retried.
            if not data.get('globalObjects', {}).get('tweets'):
                raise Exception
            return data, next_cursor
        except Exception as e:
            if i == retries:
                logger.debug(f'Max retries exceeded\n{e}')
                return
            t = 2 ** i + random.random()  # exponential backoff with jitter
            logger.debug(f'No data for: \u001b[1m{info}\u001b[0m | retrying in {f"{t:.2f}"} seconds\t\t{e}')
            # FIX: time.sleep() here blocked the whole event loop, stalling
            # every other concurrent query; sleep asynchronously instead.
            await asyncio.sleep(t)
async def get(session: aiohttp.ClientSession, api: str, params: dict) -> tuple[dict, str]:
    """Perform one search request; return (json payload, bottom cursor)."""
    # safe='()' keeps parentheses unescaped in advanced query operators.
    target = set_qs(api, params, update=True, safe='()')
    response = await session.get(target)
    payload = await response.json()
    return payload, get_cursor(payload)
def get_cursor(res: dict):
    """Extract the 'Bottom' pagination cursor from an adaptive-search response.

    Looks first for a replaceEntry carrying a Bottom cursor, then for a
    'cursor-bottom-0' entry under addEntries. Returns the cursor value,
    or None when no cursor is present.

    FIX: the original wrapped the whole loop in one try/except, so a single
    malformed instruction (e.g. missing 'addEntries') aborted the scan of all
    remaining instructions; access is now defensive per-instruction.
    """
    timeline = res.get('timeline') if isinstance(res, dict) else None
    if not timeline:
        return None
    for instr in timeline.get('instructions', []):
        if replace_entry := instr.get('replaceEntry'):
            try:
                cursor = replace_entry['entry']['content']['operation']['cursor']
            except (KeyError, TypeError):
                continue
            if cursor.get('cursorType') == 'Bottom':
                return cursor['value']
            continue
        for entry in instr.get('addEntries', {}).get('entries', []):
            if entry.get('entryId') == 'cursor-bottom-0':
                try:
                    return entry['content']['operation']['cursor']['value']
                except (KeyError, TypeError):
                    return None
    return None
def __get_headers(fname: str = None) -> dict:
    """Build request headers, either from a `Name: value` file or by
    activating a fresh guest token against Twitter's public bearer token.

    FIX: the original filtered lines with `if (y := ..., z := ...)` — a
    2-tuple, which is ALWAYS truthy — so any line not matching the header
    pattern crashed with AttributeError on `y.group()`. Non-matching lines
    are now skipped.
    """
    if fname:
        headers = {}
        with open(fname) as fp:
            for line in fp.read().splitlines():
                if (name := re.search(r'^[\w-]+(?=:\s)', line)) and \
                        (value := re.search(rf'(?<={name.group()}:\s).*', line)):
                    headers[name.group()] = value.group()
        return headers
    # default: public bearer token + throwaway guest token
    headers = {
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
        'user-agent': 'Mozilla/5.0 (Linux; Android 11; Nokia G20) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.88 Mobile Safari/537.36'
    }
    r = requests.post('https://api.twitter.com/1.1/guest/activate.json', headers=headers)
    headers['x-guest-token'] = r.json()['guest_token']
    return headers
def make_output_dirs(path: str) -> Path:
    """Create <home>/<path>/{raw,processed,final} and return the base path.

    Existing directories are left untouched (exist_ok).
    """
    base = Path(f'~/{path}').expanduser()
    for sub in ('raw', 'processed', 'final'):
        (base / sub).mkdir(parents=True, exist_ok=True)
    return base