# twitter-api-client/twitter/scraper.py
import asyncio
import logging.config
import math
import platform
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from logging import Logger
from pathlib import Path
from urllib.parse import urlsplit
import httpx
import orjson
from httpx import AsyncClient, Response
from tqdm import tqdm
from .constants import *
from .login import login
from .util import find_key, save_data, get_cursor, get_headers, set_qs, fmt_status
# Allow asyncio.run() inside notebooks, which already run their own event loop.
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        import nest_asyncio
        nest_asyncio.apply()
except Exception:
    ...

# Prefer uvloop's faster event loop where available (it does not support Windows).
if platform.system() != 'Windows':
    try:
        import uvloop
        uvloop.install()
    except ImportError:
        ...

class Scraper:
    def __init__(self, email: str, username: str, password: str, **kwargs):
        self.session = login(email, username, password, **kwargs)
        self.api = 'https://twitter.com/i/api/graphql'
        self.save = kwargs.get('save', True)
        self.debug = kwargs.get('debug', 0)
        self.logger = self.init_logger(kwargs.get('log_config', False))

    @staticmethod
    def init_logger(cfg: dict) -> Logger:
        logging.config.dictConfig(cfg or log_config)
        return logging.getLogger(__name__)
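    # `log_config`, when supplied, is passed straight to logging.config.dictConfig;
    # otherwise the default `log_config` from .constants is used. A minimal sketch of
    # the expected mapping (handler and logger names below are illustrative, not the
    # library's defaults):
    #
    #   {
    #       'version': 1,
    #       'disable_existing_loggers': False,
    #       'handlers': {'console': {'class': 'logging.StreamHandler', 'level': 'DEBUG'}},
    #       'loggers': {'twitter.scraper': {'handlers': ['console'], 'level': 'DEBUG'}},
    #   }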
    def users(self, screen_names: list[str]) -> list:
        return self._run(screen_names, Operation.UserByScreenName)

    def tweets_by_id(self, tweet_ids: list[int]) -> list[dict]:
        return self._run(tweet_ids, Operation.TweetResultByRestId)

    def tweets_details(self, tweet_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(tweet_ids, Operation.TweetDetail, limit)

    def tweets(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.UserTweets, limit)

    def tweets_and_replies(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.UserTweetsAndReplies, limit)

    def media(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.UserMedia, limit)

    def likes(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.Likes, limit)

    def followers(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.Followers, limit)

    # auth required
    def following(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.Following, limit)

    # auth required
    def favoriters(self, tweet_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(tweet_ids, Operation.Favoriters, limit)

    # auth required
    def retweeters(self, tweet_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(tweet_ids, Operation.Retweeters, limit)
    def profile_spotlights(self, screen_names: list[str]) -> list:
        """
        This endpoint is included for completeness only. It returns very few data points.
        Use the batched query `users_by_ids` instead if you wish to pull user profile data.
        """
        return self._run(screen_names, Operation.ProfileSpotlightsQuery)

    def users_by_id(self, user_ids: list[int]) -> list[dict]:
        """
        This endpoint is included for completeness only.
        Use the batched query `users_by_ids` instead if you wish to pull user profile data.
        """
        return self._run(user_ids, Operation.UserByRestId)

    def tweet_stats(self, user_ids: list[int]) -> list[dict]:
        return self._run(user_ids, Operation.TweetStats)
    def recommended_users(self, user_id: int = None) -> dict:
        qid, op, key = Operation.ConnectTabTimeline
        context = {"contextualUserId": user_id} if user_id else {}
        params = {k: orjson.dumps(v).decode() for k, v in {
            'variables': Operation.default_variables | {key: orjson.dumps(context).decode()},
            'features': Operation.default_features,
        }.items()}
        r = self.session.get(f'{self.api}/{qid}/{op}', headers=get_headers(self.session), params=params)
        txt = r.text
        data = r.json()
        if self.debug:
            self.log(r, txt, data)
        if self.save:
            save_data(data, op, user_id)
        return data
    # special case: batched query
    def users_by_ids(self, user_ids: list[int]) -> dict:
        """
        Get user data in batches.
        Each batch is limited to roughly 200-300 users.
        """
        qid, op, key = Operation.UsersByRestIds
        params = {k: orjson.dumps(v).decode() for k, v in {
            'variables': Operation.default_variables | {key: user_ids},
            'features': Operation.default_features,
        }.items()}
        r = self.session.get(f'{self.api}/{qid}/{op}', headers=get_headers(self.session), params=params)
        txt = r.text
        data = r.json()
        if self.debug:
            self.log(r, txt, data)
        if self.save:
            save_data(data, op, user_ids[0])
        return data
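    # Each call accepts only a few hundred ids, so larger lists have to be chunked by
    # the caller. A minimal sketch, assuming a conservative batch size of 200 (the
    # helper name and size are illustrative, not part of this module):
    #
    #   def users_by_ids_chunked(scraper, user_ids, size=200):
    #       return [scraper.users_by_ids(user_ids[i:i + size])
    #               for i in range(0, len(user_ids), size)]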
    def _run(self, ids: list[int | str], operation: tuple, limit=None):
        return asyncio.run(self._process(ids, operation, limit))

    async def _process(self, ids: list[int | str], op: tuple, limit: int | None) -> list:
        async with AsyncClient(headers=get_headers(self.session)) as s:
            return await asyncio.gather(*(self._paginate(s, _id, op, limit) for _id in ids))

    async def _paginate(self, session: AsyncClient, _id: int | str, operation: tuple,
                        limit: int | None, **kwargs) -> list[dict] | tuple[list[dict], str]:
        cursor = kwargs.get('cursor')
        is_resuming = False
        dups = 0
        DUP_LIMIT = 3  # stop after this many consecutive pages that add no new rest_ids
        if cursor:
            is_resuming = True
            res = []
            ids = set()
        else:
            r = await self._query(session, _id, operation)
            initial_data = r.json()
            res = [initial_data]
            ids = set(find_key(initial_data, 'rest_id'))
            cursor = get_cursor(initial_data)
        while (dups < DUP_LIMIT) and cursor:
            prev_len = len(ids)
            if prev_len >= limit:
                break
            r = await self._query(session, _id, operation, cursor=cursor)
            data = r.json()
            cursor = get_cursor(data)
            ids |= set(find_key(data, 'rest_id'))
            if self.debug:
                self.logger.debug(f'cursor: {cursor}\tunique results: {len(ids)}')
            if prev_len == len(ids):
                dups += 1
            res.append(data)
        if is_resuming:
            return res, cursor
        return res
    def resume_pagination(self, *args, **kwargs):
        """Resume a paginated query from a previously returned cursor."""
        async def _resume(session: AsyncClient, _id: int | str, operation: tuple, limit=math.inf, **kwargs) -> tuple:
            session = AsyncClient(headers=get_headers(session), cookies=self.session.cookies)
            return await self._paginate(session, _id, operation, limit, **kwargs)

        return asyncio.run(_resume(*args, **kwargs))
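    # One plausible call shape, inferred from the signatures above (the cursor string
    # is a placeholder, not a real pagination token):
    #
    #   pages, next_cursor = scraper.resume_pagination(
    #       scraper.session, 123456789, Operation.UserTweets, cursor='DAABCgAB...')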
    async def _query(self, session: AsyncClient, _id: int | str | list, operation: tuple, **kwargs) -> Response:
        qid, op, k = operation  # k is the GraphQL variable name that carries the id(s)
        params = {key: orjson.dumps(value).decode() for key, value in {
            'variables': {k: _id} | Operation.default_variables | kwargs,
            'features': Operation.default_features,
        }.items()}
        r = await session.get(f'{self.api}/{qid}/{op}', params=params)
        txt = r.text
        data = r.json()
        if self.debug:
            self.log(r, txt, data)
        if self.save:
            save_data(data, op, _id[0] if isinstance(_id, list) else _id)
        return r
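    # Roughly, each query issues a GET of this shape (the query id, operation name,
    # and variable values are placeholders; the real ones live in Operation):
    #
    #   GET https://twitter.com/i/api/graphql/<qid>/<op>
    #       ?variables={"userId": 123, ...default variables..., "cursor": "..."}
    #       &features={...default features...}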
    def download_media(self, ids: list[int], photos: bool = True, videos: bool = True) -> None:
        tweets = self.tweets_by_id(ids)
        urls = []
        for tweet in tweets:
            tweet_id = find_key(tweet, 'id_str')[0]
            url = f'https://twitter.com/i/status/{tweet_id}'  # `i` evaluates to screen_name
            media = [y for x in find_key(tweet, 'media') for y in x]
            if photos:
                photo_urls = list({u for m in media if 'ext_tw_video_thumb' not in (u := m['media_url_https'])})
                urls.extend([url, photo] for photo in photo_urls)
            if videos:
                video_urls = [x['variants'] for m in media if (x := m.get('video_info'))]
                hq_videos = {sorted(v, key=lambda d: d.get('bitrate', 0))[-1]['url'] for v in video_urls}
                urls.extend([url, video] for video in hq_videos)
        with tqdm(total=len(urls), desc='downloading media') as pbar:
            with ThreadPoolExecutor(max_workers=32) as e:
                for future in as_completed(e.submit(self._download, x, y) for x, y in urls):
                    future.result()
                    pbar.update()
    def _download(self, post_url: str, cdn_url: str, path: str = 'media', chunk_size: int = 4096) -> None:
        Path(path).mkdir(parents=True, exist_ok=True)
        # files are saved as <path>/i_status_<tweet id>_<cdn file name>
        name = urlsplit(post_url).path.replace('/', '_')[1:]
        ext = urlsplit(cdn_url).path.split('/')[-1]
        try:
            with httpx.stream('GET', cdn_url) as r:
                with open(f'{path}/{name}_{ext}', 'wb') as f:
                    for chunk in r.iter_bytes(chunk_size=chunk_size):
                        f.write(chunk)
        except Exception as e:
            self.logger.debug(f'[{RED}error{RESET}] failed to download media: {post_url} {e}')
    def trends(self) -> dict:
        """Get trends for all UTC offsets"""

        def get_trends(offset: str, url: str, headers: dict):
            headers['x-twitter-utcoffset'] = offset
            r = self.session.get(url, headers=headers)
            trends = find_key(r.json(), 'item')
            return {t['content']['trend']['name']: t for t in trends}

        headers = get_headers(self.session)
        url = set_qs('https://twitter.com/i/api/2/guide.json', trending_params)
        # offsets span UTC-12:00 through UTC+14:00, e.g. '-0500', '+0000', '+1400'
        offsets = [f"{str(i).zfill(3)}00" if i < 0 else f"+{str(i).zfill(2)}00" for i in range(-12, 15)]
        trends = {}
        with tqdm(total=len(offsets), desc='downloading trends') as pbar:
            with ThreadPoolExecutor(max_workers=32) as e:
                for future in as_completed(e.submit(get_trends, o, url, headers) for o in offsets):
                    trends |= future.result()
                    pbar.update()
        path = Path('data/raw/trends')
        path.mkdir(parents=True, exist_ok=True)
        (path / f'{time.time_ns()}.json').write_text(
            orjson.dumps(trends, option=orjson.OPT_INDENT_2).decode(),
            encoding='utf-8'
        )
        return trends
    def log(self, r: Response, txt: str, data: dict):
        status = r.status_code

        def stat(r):
            if self.debug >= 1:
                self.logger.debug(f'{r.url}')
            if self.debug >= 2:
                self.logger.debug(f'{txt}')
            limits = {k: v for k, v in r.headers.items() if 'x-rate-limit' in k}
            current_time = int(time.time())
            wait = int(r.headers.get('x-rate-limit-reset', current_time)) - current_time
            self.logger.debug(
                f"remaining: {MAGENTA}{limits['x-rate-limit-remaining']}/{limits['x-rate-limit-limit']}{RESET} requests")
            self.logger.debug(f'reset: {MAGENTA}{(wait / 60):.2f}{RESET} minutes')

        try:
            if 'json' in r.headers.get('content-type', ''):
                if data.get('errors'):
                    self.logger.debug(f'[{RED}error{RESET}] {status} {data}')
                else:
                    self.logger.debug(fmt_status(status))
                    stat(r)
            else:
                self.logger.debug(fmt_status(status))
                stat(r)
        except Exception as e:
            self.logger.debug(f'failed to log: {e}')
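

# A minimal usage sketch, not part of the original module; the credentials and ids
# below are placeholders, and every call here hits the live API.
if __name__ == '__main__':
    scraper = Scraper('email@domain.com', 'username', 'password')
    users = scraper.users(['jack'])                   # profile lookups by screen name
    tweets = scraper.tweets([123456789], limit=100)   # a user's tweets, roughly capped at 100 results
    scraper.download_media([123456789])               # saves photos/videos under ./media
    trends = scraper.trends()                         # trends for all UTC offsets, saved under data/raw/trends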