import asyncio
import logging.config
import math
import platform
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from logging import Logger
from pathlib import Path
from urllib.parse import urlsplit

import httpx
import orjson
from httpx import AsyncClient, Response
from tqdm import tqdm

from .constants import *
from .login import login
from .util import find_key, save_data, get_cursor, get_headers, set_qs, fmt_status

try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        import nest_asyncio

        nest_asyncio.apply()
except Exception:
    ...

if platform.system() != 'Windows':
    try:
        import uvloop

        uvloop.install()
    except ImportError:
        ...

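# Typical usage of `Scraper` (illustrative sketch; the credentials, handle, and ID below
# are placeholders, not real accounts):
#
#   scraper = Scraper(email, username, password, save=True, debug=1)
#   users = scraper.users(['some_handle'])
#   timeline = scraper.tweets([123456789], limit=500)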
class Scraper:
    def __init__(self, email: str, username: str, password: str, **kwargs):
        self.session = login(email, username, password, **kwargs)
        self.api = 'https://twitter.com/i/api/graphql'
        self.save = kwargs.get('save', True)
        self.debug = kwargs.get('debug', 0)
        self.logger = self.init_logger(kwargs.get('log_config', False))

    @staticmethod
    def init_logger(cfg: dict) -> Logger:
        logging.config.dictConfig(cfg or log_config)
        return logging.getLogger(__name__)

    def users(self, screen_names: list[str]) -> list:
        return self._run(screen_names, Operation.UserByScreenName)

    def tweets_by_id(self, tweet_ids: list[int]) -> list[dict]:
        return self._run(tweet_ids, Operation.TweetResultByRestId)

    def tweets_details(self, tweet_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(tweet_ids, Operation.TweetDetail, limit)

    def tweets(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.UserTweets, limit)

    def tweets_and_replies(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.UserTweetsAndReplies, limit)

    def media(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.UserMedia, limit)

    def likes(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.Likes, limit)

    def followers(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.Followers, limit)

    # auth required
    def following(self, user_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(user_ids, Operation.Following, limit)

    # auth required
    def favoriters(self, tweet_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(tweet_ids, Operation.Favoriters, limit)

    # auth required
    def retweeters(self, tweet_ids: list[int], limit=math.inf) -> list[dict]:
        return self._run(tweet_ids, Operation.Retweeters, limit)

    def profile_spotlights(self, screen_names: list[str]) -> list:
        """
        This endpoint is included for completeness only. It returns very few data points.

        Use the batched query `users_by_ids` instead if you wish to pull user profile data.
        """
        return self._run(screen_names, Operation.ProfileSpotlightsQuery)

    def users_by_id(self, user_ids: list[int]) -> list[dict]:
        """
        This endpoint is included for completeness only.

        Use the batched query `users_by_ids` instead if you wish to pull user profile data.
        """
        return self._run(user_ids, Operation.UserByRestId)

    def tweet_stats(self, user_ids: list[int]) -> list[dict]:
        return self._run(user_ids, Operation.TweetStats)

    def recommended_users(self, user_id: int | None = None) -> dict:
        qid, op, key = Operation.ConnectTabTimeline
        context = {"contextualUserId": user_id} if user_id else {}
        params = {k: orjson.dumps(v).decode() for k, v in {
            'variables': Operation.default_variables | {key: orjson.dumps(context).decode()},
            'features': Operation.default_features,
        }.items()}
        r = self.session.get(f'{self.api}/{qid}/{op}', headers=get_headers(self.session), params=params)
        txt = r.text
        data = r.json()
        if self.debug:
            self.log(r, txt, data)
        if self.save:
            save_data(data, op, user_id)
        return data

    # special case, batch query
    def users_by_ids(self, user_ids: list[int]) -> dict:
        """
        Get user data in batches

        Batch-size limited to around 200-300 users
        """
        qid, op, key = Operation.UsersByRestIds
        params = {k: orjson.dumps(v).decode() for k, v in {
            'variables': Operation.default_variables | {key: user_ids},
            'features': Operation.default_features,
        }.items()}
        r = self.session.get(f'{self.api}/{qid}/{op}', headers=get_headers(self.session), params=params)
        txt = r.text
        data = r.json()
        if self.debug:
            self.log(r, txt, data)
        if self.save:
            save_data(data, op, user_ids[0])
        return data

    def _run(self, ids: list[int | str], operation: tuple, limit=math.inf):
        return asyncio.run(self._process(ids, operation, limit))

    async def _process(self, ids: list[int | str], op: tuple, limit: int | float) -> list:
        async with AsyncClient(headers=get_headers(self.session)) as s:
            return await asyncio.gather(*(self._paginate(s, _id, op, limit) for _id in ids))

    async def _paginate(self, session: AsyncClient, _id: int | str, operation: tuple,
                        limit: int | float, **kwargs) -> list[dict] | tuple[list[dict], str]:
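        # Walk the pagination cursor, collecting each raw page of results. Stop once `limit`
        # unique rest_ids have been seen, the cursor is exhausted, or DUP_LIMIT consecutive
        # pages add no new ids.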
        cursor = kwargs.get('cursor')
        is_resuming = False
        dups = 0
        DUP_LIMIT = 3

        if cursor:
            is_resuming = True
            res = []
            ids = set()
        else:
            r = await self._query(session, _id, operation)
            initial_data = r.json()
            res = [initial_data]
            ids = set(find_key(initial_data, 'rest_id'))
            cursor = get_cursor(initial_data)

        while (dups < DUP_LIMIT) and cursor:
            prev_len = len(ids)
            if prev_len >= limit:
                break

            r = await self._query(session, _id, operation, cursor=cursor)
            data = r.json()

            cursor = get_cursor(data)
            ids |= set(find_key(data, 'rest_id'))

            if self.debug:
                self.logger.debug(f'cursor: {cursor}\tunique results: {len(ids)}')

            if prev_len == len(ids):
                dups += 1

            res.append(data)

        if is_resuming:
            return res, cursor
        return res

    def resume_pagination(self, *args, **kwargs):
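        # Resume a previously started pagination run from a saved cursor. When a cursor is
        # supplied, `_paginate` returns (pages, cursor) so the run can be resumed again later.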
        async def _resume(session: AsyncClient, _id: int | str, operation: tuple, limit=math.inf, **kwargs) -> tuple:
            async with AsyncClient(headers=get_headers(session), cookies=self.session.cookies) as client:
                return await self._paginate(client, _id, operation, limit, **kwargs)

        return asyncio.run(_resume(*args, **kwargs))

    async def _query(self, session: AsyncClient, _id: int | str | list, operation: tuple, **kwargs) -> Response:
        qid, op, key = operation
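        # `variables` and `features` are serialized to JSON strings and passed as
        # query-string parameters, which is the format the GraphQL endpoints expect.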
        params = {k: orjson.dumps(v).decode() for k, v in {
            'variables': {key: _id} | Operation.default_variables | kwargs,
            'features': Operation.default_features,
        }.items()}
        r = await session.get(f'{self.api}/{qid}/{op}', params=params)
        txt = r.text
        data = r.json()
        if self.debug:
            self.log(r, txt, data)
        if self.save:
            save_data(data, op, _id[0] if isinstance(_id, list) else _id)
        return r

    def download_media(self, ids: list[int], photos: bool = True, videos: bool = True) -> None:
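        # Gather each tweet's media: photo URLs (skipping video thumbnails) and the
        # highest-bitrate variant of each video, then download everything concurrently.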
        tweets = self.tweets_by_id(ids)
        urls = []
        for tweet in tweets:
            tweet_id = find_key(tweet, 'id_str')[0]
            url = f'https://twitter.com/i/status/{tweet_id}'  # `i` evaluates to screen_name
            media = [y for x in find_key(tweet, 'media') for y in x]
            if photos:
                photo_urls = list({u for m in media if 'ext_tw_video_thumb' not in (u := m['media_url_https'])})
                urls.extend([url, photo] for photo in photo_urls)
            if videos:
                video_urls = [x['variants'] for m in media if (x := m.get('video_info'))]
                hq_videos = {sorted(v, key=lambda d: d.get('bitrate', 0))[-1]['url'] for v in video_urls}
                urls.extend([url, video] for video in hq_videos)

        with tqdm(total=len(urls), desc='downloading media') as pbar:
            with ThreadPoolExecutor(max_workers=32) as e:
                for future in as_completed(e.submit(self._download, x, y) for x, y in urls):
                    future.result()
                    pbar.update()

    def _download(self, post_url: str, cdn_url: str, path: str = 'media', chunk_size: int = 4096) -> None:
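        # Stream the file from the CDN to `{path}/{name}_{ext}`, where `name` is derived from
        # the tweet URL path and `ext` is the final path segment of the CDN URL.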
        Path(path).mkdir(parents=True, exist_ok=True)
        name = urlsplit(post_url).path.replace('/', '_')[1:]
        ext = urlsplit(cdn_url).path.split('/')[-1]
        try:
            with httpx.stream('GET', cdn_url) as r:
                with open(f'{path}/{name}_{ext}', 'wb') as f:
                    for chunk in r.iter_bytes(chunk_size=chunk_size):
                        f.write(chunk)
        except Exception as e:
            self.logger.debug(f'[{RED}error{RESET}] failed to download media: {post_url} {e}')

    def trends(self) -> dict:
        """Get trends for all UTC offsets"""

        def get_trends(offset: str, url: str, headers: dict):
            headers['x-twitter-utcoffset'] = offset
            r = self.session.get(url, headers=headers)
            trends = find_key(r.json(), 'item')
            return {t['content']['trend']['name']: t for t in trends}

        headers = get_headers(self.session)
        url = set_qs('https://twitter.com/i/api/2/guide.json', trending_params)
        offsets = [f"{str(i).zfill(3)}00" if i < 0 else f"+{str(i).zfill(2)}00" for i in range(-12, 15)]
        trends = {}
        with tqdm(total=len(offsets), desc='downloading trends') as pbar:
            with ThreadPoolExecutor(max_workers=32) as e:
                for future in as_completed(e.submit(get_trends, o, url, headers) for o in offsets):
                    trends |= future.result()
                    pbar.update()

        path = Path('data/raw/trends')
        path.mkdir(parents=True, exist_ok=True)
        (path / f'{time.time_ns()}.json').write_text(
            orjson.dumps(trends, option=orjson.OPT_INDENT_2).decode(),
            encoding='utf-8'
        )
        return trends

    def log(self, r: Response, txt: str, data: dict):
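        # Log the response status; at debug >= 1 also log the request URL, at debug >= 2 the
        # raw body, plus the x-rate-limit-* headers (remaining requests and reset time).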
        status = r.status_code

        def stat(r):
            if self.debug >= 1:
                self.logger.debug(f'{r.url}')
            if self.debug >= 2:
                self.logger.debug(f'{txt}')

            limits = {k: v for k, v in r.headers.items() if 'x-rate-limit' in k}
            current_time = int(time.time())
            wait = int(r.headers.get('x-rate-limit-reset', current_time)) - current_time
            self.logger.debug(
                f"remaining: {MAGENTA}{limits['x-rate-limit-remaining']}/{limits['x-rate-limit-limit']}{RESET} requests")
            self.logger.debug(f'reset: {MAGENTA}{(wait / 60):.2f}{RESET} minutes')

        try:
            if 'json' in r.headers.get('content-type', ''):
                if data.get('errors'):
                    self.logger.debug(f'[{RED}error{RESET}] {status} {data}')
                else:
                    self.logger.debug(fmt_status(status))
                    stat(r)
            else:
                self.logger.debug(fmt_status(status))
                stat(r)
        except Exception as e:
            self.logger.debug(f'failed to log: {e}')