add new batched endpoint TweetResultsByRestIds

Trevor Hobenshield
2023-12-07 12:28:56 -08:00
parent f15d0e29c9
commit a4d8a83763
9 changed files with 133 additions and 88 deletions

View File

@@ -215,8 +215,20 @@ account.change_password('old pwd','new pwd')
#### Get all user/tweet data
Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.
| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |
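A minimal usage sketch of the two batched endpoints versus their unbatched counterparts (not part of this diff; the credentials and IDs below are placeholders):

```python
from twitter.scraper import Scraper

email, username, password = ..., ..., ...
scraper = Scraper(email, username, password)

# batched: one request covers up to ~220 ids and draws from the larger rate-limit pool
tweets = scraper.tweets_by_ids([987, 876, 754])
users = scraper.users_by_ids([123, 234, 345])

# unbatched counterparts issue one request per id
tweets = scraper.tweets_by_id([987, 876, 754])
users = scraper.users_by_id([123, 234, 345])
```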
![](assets/scrape.gif)
*As of Fall 2023, login by username/password is unstable; using cookies is now recommended.*
```python
from twitter.scraper import Scraper
@@ -236,7 +248,8 @@ scraper = Scraper(email, username, password)
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -250,8 +263,9 @@ scraper.recommended_users()
scraper.recommended_users([123])
# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])

View File

@@ -3,12 +3,13 @@ from setuptools import find_packages, setup
from pathlib import Path
install_requires = [
"aiofiles",
"websockets",
"nest_asyncio",
"httpx",
"tqdm",
"orjson",
'aiofiles',
'nest_asyncio',
'httpx',
'tqdm',
'orjson',
'm3u8',
'websockets',
'uvloop; platform_system != "Windows"',
]
@@ -239,7 +240,18 @@ setup(
### Scraping
#### Get all user/tweet data
Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.
| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |
*As of Fall 2023, login by username/password is unstable; using cookies is now recommended.*
```python
from twitter.scraper import Scraper
@@ -259,7 +271,8 @@ setup(
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -267,24 +280,25 @@ setup(
following = scraper.following([123, 234, 345])
followers = scraper.followers([123, 234, 345])
scraper.tweet_stats([111111, 222222, 333333])
# get recommended users based on user
scraper.recommended_users()
scraper.recommended_users([123])
# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])
scraper.download_media([
111111,
222222,
333333,
444444,
])
# trends
scraper.trends()
```

View File

@@ -1,5 +1,5 @@
__title__ = "twitter-api-client"
__description__ = "Implementation of X/Twitter v1, v2, and GraphQL APIs."
__version__ = "0.10.12"
__version__ = "0.10.13"
__author__ = "Trevor Hobenshield"
__license__ = "MIT"

View File

@@ -3,7 +3,6 @@ import hashlib
import logging.config
import math
import mimetypes
import platform
from copy import deepcopy
from datetime import datetime
from string import ascii_letters
@@ -18,20 +17,18 @@ from .login import login
from .util import *
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio
nest_asyncio.apply()
nest_asyncio.apply()
except:
...
if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop
uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...
class Account:

View File

@@ -1,5 +1,10 @@
from dataclasses import dataclass
# todo: not an accurate measure; the value will decrease as new gql features/variables are required (the actual limitation is request size, i.e. new gql features and variables contribute to total request size)
MAX_GQL_CHAR_LIMIT = 4_200
MAX_ENDPOINT_LIMIT = 500 # 500/15 mins
MAX_IMAGE_SIZE = 5_242_880 # ~5 MB
MAX_GIF_SIZE = 15_728_640 # ~15 MB
MAX_VIDEO_SIZE = 536_870_912 # ~530 MB
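For context, the ~220 batch size quoted in the README follows roughly from this character budget; a quick check, assuming 19-digit snowflake IDs (the ID-length figure is my assumption, not from this commit):

```python
# rough sanity check, not part of the commit: how many ids fit in one batch?
MAX_GQL_CHAR_LIMIT = 4_200  # value from constants.py above
ID_DIGITS = 19              # assumption: current tweet ids are 19-digit snowflakes

print(MAX_GQL_CHAR_LIMIT // ID_DIGITS)  # 221, i.e. the ~220 batch size in the README
```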
@@ -105,6 +110,7 @@ class Operation:
UserMedia = {'userId': int}, 'YqiE3JL1KNgf9nSljYdxaA', 'UserMedia'
UserTweetsAndReplies = {'userId': int}, 'RIWc55YCNyUJ-U3HHGYkdg', 'UserTweetsAndReplies'
TweetResultByRestId = {'tweetId': int}, 'D_jNhjWZeRZT5NURzfJZSQ', 'TweetResultByRestId'
TweetResultsByRestIds = {'tweetIds': list[int | str]}, 'BWy5aoI-WvwbeSiHUIf2Hw', 'TweetResultsByRestIds'
TweetDetail = {'focalTweetId': int}, 'zXaXQgfyR4GxE21uwYQSyA', 'TweetDetail'
TweetStats = {'rest_id': int}, 'EvbTkPDT-xQCfupPu0rWMA', 'TweetStats'
Likes = {'userId': int}, 'nXEl0lfN_XSznVMlprThgQ', 'Likes'
@@ -360,6 +366,10 @@ class Operation:
'withMessages': True,
}
default_features = {
# new
'c9s_tweet_anatomy_moderator_badge_enabled': True,
'responsive_web_home_pinned_timelines_enabled': True,
'blue_business_profile_image_shape_enabled': True,
'creator_subscriptions_tweet_preview_api_enabled': True,
'freedom_of_speech_not_reach_fetch_enabled': True,

View File

@@ -2,7 +2,7 @@ import sys
from httpx import Client
from .constants import GREEN, YELLOW, RED, BOLD, RESET
from .constants import YELLOW, RED, BOLD, RESET
from .util import find_key

View File

@@ -1,9 +1,7 @@
import asyncio
import logging.config
import math
import platform
import aiofiles
import websockets
from httpx import AsyncClient, Limits, ReadTimeout, URL
from tqdm.asyncio import tqdm_asyncio
@@ -13,20 +11,18 @@ from .login import login
from .util import *
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio
nest_asyncio.apply()
nest_asyncio.apply()
except:
...
if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop
uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...
class Scraper:
@@ -49,7 +45,7 @@ class Scraper:
"""
return self._run(Operation.UserByScreenName, screen_names, **kwargs)
def tweets_by_id(self, tweet_ids: list[int], **kwargs) -> list[dict]:
def tweets_by_id(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
@@ -59,6 +55,18 @@ class Scraper:
"""
return self._run(Operation.TweetResultByRestId, tweet_ids, **kwargs)
def tweets_by_ids(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
Special batch query for tweet data. Most efficient way to get tweets.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.TweetResultsByRestIds, batch_ids(tweet_ids), **kwargs)
def tweets_details(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweet data by tweet ids.
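The new `tweets_by_ids` above takes one flat list of ids and relies on `batch_ids` (rewritten in util.py further below) to split it into request-sized chunks before `_run` dispatches them. A hedged usage sketch (credentials and IDs are placeholders, not from this commit):

```python
from twitter.scraper import Scraper

email, username, password = ..., ..., ...
scraper = Scraper(email, username, password)

# pass one flat list; batch_ids() chunks it to stay under the GraphQL
# request-size limit, so 1,000 ids become only a handful of requests
ids = [1732000000000000000 + i for i in range(1_000)]  # placeholder ids
tweets = scraper.tweets_by_ids(ids)
```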
@@ -230,8 +238,7 @@ class Scraper:
"""
return self._run(Operation.UserByRestId, user_ids, **kwargs)
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192,
stream: bool = False) -> None:
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192, stream: bool = False) -> None:
"""
Download media from tweets by tweet ids.
@@ -515,12 +522,12 @@ class Scraper:
return asyncio.run(process())
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | dict], **kwargs):
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | list | dict], **kwargs):
keys, qid, name = operation
# stay within rate-limits
if (l := len(queries)) > 500:
if (l := len(queries)) > MAX_ENDPOINT_LIMIT:
self.logger.warning(f'Got {l} queries, truncating to first 500.')
queries = list(queries)[:500]
queries = list(queries)[:MAX_ENDPOINT_LIMIT]
if all(isinstance(q, dict) for q in queries):
data = asyncio.run(self._process(operation, list(queries), **kwargs))
@@ -542,14 +549,13 @@ class Scraper:
if self.debug:
log(self.logger, self.debug, r)
if self.save:
save_json(r, self.out, name, **kwargs)
await save_json(r, self.out, name, **kwargs)
return r
async def _process(self, operation: tuple, queries: list[dict], **kwargs):
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])

View File

@@ -19,20 +19,18 @@ reset = '\x1b[0m'
colors = [f'\x1b[{i}m' for i in range(31, 37)]
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio
nest_asyncio.apply()
nest_asyncio.apply()
except:
...
if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop
uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...
class Search:

View File

@@ -5,10 +5,12 @@ from logging import Logger
from pathlib import Path
from urllib.parse import urlsplit, urlencode, urlunsplit, parse_qs, quote
import aiofiles
import orjson
from aiofiles.os import makedirs
from httpx import Response, Client
from .constants import GREEN, MAGENTA, RED, RESET, ID_MAP
from .constants import GREEN, MAGENTA, RED, RESET, ID_MAP, MAX_GQL_CHAR_LIMIT
def init_session():
@@ -25,20 +27,17 @@ def init_session():
return client
def batch_ids(ids: list[int], char_limit: int = 4_500) -> list[dict]:
""" To avoid 431 errors """
length = 0
res, batch = [], []
def batch_ids(ids: list[int | str], char_limit: int = MAX_GQL_CHAR_LIMIT) -> list[list]:
"""To avoid 431 errors"""
res, batch, length = [], [], 0
for x in map(str, ids):
curr_length = len(x)
if length + curr_length > char_limit:
if length + len(x) > char_limit:
res.append(batch)
batch = []
length = 0
batch, length = [], 0
batch.append(x)
length += curr_length
if batch:
res.append(batch)
length += len(x)
res.append(batch) if batch else ...
print(f'Batched {sum(map(len, res))} ids into {len(res)} requests')
return res
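Since the old and new bodies are interleaved in the diff above, here is the rewritten `batch_ids` as it reads after this change, reconstructed from the added lines (a sketch, not the verbatim file), plus a small offline demonstration:

```python
MAX_GQL_CHAR_LIMIT = 4_200  # imported from .constants in the actual module

def batch_ids(ids: list[int | str], char_limit: int = MAX_GQL_CHAR_LIMIT) -> list[list]:
    """To avoid 431 errors"""
    res, batch, length = [], [], 0
    for x in map(str, ids):
        # start a new batch once adding this id would exceed the character budget
        if length + len(x) > char_limit:
            res.append(batch)
            batch, length = [], 0
        batch.append(x)
        length += len(x)
    res.append(batch) if batch else ...
    print(f'Batched {sum(map(len, res))} ids into {len(res)} requests')
    return res

# e.g. 500 nineteen-digit ids fit 221 per batch (221 * 19 = 4,199 <= 4,200),
# so this prints "Batched 500 ids into 3 requests"
batches = batch_ids([10**18 + i for i in range(500)])
```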
@@ -46,15 +45,22 @@ def build_params(params: dict) -> dict:
return {k: orjson.dumps(v).decode() for k, v in params.items()}
def save_json(r: Response, path: Path, name: str, **kwargs):
async def save_json(r: Response, path: str | Path, name: str, **kwargs):
try:
data = r.json()
kwargs.pop('cursor', None)
out = path / '_'.join(map(str, kwargs.values()))
out.mkdir(parents=True, exist_ok=True)
(out / f'{time.time_ns()}_{name}.json').write_bytes(orjson.dumps(data))
# special case: only 2 endpoints have batch requests as of Dec 2023
if name in {'TweetResultsByRestIds', 'UsersByRestIds'}:
out = f'{path}/batch'
else:
out = f'{path}/{"_".join(map(str, kwargs.values()))}'
await makedirs(out, exist_ok=True)
async with aiofiles.open(f'{out}/{time.time_ns()}_{name}.json', 'wb') as fp:
await fp.write(orjson.dumps(data))
except Exception as e:
print(f'Failed to save data: {e}')
print(f'Failed to save JSON data for {kwargs}\n{e}')
def flatten(seq: list | tuple) -> list:
@@ -212,19 +218,6 @@ def fmt_status(status: int) -> str:
return f'[{color}{status}{RESET}]'
def get_ids(data: list | dict, operation: tuple) -> set:
expr = ID_MAP[operation[-1]]
return {k for k in find_key(data, 'entryId') if re.search(expr, k)}
def dump(path: str, **kwargs):
fname, data = list(kwargs.items())[0]
out = Path(path)
out.mkdir(exist_ok=True, parents=True)
(out / f'{fname}_{time.time_ns()}.json').write_bytes(
orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))
def get_code(cls, retries=5) -> str | None:
""" Get verification code from Proton Mail inbox """
@@ -245,3 +238,16 @@ def get_code(cls, retries=5) -> str | None:
t = 2 ** i + random.random()
print(f'Retrying in {f"{t:.2f}"} seconds')
time.sleep(t)
# todo: to remove
def get_ids(data: list | dict, operation: tuple) -> set:
expr = ID_MAP[operation[-1]]
return {k for k in find_key(data, 'entryId') if re.search(expr, k)}
def dump(path: str, **kwargs):
fname, data = list(kwargs.items())[0]
out = Path(path)
out.mkdir(exist_ok=True, parents=True)
(out / f'{fname}_{time.time_ns()}.json').write_bytes(orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))