mirror of
https://github.com/trevorhobenshield/twitter-api-client.git
synced 2025-12-19 09:58:30 -05:00
add new batched endpoint TweetResultsByRestIds
This commit is contained in:
readme.md (20 changed lines)
@@ -215,8 +215,20 @@ account.change_password('old pwd','new pwd')

#### Get all user/tweet data

Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.

| Endpoint      | Batch Size | Rate Limit    |
|---------------|------------|---------------|
| tweets_by_ids | ~220       | 500 / 15 mins |
| tweets_by_id  | 1          | 50 / 15 mins  |
| users_by_ids  | ~220       | 100 / 15 mins |
| users_by_id   | 1          | 500 / 15 mins |

*As of Fall 2023, login by username/password is unstable. Using cookies is now recommended.*

```python
from twitter.scraper import Scraper
@@ -236,7 +248,8 @@ scraper = Scraper(email, username, password)
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -250,8 +263,9 @@ scraper.recommended_users()
scraper.recommended_users([123])

# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])
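Taken together, the readme hunks above describe the new batched workflow. A minimal end-to-end sketch of that usage follows; the cookie names/values and the ids are placeholders, and constructing `Scraper` from cookies is assumed from the readme's recommendation rather than shown in this diff.

```python
from twitter.scraper import Scraper

# Cookie-based auth, as the readme now recommends over username/password.
# The ct0/auth_token values here are placeholders, not real credentials.
scraper = Scraper(cookies={'ct0': '<ct0 cookie>', 'auth_token': '<auth_token cookie>'})

# Batched endpoints: roughly 220 ids per request, ~500 requests / 15 mins.
tweets = scraper.tweets_by_ids([987, 876, 754])  # preferred
users = scraper.users_by_ids([123, 234, 345])    # preferred

# Unbatched counterparts: one id per request, tighter rate limits.
tweets_single = scraper.tweets_by_id([987, 876, 754])
users_single = scraper.users_by_id([123, 234, 345])
```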
setup.py (42 changed lines)
@@ -3,12 +3,13 @@ from setuptools import find_packages, setup
from pathlib import Path

install_requires = [
    "aiofiles",
    "websockets",
    "nest_asyncio",
    "httpx",
    "tqdm",
    "orjson",
    'aiofiles',
    'nest_asyncio',
    'httpx',
    'tqdm',
    'orjson',
    'm3u8',
    'websockets',
    'uvloop; platform_system != "Windows"',
]
@@ -239,7 +240,18 @@ setup(
### Scraping

#### Get all user/tweet data

Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.

| Endpoint      | Batch Size | Rate Limit    |
|---------------|------------|---------------|
| tweets_by_ids | ~220       | 500 / 15 mins |
| tweets_by_id  | 1          | 50 / 15 mins  |
| users_by_ids  | ~220       | 100 / 15 mins |
| users_by_id   | 1          | 500 / 15 mins |

*As of Fall 2023, login by username/password is unstable. Using cookies is now recommended.*

```python
from twitter.scraper import Scraper
@@ -259,7 +271,8 @@ setup(
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -267,24 +280,25 @@ setup(
following = scraper.following([123, 234, 345])
followers = scraper.followers([123, 234, 345])
scraper.tweet_stats([111111, 222222, 333333])

# get recommended users based on user
scraper.recommended_users()
scraper.recommended_users([123])

# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])

scraper.download_media([
    111111,
    222222,
    333333,
    444444,
])

# trends
scraper.trends()
```
@@ -1,5 +1,5 @@
__title__ = "twitter-api-client"
__description__ = "Implementation of X/Twitter v1, v2, and GraphQL APIs."
__version__ = "0.10.12"
__version__ = "0.10.13"
__author__ = "Trevor Hobenshield"
__license__ = "MIT"
@@ -3,7 +3,6 @@ import hashlib
import logging.config
import math
import mimetypes
import platform
from copy import deepcopy
from datetime import datetime
from string import ascii_letters
@@ -18,20 +17,18 @@ from .login import login
from .util import *

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Account:
@@ -1,5 +1,10 @@
from dataclasses import dataclass

# todo: not an accurate measure. value will decrease as new gql features/variables are required. (actual limitation is request size, i.e. new gql features and variables contribute to total request size)
MAX_GQL_CHAR_LIMIT = 4_200

MAX_ENDPOINT_LIMIT = 500  # 500/15 mins

MAX_IMAGE_SIZE = 5_242_880  # ~5 MB
MAX_GIF_SIZE = 15_728_640  # ~15 MB
MAX_VIDEO_SIZE = 536_870_912  # ~530 MB
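The new `MAX_GQL_CHAR_LIMIT` also accounts for the "~220" batch size quoted in the readme table: if a typical rest id is about 19 characters long (an assumption about current snowflake-style ids, not something stated in this commit), roughly 4,200 / 19 ids fit under the request-size budget. A quick back-of-the-envelope check:

```python
# Rough sanity check of the "~220 ids per batch" figure.
MAX_GQL_CHAR_LIMIT = 4_200   # value introduced in this commit
TYPICAL_ID_LENGTH = 19       # assumed length of a modern tweet/user rest id

print(MAX_GQL_CHAR_LIMIT // TYPICAL_ID_LENGTH)  # 221, i.e. "~220" per batch
```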
@@ -105,6 +110,7 @@ class Operation:
UserMedia = {'userId': int}, 'YqiE3JL1KNgf9nSljYdxaA', 'UserMedia'
UserTweetsAndReplies = {'userId': int}, 'RIWc55YCNyUJ-U3HHGYkdg', 'UserTweetsAndReplies'
TweetResultByRestId = {'tweetId': int}, 'D_jNhjWZeRZT5NURzfJZSQ', 'TweetResultByRestId'
TweetResultsByRestIds = {'tweetIds': list[int | str]}, 'BWy5aoI-WvwbeSiHUIf2Hw', 'TweetResultsByRestIds'
TweetDetail = {'focalTweetId': int}, 'zXaXQgfyR4GxE21uwYQSyA', 'TweetDetail'
TweetStats = {'rest_id': int}, 'EvbTkPDT-xQCfupPu0rWMA', 'TweetStats'
Likes = {'userId': int}, 'nXEl0lfN_XSznVMlprThgQ', 'Likes'
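As elsewhere in `Operation`, the new `TweetResultsByRestIds` entry is a plain 3-tuple of expected variables, GraphQL query id, and operation name, which `_run` later unpacks (`keys, qid, name = operation` in the scraper hunk further down). A small orientation snippet, not part of the commit itself:

```python
from twitter.constants import Operation

# Each Operation attribute is (variables spec, query id, operation name).
keys, qid, name = Operation.TweetResultsByRestIds
print(keys)  # {'tweetIds': list[int | str]}
print(qid)   # 'BWy5aoI-WvwbeSiHUIf2Hw'
print(name)  # 'TweetResultsByRestIds'
```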
@@ -360,6 +366,10 @@ class Operation:
'withMessages': True,
}
default_features = {
    # new
    'c9s_tweet_anatomy_moderator_badge_enabled': True,
    'responsive_web_home_pinned_timelines_enabled': True,

    'blue_business_profile_image_shape_enabled': True,
    'creator_subscriptions_tweet_preview_api_enabled': True,
    'freedom_of_speech_not_reach_fetch_enabled': True,
@@ -2,7 +2,7 @@ import sys

from httpx import Client

from .constants import GREEN, YELLOW, RED, BOLD, RESET
from .constants import YELLOW, RED, BOLD, RESET
from .util import find_key
@@ -1,9 +1,7 @@
import asyncio
import logging.config
import math
import platform

import aiofiles
import websockets
from httpx import AsyncClient, Limits, ReadTimeout, URL
from tqdm.asyncio import tqdm_asyncio
@@ -13,20 +11,18 @@ from .login import login
from .util import *

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Scraper:
@@ -49,7 +45,7 @@ class Scraper:
"""
return self._run(Operation.UserByScreenName, screen_names, **kwargs)

def tweets_by_id(self, tweet_ids: list[int], **kwargs) -> list[dict]:
def tweets_by_id(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.

@@ -59,6 +55,18 @@ class Scraper:
"""
return self._run(Operation.TweetResultByRestId, tweet_ids, **kwargs)

def tweets_by_ids(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.

Special batch query for tweet data. Most efficient way to get tweets.

@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.TweetResultsByRestIds, batch_ids(tweet_ids), **kwargs)

def tweets_details(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweet data by tweet ids.
@@ -230,8 +238,7 @@ class Scraper:
"""
return self._run(Operation.UserByRestId, user_ids, **kwargs)

def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192,
stream: bool = False) -> None:
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192, stream: bool = False) -> None:
"""
Download media from tweets by tweet ids.
@@ -515,12 +522,12 @@ class Scraper:

return asyncio.run(process())

def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | dict], **kwargs):
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | list | dict], **kwargs):
keys, qid, name = operation
# stay within rate-limits
if (l := len(queries)) > 500:
if (l := len(queries)) > MAX_ENDPOINT_LIMIT:
self.logger.warning(f'Got {l} queries, truncating to first 500.')
queries = list(queries)[:500]
queries = list(queries)[:MAX_ENDPOINT_LIMIT]

if all(isinstance(q, dict) for q in queries):
data = asyncio.run(self._process(operation, list(queries), **kwargs))
@@ -542,14 +549,13 @@ class Scraper:
if self.debug:
log(self.logger, self.debug, r)
if self.save:
save_json(r, self.out, name, **kwargs)
await save_json(r, self.out, name, **kwargs)
return r

async def _process(self, operation: tuple, queries: list[dict], **kwargs):
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])
@@ -19,20 +19,18 @@ reset = '\x1b[0m'
colors = [f'\x1b[{i}m' for i in range(31, 37)]

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Search:
@@ -5,10 +5,12 @@ from logging import Logger
from pathlib import Path
from urllib.parse import urlsplit, urlencode, urlunsplit, parse_qs, quote

import aiofiles
import orjson
from aiofiles.os import makedirs
from httpx import Response, Client

from .constants import GREEN, MAGENTA, RED, RESET, ID_MAP
from .constants import GREEN, MAGENTA, RED, RESET, ID_MAP, MAX_GQL_CHAR_LIMIT


def init_session():
@@ -25,20 +27,17 @@ def init_session():
return client


def batch_ids(ids: list[int], char_limit: int = 4_500) -> list[dict]:
""" To avoid 431 errors """
length = 0
res, batch = [], []
def batch_ids(ids: list[int | str], char_limit: int = MAX_GQL_CHAR_LIMIT) -> list[list]:
"""To avoid 431 errors"""
res, batch, length = [], [], 0
for x in map(str, ids):
curr_length = len(x)
if length + curr_length > char_limit:
if length + len(x) > char_limit:
res.append(batch)
batch = []
length = 0
batch, length = [], 0
batch.append(x)
length += curr_length
if batch:
res.append(batch)
length += len(x)
res.append(batch) if batch else ...
print(f'Batched {sum(map(len, res))} ids into {len(res)} requests')
return res
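Under the new signature, `batch_ids` stringifies the ids and packs them into sub-lists whose combined character length stays under `MAX_GQL_CHAR_LIMIT`. A short illustration with made-up 19-digit ids (the numbers are arbitrary; only the batching behaviour is the point):

```python
from twitter.util import batch_ids

# 1,000 fake 19-digit ids; each batch's total string length stays under
# the 4,200-character budget, i.e. about 221 ids per batch.
ids = [1_600_000_000_000_000_000 + i for i in range(1_000)]
batches = batch_ids(ids)

print(len(batches))         # 5 batches
print(len(batches[0]))      # 221 ids in the first batch
print(type(batches[0][0]))  # <class 'str'>, ids are stringified
```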
@@ -46,15 +45,22 @@ def build_params(params: dict) -> dict:
return {k: orjson.dumps(v).decode() for k, v in params.items()}


def save_json(r: Response, path: Path, name: str, **kwargs):
async def save_json(r: Response, path: str | Path, name: str, **kwargs):
try:
data = r.json()
kwargs.pop('cursor', None)
out = path / '_'.join(map(str, kwargs.values()))
out.mkdir(parents=True, exist_ok=True)
(out / f'{time.time_ns()}_{name}.json').write_bytes(orjson.dumps(data))

# special case: only 2 endpoints have batch requests as of Dec 2023
if name in {'TweetResultsByRestIds', 'UsersByRestIds'}:
out = f'{path}/batch'
else:
out = f'{path}/{"_".join(map(str, kwargs.values()))}'
await makedirs(out, exist_ok=True)
async with aiofiles.open(f'{out}/{time.time_ns()}_{name}.json', 'wb') as fp:
await fp.write(orjson.dumps(data))

except Exception as e:
print(f'Failed to save data: {e}')
print(f'Failed to save JSON data for {kwargs}\n{e}')


def flatten(seq: list | tuple) -> list:
@@ -212,19 +218,6 @@ def fmt_status(status: int) -> str:
return f'[{color}{status}{RESET}]'


def get_ids(data: list | dict, operation: tuple) -> set:
expr = ID_MAP[operation[-1]]
return {k for k in find_key(data, 'entryId') if re.search(expr, k)}


def dump(path: str, **kwargs):
fname, data = list(kwargs.items())[0]
out = Path(path)
out.mkdir(exist_ok=True, parents=True)
(out / f'{fname}_{time.time_ns()}.json').write_bytes(
orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))


def get_code(cls, retries=5) -> str | None:
""" Get verification code from Proton Mail inbox """
@@ -245,3 +238,16 @@ def get_code(cls, retries=5) -> str | None:
t = 2 ** i + random.random()
print(f'Retrying in {f"{t:.2f}"} seconds')
time.sleep(t)


# todo: to remove
def get_ids(data: list | dict, operation: tuple) -> set:
expr = ID_MAP[operation[-1]]
return {k for k in find_key(data, 'entryId') if re.search(expr, k)}


def dump(path: str, **kwargs):
fname, data = list(kwargs.items())[0]
out = Path(path)
out.mkdir(exist_ok=True, parents=True)
(out / f'{fname}_{time.time_ns()}.json').write_bytes(orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))