add new batched endpoint TweetResultsByRestIds

Trevor Hobenshield
2023-12-07 12:28:56 -08:00
parent f15d0e29c9
commit a4d8a83763
9 changed files with 133 additions and 88 deletions

View File

@@ -215,8 +215,20 @@ account.change_password('old pwd','new pwd')
#### Get all user/tweet data
Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.
| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |
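A minimal usage sketch of the two batched endpoints versus their unbatched counterparts (not part of this diff; the credentials and IDs below are placeholders):

```python
from twitter.scraper import Scraper

email, username, password = ..., ..., ...
scraper = Scraper(email, username, password)

# batched: one request covers up to ~220 ids and draws from the larger rate-limit pool
tweets = scraper.tweets_by_ids([987, 876, 754])
users = scraper.users_by_ids([123, 234, 345])

# unbatched counterparts issue one request per id
tweets = scraper.tweets_by_id([987, 876, 754])
users = scraper.users_by_id([123, 234, 345])
```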
![](assets/scrape.gif)
*As of Fall 2023, login by username/password is unstable; using cookies is now recommended.*
```python
from twitter.scraper import Scraper
@@ -236,7 +248,8 @@ scraper = Scraper(email, username, password)
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -250,8 +263,9 @@ scraper.recommended_users()
scraper.recommended_users([123])
# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])

View File

@@ -3,12 +3,13 @@ from setuptools import find_packages, setup
from pathlib import Path
install_requires = [
"aiofiles",
"websockets",
"nest_asyncio",
"httpx",
"tqdm",
"orjson",
'aiofiles',
'nest_asyncio',
'httpx',
'tqdm',
'orjson',
'm3u8',
'websockets',
'uvloop; platform_system != "Windows"',
]
@@ -239,7 +240,18 @@ setup(
### Scraping
#### Get all user/tweet data
Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.
| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |
*As of Fall 2023, login by username/password is unstable; using cookies is now recommended.*
```python
from twitter.scraper import Scraper
@@ -259,7 +271,8 @@ setup(
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -267,24 +280,25 @@ setup(
following = scraper.following([123, 234, 345])
followers = scraper.followers([123, 234, 345])
scraper.tweet_stats([111111, 222222, 333333])
# get recommended users based on user
scraper.recommended_users()
scraper.recommended_users([123])
# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])
scraper.download_media([
111111,
222222,
333333,
444444,
])
# trends
scraper.trends()
```

View File

@@ -1,5 +1,5 @@
__title__ = "twitter-api-client"
__description__ = "Implementation of X/Twitter v1, v2, and GraphQL APIs."
__version__ = "0.10.12"
__version__ = "0.10.13"
__author__ = "Trevor Hobenshield"
__license__ = "MIT"

View File

@@ -3,7 +3,6 @@ import hashlib
import logging.config
import math
import mimetypes
import platform
from copy import deepcopy
from datetime import datetime
from string import ascii_letters
@@ -18,20 +17,18 @@ from .login import login
from .util import *
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio
nest_asyncio.apply()
nest_asyncio.apply()
except:
...
if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop
uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...
class Account:

View File

@@ -1,5 +1,10 @@
from dataclasses import dataclass
# todo: not an accurate measure; the value will decrease as new gql features/variables are required (the actual limitation is request size, i.e. new gql features and variables contribute to total request size)
MAX_GQL_CHAR_LIMIT = 4_200
MAX_ENDPOINT_LIMIT = 500 # 500/15 mins
MAX_IMAGE_SIZE = 5_242_880 # ~5 MB
MAX_GIF_SIZE = 15_728_640 # ~15 MB
MAX_VIDEO_SIZE = 536_870_912 # ~530 MB
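For context, the ~220 batch size quoted in the README follows roughly from this character budget; a quick check, assuming 19-digit snowflake IDs (the ID-length figure is my assumption, not from this commit):

```python
# rough sanity check, not part of the commit: how many ids fit in one batch?
MAX_GQL_CHAR_LIMIT = 4_200  # value from constants.py above
ID_DIGITS = 19              # assumption: current tweet ids are 19-digit snowflakes

print(MAX_GQL_CHAR_LIMIT // ID_DIGITS)  # 221, i.e. the ~220 batch size in the README
```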
@@ -105,6 +110,7 @@ class Operation:
UserMedia = {'userId': int}, 'YqiE3JL1KNgf9nSljYdxaA', 'UserMedia'
UserTweetsAndReplies = {'userId': int}, 'RIWc55YCNyUJ-U3HHGYkdg', 'UserTweetsAndReplies'
TweetResultByRestId = {'tweetId': int}, 'D_jNhjWZeRZT5NURzfJZSQ', 'TweetResultByRestId'
TweetResultsByRestIds = {'tweetIds': list[int | str]}, 'BWy5aoI-WvwbeSiHUIf2Hw', 'TweetResultsByRestIds'
TweetDetail = {'focalTweetId': int}, 'zXaXQgfyR4GxE21uwYQSyA', 'TweetDetail'
TweetStats = {'rest_id': int}, 'EvbTkPDT-xQCfupPu0rWMA', 'TweetStats'
Likes = {'userId': int}, 'nXEl0lfN_XSznVMlprThgQ', 'Likes'
@@ -360,6 +366,10 @@ class Operation:
'withMessages': True,
}
default_features = {
# new
'c9s_tweet_anatomy_moderator_badge_enabled': True,
'responsive_web_home_pinned_timelines_enabled': True,
'blue_business_profile_image_shape_enabled': True,
'creator_subscriptions_tweet_preview_api_enabled': True,
'freedom_of_speech_not_reach_fetch_enabled': True,

View File

@@ -2,7 +2,7 @@ import sys
from httpx import Client
from .constants import GREEN, YELLOW, RED, BOLD, RESET
from .constants import YELLOW, RED, BOLD, RESET
from .util import find_key

View File

@@ -1,9 +1,7 @@
import asyncio
import logging.config
import math
import platform
import aiofiles
import websockets
from httpx import AsyncClient, Limits, ReadTimeout, URL
from tqdm.asyncio import tqdm_asyncio
@@ -13,20 +11,18 @@ from .login import login
from .util import *
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio
nest_asyncio.apply()
nest_asyncio.apply()
except:
...
if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop
uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...
class Scraper:
@@ -49,7 +45,7 @@ class Scraper:
"""
return self._run(Operation.UserByScreenName, screen_names, **kwargs)
def tweets_by_id(self, tweet_ids: list[int], **kwargs) -> list[dict]:
def tweets_by_id(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
@@ -59,6 +55,18 @@ class Scraper:
"""
return self._run(Operation.TweetResultByRestId, tweet_ids, **kwargs)
def tweets_by_ids(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
Special batch query for tweet data. Most efficient way to get tweets.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.TweetResultsByRestIds, batch_ids(tweet_ids), **kwargs)
def tweets_details(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweet data by tweet ids.
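The new `tweets_by_ids` above takes one flat list of ids and relies on `batch_ids` (rewritten in util.py further below) to split it into request-sized chunks before `_run` dispatches them. A hedged usage sketch (credentials and IDs are placeholders, not from this commit):

```python
from twitter.scraper import Scraper

email, username, password = ..., ..., ...
scraper = Scraper(email, username, password)

# pass one flat list; batch_ids() chunks it to stay under the GraphQL
# request-size limit, so 1,000 ids become only a handful of requests
ids = [1732000000000000000 + i for i in range(1_000)]  # placeholder ids
tweets = scraper.tweets_by_ids(ids)
```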
@@ -230,8 +238,7 @@ class Scraper:
"""
return self._run(Operation.UserByRestId, user_ids, **kwargs)
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192,
stream: bool = False) -> None:
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192, stream: bool = False) -> None:
"""
Download media from tweets by tweet ids.
@@ -515,12 +522,12 @@ class Scraper:
return asyncio.run(process())
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | dict], **kwargs):
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | list | dict], **kwargs):
keys, qid, name = operation
# stay within rate-limits
if (l := len(queries)) > 500:
if (l := len(queries)) > MAX_ENDPOINT_LIMIT:
self.logger.warning(f'Got {l} queries, truncating to first 500.')
queries = list(queries)[:500]
queries = list(queries)[:MAX_ENDPOINT_LIMIT]
if all(isinstance(q, dict) for q in queries):
data = asyncio.run(self._process(operation, list(queries), **kwargs))
@@ -542,14 +549,13 @@ class Scraper:
if self.debug:
log(self.logger, self.debug, r)
if self.save:
save_json(r, self.out, name, **kwargs)
await save_json(r, self.out, name, **kwargs)
return r
async def _process(self, operation: tuple, queries: list[dict], **kwargs):
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])

View File

@@ -19,20 +19,18 @@ reset = '\x1b[0m'
colors = [f'\x1b[{i}m' for i in range(31, 37)]
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio
nest_asyncio.apply()
nest_asyncio.apply()
except:
...
if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop
uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...
class Search:

View File

@@ -5,10 +5,12 @@ from logging import Logger
from pathlib import Path
from urllib.parse import urlsplit, urlencode, urlunsplit, parse_qs, quote
import aiofiles
import orjson
from aiofiles.os import makedirs
from httpx import Response, Client
from .constants import GREEN, MAGENTA, RED, RESET, ID_MAP
from .constants import GREEN, MAGENTA, RED, RESET, ID_MAP, MAX_GQL_CHAR_LIMIT
def init_session():
@@ -25,20 +27,17 @@ def init_session():
return client
def batch_ids(ids: list[int], char_limit: int = 4_500) -> list[dict]:
""" To avoid 431 errors """
length = 0
res, batch = [], []
def batch_ids(ids: list[int | str], char_limit: int = MAX_GQL_CHAR_LIMIT) -> list[list]:
"""To avoid 431 errors"""
res, batch, length = [], [], 0
for x in map(str, ids):
curr_length = len(x)
if length + curr_length > char_limit:
if length + len(x) > char_limit:
res.append(batch)
batch = []
length = 0
batch, length = [], 0
batch.append(x)
length += curr_length
if batch:
res.append(batch)
length += len(x)
res.append(batch) if batch else ...
print(f'Batched {sum(map(len, res))} ids into {len(res)} requests')
return res
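Since the old and new bodies are interleaved in the diff above, here is the rewritten `batch_ids` as it reads after this change, reconstructed from the added lines (a sketch, not the verbatim file), plus a small offline demonstration:

```python
MAX_GQL_CHAR_LIMIT = 4_200  # imported from .constants in the actual module

def batch_ids(ids: list[int | str], char_limit: int = MAX_GQL_CHAR_LIMIT) -> list[list]:
    """To avoid 431 errors"""
    res, batch, length = [], [], 0
    for x in map(str, ids):
        # start a new batch once adding this id would exceed the character budget
        if length + len(x) > char_limit:
            res.append(batch)
            batch, length = [], 0
        batch.append(x)
        length += len(x)
    res.append(batch) if batch else ...
    print(f'Batched {sum(map(len, res))} ids into {len(res)} requests')
    return res

# e.g. 500 nineteen-digit ids fit 221 per batch (221 * 19 = 4,199 <= 4,200),
# so this prints "Batched 500 ids into 3 requests"
batches = batch_ids([10**18 + i for i in range(500)])
```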
@@ -46,15 +45,22 @@ def build_params(params: dict) -> dict:
return {k: orjson.dumps(v).decode() for k, v in params.items()}
def save_json(r: Response, path: Path, name: str, **kwargs):
async def save_json(r: Response, path: str | Path, name: str, **kwargs):
try:
data = r.json()
kwargs.pop('cursor', None)
out = path / '_'.join(map(str, kwargs.values()))
out.mkdir(parents=True, exist_ok=True)
(out / f'{time.time_ns()}_{name}.json').write_bytes(orjson.dumps(data))
# special case: only 2 endpoints have batch requests as of Dec 2023
if name in {'TweetResultsByRestIds', 'UsersByRestIds'}:
out = f'{path}/batch'
else:
out = f'{path}/{"_".join(map(str, kwargs.values()))}'
await makedirs(out, exist_ok=True)
async with aiofiles.open(f'{out}/{time.time_ns()}_{name}.json', 'wb') as fp:
await fp.write(orjson.dumps(data))
except Exception as e:
print(f'Failed to save data: {e}')
print(f'Failed to save JSON data for {kwargs}\n{e}')
def flatten(seq: list | tuple) -> list:
@@ -212,19 +218,6 @@ def fmt_status(status: int) -> str:
return f'[{color}{status}{RESET}]'
def get_ids(data: list | dict, operation: tuple) -> set:
expr = ID_MAP[operation[-1]]
return {k for k in find_key(data, 'entryId') if re.search(expr, k)}
def dump(path: str, **kwargs):
fname, data = list(kwargs.items())[0]
out = Path(path)
out.mkdir(exist_ok=True, parents=True)
(out / f'{fname}_{time.time_ns()}.json').write_bytes(
orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))
def get_code(cls, retries=5) -> str | None:
""" Get verification code from Proton Mail inbox """
@@ -245,3 +238,16 @@ def get_code(cls, retries=5) -> str | None:
t = 2 ** i + random.random()
print(f'Retrying in {f"{t:.2f}"} seconds')
time.sleep(t)
# todo: to remove
def get_ids(data: list | dict, operation: tuple) -> set:
expr = ID_MAP[operation[-1]]
return {k for k in find_key(data, 'entryId') if re.search(expr, k)}
def dump(path: str, **kwargs):
fname, data = list(kwargs.items())[0]
out = Path(path)
out.mkdir(exist_ok=True, parents=True)
(out / f'{fname}_{time.time_ns()}.json').write_bytes(orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))