add new GraphQL endpoint /SearchTimeline; api.twitter.com/2/search/adaptive.json is deprecated

trevor hobenshield
2023-07-02 12:09:53 -07:00
parent 55425b7181
commit afba99f2eb
6 changed files with 270 additions and 153 deletions
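In practical terms, search moves from one REST endpoint with plain query-string parameters to a GraphQL GET whose `variables`, `features`, and `fieldToggles` travel as JSON-encoded query parameters. A minimal sketch of the new request shape, assuming auth headers from a logged-in session (the token and csrf values below are placeholders, and the trimmed `features` dict stands in for `Operation.default_features` from twitter/constants.py):

```python
import json

import httpx

# query id and operation name registered for Operation.SearchTimeline in this commit
QID, NAME = 'nK1dw4oV3k4w5TdtcAdSww', 'SearchTimeline'

params = {
    # GraphQL arguments are serialized to JSON strings in the query string
    'variables': json.dumps({'rawQuery': 'ios android', 'product': 'Latest',
                             'count': 20, 'querySource': 'typed_query'}),
    'features': json.dumps({'responsive_web_graphql_timeline_navigation_enabled': True}),
    'fieldToggles': json.dumps({'withArticleRichContentState': False}),
}
headers = {
    'authorization': 'Bearer <token>',     # placeholder
    'x-csrf-token': '<ct0 cookie value>',  # placeholder
}
r = httpx.get(f'https://twitter.com/i/api/graphql/{QID}/{NAME}', params=params, headers=headers)
print(r.status_code)  # 200 with a valid session; 401/403 otherwise
```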

examples/postprocess.py Normal file

@@ -0,0 +1,96 @@
import re

import pandas as pd

from twitter.util import find_key


def get_tweets(data: list | dict, cols: list[str] = None) -> pd.DataFrame:
    """
    Convert raw GraphQL response to DataFrame

    @param data: tweets
    @param cols: option to only include certain columns
    @return: DataFrame of tweets
    """
    entries = [y for x in find_key(data, 'entries') for y in x]
    # filter out promoted tweets
    tweets = [x for x in entries if not x.get('entryId', '').startswith('promoted')]
    df = (
        pd.json_normalize(find_key(tweets, 'tweet_results'), max_level=1)
        ['result.legacy'].apply(pd.Series)
        .dropna(subset='user_id_str')
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
        .sort_values('created_at', ascending=False)
        .reset_index(drop=True)
    )
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]
    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
    cols = cols or [
        'id_str',
        'user_id_str',
        'created_at',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]
    return df[cols]


def get_tweets_urls(data: dict | list, expr: str, cols: list[str] = None) -> pd.DataFrame:
    """
    Convert raw GraphQL response to DataFrame
    Search for tweets containing specific urls by regex

    @param data: tweets
    @param expr: regex to match urls
    @param cols: option to only include certain columns
    @return: DataFrame of tweets matching the expression
    """
    tweet_results = find_key(data, 'tweet_results')
    results = []
    for res in tweet_results:
        legacy = res.get('result', {}).get('legacy', {})
        urls = find_key(res, 'expanded_url')
        if any(re.search(expr, x) for x in urls):
            results.append({'urls': urls} | legacy)
    try:
        df = (
            pd.DataFrame(results)
            # parse created_at before sorting so the sort is chronological, not lexicographic
            .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
            .sort_values('created_at', ascending=False)
            .reset_index(drop=True)
        )
        numeric = [
            'user_id_str',
            'id_str',
            'favorite_count',
            'quote_count',
            'reply_count',
            'retweet_count',
        ]
        df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
        cols = cols or [
            'id_str',
            'user_id_str',
            'created_at',
            'urls',
            'full_text',
            'favorite_count',
            'quote_count',
            'reply_count',
            'retweet_count',
            'lang',
        ]
        return df[cols]
    except Exception as e:
        print(e)
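A hypothetical usage sketch for these helpers (the input path is made up; `data` is one or more raw GraphQL responses containing timeline `entries`, e.g. captured from the new SearchTimeline endpoint):

```python
import orjson
from pathlib import Path

from postprocess import get_tweets, get_tweets_urls  # this file, assuming it is importable

data = orjson.loads(Path('raw_response.json').read_bytes())  # hypothetical capture

df = get_tweets(data, cols=['created_at', 'full_text', 'favorite_count'])
links = get_tweets_urls(data, expr=r'github\.com')  # tweets whose expanded urls hit GitHub
print(df.head())
```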

readme.md

@@ -296,28 +296,34 @@ follower_subset, last_cursor = scraper.followers([user_id], limit=limit, cursor=
from twitter.search import Search

email, username, password = ..., ..., ...

-# default output directory is `data/raw` if save=True
-search = Search(email, username, password)
+# default output directory is `data/search_results` if save=True
+search = Search(email, username, password, save=True, debug=1)

-latest_results = search.run(
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'ios android',
-    limit=100,
-    latest=True,  # get latest tweets only
-    retries=3,
-)
-
-general_results = search.run(
-    '(#dogs OR #cats) min_retweets:500',
-    'min_faves:10000 @elonmusk until:2023-02-16 since:2023-02-01',
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'skateboarding baseball guitar',
-    'cheese bread butter',
-    'ios android',
-    limit=100,
-    retries=7,
-)
+res = search.run(
+    limit=37,
+    retries=5,
+    queries=[
+        {
+            'category': 'Top',
+            'query': 'paperswithcode -tensorflow -tf'
+        },
+        {
+            'category': 'Latest',
+            'query': 'test'
+        },
+        {
+            'category': 'People',
+            'query': 'brasil portugal -argentina'
+        },
+        {
+            'category': 'Photos',
+            'query': 'greece'
+        },
+        {
+            'category': 'Videos',
+            'query': 'italy'
+        },
+    ],
+)
```
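With `save=True`, each page of entries is written to `data/search_results/<nanosecond-timestamp>.json` (see twitter/search.py below). A small sketch, assuming a prior run, for gathering the saved pages:

```python
import orjson
from pathlib import Path

entries = []
for f in sorted(Path('data/search_results').glob('*.json')):
    entries.extend(orjson.loads(f.read_bytes()))  # each file holds a list of timeline entries

# Search.get tags every entry with its originating query string
print(len(entries), {e.get('query') for e in entries})
```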

setup.py

@@ -315,28 +315,34 @@ setup(
from twitter.search import Search

email, username, password = ..., ..., ...

-# default output directory is `data/raw` if save=True
-search = Search(email, username, password)
+# default output directory is `data/search_results` if save=True
+search = Search(email, username, password, save=True, debug=1)

-latest_results = search.run(
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'ios android',
-    limit=100,
-    latest=True,  # get latest tweets only
-    retries=3,
-)
-
-general_results = search.run(
-    '(#dogs OR #cats) min_retweets:500',
-    'min_faves:10000 @elonmusk until:2023-02-16 since:2023-02-01',
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'skateboarding baseball guitar',
-    'cheese bread butter',
-    'ios android',
-    limit=100,
-    retries=7,
-)
+res = search.run(
+    limit=37,
+    retries=5,
+    queries=[
+        {
+            'category': 'Top',
+            'query': 'paperswithcode -tensorflow -tf'
+        },
+        {
+            'category': 'Latest',
+            'query': 'test'
+        },
+        {
+            'category': 'People',
+            'query': 'brasil portugal -argentina'
+        },
+        {
+            'category': 'Photos',
+            'query': 'greece'
+        },
+        {
+            'category': 'Videos',
+            'query': 'italy'
+        },
+    ],
+)
```

twitter/__init__.py

@@ -1,5 +1,5 @@
__title__ = "twitter-api-client"
__description__ = "Implementation of Twitter's v1, v2, and GraphQL APIs."
__version__ = "0.10.6"
__version__ = "0.10.7"
__author__ = "Trevor Hobenshield"
__license__ = "MIT"

twitter/constants.py

@@ -65,6 +65,15 @@ ID_MAP = {
}
+@dataclass
+class SearchCategory:
+    Top = 'Top'
+    Latest = 'Latest'
+    People = 'People'
+    Photos = 'Photos'
+    Videos = 'Videos'
+
@dataclass
class SpaceCategory:
    Top = 'Top'
@@ -85,6 +94,7 @@ class SpaceState:
@dataclass
class Operation:
    # todo: dynamically update
+    SearchTimeline = {'rawQuery': str, 'product': str}, 'nK1dw4oV3k4w5TdtcAdSww', 'SearchTimeline'
    AudioSpaceById = {'id': str}, 'fYAuJHiY3TmYdBmrRtIKhA', 'AudioSpaceById'
    AudioSpaceSearch = {'filter': str, 'query': str}, 'NTq79TuSz6fHj8lQaferJw', 'AudioSpaceSearch'
    UserByScreenName = {'screen_name': str}, 'sLVLhk0bGj3MVFEKTdax1w', 'UserByScreenName'
@@ -272,7 +282,6 @@ class Operation:
    RitoFlaggedAccountsTimeline = 'lMzaBZHIbD6GuPqJJQubMg', 'RitoFlaggedAccountsTimeline'
    RitoFlaggedTweetsTimeline = 'iCuXMibh6yj9AelyjKXDeA', 'RitoFlaggedTweetsTimeline'
    RitoSuggestedActionsFacePile = 'GnQKeEdL1LyeK3dTQCS1yw', 'RitoSuggestedActionsFacePile'
-    SearchTimeline = 'gkjsKepM6gl_HmFWoWKfgg', 'SearchTimeline'
    SetDefault = 'QEMLEzEMzoPNbeauKCCLbg', 'SetDefault'
    SetSafetyModeSettings = 'qSJIPIpf4gA7Wn21bT3D4w', 'SetSafetyModeSettings'
    SharingAudiospacesListeningDataWithFollowersUpdate = '5h0kNbk3ii97rmfY6CdgAA', 'SharingAudiospacesListeningDataWithFollowersUpdate'
@@ -351,40 +360,42 @@ class Operation:
        'withMessages': True,
    }
    default_features = {
-        "blue_business_profile_image_shape_enabled": True,
-        "creator_subscriptions_tweet_preview_api_enabled": True,
-        "freedom_of_speech_not_reach_fetch_enabled": False,
-        "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
-        "graphql_timeline_v2_bookmark_timeline": True,
-        "hidden_profile_likes_enabled": True,
-        "highlights_tweets_tab_ui_enabled": True,
-        "interactive_text_enabled": True,
-        "longform_notetweets_consumption_enabled": True,
-        "longform_notetweets_inline_media_enabled": False,
-        "longform_notetweets_rich_text_read_enabled": True,
-        "longform_notetweets_richtext_consumption_enabled": True,
-        "profile_foundations_tweet_stats_enabled": True,
-        "profile_foundations_tweet_stats_tweet_frequency": True,
-        "responsive_web_birdwatch_note_limit_enabled": True,
-        "responsive_web_edit_tweet_api_enabled": True,
-        "responsive_web_enhance_cards_enabled": False,
-        "responsive_web_graphql_exclude_directive_enabled": True,
-        "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
-        "responsive_web_graphql_timeline_navigation_enabled": True,
-        "responsive_web_text_conversations_enabled": False,
-        "responsive_web_twitter_article_data_v2_enabled": True,
-        "responsive_web_twitter_blue_verified_badge_is_enabled": True,
-        "rweb_lists_timeline_redesign_enabled": True,
-        "spaces_2022_h2_clipping": True,
-        "spaces_2022_h2_spaces_communities": True,
-        "standardized_nudges_misinfo": True,
-        "subscriptions_verification_info_verified_since_enabled": True,
-        "tweet_awards_web_tipping_enabled": False,
-        "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
-        "tweetypie_unmention_optimization_enabled": True,
-        "verified_phone_label_enabled": False,
-        "vibe_api_enabled": True,
-        "view_counts_everywhere_api_enabled": True,
+        'blue_business_profile_image_shape_enabled': True,
+        'creator_subscriptions_tweet_preview_api_enabled': True,
+        'freedom_of_speech_not_reach_fetch_enabled': True,
+        'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True,
+        'graphql_timeline_v2_bookmark_timeline': True,
+        'hidden_profile_likes_enabled': True,
+        'highlights_tweets_tab_ui_enabled': True,
+        'interactive_text_enabled': True,
+        'longform_notetweets_consumption_enabled': True,
+        'longform_notetweets_inline_media_enabled': True,
+        'longform_notetweets_rich_text_read_enabled': True,
+        'longform_notetweets_richtext_consumption_enabled': True,
+        'profile_foundations_tweet_stats_enabled': True,
+        'profile_foundations_tweet_stats_tweet_frequency': True,
+        'responsive_web_birdwatch_note_limit_enabled': True,
+        'responsive_web_edit_tweet_api_enabled': True,
+        'responsive_web_enhance_cards_enabled': False,
+        'responsive_web_graphql_exclude_directive_enabled': True,
+        'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
+        'responsive_web_graphql_timeline_navigation_enabled': True,
+        'responsive_web_media_download_video_enabled': False,
+        'responsive_web_text_conversations_enabled': False,
+        'responsive_web_twitter_article_data_v2_enabled': True,
+        'responsive_web_twitter_article_tweet_consumption_enabled': False,
+        'responsive_web_twitter_blue_verified_badge_is_enabled': True,
+        'rweb_lists_timeline_redesign_enabled': True,
+        'spaces_2022_h2_clipping': True,
+        'spaces_2022_h2_spaces_communities': True,
+        'standardized_nudges_misinfo': True,
+        'subscriptions_verification_info_verified_since_enabled': True,
+        'tweet_awards_web_tipping_enabled': False,
+        'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True,
+        'tweetypie_unmention_optimization_enabled': True,
+        'verified_phone_label_enabled': False,
+        'vibe_api_enabled': True,
+        'view_counts_everywhere_api_enabled': True
    }
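twitter/search.py below imports `build_params` from `twitter.util`, which this diff does not show. A plausible minimal implementation (an assumption, not the library's actual code) JSON-encodes each top-level value, matching how `variables`, `features`, and `fieldToggles` must appear in the query string:

```python
import orjson

def build_params(params: dict) -> dict:
    # assumed behavior: serialize each top-level value to a compact JSON string
    return {k: orjson.dumps(v).decode() for k, v in params.items()}
```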

twitter/search.py

@@ -13,7 +13,7 @@ from httpx import AsyncClient, Client
from .constants import *
from .login import login
-from .util import set_qs, get_headers, find_key
+from .util import get_headers, find_key, build_params
reset = '\u001b[0m'
colors = [f'\u001b[{i}m' for i in range(30, 38)]
@@ -39,101 +39,99 @@ class Search:
    def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
        self.save = kwargs.get('save', True)
        self.debug = kwargs.get('debug', 0)
-        self.api = 'https://api.twitter.com/2/search/adaptive.json?'
        self.logger = self._init_logger(**kwargs)
        self.session = self._validate_session(email, username, password, session, **kwargs)

-    def run(self, *args, out: str = 'data', **kwargs):
-        out_path = self.make_output_dirs(out)
-        if kwargs.get('latest', False):
-            search_config['tweet_search_mode'] = 'live'
-        return asyncio.run(self.process(args, search_config, out_path, **kwargs))
+    def run(self, queries: list[dict], limit: int = math.inf, **kwargs):
+        out = Path('data/search_results')
+        out.mkdir(parents=True, exist_ok=True)
+        return asyncio.run(self.process(queries, limit, out, **kwargs))

-    async def process(self, queries: tuple, config: dict, out: Path, **kwargs) -> list:
+    async def process(self, queries: list[dict], limit: int, out: Path, **kwargs) -> list:
        async with AsyncClient(headers=get_headers(self.session)) as s:
-            return await asyncio.gather(*(self.paginate(q, s, config, out, **kwargs) for q in queries))
+            return await asyncio.gather(*(self.paginate(s, q, limit, out, **kwargs) for q in queries))

-    async def paginate(self, query: str, session: AsyncClient, config: dict, out: Path, **kwargs) -> list[dict]:
-        config['q'] = query
-        data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-        all_data = [data]
+    async def paginate(self, client: AsyncClient, query: dict, limit: int, out: Path, **kwargs) -> list[dict]:
        c = colors.pop() if colors else ''
-        ids = set()
-        while next_cursor:
-            ids |= set(data['globalObjects']['tweets'])
-            if len(ids) >= kwargs.get('limit', math.inf):
-                if self.debug:
-                    self.logger.debug(
-                        f'[{GREEN}success{RESET}] Returned {len(ids)} search results for {c}{query}{reset}')
-                return all_data
-            if self.debug:
-                self.logger.debug(f'{c}{query}{reset}')
-            config['cursor'] = next_cursor
-            data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-            if not data:
-                return all_data
-            data['query'] = query
-            if self.save:
-                (out / f'raw/{time.time_ns()}.json').write_text(
-                    orjson.dumps(data, option=orjson.OPT_INDENT_2).decode(),
-                    encoding='utf-8'
-                )
-            all_data.append(data)
-        return all_data
+        params = {
+            'variables': {
+                'count': 20,
+                'querySource': 'typed_query',
+            },
+            'features': Operation.default_features,
+            'fieldToggles': {'withArticleRichContentState': False},
+        }
+        params['variables']['rawQuery'] = query['query']
+        params['variables']['product'] = query['category']
+        data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+        total = set()
+        res = [*entries]
+        while True:
+            if cursor:
+                params['variables']['cursor'] = cursor
+            data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+            if len(entries) <= 2:  # just cursors
+                return res
+            res.extend(entries)
+            unq = set(find_key(entries, 'entryId'))
+            total |= unq
+            if self.debug:
+                self.logger.debug(f'{c}{query["query"]}{reset}')
+            if len(total) >= limit:
+                if self.debug:
+                    self.logger.debug(
+                        f'[{GREEN}success{RESET}] Returned {len(total)} search results for {c}{query["query"]}{reset}')
+                return res
+            if self.save:
+                (out / f'{time.time_ns()}.json').write_bytes(orjson.dumps(entries))

-    async def backoff(self, fn, info, **kwargs):
+    async def backoff(self, fn, **kwargs):
        retries = kwargs.get('retries', 3)
        for i in range(retries + 1):
            try:
-                data, next_cursor = await fn()
-                if not data.get('globalObjects', {}).get('tweets'):
-                    raise Exception
-                return data, next_cursor
+                data, entries, cursor = await fn()
+                if errors := data.get('errors'):
+                    for e in errors:
+                        self.logger.warning(f'{YELLOW}{e.get("message")}{RESET}')
+                    return [], [], ''
+                ids = set(find_key(data, 'entryId'))
+                if len(ids) >= 2:
+                    return data, entries, cursor
            except Exception as e:
                if i == retries:
                    if self.debug:
                        self.logger.debug(f'Max retries exceeded\n{e}')
-                    return None, None
+                    return
                t = 2 ** i + random.random()
                if self.debug:
-                    self.logger.debug(
-                        f'No data for: {BOLD}{info}{RESET}, retrying in {f"{t:.2f}"} seconds\t\t{e}')
-                time.sleep(t)
+                    self.logger.debug(f'Retrying in {f"{t:.2f}"} seconds\t\t{e}')
+                # time.sleep(t)
+                await asyncio.sleep(t)

-    async def get(self, session: AsyncClient, params: dict) -> tuple:
-        url = set_qs(self.api, params, update=True, safe='()')
-        r = await session.get(url)
+    async def get(self, client: AsyncClient, params: dict) -> tuple:
+        _, qid, name = Operation.SearchTimeline
+        r = await client.get(
+            f'https://twitter.com/i/api/graphql/{qid}/{name}',
+            params=build_params(params),
+        )
        data = r.json()
-        next_cursor = self.get_cursor(data)
-        return data, next_cursor
+        cursor = self.get_cursor(data)
+        entries = [y for x in find_key(data, 'entries') for y in x if re.search(r'^(tweet|user)-', y['entryId'])]
+        # add on query info
+        for e in entries:
+            e['query'] = params['variables']['rawQuery']
+        return data, entries, cursor

-    def get_cursor(self, res: dict):
-        try:
-            if live := find_key(res, 'value'):
-                if cursor := [x for x in live if 'scroll' in x]:
-                    return cursor[0]
-            for instr in res['timeline']['instructions']:
-                if replaceEntry := instr.get('replaceEntry'):
-                    cursor = replaceEntry['entry']['content']['operation']['cursor']
-                    if cursor['cursorType'] == 'Bottom':
-                        return cursor['value']
-                    continue
-                for entry in instr['addEntries']['entries']:
-                    if entry['entryId'] == 'cursor-bottom-0':
-                        return entry['content']['operation']['cursor']['value']
-        except Exception as e:
-            if self.debug:
-                self.logger.debug(e)
-
-    def make_output_dirs(self, path: str) -> Path:
-        p = Path(f'{path}')
-        (p / 'raw').mkdir(parents=True, exist_ok=True)
-        (p / 'processed').mkdir(parents=True, exist_ok=True)
-        (p / 'final').mkdir(parents=True, exist_ok=True)
-        return p
+    def get_cursor(self, data: list[dict]):
+        for e in find_key(data, 'content'):
+            if e.get('cursorType') == 'Bottom':
+                return e['value']

    def _init_logger(self, **kwargs) -> Logger:
        if kwargs.get('debug'):