diff --git a/examples/postprocess.py b/examples/postprocess.py
new file mode 100644
index 0000000..34c32bb
--- /dev/null
+++ b/examples/postprocess.py
@@ -0,0 +1,97 @@
+import re
+import pandas as pd
+from twitter.util import find_key
+
+
+def get_tweets(data: list | dict, cols: list[str] | None = None) -> pd.DataFrame:
+    """
+    Convert raw GraphQL response to DataFrame
+
+    @param data: tweets
+    @param cols: option to only include certain columns
+    @return: DataFrame of tweets
+    """
+    entries = [y for x in find_key(data, 'entries') for y in x]
+    # filter out promoted tweets; entryId may be missing on some entries
+    tweets = [x for x in entries if not x.get('entryId', '').startswith('promoted')]
+    df = (
+        pd.json_normalize(find_key(tweets, 'tweet_results'), max_level=1)
+        ['result.legacy'].apply(pd.Series)
+        .dropna(subset='user_id_str')
+        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
+        .sort_values('created_at', ascending=False)
+        .reset_index(drop=True)
+    )
+    numeric = [
+        'user_id_str',
+        'id_str',
+        'favorite_count',
+        'quote_count',
+        'reply_count',
+        'retweet_count',
+    ]
+    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
+    cols = cols or [
+        'id_str',
+        'user_id_str',
+        'created_at',
+        'full_text',
+        'favorite_count',
+        'quote_count',
+        'reply_count',
+        'retweet_count',
+        'lang',
+    ]
+    return df[cols]
+
+
+def get_tweets_urls(data: dict | list, expr: str, cols: list[str] | None = None) -> pd.DataFrame:
+    """
+    Convert raw GraphQL response to DataFrame
+
+    Search for tweets containing specific urls by regex
+
+    @param data: tweets
+    @param expr: regex to match urls
+    @param cols: option to only include certain columns
+    @return: DataFrame of tweets matching the expression
+    """
+    tweet_results = find_key(data, 'tweet_results')
+    results = []
+    for res in tweet_results:
+        legacy = res.get('result', {}).get('legacy', {})
+        urls = find_key(res, 'expanded_url')
+        if any(re.search(expr, x) for x in urls):
+            results.append({'urls': urls} | legacy)
+    try:
+        df = (
+            pd.DataFrame(results)
+            .assign(date=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
+            .sort_values('date', ascending=False)  # sort by parsed datetime, not the raw string
+            .reset_index(drop=True)
+        )
+        numeric = [
+            'user_id_str',
+            'id_str',
+            'favorite_count',
+            'quote_count',
+            'reply_count',
+            'retweet_count',
+        ]
+        df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
+        cols = cols or [
+            'id_str',
+            'user_id_str',
+            'created_at',
+            'urls',
+            'full_text',
+            'favorite_count',
+            'quote_count',
+            'reply_count',
+            'retweet_count',
+            'lang',
+        ]
+        return df[cols]
+    except Exception as e:
+        print(e)
+        return pd.DataFrame()
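A usage sketch for the new helpers (not part of this diff): per their docstrings, both functions consume raw GraphQL responses, i.e. anything containing timeline `entries`. The input path below is hypothetical.

```python
import orjson
from pathlib import Path

from examples.postprocess import get_tweets, get_tweets_urls

# hypothetical: one raw GraphQL response per file, collected earlier
data = [orjson.loads(p.read_bytes()) for p in Path('data/raw').glob('*.json')]

df = get_tweets(data)                         # tidy DataFrame of tweets
arxiv = get_tweets_urls(data, r'arxiv\.org')  # only tweets linking to arxiv.org
```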
diff --git a/readme.md b/readme.md
index 0bc273c..2661d3d 100644
--- a/readme.md
+++ b/readme.md
@@ -296,28 +296,34 @@ follower_subset, last_cursor = scraper.followers([user_id], limit=limit, cursor=
 from twitter.search import Search
 
 email, username, password = ..., ..., ...
-# default output directory is `data/raw` if save=True
-search = Search(email, username, password)
+# default output directory is `data/search_results` if save=True
+search = Search(email, username, password, save=True, debug=1)
 
-latest_results = search.run(
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'ios android',
-    limit=100,
-    latest=True,  # get latest tweets only
-    retries=3,
-)
-
-general_results = search.run(
-    '(#dogs OR #cats) min_retweets:500',
-    'min_faves:10000 @elonmusk until:2023-02-16 since:2023-02-01',
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'skateboarding baseball guitar',
-    'cheese bread butter',
-    'ios android',
-    limit=100,
-    retries=7,
+res = search.run(
+    limit=37,
+    retries=5,
+    queries=[
+        {
+            'category': 'Top',
+            'query': 'paperswithcode -tensorflow -tf'
+        },
+        {
+            'category': 'Latest',
+            'query': 'test'
+        },
+        {
+            'category': 'People',
+            'query': 'brasil portugal -argentina'
+        },
+        {
+            'category': 'Photos',
+            'query': 'greece'
+        },
+        {
+            'category': 'Videos',
+            'query': 'italy'
+        },
+    ],
 )
 ```
diff --git a/setup.py b/setup.py
index 1eb4d1b..7549fe4 100644
--- a/setup.py
+++ b/setup.py
@@ -315,28 +315,34 @@ setup(
     from twitter.search import Search
 
     email, username, password = ..., ..., ...
-    # default output directory is `data/raw` if save=True
-    search = Search(email, username, password)
+    # default output directory is `data/search_results` if save=True
+    search = Search(email, username, password, save=True, debug=1)
 
-    latest_results = search.run(
-        'brasil portugal -argentina',
-        'paperswithcode -tensorflow -tf',
-        'ios android',
-        limit=100,
-        latest=True,  # get latest tweets only
-        retries=3,
-    )
-
-    general_results = search.run(
-        '(#dogs OR #cats) min_retweets:500',
-        'min_faves:10000 @elonmusk until:2023-02-16 since:2023-02-01',
-        'brasil portugal -argentina',
-        'paperswithcode -tensorflow -tf',
-        'skateboarding baseball guitar',
-        'cheese bread butter',
-        'ios android',
-        limit=100,
-        retries=7,
+    res = search.run(
+        limit=37,
+        retries=5,
+        queries=[
+            {
+                'category': 'Top',
+                'query': 'paperswithcode -tensorflow -tf'
+            },
+            {
+                'category': 'Latest',
+                'query': 'test'
+            },
+            {
+                'category': 'People',
+                'query': 'brasil portugal -argentina'
+            },
+            {
+                'category': 'Photos',
+                'query': 'greece'
+            },
+            {
+                'category': 'Videos',
+                'query': 'italy'
+            },
+        ],
     )
     ```
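The `category` strings in the examples above correspond to the new `SearchCategory` constants added to `constants.py` below, so queries can reference them instead of bare literals:

```python
from twitter.constants import SearchCategory

res = search.run(queries=[
    {'category': SearchCategory.Top, 'query': 'paperswithcode -tensorflow -tf'},
    {'category': SearchCategory.Latest, 'query': 'test'},
])
```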
diff --git a/twitter/__version__.py b/twitter/__version__.py
index 11e7c6c..35533e5 100644
--- a/twitter/__version__.py
+++ b/twitter/__version__.py
@@ -1,5 +1,5 @@
 __title__ = "twitter-api-client"
 __description__ = "Implementation of Twitter's v1, v2, and GraphQL APIs."
-__version__ = "0.10.6"
+__version__ = "0.10.7"
 __author__ = "Trevor Hobenshield"
 __license__ = "MIT"
\ No newline at end of file
diff --git a/twitter/constants.py b/twitter/constants.py
index 87da1a3..9e85c50 100644
--- a/twitter/constants.py
+++ b/twitter/constants.py
@@ -65,6 +65,15 @@ ID_MAP = {
 }
 
 
+@dataclass
+class SearchCategory:
+    Top = 'Top'
+    Latest = 'Latest'
+    People = 'People'
+    Photos = 'Photos'
+    Videos = 'Videos'
+
+
 @dataclass
 class SpaceCategory:
     Top = 'Top'
@@ -85,6 +94,7 @@ class SpaceState:
 @dataclass
 class Operation:
     # todo: dynamically update
+    SearchTimeline = {'rawQuery': str, 'product': str}, 'nK1dw4oV3k4w5TdtcAdSww', 'SearchTimeline'
     AudioSpaceById = {'id': str}, 'fYAuJHiY3TmYdBmrRtIKhA', 'AudioSpaceById'
     AudioSpaceSearch = {'filter': str, 'query': str}, 'NTq79TuSz6fHj8lQaferJw', 'AudioSpaceSearch',
     UserByScreenName = {'screen_name': str}, 'sLVLhk0bGj3MVFEKTdax1w', 'UserByScreenName'
@@ -272,7 +282,6 @@ class Operation:
     RitoFlaggedAccountsTimeline = 'lMzaBZHIbD6GuPqJJQubMg', 'RitoFlaggedAccountsTimeline'
     RitoFlaggedTweetsTimeline = 'iCuXMibh6yj9AelyjKXDeA', 'RitoFlaggedTweetsTimeline'
     RitoSuggestedActionsFacePile = 'GnQKeEdL1LyeK3dTQCS1yw', 'RitoSuggestedActionsFacePile'
-    SearchTimeline = 'gkjsKepM6gl_HmFWoWKfgg', 'SearchTimeline'
     SetDefault = 'QEMLEzEMzoPNbeauKCCLbg', 'SetDefault'
     SetSafetyModeSettings = 'qSJIPIpf4gA7Wn21bT3D4w', 'SetSafetyModeSettings'
     SharingAudiospacesListeningDataWithFollowersUpdate = '5h0kNbk3ii97rmfY6CdgAA', 'SharingAudiospacesListeningDataWithFollowersUpdate'
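`SearchTimeline` moves from the plain `(query_id, name)` form to the three-element form with a params spec, `(params, query_id, operation_name)`. A rough sketch of how `Search.get` below unpacks it to build the GraphQL endpoint:

```python
from twitter.constants import Operation

_, qid, name = Operation.SearchTimeline
url = f'https://twitter.com/i/api/graphql/{qid}/{name}'
# 'https://twitter.com/i/api/graphql/nK1dw4oV3k4w5TdtcAdSww/SearchTimeline'
```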
@@ -351,40 +360,42 @@ class Operation:
         'withMessages': True,
     }
     default_features = {
-        "blue_business_profile_image_shape_enabled": True,
-        "creator_subscriptions_tweet_preview_api_enabled": True,
-        "freedom_of_speech_not_reach_fetch_enabled": False,
-        "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
-        "graphql_timeline_v2_bookmark_timeline": True,
-        "hidden_profile_likes_enabled": True,
-        "highlights_tweets_tab_ui_enabled": True,
-        "interactive_text_enabled": True,
-        "longform_notetweets_consumption_enabled": True,
-        "longform_notetweets_inline_media_enabled": False,
-        "longform_notetweets_rich_text_read_enabled": True,
-        "longform_notetweets_richtext_consumption_enabled": True,
-        "profile_foundations_tweet_stats_enabled": True,
-        "profile_foundations_tweet_stats_tweet_frequency": True,
-        "responsive_web_birdwatch_note_limit_enabled": True,
-        "responsive_web_edit_tweet_api_enabled": True,
-        "responsive_web_enhance_cards_enabled": False,
-        "responsive_web_graphql_exclude_directive_enabled": True,
-        "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
-        "responsive_web_graphql_timeline_navigation_enabled": True,
-        "responsive_web_text_conversations_enabled": False,
-        "responsive_web_twitter_article_data_v2_enabled": True,
-        "responsive_web_twitter_blue_verified_badge_is_enabled": True,
-        "rweb_lists_timeline_redesign_enabled": True,
-        "spaces_2022_h2_clipping": True,
-        "spaces_2022_h2_spaces_communities": True,
-        "standardized_nudges_misinfo": True,
-        "subscriptions_verification_info_verified_since_enabled": True,
-        "tweet_awards_web_tipping_enabled": False,
-        "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
-        "tweetypie_unmention_optimization_enabled": True,
-        "verified_phone_label_enabled": False,
-        "vibe_api_enabled": True,
-        "view_counts_everywhere_api_enabled": True,
+        'blue_business_profile_image_shape_enabled': True,
+        'creator_subscriptions_tweet_preview_api_enabled': True,
+        'freedom_of_speech_not_reach_fetch_enabled': True,
+        'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True,
+        'graphql_timeline_v2_bookmark_timeline': True,
+        'hidden_profile_likes_enabled': True,
+        'highlights_tweets_tab_ui_enabled': True,
+        'interactive_text_enabled': True,
+        'longform_notetweets_consumption_enabled': True,
+        'longform_notetweets_inline_media_enabled': True,
+        'longform_notetweets_rich_text_read_enabled': True,
+        'longform_notetweets_richtext_consumption_enabled': True,
+        'profile_foundations_tweet_stats_enabled': True,
+        'profile_foundations_tweet_stats_tweet_frequency': True,
+        'responsive_web_birdwatch_note_limit_enabled': True,
+        'responsive_web_edit_tweet_api_enabled': True,
+        'responsive_web_enhance_cards_enabled': False,
+        'responsive_web_graphql_exclude_directive_enabled': True,
+        'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
+        'responsive_web_graphql_timeline_navigation_enabled': True,
+        'responsive_web_media_download_video_enabled': False,
+        'responsive_web_text_conversations_enabled': False,
+        'responsive_web_twitter_article_data_v2_enabled': True,
+        'responsive_web_twitter_article_tweet_consumption_enabled': False,
+        'responsive_web_twitter_blue_verified_badge_is_enabled': True,
+        'rweb_lists_timeline_redesign_enabled': True,
+        'spaces_2022_h2_clipping': True,
+        'spaces_2022_h2_spaces_communities': True,
+        'standardized_nudges_misinfo': True,
+        'subscriptions_verification_info_verified_since_enabled': True,
+        'tweet_awards_web_tipping_enabled': False,
+        'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True,
+        'tweetypie_unmention_optimization_enabled': True,
+        'verified_phone_label_enabled': False,
+        'vibe_api_enabled': True,
+        'view_counts_everywhere_api_enabled': True,
     }
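`Operation.default_features` is sent with every `SearchTimeline` request (see `paginate` below). It is a plain class-level dict shared by all callers, so a caller wanting different flags should merge into a copy rather than mutate it; a hypothetical override:

```python
from twitter.constants import Operation

# dict union creates a new dict; Operation.default_features is left untouched
features = Operation.default_features | {'responsive_web_enhance_cards_enabled': True}
```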
diff --git a/twitter/search.py b/twitter/search.py
index df441c7..098a833 100644
--- a/twitter/search.py
+++ b/twitter/search.py
@@ -13,7 +13,7 @@ from httpx import AsyncClient, Client
 
 from .constants import *
 from .login import login
-from .util import set_qs, get_headers, find_key
+from .util import get_headers, find_key, build_params
 
 reset = '\u001b[0m'
 colors = [f'\u001b[{i}m' for i in range(30, 38)]
@@ -39,101 +39,99 @@ class Search:
     def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
         self.save = kwargs.get('save', True)
         self.debug = kwargs.get('debug', 0)
-        self.api = 'https://api.twitter.com/2/search/adaptive.json?'
         self.logger = self._init_logger(**kwargs)
         self.session = self._validate_session(email, username, password, session, **kwargs)
 
-    def run(self, *args, out: str = 'data', **kwargs):
-        out_path = self.make_output_dirs(out)
-        if kwargs.get('latest', False):
-            search_config['tweet_search_mode'] = 'live'
-        return asyncio.run(self.process(args, search_config, out_path, **kwargs))
+    def run(self, queries: list[dict], limit: int | float = math.inf, **kwargs):
+        out = Path('data/search_results')
+        out.mkdir(parents=True, exist_ok=True)
+        return asyncio.run(self.process(queries, limit, out, **kwargs))
 
-    async def process(self, queries: tuple, config: dict, out: Path, **kwargs) -> list:
+    async def process(self, queries: list[dict], limit: int | float, out: Path, **kwargs) -> list:
         async with AsyncClient(headers=get_headers(self.session)) as s:
-            return await asyncio.gather(*(self.paginate(q, s, config, out, **kwargs) for q in queries))
+            return await asyncio.gather(*(self.paginate(s, q, limit, out, **kwargs) for q in queries))
 
-    async def paginate(self, query: str, session: AsyncClient, config: dict, out: Path, **kwargs) -> list[dict]:
-        config['q'] = query
-        data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-        all_data = [data]
+    async def paginate(self, client: AsyncClient, query: dict, limit: int | float, out: Path, **kwargs) -> list[dict]:
         c = colors.pop() if colors else ''
-        ids = set()
-        while next_cursor:
-            ids |= set(data['globalObjects']['tweets'])
-            if len(ids) >= kwargs.get('limit', math.inf):
+        params = {
+            'variables': {
+                'count': 20,
+                'querySource': 'typed_query',
+            },
+            'features': Operation.default_features,
+            'fieldToggles': {'withArticleRichContentState': False},
+        }
+        params['variables']['rawQuery'] = query['query']
+        params['variables']['product'] = query['category']
+
+        data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+
+        res = [*entries]
+        # count and save the first page too, not just subsequent ones
+        total = set(find_key(entries, 'entryId'))
+        if self.save:
+            (out / f'{time.time_ns()}.json').write_bytes(orjson.dumps(entries))
+        while True:
+            if cursor:
+                params['variables']['cursor'] = cursor
+
+            data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+
+            if len(entries) <= 2:  # just cursors left, no more results
+                return res
+
+            res.extend(entries)
+            total |= set(find_key(entries, 'entryId'))
+
+            if self.debug:
+                self.logger.debug(f'{c}{query["query"]}{reset}')
+
+            if len(total) >= limit:
                 if self.debug:
                     self.logger.debug(
-                        f'[{GREEN}success{RESET}] Returned {len(ids)} search results for {c}{query}{reset}')
-                return all_data
-            if self.debug:
-                self.logger.debug(f'{c}{query}{reset}')
-            config['cursor'] = next_cursor
-
-            data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-            if not data:
-                return all_data
-
-            data['query'] = query
+                        f'[{GREEN}success{RESET}] Returned {len(total)} search results for {c}{query["query"]}{reset}')
+                return res
             if self.save:
-                (out / f'raw/{time.time_ns()}.json').write_text(
-                    orjson.dumps(data, option=orjson.OPT_INDENT_2).decode(),
-                    encoding='utf-8'
-                )
-                all_data.append(data)
-        return all_data
+                (out / f'{time.time_ns()}.json').write_bytes(orjson.dumps(entries))
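`paginate` now returns raw timeline entries rather than full responses. A convenient way to pull fields back out is `find_key`, the same recursive helper this module uses; a small sketch:

```python
from twitter.util import find_key

res = search.run(queries=[{'category': 'Latest', 'query': 'python'}], limit=100)
texts = find_key(res, 'full_text')  # tweet text from every entry, any nesting depth
ids = find_key(res, 'entryId')      # e.g. 'tweet-1234567890'
```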
-    async def backoff(self, fn, info, **kwargs):
+    async def backoff(self, fn, **kwargs):
         retries = kwargs.get('retries', 3)
         for i in range(retries + 1):
             try:
-                data, next_cursor = await fn()
-                if not data.get('globalObjects', {}).get('tweets'):
-                    raise Exception
-                return data, next_cursor
+                data, entries, cursor = await fn()
+                if errors := data.get('errors'):
+                    for e in errors:
+                        self.logger.warning(f'{YELLOW}{e.get("message")}{RESET}')
+                    return {}, [], ''
+                ids = set(find_key(data, 'entryId'))
+                if len(ids) >= 2:
+                    return data, entries, cursor
+                # too few entries: treat as a bad page and retry with backoff
+                raise Exception(f'got {len(ids)} entries')
             except Exception as e:
                 if i == retries:
-                    if self.debug:
-                        self.logger.debug(f'Max retries exceeded\n{e}')
-                    return None, None
+                    self.logger.debug(f'Max retries exceeded\n{e}')
+                    return {}, [], ''
                 t = 2 ** i + random.random()
-                if self.debug:
-                    self.logger.debug(
-                        f'No data for: {BOLD}{info}{RESET}, retrying in {f"{t:.2f}"} seconds\t\t{e}')
-                time.sleep(t)
+                self.logger.debug(f'Retrying in {t:.2f} seconds\t\t{e}')
+                await asyncio.sleep(t)
 
-    async def get(self, session: AsyncClient, params: dict) -> tuple:
-        url = set_qs(self.api, params, update=True, safe='()')
-        r = await session.get(url)
+    async def get(self, client: AsyncClient, params: dict) -> tuple:
+        _, qid, name = Operation.SearchTimeline
+        r = await client.get(
+            f'https://twitter.com/i/api/graphql/{qid}/{name}',
+            params=build_params(params),
+        )
         data = r.json()
-        next_cursor = self.get_cursor(data)
-        return data, next_cursor
+        cursor = self.get_cursor(data)
+        entries = [y for x in find_key(data, 'entries') for y in x if re.search(r'^(tweet|user)-', y['entryId'])]
+        # tag each entry with the originating query
+        for e in entries:
+            e['query'] = params['variables']['rawQuery']
+        return data, entries, cursor
 
-    def get_cursor(self, res: dict):
-        try:
-            if live := find_key(res, 'value'):
-                if cursor := [x for x in live if 'scroll' in x]:
-                    return cursor[0]
-            for instr in res['timeline']['instructions']:
-                if replaceEntry := instr.get('replaceEntry'):
-                    cursor = replaceEntry['entry']['content']['operation']['cursor']
-                    if cursor['cursorType'] == 'Bottom':
-                        return cursor['value']
-                    continue
-                for entry in instr['addEntries']['entries']:
-                    if entry['entryId'] == 'cursor-bottom-0':
-                        return entry['content']['operation']['cursor']['value']
-        except Exception as e:
-            if self.debug:
-                self.logger.debug(e)
-
-    def make_output_dirs(self, path: str) -> Path:
-        p = Path(f'{path}')
-        (p / 'raw').mkdir(parents=True, exist_ok=True)
-        (p / 'processed').mkdir(parents=True, exist_ok=True)
-        (p / 'final').mkdir(parents=True, exist_ok=True)
-        return p
+    def get_cursor(self, data: dict):
+        for e in find_key(data, 'content'):
+            if e.get('cursorType') == 'Bottom':
+                return e['value']
 
     def _init_logger(self, **kwargs) -> Logger:
         if kwargs.get('debug'):