add new GraphQL endpoint /SearchTimeline; api.twitter.com/2/search/adaptive.json is deprecated

trevor hobenshield
2023-07-02 12:09:53 -07:00
parent 55425b7181
commit afba99f2eb
6 changed files with 270 additions and 153 deletions
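In practical terms, search moves from one REST endpoint with plain query-string parameters to a GraphQL GET whose `variables`, `features`, and `fieldToggles` travel as JSON-encoded query parameters. A minimal sketch of the new request shape, assuming auth headers from a logged-in session (the token and csrf values below are placeholders, and the trimmed `features` dict stands in for `Operation.default_features` from twitter/constants.py):

```python
import json

import httpx

# query id and operation name registered for Operation.SearchTimeline in this commit
QID, NAME = 'nK1dw4oV3k4w5TdtcAdSww', 'SearchTimeline'

params = {
    # GraphQL arguments are serialized to JSON strings in the query string
    'variables': json.dumps({'rawQuery': 'ios android', 'product': 'Latest',
                             'count': 20, 'querySource': 'typed_query'}),
    'features': json.dumps({'responsive_web_graphql_timeline_navigation_enabled': True}),
    'fieldToggles': json.dumps({'withArticleRichContentState': False}),
}
headers = {
    'authorization': 'Bearer <token>',     # placeholder
    'x-csrf-token': '<ct0 cookie value>',  # placeholder
}
r = httpx.get(f'https://twitter.com/i/api/graphql/{QID}/{NAME}', params=params, headers=headers)
print(r.status_code)  # 200 with a valid session; 401/403 otherwise
```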

examples/postprocess.py Normal file

@@ -0,0 +1,96 @@
import re

import pandas as pd

from twitter.util import find_key


def get_tweets(data: list | dict, cols: list[str] = None) -> pd.DataFrame:
    """
    Convert raw GraphQL response to DataFrame

    @param data: tweets
    @param cols: option to only include certain columns
    @return: DataFrame of tweets
    """
    entries = [y for x in find_key(data, 'entries') for y in x]
    # filter out promoted tweets
    tweets = [x for x in entries if not x.get('entryId', '').startswith('promoted')]
    df = (
        pd.json_normalize(find_key(tweets, 'tweet_results'), max_level=1)
        ['result.legacy'].apply(pd.Series)
        .dropna(subset='user_id_str')
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
        .sort_values('created_at', ascending=False)
        .reset_index(drop=True)
    )
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]
    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
    cols = cols or [
        'id_str',
        'user_id_str',
        'created_at',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]
    return df[cols]


def get_tweets_urls(data: dict | list, expr: str, cols: list[str] = None) -> pd.DataFrame:
    """
    Convert raw GraphQL response to DataFrame
    Search for tweets containing specific urls by regex

    @param data: tweets
    @param expr: regex to match urls
    @param cols: option to only include certain columns
    @return: DataFrame of tweets matching the expression
    """
    tweet_results = find_key(data, 'tweet_results')
    results = []
    for res in tweet_results:
        legacy = res.get('result', {}).get('legacy', {})
        urls = find_key(res, 'expanded_url')
        if any(re.search(expr, x) for x in urls):
            results.append({'urls': urls} | legacy)
    try:
        df = (
            pd.DataFrame(results)
            # parse created_at before sorting so the sort is chronological, not lexicographic
            .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
            .sort_values('created_at', ascending=False)
            .reset_index(drop=True)
        )
        numeric = [
            'user_id_str',
            'id_str',
            'favorite_count',
            'quote_count',
            'reply_count',
            'retweet_count',
        ]
        df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
        cols = cols or [
            'id_str',
            'user_id_str',
            'created_at',
            'urls',
            'full_text',
            'favorite_count',
            'quote_count',
            'reply_count',
            'retweet_count',
            'lang',
        ]
        return df[cols]
    except Exception as e:
        print(e)
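A hypothetical usage sketch for these helpers (the input path is made up; `data` is one or more raw GraphQL responses containing timeline `entries`, e.g. captured from the new SearchTimeline endpoint):

```python
import orjson
from pathlib import Path

from postprocess import get_tweets, get_tweets_urls  # this file, assuming it is importable

data = orjson.loads(Path('raw_response.json').read_bytes())  # hypothetical capture

df = get_tweets(data, cols=['created_at', 'full_text', 'favorite_count'])
links = get_tweets_urls(data, expr=r'github\.com')  # tweets whose expanded urls hit GitHub
print(df.head())
```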

readme.md

@@ -296,28 +296,34 @@ follower_subset, last_cursor = scraper.followers([user_id], limit=limit, cursor=
from twitter.search import Search

email, username, password = ..., ..., ...

-# default output directory is `data/raw` if save=True
-search = Search(email, username, password)
+# default output directory is `data/search_results` if save=True
+search = Search(email, username, password, save=True, debug=1)

-latest_results = search.run(
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'ios android',
-    limit=100,
-    latest=True,  # get latest tweets only
-    retries=3,
-)
-
-general_results = search.run(
-    '(#dogs OR #cats) min_retweets:500',
-    'min_faves:10000 @elonmusk until:2023-02-16 since:2023-02-01',
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'skateboarding baseball guitar',
-    'cheese bread butter',
-    'ios android',
-    limit=100,
-    retries=7,
-)
+res = search.run(
+    limit=37,
+    retries=5,
+    queries=[
+        {
+            'category': 'Top',
+            'query': 'paperswithcode -tensorflow -tf'
+        },
+        {
+            'category': 'Latest',
+            'query': 'test'
+        },
+        {
+            'category': 'People',
+            'query': 'brasil portugal -argentina'
+        },
+        {
+            'category': 'Photos',
+            'query': 'greece'
+        },
+        {
+            'category': 'Videos',
+            'query': 'italy'
+        },
+    ],
+)
```
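With `save=True`, each page of entries is written to `data/search_results/<nanosecond-timestamp>.json` (see twitter/search.py below). A small sketch, assuming a prior run, for gathering the saved pages:

```python
import orjson
from pathlib import Path

entries = []
for f in sorted(Path('data/search_results').glob('*.json')):
    entries.extend(orjson.loads(f.read_bytes()))  # each file holds a list of timeline entries

# Search.get tags every entry with its originating query string
print(len(entries), {e.get('query') for e in entries})
```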

setup.py

@@ -315,28 +315,34 @@ setup(
from twitter.search import Search

email, username, password = ..., ..., ...

-# default output directory is `data/raw` if save=True
-search = Search(email, username, password)
+# default output directory is `data/search_results` if save=True
+search = Search(email, username, password, save=True, debug=1)

-latest_results = search.run(
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'ios android',
-    limit=100,
-    latest=True,  # get latest tweets only
-    retries=3,
-)
-
-general_results = search.run(
-    '(#dogs OR #cats) min_retweets:500',
-    'min_faves:10000 @elonmusk until:2023-02-16 since:2023-02-01',
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'skateboarding baseball guitar',
-    'cheese bread butter',
-    'ios android',
-    limit=100,
-    retries=7,
-)
+res = search.run(
+    limit=37,
+    retries=5,
+    queries=[
+        {
+            'category': 'Top',
+            'query': 'paperswithcode -tensorflow -tf'
+        },
+        {
+            'category': 'Latest',
+            'query': 'test'
+        },
+        {
+            'category': 'People',
+            'query': 'brasil portugal -argentina'
+        },
+        {
+            'category': 'Photos',
+            'query': 'greece'
+        },
+        {
+            'category': 'Videos',
+            'query': 'italy'
+        },
+    ],
+)
```

twitter/__init__.py

@@ -1,5 +1,5 @@
__title__ = "twitter-api-client"
__description__ = "Implementation of Twitter's v1, v2, and GraphQL APIs."
__version__ = "0.10.6"
__version__ = "0.10.7"
__author__ = "Trevor Hobenshield"
__license__ = "MIT"

twitter/constants.py

@@ -65,6 +65,15 @@ ID_MAP = {
}
+@dataclass
+class SearchCategory:
+    Top = 'Top'
+    Latest = 'Latest'
+    People = 'People'
+    Photos = 'Photos'
+    Videos = 'Videos'
+
@dataclass
class SpaceCategory:
    Top = 'Top'
@@ -85,6 +94,7 @@ class SpaceState:
@dataclass
class Operation:
    # todo: dynamically update
+    SearchTimeline = {'rawQuery': str, 'product': str}, 'nK1dw4oV3k4w5TdtcAdSww', 'SearchTimeline'
    AudioSpaceById = {'id': str}, 'fYAuJHiY3TmYdBmrRtIKhA', 'AudioSpaceById'
    AudioSpaceSearch = {'filter': str, 'query': str}, 'NTq79TuSz6fHj8lQaferJw', 'AudioSpaceSearch'
    UserByScreenName = {'screen_name': str}, 'sLVLhk0bGj3MVFEKTdax1w', 'UserByScreenName'
@@ -272,7 +282,6 @@ class Operation:
    RitoFlaggedAccountsTimeline = 'lMzaBZHIbD6GuPqJJQubMg', 'RitoFlaggedAccountsTimeline'
    RitoFlaggedTweetsTimeline = 'iCuXMibh6yj9AelyjKXDeA', 'RitoFlaggedTweetsTimeline'
    RitoSuggestedActionsFacePile = 'GnQKeEdL1LyeK3dTQCS1yw', 'RitoSuggestedActionsFacePile'
-    SearchTimeline = 'gkjsKepM6gl_HmFWoWKfgg', 'SearchTimeline'
    SetDefault = 'QEMLEzEMzoPNbeauKCCLbg', 'SetDefault'
    SetSafetyModeSettings = 'qSJIPIpf4gA7Wn21bT3D4w', 'SetSafetyModeSettings'
    SharingAudiospacesListeningDataWithFollowersUpdate = '5h0kNbk3ii97rmfY6CdgAA', 'SharingAudiospacesListeningDataWithFollowersUpdate'
@@ -351,40 +360,42 @@ class Operation:
        'withMessages': True,
    }
    default_features = {
-        "blue_business_profile_image_shape_enabled": True,
-        "creator_subscriptions_tweet_preview_api_enabled": True,
-        "freedom_of_speech_not_reach_fetch_enabled": False,
-        "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
-        "graphql_timeline_v2_bookmark_timeline": True,
-        "hidden_profile_likes_enabled": True,
-        "highlights_tweets_tab_ui_enabled": True,
-        "interactive_text_enabled": True,
-        "longform_notetweets_consumption_enabled": True,
-        "longform_notetweets_inline_media_enabled": False,
-        "longform_notetweets_rich_text_read_enabled": True,
-        "longform_notetweets_richtext_consumption_enabled": True,
-        "profile_foundations_tweet_stats_enabled": True,
-        "profile_foundations_tweet_stats_tweet_frequency": True,
-        "responsive_web_birdwatch_note_limit_enabled": True,
-        "responsive_web_edit_tweet_api_enabled": True,
-        "responsive_web_enhance_cards_enabled": False,
-        "responsive_web_graphql_exclude_directive_enabled": True,
-        "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
-        "responsive_web_graphql_timeline_navigation_enabled": True,
-        "responsive_web_text_conversations_enabled": False,
-        "responsive_web_twitter_article_data_v2_enabled": True,
-        "responsive_web_twitter_blue_verified_badge_is_enabled": True,
-        "rweb_lists_timeline_redesign_enabled": True,
-        "spaces_2022_h2_clipping": True,
-        "spaces_2022_h2_spaces_communities": True,
-        "standardized_nudges_misinfo": True,
-        "subscriptions_verification_info_verified_since_enabled": True,
-        "tweet_awards_web_tipping_enabled": False,
-        "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
-        "tweetypie_unmention_optimization_enabled": True,
-        "verified_phone_label_enabled": False,
-        "vibe_api_enabled": True,
-        "view_counts_everywhere_api_enabled": True,
+        'blue_business_profile_image_shape_enabled': True,
+        'creator_subscriptions_tweet_preview_api_enabled': True,
+        'freedom_of_speech_not_reach_fetch_enabled': True,
+        'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True,
+        'graphql_timeline_v2_bookmark_timeline': True,
+        'hidden_profile_likes_enabled': True,
+        'highlights_tweets_tab_ui_enabled': True,
+        'interactive_text_enabled': True,
+        'longform_notetweets_consumption_enabled': True,
+        'longform_notetweets_inline_media_enabled': True,
+        'longform_notetweets_rich_text_read_enabled': True,
+        'longform_notetweets_richtext_consumption_enabled': True,
+        'profile_foundations_tweet_stats_enabled': True,
+        'profile_foundations_tweet_stats_tweet_frequency': True,
+        'responsive_web_birdwatch_note_limit_enabled': True,
+        'responsive_web_edit_tweet_api_enabled': True,
+        'responsive_web_enhance_cards_enabled': False,
+        'responsive_web_graphql_exclude_directive_enabled': True,
+        'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
+        'responsive_web_graphql_timeline_navigation_enabled': True,
+        'responsive_web_media_download_video_enabled': False,
+        'responsive_web_text_conversations_enabled': False,
+        'responsive_web_twitter_article_data_v2_enabled': True,
+        'responsive_web_twitter_article_tweet_consumption_enabled': False,
+        'responsive_web_twitter_blue_verified_badge_is_enabled': True,
+        'rweb_lists_timeline_redesign_enabled': True,
+        'spaces_2022_h2_clipping': True,
+        'spaces_2022_h2_spaces_communities': True,
+        'standardized_nudges_misinfo': True,
+        'subscriptions_verification_info_verified_since_enabled': True,
+        'tweet_awards_web_tipping_enabled': False,
+        'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True,
+        'tweetypie_unmention_optimization_enabled': True,
+        'verified_phone_label_enabled': False,
+        'vibe_api_enabled': True,
+        'view_counts_everywhere_api_enabled': True
    }
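twitter/search.py below imports `build_params` from `twitter.util`, which this diff does not show. A plausible minimal implementation (an assumption, not the library's actual code) JSON-encodes each top-level value, matching how `variables`, `features`, and `fieldToggles` must appear in the query string:

```python
import orjson

def build_params(params: dict) -> dict:
    # assumed behavior: serialize each top-level value to a compact JSON string
    return {k: orjson.dumps(v).decode() for k, v in params.items()}
```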

twitter/search.py

@@ -13,7 +13,7 @@ from httpx import AsyncClient, Client
from .constants import *
from .login import login
-from .util import set_qs, get_headers, find_key
+from .util import get_headers, find_key, build_params
reset = '\u001b[0m'
colors = [f'\u001b[{i}m' for i in range(30, 38)]
@@ -39,101 +39,99 @@ class Search:
    def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
        self.save = kwargs.get('save', True)
        self.debug = kwargs.get('debug', 0)
-        self.api = 'https://api.twitter.com/2/search/adaptive.json?'
        self.logger = self._init_logger(**kwargs)
        self.session = self._validate_session(email, username, password, session, **kwargs)

-    def run(self, *args, out: str = 'data', **kwargs):
-        out_path = self.make_output_dirs(out)
-        if kwargs.get('latest', False):
-            search_config['tweet_search_mode'] = 'live'
-        return asyncio.run(self.process(args, search_config, out_path, **kwargs))
+    def run(self, queries: list[dict], limit: int = math.inf, **kwargs):
+        out = Path('data/search_results')
+        out.mkdir(parents=True, exist_ok=True)
+        return asyncio.run(self.process(queries, limit, out, **kwargs))

-    async def process(self, queries: tuple, config: dict, out: Path, **kwargs) -> list:
+    async def process(self, queries: list[dict], limit: int, out: Path, **kwargs) -> list:
        async with AsyncClient(headers=get_headers(self.session)) as s:
-            return await asyncio.gather(*(self.paginate(q, s, config, out, **kwargs) for q in queries))
+            return await asyncio.gather(*(self.paginate(s, q, limit, out, **kwargs) for q in queries))

-    async def paginate(self, query: str, session: AsyncClient, config: dict, out: Path, **kwargs) -> list[dict]:
-        config['q'] = query
-        data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-        all_data = [data]
+    async def paginate(self, client: AsyncClient, query: dict, limit: int, out: Path, **kwargs) -> list[dict]:
        c = colors.pop() if colors else ''
-        ids = set()
-        while next_cursor:
-            ids |= set(data['globalObjects']['tweets'])
-            if len(ids) >= kwargs.get('limit', math.inf):
-                if self.debug:
-                    self.logger.debug(
-                        f'[{GREEN}success{RESET}] Returned {len(ids)} search results for {c}{query}{reset}')
-                return all_data
-            if self.debug:
-                self.logger.debug(f'{c}{query}{reset}')
-            config['cursor'] = next_cursor
-            data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-            if not data:
-                return all_data
-            data['query'] = query
-            if self.save:
-                (out / f'raw/{time.time_ns()}.json').write_text(
-                    orjson.dumps(data, option=orjson.OPT_INDENT_2).decode(),
-                    encoding='utf-8'
-                )
-            all_data.append(data)
-        return all_data
+        params = {
+            'variables': {
+                'count': 20,
+                'querySource': 'typed_query',
+            },
+            'features': Operation.default_features,
+            'fieldToggles': {'withArticleRichContentState': False},
+        }
+        params['variables']['rawQuery'] = query['query']
+        params['variables']['product'] = query['category']
+        data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+        total = set()
+        res = [*entries]
+        while True:
+            if cursor:
+                params['variables']['cursor'] = cursor
+            data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+            if len(entries) <= 2:  # just cursors
+                return res
+            res.extend(entries)
+            unq = set(find_key(entries, 'entryId'))
+            total |= unq
+            if self.debug:
+                self.logger.debug(f'{c}{query["query"]}{reset}')
+            if len(total) >= limit:
+                if self.debug:
+                    self.logger.debug(
+                        f'[{GREEN}success{RESET}] Returned {len(total)} search results for {c}{query["query"]}{reset}')
+                return res
+            if self.save:
+                (out / f'{time.time_ns()}.json').write_bytes(orjson.dumps(entries))

-    async def backoff(self, fn, info, **kwargs):
+    async def backoff(self, fn, **kwargs):
        retries = kwargs.get('retries', 3)
        for i in range(retries + 1):
            try:
-                data, next_cursor = await fn()
-                if not data.get('globalObjects', {}).get('tweets'):
-                    raise Exception
-                return data, next_cursor
+                data, entries, cursor = await fn()
+                if errors := data.get('errors'):
+                    for e in errors:
+                        self.logger.warning(f'{YELLOW}{e.get("message")}{RESET}')
+                    return [], [], ''
+                ids = set(find_key(data, 'entryId'))
+                if len(ids) >= 2:
+                    return data, entries, cursor
            except Exception as e:
                if i == retries:
                    if self.debug:
                        self.logger.debug(f'Max retries exceeded\n{e}')
-                    return None, None
+                    return
                t = 2 ** i + random.random()
                if self.debug:
-                    self.logger.debug(
-                        f'No data for: {BOLD}{info}{RESET}, retrying in {f"{t:.2f}"} seconds\t\t{e}')
-                time.sleep(t)
+                    self.logger.debug(f'Retrying in {f"{t:.2f}"} seconds\t\t{e}')
+                # time.sleep(t)
+                await asyncio.sleep(t)

-    async def get(self, session: AsyncClient, params: dict) -> tuple:
-        url = set_qs(self.api, params, update=True, safe='()')
-        r = await session.get(url)
+    async def get(self, client: AsyncClient, params: dict) -> tuple:
+        _, qid, name = Operation.SearchTimeline
+        r = await client.get(
+            f'https://twitter.com/i/api/graphql/{qid}/{name}',
+            params=build_params(params),
+        )
        data = r.json()
-        next_cursor = self.get_cursor(data)
-        return data, next_cursor
+        cursor = self.get_cursor(data)
+        entries = [y for x in find_key(data, 'entries') for y in x if re.search(r'^(tweet|user)-', y['entryId'])]
+        # add on query info
+        for e in entries:
+            e['query'] = params['variables']['rawQuery']
+        return data, entries, cursor

-    def get_cursor(self, res: dict):
-        try:
-            if live := find_key(res, 'value'):
-                if cursor := [x for x in live if 'scroll' in x]:
-                    return cursor[0]
-            for instr in res['timeline']['instructions']:
-                if replaceEntry := instr.get('replaceEntry'):
-                    cursor = replaceEntry['entry']['content']['operation']['cursor']
-                    if cursor['cursorType'] == 'Bottom':
-                        return cursor['value']
-                    continue
-                for entry in instr['addEntries']['entries']:
-                    if entry['entryId'] == 'cursor-bottom-0':
-                        return entry['content']['operation']['cursor']['value']
-        except Exception as e:
-            if self.debug:
-                self.logger.debug(e)
-
-    def make_output_dirs(self, path: str) -> Path:
-        p = Path(f'{path}')
-        (p / 'raw').mkdir(parents=True, exist_ok=True)
-        (p / 'processed').mkdir(parents=True, exist_ok=True)
-        (p / 'final').mkdir(parents=True, exist_ok=True)
-        return p
+    def get_cursor(self, data: list[dict]):
+        for e in find_key(data, 'content'):
+            if e.get('cursorType') == 'Bottom':
+                return e['value']

    def _init_logger(self, **kwargs) -> Logger:
        if kwargs.get('debug'):