add new GraphQL endpoint /SearchTimeline; api.twitter.com/2/search/adaptive.json is deprecated
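The user-facing change, in brief. This is a sketch assembled from the readme diff below; credentials and the query are placeholders:

```
from twitter.search import Search

email, username, password = ..., ..., ...
search = Search(email, username, password, save=True, debug=1)

# queries are now dicts pairing a search category (Top, Latest, People,
# Photos, Videos) with a query string, instead of bare positional strings
res = search.run(
    limit=50,
    queries=[{'category': 'Latest', 'query': 'paperswithcode -tensorflow -tf'}],
)
```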
examples/postprocess.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import re

import pandas as pd

from twitter.util import find_key


def get_tweets(data: list | dict, cols: list[str] = None):
    """
    Convert raw GraphQL response to DataFrame

    @param data: tweets
    @param cols: option to only include certain columns
    @return: DataFrame of tweets
    """
    entries = [y for x in find_key(data, 'entries') for y in x]
    # filter out promoted tweets (default '' guards against entries without an entryId)
    tweets = [x for x in entries if not x.get('entryId', '').startswith('promoted')]
    df = (
        pd.json_normalize(find_key(tweets, 'tweet_results'), max_level=1)
        ['result.legacy'].apply(pd.Series)
        .dropna(subset='user_id_str')
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
        .sort_values('created_at', ascending=False)
        .reset_index(drop=True)
    )
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]
    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
    cols = cols or [
        'id_str',
        'user_id_str',
        'created_at',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]
    return df[cols]


def get_tweets_urls(data: dict | list, expr: str, cols: list[str] = None) -> pd.DataFrame:
    """
    Convert raw GraphQL response to DataFrame

    Search for tweets containing specific urls by regex

    @param data: tweets
    @param expr: regex to match urls
    @param cols: option to only include certain columns
    @return: DataFrame of tweets matching the expression
    """
    tweet_results = find_key(data, 'tweet_results')
    results = []
    for res in tweet_results:
        legacy = res.get('result', {}).get('legacy', {})
        urls = find_key(res, 'expanded_url')
        if any(re.search(expr, x) for x in urls):
            results.append({'urls': urls} | legacy)
    try:
        df = (
            pd.DataFrame(results)
            .assign(date=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
            # sort on the parsed datetime column; `created_at` is still a raw string here
            .sort_values('date', ascending=False)
            .reset_index(drop=True)
        )
        numeric = [
            'user_id_str',
            'id_str',
            'favorite_count',
            'quote_count',
            'reply_count',
            'retweet_count',
        ]
        df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
        cols = cols or [
            'id_str',
            'user_id_str',
            'created_at',
            'urls',
            'full_text',
            'favorite_count',
            'quote_count',
            'reply_count',
            'retweet_count',
            'lang',
        ]
        return df[cols]
    except Exception as e:
        print(e)
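A minimal usage sketch for these helpers. The file path is a placeholder, and `get_tweets` assumes `data` is a full raw GraphQL response that still contains its `entries` arrays:

```
import orjson
from pathlib import Path

# assumes the script runs from the examples/ directory
from postprocess import get_tweets, get_tweets_urls

# placeholder path: a saved raw GraphQL response
data = orjson.loads(Path('data/raw/response.json').read_bytes())

df = get_tweets(data)                            # newest-first DataFrame of tweets
github = get_tweets_urls(data, r'github\.com')   # only tweets linking to github.com
print(df[['created_at', 'full_text']].head())
```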
readme.md (48 lines changed)
@@ -296,28 +296,34 @@ follower_subset, last_cursor = scraper.followers([user_id], limit=limit, cursor=
 from twitter.search import Search

 email, username, password = ..., ..., ...
-# default output directory is `data/raw` if save=True
-search = Search(email, username, password)
+# default output directory is `data/search_results` if save=True
+search = Search(email, username, password, save=True, debug=1)

-latest_results = search.run(
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'ios android',
-    limit=100,
-    latest=True,  # get latest tweets only
-    retries=3,
-)
-
-general_results = search.run(
-    '(#dogs OR #cats) min_retweets:500',
-    'min_faves:10000 @elonmusk until:2023-02-16 since:2023-02-01',
-    'brasil portugal -argentina',
-    'paperswithcode -tensorflow -tf',
-    'skateboarding baseball guitar',
-    'cheese bread butter',
-    'ios android',
-    limit=100,
-    retries=7,
+res = search.run(
+    limit=37,
+    retries=5,
+    queries=[
+        {
+            'category': 'Top',
+            'query': 'paperswithcode -tensorflow -tf'
+        },
+        {
+            'category': 'Latest',
+            'query': 'test'
+        },
+        {
+            'category': 'People',
+            'query': 'brasil portugal -argentina'
+        },
+        {
+            'category': 'Photos',
+            'query': 'greece'
+        },
+        {
+            'category': 'Videos',
+            'query': 'italy'
+        },
+    ],
 )
 ```
setup.py (48 lines changed)
@@ -315,28 +315,34 @@ setup(
[same change as the readme.md hunk above; the search example is duplicated verbatim inside setup.py]
twitter/__init__.py
@@ -1,5 +1,5 @@
 __title__ = "twitter-api-client"
 __description__ = "Implementation of Twitter's v1, v2, and GraphQL APIs."
-__version__ = "0.10.6"
+__version__ = "0.10.7"
 __author__ = "Trevor Hobenshield"
 __license__ = "MIT"
twitter/constants.py
@@ -65,6 +65,15 @@ ID_MAP = {
 }


+@dataclass
+class SearchCategory:
+    Top = 'Top'
+    Latest = 'Latest'
+    People = 'People'
+    Photos = 'Photos'
+    Videos = 'Videos'
+
+
 @dataclass
 class SpaceCategory:
     Top = 'Top'
@@ -85,6 +94,7 @@ class SpaceState:
 @dataclass
 class Operation:
     # todo: dynamically update
+    SearchTimeline = {'rawQuery': str, 'product': str}, 'nK1dw4oV3k4w5TdtcAdSww', 'SearchTimeline'
     AudioSpaceById = {'id': str}, 'fYAuJHiY3TmYdBmrRtIKhA', 'AudioSpaceById'
     AudioSpaceSearch = {'filter': str, 'query': str}, 'NTq79TuSz6fHj8lQaferJw', 'AudioSpaceSearch',
     UserByScreenName = {'screen_name': str}, 'sLVLhk0bGj3MVFEKTdax1w', 'UserByScreenName'
@@ -272,7 +282,6 @@ class Operation:
     RitoFlaggedAccountsTimeline = 'lMzaBZHIbD6GuPqJJQubMg', 'RitoFlaggedAccountsTimeline'
     RitoFlaggedTweetsTimeline = 'iCuXMibh6yj9AelyjKXDeA', 'RitoFlaggedTweetsTimeline'
     RitoSuggestedActionsFacePile = 'GnQKeEdL1LyeK3dTQCS1yw', 'RitoSuggestedActionsFacePile'
-    SearchTimeline = 'gkjsKepM6gl_HmFWoWKfgg', 'SearchTimeline'
     SetDefault = 'QEMLEzEMzoPNbeauKCCLbg', 'SetDefault'
     SetSafetyModeSettings = 'qSJIPIpf4gA7Wn21bT3D4w', 'SetSafetyModeSettings'
     SharingAudiospacesListeningDataWithFollowersUpdate = '5h0kNbk3ii97rmfY6CdgAA', 'SharingAudiospacesListeningDataWithFollowersUpdate'
@@ -351,40 +360,42 @@ class Operation:
         'withMessages': True,
     }
     default_features = {
-        "blue_business_profile_image_shape_enabled": True,
-        "creator_subscriptions_tweet_preview_api_enabled": True,
-        "freedom_of_speech_not_reach_fetch_enabled": False,
-        "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
-        "graphql_timeline_v2_bookmark_timeline": True,
-        "hidden_profile_likes_enabled": True,
-        "highlights_tweets_tab_ui_enabled": True,
-        "interactive_text_enabled": True,
-        "longform_notetweets_consumption_enabled": True,
-        "longform_notetweets_inline_media_enabled": False,
-        "longform_notetweets_rich_text_read_enabled": True,
-        "longform_notetweets_richtext_consumption_enabled": True,
-        "profile_foundations_tweet_stats_enabled": True,
-        "profile_foundations_tweet_stats_tweet_frequency": True,
-        "responsive_web_birdwatch_note_limit_enabled": True,
-        "responsive_web_edit_tweet_api_enabled": True,
-        "responsive_web_enhance_cards_enabled": False,
-        "responsive_web_graphql_exclude_directive_enabled": True,
-        "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
-        "responsive_web_graphql_timeline_navigation_enabled": True,
-        "responsive_web_text_conversations_enabled": False,
-        "responsive_web_twitter_article_data_v2_enabled": True,
-        "responsive_web_twitter_blue_verified_badge_is_enabled": True,
-        "rweb_lists_timeline_redesign_enabled": True,
-        "spaces_2022_h2_clipping": True,
-        "spaces_2022_h2_spaces_communities": True,
-        "standardized_nudges_misinfo": True,
-        "subscriptions_verification_info_verified_since_enabled": True,
-        "tweet_awards_web_tipping_enabled": False,
-        "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
-        "tweetypie_unmention_optimization_enabled": True,
-        "verified_phone_label_enabled": False,
-        "vibe_api_enabled": True,
-        "view_counts_everywhere_api_enabled": True,
+        'blue_business_profile_image_shape_enabled': True,
+        'creator_subscriptions_tweet_preview_api_enabled': True,
+        'freedom_of_speech_not_reach_fetch_enabled': True,
+        'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True,
+        'graphql_timeline_v2_bookmark_timeline': True,
+        'hidden_profile_likes_enabled': True,
+        'highlights_tweets_tab_ui_enabled': True,
+        'interactive_text_enabled': True,
+        'longform_notetweets_consumption_enabled': True,
+        'longform_notetweets_inline_media_enabled': True,
+        'longform_notetweets_rich_text_read_enabled': True,
+        'longform_notetweets_richtext_consumption_enabled': True,
+        'profile_foundations_tweet_stats_enabled': True,
+        'profile_foundations_tweet_stats_tweet_frequency': True,
+        'responsive_web_birdwatch_note_limit_enabled': True,
+        'responsive_web_edit_tweet_api_enabled': True,
+        'responsive_web_enhance_cards_enabled': False,
+        'responsive_web_graphql_exclude_directive_enabled': True,
+        'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
+        'responsive_web_graphql_timeline_navigation_enabled': True,
+        'responsive_web_media_download_video_enabled': False,
+        'responsive_web_text_conversations_enabled': False,
+        'responsive_web_twitter_article_data_v2_enabled': True,
+        'responsive_web_twitter_article_tweet_consumption_enabled': False,
+        'responsive_web_twitter_blue_verified_badge_is_enabled': True,
+        'rweb_lists_timeline_redesign_enabled': True,
+        'spaces_2022_h2_clipping': True,
+        'spaces_2022_h2_spaces_communities': True,
+        'standardized_nudges_misinfo': True,
+        'subscriptions_verification_info_verified_since_enabled': True,
+        'tweet_awards_web_tipping_enabled': False,
+        'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True,
+        'tweetypie_unmention_optimization_enabled': True,
+        'verified_phone_label_enabled': False,
+        'vibe_api_enabled': True,
+        'view_counts_everywhere_api_enabled': True
     }
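For orientation, a small sketch of how these constants are consumed; the tuple unpacking and URL shape come from `Search.get` in the diff below:

```
from twitter.constants import Operation

# Operation.SearchTimeline is a (params-spec, query-id, operation-name) tuple
params_spec, qid, name = Operation.SearchTimeline
url = f'https://twitter.com/i/api/graphql/{qid}/{name}'
print(url)  # https://twitter.com/i/api/graphql/nK1dw4oV3k4w5TdtcAdSww/SearchTimeline
```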
twitter/search.py
@@ -13,7 +13,7 @@ from httpx import AsyncClient, Client

 from .constants import *
 from .login import login
-from .util import set_qs, get_headers, find_key
+from .util import get_headers, find_key, build_params

 reset = '\u001b[0m'
 colors = [f'\u001b[{i}m' for i in range(30, 38)]
@@ -39,101 +39,99 @@ class Search:
     def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
         self.save = kwargs.get('save', True)
         self.debug = kwargs.get('debug', 0)
-        self.api = 'https://api.twitter.com/2/search/adaptive.json?'
         self.logger = self._init_logger(**kwargs)
         self.session = self._validate_session(email, username, password, session, **kwargs)

-    def run(self, *args, out: str = 'data', **kwargs):
-        out_path = self.make_output_dirs(out)
-        if kwargs.get('latest', False):
-            search_config['tweet_search_mode'] = 'live'
-        return asyncio.run(self.process(args, search_config, out_path, **kwargs))
+    def run(self, queries: list[dict], limit: int = math.inf, **kwargs):
+        out = Path('data/search_results')
+        out.mkdir(parents=True, exist_ok=True)
+        return asyncio.run(self.process(queries, limit, out, **kwargs))

-    async def process(self, queries: tuple, config: dict, out: Path, **kwargs) -> list:
+    async def process(self, queries: list[dict], limit: int, out: Path, **kwargs) -> list:
         async with AsyncClient(headers=get_headers(self.session)) as s:
-            return await asyncio.gather(*(self.paginate(q, s, config, out, **kwargs) for q in queries))
+            return await asyncio.gather(*(self.paginate(s, q, limit, out, **kwargs) for q in queries))

-    async def paginate(self, query: str, session: AsyncClient, config: dict, out: Path, **kwargs) -> list[dict]:
-        config['q'] = query
-        data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-        all_data = [data]
+    async def paginate(self, client: AsyncClient, query: dict, limit: int, out: Path, **kwargs) -> list[dict]:
         c = colors.pop() if colors else ''
-        ids = set()
-        while next_cursor:
-            ids |= set(data['globalObjects']['tweets'])
-            if len(ids) >= kwargs.get('limit', math.inf):
-                if self.debug:
-                    self.logger.debug(
-                        f'[{GREEN}success{RESET}] Returned {len(ids)} search results for {c}{query}{reset}')
-                return all_data
-            if self.debug:
-                self.logger.debug(f'{c}{query}{reset}')
-            config['cursor'] = next_cursor
-
-            data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
-            if not data:
-                return all_data
-
-            data['query'] = query
-            if self.save:
-                (out / f'raw/{time.time_ns()}.json').write_text(
-                    orjson.dumps(data, option=orjson.OPT_INDENT_2).decode(),
-                    encoding='utf-8'
-                )
-            all_data.append(data)
-        return all_data
+        params = {
+            'variables': {
+                'count': 20,
+                'querySource': 'typed_query',
+            },
+            'features': Operation.default_features,
+            'fieldToggles': {'withArticleRichContentState': False},
+        }
+        params['variables']['rawQuery'] = query['query']
+        params['variables']['product'] = query['category']
+
+        data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+
+        total = set()
+        res = [*entries]
+        while True:
+            if cursor:
+                params['variables']['cursor'] = cursor
+
+            data, entries, cursor = await self.backoff(lambda: self.get(client, params), **kwargs)
+
+            if len(entries) <= 2:  # just cursors
+                return res
+
+            res.extend(entries)
+            unq = set(find_key(entries, 'entryId'))
+            total |= unq
+
+            if self.debug:
+                self.logger.debug(f'{c}{query["query"]}{reset}')
+
+            if len(total) >= limit:
+                if self.debug:
+                    self.logger.debug(
+                        f'[{GREEN}success{RESET}] Returned {len(total)} search results for {c}{query["query"]}{reset}')
+                return res
+
+            if self.save:
+                (out / f'{time.time_ns()}.json').write_bytes(orjson.dumps(entries))

-    async def backoff(self, fn, info, **kwargs):
+    async def backoff(self, fn, **kwargs):
         retries = kwargs.get('retries', 3)
         for i in range(retries + 1):
             try:
-                data, next_cursor = await fn()
-                if not data.get('globalObjects', {}).get('tweets'):
-                    raise Exception
-                return data, next_cursor
+                data, entries, cursor = await fn()
+                if errors := data.get('errors'):
+                    for e in errors:
+                        self.logger.warning(f'{YELLOW}{e.get("message")}{RESET}')
+                    return [], [], ''
+                ids = set(find_key(data, 'entryId'))
+                if len(ids) >= 2:
+                    return data, entries, cursor
             except Exception as e:
                 if i == retries:
-                    if self.debug:
-                        self.logger.debug(f'Max retries exceeded\n{e}')
-                    return None, None
+                    self.logger.debug(f'Max retries exceeded\n{e}')
+                    return
                 t = 2 ** i + random.random()
-                if self.debug:
-                    self.logger.debug(
-                        f'No data for: {BOLD}{info}{RESET}, retrying in {f"{t:.2f}"} seconds\t\t{e}')
-                time.sleep(t)
+                self.logger.debug(f'Retrying in {f"{t:.2f}"} seconds\t\t{e}')
+                # time.sleep(t)
+                await asyncio.sleep(t)

-    async def get(self, session: AsyncClient, params: dict) -> tuple:
-        url = set_qs(self.api, params, update=True, safe='()')
-        r = await session.get(url)
+    async def get(self, client: AsyncClient, params: dict) -> tuple:
+        _, qid, name = Operation.SearchTimeline
+        r = await client.get(
+            f'https://twitter.com/i/api/graphql/{qid}/{name}',
+            params=build_params(params),
+        )
         data = r.json()
-        next_cursor = self.get_cursor(data)
-        return data, next_cursor
+        cursor = self.get_cursor(data)
+        entries = [y for x in find_key(data, 'entries') for y in x if re.search(r'^(tweet|user)-', y['entryId'])]
+        # add on query info
+        for e in entries:
+            e['query'] = params['variables']['rawQuery']
+        return data, entries, cursor

-    def get_cursor(self, res: dict):
-        try:
-            if live := find_key(res, 'value'):
-                if cursor := [x for x in live if 'scroll' in x]:
-                    return cursor[0]
-            for instr in res['timeline']['instructions']:
-                if replaceEntry := instr.get('replaceEntry'):
-                    cursor = replaceEntry['entry']['content']['operation']['cursor']
-                    if cursor['cursorType'] == 'Bottom':
-                        return cursor['value']
-                    continue
-                for entry in instr['addEntries']['entries']:
-                    if entry['entryId'] == 'cursor-bottom-0':
-                        return entry['content']['operation']['cursor']['value']
-        except Exception as e:
-            if self.debug:
-                self.logger.debug(e)
-
-    def make_output_dirs(self, path: str) -> Path:
-        p = Path(f'{path}')
-        (p / 'raw').mkdir(parents=True, exist_ok=True)
-        (p / 'processed').mkdir(parents=True, exist_ok=True)
-        (p / 'final').mkdir(parents=True, exist_ok=True)
-        return p
+    def get_cursor(self, data: list[dict]):
+        for e in find_key(data, 'content'):
+            if e.get('cursorType') == 'Bottom':
+                return e['value']

     def _init_logger(self, **kwargs) -> Logger:
         if kwargs.get('debug'):
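A quick illustration of what the new `get_cursor` scans for. The entry shapes below are a hypothetical, heavily trimmed SearchTimeline page, not an exact API response:

```
from twitter.util import find_key

# hypothetical, trimmed entries; real ones carry full tweet payloads
entries = [
    {'entryId': 'tweet-1', 'content': {'entryType': 'TimelineTimelineItem'}},
    {'entryId': 'cursor-top-0', 'content': {'cursorType': 'Top', 'value': 'AAA'}},
    {'entryId': 'cursor-bottom-0', 'content': {'cursorType': 'Bottom', 'value': 'BBB'}},
]

# mirrors Search.get_cursor: the first 'content' dict flagged Bottom
# supplies the pagination token for the next request
for e in find_key(entries, 'content'):
    if e.get('cursorType') == 'Bottom':
        print(e['value'])  # BBB
        break
```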