Files
twitter-api-client/examples/postprocess.py

97 lines
2.8 KiB
Python

import re
import pandas as pd
from twitter.util import find_key
def get_tweets(data: list | dict, cols: list[str] | None = None) -> pd.DataFrame:
    """
    Convert raw GraphQL response to DataFrame

    Collects every 'entries' list found in the response, discards promoted
    entries, and expands the 'legacy' payload of each 'tweet_results' object
    into columns. Rows are sorted by 'created_at', newest first.

    NOTE(review): find_key comes from twitter.util; it is assumed to return a
    list of every value stored under the given key anywhere in the nested
    structure - confirm against its definition.

    @param data: tweets
    @param cols: option to only include certain columns
    @return: DataFrame of tweets (empty, with the requested columns, when the
        response contains no usable tweets)
    """
    columns = cols or [
        'id_str',
        'user_id_str',
        'created_at',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]

    # each hit for 'entries' is itself a list, so flatten one level
    entries = [entry for group in find_key(data, 'entries') for entry in group]
    # filter out promoted tweets; an entry without an entryId is kept instead
    # of raising AttributeError on None
    tweets = [
        entry for entry in entries
        if not (entry.get('entryId') or '').startswith('promoted')
    ]

    normalized = pd.json_normalize(find_key(tweets, 'tweet_results'), max_level=1)
    # nothing to expand: empty response, or no result carried a 'legacy' payload
    if 'result.legacy' not in normalized.columns:
        return pd.DataFrame(columns=columns)

    legacy = normalized['result.legacy'].apply(pd.Series)
    # every 'legacy' value was missing/NaN, so no tweet fields were produced
    if 'user_id_str' not in legacy.columns:
        return pd.DataFrame(columns=columns)

    df = (
        legacy
        # rows without a user id are results that had no 'legacy' payload
        .dropna(subset=['user_id_str'])
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
        .sort_values('created_at', ascending=False)
        .reset_index(drop=True)
    )
    # ids and counters arrive as strings/objects; unparseable values become NaN
    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
    return df[columns]
def get_tweets_urls(data: dict | list, expr: str, cols: list[str] | None = None) -> pd.DataFrame:
    """
    Convert raw GraphQL response to DataFrame
    Search for tweets containing specific urls by regex

    Each 'tweet_results' object found in the response is scanned for
    'expanded_url' values; a tweet is kept when at least one of them matches
    `expr`. The matching tweet's 'legacy' fields become columns, together with
    a 'urls' column (all expanded urls of the tweet) and a 'date' column (the
    parsed 'created_at'). Rows are sorted by 'date', newest first;
    'created_at' itself is left as the original string.

    NOTE(review): find_key comes from twitter.util; it is assumed to return a
    list of every value stored under the given key anywhere in the nested
    structure - confirm against its definition.

    @param data: tweets
    @param expr: regex to match urls
    @param cols: option to only include certain columns
    @return: DataFrame of tweets matching the expression; an empty DataFrame
        with the requested columns when nothing matches or the matched data
        cannot be processed
    """
    columns = cols or [
        'id_str',
        'user_id_str',
        'created_at',
        'urls',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]

    # compile once instead of re-resolving the pattern for every url
    pattern = re.compile(expr)
    results = []
    for res in find_key(data, 'tweet_results'):
        legacy = res.get('result', {}).get('legacy', {})
        urls = find_key(res, 'expanded_url')
        if any(pattern.search(url) for url in urls):
            results.append({'urls': urls} | legacy)

    # no matches is a normal outcome, not an error: return an empty frame
    # rather than failing on the missing 'created_at' column
    if not results:
        return pd.DataFrame(columns=columns)

    try:
        df = (
            pd.DataFrame(results)
            .assign(date=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
            # sort on the parsed timestamp; sorting the raw 'created_at'
            # strings would order rows alphabetically by weekday name
            .sort_values('date', ascending=False)
            .reset_index(drop=True)
        )
        # ids and counters arrive as strings/objects; unparseable values become NaN
        df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
        return df[columns]
    except (KeyError, ValueError, TypeError) as e:
        # best-effort, as before: report malformed data (missing columns,
        # unparseable dates) instead of raising, but still honour the
        # declared return type
        print(e)
        return pd.DataFrame(columns=columns)