# twitter-api-client/examples/simple_example.py
import pandas as pd

from twitter.scraper import Scraper
from twitter.util import find_key


def parse_tweets(data: list | dict) -> pd.DataFrame:
"""
Parse small subset of relevant features into a DataFrame.
Note: structure of GraphQL response is not consistent, this example may not work in all cases.
@param data: tweets (raw GraphQL response data)
@return: DataFrame of tweets
"""
    df = (
        pd.json_normalize(
            (x.get('result', {}).get('tweet', {}).get('legacy') for x in find_key(data, 'tweet_results')),
            max_level=1
        )
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
        .sort_values('created_at', ascending=False)
        .reset_index(drop=True)
    )
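    # Cast the id and engagement-count columns to numeric dtypes; errors='coerce'
    # turns anything malformed into NaN instead of raising.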
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]
    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
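    # Keep only the columns of interest, in a fixed order.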
    df = df[[
        'id_str',
        'user_id_str',
        'created_at',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]]
    return df


if __name__ == '__main__':
    ## sign-in with credentials
    email, username, password = ..., ..., ...
    scraper = Scraper(email, username, password)

    ## or, resume session using cookies
    # scraper = Scraper(cookies={"ct0": ..., "auth_token": ...})

    tweets = scraper.tweets([
        ...,  # tweet ids
    ])
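    # `tweets` holds the raw GraphQL response data; parse_tweets digs the nested
    # tweet objects out of it via find_key(..., 'tweet_results').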

    df = parse_tweets(tweets)

    df.to_csv('tweets.csv')
    # df.to_parquet('tweets.parquet', engine='pyarrow')
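
    # A minimal follow-up sketch (plain pandas, not part of the library): reload the
    # CSV export later; parse_dates re-parses created_at as datetimes.
    # df = pd.read_csv('tweets.csv', index_col=0, parse_dates=['created_at'])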