mirror of
https://github.com/trevorhobenshield/twitter-api-client.git
synced 2025-12-20 10:30:00 -05:00
63 lines
1.6 KiB
Python
import pandas as pd
|
|
from twitter.util import find_key
|
|
from twitter.scraper import Scraper
|
|
|
|
|
|
def parse_tweets(data: list | dict) -> pd.DataFrame:
    """
    Parse small subset of relevant features into a DataFrame.

    Note: structure of GraphQL response is not consistent, this example may not work in all cases.

    @param data: tweets (raw GraphQL response data)
    @return: DataFrame of tweets, sorted newest-first
    """
    legacy = []
    for x in find_key(data, 'tweet_results'):
        result = x.get('result', {})
        # Ordinary tweets expose their payload at result.legacy; limited-visibility
        # tweets (TweetWithVisibilityResults) nest it one level deeper at
        # result.tweet.legacy. Handle both, and skip entries with neither.
        rec = result.get('legacy') or result.get('tweet', {}).get('legacy')
        if rec:
            legacy.append(rec)
    df = (
        pd.json_normalize(legacy, max_level=1)
        # created_at arrives as e.g. "Wed Oct 10 20:19:24 +0000 2018"
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
        .sort_values('created_at', ascending=False)
        .reset_index(drop=True)
    )
    # ids and engagement counts arrive as strings; coerce unparseable values to NaN
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]
    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
    df = df[[
        'id_str',
        'user_id_str',
        'created_at',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]]
    return df
|
|
|
|
|
|
if __name__ == '__main__':
    # Option 1: authenticate with account credentials.
    email, username, password = ..., ..., ...
    scraper = Scraper(email, username, password)

    # Option 2: resume an existing session from cookies.
    # scraper = Scraper(cookies={"ct0": ..., "auth_token": ...})

    # Fetch the raw GraphQL payloads for the given tweet ids.
    tweet_ids = [
        ...,  # tweet ids
    ]
    tweets = scraper.tweets(tweet_ids)

    # Flatten into a tabular form and export.
    df = parse_tweets(tweets)
    df.to_csv('tweets.csv')
    # df.to_parquet('tweets.parquet', engine='pyarrow')