import pandas as pd

from twitter.util import find_key
from twitter.scraper import Scraper


def parse_tweets(data: list | dict) -> pd.DataFrame:
    """
    Parse a small subset of relevant features into a DataFrame.

    Note: the structure of the GraphQL response is not consistent, so this example may not work in all cases.

    @param data: tweets (raw GraphQL response data)
    @return: DataFrame of tweets
    """
    df = (
        pd.json_normalize(
            (x.get('result', {}).get('tweet', {}).get('legacy')
             for x in find_key(data, 'tweet_results')),
            max_level=1
        )
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format="%a %b %d %H:%M:%S %z %Y"))
        .sort_values('created_at', ascending=False)
        .reset_index(drop=True)
    )
    numeric = [
        'user_id_str',
        'id_str',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
    ]
    df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')
    df = df[[
        'id_str',
        'user_id_str',
        'created_at',
        'full_text',
        'favorite_count',
        'quote_count',
        'reply_count',
        'retweet_count',
        'lang',
    ]]
    return df


if __name__ == '__main__':
    ## sign in with credentials
    email, username, password = ..., ..., ...
    scraper = Scraper(email, username, password)

    ## or, resume a session using cookies
    # scraper = Scraper(cookies={"ct0": ..., "auth_token": ...})

    tweets = scraper.tweets([
        ...,  # tweet ids
    ])

    df = parse_tweets(tweets)
    df.to_csv('tweets.csv')
    # df.to_parquet('tweets.parquet', engine='pyarrow')
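
## optional: as the docstring above warns, some `tweet_results` entries may omit
## the nested `tweet`/`legacy` objects. A more defensive sketch (not part of the
## library, same find_key() helper) drops empty records before normalizing:
#
# records = (
#     x.get('result', {}).get('tweet', {}).get('legacy')
#     for x in find_key(tweets, 'tweet_results')
# )
# df = pd.json_normalize([r for r in records if r], max_level=1)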