# twitter-api-client/twitter/scraper.py
import asyncio
import logging.config
import math
import platform
import random
import re
import sys
import time
from functools import partial
from logging import Logger
from pathlib import Path
from typing import Generator
from urllib.parse import urlsplit

import aiofiles
import orjson
import websockets
from httpx import AsyncClient, Client, Limits, ReadTimeout, Response, URL
from tqdm.asyncio import tqdm_asyncio

from .constants import *
from .login import login
from .util import *
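# If running inside a Jupyter/IPython kernel, patch the event loop so the
# asyncio.run() calls below can nest inside the notebook's already-running loop.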
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
nest_asyncio.apply()
except:
...
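# On non-Windows platforms, use uvloop's faster event loop when it is installed.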
if platform.system() != 'Windows':
try:
import uvloop
uvloop.install()
except ImportError as e:
...
class Scraper:
def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
self.save = kwargs.get('save', True)
self.debug = kwargs.get('debug', 0)
self.pbar = kwargs.get('pbar', True)
self.out = Path(kwargs.get('out', 'data'))
self.guest = False
self.logger = self._init_logger(**kwargs)
self.session = self._validate_session(email, username, password, session, **kwargs)
self.rate_limits = {}
def users(self, screen_names: list[str], **kwargs) -> list[dict]:
"""
Get user data by screen names.
@param screen_names: list of screen names (usernames)
@param kwargs: optional keyword arguments
@return: list of user data as dicts
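
Example (illustrative only; the cookie values and screen names are placeholders):
    scraper = Scraper(cookies={'ct0': '<ct0>', 'auth_token': '<auth_token>'})
    profiles = scraper.users(['user1', 'user2'])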
"""
return self._run(Operation.UserByScreenName, screen_names, **kwargs)
def tweets_by_id(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.TweetResultByRestId, tweet_ids, **kwargs)
def tweets_by_ids(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
Special batch query for tweet data. Most efficient way to get tweets.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
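
Example (illustrative; the tweet ids are placeholders):
    tweets = scraper.tweets_by_ids([1234567890123456789, 1234567890123456790])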
"""
return self._run(Operation.TweetResultsByRestIds, batch_ids(tweet_ids), **kwargs)
def tweets_details(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweet data by tweet ids.
Includes tweet metadata as well as comments, replies, etc.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.TweetDetail, tweet_ids, **kwargs)
def tweets(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweets by user ids.
Metadata for a user's tweets.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.UserTweets, user_ids, **kwargs)
def tweets_and_replies(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweets and replies by user ids.
Tweet metadata, including replies.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.UserTweetsAndReplies, user_ids, **kwargs)
def media(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get media by user ids.
Tweet metadata, filtered for tweets containing media.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.UserMedia, user_ids, **kwargs)
def likes(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get likes by user ids.
Tweet metadata for tweets liked by users.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.Likes, user_ids, **kwargs)
def followers(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get followers by user ids.
User data for a user's followers list.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of user data as dicts
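
Example (illustrative; the user id is a placeholder, `limit` caps how many results are paginated):
    followers = scraper.followers([123456789], limit=1000)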
"""
return self._run(Operation.Followers, user_ids, **kwargs)
def following(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get following by user ids.
User metadata for a user's following list.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of user data as dicts
"""
return self._run(Operation.Following, user_ids, **kwargs)
def favoriters(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get favoriters by tweet ids.
User data for users who liked these tweets.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of user data as dicts
"""
return self._run(Operation.Favoriters, tweet_ids, **kwargs)
def retweeters(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get retweeters by tweet ids.
User data for users who retweeted these tweets.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of user data as dicts
"""
return self._run(Operation.Retweeters, tweet_ids, **kwargs)
def tweet_stats(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweet statistics by user ids.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of tweet statistics as dicts
"""
return self._run(Operation.TweetStats, user_ids, **kwargs)
def users_by_ids(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get user data by user ids.
Special batch query for user data. Most efficient way to get user data.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of user data as dicts
"""
return self._run(Operation.UsersByRestIds, batch_ids(user_ids), **kwargs)
def recommended_users(self, user_ids: list[int] = None, **kwargs) -> list[dict]:
"""
Get recommended users by user ids, or general recommendations if no user ids are provided.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of recommended users data as dicts
"""
if user_ids:
contexts = [{"context": orjson.dumps({"contextualUserId": x}).decode()} for x in user_ids]
else:
contexts = [{'context': None}]
return self._run(Operation.ConnectTabTimeline, contexts, **kwargs)
def profile_spotlights(self, screen_names: list[str], **kwargs) -> list[dict]:
"""
Get user data by screen names.
This endpoint is included for completeness only.
Use the batched query `users_by_ids` instead if you wish to pull user profile data.
@param screen_names: list of user screen names (usernames)
@param kwargs: optional keyword arguments
@return: list of user data as dicts
"""
return self._run(Operation.ProfileSpotlightsQuery, screen_names, **kwargs)
def users_by_id(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
Get user data by user ids.
This endpoint is included for completeness only.
Use the batched query `users_by_ids` instead if you wish to pull user profile data.
@param user_ids: list of user ids
@param kwargs: optional keyword arguments
@return: list of user data as dicts
"""
return self._run(Operation.UserByRestId, user_ids, **kwargs)
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, cards: bool = True, hq_img_variant: bool = True, video_thumb: bool = False, out: str = 'media',
metadata_out: str = 'media.json', **kwargs) -> dict:
"""
Download and extract media metadata from Tweets
@param ids: list of Tweet IDs
@param photos: download images
@param videos: download videos
@param cards: download cards
@param hq_img_variant: download the highest quality image variant (the "orig" size; "4096x4096" is the other high-quality option)
@param video_thumb: download video thumbnails
@param out: output file for media
@param metadata_out: output file for media metadata
@return: media data
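
Example (illustrative; the tweet id and output paths are placeholders):
    scraper.download_media([1234567890123456789], out='media', metadata_out='media/metadata.json')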
"""
async def process(fns: Generator) -> list:
limits = {
'max_connections': kwargs.pop('max_connections', 1000),
'max_keepalive_connections': kwargs.pop('max_keepalive_connections', None),
'keepalive_expiry': kwargs.pop('keepalive_expiry', 5.0),
}
headers = {'user-agent': random.choice(USER_AGENTS)}
async with AsyncClient(limits=Limits(**limits), headers=headers, http2=True, verify=False, timeout=60, follow_redirects=True) as client:
return await tqdm_asyncio.gather(*(fn(client=client) for fn in fns), desc='Downloading Media')
def download(urls: list[tuple], out: str) -> Generator:
out = Path(out)
out.mkdir(parents=True, exist_ok=True)
chunk_size = kwargs.pop('chunk_size', None)
async def get(client: AsyncClient, url: str):
tid, cdn_url = url
ext = urlsplit(cdn_url).path.split('/')[-1]
fname = out / f'{tid}_{ext}'
async with aiofiles.open(fname, 'wb') as fp:
async with client.stream('GET', cdn_url) as r:
async for chunk in r.aiter_raw(chunk_size):
await fp.write(chunk)
return (partial(get, url=u) for u in urls)
tweets = self.tweets_by_ids(ids, **kwargs)
media = {}
for data in tweets:
for tweet in data.get('data', {}).get('tweetResult', []):
# TweetWithVisibilityResults and Tweet have different structures
root = tweet.get('result', {}).get('tweet', {}) or tweet.get('result', {})
if _id := root.get('rest_id'):
date = root.get('legacy', {}).get('created_at', '')
uid = root.get('legacy', {}).get('user_id_str', '')
media[_id] = {'date': date, 'uid': uid, 'img': set(), 'video': {'thumb': set(), 'video_info': {}, 'hq': set()}, 'card': []}
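# find_key returns nested lists of media entities; flatten them before filtering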
for _media in (y for x in find_key(root, 'media') for y in x if isinstance(x, list)):
if videos:
if vinfo := _media.get('video_info'):
hq = sorted(vinfo.get('variants', []), key=lambda x: -x.get('bitrate', 0))[0]['url']
media[_id]['video']['video_info'] |= vinfo
media[_id]['video']['hq'].add(hq)
if video_thumb:
if url := _media.get('media_url_https', ''):
media[_id]['video']['thumb'].add(url)
if photos:
if (url := _media.get('media_url_https', '')) and "_video_thumb" not in url:
if hq_img_variant:
url = f'{url}?name=orig'
media[_id]['img'].add(url)
if cards:
if card := root.get('card', {}).get('legacy', {}):
media[_id]['card'].extend(card.get('binding_values', []))
if metadata_out:
media = set2list(media)
metadata_out = Path(metadata_out)
metadata_out.parent.mkdir(parents=True, exist_ok=True) # if user specifies subdir
metadata_out.write_bytes(orjson.dumps(media))
res = []
for k, v in media.items():
tmp = []
if photos:
tmp.extend(v['img'])
if videos:
tmp.extend(v['video']['hq'])
if video_thumb:
tmp.extend(v['video']['thumb'])
if cards:
tmp.extend(parse_card_media(v['card']))
res.extend([(k, m) for m in tmp])
asyncio.run(process(download(res, out)))
return media
def trends(self, utc: list[str] = None) -> dict:
"""
Get trends for all UTC offsets
@param utc: optional list of specific UTC offsets
@return: dict of trends
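
Example (illustrative; limits the query to two UTC offsets):
    trends = scraper.trends(utc=['+0000', '-0500'])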
"""
async def get_trends(client: AsyncClient, offset: str, url: str):
try:
client.headers['x-twitter-utcoffset'] = offset
r = await client.get(url)
trends = find_key(r.json(), 'item')
return {t['content']['trend']['name']: t for t in trends}
except Exception as e:
if self.debug:
self.logger.error(f'[{RED}error{RESET}] Failed to get trends\n{e}')
async def process():
url = set_qs('https://twitter.com/i/api/2/guide.json', trending_params)
offsets = utc or ["-1200", "-1100", "-1000", "-0900", "-0800", "-0700", "-0600", "-0500", "-0400", "-0300",
"-0200", "-0100", "+0000", "+0100", "+0200", "+0300", "+0400", "+0500", "+0600", "+0700",
"+0800", "+0900", "+1000", "+1100", "+1200", "+1300", "+1400"]
async with AsyncClient(headers=get_headers(self.session)) as client:
tasks = (get_trends(client, o, url) for o in offsets)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Getting trends')
return await asyncio.gather(*tasks)
trends = asyncio.run(process())
out = self.out / 'raw' / 'trends'
out.mkdir(parents=True, exist_ok=True)
(out / f'{time.time_ns()}.json').write_text(orjson.dumps(
{k: v for d in trends for k, v in d.items()},
option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode(), encoding='utf-8')
return trends
def spaces(self, *, rooms: list[str] = None, search: list[dict] = None, audio: bool = False, chat: bool = False,
**kwargs) -> list[dict]:
"""
Get Twitter spaces data
- Get data for specific rooms or search for rooms.
- Get audio and/or chat data for rooms.
@param rooms: list of room ids
@param search: list of dicts containing search parameters
@param audio: flag to include audio data
@param chat: flag to include chat data
@param kwargs: optional keyword arguments
@return: list of spaces data
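
Example (illustrative; the room id is a placeholder):
    data = scraper.spaces(rooms=['1AbCdEfGhIjKl'], audio=True, chat=True)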
"""
if rooms:
spaces = self._run(Operation.AudioSpaceById, rooms, **kwargs)
else:
res = self._run(Operation.AudioSpaceSearch, search, **kwargs)
search_results = set(find_key(res, 'rest_id'))
spaces = self._run(Operation.AudioSpaceById, search_results, **kwargs)
if audio or chat:
return self._get_space_data(spaces, audio, chat)
return spaces
def _get_space_data(self, spaces: list[dict], audio=True, chat=True):
streams = self._check_streams(spaces)
chat_data = None
if chat:
temp = [] # get necessary keys instead of passing large dicts
for stream in filter(lambda x: x['stream'], streams):
meta = stream['space']['data']['audioSpace']['metadata']
if meta['state'] not in {SpaceState.Running, SpaceState.NotStarted}:
temp.append({
'rest_id': meta['rest_id'],
'chat_token': stream['stream']['chatToken'],
'media_key': meta['media_key'],
'state': meta['state'],
})
chat_data = self._get_chat_data(temp)
if audio:
temp = []
for stream in streams:
if stream.get('stream'):
chunks = self._get_chunks(stream['stream']['source']['location'])
temp.append({
'rest_id': stream['space']['data']['audioSpace']['metadata']['rest_id'],
'chunks': chunks,
})
self._download_audio(temp)
return chat_data
async def _get_stream(self, client: AsyncClient, media_key: str) -> dict | None:
params = {
'client': 'web',
'use_syndication_guest_id': 'false',
'cookie_set_host': 'twitter.com',
}
url = f'https://twitter.com/i/api/1.1/live_video_stream/status/{media_key}'
try:
r = await client.get(url, params=params)
return r.json()
except Exception as e:
if self.debug:
self.logger.error(f'stream not available for playback\n{e}')
async def _init_chat(self, client: AsyncClient, chat_token: str) -> dict:
payload = {'chat_token': chat_token} # stream['chatToken']
url = 'https://proxsee.pscp.tv/api/v2/accessChatPublic'
r = await client.post(url, json=payload)
return r.json()
async def _get_chat(self, client: AsyncClient, endpoint: str, access_token: str, cursor: str = '') -> list[dict]:
payload = {
'access_token': access_token,
'cursor': cursor,
'limit': 1000, # or 0
'since': None,
'quick_get': True,
}
url = f"{endpoint}/chatapi/v1/history"
r = await client.post(url, json=payload)
data = r.json()
res = [data]
while cursor := data.get('cursor'):
try:
r = await client.post(url, json=payload | {'cursor': cursor})
if r.status_code == 503:
# not our fault, service error, something went wrong with the stream
break
data = r.json()
res.append(data)
except ReadTimeout as e:
if self.debug:
self.logger.debug(f'End of chat data\n{e}')
break
parsed = []
for r in res:
messages = r.get('messages', [])
for msg in messages:
try:
msg['payload'] = orjson.loads(msg.get('payload', '{}'))
msg['payload']['body'] = orjson.loads(msg['payload'].get('body'))
except Exception as e:
if self.debug:
self.logger.error(f'Failed to parse chat message\n{e}')
parsed.extend(messages)
return parsed
def _get_chunks(self, location: str) -> list[str]:
try:
url = URL(location)
stream_type = url.params.get('type')
r = self.session.get(
url=location,
params={'type': stream_type},
headers={'authority': url.host}
)
# don't need an m3u8 parser
chunks = re.findall('\n(chunk_.*)\n', r.text, flags=re.I)
url = '/'.join(location.split('/')[:-1])
return [f'{url}/{chunk}' for chunk in chunks]
except Exception as e:
if self.debug:
self.logger.error(f'Failed to get chunks\n{e}')
def _get_chat_data(self, keys: list[dict]) -> list[dict]:
async def get(c: AsyncClient, key: dict) -> dict:
info = await self._init_chat(c, key['chat_token'])
chat = await self._get_chat(c, info['endpoint'], info['access_token'])
if self.save:
(self.out / 'raw' / f"chat_{key['rest_id']}.json").write_bytes(orjson.dumps(chat))
return {
'space': key['rest_id'],
'chat': chat,
'info': info,
}
async def process():
(self.out / 'raw').mkdir(parents=True, exist_ok=True)
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
tasks = (get(c, key) for key in keys)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Downloading chat data')
return await asyncio.gather(*tasks)
return asyncio.run(process())
def _download_audio(self, data: list[dict]) -> None:
async def get(s: AsyncClient, chunk: str, rest_id: str) -> tuple:
r = await s.get(chunk)
return rest_id, r
async def process(data: list[dict]) -> list:
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
tasks = []
for d in data:
tasks.extend([get(c, chunk, d['rest_id']) for chunk in d['chunks']])
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Downloading audio')
return await asyncio.gather(*tasks)
chunks = asyncio.run(process(data))
streams = {}
[streams.setdefault(_id, []).append(chunk) for _id, chunk in chunks]
# ensure chunks are in correct order
for k, v in streams.items():
streams[k] = sorted(v, key=lambda x: int(re.findall(r'_(\d+)_\w\.aac$', x.url.path)[0]))
out = self.out / 'audio'
out.mkdir(parents=True, exist_ok=True)
for space_id, chunks in streams.items():
# 1hr ~= 50mb
with open(out / f'{space_id}.aac', 'wb') as fp:
[fp.write(c.content) for c in chunks]
def _check_streams(self, keys: list[dict]) -> list[dict]:
async def get(c: AsyncClient, space: dict) -> dict:
media_key = space['data']['audioSpace']['metadata']['media_key']
stream = await self._get_stream(c, media_key)
return {'space': space, 'stream': stream}
async def process():
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
return await asyncio.gather(*(get(c, key) for key in keys))
return asyncio.run(process())
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | list | dict], **kwargs):
keys, qid, name = operation
# stay within rate-limits
if (l := len(queries)) > MAX_ENDPOINT_LIMIT:
if self.debug:
self.logger.warning(f'Got {l} queries, truncating to first {MAX_ENDPOINT_LIMIT}.')
queries = list(queries)[:MAX_ENDPOINT_LIMIT]
if all(isinstance(q, dict) for q in queries):
data = asyncio.run(self._process(operation, list(queries), **kwargs))
return get_json(data, **kwargs)
# queries are of type set | list[int|str], need to convert to list[dict]
_queries = [{k: q} for q in queries for k, v in keys.items()]
res = asyncio.run(self._process(operation, _queries, **kwargs))
data = get_json(res, **kwargs)
return data.pop() if kwargs.get('cursor') else flatten(data)
async def _query(self, client: AsyncClient, operation: tuple, **kwargs) -> Response:
keys, qid, name = operation
params = {
'variables': Operation.default_variables | keys | kwargs,
'features': Operation.default_features,
}
r = await client.get(f'https://twitter.com/i/api/graphql/{qid}/{name}', params=build_params(params))
try:
self.rate_limits[name] = {k: int(v) for k, v in r.headers.items() if 'rate-limit' in k}
except Exception as e:
self.logger.debug(f'{e}')
if self.debug:
log(self.logger, self.debug, r)
if self.save:
await save_json(r, self.out, name, **kwargs)
return r
async def _process(self, operation: tuple, queries: list[dict], **kwargs):
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])
return await asyncio.gather(*tasks)
async def _paginate(self, client: AsyncClient, operation: tuple, **kwargs):
limit = kwargs.pop('limit', math.inf)
cursor = kwargs.pop('cursor', None)
is_resuming = False
dups = 0
DUP_LIMIT = 3
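# stop paginating once DUP_LIMIT consecutive pages add no new rest_ids (guards against cursors that never terminate)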
if cursor:
is_resuming = True
res = []
ids = set()
else:
try:
r = await self._query(client, operation, **kwargs)
initial_data = r.json()
res = [r]
ids = {x for x in find_key(initial_data, 'rest_id') if x[0].isnumeric()}
cursor = get_cursor(initial_data)
except Exception as e:
if self.debug:
self.logger.error(f'Failed to get initial pagination data: {e}')
return
while (dups < DUP_LIMIT) and cursor:
prev_len = len(ids)
if prev_len >= limit:
break
try:
r = await self._query(client, operation, cursor=cursor, **kwargs)
data = r.json()
except Exception as e:
if self.debug:
self.logger.error(f'Failed to get pagination data\n{e}')
return
cursor = get_cursor(data)
ids |= {x for x in find_key(data, 'rest_id') if x[0].isnumeric()}
if self.debug:
self.logger.debug(f'Unique results: {len(ids)}\tcursor: {cursor}')
if prev_len == len(ids):
dups += 1
res.append(r)
if is_resuming:
return res, cursor
return res
async def _space_listener(self, chat: dict, frequency: int):
rand_color = lambda: random.choice([RED, GREEN, RESET, BLUE, CYAN, MAGENTA, YELLOW])
uri = f"wss://{URL(chat['endpoint']).host}/chatapi/v1/chatnow"
with open('chatlog.jsonl', 'ab') as fp:
async with websockets.connect(uri) as ws:
await ws.send(orjson.dumps({
"payload": orjson.dumps({"access_token": chat['access_token']}).decode(),
"kind": 3
}).decode())
await ws.send(orjson.dumps({
"payload": orjson.dumps({
"body": orjson.dumps({
"room": chat['room_id']
}).decode(),
"kind": 1
}).decode(),
"kind": 2
}).decode())
prev_message = ''
prev_user = ''
while True:
msg = await ws.recv()
temp = orjson.loads(msg)
kind = temp.get('kind')
if kind == 1:
signature = temp.get('signature')
payload = orjson.loads(temp.get('payload'))
payload['body'] = orjson.loads(payload.get('body'))
res = {
'kind': kind,
'payload': payload,
'signature': signature,
}
fp.write(orjson.dumps(res) + b'\n')
body = payload['body']
message = body.get('body')
user = body.get('username')
# user_id = body.get('user_id')
final = body.get('final')
if frequency == 1:
if final:
if user != prev_user:
print()
print(f"({rand_color()}{user}{RESET})")
prev_user = user
# print(message, end=' ')
print(message)
# dirty
if frequency == 2:
if user and (not final):
if user != prev_user:
print()
print(f"({rand_color()}{user}{RESET})")
prev_user = user
new_message = re.sub(f'^({re.escape(prev_message)})', '', message, flags=re.I).strip()
if len(new_message) < 100:
print(new_message, end=' ')
prev_message = message
async def _get_live_chats(self, client: Client, spaces: list[dict]):
async def get(c: AsyncClient, space: dict) -> list[dict]:
media_key = space['data']['audioSpace']['metadata']['media_key']
r = await c.get(
url=f'https://twitter.com/i/api/1.1/live_video_stream/status/{media_key}',
params={
'client': 'web',
'use_syndication_guest_id': 'false',
'cookie_set_host': 'twitter.com',
})
r = await c.post(
url='https://proxsee.pscp.tv/api/v2/accessChatPublic',
json={'chat_token': r.json()['chatToken']}
)
return r.json()
limits = Limits(max_connections=100)
async with AsyncClient(headers=client.headers, limits=limits, timeout=30) as c:
tasks = (get(c, _id) for _id in spaces)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Getting live transcripts')
return await asyncio.gather(*tasks)
def space_live_transcript(self, room: str, frequency: int = 1):
"""
Log live transcript of a space
@param room: room id
@param frequency: granularity of transcript. 1 for real-time, 2 for post-processed or "finalized" transcript
@return: None
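
Example (illustrative; the room id is a placeholder):
    scraper.space_live_transcript('1AbCdEfGhIjKl', frequency=2)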
"""
async def get(spaces: list[dict]):
client = init_session()
chats = await self._get_live_chats(client, spaces)
await asyncio.gather(*(self._space_listener(c, frequency) for c in chats))
spaces = self.spaces(rooms=[room])
asyncio.run(get(spaces))
def spaces_live(self, rooms: list[str]):
"""
Capture live audio stream from spaces
Limited to 500 rooms per IP, as defined by Twitter's rate limits.
@param rooms: list of room ids
@return: None
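
Example (illustrative; the room ids are placeholders):
    scraper.spaces_live(rooms=['1AbCdEfGhIjKl', '1MnOpQrStUvWx'])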
"""
chunk_idx = lambda chunk: re.findall(r'_(\d+)_\w\.aac', chunk)[0]
sort_chunks = lambda chunks: sorted(chunks, key=lambda x: int(chunk_idx(x)))
parse_chunks = lambda txt: re.findall('\n(chunk_.*)\n', txt, flags=re.I)
async def get_m3u8(client: AsyncClient, space: dict) -> dict:
try:
media_key = space['data']['audioSpace']['metadata']['media_key']
r = await client.get(
url=f'https://twitter.com/i/api/1.1/live_video_stream/status/{media_key}',
params={'client': 'web', 'use_syndication_guest_id': 'false', 'cookie_set_host': 'twitter.com'}
)
data = r.json()
room = data['shareUrl'].split('/')[-1]
return {"url": data['source']['location'], "room": room}
except Exception as e:
room = space['data']['audioSpace']['metadata']['rest_id']
if self.debug:
self.logger.error(f'Failed to get stream info for https://twitter.com/i/spaces/{room}\n{e}')
async def get_chunks(client: AsyncClient, url: str) -> list[str]:
try:
url = URL(url)
r = await client.get(
url=url,
params={'type': url.params.get('type')},
headers={'authority': url.host}
)
base = '/'.join(str(url).split('/')[:-1])
return [f'{base}/{c}' for c in parse_chunks(r.text)]
except Exception as e:
if self.debug:
self.logger.error(f'Failed to get chunks\n{e}')
async def poll_space(client: AsyncClient, space: dict) -> dict | None:
curr = 0
lim = 10
all_chunks = set()
playlist = await get_m3u8(client, space)
if not playlist: return
chunks = await get_chunks(client, playlist['url'])
if not chunks: return
out = self.out / 'live'
out.mkdir(parents=True, exist_ok=True)
async with aiofiles.open(out / f'{playlist["room"]}.aac', 'wb') as fp:
while curr < lim:
chunks = await get_chunks(client, playlist['url'])
if not chunks:
return {'space': space, 'chunks': sort_chunks(all_chunks)}
new_chunks = set(chunks) - all_chunks
all_chunks |= new_chunks
for c in sort_chunks(new_chunks):
try:
if self.debug:
self.logger.debug(f"write: chunk [{chunk_idx(c)}]\t{c}")
r = await client.get(c)
await fp.write(r.content)
except Exception as e:
if self.debug:
self.logger.error(f'Failed to write chunk {c}\n{e}')
curr = 0 if new_chunks else curr + 1
# wait for new chunks. dynamic playlist is updated every 2-3 seconds
await asyncio.sleep(random.random() + 1.5)
return {'space': space, 'chunks': sort_chunks(all_chunks)}
async def process(spaces: list[dict]):
limits = Limits(max_connections=100)
headers, cookies = self.session.headers, self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
return await asyncio.gather(*(poll_space(c, space) for space in spaces))
spaces = self.spaces(rooms=rooms)
return asyncio.run(process(spaces))
def _init_logger(self, **kwargs) -> Logger:
if kwargs.get('debug'):
cfg = kwargs.get('log_config')
logging.config.dictConfig(cfg or LOG_CONFIG)
# only support one logger
logger_name = list(LOG_CONFIG['loggers'].keys())[0]
# set level of all other loggers to ERROR
for name in logging.root.manager.loggerDict:
if name != logger_name:
logging.getLogger(name).setLevel(logging.ERROR)
return logging.getLogger(logger_name)
def _validate_session(self, *args, **kwargs):
email, username, password, session = args
# validate credentials
if all((email, username, password)):
return login(email, username, password, **kwargs)
# invalid credentials, try validating session
if session and all(session.cookies.get(c) for c in {'ct0', 'auth_token'}):
return session
# invalid credentials and session
cookies = kwargs.get('cookies')
# try validating cookies dict
if isinstance(cookies, dict) and all(cookies.get(c) for c in {'ct0', 'auth_token'}):
_session = Client(cookies=cookies, follow_redirects=True)
_session.headers.update(get_headers(_session))
return _session
# try validating cookies from file
if isinstance(cookies, str):
_session = Client(cookies=orjson.loads(Path(cookies).read_bytes()), follow_redirects=True)
_session.headers.update(get_headers(_session))
return _session
# no session, credentials, or cookies provided. use guest session.
if self.debug:
self.logger.warning(f'{RED}This is a guest session, some endpoints cannot be accessed.{RESET}\n')
self.guest = True
return session
@property
def id(self) -> int:
""" Get User ID """
return int(re.findall(r'"u=(\d+)"', self.session.cookies.get('twid'))[0])
def save_cookies(self, fname: str = None):
""" Save cookies to file """
cookies = self.session.cookies
Path(f'{fname or cookies.get("username")}.cookies').write_bytes(orjson.dumps(dict(cookies)))
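# Usage sketch (illustrative; the filename is hypothetical): persist a logged-in session,
# then restore it later without re-authenticating:
#     scraper.save_cookies('session')               # writes session.cookies
#     scraper = Scraper(cookies='session.cookies')  # reuse the saved cookies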
def _v1_rate_limits(self):
return self.session.get('https://api.twitter.com/1.1/application/rate_limit_status.json').json()