Switch from ujson to orjson; add trends()

This commit is contained in:
Trevor Hobenshield
2023-04-01 12:21:27 -07:00
parent ae73ff46a7
commit 9bb9645b32
5 changed files with 57 additions and 23 deletions

View File

@@ -4,7 +4,7 @@ from setuptools import find_packages, setup
install_requires = [
"tqdm",
"ujson",
"orjson",
"nest_asyncio",
"aiohttp",
"requests",
@@ -13,7 +13,7 @@ install_requires = [
setup(
name="twitter-api-client",
version="0.5.4",
version="0.5.5",
python_requires=">=3.9.7",
description="Twitter API",
long_description=dedent('''

View File

@@ -13,7 +13,7 @@ from pathlib import Path
from urllib.parse import urlencode
from uuid import uuid1, getnode
import ujson
import orjson
from requests import Response
from tqdm import tqdm
@@ -22,7 +22,7 @@ from .config.operations import operations
from .config.settings import *
from .constants import *
from .login import login
from .utils import get_headers, build_query
from .utils import get_headers, build_query, find_key
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
@@ -98,6 +98,7 @@ class Account:
payload['variables'] |= variables
url = f"{self.GRAPHQL_URL}/{qid}/{name}"
r = self.session.post(url, headers=get_headers(self.session), json=payload)
self.check_response(r)
return r
def api(self, path: str, settings: dict) -> Response:
@@ -105,6 +106,7 @@ class Account:
headers['content-type'] = 'application/x-www-form-urlencoded'
url = f'{self.V1_URL}/{path}'
r = self.session.post(url, headers=headers, data=urlencode(settings))
self.check_response(r)
return r
@log(info=['json'])
@@ -200,7 +202,7 @@ class Account:
headers = get_headers(self.session)
headers['content-type'] = 'application/x-www-form-urlencoded'
url = 'https://caps.twitter.com/v2/cards/create.json'
r = self.session.post(url, headers=headers, params={'card_data': ujson.dumps(options)})
r = self.session.post(url, headers=headers, params={'card_data': orjson.dumps(options).decode()})
card_uri = r.json()['card_uri']
r = self.tweet(text, poll_params={'card_uri': card_uri})
return r
@@ -547,3 +549,10 @@ class Account:
url = 'https://twitter.com/i/api/account/self.sessions/revoke_all'
r = self.session.post(url, headers=headers)
return r
@staticmethod
def check_response(r):
    """Validate an API response, raising on rate limiting.

    Args:
        r: a ``requests.Response`` from a Twitter API call.

    Raises:
        Exception: when the response status is 429 (rate limit hit).
    """
    # Hard-fail immediately when Twitter signals rate limiting.
    if r.status_code == 429:
        raise Exception(f'rate limit exceeded: {r.url}')
    # The API frequently reports failures inside the JSON body (an
    # 'errors' key) rather than via the HTTP status code, so inspect
    # the payload and log anything found at debug level.
    data = r.json()
    if find_key(data, 'errors'):
        logger.debug(f'[{WARN}ERROR{RESET}]: {data}')

View File

@@ -8,15 +8,16 @@ from copy import deepcopy
from pathlib import Path
from urllib.parse import urlsplit
import ujson
import orjson
from aiohttp import ClientSession, TCPConnector
from tqdm import tqdm
from .config.log import log_config
from .config.operations import operations
from .config.settings import trending_params
from .constants import *
from .login import login
from .utils import find_key, build_query, get_headers
from .utils import find_key, build_query, get_headers, set_qs
try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
@@ -215,10 +216,15 @@ class Scraper:
try:
r = await fn()
data = await r.json()
if r.status == 429:
logger.debug(f'rate limit exceeded: {r.url}')
return r, {}
if find_key(data, 'errors'):
logger.debug(f'[{WARN}ERROR{RESET}]: {data}')
return r, data
except Exception as e:
if i == retries:
logger.debug(f'{WARN}Max retries exceeded{RESET}\n{e}')
logger.debug(f'{WARN} Max retries exceeded{RESET}\n{e}')
return
t = 2 ** i + random.random()
logger.debug(f'retrying in {f"{t:.2f}"} seconds\t\t{e}')
@@ -229,8 +235,9 @@ class Scraper:
for d in data:
path = Path(f'data/raw/{d[ID]}')
path.mkdir(parents=True, exist_ok=True)
with open(path / f'{time.time_ns()}_{name}.json', 'w') as fp:
ujson.dump(d, fp, indent=4)
(path / f'{time.time_ns()}_{name}.json').write_text(
orjson.dumps(d, option=orjson.OPT_INDENT_2).decode())
except KeyError as e:
logger.debug(f'failed to save data: {e}')
@@ -248,11 +255,6 @@ class Scraper:
name = urlsplit(post_url).path.replace('/', '_')[1:]
ext = urlsplit(cdn_url).path.split('/')[-1]
try:
# with open(f'{path}/{name}_{ext}', 'wb') as fp:
# r = self.session.get(cdn_url, stream=True)
# for chunk in r.iter_content(chunk_size=chunk_size):
# fp.write(chunk)
r = self.session.get(cdn_url, stream=True)
total_bytes = int(r.headers.get('Content-Length', 0))
desc = f'downloading: {name}'
@@ -282,3 +284,25 @@ class Scraper:
# logger.debug(f'{hq_videos = }')
if hq_videos:
[self.download(url, video) for video in hq_videos]
def trends(self) -> dict:
    """Get trending topics across every UTC offset (-12:00 .. +14:00).

    Queries the guide endpoint once per offset, merges all results into
    a single mapping keyed by trend name (duplicates across offsets are
    collapsed, later responses winning), writes the merged payload to
    ``data/raw/trends/<timestamp>.json``, and returns it.

    Returns:
        dict mapping trend name -> raw trend item payload.
    """
    url = set_qs('https://twitter.com/i/api/2/guide.json', trending_params)
    headers = get_headers(self.session)
    # Offsets look like "+0500" / "-0500"; zfill widths differ so the
    # minus sign of negative hours is counted ("-5" -> "-05" -> "-0500").
    offsets = []
    for hour in range(-12, 15):
        if hour < 0:
            offsets.append(f"{str(hour).zfill(3)}00")
        else:
            offsets.append(f"+{str(hour).zfill(2)}00")
    payloads = []
    for offset in offsets:
        headers['x-twitter-utcoffset'] = offset
        resp = self.session.get(url, headers=headers)
        payloads.append(resp.json())
        logger.debug(f'getting trends for: {offset = }')
    # Collapse every response into one name-keyed dict.
    all_trends = {}
    for payload in payloads:
        for item in find_key(payload, 'item'):
            all_trends[item['content']['trend']['name']] = item
    out_dir = Path(f'data/raw/trends')
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / f'{time.time_ns()}.json').write_text(
        orjson.dumps(all_trends, option=orjson.OPT_INDENT_2).decode())
    return all_trends

View File

@@ -1,6 +1,6 @@
import asyncio
import atexit
import json
import orjson
import logging.config
import platform
import random
@@ -64,7 +64,7 @@ async def paginate(query: str, session: aiohttp.ClientSession, config: dict, out
config['cursor'] = next_cursor
data, next_cursor = await backoff(lambda: get(session, api, config), query)
data['query'] = query
(out / f'raw/{time.time_ns()}.json').write_text(json.dumps(data, indent=4))
(out / f'raw/{time.time_ns()}.json').write_text(orjson.dumps(data, option=orjson.OPT_INDENT_2).decode())
all_data.append(data)
return all_data
@@ -136,10 +136,10 @@ def make_output_dirs(path: str) -> Path:
@atexit.register
def combine_results(in_path: Path = IN_PATH, out_path: Path = OUT_PATH):
try:
out_path.write_text(json.dumps({
out_path.write_text(orjson.dumps({
k: v
for p in in_path.iterdir() if p.suffix == '.json'
for k, v in json.loads(p.read_text())['globalObjects']['tweets'].items()
}, indent=2))
for k, v in orjson.loads(p.read_text())['globalObjects']['tweets'].items()
}, option=orjson.OPT_INDENT_2).decode())
except Exception as e:
logger.debug(f'FAILED to combine search results, {e}')

View File

@@ -1,11 +1,12 @@
from urllib.parse import urlsplit, urlencode, urlunsplit, parse_qs, quote
import ujson
import orjson
def set_qs(url: str, qs: dict, update=False, **kwargs) -> str:
*_, q, f = urlsplit(url)
return urlunsplit((*_, urlencode(qs | parse_qs(q) if update else qs, doseq=True, quote_via=quote, safe=kwargs.get('safe','')), f))
return urlunsplit((*_, urlencode(qs | parse_qs(q) if update else qs, doseq=True, quote_via=quote,
safe=kwargs.get('safe', '')), f))
def find_key(obj: any, key: str) -> list:
@@ -64,4 +65,4 @@ def get_headers(session) -> dict:
def build_query(params):
return '&'.join(f'{k}={ujson.dumps(v)}' for k, v in params.items())
return '&'.join(f'{k}={orjson.dumps(v).decode()}' for k, v in params.items())