switch from ujson to orjson, add trends()
setup.py
@@ -4,7 +4,7 @@ from setuptools import find_packages, setup
 
 install_requires = [
     "tqdm",
-    "ujson",
+    "orjson",
     "nest_asyncio",
     "aiohttp",
     "requests",
@@ -13,7 +13,7 @@ install_requires = [
 
 setup(
     name="twitter-api-client",
-    version="0.5.4",
+    version="0.5.5",
     python_requires=">=3.9.7",
     description="Twitter API",
     long_description=dedent('''
@@ -13,7 +13,7 @@ from pathlib import Path
 from urllib.parse import urlencode
 from uuid import uuid1, getnode
 
-import ujson
+import orjson
 from requests import Response
 from tqdm import tqdm
 
@@ -22,7 +22,7 @@ from .config.operations import operations
 from .config.settings import *
 from .constants import *
 from .login import login
-from .utils import get_headers, build_query
+from .utils import get_headers, build_query, find_key
 
 try:
     if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
@@ -98,6 +98,7 @@ class Account:
         payload['variables'] |= variables
         url = f"{self.GRAPHQL_URL}/{qid}/{name}"
         r = self.session.post(url, headers=get_headers(self.session), json=payload)
+        self.check_response(r)
         return r
 
     def api(self, path: str, settings: dict) -> Response:
@@ -105,6 +106,7 @@ class Account:
         headers['content-type'] = 'application/x-www-form-urlencoded'
         url = f'{self.V1_URL}/{path}'
         r = self.session.post(url, headers=headers, data=urlencode(settings))
+        self.check_response(r)
         return r
 
     @log(info=['json'])
@@ -200,7 +202,7 @@ class Account:
         headers = get_headers(self.session)
         headers['content-type'] = 'application/x-www-form-urlencoded'
         url = 'https://caps.twitter.com/v2/cards/create.json'
-        r = self.session.post(url, headers=headers, params={'card_data': ujson.dumps(options)})
+        r = self.session.post(url, headers=headers, params={'card_data': orjson.dumps(options).decode()})
        card_uri = r.json()['card_uri']
         r = self.tweet(text, poll_params={'card_uri': card_uri})
         return r
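
Note: unlike ujson.dumps, orjson.dumps returns bytes rather than str, which is why every call site that feeds the result into a query parameter or write_text gains an explicit .decode() in this commit. A minimal illustration:

    import orjson

    options = {'choice1': 'yes', 'choice2': 'no'}
    raw = orjson.dumps(options)   # bytes, e.g. b'{"choice1":"yes","choice2":"no"}'
    text = raw.decode()           # str, safe to pass as a query param value
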
@@ -547,3 +549,10 @@ class Account:
         url = 'https://twitter.com/i/api/account/self.sessions/revoke_all'
         r = self.session.post(url, headers=headers)
         return r
+
+    @staticmethod
+    def check_response(r):
+        if r.status_code == 429:
+            raise Exception(f'rate limit exceeded: {r.url}')
+        if find_key(data := r.json(), 'errors'):
+            logger.debug(f'[{WARN}ERROR{RESET}]: {data}')
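
check_response leans on find_key (now imported from .utils) to spot an 'errors' key anywhere in the nested JSON response. The package ships its own implementation; a minimal sketch of the idea, for reference only, not the package's actual code:

    def find_key(obj, key):
        """Collect every value stored under `key`, at any depth.
        Sketch only; the authoritative version lives in .utils."""
        found = []
        if isinstance(obj, dict):
            for k, v in obj.items():
                if k == key:
                    found.append(v)
                found.extend(find_key(v, key))
        elif isinstance(obj, list):
            for item in obj:
                found.extend(find_key(item, key))
        return found
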
@@ -8,15 +8,16 @@ from copy import deepcopy
 from pathlib import Path
 from urllib.parse import urlsplit
 
-import ujson
+import orjson
 from aiohttp import ClientSession, TCPConnector
 from tqdm import tqdm
 
 from .config.log import log_config
 from .config.operations import operations
+from .config.settings import trending_params
 from .constants import *
 from .login import login
-from .utils import find_key, build_query, get_headers
+from .utils import find_key, build_query, get_headers, set_qs
 
 try:
     if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
@@ -215,10 +216,15 @@ class Scraper:
             try:
                 r = await fn()
                 data = await r.json()
+                if r.status == 429:
+                    logger.debug(f'rate limit exceeded: {r.url}')
+                    return r, {}
+                if find_key(data, 'errors'):
+                    logger.debug(f'[{WARN}ERROR{RESET}]: {data}')
                 return r, data
             except Exception as e:
                 if i == retries:
-                    logger.debug(f'{WARN}Max retries exceeded{RESET}\n{e}')
+                    logger.debug(f'{WARN} Max retries exceeded{RESET}\n{e}')
                     return
                 t = 2 ** i + random.random()
                 logger.debug(f'retrying in {f"{t:.2f}"} seconds\t\t{e}')
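
The retry delay t = 2 ** i + random.random() is exponential backoff with jitter: the base wait doubles on each attempt, and the random fraction keeps concurrent workers from retrying in lockstep. A quick check of the delay schedule:

    import random

    for i in range(5):
        t = 2 ** i + random.random()
        # i=0 -> ~1-2s, i=1 -> ~2-3s, i=2 -> ~4-5s, i=3 -> ~8-9s, i=4 -> ~16-17s
        print(f'attempt {i}: sleep {t:.2f}s')
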
@@ -229,8 +235,9 @@ class Scraper:
             for d in data:
                 path = Path(f'data/raw/{d[ID]}')
                 path.mkdir(parents=True, exist_ok=True)
-                with open(path / f'{time.time_ns()}_{name}.json', 'w') as fp:
-                    ujson.dump(d, fp, indent=4)
+                (path / f'{time.time_ns()}_{name}.json').write_text(
+                    orjson.dumps(d, option=orjson.OPT_INDENT_2).decode())
+
         except KeyError as e:
             logger.debug(f'failed to save data: {e}')
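
orjson exposes only dumps/loads; there is no ujson-style dump(obj, fp) that writes straight to a file object, which is why the open()/dump() pair above becomes a write_text of the decoded bytes. orjson also supports only 2-space indentation (orjson.OPT_INDENT_2), so the former indent=4 output is now 2-space indented. The equivalent pattern in isolation:

    import orjson

    d = {'a': 1}
    # old: ujson.dump(d, fp, indent=4) wrote str directly to a file object
    # new: serialize to bytes, decode, then write the text yourself
    text = orjson.dumps(d, option=orjson.OPT_INDENT_2).decode()
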
@@ -248,11 +255,6 @@ class Scraper:
         name = urlsplit(post_url).path.replace('/', '_')[1:]
         ext = urlsplit(cdn_url).path.split('/')[-1]
         try:
-            # with open(f'{path}/{name}_{ext}', 'wb') as fp:
-            #     r = self.session.get(cdn_url, stream=True)
-            #     for chunk in r.iter_content(chunk_size=chunk_size):
-            #         fp.write(chunk)
-
             r = self.session.get(cdn_url, stream=True)
             total_bytes = int(r.headers.get('Content-Length', 0))
             desc = f'downloading: {name}'
@@ -282,3 +284,25 @@ class Scraper:
         # logger.debug(f'{hq_videos = }')
         if hq_videos:
             [self.download(url, video) for video in hq_videos]
+
+    def trends(self) -> dict:
+        """Get trends for all UTC offsets"""
+        url = set_qs('https://twitter.com/i/api/2/guide.json', trending_params)
+        headers = get_headers(self.session)
+        offsets = [f"{str(i).zfill(3)}00" if i < 0 else f"+{str(i).zfill(2)}00" for i in range(-12, 15)]
+        res = []
+        for offset in offsets:
+            headers['x-twitter-utcoffset'] = offset
+            r = self.session.get(url, headers=headers)
+            res.append(r.json())
+            logger.debug(f'getting trends for: {offset = }')
+        all_trends = {}
+        for data in res:
+            trends = find_key(data, 'item')
+            for t in trends:
+                all_trends |= {t['content']['trend']['name']: t}
+        path = Path(f'data/raw/trends')
+        path.mkdir(parents=True, exist_ok=True)
+        (path / f'{time.time_ns()}.json').write_text(
+            orjson.dumps(all_trends, option=orjson.OPT_INDENT_2).decode())
+        return all_trends
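
The offsets comprehension builds every whole-hour UTC offset from -1200 to +1400 in the signed ±HHMM form the x-twitter-utcoffset header expects; zfill pads the hour and str() keeps the minus sign for negatives. Results from all offsets are then merged into one dict keyed by trend name, deduplicating trends that appear in several timezones. A quick check of the format:

    offsets = [f"{str(i).zfill(3)}00" if i < 0 else f"+{str(i).zfill(2)}00" for i in range(-12, 15)]
    print(offsets[0], offsets[12], offsets[-1])  # -1200 +0000 +1400
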
@@ -1,6 +1,6 @@
 import asyncio
 import atexit
-import json
+import orjson
 import logging.config
 import platform
 import random
@@ -64,7 +64,7 @@ async def paginate(query: str, session: aiohttp.ClientSession, config: dict, out
         config['cursor'] = next_cursor
         data, next_cursor = await backoff(lambda: get(session, api, config), query)
         data['query'] = query
-        (out / f'raw/{time.time_ns()}.json').write_text(json.dumps(data, indent=4))
+        (out / f'raw/{time.time_ns()}.json').write_text(orjson.dumps(data, option=orjson.OPT_INDENT_2).decode())
         all_data.append(data)
     return all_data
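
paginate follows the usual cursor pattern: each response carries the cursor for the next page, and the loop re-issues the request with config['cursor'] updated until no cursor comes back. A generic sketch of that loop shape (the names here are illustrative, not the package's API):

    async def paginate_sketch(fetch, config: dict) -> list:
        # fetch(config) stands in for the real request and returns
        # (data, next_cursor), mirroring what backoff()/get() return above
        all_data = []
        cursor = None
        while True:
            if cursor:
                config['cursor'] = cursor
            data, cursor = await fetch(config)
            all_data.append(data)
            if not cursor:
                break
        return all_data
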
@@ -136,10 +136,10 @@ def make_output_dirs(path: str) -> Path:
 @atexit.register
 def combine_results(in_path: Path = IN_PATH, out_path: Path = OUT_PATH):
     try:
-        out_path.write_text(json.dumps({
+        out_path.write_text(orjson.dumps({
             k: v
             for p in in_path.iterdir() if p.suffix == '.json'
-            for k, v in json.loads(p.read_text())['globalObjects']['tweets'].items()
-        }, indent=2))
+            for k, v in orjson.loads(p.read_text())['globalObjects']['tweets'].items()
+        }, option=orjson.OPT_INDENT_2).decode())
     except Exception as e:
         logger.debug(f'FAILED to combine search results, {e}')
@@ -1,11 +1,12 @@
 from urllib.parse import urlsplit, urlencode, urlunsplit, parse_qs, quote
 
-import ujson
+import orjson
 
 
 def set_qs(url: str, qs: dict, update=False, **kwargs) -> str:
     *_, q, f = urlsplit(url)
-    return urlunsplit((*_, urlencode(qs | parse_qs(q) if update else qs, doseq=True, quote_via=quote, safe=kwargs.get('safe','')), f))
+    return urlunsplit((*_, urlencode(qs | parse_qs(q) if update else qs, doseq=True, quote_via=quote,
+                                     safe=kwargs.get('safe', '')), f))
 
 
 def find_key(obj: any, key: str) -> list:
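
set_qs sets a URL's query string (or merges it with the existing one when update=True), which is how trends() above attaches trending_params to guide.json. A small usage example; the parameter keys here are illustrative, not the real trending_params:

    # set_qs as defined above (import path within the package may differ)
    url = set_qs('https://twitter.com/i/api/2/guide.json', {'count': 20, 'tab': 'trends'})
    # -> 'https://twitter.com/i/api/2/guide.json?count=20&tab=trends'
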
@@ -64,4 +65,4 @@ def get_headers(session) -> dict:
 
 
 def build_query(params):
-    return '&'.join(f'{k}={ujson.dumps(v)}' for k, v in params.items())
+    return '&'.join(f'{k}={orjson.dumps(v).decode()}' for k, v in params.items())
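
Here each value is JSON-encoded before interpolation, so booleans and numbers serialize the way the API expects; with orjson the bytes result just needs a .decode() first, and the output is otherwise identical to the ujson version. For example:

    build_query({'count': 20, 'withVoice': True})
    # -> 'count=20&withVoice=true'
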