Add `retries` param to Search; add search results parsing example

This commit is contained in:
Trevor Hobenshield
2023-05-18 21:23:58 -07:00
parent 28cd391b06
commit 2eac8a2f43
4 changed files with 356 additions and 5 deletions

344
examples/search.ipynb Normal file
View File

@@ -0,0 +1,344 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d815a387",
"metadata": {},
"outputs": [],
"source": [
"# !pip uninstall twitter-api-client -y\n",
"# !pip install twitter-api-client --no-cache-dir"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1ecf8cb",
"metadata": {},
"outputs": [],
"source": [
"from twitter.search import Search\n",
"import pandas as pd\n",
"\n",
"email, username, password = ..., ..., ...\n",
"search = Search(email, username, password)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "98c65601",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-05-18 21:20:12,075.075 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:12,656.656 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:13,452.452 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:13,899.899 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:14,539.539 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:14,938.938 DEBUG: [\u001B[32msuccess\u001B[0m] returned 101 search results for \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>created_at</th>\n",
" <th>id</th>\n",
" <th>user_id</th>\n",
" <th>full_text</th>\n",
" <th>lang</th>\n",
" <th>user_url</th>\n",
" <th>tweet_url</th>\n",
" <th>geo</th>\n",
" <th>coordinates</th>\n",
" <th>place</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-05-19 04:07:19+00:00</td>\n",
" <td>1659410380026773509</td>\n",
" <td>809177430602776576</td>\n",
" <td>@msdarlin_ JENNIFER HUDSON first considered al...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/809177430602776576</td>\n",
" <td>https://twitter.com/i/status/1659410380026773509</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-19 03:50:16+00:00</td>\n",
" <td>1659406088578428929</td>\n",
" <td>21226048</td>\n",
" <td>Jennifer Hudson - Believe https://t.co/vjqlw52MjO</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/21226048</td>\n",
" <td>https://twitter.com/i/status/1659406088578428929</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-05-19 03:03:12+00:00</td>\n",
" <td>1659394245835255808</td>\n",
" <td>174826024</td>\n",
" <td>If Fantasia and Jennifer Hudson do this verzuz...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/174826024</td>\n",
" <td>https://twitter.com/i/status/1659394245835255808</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-05-19 02:44:21+00:00</td>\n",
" <td>1659389499221188609</td>\n",
" <td>1143382733001039873</td>\n",
" <td>jennifer hudson acabou de postar uma foto e no...</td>\n",
" <td>pt</td>\n",
" <td>https://twitter.com/i/user/1143382733001039873</td>\n",
" <td>https://twitter.com/i/status/1659389499221188609</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-05-19 02:41:35+00:00</td>\n",
" <td>1659388805118578689</td>\n",
" <td>1342931884150464512</td>\n",
" <td>Jennifer Hudson</td>\n",
" <td>cy</td>\n",
" <td>https://twitter.com/i/user/1342931884150464512</td>\n",
" <td>https://twitter.com/i/status/1659388805118578689</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>2023-05-17 20:57:29+00:00</td>\n",
" <td>1658939820574400516</td>\n",
" <td>534285941</td>\n",
" <td>I cant 🤣🤣 https://t.co/2tiIyHrMb7</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/534285941</td>\n",
" <td>https://twitter.com/i/status/1658939820574400516</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>2023-05-17 19:46:21+00:00</td>\n",
" <td>1658921918890758148</td>\n",
" <td>417935020</td>\n",
" <td>Idk ask her https://t.co/md7BJf59C2</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/417935020</td>\n",
" <td>https://twitter.com/i/status/1658921918890758148</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>2023-05-17 19:09:50+00:00</td>\n",
" <td>1658912730991009795</td>\n",
" <td>2384861195</td>\n",
" <td>My best hip hop female Dj @ChainzMsDj Dancing ...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/2384861195</td>\n",
" <td>https://twitter.com/i/status/1658912730991009795</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>2023-05-17 16:56:52+00:00</td>\n",
" <td>1658879269232320514</td>\n",
" <td>15733529</td>\n",
" <td>Kelly will sing with D. Smooth\\n\\nThe Complete...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/15733529</td>\n",
" <td>https://twitter.com/i/status/1658879269232320514</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>2023-05-17 14:20:31+00:00</td>\n",
" <td>1658839919278653444</td>\n",
" <td>17230018</td>\n",
" <td>my dream collab? gimme _____ and ______.</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/17230018</td>\n",
" <td>https://twitter.com/i/status/1658839919278653444</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>101 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" created_at id user_id \n",
"0 2023-05-19 04:07:19+00:00 1659410380026773509 809177430602776576 \\\n",
"1 2023-05-19 03:50:16+00:00 1659406088578428929 21226048 \n",
"2 2023-05-19 03:03:12+00:00 1659394245835255808 174826024 \n",
"3 2023-05-19 02:44:21+00:00 1659389499221188609 1143382733001039873 \n",
"4 2023-05-19 02:41:35+00:00 1659388805118578689 1342931884150464512 \n",
".. ... ... ... \n",
"96 2023-05-17 20:57:29+00:00 1658939820574400516 534285941 \n",
"97 2023-05-17 19:46:21+00:00 1658921918890758148 417935020 \n",
"98 2023-05-17 19:09:50+00:00 1658912730991009795 2384861195 \n",
"99 2023-05-17 16:56:52+00:00 1658879269232320514 15733529 \n",
"100 2023-05-17 14:20:31+00:00 1658839919278653444 17230018 \n",
"\n",
" full_text lang \n",
"0 @msdarlin_ JENNIFER HUDSON first considered al... en \\\n",
"1 Jennifer Hudson - Believe https://t.co/vjqlw52MjO en \n",
"2 If Fantasia and Jennifer Hudson do this verzuz... en \n",
"3 jennifer hudson acabou de postar uma foto e no... pt \n",
"4 Jennifer Hudson cy \n",
".. ... ... \n",
"96 I cant 🤣🤣 https://t.co/2tiIyHrMb7 en \n",
"97 Idk ask her https://t.co/md7BJf59C2 en \n",
"98 My best hip hop female Dj @ChainzMsDj Dancing ... en \n",
"99 Kelly will sing with D. Smooth\\n\\nThe Complete... en \n",
"100 my dream collab? gimme _____ and ______. en \n",
"\n",
" user_url \n",
"0 https://twitter.com/i/user/809177430602776576 \\\n",
"1 https://twitter.com/i/user/21226048 \n",
"2 https://twitter.com/i/user/174826024 \n",
"3 https://twitter.com/i/user/1143382733001039873 \n",
"4 https://twitter.com/i/user/1342931884150464512 \n",
".. ... \n",
"96 https://twitter.com/i/user/534285941 \n",
"97 https://twitter.com/i/user/417935020 \n",
"98 https://twitter.com/i/user/2384861195 \n",
"99 https://twitter.com/i/user/15733529 \n",
"100 https://twitter.com/i/user/17230018 \n",
"\n",
" tweet_url geo coordinates place \n",
"0 https://twitter.com/i/status/1659410380026773509 None None None \n",
"1 https://twitter.com/i/status/1659406088578428929 None None None \n",
"2 https://twitter.com/i/status/1659394245835255808 None None None \n",
"3 https://twitter.com/i/status/1659389499221188609 None None None \n",
"4 https://twitter.com/i/status/1659388805118578689 None None None \n",
".. ... ... ... ... \n",
"96 https://twitter.com/i/status/1658939820574400516 None None None \n",
"97 https://twitter.com/i/status/1658921918890758148 None None None \n",
"98 https://twitter.com/i/status/1658912730991009795 None None None \n",
"99 https://twitter.com/i/status/1658879269232320514 None None None \n",
"100 https://twitter.com/i/status/1658839919278653444 None None None \n",
"\n",
"[101 rows x 10 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latest_results = search.run(\n",
" 'jennifer hudson since:2023-05-18',\n",
" limit=100,\n",
" latest=True, # get latest tweets only\n",
" retries=3,\n",
")\n",
"\n",
"flat_results = [y for x in latest_results for y in x]\n",
"data = [r.get('globalObjects', {}).get('tweets', {})for r in flat_results]\n",
"\n",
"base= 'https://twitter.com/i'\n",
"\n",
"df = (\n",
" pd.DataFrame({k:v for d in data for k,v in d.items()})\n",
" .T\n",
" .assign(created_at = lambda x: pd.to_datetime(x['created_at'], format='%a %b %d %H:%M:%S %z %Y'))\n",
" .assign(user_url = lambda x: f\"{base}/user/\"+x['user_id_str'])\n",
" .assign(tweet_url = lambda x: f\"{base}/status/\"+x['id_str'] )\n",
" .sort_values('created_at',ascending=False)\n",
" .drop_duplicates('id')\n",
" .reset_index(drop=True)\n",
")\n",
"\n",
"# sample df with a few cols of interest\n",
"sample = df[['created_at','id','user_id','full_text', 'lang',\n",
" 'user_url', 'tweet_url', 'geo', 'coordinates', 'place']]\n",
"\n",
"sample"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -235,6 +235,7 @@ latest_results = search.run(
'ios android',
limit=100,
latest=True, # get latest tweets only
retries=3,
)
general_results = search.run(
@@ -246,6 +247,7 @@ general_results = search.run(
'cheese bread butter',
'ios android',
limit=100,
retries=11,
)
```

View File

@@ -13,7 +13,7 @@ install_requires = [
setup(
name="twitter-api-client",
version="0.7.9",
version="0.8.0",
python_requires=">=3.10.10",
description="Twitter API",
long_description=dedent('''
@@ -237,6 +237,7 @@ setup(
'ios android',
limit=100,
latest=True, # get latest tweets only
retries=3,
)
general_results = search.run(
@@ -248,6 +249,7 @@ setup(
'cheese bread butter',
'ios android',
limit=100,
retries=11,
)
```
'''),

View File

@@ -53,7 +53,7 @@ class Search:
async def paginate(self, query: str, session: AsyncClient, config: dict, out: Path, **kwargs) -> list[
dict]:
config['q'] = query
r, data, next_cursor = await self.backoff(lambda: self.get(session, config), query)
r, data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
all_data = [data]
c = colors.pop() if colors else ''
ids = set()
@@ -65,7 +65,9 @@ class Search:
logger.debug(f'{c}{query}{reset}')
config['cursor'] = next_cursor
r, data, next_cursor = await self.backoff(lambda: self.get(session, config), query)
r, data, next_cursor = await self.backoff(lambda: self.get(session, config), query, **kwargs)
if r is None:
return all_data
data['query'] = query
(out / f'raw/{time.time_ns()}.json').write_text(
orjson.dumps(data, option=orjson.OPT_INDENT_2).decode(),
@@ -74,7 +76,8 @@ class Search:
all_data.append(data)
return all_data
async def backoff(self, fn, info, retries=12):
async def backoff(self, fn, info, **kwargs):
retries = kwargs.get('retries', 3)
for i in range(retries + 1):
try:
r, data, next_cursor = await fn()
@@ -84,7 +87,7 @@ class Search:
except Exception as e:
if i == retries:
logger.debug(f'Max retries exceeded\n{e}')
return
return None, None, None
t = 2 ** i + random.random()
logger.debug(f'No data for: \u001b[1m{info}\u001b[0m | retrying in {f"{t:.2f}"} seconds\t\t{e}')
time.sleep(t)