update examples

This commit is contained in:
Trevor Hobenshield
2023-04-12 18:52:21 -07:00
parent f493c0f8c2
commit 6f27791620
2 changed files with 196 additions and 67 deletions

View File

@@ -1,5 +1,13 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "802043b7",
"metadata": {},
"source": [
"### polars/pandas examples"
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -24,28 +32,157 @@
"import orjson\n",
"from pathlib import Path\n",
"from twitter.utils import find_key\n",
"import pandas as pd"
"import pandas as pd\n",
"import polars as pl"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4815e47f",
"id": "4703bee3",
"metadata": {},
"outputs": [],
"source": [
"def to_int(tdf: pl.LazyFrame, *args) -> pl.LazyFrame:\n",
"    \"\"\"Cast every column named in ``args`` to ``pl.Int64``, keeping the column names.\n",
"\n",
"    The cast is non-strict (``strict=False``): per the polars docs, values that\n",
"    cannot be converted become null instead of raising.\n",
"    \"\"\"\n",
"    casts = [pl.col(name).cast(pl.Int64, strict=False).alias(name) for name in args]\n",
"    return tdf.with_columns(casts)\n",
"\n",
"\n",
"def to_dt(tdf: pl.LazyFrame, fmt: str, *args) -> pl.LazyFrame:\n",
"    \"\"\"Parse every string column named in ``args`` into ``pl.Datetime`` using ``fmt``.\n",
"\n",
"    Each parsed column replaces the original column of the same name.\n",
"    \"\"\"\n",
"    parsed = [pl.col(name).str.strptime(pl.Datetime, fmt).alias(name) for name in args]\n",
"    return tdf.with_columns(parsed)\n",
"\n",
"\n",
"def get_data(path: Path, expr: str = '', **kwargs) -> dict:\n",
"    \"\"\"Load every JSON file under ``path`` whose file name matches ``expr``.\n",
"\n",
"    Args:\n",
"        path: directory that is searched recursively.\n",
"        expr: regex applied with ``re.search`` to each file name; the default\n",
"            empty pattern matches every file.\n",
"        **kwargs: forwarded to ``re.search`` (e.g. ``flags=re.I``).\n",
"\n",
"    Returns:\n",
"        dict mapping the last ``_``-separated token of the file stem to the\n",
"        list of parsed JSON documents whose file shares that token.\n",
"    \"\"\"\n",
"    grouped = {}\n",
"    for p in path.rglob('*'):\n",
"        # rglob('*') also yields directories; read_bytes() on one would raise\n",
"        if p.is_file() and re.search(expr, p.name, **kwargs):\n",
"            key = p.stem.split('_')[-1]\n",
"            grouped.setdefault(key, []).append(orjson.loads(p.read_bytes()))\n",
"    return grouped"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b0addc33",
"metadata": {},
"outputs": [],
"source": [
"PATH = Path('data/raw')\n",
"\n",
"# filter for users who favorited or retweeted a tweet\n",
"data = get_data(PATH, expr='Favoriters|Retweeters')"
]
},
{
"cell_type": "markdown",
"id": "09efb374",
"metadata": {},
"source": [
"### polars"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e3a70d0e",
"metadata": {},
"outputs": [],
"source": [
"def get_user_details(data: dict, cols: list = None, sort: str = 'created_at') -> pl.LazyFrame:\n",
"    \"\"\"Build a lazy frame with one row per unique user found in ``data``.\n",
"\n",
"    Args:\n",
"        data: nested data; every ``user_results`` entry located by ``find_key``\n",
"            is inspected, and entries without a non-empty ``legacy`` payload\n",
"            are skipped.\n",
"        cols: columns to keep, in order. ``None`` (the default) keeps every column.\n",
"        sort: column to sort by. Rows are sorted descending by default;\n",
"            prefix the name with '-' to sort ascending.\n",
"\n",
"    Returns:\n",
"        A ``pl.LazyFrame``; call ``.collect()`` to materialise it.\n",
"    \"\"\"\n",
"    numeric = [\n",
"        'fast_followers_count',\n",
"        'favourites_count',\n",
"        'followers_count',\n",
"        'friends_count',\n",
"        'listed_count',\n",
"        'media_count',\n",
"        'normal_followers_count',\n",
"        'statuses_count',\n",
"    ]\n",
"\n",
"    records = []\n",
"    for u in find_key(data, 'user_results'):\n",
"        result = u.get('result', {})\n",
"        rest_id = result.get('rest_id')\n",
"        if legacy := result.get('legacy', {}):\n",
"            records.append({'rest_id': rest_id} | legacy)\n",
"\n",
"    # only a leading '-' is a direction marker; a hyphen elsewhere in the\n",
"    # column name must not flip the sort order\n",
"    ascending = sort.startswith('-')\n",
"    lf = (\n",
"        pl.LazyFrame(records)\n",
"        .unique(subset='rest_id')\n",
"        .pipe(to_dt, '%a %b %d %H:%M:%S %z %Y', 'created_at')\n",
"        .pipe(to_int, *numeric)\n",
"        .sort(sort.removeprefix('-'), descending=not ascending)\n",
"    )\n",
"    # cols=None means no projection, so only select when columns were requested\n",
"    return lf if cols is None else lf.select(cols)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "91495fc2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr > th,\n",
".dataframe > tbody > tr > td {\n",
" text-align: right;\n",
"}\n",
"</style>\n",
"<small>shape: (1855, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>created_at</th><th>screen_name</th><th>followers_count</th></tr><tr><td>datetime[μs, +00:00]</td><td>str</td><td>i64</td></tr></thead><tbody><tr><td>2007-03-31 01:16:45 +00:00</td><td>&quot;TheLos&quot;</td><td>1601</td></tr><tr><td>2008-03-18 19:04:59 +00:00</td><td>&quot;wickedjava&quot;</td><td>2986</td></tr><tr><td>2008-04-17 17:30:21 +00:00</td><td>&quot;needless_input...</td><td>218</td></tr><tr><td>2008-06-27 08:58:13 +00:00</td><td>&quot;DebrisStorm&quot;</td><td>178</td></tr><tr><td>2008-07-26 21:58:07 +00:00</td><td>&quot;daka17&quot;</td><td>66</td></tr><tr><td>2008-09-03 23:27:25 +00:00</td><td>&quot;heyitsaaron&quot;</td><td>1230</td></tr><tr><td>2008-09-11 23:37:14 +00:00</td><td>&quot;marinamiss&quot;</td><td>771</td></tr><tr><td>2008-09-18 13:59:25 +00:00</td><td>&quot;shangrila79&quot;</td><td>229</td></tr><tr><td>2008-10-11 07:18:09 +00:00</td><td>&quot;fridayschild71...</td><td>183</td></tr><tr><td>2008-10-27 19:40:43 +00:00</td><td>&quot;Jacelendrahz&quot;</td><td>188</td></tr><tr><td>2008-11-06 21:50:56 +00:00</td><td>&quot;yolo_pinyato&quot;</td><td>2944</td></tr><tr><td>2008-12-05 07:33:23 +00:00</td><td>&quot;El_Dandy40&quot;</td><td>205</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>2023-02-06 15:48:26 +00:00</td><td>&quot;CosmicGhidorah...</td><td>11</td></tr><tr><td>2023-02-08 21:09:17 +00:00</td><td>&quot;backupfHell&quot;</td><td>14</td></tr><tr><td>2023-02-09 19:24:12 +00:00</td><td>&quot;KayFabulous80&quot;</td><td>144</td></tr><tr><td>2023-02-14 04:06:11 +00:00</td><td>&quot;HDBNGRClub&quot;</td><td>3</td></tr><tr><td>2023-02-16 18:38:48 +00:00</td><td>&quot;SladjaMilov14&quot;</td><td>1</td></tr><tr><td>2023-02-17 22:38:58 +00:00</td><td>&quot;c0pas27&quot;</td><td>53</td></tr><tr><td>2023-02-19 06:35:24 +00:00</td><td>&quot;B4NKSCLUB&quot;</td><td>13</td></tr><tr><td>2023-02-19 07:06:15 
+00:00</td><td>&quot;Later_Hayter&quot;</td><td>54</td></tr><tr><td>2023-02-21 06:47:49 +00:00</td><td>&quot;hart_kanya&quot;</td><td>2</td></tr><tr><td>2023-02-26 09:43:04 +00:00</td><td>&quot;_Val_Nichole&quot;</td><td>62</td></tr><tr><td>2023-03-04 23:50:32 +00:00</td><td>&quot;Chublosophy&quot;</td><td>346</td></tr><tr><td>2023-03-05 20:56:30 +00:00</td><td>&quot;Erron_20&quot;</td><td>8</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (1855, 3)\n",
"┌────────────────────────────┬────────────────┬─────────────────┐\n",
"│ created_at ┆ screen_name ┆ followers_count │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ datetime[μs, +00:00] ┆ str ┆ i64 │\n",
"╞════════════════════════════╪════════════════╪═════════════════╡\n",
"│ 2007-03-31 01:16:45 +00:00 ┆ TheLos ┆ 1601 │\n",
"│ 2008-03-18 19:04:59 +00:00 ┆ wickedjava ┆ 2986 │\n",
"│ 2008-04-17 17:30:21 +00:00 ┆ needless_input ┆ 218 │\n",
"│ 2008-06-27 08:58:13 +00:00 ┆ DebrisStorm ┆ 178 │\n",
"│ … ┆ … ┆ … │\n",
"│ 2023-02-21 06:47:49 +00:00 ┆ hart_kanya ┆ 2 │\n",
"│ 2023-02-26 09:43:04 +00:00 ┆ _Val_Nichole ┆ 62 │\n",
"│ 2023-03-04 23:50:32 +00:00 ┆ Chublosophy ┆ 346 │\n",
"│ 2023-03-05 20:56:30 +00:00 ┆ Erron_20 ┆ 8 │\n",
"└────────────────────────────┴────────────────┴─────────────────┘"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lf = get_user_details(\n",
" data,\n",
" cols=['created_at', 'screen_name', 'followers_count'],\n",
" sort='-created_at',\n",
")\n",
"\n",
"lf.collect()"
]
},
{
"cell_type": "markdown",
"id": "03aa8cc0",
"metadata": {},
"source": [
"### pandas"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4815e47f",
"metadata": {},
"outputs": [],
"source": [
"def get_user_details2(data: dict, cols: list = None, sort: str = 'created_at') -> pd.DataFrame:\n",
" D = []\n",
" for u in find_key(data, 'user_results'):\n",
" x = u.get('result', {})\n",
@@ -56,7 +193,7 @@
" pd.DataFrame(D)\n",
" .drop_duplicates('rest_id')\n",
" .assign(created_at=lambda x: pd.to_datetime(x['created_at']))\n",
" .sort_values(sort.strip('+'), ascending='+' in sort)\n",
" .sort_values(sort.strip('-'), ascending='-' in sort)\n",
" .reset_index(drop=True)\n",
" )\n",
" n = [x for x in df.columns if 'count' in x]\n",
@@ -66,21 +203,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 8,
"id": "feb0251b",
"metadata": {},
"outputs": [],
"source": [
"PATH = Path('data/raw')\n",
"\n",
"data = get_data(PATH, expr='Favoriters|Retweeters') # filter for users who favorited or retweeted a tweet"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c711659e",
"metadata": {},
"outputs": [
{
"data": {
@@ -111,33 +236,33 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2014-07-14 20:24:30+00:00</td>\n",
" <td>larry_deramus</td>\n",
" <td>25513</td>\n",
" <td>2007-03-31 01:16:45+00:00</td>\n",
" <td>TheLos</td>\n",
" <td>1601</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2012-12-06 17:07:13+00:00</td>\n",
" <td>OneAhmedSagheer</td>\n",
" <td>22092</td>\n",
" <td>2008-03-18 19:04:59+00:00</td>\n",
" <td>wickedjava</td>\n",
" <td>2986</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2013-08-12 21:56:09+00:00</td>\n",
" <td>eriikaswrld</td>\n",
" <td>16872</td>\n",
" <td>2008-04-17 17:30:21+00:00</td>\n",
" <td>needless_input</td>\n",
" <td>218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2009-04-17 06:02:06+00:00</td>\n",
" <td>BrandonEsWolf</td>\n",
" <td>15561</td>\n",
" <td>2008-06-27 08:58:13+00:00</td>\n",
" <td>DebrisStorm</td>\n",
" <td>178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2009-06-18 23:30:10+00:00</td>\n",
" <td>badgalvitoria</td>\n",
" <td>14621</td>\n",
" <td>2008-07-26 21:58:07+00:00</td>\n",
" <td>daka17</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
@@ -147,33 +272,33 @@
" </tr>\n",
" <tr>\n",
" <th>1850</th>\n",
" <td>2023-01-06 17:18:17+00:00</td>\n",
" <td>Jeff62245805</td>\n",
" <td>1</td>\n",
" <td>2023-02-19 07:06:15+00:00</td>\n",
" <td>Later_Hayter</td>\n",
" <td>54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1851</th>\n",
" <td>2023-02-16 18:38:48+00:00</td>\n",
" <td>SladjaMilov14</td>\n",
" <td>1</td>\n",
" <td>2023-02-21 06:47:49+00:00</td>\n",
" <td>hart_kanya</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1852</th>\n",
" <td>2022-10-20 23:47:34+00:00</td>\n",
" <td>DylonsaurusRex</td>\n",
" <td>0</td>\n",
" <td>2023-02-26 09:43:04+00:00</td>\n",
" <td>_Val_Nichole</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1853</th>\n",
" <td>2022-10-14 22:10:56+00:00</td>\n",
" <td>SanduskyAddison</td>\n",
" <td>0</td>\n",
" <td>2023-03-04 23:50:32+00:00</td>\n",
" <td>Chublosophy</td>\n",
" <td>346</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1854</th>\n",
" <td>2020-10-11 18:43:54+00:00</td>\n",
" <td>Jeff59977360</td>\n",
" <td>0</td>\n",
" <td>2023-03-05 20:56:30+00:00</td>\n",
" <td>Erron_20</td>\n",
" <td>8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -181,32 +306,36 @@
"</div>"
],
"text/plain": [
" created_at screen_name followers_count\n",
"0 2014-07-14 20:24:30+00:00 larry_deramus 25513\n",
"1 2012-12-06 17:07:13+00:00 OneAhmedSagheer 22092\n",
"2 2013-08-12 21:56:09+00:00 eriikaswrld 16872\n",
"3 2009-04-17 06:02:06+00:00 BrandonEsWolf 15561\n",
"4 2009-06-18 23:30:10+00:00 badgalvitoria 14621\n",
"... ... ... ...\n",
"1850 2023-01-06 17:18:17+00:00 Jeff62245805 1\n",
"1851 2023-02-16 18:38:48+00:00 SladjaMilov14 1\n",
"1852 2022-10-20 23:47:34+00:00 DylonsaurusRex 0\n",
"1853 2022-10-14 22:10:56+00:00 SanduskyAddison 0\n",
"1854 2020-10-11 18:43:54+00:00 Jeff59977360 0\n",
" created_at screen_name followers_count\n",
"0 2007-03-31 01:16:45+00:00 TheLos 1601\n",
"1 2008-03-18 19:04:59+00:00 wickedjava 2986\n",
"2 2008-04-17 17:30:21+00:00 needless_input 218\n",
"3 2008-06-27 08:58:13+00:00 DebrisStorm 178\n",
"4 2008-07-26 21:58:07+00:00 daka17 66\n",
"... ... ... ...\n",
"1850 2023-02-19 07:06:15+00:00 Later_Hayter 54\n",
"1851 2023-02-21 06:47:49+00:00 hart_kanya 2\n",
"1852 2023-02-26 09:43:04+00:00 _Val_Nichole 62\n",
"1853 2023-03-04 23:50:32+00:00 Chublosophy 346\n",
"1854 2023-03-05 20:56:30+00:00 Erron_20 8\n",
"\n",
"[1855 rows x 3 columns]"
]
},
"execution_count": 5,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = get_user_details(\n",
"PATH = Path('/home/x/PycharmProjects/twitter-api-client/_test/data')\n",
"\n",
"data = get_data(PATH, expr='Favoriters|Retweeters') # filter for users who favorited or retweeted a tweet\n",
"\n",
"df = get_user_details2(\n",
" data,\n",
" cols = ['created_at','screen_name','followers_count'],\n",
" sort = 'followers_count',\n",
" sort = '-created_at',\n",
")\n",
"\n",
"df"

View File

@@ -13,7 +13,7 @@ install_requires = [
setup(
name="twitter-api-client",
version="0.6.0",
version="0.6.1",
python_requires=">=3.11.0",
description="Twitter API",
long_description=dedent('''