add example notebook

This commit is contained in:
Trevor Hobenshield
2024-04-05 14:39:48 -07:00
parent 7002b140aa
commit e388f3174c
6 changed files with 1584 additions and 2161 deletions

View File

@@ -1,401 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"> Note: the structure of GraphQL responses is not consistent, so these examples may not work in all cases."
],
"metadata": {
"collapsed": false
},
"id": "fce8131509380867"
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6c99787b",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"from twitter.scraper import Scraper\n",
"from twitter.util import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd3b7a57",
"metadata": {},
"outputs": [],
"source": [
"scraper = Scraper(session=init_session())"
]
},
{
"cell_type": "markdown",
"id": "17a91f72",
"metadata": {},
"source": [
"### get chat log"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "597b3a0f",
"metadata": {},
"outputs": [],
"source": [
"room_id = '1eaJbrAPnBVJX'\n",
"spaces = scraper.spaces(rooms=[room_id], audio=0, chat=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "faaa76b1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>room</th>\n",
" <th>timestamp</th>\n",
" <th>twitter_id</th>\n",
" <th>username</th>\n",
" <th>body</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:32:19</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:32:55</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:33:01</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:33:13</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>Hi Alan, are you there?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:33:17</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>I am.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:00</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>Yeah, it's a great platform.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>252</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:03</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Alright, cool. Sounds good. Thank you.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>253</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:03</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>OK. Thank you.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:04</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Right.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:05</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>Bye.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>256 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" room timestamp twitter_id username \n",
"0 1eaJbrAPnBVJX 2023-05-25 21:32:19 1106321031566893057 jimfarley98 \\\n",
"1 1eaJbrAPnBVJX 2023-05-25 21:32:55 1106321031566893057 jimfarley98 \n",
"2 1eaJbrAPnBVJX 2023-05-25 21:33:01 1106321031566893057 jimfarley98 \n",
"3 1eaJbrAPnBVJX 2023-05-25 21:33:13 1106321031566893057 jimfarley98 \n",
"4 1eaJbrAPnBVJX 2023-05-25 21:33:17 44196397 elonmusk \n",
".. ... ... ... ... \n",
"251 1eaJbrAPnBVJX 2023-05-25 22:00:00 1106321031566893057 jimfarley98 \n",
"252 1eaJbrAPnBVJX 2023-05-25 22:00:03 44196397 elonmusk \n",
"253 1eaJbrAPnBVJX 2023-05-25 22:00:03 1106321031566893057 jimfarley98 \n",
"254 1eaJbrAPnBVJX 2023-05-25 22:00:04 44196397 elonmusk \n",
"255 1eaJbrAPnBVJX 2023-05-25 22:00:05 1106321031566893057 jimfarley98 \n",
"\n",
" body \n",
"0 \n",
"1 \n",
"2 \n",
"3 Hi Alan, are you there? \n",
"4 I am. \n",
".. ... \n",
"251 Yeah, it's a great platform. \n",
"252 Alright, cool. Sounds good. Thank you. \n",
"253 OK. Thank you. \n",
"254 Right. \n",
"255 Bye. \n",
"\n",
"[256 rows x 5 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# flatten the nested chat payload into dotted column names\n",
"chat = pd.json_normalize(spaces[0]['chat'])\n",
"# keep only rows whose payload.body.final flag is True\n",
"chat = chat[chat['payload.body.final'] == True]\n",
"dates = ['payload.body.timestamp']\n",
"# timestamps are read as epoch milliseconds, then rendered as strings\n",
"chat[dates] = chat[dates].apply(pd.to_datetime, unit='ms').apply(lambda x: x.dt.strftime(\"%Y-%m-%d %H:%M:%S %z\"))\n",
"chat = chat.sort_values('payload.body.timestamp').reset_index(drop=True)\n",
"chat = chat[[\n",
"    'payload.room',\n",
"    'payload.body.timestamp',\n",
"    'payload.sender.twitter_id',\n",
"    'payload.body.username',\n",
"    'payload.body.body',\n",
"]]\n",
"# strip the 'payload.' / 'body.' / 'sender.' prefixes; the dot is escaped so only a\n",
"# literal '.' is consumed, and the second replace is pinned to literal mode so it\n",
"# behaves the same regardless of the pandas default for `regex`\n",
"chat.columns = chat.columns.str.replace(r'(payload|body|sender)\\.', '', regex=True).str.replace('.', '_', regex=False)\n",
"# chat.to_csv(f'{room_id}.csv',index=False)\n",
"chat"
]
},
{
"cell_type": "markdown",
"id": "118919dd",
"metadata": {},
"source": [
"### query chat"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3010c52a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>room</th>\n",
" <th>timestamp</th>\n",
" <th>twitter_id</th>\n",
" <th>username</th>\n",
" <th>body</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:35:05</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Yeah, well, well, it's certainly super excitin...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:36:00</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>API access so like you know like a Ford vehicl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:36:23</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>And we're very, very much appreciative of of F...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:38:29</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>But but I think it is the the the teams have d...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>191</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:53:52</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Yeah, I agree with that. We should probably no...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>211</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:56:13</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>So, you know, it is certainly the Tesla intent...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>248</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:59:45</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Likewise there, it's an honor to be working wi...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" room timestamp twitter_id username \n",
"22 1eaJbrAPnBVJX 2023-05-25 21:35:05 44196397 elonmusk \\\n",
"30 1eaJbrAPnBVJX 2023-05-25 21:36:00 44196397 elonmusk \n",
"33 1eaJbrAPnBVJX 2023-05-25 21:36:23 44196397 elonmusk \n",
"47 1eaJbrAPnBVJX 2023-05-25 21:38:29 44196397 elonmusk \n",
"191 1eaJbrAPnBVJX 2023-05-25 21:53:52 44196397 elonmusk \n",
"211 1eaJbrAPnBVJX 2023-05-25 21:56:13 44196397 elonmusk \n",
"248 1eaJbrAPnBVJX 2023-05-25 21:59:45 44196397 elonmusk \n",
"\n",
" body \n",
"22 Yeah, well, well, it's certainly super excitin... \n",
"30 API access so like you know like a Ford vehicl... \n",
"33 And we're very, very much appreciative of of F... \n",
"47 But but I think it is the the the teams have d... \n",
"191 Yeah, I agree with that. We should probably no... \n",
"211 So, you know, it is certainly the Tesla intent... \n",
"248 Likewise there, it's an honor to be working wi... "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flags = re.I\n",
"(chat\n",
"    .query('body.str.contains(\"\\sford\",regex=True,flags=@flags)')\n",
" .query('username.str.contains(\"elonmusk\",regex=True,flags=@flags)')\n",
")\n",
"\n",
"## alternatively\n",
"# chat[\n",
"# chat.body.str.contains('\\sford',regex=True,flags=re.I)\n",
"# &\n",
"# chat.username.str.contains('elonmusk',regex=True,flags=re.I)\n",
"# ]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

1584
examples/example.ipynb Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,370 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "802043b7",
"metadata": {},
"source": [
"### polars/pandas examples\n",
"\n",
"> Note: the structure of GraphQL responses is not consistent, so these examples may not work in all cases.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "54cd50bf",
"metadata": {},
"outputs": [],
"source": [
"# !pip uninstall twitter-api-client -y\n",
"# !pip install twitter-api-client --no-cache-dir"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a3172006",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import re\n",
"from pathlib import Path\n",
"\n",
"import orjson\n",
"import pandas as pd\n",
"import polars as pl\n",
"\n",
"from twitter.util import find_key"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4703bee3",
"metadata": {},
"outputs": [],
"source": [
"def to_int(tdf: pl.LazyFrame, *args) -> pl.LazyFrame:\n",
"    \"\"\"Cast every column named in ``args`` to Int64, keeping the column names.\n",
"\n",
"    The cast is requested with ``strict=False``.\n",
"    \"\"\"\n",
"    casts = [pl.col(name).cast(pl.Int64, strict=False).alias(name) for name in args]\n",
"    return tdf.with_columns(casts)\n",
"\n",
"\n",
"def to_dt(tdf: pl.LazyFrame, fmt: str, *args) -> pl.LazyFrame:\n",
"    \"\"\"Parse every string column named in ``args`` into ``pl.Datetime`` using ``fmt``.\"\"\"\n",
"    parsed = [pl.col(name).str.strptime(pl.Datetime, fmt).alias(name) for name in args]\n",
"    return tdf.with_columns(parsed)\n",
"\n",
"\n",
"def get_data(path: Path, expr: str = '', **kwargs) -> dict:\n",
"    \"\"\"Load JSON files found recursively under ``path``, grouped by file-name suffix.\n",
"\n",
"    A file is loaded when ``re.search(expr, file_name, **kwargs)`` matches; extra\n",
"    keyword arguments (e.g. ``flags``) are forwarded to ``re.search``. The group key\n",
"    is the last ``_``-separated token of the file stem, and each group is a list of\n",
"    the parsed file contents.\n",
"    \"\"\"\n",
"    grouped = {}\n",
"    for p in path.rglob('*'):\n",
"        # rglob('*') also yields directories; reading one would raise, and the\n",
"        # default expr '' matches every name, so filter to regular files first\n",
"        if p.is_file() and re.search(expr, p.name, **kwargs):\n",
"            grouped.setdefault(p.stem.split('_')[-1], []).append(orjson.loads(p.read_bytes()))\n",
"    return grouped"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b0addc33",
"metadata": {},
"outputs": [],
"source": [
"PATH = Path('data/raw')\n",
"\n",
"# filter for users who favorited or retweeted a tweet\n",
"data = get_data(PATH, expr='Favoriters|Retweeters')"
]
},
{
"cell_type": "markdown",
"id": "09efb374",
"metadata": {},
"source": [
"### polars"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e3a70d0e",
"metadata": {},
"outputs": [],
"source": [
"def get_user_details(data: dict, cols: list = None, sort: str = 'created_at') -> pl.LazyFrame:\n",
"    \"\"\"Collect user records from ``data`` into a polars LazyFrame.\n",
"\n",
"    Every ``user_results`` entry with a non-empty ``legacy`` dict contributes one row:\n",
"    the ``legacy`` fields plus the entry's ``rest_id``. Rows are de-duplicated on\n",
"    ``rest_id``, ``created_at`` is parsed as a datetime and the count columns listed\n",
"    in ``numeric`` are cast to integers.\n",
"\n",
"    cols: columns to keep; ``None`` (or an empty list) keeps every column.\n",
"    sort: column to sort by. Note the direction: a ``-`` in the value sorts\n",
"        ascending, a plain column name sorts descending.\n",
"    \"\"\"\n",
"    numeric = [\n",
"        'fast_followers_count',\n",
"        'favourites_count',\n",
"        'followers_count',\n",
"        'friends_count',\n",
"        'listed_count',\n",
"        'media_count',\n",
"        'normal_followers_count',\n",
"        'statuses_count',\n",
"    ]\n",
"\n",
"    records = []\n",
"    for u in find_key(data, 'user_results'):\n",
"        result = u.get('result', {})\n",
"        rest_id = result.get('rest_id')\n",
"        # entries without a legacy payload carry no profile fields; skip them\n",
"        if legacy := result.get('legacy', {}):\n",
"            records.append({'rest_id': rest_id} | legacy)\n",
"\n",
"    return (\n",
"        pl.LazyFrame(records)\n",
"        .unique(subset='rest_id')\n",
"        .pipe(to_dt, '%a %b %d %H:%M:%S %z %Y', 'created_at')\n",
"        .pipe(to_int, *numeric)\n",
"        .sort(sort.strip(\"-\"), descending=\"-\" not in sort)\n",
"        # the default cols=None must mean 'no projection', not select(None)\n",
"        .select(cols or pl.all())\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "91495fc2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr > th,\n",
".dataframe > tbody > tr > td {\n",
" text-align: right;\n",
"}\n",
"</style>\n",
"<small>shape: (1855, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>created_at</th><th>screen_name</th><th>followers_count</th></tr><tr><td>datetime[μs, +00:00]</td><td>str</td><td>i64</td></tr></thead><tbody><tr><td>2007-03-31 01:16:45 +00:00</td><td>&quot;TheLos&quot;</td><td>1601</td></tr><tr><td>2008-03-18 19:04:59 +00:00</td><td>&quot;wickedjava&quot;</td><td>2986</td></tr><tr><td>2008-04-17 17:30:21 +00:00</td><td>&quot;needless_input...</td><td>218</td></tr><tr><td>2008-06-27 08:58:13 +00:00</td><td>&quot;DebrisStorm&quot;</td><td>178</td></tr><tr><td>2008-07-26 21:58:07 +00:00</td><td>&quot;daka17&quot;</td><td>66</td></tr><tr><td>2008-09-03 23:27:25 +00:00</td><td>&quot;heyitsaaron&quot;</td><td>1230</td></tr><tr><td>2008-09-11 23:37:14 +00:00</td><td>&quot;marinamiss&quot;</td><td>771</td></tr><tr><td>2008-09-18 13:59:25 +00:00</td><td>&quot;shangrila79&quot;</td><td>229</td></tr><tr><td>2008-10-11 07:18:09 +00:00</td><td>&quot;fridayschild71...</td><td>183</td></tr><tr><td>2008-10-27 19:40:43 +00:00</td><td>&quot;Jacelendrahz&quot;</td><td>188</td></tr><tr><td>2008-11-06 21:50:56 +00:00</td><td>&quot;yolo_pinyato&quot;</td><td>2944</td></tr><tr><td>2008-12-05 07:33:23 +00:00</td><td>&quot;El_Dandy40&quot;</td><td>205</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>2023-02-06 15:48:26 +00:00</td><td>&quot;CosmicGhidorah...</td><td>11</td></tr><tr><td>2023-02-08 21:09:17 +00:00</td><td>&quot;backupfHell&quot;</td><td>14</td></tr><tr><td>2023-02-09 19:24:12 +00:00</td><td>&quot;KayFabulous80&quot;</td><td>144</td></tr><tr><td>2023-02-14 04:06:11 +00:00</td><td>&quot;HDBNGRClub&quot;</td><td>3</td></tr><tr><td>2023-02-16 18:38:48 +00:00</td><td>&quot;SladjaMilov14&quot;</td><td>1</td></tr><tr><td>2023-02-17 22:38:58 +00:00</td><td>&quot;c0pas27&quot;</td><td>53</td></tr><tr><td>2023-02-19 06:35:24 +00:00</td><td>&quot;B4NKSCLUB&quot;</td><td>13</td></tr><tr><td>2023-02-19 07:06:15 
+00:00</td><td>&quot;Later_Hayter&quot;</td><td>54</td></tr><tr><td>2023-02-21 06:47:49 +00:00</td><td>&quot;hart_kanya&quot;</td><td>2</td></tr><tr><td>2023-02-26 09:43:04 +00:00</td><td>&quot;_Val_Nichole&quot;</td><td>62</td></tr><tr><td>2023-03-04 23:50:32 +00:00</td><td>&quot;Chublosophy&quot;</td><td>346</td></tr><tr><td>2023-03-05 20:56:30 +00:00</td><td>&quot;Erron_20&quot;</td><td>8</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (1855, 3)\n",
"┌────────────────────────────┬────────────────┬─────────────────┐\n",
"│ created_at ┆ screen_name ┆ followers_count │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ datetime[μs, +00:00] ┆ str ┆ i64 │\n",
"╞════════════════════════════╪════════════════╪═════════════════╡\n",
"│ 2007-03-31 01:16:45 +00:00 ┆ TheLos ┆ 1601 │\n",
"│ 2008-03-18 19:04:59 +00:00 ┆ wickedjava ┆ 2986 │\n",
"│ 2008-04-17 17:30:21 +00:00 ┆ needless_input ┆ 218 │\n",
"│ 2008-06-27 08:58:13 +00:00 ┆ DebrisStorm ┆ 178 │\n",
"│ … ┆ … ┆ … │\n",
"│ 2023-02-21 06:47:49 +00:00 ┆ hart_kanya ┆ 2 │\n",
"│ 2023-02-26 09:43:04 +00:00 ┆ _Val_Nichole ┆ 62 │\n",
"│ 2023-03-04 23:50:32 +00:00 ┆ Chublosophy ┆ 346 │\n",
"│ 2023-03-05 20:56:30 +00:00 ┆ Erron_20 ┆ 8 │\n",
"└────────────────────────────┴────────────────┴─────────────────┘"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lf = get_user_details(\n",
" data,\n",
" cols=['created_at', 'screen_name', 'followers_count'],\n",
" sort='-created_at',\n",
")\n",
"\n",
"lf.collect()"
]
},
{
"cell_type": "markdown",
"id": "03aa8cc0",
"metadata": {},
"source": [
"### pandas"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4815e47f",
"metadata": {},
"outputs": [],
"source": [
"def get_user_details2(data: dict, cols: list = None, sort: str = 'created_at') -> pd.DataFrame:\n",
"    \"\"\"Collect user records from ``data`` into a pandas DataFrame.\n",
"\n",
"    Every ``user_results`` entry with a non-empty ``legacy`` dict contributes one row:\n",
"    the ``legacy`` fields plus the entry's ``rest_id``. Rows are de-duplicated on\n",
"    ``rest_id``, ``created_at`` is parsed as a datetime, and every column whose name\n",
"    contains ``count`` is converted to numeric (unparseable values become NaN).\n",
"\n",
"    cols: columns to return; ``None`` (or an empty list) returns every column.\n",
"    sort: column to sort by. Note the direction: a ``-`` in the value sorts\n",
"        ascending, a plain column name sorts descending.\n",
"    \"\"\"\n",
"    records = []\n",
"    for u in find_key(data, 'user_results'):\n",
"        result = u.get('result', {})\n",
"        # entries without a legacy payload carry no profile fields; skip them\n",
"        if legacy := result.get('legacy', {}):\n",
"            records.append({'rest_id': result.get('rest_id')} | legacy)\n",
"    df = (\n",
"        pd.DataFrame(records)\n",
"        .drop_duplicates('rest_id')\n",
"        # explicit format so parsing does not depend on pandas' format inference\n",
"        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format='%a %b %d %H:%M:%S %z %Y'))\n",
"        .sort_values(sort.strip('-'), ascending='-' in sort)\n",
"        .reset_index(drop=True)\n",
"    )\n",
"    count_cols = [c for c in df.columns if 'count' in c]\n",
"    df[count_cols] = df[count_cols].apply(pd.to_numeric, errors='coerce')\n",
"    return df[cols] if cols else df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "feb0251b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>created_at</th>\n",
" <th>screen_name</th>\n",
" <th>followers_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2007-03-31 01:16:45+00:00</td>\n",
" <td>TheLos</td>\n",
" <td>1601</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2008-03-18 19:04:59+00:00</td>\n",
" <td>wickedjava</td>\n",
" <td>2986</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2008-04-17 17:30:21+00:00</td>\n",
" <td>needless_input</td>\n",
" <td>218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2008-06-27 08:58:13+00:00</td>\n",
" <td>DebrisStorm</td>\n",
" <td>178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2008-07-26 21:58:07+00:00</td>\n",
" <td>daka17</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1850</th>\n",
" <td>2023-02-19 07:06:15+00:00</td>\n",
" <td>Later_Hayter</td>\n",
" <td>54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1851</th>\n",
" <td>2023-02-21 06:47:49+00:00</td>\n",
" <td>hart_kanya</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1852</th>\n",
" <td>2023-02-26 09:43:04+00:00</td>\n",
" <td>_Val_Nichole</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1853</th>\n",
" <td>2023-03-04 23:50:32+00:00</td>\n",
" <td>Chublosophy</td>\n",
" <td>346</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1854</th>\n",
" <td>2023-03-05 20:56:30+00:00</td>\n",
" <td>Erron_20</td>\n",
" <td>8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1855 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" created_at screen_name followers_count\n",
"0 2007-03-31 01:16:45+00:00 TheLos 1601\n",
"1 2008-03-18 19:04:59+00:00 wickedjava 2986\n",
"2 2008-04-17 17:30:21+00:00 needless_input 218\n",
"3 2008-06-27 08:58:13+00:00 DebrisStorm 178\n",
"4 2008-07-26 21:58:07+00:00 daka17 66\n",
"... ... ... ...\n",
"1850 2023-02-19 07:06:15+00:00 Later_Hayter 54\n",
"1851 2023-02-21 06:47:49+00:00 hart_kanya 2\n",
"1852 2023-02-26 09:43:04+00:00 _Val_Nichole 62\n",
"1853 2023-03-04 23:50:32+00:00 Chublosophy 346\n",
"1854 2023-03-05 20:56:30+00:00 Erron_20 8\n",
"\n",
"[1855 rows x 3 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PATH = Path('data/raw')\n",
"\n",
"data = get_data(PATH, expr='Favoriters|Retweeters') # filter for users who favorited or retweeted a tweet\n",
"\n",
"df = get_user_details2(\n",
" data,\n",
" cols = ['created_at','screen_name','followers_count'],\n",
" sort = '-created_at',\n",
")\n",
"\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,353 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"> Note: the structure of GraphQL responses is not consistent, so these examples may not work in all cases."
],
"metadata": {
"collapsed": false
},
"id": "85ee96fb4899b369"
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d815a387",
"metadata": {},
"outputs": [],
"source": [
"# !pip uninstall twitter-api-client -y\n",
"# !pip install twitter-api-client --no-cache-dir"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1ecf8cb",
"metadata": {},
"outputs": [],
"source": [
"from twitter.search import Search\n",
"import pandas as pd\n",
"\n",
"email, username, password = ..., ..., ...\n",
"search = Search(email, username, password)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "98c65601",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-05-18 21:20:12,075.075 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:12,656.656 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:13,452.452 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:13,899.899 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:14,539.539 DEBUG: \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n",
"2023-05-18 21:20:14,938.938 DEBUG: [\u001B[32msuccess\u001B[0m] returned 101 search results for \u001B[37mjennifer hudson since:2023-05-18\u001B[0m\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>created_at</th>\n",
" <th>id</th>\n",
" <th>user_id</th>\n",
" <th>full_text</th>\n",
" <th>lang</th>\n",
" <th>user_url</th>\n",
" <th>tweet_url</th>\n",
" <th>geo</th>\n",
" <th>coordinates</th>\n",
" <th>place</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-05-19 04:07:19+00:00</td>\n",
" <td>1659410380026773509</td>\n",
" <td>809177430602776576</td>\n",
" <td>@msdarlin_ JENNIFER HUDSON first considered al...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/809177430602776576</td>\n",
" <td>https://twitter.com/i/status/1659410380026773509</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-19 03:50:16+00:00</td>\n",
" <td>1659406088578428929</td>\n",
" <td>21226048</td>\n",
" <td>Jennifer Hudson - Believe https://t.co/vjqlw52MjO</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/21226048</td>\n",
" <td>https://twitter.com/i/status/1659406088578428929</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-05-19 03:03:12+00:00</td>\n",
" <td>1659394245835255808</td>\n",
" <td>174826024</td>\n",
" <td>If Fantasia and Jennifer Hudson do this verzuz...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/174826024</td>\n",
" <td>https://twitter.com/i/status/1659394245835255808</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-05-19 02:44:21+00:00</td>\n",
" <td>1659389499221188609</td>\n",
" <td>1143382733001039873</td>\n",
" <td>jennifer hudson acabou de postar uma foto e no...</td>\n",
" <td>pt</td>\n",
" <td>https://twitter.com/i/user/1143382733001039873</td>\n",
" <td>https://twitter.com/i/status/1659389499221188609</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-05-19 02:41:35+00:00</td>\n",
" <td>1659388805118578689</td>\n",
" <td>1342931884150464512</td>\n",
" <td>Jennifer Hudson</td>\n",
" <td>cy</td>\n",
" <td>https://twitter.com/i/user/1342931884150464512</td>\n",
" <td>https://twitter.com/i/status/1659388805118578689</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>2023-05-17 20:57:29+00:00</td>\n",
" <td>1658939820574400516</td>\n",
" <td>534285941</td>\n",
" <td>I cant 🤣🤣 https://t.co/2tiIyHrMb7</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/534285941</td>\n",
" <td>https://twitter.com/i/status/1658939820574400516</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>2023-05-17 19:46:21+00:00</td>\n",
" <td>1658921918890758148</td>\n",
" <td>417935020</td>\n",
" <td>Idk ask her https://t.co/md7BJf59C2</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/417935020</td>\n",
" <td>https://twitter.com/i/status/1658921918890758148</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>2023-05-17 19:09:50+00:00</td>\n",
" <td>1658912730991009795</td>\n",
" <td>2384861195</td>\n",
" <td>My best hip hop female Dj @ChainzMsDj Dancing ...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/2384861195</td>\n",
" <td>https://twitter.com/i/status/1658912730991009795</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>2023-05-17 16:56:52+00:00</td>\n",
" <td>1658879269232320514</td>\n",
" <td>15733529</td>\n",
" <td>Kelly will sing with D. Smooth\\n\\nThe Complete...</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/15733529</td>\n",
" <td>https://twitter.com/i/status/1658879269232320514</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>2023-05-17 14:20:31+00:00</td>\n",
" <td>1658839919278653444</td>\n",
" <td>17230018</td>\n",
" <td>my dream collab? gimme _____ and ______.</td>\n",
" <td>en</td>\n",
" <td>https://twitter.com/i/user/17230018</td>\n",
" <td>https://twitter.com/i/status/1658839919278653444</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>101 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" created_at id user_id \n",
"0 2023-05-19 04:07:19+00:00 1659410380026773509 809177430602776576 \\\n",
"1 2023-05-19 03:50:16+00:00 1659406088578428929 21226048 \n",
"2 2023-05-19 03:03:12+00:00 1659394245835255808 174826024 \n",
"3 2023-05-19 02:44:21+00:00 1659389499221188609 1143382733001039873 \n",
"4 2023-05-19 02:41:35+00:00 1659388805118578689 1342931884150464512 \n",
".. ... ... ... \n",
"96 2023-05-17 20:57:29+00:00 1658939820574400516 534285941 \n",
"97 2023-05-17 19:46:21+00:00 1658921918890758148 417935020 \n",
"98 2023-05-17 19:09:50+00:00 1658912730991009795 2384861195 \n",
"99 2023-05-17 16:56:52+00:00 1658879269232320514 15733529 \n",
"100 2023-05-17 14:20:31+00:00 1658839919278653444 17230018 \n",
"\n",
" full_text lang \n",
"0 @msdarlin_ JENNIFER HUDSON first considered al... en \\\n",
"1 Jennifer Hudson - Believe https://t.co/vjqlw52MjO en \n",
"2 If Fantasia and Jennifer Hudson do this verzuz... en \n",
"3 jennifer hudson acabou de postar uma foto e no... pt \n",
"4 Jennifer Hudson cy \n",
".. ... ... \n",
"96 I cant 🤣🤣 https://t.co/2tiIyHrMb7 en \n",
"97 Idk ask her https://t.co/md7BJf59C2 en \n",
"98 My best hip hop female Dj @ChainzMsDj Dancing ... en \n",
"99 Kelly will sing with D. Smooth\\n\\nThe Complete... en \n",
"100 my dream collab? gimme _____ and ______. en \n",
"\n",
" user_url \n",
"0 https://twitter.com/i/user/809177430602776576 \\\n",
"1 https://twitter.com/i/user/21226048 \n",
"2 https://twitter.com/i/user/174826024 \n",
"3 https://twitter.com/i/user/1143382733001039873 \n",
"4 https://twitter.com/i/user/1342931884150464512 \n",
".. ... \n",
"96 https://twitter.com/i/user/534285941 \n",
"97 https://twitter.com/i/user/417935020 \n",
"98 https://twitter.com/i/user/2384861195 \n",
"99 https://twitter.com/i/user/15733529 \n",
"100 https://twitter.com/i/user/17230018 \n",
"\n",
" tweet_url geo coordinates place \n",
"0 https://twitter.com/i/status/1659410380026773509 None None None \n",
"1 https://twitter.com/i/status/1659406088578428929 None None None \n",
"2 https://twitter.com/i/status/1659394245835255808 None None None \n",
"3 https://twitter.com/i/status/1659389499221188609 None None None \n",
"4 https://twitter.com/i/status/1659388805118578689 None None None \n",
".. ... ... ... ... \n",
"96 https://twitter.com/i/status/1658939820574400516 None None None \n",
"97 https://twitter.com/i/status/1658921918890758148 None None None \n",
"98 https://twitter.com/i/status/1658912730991009795 None None None \n",
"99 https://twitter.com/i/status/1658879269232320514 None None None \n",
"100 https://twitter.com/i/status/1658839919278653444 None None None \n",
"\n",
"[101 rows x 10 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latest_results = search.run(\n",
" 'jennifer hudson since:2023-05-18',\n",
" limit=100,\n",
" latest=True, # get latest tweets only\n",
" retries=3,\n",
")\n",
"\n",
"flat_results = [y for x in latest_results for y in x]\n",
"data = [r.get('globalObjects', {}).get('tweets', {})for r in flat_results]\n",
"base= 'https://twitter.com/i'\n",
"\n",
"df = (\n",
" pd.DataFrame({k:v for d in data for k,v in d.items()})\n",
" .T\n",
" .assign(created_at = lambda x: pd.to_datetime(x['created_at'], format='%a %b %d %H:%M:%S %z %Y'))\n",
" .assign(user_url = lambda x: f\"{base}/user/\"+x['user_id_str'])\n",
" .assign(tweet_url = lambda x: f\"{base}/status/\"+x['id_str'] )\n",
" .sort_values('created_at',ascending=False)\n",
" .drop_duplicates('id')\n",
" .reset_index(drop=True)\n",
")\n",
"\n",
"# sample df with a few cols of interest\n",
"sample = df[['created_at','id','user_id','full_text', 'lang',\n",
" 'user_url', 'tweet_url', 'geo', 'coordinates', 'place']]\n",
"\n",
"sample"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,258 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"> Note: the structure of GraphQL responses is not consistent, so these examples may not work in all cases."
],
"metadata": {
"collapsed": false
},
"id": "56e4acd2bbce8025"
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2f27b8db",
"metadata": {},
"outputs": [],
"source": [
"# !pip install twitter-api-client==0.9.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7885eecd",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from twitter.scraper import Scraper\n",
"# Explicit names instead of `import *`, so it is clear where each helper comes from.\n",
"from twitter.util import find_key, init_session\n",
"\n",
"scraper = Scraper(session=init_session())\n",
"\n",
"# example tweet\n",
"tweet = scraper.tweets_details([1476988122986647553], limit=500, pbar=False)\n",
"\n",
"# unnest items and filter deleted tweets\n",
"items = [y for x in find_key(tweet, 'items') for y in x if not find_key(y, 'tombstone')]\n",
"\n",
"# index into relevant data points\n",
"tweet_results = [x.get('result') for x in find_key(items, 'tweet_results')]\n",
"\n",
"df = (\n",
"    pd.json_normalize(tweet_results)\n",
"    # remove duplicate replies if needed\n",
"    .drop_duplicates('rest_id')\n",
"    # clean up column names for illustrative purposes\n",
"    # (explicit timestamp format, so pandas does not have to infer it)\n",
"    .assign(date=lambda x: pd.to_datetime(x['legacy.created_at'], format='%a %b %d %H:%M:%S %z %Y').dt.strftime(\"%Y-%m-%d %H:%M:%S\"))\n",
"    .assign(root_tweet=lambda x: x['legacy.conversation_id_str'])\n",
"    .assign(text=lambda x: x['legacy.full_text'])\n",
"    .assign(tweet=lambda x: x['rest_id'])\n",
"    .assign(username=lambda x: x['core.user_results.result.legacy.screen_name'])\n",
"    # sort by newest replies to root_tweet ('date' strings are year-first, so they sort chronologically)\n",
"    .sort_values('date', ascending=False)\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f5ddad65",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>username</th>\n",
" <th>tweet</th>\n",
" <th>text</th>\n",
" <th>root_tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-05-25 19:52:44</td>\n",
" <td>dtweedys</td>\n",
" <td>1661822628393254913</td>\n",
" <td>@MKBHD 🤺🔸💜 #SubTweets ftw 🔸🌑🕊️</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-07 14:30:30</td>\n",
" <td>BallSchlonger</td>\n",
" <td>1655218553858015233</td>\n",
" <td>@jacknft_4 @MKBHD sorry i cant</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-05-03 21:13:33</td>\n",
" <td>jacknft_4</td>\n",
" <td>1653870434075410436</td>\n",
" <td>@MKBHD Great! Dm me please</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-05-02 17:38:40</td>\n",
" <td>tanmayshah28</td>\n",
" <td>1653453969476456450</td>\n",
" <td>@MKBHD Thank you! Lets too!</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-04-02 14:08:55</td>\n",
" <td>iStevenPlays</td>\n",
" <td>1642529545415712768</td>\n",
" <td>@MKBHD I got you.\\n\\n🔥__🔥</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>2021-12-31 18:46:59</td>\n",
" <td>sbeams</td>\n",
" <td>1476988269187448832</td>\n",
" <td>@MKBHD https://t.co/1Fu5cKmPTn</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>2021-12-31 18:46:48</td>\n",
" <td>woworeoo</td>\n",
" <td>1476988222609793033</td>\n",
" <td>@MKBHD Yeah Yeah no more tweets from you</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>201</th>\n",
" <td>2021-12-31 18:46:46</td>\n",
" <td>disisjorj</td>\n",
" <td>1476988217576640517</td>\n",
" <td>@MKBHD @vassizzle</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>202</th>\n",
" <td>2021-12-31 18:46:46</td>\n",
" <td>TechWizYT</td>\n",
" <td>1476988215043276802</td>\n",
" <td>@MKBHD Great advice!</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>203</th>\n",
" <td>2021-12-31 18:46:41</td>\n",
" <td>0x_flea</td>\n",
" <td>1476988197146185730</td>\n",
" <td>@MKBHD ❤️🤘🏼</td>\n",
" <td>1476988122986647553</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>204 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" date username tweet \n",
"0 2023-05-25 19:52:44 dtweedys 1661822628393254913 \\\n",
"1 2023-05-07 14:30:30 BallSchlonger 1655218553858015233 \n",
"2 2023-05-03 21:13:33 jacknft_4 1653870434075410436 \n",
"3 2023-05-02 17:38:40 tanmayshah28 1653453969476456450 \n",
"4 2023-04-02 14:08:55 iStevenPlays 1642529545415712768 \n",
".. ... ... ... \n",
"199 2021-12-31 18:46:59 sbeams 1476988269187448832 \n",
"200 2021-12-31 18:46:48 woworeoo 1476988222609793033 \n",
"201 2021-12-31 18:46:46 disisjorj 1476988217576640517 \n",
"202 2021-12-31 18:46:46 TechWizYT 1476988215043276802 \n",
"203 2021-12-31 18:46:41 0x_flea 1476988197146185730 \n",
"\n",
" text root_tweet \n",
"0 @MKBHD 🤺🔸💜 #SubTweets ftw 🔸🌑🕊️ 1476988122986647553 \n",
"1 @jacknft_4 @MKBHD sorry i cant 1476988122986647553 \n",
"2 @MKBHD Great! Dm me please 1476988122986647553 \n",
"3 @MKBHD Thank you! Lets too! 1476988122986647553 \n",
"4 @MKBHD I got you.\\n\\n🔥__🔥 1476988122986647553 \n",
".. ... ... \n",
"199 @MKBHD https://t.co/1Fu5cKmPTn 1476988122986647553 \n",
"200 @MKBHD Yeah Yeah no more tweets from you 1476988122986647553 \n",
"201 @MKBHD @vassizzle 1476988122986647553 \n",
"202 @MKBHD Great advice! 1476988122986647553 \n",
"203 @MKBHD ❤️🤘🏼 1476988122986647553 \n",
"\n",
"[204 rows x 5 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show only the derived summary columns.\n",
"summary_cols = ['date', 'username', 'tweet', 'text', 'root_tweet']\n",
"df[summary_cols]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,779 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"> Note: the structure of GraphQL responses is not consistent, so these examples may not work in all cases."
],
"metadata": {
"collapsed": false
},
"id": "4739fa454bb20238"
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f65b5a54",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import time\n",
"import pandas as pd\n",
"\n",
"from twitter.scraper import Scraper\n",
"from twitter.util import find_key"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "1d7714a8",
"metadata": {},
"outputs": [],
"source": [
"# Placeholders: substitute your own account credentials (never commit real ones).\n",
"email = ...\n",
"username = ...\n",
"password = ...\n",
"scraper = Scraper(email, username, password)"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"user_id = 33836629  # id passed to both scraper calls below\n",
"\n",
"# Each call is given a one-element list of ids; .pop() takes the last element of what comes back.\n",
"tweets = scraper.tweets([user_id]).pop()\n",
"tweets_and_replies = scraper.tweets_and_replies([user_id]).pop()"
],
"metadata": {
"collapsed": false
},
"id": "d9c839bfb7d99004"
},
{
"cell_type": "markdown",
"id": "a1339a2b",
"metadata": {},
"source": [
"### Find all unique URLs in a user's tweets"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "f64a96d9",
"metadata": {},
"outputs": [],
"source": [
"# Collect every 'expanded_url' value found in `tweets` and de-duplicate them.\n",
"unique_urls = {*find_key(tweets, 'expanded_url')}\n",
"unique_urls"
]
},
{
"cell_type": "markdown",
"id": "c8184cd5",
"metadata": {},
"source": [
"### Get summary of user tweet data"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "9e87995c",
"metadata": {},
"outputs": [],
"source": [
"# Collect every 'legacy' dict found in the entries of both responses.\n",
"tweet_data = []\n",
"for d in tweets + tweets_and_replies:\n",
"    instructions = find_key(d, 'instructions').pop()\n",
"    entries = find_key(instructions, 'entries').pop()\n",
"    for entry in entries:\n",
"        tweet_data.extend(find_key(entry, 'legacy'))\n",
"\n",
"user_key = 'can_dm'  # filter using arbitrary key that only users have\n",
"\n",
"# Keep each record once, and only if it does not carry the user-only key.\n",
"# (A nested `for k in x if k != user_key` would repeat every record once per key\n",
"# and would not exclude user records at all.)\n",
"# Built as a list rather than a generator so cells that consume `expr` can be re-run.\n",
"expr = [x for x in tweet_data if user_key not in x]"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "224d5078",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id_str</th>\n",
" <th>id_str</th>\n",
" <th>created_at</th>\n",
" <th>favorite_count</th>\n",
" <th>full_text</th>\n",
" <th>quote_count</th>\n",
" <th>reply_count</th>\n",
" <th>retweet_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>33836629</td>\n",
" <td>1637213069301649408</td>\n",
" <td>2023-03-18 22:03:08+00:00</td>\n",
" <td>69.0</td>\n",
" <td>@theamazingdrj Yes the integration right into ...</td>\n",
" <td>1.0</td>\n",
" <td>6.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1181493805356158978</td>\n",
" <td>1637212448674684928</td>\n",
" <td>2023-03-18 22:00:40+00:00</td>\n",
" <td>9.0</td>\n",
" <td>@karpathy How does it compare to using chatGPT...</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>33836629</td>\n",
" <td>1637188599967027200</td>\n",
" <td>2023-03-18 20:25:54+00:00</td>\n",
" <td>13.0</td>\n",
" <td>@ErikSchluntz Very likely</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1374841081293021188</td>\n",
" <td>1637183652458283008</td>\n",
" <td>2023-03-18 20:06:14+00:00</td>\n",
" <td>6.0</td>\n",
" <td>@karpathy Do you think this will work well for...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>33836629</td>\n",
" <td>1637154111333494784</td>\n",
" <td>2023-03-18 18:08:51+00:00</td>\n",
" <td>5.0</td>\n",
" <td>@aliapanahi logprobs kwarg https://t.co/4Uuh4V...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219</th>\n",
" <td>33836629</td>\n",
" <td>1600031572442218497</td>\n",
" <td>2022-12-06 07:37:08+00:00</td>\n",
" <td>248.0</td>\n",
" <td>😂 stop Riley probably up there as someone who ...</td>\n",
" <td>2.0</td>\n",
" <td>8.0</td>\n",
" <td>12.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220</th>\n",
" <td>16535432</td>\n",
" <td>1600012570949058560</td>\n",
" <td>2022-12-06 06:21:38+00:00</td>\n",
" <td>1698.0</td>\n",
" <td>To get a sense of how hyped LLMs are right now...</td>\n",
" <td>18.0</td>\n",
" <td>47.0</td>\n",
" <td>96.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>221</th>\n",
" <td>33836629</td>\n",
" <td>1593417987687473152</td>\n",
" <td>2022-11-18 01:37:07+00:00</td>\n",
" <td>206.0</td>\n",
" <td>If previous neural nets are special-purpose co...</td>\n",
" <td>5.0</td>\n",
" <td>2.0</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>222</th>\n",
" <td>33836629</td>\n",
" <td>1528792715810394112</td>\n",
" <td>2022-05-23 17:39:21+00:00</td>\n",
" <td>3044.0</td>\n",
" <td>Something I've been doing for a few years that...</td>\n",
" <td>42.0</td>\n",
" <td>184.0</td>\n",
" <td>115.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>223</th>\n",
" <td>33836629</td>\n",
" <td>1528453604515778560</td>\n",
" <td>2022-05-22 19:11:51+00:00</td>\n",
" <td>914.0</td>\n",
" <td>real-world data distribution is ~N(0,1)\\ngood ...</td>\n",
" <td>11.0</td>\n",
" <td>47.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>224 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id_str id_str created_at \\\n",
"0 33836629 1637213069301649408 2023-03-18 22:03:08+00:00 \n",
"1 1181493805356158978 1637212448674684928 2023-03-18 22:00:40+00:00 \n",
"2 33836629 1637188599967027200 2023-03-18 20:25:54+00:00 \n",
"3 1374841081293021188 1637183652458283008 2023-03-18 20:06:14+00:00 \n",
"4 33836629 1637154111333494784 2023-03-18 18:08:51+00:00 \n",
".. ... ... ... \n",
"219 33836629 1600031572442218497 2022-12-06 07:37:08+00:00 \n",
"220 16535432 1600012570949058560 2022-12-06 06:21:38+00:00 \n",
"221 33836629 1593417987687473152 2022-11-18 01:37:07+00:00 \n",
"222 33836629 1528792715810394112 2022-05-23 17:39:21+00:00 \n",
"223 33836629 1528453604515778560 2022-05-22 19:11:51+00:00 \n",
"\n",
" favorite_count full_text \\\n",
"0 69.0 @theamazingdrj Yes the integration right into ... \n",
"1 9.0 @karpathy How does it compare to using chatGPT... \n",
"2 13.0 @ErikSchluntz Very likely \n",
"3 6.0 @karpathy Do you think this will work well for... \n",
"4 5.0 @aliapanahi logprobs kwarg https://t.co/4Uuh4V... \n",
".. ... ... \n",
"219 248.0 😂 stop Riley probably up there as someone who ... \n",
"220 1698.0 To get a sense of how hyped LLMs are right now... \n",
"221 206.0 If previous neural nets are special-purpose co... \n",
"222 3044.0 Something I've been doing for a few years that... \n",
"223 914.0 real-world data distribution is ~N(0,1)\\ngood ... \n",
"\n",
" quote_count reply_count retweet_count \n",
"0 1.0 6.0 4.0 \n",
"1 0.0 2.0 1.0 \n",
"2 0.0 1.0 1.0 \n",
"3 0.0 1.0 0.0 \n",
"4 0.0 1.0 1.0 \n",
".. ... ... ... \n",
"219 2.0 8.0 12.0 \n",
"220 18.0 47.0 96.0 \n",
"221 5.0 2.0 16.0 \n",
"222 42.0 184.0 115.0 \n",
"223 11.0 47.0 65.0 \n",
"\n",
"[224 rows x 8 columns]"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## simple subset of relevant tweet fields\n",
"cols = [\n",
"    'user_id_str',\n",
"    'id_str',\n",
"    'created_at',\n",
"    'favorite_count',\n",
"    'full_text',\n",
"    'quote_count',\n",
"    'reply_count',\n",
"    'retweet_count',\n",
"    # 'retweeted',\n",
"    # 'conversation_id_str',\n",
"    # 'favorited',\n",
"    # 'is_quote_status',\n",
"    # 'lang',\n",
"    # 'quoted_status_id_str',\n",
"]\n",
"\n",
"## columns converted with pd.to_numeric (unparseable values become NaN)\n",
"numeric = [\n",
"    'favorite_count',\n",
"    'quote_count',\n",
"    'reply_count',\n",
"    'retweet_count',\n",
"]\n",
"\n",
"tweets_raw = pd.DataFrame(expr)[cols]\n",
"\n",
"## parse dates and counts, drop rows without an id, drop duplicates, sort by date\n",
"df = (\n",
"    tweets_raw\n",
"    .assign(\n",
"        created_at=pd.to_datetime(tweets_raw['created_at'], format=\"%a %b %d %H:%M:%S %z %Y\"),\n",
"        **{c: pd.to_numeric(tweets_raw[c], errors='coerce') for c in numeric},\n",
"    )\n",
"    .dropna(subset='id_str')\n",
"    .drop_duplicates(subset='id_str')\n",
"    .sort_values('created_at', ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"## alternative output formats:\n",
"# df.to_feather(f'{time.time_ns()}.feather')\n",
"# df.to_parquet(f'{time.time_ns()}.parquet')\n",
"## writes a new, timestamp-named CSV file on every run\n",
"df.to_csv(f'{time.time_ns()}.csv', index=False)\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "9103413b",
"metadata": {},
"source": [
"### Search tweet text"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "401712a3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id_str</th>\n",
" <th>id_str</th>\n",
" <th>created_at</th>\n",
" <th>favorite_count</th>\n",
" <th>full_text</th>\n",
" <th>quote_count</th>\n",
" <th>reply_count</th>\n",
" <th>retweet_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>52667700</td>\n",
" <td>1637152715716583424</td>\n",
" <td>2023-03-18 18:03:18+00:00</td>\n",
" <td>99.0</td>\n",
" <td>@karpathy Sometimes I wish people could unders...</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>33836629</td>\n",
" <td>1637147822482165760</td>\n",
" <td>2023-03-18 17:43:52+00:00</td>\n",
" <td>325.0</td>\n",
" <td>If not careful, fine-tuning collapses entropy ...</td>\n",
" <td>5.0</td>\n",
" <td>9.0</td>\n",
" <td>21.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>788533935886077952</td>\n",
" <td>1636786608916819968</td>\n",
" <td>2023-03-17 17:48:32+00:00</td>\n",
" <td>411.0</td>\n",
" <td>I finally installed github copilot (better lat...</td>\n",
" <td>5.0</td>\n",
" <td>15.0</td>\n",
" <td>14.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>33836629</td>\n",
" <td>1636765735627395073</td>\n",
" <td>2023-03-17 16:25:35+00:00</td>\n",
" <td>22.0</td>\n",
" <td>@BlancheMinerva @JosephJacks_ I didnt work on...</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>33836629</td>\n",
" <td>1636459245184106497</td>\n",
" <td>2023-03-16 20:07:42+00:00</td>\n",
" <td>1254.0</td>\n",
" <td>Less publicized but highly awesome aspect of G...</td>\n",
" <td>10.0</td>\n",
" <td>38.0</td>\n",
" <td>132.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>33836629</td>\n",
" <td>1620875263700799488</td>\n",
" <td>2023-02-01 20:02:31+00:00</td>\n",
" <td>10.0</td>\n",
" <td>@portisto @trending_repos sad. The way they co...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>65629552</td>\n",
" <td>1620850430254223360</td>\n",
" <td>2023-02-01 18:23:51+00:00</td>\n",
" <td>7.0</td>\n",
" <td>@trending_repos @karpathy How can a main langu...</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>33836629</td>\n",
" <td>1620811724952866816</td>\n",
" <td>2023-02-01 15:50:03+00:00</td>\n",
" <td>245.0</td>\n",
" <td>@trending_repos wow</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>1162359127294861314</td>\n",
" <td>1620749130556669952</td>\n",
" <td>2023-02-01 11:41:19+00:00</td>\n",
" <td>2541.0</td>\n",
" <td>Trending repository of the month 🏆\\n \\nnanoGP...</td>\n",
" <td>9.0</td>\n",
" <td>19.0</td>\n",
" <td>320.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>33836629</td>\n",
" <td>1620187595979513857</td>\n",
" <td>2023-01-30 22:29:59+00:00</td>\n",
" <td>15.0</td>\n",
" <td>@hi_tysam It was very nice to read through top...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151</th>\n",
" <td>1615441883672502291</td>\n",
" <td>1620185408721256449</td>\n",
" <td>2023-01-30 22:21:17+00:00</td>\n",
" <td>15.0</td>\n",
" <td>@karpathy I'm honored and a bit stunned. Wow, ...</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>178</th>\n",
" <td>33836629</td>\n",
" <td>1613250487838707712</td>\n",
" <td>2023-01-11 19:04:23+00:00</td>\n",
" <td>2257.0</td>\n",
" <td>Didn't tweet nanoGPT yet (quietly getting it t...</td>\n",
" <td>24.0</td>\n",
" <td>39.0</td>\n",
" <td>303.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>186</th>\n",
" <td>33836629</td>\n",
" <td>1608895189078380544</td>\n",
" <td>2022-12-30 18:37:59+00:00</td>\n",
" <td>4356.0</td>\n",
" <td>Nice read on reverse engineering of GitHub Cop...</td>\n",
" <td>145.0</td>\n",
" <td>85.0</td>\n",
" <td>555.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190</th>\n",
" <td>33836629</td>\n",
" <td>1607791539258003457</td>\n",
" <td>2022-12-27 17:32:28+00:00</td>\n",
" <td>556.0</td>\n",
" <td>Context I realized I have to split up minGPT b...</td>\n",
" <td>2.0</td>\n",
" <td>23.0</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id_str id_str created_at \\\n",
"9 52667700 1637152715716583424 2023-03-18 18:03:18+00:00 \n",
"14 33836629 1637147822482165760 2023-03-18 17:43:52+00:00 \n",
"17 788533935886077952 1636786608916819968 2023-03-17 17:48:32+00:00 \n",
"18 33836629 1636765735627395073 2023-03-17 16:25:35+00:00 \n",
"20 33836629 1636459245184106497 2023-03-16 20:07:42+00:00 \n",
"144 33836629 1620875263700799488 2023-02-01 20:02:31+00:00 \n",
"145 65629552 1620850430254223360 2023-02-01 18:23:51+00:00 \n",
"146 33836629 1620811724952866816 2023-02-01 15:50:03+00:00 \n",
"147 1162359127294861314 1620749130556669952 2023-02-01 11:41:19+00:00 \n",
"150 33836629 1620187595979513857 2023-01-30 22:29:59+00:00 \n",
"151 1615441883672502291 1620185408721256449 2023-01-30 22:21:17+00:00 \n",
"178 33836629 1613250487838707712 2023-01-11 19:04:23+00:00 \n",
"186 33836629 1608895189078380544 2022-12-30 18:37:59+00:00 \n",
"190 33836629 1607791539258003457 2022-12-27 17:32:28+00:00 \n",
"\n",
" favorite_count full_text \\\n",
"9 99.0 @karpathy Sometimes I wish people could unders... \n",
"14 325.0 If not careful, fine-tuning collapses entropy ... \n",
"17 411.0 I finally installed github copilot (better lat... \n",
"18 22.0 @BlancheMinerva @JosephJacks_ I didnt work on... \n",
"20 1254.0 Less publicized but highly awesome aspect of G... \n",
"144 10.0 @portisto @trending_repos sad. The way they co... \n",
"145 7.0 @trending_repos @karpathy How can a main langu... \n",
"146 245.0 @trending_repos wow \n",
"147 2541.0 Trending repository of the month 🏆\\n \\nnanoGP... \n",
"150 15.0 @hi_tysam It was very nice to read through top... \n",
"151 15.0 @karpathy I'm honored and a bit stunned. Wow, ... \n",
"178 2257.0 Didn't tweet nanoGPT yet (quietly getting it t... \n",
"186 4356.0 Nice read on reverse engineering of GitHub Cop... \n",
"190 556.0 Context I realized I have to split up minGPT b... \n",
"\n",
" quote_count reply_count retweet_count \n",
"9 2.0 1.0 5.0 \n",
"14 5.0 9.0 21.0 \n",
"17 5.0 15.0 14.0 \n",
"18 0.0 4.0 1.0 \n",
"20 10.0 38.0 132.0 \n",
"144 0.0 1.0 2.0 \n",
"145 0.0 4.0 0.0 \n",
"146 0.0 6.0 4.0 \n",
"147 9.0 19.0 320.0 \n",
"150 0.0 1.0 2.0 \n",
"151 0.0 3.0 0.0 \n",
"178 24.0 39.0 303.0 \n",
"186 145.0 85.0 555.0 \n",
"190 2.0 23.0 16.0 "
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Case-insensitive regex match on the tweet text.\n",
"matches_repo = df['full_text'].str.contains('repos?i?|github', regex=True, flags=re.I)\n",
"df[matches_repo]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "96ebc3fd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id_str</th>\n",
" <th>id_str</th>\n",
" <th>created_at</th>\n",
" <th>favorite_count</th>\n",
" <th>full_text</th>\n",
" <th>quote_count</th>\n",
" <th>reply_count</th>\n",
" <th>retweet_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>1615441883672502291</td>\n",
" <td>1632577588529954819</td>\n",
" <td>2023-03-06 03:03:23+00:00</td>\n",
" <td>91.0</td>\n",
" <td>Speed up your LLM research exploration with a ...</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>14.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>143</th>\n",
" <td>33836629</td>\n",
" <td>1621578354024677377</td>\n",
" <td>2023-02-03 18:36:21+00:00</td>\n",
" <td>5276.0</td>\n",
" <td>The most dramatic optimization to nanoGPT so f...</td>\n",
" <td>57.0</td>\n",
" <td>89.0</td>\n",
" <td>353.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>1162359127294861314</td>\n",
" <td>1620749130556669952</td>\n",
" <td>2023-02-01 11:41:19+00:00</td>\n",
" <td>2541.0</td>\n",
" <td>Trending repository of the month 🏆\\n \\nnanoGP...</td>\n",
" <td>9.0</td>\n",
" <td>19.0</td>\n",
" <td>320.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>172</th>\n",
" <td>33836629</td>\n",
" <td>1615398117683388417</td>\n",
" <td>2023-01-17 17:18:18+00:00</td>\n",
" <td>21166.0</td>\n",
" <td>🔥 New (1h56m) video lecture: \"Let's build GPT:...</td>\n",
" <td>331.0</td>\n",
" <td>546.0</td>\n",
" <td>3321.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>178</th>\n",
" <td>33836629</td>\n",
" <td>1613250487838707712</td>\n",
" <td>2023-01-11 19:04:23+00:00</td>\n",
" <td>2257.0</td>\n",
" <td>Didn't tweet nanoGPT yet (quietly getting it t...</td>\n",
" <td>24.0</td>\n",
" <td>39.0</td>\n",
" <td>303.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id_str id_str created_at \\\n",
"58 1615441883672502291 1632577588529954819 2023-03-06 03:03:23+00:00 \n",
"143 33836629 1621578354024677377 2023-02-03 18:36:21+00:00 \n",
"147 1162359127294861314 1620749130556669952 2023-02-01 11:41:19+00:00 \n",
"172 33836629 1615398117683388417 2023-01-17 17:18:18+00:00 \n",
"178 33836629 1613250487838707712 2023-01-11 19:04:23+00:00 \n",
"\n",
" favorite_count full_text \\\n",
"58 91.0 Speed up your LLM research exploration with a ... \n",
"143 5276.0 The most dramatic optimization to nanoGPT so f... \n",
"147 2541.0 Trending repository of the month 🏆\\n \\nnanoGP... \n",
"172 21166.0 🔥 New (1h56m) video lecture: \"Let's build GPT:... \n",
"178 2257.0 Didn't tweet nanoGPT yet (quietly getting it t... \n",
"\n",
" quote_count reply_count retweet_count \n",
"58 2.0 3.0 14.0 \n",
"143 57.0 89.0 353.0 \n",
"147 9.0 19.0 320.0 \n",
"172 331.0 546.0 3321.0 \n",
"178 24.0 39.0 303.0 "
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flags = re.I  # referenced as @flags inside the query expression\n",
"\n",
"# Further .query(...) calls can be chained onto the result to narrow it down.\n",
"df.query('full_text.str.contains(\"nanogpt\", regex=True, flags=@flags)', engine='python')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}