392 lines
13 KiB
Plaintext
392 lines
13 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "6c99787b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import re\n",
|
||
"import pandas as pd\n",
|
||
"from twitter.scraper import Scraper\n",
|
||
"from twitter.util import *"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "dd3b7a57",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Create the session first, then hand it to the scraper.\n",
"# NOTE(review): init_session() comes from the wildcard `twitter.util` import;\n",
"# presumably it opens an unauthenticated/guest session -- confirm.\n",
"session = init_session()\n",
"scraper = Scraper(session=session)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "17a91f72",
|
||
"metadata": {},
|
||
"source": [
|
||
"### get chat log"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "597b3a0f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Identifier of the room passed to the scraper below.\n",
"room_id = '1eaJbrAPnBVJX'\n",
"\n",
"# NOTE(review): audio=0 / chat=1 presumably switch audio capture off and chat\n",
"# capture on -- confirm against the scraper's documentation.\n",
"spaces = scraper.spaces(\n",
"    rooms=[room_id],\n",
"    audio=0,\n",
"    chat=1,\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "faaa76b1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>room</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>twitter_id</th>\n",
|
||
" <th>username</th>\n",
|
||
" <th>body</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:32:19</td>\n",
|
||
" <td>1106321031566893057</td>\n",
|
||
" <td>jimfarley98</td>\n",
|
||
" <td></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:32:55</td>\n",
|
||
" <td>1106321031566893057</td>\n",
|
||
" <td>jimfarley98</td>\n",
|
||
" <td></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:33:01</td>\n",
|
||
" <td>1106321031566893057</td>\n",
|
||
" <td>jimfarley98</td>\n",
|
||
" <td></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:33:13</td>\n",
|
||
" <td>1106321031566893057</td>\n",
|
||
" <td>jimfarley98</td>\n",
|
||
" <td>Hi Alan, are you there?</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:33:17</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>I am.</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>251</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 22:00:00</td>\n",
|
||
" <td>1106321031566893057</td>\n",
|
||
" <td>jimfarley98</td>\n",
|
||
" <td>Yeah, it's a great platform.</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>252</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 22:00:03</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>Alright, cool. Sounds good. Thank you.</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>253</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 22:00:03</td>\n",
|
||
" <td>1106321031566893057</td>\n",
|
||
" <td>jimfarley98</td>\n",
|
||
" <td>OK. Thank you.</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>254</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 22:00:04</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>Right.</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>255</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 22:00:05</td>\n",
|
||
" <td>1106321031566893057</td>\n",
|
||
" <td>jimfarley98</td>\n",
|
||
" <td>Bye.</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>256 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" room timestamp twitter_id username \n",
|
||
"0 1eaJbrAPnBVJX 2023-05-25 21:32:19 1106321031566893057 jimfarley98 \\\n",
|
||
"1 1eaJbrAPnBVJX 2023-05-25 21:32:55 1106321031566893057 jimfarley98 \n",
|
||
"2 1eaJbrAPnBVJX 2023-05-25 21:33:01 1106321031566893057 jimfarley98 \n",
|
||
"3 1eaJbrAPnBVJX 2023-05-25 21:33:13 1106321031566893057 jimfarley98 \n",
|
||
"4 1eaJbrAPnBVJX 2023-05-25 21:33:17 44196397 elonmusk \n",
|
||
".. ... ... ... ... \n",
|
||
"251 1eaJbrAPnBVJX 2023-05-25 22:00:00 1106321031566893057 jimfarley98 \n",
|
||
"252 1eaJbrAPnBVJX 2023-05-25 22:00:03 44196397 elonmusk \n",
|
||
"253 1eaJbrAPnBVJX 2023-05-25 22:00:03 1106321031566893057 jimfarley98 \n",
|
||
"254 1eaJbrAPnBVJX 2023-05-25 22:00:04 44196397 elonmusk \n",
|
||
"255 1eaJbrAPnBVJX 2023-05-25 22:00:05 1106321031566893057 jimfarley98 \n",
|
||
"\n",
|
||
" body \n",
|
||
"0 \n",
|
||
"1 \n",
|
||
"2 \n",
|
||
"3 Hi Alan, are you there? \n",
|
||
"4 I am. \n",
|
||
".. ... \n",
|
||
"251 Yeah, it's a great platform. \n",
|
||
"252 Alright, cool. Sounds good. Thank you. \n",
|
||
"253 OK. Thank you. \n",
|
||
"254 Right. \n",
|
||
"255 Bye. \n",
|
||
"\n",
|
||
"[256 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
"# Flatten the nested chat payload of the first returned space into columns.\n",
"chat_raw = pd.json_normalize(spaces[0]['chat'])\n",
"ts_col = 'payload.body.timestamp'\n",
"\n",
"# Keep only rows whose 'final' flag is True. The .copy() makes the column\n",
"# assignments below write to an independent frame instead of a filtered\n",
"# slice (avoids SettingWithCopyWarning).\n",
"chat = chat_raw.loc[chat_raw['payload.body.final'] == True].copy()\n",
"\n",
"# Parse the millisecond epoch values as UTC so that '%z' renders an offset;\n",
"# on naive datetimes '%z' is empty and leaves a trailing space in every value.\n",
"chat[ts_col] = pd.to_datetime(chat[ts_col], unit='ms', utc=True)\n",
"\n",
"# Sort on the parsed datetimes (full precision) with a stable sort *before*\n",
"# formatting, so rows that fall in the same second keep a deterministic order.\n",
"chat = chat.sort_values(ts_col, kind='stable').reset_index(drop=True)\n",
"chat[ts_col] = chat[ts_col].dt.strftime('%Y-%m-%d %H:%M:%S %z')\n",
"\n",
"chat = chat[[\n",
"    'payload.room',\n",
"    'payload.body.timestamp',\n",
"    'payload.sender.twitter_id',\n",
"    'payload.body.username',\n",
"    'payload.body.body',\n",
"]]\n",
"\n",
"# Strip the 'payload.' / 'body.' / 'sender.' prefixes. The dot is escaped so\n",
"# it matches a literal '.', and the second replace is an explicit literal\n",
"# replacement because the default of `regex` differs between pandas versions.\n",
"chat.columns = (\n",
"    chat.columns\n",
"    .str.replace(r'(payload|body|sender)\\.', '', regex=True)\n",
"    .str.replace('.', '_', regex=False)\n",
")\n",
"# chat.to_csv(f'{room_id}.csv',index=False)\n",
"chat"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "118919dd",
|
||
"metadata": {},
|
||
"source": [
|
||
"### query chat"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "3010c52a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>room</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>twitter_id</th>\n",
|
||
" <th>username</th>\n",
|
||
" <th>body</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:35:05</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>Yeah, well, well, it's certainly super excitin...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>30</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:36:00</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>API access so like you know like a Ford vehicl...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>33</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:36:23</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>And we're very, very much appreciative of of F...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>47</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:38:29</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>But but I think it is the the the teams have d...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>191</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:53:52</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>Yeah, I agree with that. We should probably no...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>211</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:56:13</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>So, you know, it is certainly the Tesla intent...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>248</th>\n",
|
||
" <td>1eaJbrAPnBVJX</td>\n",
|
||
" <td>2023-05-25 21:59:45</td>\n",
|
||
" <td>44196397</td>\n",
|
||
" <td>elonmusk</td>\n",
|
||
" <td>Likewise there, it's an honor to be working wi...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" room timestamp twitter_id username \n",
|
||
"22 1eaJbrAPnBVJX 2023-05-25 21:35:05 44196397 elonmusk \\\n",
|
||
"30 1eaJbrAPnBVJX 2023-05-25 21:36:00 44196397 elonmusk \n",
|
||
"33 1eaJbrAPnBVJX 2023-05-25 21:36:23 44196397 elonmusk \n",
|
||
"47 1eaJbrAPnBVJX 2023-05-25 21:38:29 44196397 elonmusk \n",
|
||
"191 1eaJbrAPnBVJX 2023-05-25 21:53:52 44196397 elonmusk \n",
|
||
"211 1eaJbrAPnBVJX 2023-05-25 21:56:13 44196397 elonmusk \n",
|
||
"248 1eaJbrAPnBVJX 2023-05-25 21:59:45 44196397 elonmusk \n",
|
||
"\n",
|
||
" body \n",
|
||
"22 Yeah, well, well, it's certainly super excitin... \n",
|
||
"30 API access so like you know like a Ford vehicl... \n",
|
||
"33 And we're very, very much appreciative of of F... \n",
|
||
"47 But but I think it is the the the teams have d... \n",
|
||
"191 Yeah, I agree with that. We should probably no... \n",
|
||
"211 So, you know, it is certainly the Tesla intent... \n",
|
||
"248 Likewise there, it's an honor to be working wi... "
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
"# Case-insensitive search: rows whose body contains whitespace followed by\n",
"# 'ford' and whose username contains 'elonmusk'.\n",
"flags = re.I\n",
"# Raw string referenced via @pattern: an inline \"\\sford\" is an invalid escape\n",
"# sequence both in this cell and when pandas re-parses the query expression.\n",
"pattern = r'\\sford'\n",
"(chat\n",
" # na=False treats missing values as non-matches so the mask stays boolean\n",
" .query('body.str.contains(@pattern, regex=True, flags=@flags, na=False)')\n",
" .query('username.str.contains(\"elonmusk\", regex=True, flags=@flags, na=False)')\n",
")\n",
"\n",
"## alternatively\n",
"# chat[\n",
"#     chat.body.str.contains(r'\\sford', regex=True, flags=re.I, na=False)\n",
"#     &\n",
"#     chat.username.str.contains('elonmusk', regex=True, flags=re.I, na=False)\n",
"# ]"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|