Files
twitter-api-client/examples/chat_log.ipynb
2024-01-12 17:00:01 -08:00

402 lines
13 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"source": [
 Note: structure">
"> Note: the structure of the GraphQL response is not consistent, so these examples may not work in all cases."
],
"metadata": {
"collapsed": false
},
"id": "fce8131509380867"
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6c99787b",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"from twitter.scraper import Scraper\n",
"from twitter.util import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd3b7a57",
"metadata": {},
"outputs": [],
"source": [
"scraper = Scraper(session=init_session())"
]
},
{
"cell_type": "markdown",
"id": "17a91f72",
"metadata": {},
"source": [
"### get chat log"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "597b3a0f",
"metadata": {},
"outputs": [],
"source": [
"room_id = '1eaJbrAPnBVJX'\n",
"spaces = scraper.spaces(rooms=[room_id], audio=0, chat=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "faaa76b1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>room</th>\n",
" <th>timestamp</th>\n",
" <th>twitter_id</th>\n",
" <th>username</th>\n",
" <th>body</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:32:19</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:32:55</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:33:01</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:33:13</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>Hi Alan, are you there?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:33:17</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>I am.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:00</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>Yeah, it's a great platform.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>252</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:03</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Alright, cool. Sounds good. Thank you.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>253</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:03</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>OK. Thank you.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:04</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Right.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 22:00:05</td>\n",
" <td>1106321031566893057</td>\n",
" <td>jimfarley98</td>\n",
" <td>Bye.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>256 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" room timestamp twitter_id username \n",
"0 1eaJbrAPnBVJX 2023-05-25 21:32:19 1106321031566893057 jimfarley98 \\\n",
"1 1eaJbrAPnBVJX 2023-05-25 21:32:55 1106321031566893057 jimfarley98 \n",
"2 1eaJbrAPnBVJX 2023-05-25 21:33:01 1106321031566893057 jimfarley98 \n",
"3 1eaJbrAPnBVJX 2023-05-25 21:33:13 1106321031566893057 jimfarley98 \n",
"4 1eaJbrAPnBVJX 2023-05-25 21:33:17 44196397 elonmusk \n",
".. ... ... ... ... \n",
"251 1eaJbrAPnBVJX 2023-05-25 22:00:00 1106321031566893057 jimfarley98 \n",
"252 1eaJbrAPnBVJX 2023-05-25 22:00:03 44196397 elonmusk \n",
"253 1eaJbrAPnBVJX 2023-05-25 22:00:03 1106321031566893057 jimfarley98 \n",
"254 1eaJbrAPnBVJX 2023-05-25 22:00:04 44196397 elonmusk \n",
"255 1eaJbrAPnBVJX 2023-05-25 22:00:05 1106321031566893057 jimfarley98 \n",
"\n",
" body \n",
"0 \n",
"1 \n",
"2 \n",
"3 Hi Alan, are you there? \n",
"4 I am. \n",
".. ... \n",
"251 Yeah, it's a great platform. \n",
"252 Alright, cool. Sounds good. Thank you. \n",
"253 OK. Thank you. \n",
"254 Right. \n",
"255 Bye. \n",
"\n",
"[256 rows x 5 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chat = pd.json_normalize(spaces[0]['chat'])\n",
"chat = chat[chat['payload.body.final'] == True]\n",
"dates = ['payload.body.timestamp']\n",
"chat[dates] = chat[dates].apply(pd.to_datetime, unit='ms').apply(lambda x: x.dt.strftime(\"%Y-%m-%d %H:%M:%S %z\"))\n",
"chat = chat.sort_values('payload.body.timestamp').reset_index(drop=True)\n",
"chat = chat[[\n",
" 'payload.room',\n",
" 'payload.body.timestamp',\n",
" 'payload.sender.twitter_id',\n",
" 'payload.body.username',\n",
" 'payload.body.body',\n",
"]]\n",
"chat.columns = chat.columns.str.replace('(payload|body|sender).','',regex=True).str.replace('.','_')\n",
"# chat.to_csv(f'{room_id}.csv',index=False)\n",
"chat"
]
},
{
"cell_type": "markdown",
"id": "118919dd",
"metadata": {},
"source": [
"### query chat"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3010c52a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>room</th>\n",
" <th>timestamp</th>\n",
" <th>twitter_id</th>\n",
" <th>username</th>\n",
" <th>body</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:35:05</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Yeah, well, well, it's certainly super excitin...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:36:00</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>API access so like you know like a Ford vehicl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:36:23</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>And we're very, very much appreciative of of F...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:38:29</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>But but I think it is the the the teams have d...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>191</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:53:52</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Yeah, I agree with that. We should probably no...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>211</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:56:13</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>So, you know, it is certainly the Tesla intent...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>248</th>\n",
" <td>1eaJbrAPnBVJX</td>\n",
" <td>2023-05-25 21:59:45</td>\n",
" <td>44196397</td>\n",
" <td>elonmusk</td>\n",
" <td>Likewise there, it's an honor to be working wi...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" room timestamp twitter_id username \n",
"22 1eaJbrAPnBVJX 2023-05-25 21:35:05 44196397 elonmusk \\\n",
"30 1eaJbrAPnBVJX 2023-05-25 21:36:00 44196397 elonmusk \n",
"33 1eaJbrAPnBVJX 2023-05-25 21:36:23 44196397 elonmusk \n",
"47 1eaJbrAPnBVJX 2023-05-25 21:38:29 44196397 elonmusk \n",
"191 1eaJbrAPnBVJX 2023-05-25 21:53:52 44196397 elonmusk \n",
"211 1eaJbrAPnBVJX 2023-05-25 21:56:13 44196397 elonmusk \n",
"248 1eaJbrAPnBVJX 2023-05-25 21:59:45 44196397 elonmusk \n",
"\n",
" body \n",
"22 Yeah, well, well, it's certainly super excitin... \n",
"30 API access so like you know like a Ford vehicl... \n",
"33 And we're very, very much appreciative of of F... \n",
"47 But but I think it is the the the teams have d... \n",
"191 Yeah, I agree with that. We should probably no... \n",
"211 So, you know, it is certainly the Tesla intent... \n",
"248 Likewise there, it's an honor to be working wi... "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flags = re.I\n",
"(chat\n",
" .query('body.str.contains(\"\\sford\",regex=True,flags=@flags)')\n",
" .query('username.str.contains(\"elonmusk\",regex=True,flags=@flags)')\n",
")\n",
"\n",
"## alternatively\n",
"# chat[\n",
"# chat.body.str.contains('\\sford',regex=True,flags=re.I)\n",
"# &\n",
"# chat.username.str.contains('elonmusk',regex=True,flags=re.I)\n",
"# ]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}