twitter-api-client/examples/tweets.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "> Note: structure of GraphQL response is not consistent, these examples may not work in all cases."
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "4739fa454bb20238"
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f65b5a54",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import time\n",
    "import pandas as pd\n",
    "\n",
    "from twitter.scraper import Scraper\n",
    "from twitter.util import find_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "1d7714a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "email, username, password = ..., ..., ...\n",
    "scraper = Scraper(email, username, password)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "tweets = scraper.tweets([33836629]).pop()\n",
    "tweets_and_replies = scraper.tweets_and_replies([33836629]).pop()"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "d9c839bfb7d99004"
  },
  {
   "cell_type": "markdown",
   "id": "a1339a2b",
   "metadata": {},
   "source": [
    "### Find all unique urls in users tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "f64a96d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_urls = set(find_key(tweets, 'expanded_url'))\n",
    "unique_urls"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8184cd5",
   "metadata": {},
   "source": [
    "### Get summary of user tweet data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "9e87995c",
   "metadata": {},
   "outputs": [],
   "source": [
    "tweet_data = []\n",
    "for d in tweets + tweets_and_replies:\n",
    "    instructions = find_key(d, 'instructions').pop()\n",
    "    entries = find_key(instructions, 'entries').pop()\n",
    "    for entry in entries:\n",
    "        legacy = find_key(entry, 'legacy')\n",
    "        tweet_data.extend(legacy)\n",
    "\n",
    "user_key = 'can_dm'  # filter using arbitrary key that only users have\n",
    "expr = (x for x in tweet_data for k in x if k != user_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "224d5078",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id_str</th>\n",
       "      <th>id_str</th>\n",
       "      <th>created_at</th>\n",
       "      <th>favorite_count</th>\n",
       "      <th>full_text</th>\n",
       "      <th>quote_count</th>\n",
       "      <th>reply_count</th>\n",
       "      <th>retweet_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1637213069301649408</td>\n",
       "      <td>2023-03-18 22:03:08+00:00</td>\n",
       "      <td>69.0</td>\n",
       "      <td>@theamazingdrj Yes the integration right into ...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1181493805356158978</td>\n",
       "      <td>1637212448674684928</td>\n",
       "      <td>2023-03-18 22:00:40+00:00</td>\n",
       "      <td>9.0</td>\n",
       "      <td>@karpathy How does it compare to using chatGPT...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1637188599967027200</td>\n",
       "      <td>2023-03-18 20:25:54+00:00</td>\n",
       "      <td>13.0</td>\n",
       "      <td>@ErikSchluntz Very likely</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1374841081293021188</td>\n",
       "      <td>1637183652458283008</td>\n",
       "      <td>2023-03-18 20:06:14+00:00</td>\n",
       "      <td>6.0</td>\n",
       "      <td>@karpathy Do you think this will work well for...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1637154111333494784</td>\n",
       "      <td>2023-03-18 18:08:51+00:00</td>\n",
       "      <td>5.0</td>\n",
       "      <td>@aliapanahi logprobs kwarg https://t.co/4Uuh4V...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1600031572442218497</td>\n",
       "      <td>2022-12-06 07:37:08+00:00</td>\n",
       "      <td>248.0</td>\n",
       "      <td>😂 stop Riley probably up there as someone who ...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>220</th>\n",
       "      <td>16535432</td>\n",
       "      <td>1600012570949058560</td>\n",
       "      <td>2022-12-06 06:21:38+00:00</td>\n",
       "      <td>1698.0</td>\n",
       "      <td>To get a sense of how hyped LLMs are right now...</td>\n",
       "      <td>18.0</td>\n",
       "      <td>47.0</td>\n",
       "      <td>96.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>221</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1593417987687473152</td>\n",
       "      <td>2022-11-18 01:37:07+00:00</td>\n",
       "      <td>206.0</td>\n",
       "      <td>If previous neural nets are special-purpose co...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>222</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1528792715810394112</td>\n",
       "      <td>2022-05-23 17:39:21+00:00</td>\n",
       "      <td>3044.0</td>\n",
       "      <td>Something I've been doing for a few years that...</td>\n",
       "      <td>42.0</td>\n",
       "      <td>184.0</td>\n",
       "      <td>115.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>223</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1528453604515778560</td>\n",
       "      <td>2022-05-22 19:11:51+00:00</td>\n",
       "      <td>914.0</td>\n",
       "      <td>real-world data distribution is ~N(0,1)\\ngood ...</td>\n",
       "      <td>11.0</td>\n",
       "      <td>47.0</td>\n",
       "      <td>65.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>224 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             user_id_str               id_str                created_at  \\\n",
       "0               33836629  1637213069301649408 2023-03-18 22:03:08+00:00   \n",
       "1    1181493805356158978  1637212448674684928 2023-03-18 22:00:40+00:00   \n",
       "2               33836629  1637188599967027200 2023-03-18 20:25:54+00:00   \n",
       "3    1374841081293021188  1637183652458283008 2023-03-18 20:06:14+00:00   \n",
       "4               33836629  1637154111333494784 2023-03-18 18:08:51+00:00   \n",
       "..                   ...                  ...                       ...   \n",
       "219             33836629  1600031572442218497 2022-12-06 07:37:08+00:00   \n",
       "220             16535432  1600012570949058560 2022-12-06 06:21:38+00:00   \n",
       "221             33836629  1593417987687473152 2022-11-18 01:37:07+00:00   \n",
       "222             33836629  1528792715810394112 2022-05-23 17:39:21+00:00   \n",
       "223             33836629  1528453604515778560 2022-05-22 19:11:51+00:00   \n",
       "\n",
       "     favorite_count                                          full_text  \\\n",
       "0              69.0  @theamazingdrj Yes the integration right into ...   \n",
       "1               9.0  @karpathy How does it compare to using chatGPT...   \n",
       "2              13.0                          @ErikSchluntz Very likely   \n",
       "3               6.0  @karpathy Do you think this will work well for...   \n",
       "4               5.0  @aliapanahi logprobs kwarg https://t.co/4Uuh4V...   \n",
       "..              ...                                                ...   \n",
       "219           248.0  😂 stop Riley probably up there as someone who ...   \n",
       "220          1698.0  To get a sense of how hyped LLMs are right now...   \n",
       "221           206.0  If previous neural nets are special-purpose co...   \n",
       "222          3044.0  Something I've been doing for a few years that...   \n",
       "223           914.0  real-world data distribution is ~N(0,1)\\ngood ...   \n",
       "\n",
       "     quote_count  reply_count  retweet_count  \n",
       "0            1.0          6.0            4.0  \n",
       "1            0.0          2.0            1.0  \n",
       "2            0.0          1.0            1.0  \n",
       "3            0.0          1.0            0.0  \n",
       "4            0.0          1.0            1.0  \n",
       "..           ...          ...            ...  \n",
       "219          2.0          8.0           12.0  \n",
       "220         18.0         47.0           96.0  \n",
       "221          5.0          2.0           16.0  \n",
       "222         42.0        184.0          115.0  \n",
       "223         11.0         47.0           65.0  \n",
       "\n",
       "[224 rows x 8 columns]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## simple subset of relevant tweet fields        \n",
    "cols = [\n",
    "    'user_id_str',\n",
    "    'id_str',\n",
    "    'created_at',\n",
    "    'favorite_count',\n",
    "    'full_text',\n",
    "    'quote_count',\n",
    "    'reply_count',\n",
    "    'retweet_count',\n",
    "    #  'retweeted',\n",
    "    #  'conversation_id_str',\n",
    "    #  'favorited',\n",
    "    #  'is_quote_status',\n",
    "    #  'lang',\n",
    "    #  'quoted_status_id_str',\n",
    "]\n",
    "\n",
    "df = pd.DataFrame(expr)[cols]\n",
    "\n",
    "df['created_at'] = pd.to_datetime(df['created_at'], format=\"%a %b %d %H:%M:%S %z %Y\")\n",
    "\n",
    "numeric = [\n",
    "    'favorite_count',\n",
    "    'quote_count',\n",
    "    'reply_count',\n",
    "    'retweet_count',\n",
    "]\n",
    "\n",
    "df[numeric] = df[numeric].apply(pd.to_numeric, errors='coerce')\n",
    "\n",
    "## drop duplicates, sort by date\n",
    "df = (df\n",
    "      .dropna(subset='id_str')\n",
    "      .drop_duplicates(subset='id_str')\n",
    "      .sort_values('created_at', ascending=False)\n",
    "      .reset_index(drop=True)\n",
    "      )\n",
    "\n",
    "# df.to_feather(f'{time.time_ns()}.feather')\n",
    "# df.to_parquet(f'{time.time_ns()}.parquet')\n",
    "df.to_csv(f'{time.time_ns()}.csv', index=False)\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9103413b",
   "metadata": {},
   "source": [
    "### search tweet text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "401712a3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id_str</th>\n",
       "      <th>id_str</th>\n",
       "      <th>created_at</th>\n",
       "      <th>favorite_count</th>\n",
       "      <th>full_text</th>\n",
       "      <th>quote_count</th>\n",
       "      <th>reply_count</th>\n",
       "      <th>retweet_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>52667700</td>\n",
       "      <td>1637152715716583424</td>\n",
       "      <td>2023-03-18 18:03:18+00:00</td>\n",
       "      <td>99.0</td>\n",
       "      <td>@karpathy Sometimes I wish people could unders...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1637147822482165760</td>\n",
       "      <td>2023-03-18 17:43:52+00:00</td>\n",
       "      <td>325.0</td>\n",
       "      <td>If not careful, fine-tuning collapses entropy ...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>21.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>788533935886077952</td>\n",
       "      <td>1636786608916819968</td>\n",
       "      <td>2023-03-17 17:48:32+00:00</td>\n",
       "      <td>411.0</td>\n",
       "      <td>I finally installed github copilot (better lat...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1636765735627395073</td>\n",
       "      <td>2023-03-17 16:25:35+00:00</td>\n",
       "      <td>22.0</td>\n",
       "      <td>@BlancheMinerva @JosephJacks_ I didn’t work on...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1636459245184106497</td>\n",
       "      <td>2023-03-16 20:07:42+00:00</td>\n",
       "      <td>1254.0</td>\n",
       "      <td>Less publicized but highly awesome aspect of G...</td>\n",
       "      <td>10.0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>132.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1620875263700799488</td>\n",
       "      <td>2023-02-01 20:02:31+00:00</td>\n",
       "      <td>10.0</td>\n",
       "      <td>@portisto @trending_repos sad. The way they co...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>65629552</td>\n",
       "      <td>1620850430254223360</td>\n",
       "      <td>2023-02-01 18:23:51+00:00</td>\n",
       "      <td>7.0</td>\n",
       "      <td>@trending_repos @karpathy How can a main langu...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1620811724952866816</td>\n",
       "      <td>2023-02-01 15:50:03+00:00</td>\n",
       "      <td>245.0</td>\n",
       "      <td>@trending_repos wow</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>1162359127294861314</td>\n",
       "      <td>1620749130556669952</td>\n",
       "      <td>2023-02-01 11:41:19+00:00</td>\n",
       "      <td>2541.0</td>\n",
       "      <td>Trending repository of the month 🏆\\n  \\nnanoGP...</td>\n",
       "      <td>9.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>320.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1620187595979513857</td>\n",
       "      <td>2023-01-30 22:29:59+00:00</td>\n",
       "      <td>15.0</td>\n",
       "      <td>@hi_tysam It was very nice to read through top...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>1615441883672502291</td>\n",
       "      <td>1620185408721256449</td>\n",
       "      <td>2023-01-30 22:21:17+00:00</td>\n",
       "      <td>15.0</td>\n",
       "      <td>@karpathy I'm honored and a bit stunned. Wow, ...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>178</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1613250487838707712</td>\n",
       "      <td>2023-01-11 19:04:23+00:00</td>\n",
       "      <td>2257.0</td>\n",
       "      <td>Didn't tweet nanoGPT yet (quietly getting it t...</td>\n",
       "      <td>24.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>303.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>186</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1608895189078380544</td>\n",
       "      <td>2022-12-30 18:37:59+00:00</td>\n",
       "      <td>4356.0</td>\n",
       "      <td>Nice read on reverse engineering of GitHub Cop...</td>\n",
       "      <td>145.0</td>\n",
       "      <td>85.0</td>\n",
       "      <td>555.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>190</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1607791539258003457</td>\n",
       "      <td>2022-12-27 17:32:28+00:00</td>\n",
       "      <td>556.0</td>\n",
       "      <td>Context I realized I have to split up minGPT b...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             user_id_str               id_str                created_at  \\\n",
       "9               52667700  1637152715716583424 2023-03-18 18:03:18+00:00   \n",
       "14              33836629  1637147822482165760 2023-03-18 17:43:52+00:00   \n",
       "17    788533935886077952  1636786608916819968 2023-03-17 17:48:32+00:00   \n",
       "18              33836629  1636765735627395073 2023-03-17 16:25:35+00:00   \n",
       "20              33836629  1636459245184106497 2023-03-16 20:07:42+00:00   \n",
       "144             33836629  1620875263700799488 2023-02-01 20:02:31+00:00   \n",
       "145             65629552  1620850430254223360 2023-02-01 18:23:51+00:00   \n",
       "146             33836629  1620811724952866816 2023-02-01 15:50:03+00:00   \n",
       "147  1162359127294861314  1620749130556669952 2023-02-01 11:41:19+00:00   \n",
       "150             33836629  1620187595979513857 2023-01-30 22:29:59+00:00   \n",
       "151  1615441883672502291  1620185408721256449 2023-01-30 22:21:17+00:00   \n",
       "178             33836629  1613250487838707712 2023-01-11 19:04:23+00:00   \n",
       "186             33836629  1608895189078380544 2022-12-30 18:37:59+00:00   \n",
       "190             33836629  1607791539258003457 2022-12-27 17:32:28+00:00   \n",
       "\n",
       "     favorite_count                                          full_text  \\\n",
       "9              99.0  @karpathy Sometimes I wish people could unders...   \n",
       "14            325.0  If not careful, fine-tuning collapses entropy ...   \n",
       "17            411.0  I finally installed github copilot (better lat...   \n",
       "18             22.0  @BlancheMinerva @JosephJacks_ I didn’t work on...   \n",
       "20           1254.0  Less publicized but highly awesome aspect of G...   \n",
       "144            10.0  @portisto @trending_repos sad. The way they co...   \n",
       "145             7.0  @trending_repos @karpathy How can a main langu...   \n",
       "146           245.0                                @trending_repos wow   \n",
       "147          2541.0  Trending repository of the month 🏆\\n  \\nnanoGP...   \n",
       "150            15.0  @hi_tysam It was very nice to read through top...   \n",
       "151            15.0  @karpathy I'm honored and a bit stunned. Wow, ...   \n",
       "178          2257.0  Didn't tweet nanoGPT yet (quietly getting it t...   \n",
       "186          4356.0  Nice read on reverse engineering of GitHub Cop...   \n",
       "190           556.0  Context I realized I have to split up minGPT b...   \n",
       "\n",
       "     quote_count  reply_count  retweet_count  \n",
       "9            2.0          1.0            5.0  \n",
       "14           5.0          9.0           21.0  \n",
       "17           5.0         15.0           14.0  \n",
       "18           0.0          4.0            1.0  \n",
       "20          10.0         38.0          132.0  \n",
       "144          0.0          1.0            2.0  \n",
       "145          0.0          4.0            0.0  \n",
       "146          0.0          6.0            4.0  \n",
       "147          9.0         19.0          320.0  \n",
       "150          0.0          1.0            2.0  \n",
       "151          0.0          3.0            0.0  \n",
       "178         24.0         39.0          303.0  \n",
       "186        145.0         85.0          555.0  \n",
       "190          2.0         23.0           16.0  "
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.full_text.str.contains('repos?i?|github', regex=True, flags=re.I)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "96ebc3fd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id_str</th>\n",
       "      <th>id_str</th>\n",
       "      <th>created_at</th>\n",
       "      <th>favorite_count</th>\n",
       "      <th>full_text</th>\n",
       "      <th>quote_count</th>\n",
       "      <th>reply_count</th>\n",
       "      <th>retweet_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>1615441883672502291</td>\n",
       "      <td>1632577588529954819</td>\n",
       "      <td>2023-03-06 03:03:23+00:00</td>\n",
       "      <td>91.0</td>\n",
       "      <td>Speed up your LLM research exploration with a ...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1621578354024677377</td>\n",
       "      <td>2023-02-03 18:36:21+00:00</td>\n",
       "      <td>5276.0</td>\n",
       "      <td>The most dramatic optimization to nanoGPT so f...</td>\n",
       "      <td>57.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>353.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>1162359127294861314</td>\n",
       "      <td>1620749130556669952</td>\n",
       "      <td>2023-02-01 11:41:19+00:00</td>\n",
       "      <td>2541.0</td>\n",
       "      <td>Trending repository of the month 🏆\\n  \\nnanoGP...</td>\n",
       "      <td>9.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>320.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1615398117683388417</td>\n",
       "      <td>2023-01-17 17:18:18+00:00</td>\n",
       "      <td>21166.0</td>\n",
       "      <td>🔥 New (1h56m) video lecture: \"Let's build GPT:...</td>\n",
       "      <td>331.0</td>\n",
       "      <td>546.0</td>\n",
       "      <td>3321.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>178</th>\n",
       "      <td>33836629</td>\n",
       "      <td>1613250487838707712</td>\n",
       "      <td>2023-01-11 19:04:23+00:00</td>\n",
       "      <td>2257.0</td>\n",
       "      <td>Didn't tweet nanoGPT yet (quietly getting it t...</td>\n",
       "      <td>24.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>303.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             user_id_str               id_str                created_at  \\\n",
       "58   1615441883672502291  1632577588529954819 2023-03-06 03:03:23+00:00   \n",
       "143             33836629  1621578354024677377 2023-02-03 18:36:21+00:00   \n",
       "147  1162359127294861314  1620749130556669952 2023-02-01 11:41:19+00:00   \n",
       "172             33836629  1615398117683388417 2023-01-17 17:18:18+00:00   \n",
       "178             33836629  1613250487838707712 2023-01-11 19:04:23+00:00   \n",
       "\n",
       "     favorite_count                                          full_text  \\\n",
       "58             91.0  Speed up your LLM research exploration with a ...   \n",
       "143          5276.0  The most dramatic optimization to nanoGPT so f...   \n",
       "147          2541.0  Trending repository of the month 🏆\\n  \\nnanoGP...   \n",
       "172         21166.0  🔥 New (1h56m) video lecture: \"Let's build GPT:...   \n",
       "178          2257.0  Didn't tweet nanoGPT yet (quietly getting it t...   \n",
       "\n",
       "     quote_count  reply_count  retweet_count  \n",
       "58           2.0          3.0           14.0  \n",
       "143         57.0         89.0          353.0  \n",
       "147          9.0         19.0          320.0  \n",
       "172        331.0        546.0         3321.0  \n",
       "178         24.0         39.0          303.0  "
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flags = re.I\n",
    "\n",
    "(df\n",
    " .query('full_text.str.contains(\"nanogpt\", regex=True, flags=@flags)', engine='python')\n",
    " # .query(...)\n",
    " # .query(...)\n",
    " )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}