Files
538data/subreddit-algebra/processData.sql
Karolis Matulionis 4a52d40cc6 Update processData.sql
2018-05-15 22:54:15 +02:00

29 lines
1.6 KiB
SQL

##### Part 0: Formatted and processed data in BigQuery
## Creating list of number of users in each subreddit:
## Thanks to Reddit users /u/Stuck_In_the_Matrix for pulling the data originally and /u/fhoffa for hosting the data on BigQery
SELECT subreddit, authors, DENSE_RANK() OVER (ORDER BY authors DESC) AS rank_authors
FROM (SELECT subreddit, SUM(1) as authors
FROM (SELECT subreddit, author, COUNT(1) as cnt
FROM [fh-bigquery:reddit_comments.all_starting_201501]
WHERE author NOT IN (SELECT author FROM [fh-bigquery:reddit_comments.bots_201505])
GROUP BY subreddit, author HAVING cnt > 0)
GROUP BY subreddit) t
ORDER BY authors DESC;
# Creating list of number of users who authored at least 10 posts in pairs of subreddits:
SELECT t1.subreddit as t1_subreddit, t2.subreddit as t2_subreddit, SUM(1) as NumOverlaps
FROM (SELECT subreddit, author, COUNT(1) as cnt
FROM [fh-bigquery:reddit_comments.all_starting_201501]
WHERE author NOT IN (SELECT author FROM [fh-bigquery:reddit_comments.bots_201505])
AND subreddit IN (SELECT subreddit FROM [subreddit-vectors:subredditoverlaps.subr_rank_all_starting_201501]
WHERE rank_authors>200 AND rank_authors<2201)
GROUP BY subreddit, author HAVING cnt > 10) t1
JOIN (SELECT subreddit, author, COUNT(1) as cnt
FROM [fh-bigquery:reddit_comments.all_starting_201501]
WHERE author NOT IN (SELECT author FROM [fh-bigquery:reddit_comments.bots_201505])
GROUP BY subreddit, author HAVING cnt > 10) t2
ON t1.author=t2.author
WHERE t1.subreddit!=t2.subreddit
GROUP BY t1_subreddit, t2_subreddit