adding subreddit algebra

2017-03-23 06:46:16 -04:00
parent fbfa7b9a60
commit cdf5559d2a
4 changed files with 228 additions and 0 deletions
--- a/subreddit-algebra/computeUserOverlap.sql
+++ b/subreddit-algebra/computeUserOverlap.sql
@@ -0,0 +1,36 @@
+# Calculating overlap between /r/The_Donald commenters and the commenters of other subreddits (e.g. /r/Conservative)
+# NOTE(review): the query itself never filters on /r/Conservative; it returns every pair that involves
+# The_Donald, so a specific pair has to be read off the result rows.
+# Output columns:
+#   sub_a, sub_b - the subreddit pair; one of the two is always 'The_Donald'
+#   percent      - joined author rows for the pair as an INTEGER() percentage of sub_a's `authors` value
+#   sub_ac       - number of returned rows that share this sub_a
+#   sub_bc       - window count of rows that share this sub_b
+SELECT sub_a, sub_b, percent, sub_ac, sub_bc
+FROM (
+SELECT sub_a, sub_b, percent, COUNT(*) OVER(PARTITION BY sub_a) sub_ac, sub_bc
+FROM(
+SELECT a.subreddit sub_a, b.subreddit sub_b, INTEGER(100*COUNT(*)/FIRST(authors)) percent, COUNT(*) OVER(PARTITION BY sub_b) sub_bc
+FROM (
+ # Side a: (author, subreddit) rows plus the `authors` value taken from the rank table,
+ # limited to subreddits with rank_authors between 1 and 499.
+ # NOTE(review): `authors` is presumably the per-subreddit commenter count built in processData.sql - confirm.
+ SELECT author, subreddit, authors
+ FROM FLATTEN((
+   SELECT UNIQUE(author) author, a.subreddit subreddit, FIRST(authors) authors
+   FROM [fh-bigquery:reddit_comments.all_starting_201501] a
+   JOIN [subreddit-vectors:subredditoverlaps.subr_rank_all_starting_201501] b
+   ON a.subreddit=b.subreddit
+   WHERE rank_authors>0 and rank_authors<500
+   GROUP EACH BY 2  
+ ),author)
+) a
+JOIN EACH (
+ # Side b: the same (author, subreddit) rows for the same rank range, without the `authors` value
+ SELECT author, subreddit
+ FROM FLATTEN((
+   SELECT UNIQUE(author) author, subreddit
+   FROM [fh-bigquery:reddit_comments.all_starting_201501]
+   WHERE subreddit IN (SELECT subreddit FROM [subreddit-vectors:subredditoverlaps.subr_rank_all_starting_201501] 
+     WHERE rank_authors>0 and rank_authors<500
+   )
+   GROUP BY 2
+ ),author)
+) b
+# Pair up the two sides on author, drop self-pairs, and keep only pairs that involve The_Donald
+ON a.author=b.author
+WHERE a.subreddit!=b.subreddit
+AND (a.subreddit='The_Donald' OR b.subreddit='The_Donald')
+# Positions 1,2 are sub_a and sub_b; only pairs above 5 percent are kept
+GROUP EACH BY 1,2
+HAVING percent>5
+)
+)
+# Position 3 is percent: largest overlap first
+ORDER BY 3 DESC
--- a/subreddit-algebra/processData.sql
+++ b/subreddit-algebra/processData.sql
@@ -0,0 +1,28 @@
+##### Part 0: Formatted and processed data in BigQuery
+
+## Creating list of number of users in each subreddit: 
+## Thanks to Reddit users /u/Stuck_In_the_Matrix for pulling the data originally and /u/fhoffa for hosting the data on BigQuery
+# Rank every subreddit by its number of non-bot commenters.
+# DENSE_RANK gives rank_authors = 1 to the largest `authors` value; ties share a rank.
+SELECT
+  subreddit,
+  authors,
+  DENSE_RANK() OVER (ORDER BY authors DESC) AS rank_authors
+FROM (
+  # One row per subreddit: how many (subreddit, author) groups it has
+  SELECT
+    subreddit,
+    SUM(1) AS authors
+  FROM (
+    # One row per (subreddit, author) with that author's row count in the comments table
+    SELECT
+      subreddit,
+      author,
+      COUNT(1) AS cnt
+    FROM [fh-bigquery:reddit_comments.all_starting_201501]
+    WHERE author NOT IN (
+      SELECT author FROM [fh-bigquery:reddit_comments.bots_201505]
+    )
+    GROUP BY subreddit, author
+    HAVING cnt > 0
+  )
+  GROUP BY subreddit
+) t
+ORDER BY authors DESC;
+
+# Creating list of the number of users who authored more than 10 comments (cnt > 10) in both subreddits of each pair:
+# For every (t1.subreddit, t2.subreddit) pair, count the non-bot authors that have
+# more than 10 rows in the comments table for both subreddits.
+# t1 is limited to subreddits with rank_authors 201-2200; t2 has no rank restriction.
+SELECT
+  t1.subreddit,
+  t2.subreddit,
+  SUM(1) AS NumOverlaps
+FROM (
+  SELECT
+    subreddit,
+    author,
+    COUNT(1) AS cnt
+  FROM [fh-bigquery:reddit_comments.all_starting_201501]
+  WHERE author NOT IN (
+      SELECT author FROM [fh-bigquery:reddit_comments.bots_201505]
+    )
+    AND subreddit IN (
+      SELECT subreddit
+      FROM [subreddit-vectors:subredditoverlaps.subr_rank_all_starting_201501]
+      WHERE rank_authors > 200 AND rank_authors < 2201
+    )
+  GROUP BY subreddit, author
+  HAVING cnt > 10
+) t1
+JOIN (
+  SELECT
+    subreddit,
+    author,
+    COUNT(1) AS cnt
+  FROM [fh-bigquery:reddit_comments.all_starting_201501]
+  WHERE author NOT IN (
+      SELECT author FROM [fh-bigquery:reddit_comments.bots_201505]
+    )
+  GROUP BY subreddit, author
+  HAVING cnt > 10
+) t2
+ON t1.author = t2.author
+WHERE t1.subreddit != t2.subreddit
+GROUP BY t1.subreddit, t2.subreddit
--- a/subreddit-algebra/readme.md
+++ b/subreddit-algebra/readme.md
@@ -0,0 +1,13 @@
+### Subreddit Algebra
+
+This directory contains the code and data behind the story: [Dissecting Trump's Most Rabid Online Following](https://fivethirtyeight.com/features/dissecting-trumps-most-rabid-online-following/)
+
+The raw data (an online cache of Reddit comments going back to 2005) is from [Google's BigQuery](https://bigquery.cloud.google.com/table/fh-bigquery:reddit_comments.2015_05), and more information about the data can [be found here](https://www.reddit.com/r/bigquery/comments/3cej2b/17_billion_reddit_comments_loaded_on_bigquery/).
+
+Details about the three files of code in this folder:
+
+File | Description
+---|---------
+`processData.sql` | SQL code for filtering, processing and formatting Reddit comment data from Google's BigQuery. (Note that if you click on the raw data link above, this SQL query will automatically be loaded).
+`subredditVectorAnalysis.r` | Conducts a latent semantic analysis of over 50,000 subreddits that creates a vector representation of each one based on commenter co-occurrence. It also implements "subreddit algebra": the ability to add and subtract different subreddits to reveal how they relate to one another.
+`computeUserOverlap.sql` | A separate SQL query used for computing the user overlap between r/The_Donald and other subreddits
--- a/subreddit-algebra/subredditVectorAnalysis.r
+++ b/subreddit-algebra/subredditVectorAnalysis.r
@@ -0,0 +1,151 @@
+#######################################
+#
+# Program to analyze distance between
+# Reddit subreddits using the cooccurrence
+# of commenters across subreddits. 
+# Also implements "subreddit algebra"
+# by adding and subtracting subreddit
+# vectors. 
+# By @martintrevor_ for FiveThirtyEight
+#
+#######################################
+
+library(reshape2)
+library(lsa)
+library(ggtern)
+
+##### Part 1: Load in the data
+
+# This CSV file was created by running the SQL code in processData.sql in Google's BigQuery
+rawsubredditvecs = read.table("all_starting_2015_01_overlaps_top2200_no200_10com_allrank_mod_122716.csv",header=TRUE,sep=",")
+
+##### Part 2: Format and clean data for analysis
+
+# Long-to-wide: one row per t1_subreddit, one column per t2_subreddit, missing pairs filled with 0
+castsubredditvecs = dcast(rawsubredditvecs,t1_subreddit~t2_subreddit,FUN="identity",fill=0)
+# Drop the t1_subreddit label column, reuse it as row names, then transpose so that
+# each row of subredditvecs is a t2_subreddit and each column is a t1_subreddit
+subredditvecst = as.matrix(castsubredditvecs[,-1])
+rownames(subredditvecst) = castsubredditvecs[,1]
+subredditvecs = t(subredditvecst)
+# Divide every row by its own sum
+subredditvecssums = apply(subredditvecs,1,sum)
+subredditvecsnorm = sweep(subredditvecs,1,subredditvecssums,"/")
+# Share of the grand total that falls in each column
+subredditvecssumscontext = apply(subredditvecs,2,sum)
+contextprobs = subredditvecssumscontext/sum(subredditvecssumscontext)
+subredditvecspmi = log(sweep(subredditvecsnorm,2,contextprobs,"/")) # PMI (pointwise mutual information) version: log(row-normalised value / column share)
+subredditvecsppmi = subredditvecspmi
+subredditvecsppmi[subredditvecspmi<0] = 0 # PPMI (positive PMI) version: negative values, including log(0) = -Inf, become 0
+scalar1 <- function(x) {x / sqrt(sum(x^2))} # Function to normalize vectors to unit length
+# apply() over rows returns the results as columns, so t() puts the subreddits back on the rows
+subredditvecsppminorm = t(apply(subredditvecsppmi,1,scalar1))
+
+##### Part 3: Analysis of subreddit similarities
+
+## Looking at which subreddits are closest to each other (and combinations of subreddits)
+cursubmat = subredditvecsppminorm # unit-length PPMI vectors, one row per subreddit; read as a global by findrelsubreddit
+cursubmatt = t(cursubmat) # transposed copy (one column per subreddit), passed as y to cosine()
+currownameslc = tolower(rownames(cursubmat)) # lower-cased row names so subreddit lookups ignore case
+# Function to calculate subreddit similarities and perform algebra.
+#   cursubs: character vector of subreddit names, matched case-insensitively against rownames(cursubmat)
+#   curops:  character vector with one operator per entry of cursubs; "+" adds that subreddit's
+#            vector, any other value subtracts it. Note that curops always has a leading "+"
+#   numret:  maximum number of results to return (default 20)
+# Returns the cosine similarities between the combined vector and every subreddit in cursubmat,
+# sorted in decreasing order, with the subreddits named in cursubs removed.
+# Reads the globals cursubmat, cursubmatt and currownameslc. Stops with an error when cursubs is
+# empty, when curops has fewer entries than cursubs, or when a subreddit name is not found.
+findrelsubreddit <- function(cursubs,curops,numret=20) {
+    cursubs = tolower(cursubs)
+    if(length(cursubs)==0) stop("cursubs must contain at least one subreddit")
+    if(length(curops)<length(cursubs)) stop("curops needs one operator per entry of cursubs")
+    curvec = 0
+    # seq_along() rather than 1:length(), which would count 1,0 over an empty vector
+    for(i in seq_along(cursubs)) {
+        curidx = which(currownameslc==cursubs[i])
+        # Fail here with a readable message instead of an argument-mismatch error inside cosine()
+        if(length(curidx)==0) stop(paste0("subreddit not found: ",cursubs[i]))
+        curvec = if(curops[i]=="+") curvec + cursubmat[curidx,] else curvec - cursubmat[curidx,]
+    }
+    curclosesubs = cosine(x=curvec,y=cursubmatt)
+    curclosesubso = order(curclosesubs,decreasing=TRUE)
+    curclosesubsorder = curclosesubs[curclosesubso]
+    # Logical mask rather than x[-which(cond)], which returns an empty vector when nothing matches cond
+    curclosesubsorderc = curclosesubsorder[!(tolower(names(curclosesubsorder))%in%cursubs)]
+    return(head(curclosesubsorderc,numret))
+}
+
+## Political examples
+
+# Nearest neighbours of /r/The_Donald
+cursubs <- c("the_donald")
+curops <- c("+")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+# /r/The_Donald minus /r/politics
+cursubs <- c("the_donald", "politics")
+curops <- c("+", "-")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+# /r/hillaryclinton minus /r/politics
+cursubs <- c("hillaryclinton", "politics")
+curops <- c("+", "-")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+# /r/The_Donald minus /r/SandersforPresident
+cursubs <- c("the_donald", "sandersforpresident")
+curops <- c("+", "-")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+# /r/SandersforPresident minus /r/The_Donald
+cursubs <- c("sandersforpresident", "the_donald")
+curops <- c("+", "-")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+# /r/fatpeoplehate plus /r/CoonTown plus /r/politics
+cursubs <- c("fatpeoplehate", "coontown", "politics")
+curops <- c("+", "+", "+")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+## Validation examples
+
+# /r/nba plus /r/minnesota
+cursubs <- c("nba", "minnesota")
+curops <- c("+", "+")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+# /r/personalfinance minus /r/Frugal
+cursubs <- c("personalfinance", "frugal")
+curops <- c("+", "-")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+# /r/Fitness plus /r/TwoXChromosomes
+cursubs <- c("fitness", "twoxchromosomes")
+curops <- c("+", "+")
+findrelsubreddit(cursubs, curops, numret = 5)
+
+## Creating the ternary plot
+
+# Similarity to /r/The_Donald
+# (numret = nrow(cursubmat) asks findrelsubreddit for every subreddit, not just the top few)
+cursubs = c("the_donald")
+curops = c("+")
+Dsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
+# Similarity to /r/SandersforPresident
+cursubs = c("sandersforpresident")
+curops = c("+")
+Ssubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
+# Similarity to /r/hillaryclinton
+cursubs = c("hillaryclinton")
+curops = c("+")
+Hsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
+# List of subreddits we're interested in
+ternarysubs = c("theredpill","coontown","fatpeoplehate","politics","worldnews","news","sjwhate","thebluepill","feminism","books","political_revolution","basicincome")
+Dternarysubsims = Dsubsims[tolower(names(Dsubsims))%in%ternarysubs]
+Sternarysubsims = Ssubsims[tolower(names(Ssubsims))%in%ternarysubs]
+Hternarysubsims = Hsubsims[tolower(names(Hsubsims))%in%ternarysubs]
+# Normalizing the matrix
+# The three similarity vectors are merged on their names into one table with columns S, D, H;
+# every row is then divided by its own sum
+allternarysubsims = transform(merge(transform(merge(Sternarysubsims,Dternarysubsims,by="row.names"),row.names=Row.names,Row.names=NULL),Hternarysubsims,by="row.names"),row.names=Row.names,Row.names=NULL)
+colnames(allternarysubsims) = c("S","D","H")
+allternarysubsimssums = apply(allternarysubsims,1,sum)
+allternarysubsimsnorm = sweep(allternarysubsims,1,allternarysubsimssums,"/")
+# Creating the plot
+# The plot objects are wrapped in print() so they are drawn to the PDF device even when this
+# script is run through source(), where top-level values are not auto-printed
+pdf("./ternaryplotanno.pdf",height=10,width=10)
+print(ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + geom_text(label=rownames(allternarysubsimsnorm),hjust=0,vjust=0))
+dev.off()
+pdf("./ternaryplot.pdf",height=10,width=10)
+print(ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + theme_classic())
+dev.off()
+
+# Find subreddits that are particularly biased towards any of the three main candidate subreddits
+# Merge the three full similarity vectors on subreddit name into one table with columns S, D, H
+allsubsims = transform(merge(transform(merge(Ssubsims,Dsubsims,by="row.names"),row.names=Row.names,Row.names=NULL),Hsubsims,by="row.names"),row.names=Row.names,Row.names=NULL)
+colnames(allsubsims) = c("S","D","H")
+chooseunique = c("H") # Set candidate subreddit of interest
+curunique = 1/(allsubsims[,(!(colnames(allsubsims)==chooseunique))]/allsubsims[,chooseunique]) # Calculate fold enrichment of target candidate subreddit over other candidate subreddits for all other subreddits
+allsubsimsmin = apply(allsubsims,1,min)
+# Per subreddit, the smaller of its two fold-enrichment values
+curuniquemin = apply(curunique,1,min)
+# Drop subreddits whose smallest similarity is exactly 0. A logical mask is used because
+# x[-which(cond)] returns an empty vector when no element satisfies cond
+curuniqueminc = curuniquemin[!(allsubsimsmin%in%0)]
+curuniquemat = data.frame(enrich=curuniqueminc,allsubsims[match(names(curuniqueminc),rownames(allsubsims)),])
+curuniquemato = curuniquemat[order(curuniquemat$enrich,decreasing=TRUE),]
+curuniquematoc = curuniquemato[which(curuniquemato[,chooseunique]>=0.25),] # Rows are already sorted by enrichment; keep those whose raw similarity to the chosen candidate is at least 0.25
+head(curuniquematoc,20)