adding subreddit algebra

This commit is contained in:
Koeze
2017-03-23 06:46:16 -04:00
parent fbfa7b9a60
commit cdf5559d2a
4 changed files with 228 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
# Computes commenter overlap between /r/The_Donald and every other subreddit
# whose rank_authors is between 1 and 499 (for example /r/Conservative).
# Written in BigQuery legacy SQL (bracketed table names, FLATTEN, UNIQUE,
# GROUP EACH BY).
#
# Output columns:
#   sub_a, sub_b - the subreddit pair; one of the two is always The_Donald
#   percent      - number of authors found in both subreddits, as an integer
#                  percentage of sub_a's `authors` value from the rank table
#   sub_ac       - number of rows in the filtered result that share this sub_a
#   sub_bc       - row count per sub_b, computed inside the grouped query
# Only pairs with percent > 5 are returned, sorted by percent descending.
#
# NOTE(review): this query does not filter out bot accounts. If the rank
# table's `authors` count was built with bots excluded, the numerator and the
# denominator of `percent` are not strictly comparable - confirm.
SELECT sub_a, sub_b, percent, sub_ac, sub_bc
FROM (
# Second level: adds sub_ac, counted over the rows that survived HAVING below.
SELECT sub_a, sub_b, percent, COUNT(*) OVER(PARTITION BY sub_a) sub_ac, sub_bc
FROM(
# Third level: one row per (sub_a, sub_b) pair. COUNT(*) is the number of
# joined (author, sub_a, sub_b) rows, i.e. authors present in both subreddits.
# NOTE(review): confirm whether legacy SQL evaluates the sub_bc window before
# or after the HAVING filter.
SELECT a.subreddit sub_a, b.subreddit sub_b, INTEGER(100*COUNT(*)/FIRST(authors)) percent, COUNT(*) OVER(PARTITION BY sub_b) sub_bc
FROM (
# Side a: one row per (author, subreddit) for subreddits with rank_authors
# in 1..499, carrying that subreddit's `authors` value from the rank table.
SELECT author, subreddit, authors
FROM FLATTEN((
# UNIQUE(author) gathers the distinct authors of each subreddit (grouped by
# column 2); FLATTEN then expands them to one row per author.
SELECT UNIQUE(author) author, a.subreddit subreddit, FIRST(authors) authors
FROM [fh-bigquery:reddit_comments.all_starting_201501] a
JOIN [subreddit-vectors:subredditoverlaps.subr_rank_all_starting_201501] b
ON a.subreddit=b.subreddit
WHERE rank_authors>0 and rank_authors<500
GROUP EACH BY 2
),author)
) a
JOIN EACH (
# Side b: the same (author, subreddit) rows for the same rank range, without
# the author count; the rank filter is applied through an IN subquery here.
SELECT author, subreddit
FROM FLATTEN((
SELECT UNIQUE(author) author, subreddit
FROM [fh-bigquery:reddit_comments.all_starting_201501]
WHERE subreddit IN (SELECT subreddit FROM [subreddit-vectors:subredditoverlaps.subr_rank_all_starting_201501]
WHERE rank_authors>0 and rank_authors<500
)
GROUP BY 2
),author)
) b
# Pair subreddits through shared authors; drop self-pairs and keep only pairs
# in which one side is The_Donald.
ON a.author=b.author
WHERE a.subreddit!=b.subreddit
AND (a.subreddit='The_Donald' OR b.subreddit='The_Donald')
# Columns 1,2 are sub_a and sub_b.
GROUP EACH BY 1,2
HAVING percent>5
)
)
# Column 3 is percent.
ORDER BY 3 DESC

View File

@@ -0,0 +1,28 @@
##### Part 0: Format and process the raw comment data in BigQuery
## Query 1: count the distinct commenters of every subreddit and rank the
## subreddits by that count (rank 1 = most commenters; ties share a rank).
## Authors listed in the bots_201505 table are left out.
## Thanks to Reddit users /u/Stuck_In_the_Matrix for pulling the data originally and /u/fhoffa for hosting the data on BigQuery
SELECT
  subreddit,
  authors,
  DENSE_RANK() OVER (ORDER BY authors DESC) AS rank_authors
FROM (
  # One row per subreddit: how many (subreddit, author) groups it has below.
  SELECT
    subreddit,
    COUNT(*) AS authors
  FROM (
    # One row per (subreddit, author) with that author's row count there.
    SELECT
      subreddit,
      author,
      COUNT(*) AS cnt
    FROM [fh-bigquery:reddit_comments.all_starting_201501]
    WHERE author NOT IN (SELECT author FROM [fh-bigquery:reddit_comments.bots_201505])
    GROUP BY subreddit, author
    HAVING cnt > 0
  )
  GROUP BY subreddit
) t
ORDER BY authors DESC;
# Query 2: for every ordered pair of distinct subreddits (t1, t2), count the
# users who have more than 10 rows (cnt > 10) in each of the two subreddits.
# t1 is limited to subreddits with 200 < rank_authors < 2201; t2 is not
# rank-limited. Authors listed in bots_201505 are excluded on both sides.
# The t1/t2 aliases are kept unchanged because subredditVectorAnalysis.R
# refers to columns named t1_subreddit and t2_subreddit.
SELECT
  t1.subreddit,
  t2.subreddit,
  COUNT(*) AS NumOverlaps
FROM (
  SELECT
    subreddit,
    author,
    COUNT(*) AS cnt
  FROM [fh-bigquery:reddit_comments.all_starting_201501]
  WHERE author NOT IN (SELECT author FROM [fh-bigquery:reddit_comments.bots_201505])
    AND subreddit IN (
      SELECT subreddit
      FROM [subreddit-vectors:subredditoverlaps.subr_rank_all_starting_201501]
      WHERE rank_authors > 200 AND rank_authors < 2201
    )
  GROUP BY subreddit, author
  HAVING cnt > 10
) t1
JOIN (
  SELECT
    subreddit,
    author,
    COUNT(*) AS cnt
  FROM [fh-bigquery:reddit_comments.all_starting_201501]
  WHERE author NOT IN (SELECT author FROM [fh-bigquery:reddit_comments.bots_201505])
  GROUP BY subreddit, author
  HAVING cnt > 10
) t2
ON t1.author = t2.author
WHERE t1.subreddit != t2.subreddit
GROUP BY t1.subreddit, t2.subreddit

View File

@@ -0,0 +1,13 @@
### Subreddit Algebra
This directory contains the code and data behind the story: [Dissecting Trump's Most Rabid Online Following](https://fivethirtyeight.com/features/dissecting-trumps-most-rabid-online-following/)
The raw data (an online cache of Reddit comments going back to 2005) is from [Google's BigQuery](https://bigquery.cloud.google.com/table/fh-bigquery:reddit_comments.2015_05), and more information about the data can [be found here](https://www.reddit.com/r/bigquery/comments/3cej2b/17_billion_reddit_comments_loaded_on_bigquery/).
Details about the three files of code in this folder:
File | Description
---|---------
`processData.sql` | SQL code for filtering, processing and formatting Reddit comment data from Google's BigQuery. (Note that if you click on the raw data link above, this SQL query will automatically be loaded.)
`subredditVectorAnalysis.R` | Conducts a latent semantic analysis of over 50,000 subreddits that creates a vector representation of each one based on commenter co-occurrence. It also implements "subreddit algebra": the ability to add and subtract different subreddits to reveal how they relate to one another.
`computeUserOverlap.sql` | A separate SQL query used for computing the user overlap between r/The_Donald and other subreddits

View File

@@ -0,0 +1,151 @@
#######################################
#
# Program to analyze distance between
# Reddit subreddits using the cooccurrence
# of commentors across subreddits.
# Also implements "subreddit algebra"
# by adding and subtracting subreddit
# vectors.
# By @martintrevor_ for FiveThirtyEight
#
#######################################
library(reshape2)
library(lsa)
library(ggtern)
##### Part 1: Load in the data
# Input: the CSV export produced by running processData.sql in Google's BigQuery
rawsubredditvecs <- read.table(
  "all_starting_2015_01_overlaps_top2200_no200_10com_allrank_mod_122716.csv",
  header = TRUE,
  sep = ","
)

##### Part 2: Format and clean data for analysis
# Long table -> wide table: one row per t1_subreddit, one column per
# t2_subreddit, missing pairs filled with 0.
castsubredditvecs <- dcast(
  rawsubredditvecs,
  t1_subreddit ~ t2_subreddit,
  FUN = "identity",
  fill = 0
)

# Drop the label column, keep it as row names, then transpose so that each row
# is a t2_subreddit and each column a t1_subreddit (the "context" subreddits).
subredditvecst <- as.matrix(castsubredditvecs[, -1])
rownames(subredditvecst) <- castsubredditvecs[, 1]
subredditvecs <- t(subredditvecst)

# Divide every row by its own total.
subredditvecssums <- apply(subredditvecs, 1, sum)
subredditvecsnorm <- sweep(subredditvecs, 1, subredditvecssums, "/")

# Share of the grand total that falls in each context column.
subredditvecssumscontext <- apply(subredditvecs, 2, sum)
contextprobs <- subredditvecssumscontext / sum(subredditvecssumscontext)

# PMI version: log of the row-normalized value over the column's overall share.
subredditvecspmi <- log(sweep(subredditvecsnorm, 2, contextprobs, "/"))

# PPMI version: negative PMI values are replaced by 0.
subredditvecsppmi <- subredditvecspmi
subredditvecsppmi[subredditvecspmi < 0] <- 0

# Function to normalize vectors to unit length
scalar1 <- function(x) {
  x / sqrt(sum(x^2))
}

# Unit-length PPMI vector for every row.
subredditvecsppminorm <- t(apply(subredditvecsppmi, 1, scalar1))
##### Part 3: Analysis of subreddit similarities
## Find which subreddits are closest to a subreddit (or to a combination of subreddits)
# Globals read by findrelsubreddit(): the matrix of subreddit vectors (one row
# per subreddit), its transpose, and the lower-cased row names used for
# case-insensitive lookup.
cursubmat <- subredditvecsppminorm
cursubmatt <- t(cursubmat)
currownameslc <- tolower(rownames(cursubmat))
# Rank subreddits by cosine similarity to a signed combination of subreddit
# vectors ("subreddit algebra").
#
# Arguments:
#   cursubs - character vector of subreddit names (matched case-insensitively)
#   curops  - character vector of operators, one per entry of cursubs: "+" adds
#             that subreddit's vector, any other value subtracts it. The running
#             sum starts at 0, so curops normally has a leading "+".
#   numret  - maximum number of results to return (default 20)
#
# Returns the cosine similarities in decreasing order (named by subreddit),
# with the subreddits listed in cursubs removed.
#
# Stops with an error if cursubs is empty, if curops does not supply an
# operator for every subreddit, or if a subreddit name is not a row of
# cursubmat.
#
# Reads the globals cursubmat (one row per subreddit), cursubmatt (its
# transpose) and currownameslc (lower-cased row names of cursubmat).
findrelsubreddit <- function(cursubs,curops,numret=20) {
  cursubs = tolower(cursubs)
  if (length(cursubs) == 0) {
    stop("cursubs must name at least one subreddit")
  }
  if (length(curops) < length(cursubs) || anyNA(curops[seq_along(cursubs)])) {
    stop("curops must supply one operator for every subreddit in cursubs")
  }
  # Resolve every name up front so a typo fails loudly instead of producing a
  # zero-length vector further down.
  currows = match(cursubs,currownameslc)
  if (anyNA(currows)) {
    stop("Unknown subreddit(s): ",paste(cursubs[is.na(currows)],collapse=", "))
  }
  # Accumulate the signed sum of the selected subreddit vectors.
  curvec = 0
  for (i in seq_along(cursubs)) {
    cursubvec = cursubmat[currows[i],]
    curvec = if (curops[i]=="+") curvec + cursubvec else curvec - cursubvec
  }
  curclosesubs = cosine(x=curvec,y=cursubmatt)
  curclosesubsorder = curclosesubs[order(curclosesubs,decreasing=TRUE)]
  # Logical indexing (rather than -which(...)) stays correct even when nothing
  # needs to be removed.
  curisinput = tolower(names(curclosesubsorder))%in%cursubs
  curclosesubsorderc = curclosesubsorder[!curisinput]
  return(head(curclosesubsorderc,numret))
}
## Political examples
# Each example sets cursubs (subreddit names) and curops (matching operators)
# and asks findrelsubreddit() for the 5 closest subreddits.

# /r/The_Donald
cursubs <- c("the_donald")
curops <- c("+")
findrelsubreddit(cursubs, curops, 5)

# /r/The_Donald - /r/politics
cursubs <- c("the_donald", "politics")
curops <- c("+", "-")
findrelsubreddit(cursubs, curops, 5)

# /r/hillaryclinton - /r/politics
cursubs <- c("hillaryclinton", "politics")
curops <- c("+", "-")
findrelsubreddit(cursubs, curops, 5)

# /r/The_Donald - /r/SandersforPresident
cursubs <- c("the_donald", "sandersforpresident")
curops <- c("+", "-")
findrelsubreddit(cursubs, curops, 5)

# /r/SandersforPresident - /r/The_Donald
cursubs <- c("sandersforpresident", "the_donald")
curops <- c("+", "-")
findrelsubreddit(cursubs, curops, 5)

# /r/fatpeoplehate + /r/CoonTown + /r/politics
cursubs <- c("fatpeoplehate", "coontown", "politics")
curops <- c("+", "+", "+")
findrelsubreddit(cursubs, curops, 5)

## Validation examples
# /r/nba + /r/minnesota
cursubs <- c("nba", "minnesota")
curops <- c("+", "+")
findrelsubreddit(cursubs, curops, 5)

# /r/personalfinance - /r/Frugal
cursubs <- c("personalfinance", "frugal")
curops <- c("+", "-")
findrelsubreddit(cursubs, curops, 5)

# /r/Fitness + /r/TwoXChromosomes
cursubs <- c("fitness", "twoxchromosomes")
curops <- c("+", "+")
findrelsubreddit(cursubs, curops, 5)
## Creating the ternary plot
# Passing nrow(cursubmat) as numret asks findrelsubreddit() for the full
# ranking instead of only the top few.
# Similarity to /r/The_Donald
cursubs = c("the_donald")
curops = c("+")
Dsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
# Similarity to /r/SandersforPresident
cursubs = c("sandersforpresident")
curops = c("+")
Ssubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
# Similarity to /r/hillaryclinton
cursubs = c("hillaryclinton")
curops = c("+")
Hsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
# List of subreddits we're interested in
ternarysubs = c("theredpill","coontown","fatpeoplehate","politics","worldnews","news","sjwhate","thebluepill","feminism","books","political_revolution","basicincome")
Dternarysubsims = Dsubsims[tolower(names(Dsubsims))%in%ternarysubs]
Sternarysubsims = Ssubsims[tolower(names(Ssubsims))%in%ternarysubs]
Hternarysubsims = Hsubsims[tolower(names(Hsubsims))%in%ternarysubs]
# Normalizing the matrix
# Merge the three similarity vectors by subreddit name. Each transform() call
# moves merge()'s Row.names column back into the row names so that the next
# merge can again join on "row.names".
allternarysubsims = transform(
  merge(
    transform(
      merge(Sternarysubsims,Dternarysubsims,by="row.names"),
      row.names=Row.names,Row.names=NULL
    ),
    Hternarysubsims,by="row.names"
  ),
  row.names=Row.names,Row.names=NULL
)
colnames(allternarysubsims) = c("S","D","H")
# Scale each row so that its S, D and H values sum to 1.
allternarysubsimssums = apply(allternarysubsims,1,sum)
allternarysubsimsnorm = sweep(allternarysubsims,1,allternarysubsimssums,"/")
# Creating the plot
# The plots are wrapped in print() so they are drawn into the PDF device even
# when this script is run through source(), where top-level values are not
# auto-printed and the PDFs would otherwise be empty.
pdf("./ternaryplotanno.pdf",height=10,width=10)
print(ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + geom_text(label=rownames(allternarysubsimsnorm),hjust=0,vjust=0))
dev.off()
pdf("./ternaryplot.pdf",height=10,width=10)
print(ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + theme_classic())
dev.off()
# Find subreddits that are particularly biased towards any of the three main candidate subreddits
# Merge the three full similarity vectors by subreddit name; each transform()
# call moves merge()'s Row.names column back into the row names.
allsubsims = transform(
  merge(
    transform(
      merge(Ssubsims,Dsubsims,by="row.names"),
      row.names=Row.names,Row.names=NULL
    ),
    Hsubsims,by="row.names"
  ),
  row.names=Row.names,Row.names=NULL
)
colnames(allsubsims) = c("S","D","H")
chooseunique = c("H") # Set candidate subreddit of interest
# Fold enrichment: similarity to the chosen candidate divided by similarity to
# each of the other two candidates, for every subreddit.
curunique = 1/(allsubsims[,(!(colnames(allsubsims)==chooseunique))]/allsubsims[,chooseunique])
allsubsimsmin = apply(allsubsims,1,min)
# Keep the smaller of the two ratios, i.e. the enrichment over the closer rival.
curuniquemin = apply(curunique,1,min)
# Drop subreddits whose similarity to any candidate is exactly 0. Logical
# indexing is used because x[-which(cond)] returns an empty vector when no
# element satisfies cond.
curhaszerosim = !is.na(allsubsimsmin) & allsubsimsmin==0
curuniqueminc = curuniquemin[!curhaszerosim]
curuniquemat = data.frame(enrich=curuniqueminc,allsubsims[match(names(curuniqueminc),rownames(allsubsims)),])
curuniquemato = curuniquemat[order(curuniquemat$enrich,decreasing=TRUE),]
curuniquematoc = curuniquemato[which(curuniquemato[,chooseunique]>=0.25),] # Threshold for high enrichment and high raw similarity
head(curuniquematoc,20)