152 lines
6.2 KiB
R
152 lines
6.2 KiB
R
#######################################
|
|
#
|
|
# Program to analyze distance between
|
|
# Reddit subreddits using the cooccurrence
|
|
# of commentors across subreddits.
|
|
# Also implements "subreddit algebra"
|
|
# by adding and subtracting subreddit
|
|
# vectors.
|
|
# By @martintrevor_ for FiveThirtyEight
|
|
#
|
|
#######################################
|
|
|
|
library(reshape2)
|
|
library(lsa)
|
|
library(ggtern)
|
|
|
|
##### Part 1: Load in the data
|
|
|
|
# This CSV file was created by running the SQL code in processData.sql in Google's BigQuery
|
|
rawsubredditvecs = read.table("all_starting_2015_01_overlaps_top2200_no200_10com_allrank_mod_122716.csv",header=TRUE,sep=",")
|
|
|
|
##### Part 2: Format and clean data for analysis
|
|
|
|
castsubredditvecs = dcast(rawsubredditvecs,t1_subreddit~t2_subreddit,FUN="identity",fill=0)
|
|
subredditvecst = as.matrix(castsubredditvecs[,-1])
|
|
rownames(subredditvecst) = castsubredditvecs[,1]
|
|
subredditvecs = t(subredditvecst)
|
|
subredditvecssums = apply(subredditvecs,1,sum)
|
|
subredditvecsnorm = sweep(subredditvecs,1,subredditvecssums,"/")
|
|
subredditvecssumscontext = apply(subredditvecs,2,sum)
|
|
contextprobs = subredditvecssumscontext/sum(subredditvecssumscontext)
|
|
subredditvecspmi = log(sweep(subredditvecsnorm,2,contextprobs,"/")) # PMI version
|
|
subredditvecsppmi = subredditvecspmi
|
|
subredditvecsppmi[subredditvecspmi<0] = 0 # PPMI version
|
|
scalar1 <- function(x) {x / sqrt(sum(x^2))} # Function to normalize vectors to unit length
|
|
subredditvecsppminorm = t(apply(subredditvecsppmi,1,scalar1))
|
|
|
|
##### Part 3: Analysis of subreddit similarities
|
|
|
|
## Looking at which subreddits are closest to each other (and combinations of subreddits)
|
|
cursubmat = subredditvecsppminorm
|
|
cursubmatt = t(cursubmat)
|
|
currownameslc = tolower(rownames(cursubmat))
|
|
# Function to calculate subreddit similarities and perform algebra
|
|
# Note that curops always has a leading "+"
|
|
findrelsubreddit <- function(cursubs,curops,numret=20) {
|
|
cursubs = tolower(cursubs)
|
|
curvec = 0
|
|
for(i in 1:length(cursubs)) {
|
|
curvec = ifelse(curops[i]=="+",list(curvec + cursubmat[which(currownameslc==cursubs[i]),]),list(curvec - cursubmat[which(currownameslc==cursubs[i]),]))[[1]]
|
|
}
|
|
curclosesubs = cosine(x=curvec,y=cursubmatt)
|
|
curclosesubso = order(curclosesubs,decreasing=TRUE)
|
|
curclosesubsorder = curclosesubs[curclosesubso]
|
|
curclosesubsorderc = curclosesubsorder[-which(tolower(names(curclosesubsorder))%in%cursubs)]
|
|
return(head(curclosesubsorderc,numret))
|
|
}
|
|
|
|
## Political examples
|
|
|
|
# /r/The_Donald
|
|
cursubs = c("the_donald")
|
|
curops = c("+")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
# /r/The_Donald - /r/politics
|
|
cursubs = c("the_donald","politics")
|
|
curops = c("+","-")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
# /r/hillaryclinton - /r/politics
|
|
cursubs = c("hillaryclinton","politics")
|
|
curops = c("+","-")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
# /r/The_Donald - /r/SandersforPresident
|
|
cursubs = c("the_donald","sandersforpresident")
|
|
curops = c("+","-")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
# /r/SandersforPresident - /r/The_Donald
|
|
cursubs = c("sandersforpresident","the_donald")
|
|
curops = c("+","-")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
# /r/fatpeoplehate + /r/CoonTown + /r/politics
|
|
cursubs = c("fatpeoplehate","coontown","politics")
|
|
curops = c("+","+","+")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
## Validation examples
|
|
|
|
# /r/nba + /r/minnesota
|
|
cursubs = c("nba","minnesota")
|
|
curops = c("+","+")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
# /r/personalfinance - /r/Frugal
|
|
cursubs = c("personalfinance","frugal")
|
|
curops = c("+","-")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
# /r/Fitness + /r/TwoXChromosomes
|
|
cursubs = c("fitness","twoxchromosomes")
|
|
curops = c("+","+")
|
|
findrelsubreddit(cursubs,curops,5)
|
|
|
|
## Creating the ternary plot
|
|
|
|
# Similatrity to /r/The_Donald
|
|
cursubs = c("the_donald")
|
|
curops = c("+")
|
|
Dsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
|
|
# Similarity to /r/SandersforPresident
|
|
cursubs = c("sandersforpresident")
|
|
curops = c("+")
|
|
Ssubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
|
|
# Similarity to /r/hillaryclinton
|
|
cursubs = c("hillaryclinton")
|
|
curops = c("+")
|
|
Hsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
|
|
# List of subreddits we're interested in
|
|
ternarysubs = c("theredpill","coontown","fatpeoplehate","politics","worldnews","news","sjwhate","thebluepill","feminism","books","political_revolution","basicincome")
|
|
Dternarysubsims = Dsubsims[tolower(names(Dsubsims))%in%ternarysubs]
|
|
Sternarysubsims = Ssubsims[tolower(names(Ssubsims))%in%ternarysubs]
|
|
Hternarysubsims = Hsubsims[tolower(names(Hsubsims))%in%ternarysubs]
|
|
# Normalizing the matrix
|
|
allternarysubsims = transform(merge(transform(merge(Sternarysubsims,Dternarysubsims,by="row.names"),row.names=Row.names,Row.names=NULL),Hternarysubsims,by="row.names"),row.names=Row.names,Row.names=NULL)
|
|
colnames(allternarysubsims) = c("S","D","H")
|
|
allternarysubsimssums = apply(allternarysubsims,1,sum)
|
|
allternarysubsimsnorm = sweep(allternarysubsims,1,allternarysubsimssums,"/")
|
|
# Creating the plot
|
|
pdf("./ternaryplotanno.pdf",height=10,width=10)
|
|
ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + geom_text(label=rownames(allternarysubsimsnorm),hjust=0,vjust=0)
|
|
dev.off()
|
|
pdf("./ternaryplot.pdf",height=10,width=10)
|
|
ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + theme_classic()
|
|
dev.off()
|
|
|
|
# Find subreddits that are particularly biased towards any of the three main candidate subreddits
|
|
allsubsims = transform(merge(transform(merge(Ssubsims,Dsubsims,by="row.names"),row.names=Row.names,Row.names=NULL),Hsubsims,by="row.names"),row.names=Row.names,Row.names=NULL)
|
|
colnames(allsubsims) = c("S","D","H")
|
|
chooseunique = c("H") # Set candidate subreddit of interest
|
|
curunique = 1/(allsubsims[,(!(colnames(allsubsims)==chooseunique))]/allsubsims[,chooseunique]) # Calculate fold enrichment of target candidate subreddit over other candidate subreddits for all other subreddits
|
|
allsubsimsmin = apply(allsubsims,1,min)
|
|
curuniquemin = apply(curunique,1,min)
|
|
curuniqueminc = curuniquemin[-which(allsubsimsmin==0)]
|
|
curuniquemat = data.frame(enrich=curuniqueminc,allsubsims[match(names(curuniqueminc),rownames(allsubsims)),])
|
|
curuniquemato = curuniquemat[order(curuniquemat$enrich,decreasing=TRUE),]
|
|
curuniquematoc = curuniquemato[which(curuniquemato[,chooseunique]>=0.25),] # Threshold for high enrichment and high raw similarity
|
|
head(curuniquematoc,20)
|