add data, script for gop repeater story

This commit is contained in:
Ritchie King
2016-03-10 16:27:19 -05:00
parent 7195b62427
commit ccb5a5cf1e
3 changed files with 18964 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
### The phrases most repeated by GOP candidates
The raw data behind the story [These Are The Phrases Each GOP Candidate Repeats Most](http://fivethirtyeight.com/features/these-are-the-phrases-each-gop-candidate-repeats-most)
File | Description
---|---------
`gop_debate_all.txt` | Plain text transcription of all Republican primary debates from the August 6, 2015 Fox News debate in Cleveland through the March 3, 2016 Fox News debate in Detroit
`robopol2.py` | Python script that conducts a term frequency–inverse document frequency (tf-idf) analysis of the text in `gop_debate_all.txt`

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,207 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import random
import numpy
import math
import string
import operator
from collections import defaultdict
# candidates to pay attention to; transcript speaker labels such as "TRUMP:" are
# matched against these names, and a name's position in this list is the index
# of that candidate's entry in every per-candidate list built below
candidates = ["TRUMP", "CRUZ", "RUBIO", "KASICH"]
# n-gram lengths (in words) to iterate through; used as range(min_N, max_N),
# so the longest n-gram analysed is max_N - 1 words
min_N = 1 # inclusive
max_N = 15 # exclusive
####
#### HELPER FUNCTIONS
####
# returns a dict mapping each n-gram that appears in the corpus to its frequency in the corpus
def ngram_freqs(corpus, n):
    """Count how often each n-gram of length ``n`` occurs in ``corpus``.

    corpus -- sequential list of word tokens; the token "<BR>" marks a
              break between speaking turns.
    n      -- length of the n-grams, in words.

    Returns a defaultdict(int) mapping each n-gram (a tuple of ``n`` words)
    to its number of occurrences. Windows that contain a "<BR>" marker are
    skipped so that no n-gram spans two separate speaking turns.
    """
    freq_dict = defaultdict(int)
    # The last valid window starts at len(corpus) - n, hence the "+ 1".
    # (Iterating only up to len(corpus) silently dropped the final n-gram.)
    for start in range(len(corpus) - n + 1):
        ngram = tuple(corpus[start:start + n])
        if "<BR>" not in ngram:
            freq_dict[ngram] += 1
    return freq_dict
# combines two dicts by performing the provided operation on their values
def combine_dicts(a, b, op=operator.add):
    """Merge two dicts into a new dict, combining values of shared keys.

    a, b -- the dicts to merge; neither is modified.
    op   -- two-argument callable applied as op(a[key], b[key]) for every
            key present in both dicts (addition by default).

    Returns a plain dict. Keys found in only one input keep their value.
    """
    combined = dict(a)
    # A single pass over b replaces the list concatenation of .items(),
    # which only works on Python 2 where .items() returns lists.
    for key, value in b.items():
        combined[key] = op(a[key], value) if key in a else value
    return combined
# checks whether two n-grams overlap too much to include both
def _contains_run(words, run):
    """Return True if ``run`` occurs in ``words`` as a contiguous run of whole words."""
    size = len(run)
    return any(words[i:i + size] == run for i in range(len(words) - size + 1))


def overlap(a, b):
    """Check whether two n-grams overlap too much to include both.

    a, b -- n-grams given as sequences (lists or tuples) of words.

    With k = min(3, len(a), len(b)), the n-grams overlap when the first k
    or the last k words of either one appear, in order and contiguously,
    inside the other. Returns True or False.
    """
    a = list(a)
    b = list(b)
    max_overlap = min(3, len(a), len(b))
    # Compare word sequences rather than joined strings: substring matching
    # on "-".join(...) ignores word boundaries, so e.g. the word "he" would
    # be found inside "the-wall".
    return (
        _contains_run(b, a[:max_overlap])       # the beginning of a is in b
        or _contains_run(b, a[-max_overlap:])   # the end of a is in b
        or _contains_run(a, b[:max_overlap])    # the beginning of b is in a
        or _contains_run(a, b[-max_overlap:])   # the end of b is in a
    )
####
#### ANALYSIS FUNCTIONS
####
# returns a list of corpora, each a sequential list of all words said by one candidate
def corpus_list_from_file(filename, speakers=None):
    """Split a debate transcript into one word list per tracked speaker.

    filename -- path of a plain-text transcript in which a change of speaker
                is written as an upper-case word ending in a colon
                (for example "TRUMP:").
    speakers -- names of the speakers to track; defaults to the module-level
                ``candidates`` list.

    Returns a list with one corpus per speaker, in the order of ``speakers``.
    Each corpus is a sequential list of lower-cased words with punctuation
    removed; the marker "<BR>" is inserted every time that speaker starts a
    new turn. Words that start with "(" or end with ")" are dropped, as are
    words spoken by anyone who is not a tracked speaker.
    """
    if speakers is None:
        speakers = candidates
    speakers = list(speakers)

    # load all words from the file into memory, closing the file afterwards
    with open(filename) as transcript:
        words = transcript.read().split()

    corpus_list = [[] for _ in speakers]
    punctuation = set(string.punctuation)

    # -1 means the current speaker is a moderator or an untracked candidate
    speaker_index = -1
    for word in words:
        # change of speaker
        if word[-1] == ":" and word.isupper():
            speaker = word[:-1]
            if speaker in speakers:
                speaker_index = speakers.index(speaker)
                # Add the break marker only for tracked speakers; indexing
                # with -1 would write it into the last speaker's corpus.
                corpus_list[speaker_index].append("<BR>")
            else:
                speaker_index = -1
        # regular word (compare by value: identity checks against string
        # literals only work by accident of interning)
        elif word[0] != "(" and word[-1] != ")":
            if speaker_index >= 0:
                # remove punctuation and convert to lowercase; filtering
                # character by character works on both Python 2 and 3
                word = "".join(ch for ch in word if ch not in punctuation).lower()
                if word != "":
                    corpus_list[speaker_index].append(word)
    return corpus_list
# returns a list of dicts, each mapping an n-gram to its frequency in the respective corpus
def freq_dicts_from_corpus_list(corpus_list):
    """Build one n-gram frequency table per candidate.

    corpus_list -- list of word lists, indexed in the same order as the
                   module-level ``candidates`` list.

    Returns a list of dicts, one per candidate, each mapping every n-gram
    whose length lies in range(min_N, max_N) to its frequency in that
    candidate's corpus.
    """
    freq_dicts = [defaultdict(int) for _ in candidates]
    for idx in range(len(candidates)):
        corpus = corpus_list[idx]
        merged = freq_dicts[idx]
        # fold the counts for each n-gram length into this candidate's table
        for n in range(min_N, max_N):
            merged = combine_dicts(merged, ngram_freqs(corpus, n))
        freq_dicts[idx] = merged
    return freq_dicts
# returns a list of dicts, each mapping an n-gram to its tf-idf in the respective corpus
# see https://en.wikipedia.org/wiki/Tf-idf for further information
def tfidf_dicts_from_freq_dicts(freq_dicts):
    """Score every n-gram of every corpus by tf-idf.

    freq_dicts -- list of dicts, one per corpus, each mapping an n-gram
                  (tuple of words) to its frequency in that corpus.

    Returns a list of defaultdict(int), parallel to ``freq_dicts``, mapping
    each n-gram to tf * idf * len(ngram), where tf is the raw frequency and
    idf is log(number of corpora / number of corpora containing the n-gram).
    N-grams ending in "and", "or", "of" or "with" are scored 0.
    """
    # the number of documents is the number of corpora actually supplied
    num_docs = len(freq_dicts)

    # map each n-gram to the number of corpora containing it
    num_containing = defaultdict(int)
    for freq_dict in freq_dicts:
        for ngram in freq_dict:
            num_containing[ngram] += 1

    banned_endings = ("and", "or", "of", "with")
    tfidf_dicts = []
    for freq_dict in freq_dicts:
        tfidf = defaultdict(int)
        for ngram, tf in freq_dict.items():
            if ngram[-1] in banned_endings:
                # kill anything ending in a dangling conjunction/preposition
                tfidf[ngram] = 0
                continue
            # float() forces true division: on Python 2, int / int truncates,
            # which turned e.g. log(4 / 3) into log(1) == 0
            idf = math.log(float(num_docs) / num_containing[ngram])
            # weight by the length of the n-gram so longer phrases rank higher
            tfidf[ngram] = tf * idf * len(ngram)
        tfidf_dicts.append(tfidf)
    return tfidf_dicts
# kills any phrase (tfidf=0) contained inside a larger phrase with a higher score
def prune_substrings(tfidf_dicts, prune_thru=1000):
    """Zero the score of phrases that overlap a higher-scoring phrase.

    tfidf_dicts -- list of dicts, one per entry of the module-level
                   ``candidates`` list, mapping n-grams to tf-idf scores.
    prune_thru  -- only the this-many highest-scoring n-grams of each
                   candidate are examined.

    The dicts are modified in place and the same list is returned.
    """
    for idx in range(len(candidates)):
        scores = tfidf_dicts[idx]
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        # phrases already visited, best score first, each as a list of words
        seen = []
        for phrase, _score in ranked[:prune_thru]:
            words = list(phrase)
            if any(overlap(list(earlier), words) for earlier in seen):
                scores[phrase] = 0
            # NOTE(review): every visited phrase is recorded, including one
            # that was just zeroed, so later phrases are also pruned when
            # they overlap an already-pruned phrase.
            seen.append(words)
    return tfidf_dicts
# sorts the n-grams for a candidate by tf-idf
def top_ngrams_for_candidate(tfidf_dicts, candidate, count=20):
    """Return the highest-scoring n-grams of one candidate.

    tfidf_dicts -- list of dicts mapping n-grams to tf-idf scores.
    candidate   -- index into ``tfidf_dicts``.
    count       -- maximum number of entries to return.

    Returns a list of (ngram, score) pairs sorted by descending score.
    """
    ranked = list(tfidf_dicts[candidate].items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:count]
def main():
    """Run the full analysis on gop_debate_all.txt and print the results.

    For each name in ``candidates`` this prints the name followed by that
    candidate's 400 highest-scoring (ngram, tf-idf) pairs, one per line.
    """
    corpus_list = corpus_list_from_file("gop_debate_all.txt")
    freq_dicts = freq_dicts_from_corpus_list(corpus_list)
    tfidf_dicts = tfidf_dicts_from_freq_dicts(freq_dicts)
    tfidf_dicts = prune_substrings(tfidf_dicts)
    # print the top ngrams sorted by tfidf; single-argument print(...) emits
    # the same text under Python 2's print statement and Python 3's function
    for index, name in enumerate(candidates):
        print(name)
        for ngram in top_ngrams_for_candidate(tfidf_dicts, index, 400):
            print(ngram)


if __name__ == '__main__':
    main()