mirror of
https://github.com/jprdonnelly/538data.git
synced 2025-12-19 17:37:43 -05:00
add data, script for gop repeater story
repeated-phrases-gop/README.md (new file, 8 additions)

### The phrases most repeated by GOP candidates

The raw data behind the story [These Are The Phrases Each GOP Candidate Repeats Most](http://fivethirtyeight.com/features/these-are-the-phrases-each-gop-candidate-repeats-most)

File | Description
---|---------
`gop_debate_all.txt` | Plain text transcription of all Republican primary debates from the August 6, 2015 Fox News debate in Cleveland through the March 3, 2016 Fox News debate in Detroit
`robopol2.py` | Python script that conducts a term frequency–inverse document frequency (tf-idf) analysis of the text in `gop_debate_all.txt`
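
Under the scoring in `robopol2.py`, an n-gram's weight is its frequency in one candidate's speech, multiplied by the log of how few of the four candidates use it, multiplied by its length in words. As a rough worked example, a three-word phrase that one candidate says 40 times and that no other candidate says scores 40 × ln(4) × 3 ≈ 166.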

repeated-phrases-gop/gop_debate_all.txt (new file, 18749 additions)
File diff suppressed because it is too large

repeated-phrases-gop/robopol2.py (new file, 207 additions)

#!/usr/bin/env python3

import math
import string
import operator
from collections import defaultdict


# candidates to pay attention to
candidates = ["TRUMP", "CRUZ", "RUBIO", "KASICH"]

# n-gram lengths to iterate through
min_N = 1   # inclusive
max_N = 15  # exclusive


####
#### HELPER FUNCTIONS
####

# returns a dict mapping each n-gram that appears in the corpus to its frequency
def ngram_freqs(corpus, n):

    freq_dict = defaultdict(int)

    # slide a window of length n across the corpus, skipping any window
    # that spans a change of speaker (marked by "<BR>")
    for i in range(n, len(corpus) + 1):
        ngram = tuple(corpus[i - n:i])
        if "<BR>" not in ngram:
            freq_dict[ngram] += 1

    return freq_dict

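# for example:
#   ngram_freqs(["make", "america", "great", "again"], 2)
#   -> {("make", "america"): 1, ("america", "great"): 1, ("great", "again"): 1}
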

# combines two dicts by performing the provided operation on the values of keys
# present in both; keys present in only one dict keep their value
def combine_dicts(a, b, op=operator.add):
    combined = dict(a)
    combined.update(b)
    for k in set(a) & set(b):
        combined[k] = op(a[k], b[k])
    return combined

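# for example:
#   combine_dicts({"a": 1, "b": 2}, {"b": 3, "c": 4})
#   -> {"a": 1, "b": 5, "c": 4}
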

# checks whether two n-grams overlap too much to include both
def overlap(a, b):

    max_overlap = min(3, len(a), len(b))

    # join the words with "-" so multi-word prefixes and suffixes match as units
    a_joined, b_joined = '-'.join(a), '-'.join(b)

    # the beginning or end of a is in b, or the beginning or end of b is in a
    return ('-'.join(a[:max_overlap]) in b_joined
            or '-'.join(a[-max_overlap:]) in b_joined
            or '-'.join(b[:max_overlap]) in a_joined
            or '-'.join(b[-max_overlap:]) in a_joined)

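# for example, overlap(["we", "will", "build", "a", "wall"], ["build", "a", "wall"])
# is True, because "build-a-wall" (the end of the first) appears in the second
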

####
#### ANALYSIS FUNCTIONS
####


# returns a list of corpora, each a sequential list of all words said by one candidate
def corpus_list_from_file(filename):

    # load all words from the file into memory
    with open(filename) as f:
        words = f.read().split()

    # initialize one corpus per candidate
    corpus_list = [[] for _ in candidates]

    # iterate through the words, putting each one in the correct corpus
    speaker_index = -1

    for word in words:

        # change of speaker: an all-caps name ending in ":"
        if word[-1] == ":" and word.isupper():

            # name of the new speaker
            speaker = word[:-1]

            # speaker is one of the candidates
            if speaker in candidates:
                speaker_index = candidates.index(speaker)
                # add a speaking-break indicator so n-grams never span turns
                corpus_list[speaker_index].append("<BR>")

            # speaker is a moderator or a candidate not listed
            else:
                speaker_index = -1

        # regular word (skip stage directions like "(APPLAUSE)")
        elif word[0] != "(" and word[-1] != ")":

            # remove punctuation and convert to lowercase
            word = word.translate(str.maketrans("", "", string.punctuation)).lower()

            if speaker_index >= 0 and word != "":
                corpus_list[speaker_index].append(word)

    return corpus_list

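# for example, a hypothetical transcript fragment
#   TRUMP: We're going to build a wall. (APPLAUSE) BAIER: Thank you.
# contributes ["<BR>", "were", "going", "to", "build", "a", "wall"] to the TRUMP
# corpus and nothing to any other corpus
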

# returns a list of dicts, each mapping an n-gram to its frequency in the respective corpus
def freq_dicts_from_corpus_list(corpus_list):

    # initialize one frequency dict per candidate
    freq_dicts = [defaultdict(int) for _ in candidates]

    # add the counts for every n-gram length in [min_N, max_N)
    for n in range(min_N, max_N):
        for candidate in range(len(candidates)):
            dict_to_add = ngram_freqs(corpus_list[candidate], n)
            freq_dicts[candidate] = combine_dicts(freq_dicts[candidate], dict_to_add)

    return freq_dicts


# returns a list of dicts, each mapping an n-gram to its tf-idf score in the respective corpus
# see https://en.wikipedia.org/wiki/Tf-idf for further information
def tfidf_dicts_from_freq_dicts(freq_dicts):

    # initialize one tf-idf dict per candidate
    tfidf_dicts = [defaultdict(int) for _ in candidates]

    # map each n-gram to the number of corpora containing it
    num_containing = defaultdict(int)
    for candidate in range(len(candidates)):
        for ngram in freq_dicts[candidate]:
            num_containing[ngram] += 1

    # calculate tf-idf for each n-gram in each corpus
    for candidate in range(len(candidates)):
        for ngram in freq_dicts[candidate]:
            tf = freq_dicts[candidate][ngram]
            idf = math.log(len(candidates) / num_containing[ngram])

            # weight by the length of the n-gram so longer phrases rank higher
            tfidf_dicts[candidate][ngram] = tf * idf * len(ngram)

            # zero out anything ending in "and", "or", "of", "with"
            if ngram[-1] in ["and", "or", "of", "with"]:
                tfidf_dicts[candidate][ngram] = 0

    return tfidf_dicts

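# note the idf term above: an n-gram that appears in all four corpora gets
# idf = log(4/4) = 0, so phrases every candidate uses (e.g. "thank you")
# automatically score zero
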

# zeroes out (sets tfidf to 0) any phrase contained inside a higher-scoring phrase
def prune_substrings(tfidf_dicts, prune_thru=1000):

    pruned = tfidf_dicts

    for candidate in range(len(candidates)):

        # growing list of phrases kept so far, each in list form
        so_far = []

        ngrams_sorted = sorted(tfidf_dicts[candidate].items(),
                               key=operator.itemgetter(1), reverse=True)[:prune_thru]
        for ngram, _score in ngrams_sorted:

            # contained in a previous, higher-scoring phrase: prune it
            if any(overlap(better_ngram, list(ngram)) for better_ngram in so_far):
                pruned[candidate][ngram] = 0

            # not contained, so keep it to block future subphrases
            else:
                so_far.append(list(ngram))

    return pruned

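# for example, if ("we", "will", "build", "a", "wall") outranks ("build", "a", "wall"),
# the shorter phrase's score is zeroed so only the longer phrase survives
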

# returns a candidate's top n-grams, sorted by tf-idf
def top_ngrams_for_candidate(tfidf_dicts, candidate, count=20):
    return sorted(tfidf_dicts[candidate].items(),
                  key=operator.itemgetter(1), reverse=True)[:count]


def main():
    corpus_list = corpus_list_from_file("gop_debate_all.txt")
    freq_dicts = freq_dicts_from_corpus_list(corpus_list)
    tfidf_dicts = tfidf_dicts_from_freq_dicts(freq_dicts)
    tfidf_dicts = prune_substrings(tfidf_dicts)

    # print the top n-grams for each candidate, sorted by tf-idf
    for candidate in range(len(candidates)):
        print(candidates[candidate])
        for ngram in top_ngrams_for_candidate(tfidf_dicts, candidate, 400):
            print(ngram)


if __name__ == '__main__':
    main()
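
# usage sketch (assumes gop_debate_all.txt is in the working directory):
#   $ python3 robopol2.py
# prints each candidate's name followed by up to 400 (n-gram, score) tuples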