mirror of
https://github.com/jprdonnelly/538data.git
synced 2025-12-19 17:37:43 -05:00
add data, script for gop repeater story
repeated-phrases-gop/README.md (new file, 8 additions)

### The phrases most repeated by GOP candidates

The raw data behind the story [These Are The Phrases Each GOP Candidate Repeats Most](http://fivethirtyeight.com/features/these-are-the-phrases-each-gop-candidate-repeats-most)

File | Description
---|---------
`gop_debate_all.txt` | Plain text transcription of all Republican primary debates from the August 6, 2015 Fox News debate in Cleveland through the March 3, 2016 Fox News debate in Detroit
`robopol2.py` | Python script that conducts a term frequency–inverse document frequency (tf-idf) analysis of the text in `gop_debate_all.txt`
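
Under the scoring in `robopol2.py`, an n-gram's weight is its frequency in one candidate's speech, multiplied by the log of how few of the four candidates use it, multiplied by its length in words. As a rough worked example, a three-word phrase that one candidate says 40 times and that no other candidate says scores 40 × ln(4) × 3 ≈ 166.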

repeated-phrases-gop/gop_debate_all.txt (new file, 18749 additions)
File diff suppressed because it is too large

repeated-phrases-gop/robopol2.py (new file, 207 additions)

#!/usr/bin/env python3

import math
import string
import operator
from collections import defaultdict


# candidates to pay attention to
candidates = ["TRUMP", "CRUZ", "RUBIO", "KASICH"]

# n-gram lengths to iterate through
min_N = 1   # inclusive
max_N = 15  # exclusive


####
#### HELPER FUNCTIONS
####

# returns a dict mapping each n-gram that appears in the corpus to its frequency
def ngram_freqs(corpus, n):

    freq_dict = defaultdict(int)

    # slide a window of length n across the corpus, skipping any window
    # that spans a change of speaker (marked by "<BR>")
    for i in range(n, len(corpus) + 1):
        ngram = tuple(corpus[i - n:i])
        if "<BR>" not in ngram:
            freq_dict[ngram] += 1

    return freq_dict

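# for example:
#   ngram_freqs(["make", "america", "great", "again"], 2)
#   -> {("make", "america"): 1, ("america", "great"): 1, ("great", "again"): 1}
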

# combines two dicts by performing the provided operation on the values of keys
# present in both; keys present in only one dict keep their value
def combine_dicts(a, b, op=operator.add):
    combined = dict(a)
    combined.update(b)
    for k in set(a) & set(b):
        combined[k] = op(a[k], b[k])
    return combined

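# for example:
#   combine_dicts({"a": 1, "b": 2}, {"b": 3, "c": 4})
#   -> {"a": 1, "b": 5, "c": 4}
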

# checks whether two n-grams overlap too much to include both
def overlap(a, b):

    max_overlap = min(3, len(a), len(b))

    # join the words with "-" so multi-word prefixes and suffixes match as units
    a_joined, b_joined = '-'.join(a), '-'.join(b)

    # the beginning or end of a is in b, or the beginning or end of b is in a
    return ('-'.join(a[:max_overlap]) in b_joined
            or '-'.join(a[-max_overlap:]) in b_joined
            or '-'.join(b[:max_overlap]) in a_joined
            or '-'.join(b[-max_overlap:]) in a_joined)

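# for example, overlap(["we", "will", "build", "a", "wall"], ["build", "a", "wall"])
# is True, because "build-a-wall" (the end of the first) appears in the second
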

####
#### ANALYSIS FUNCTIONS
####


# returns a list of corpora, each a sequential list of all words said by one candidate
def corpus_list_from_file(filename):

    # load all words from the file into memory
    with open(filename) as f:
        words = f.read().split()

    # initialize one corpus per candidate
    corpus_list = [[] for _ in candidates]

    # iterate through the words, putting each one in the correct corpus
    speaker_index = -1

    for word in words:

        # change of speaker: an all-caps name ending in ":"
        if word[-1] == ":" and word.isupper():

            # name of the new speaker
            speaker = word[:-1]

            # speaker is one of the candidates
            if speaker in candidates:
                speaker_index = candidates.index(speaker)
                # add a speaking-break indicator so n-grams never span turns
                corpus_list[speaker_index].append("<BR>")

            # speaker is a moderator or a candidate not listed
            else:
                speaker_index = -1

        # regular word (skip stage directions like "(APPLAUSE)")
        elif word[0] != "(" and word[-1] != ")":

            # remove punctuation and convert to lowercase
            word = word.translate(str.maketrans("", "", string.punctuation)).lower()

            if speaker_index >= 0 and word != "":
                corpus_list[speaker_index].append(word)

    return corpus_list

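# for example, a hypothetical transcript fragment
#   TRUMP: We're going to build a wall. (APPLAUSE) BAIER: Thank you.
# contributes ["<BR>", "were", "going", "to", "build", "a", "wall"] to the TRUMP
# corpus and nothing to any other corpus
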

# returns a list of dicts, each mapping an n-gram to its frequency in the respective corpus
def freq_dicts_from_corpus_list(corpus_list):

    # initialize one frequency dict per candidate
    freq_dicts = [defaultdict(int) for _ in candidates]

    # add the counts for every n-gram length in [min_N, max_N)
    for n in range(min_N, max_N):
        for candidate in range(len(candidates)):
            dict_to_add = ngram_freqs(corpus_list[candidate], n)
            freq_dicts[candidate] = combine_dicts(freq_dicts[candidate], dict_to_add)

    return freq_dicts


# returns a list of dicts, each mapping an n-gram to its tf-idf score in the respective corpus
# see https://en.wikipedia.org/wiki/Tf-idf for further information
def tfidf_dicts_from_freq_dicts(freq_dicts):

    # initialize one tf-idf dict per candidate
    tfidf_dicts = [defaultdict(int) for _ in candidates]

    # map each n-gram to the number of corpora containing it
    num_containing = defaultdict(int)
    for candidate in range(len(candidates)):
        for ngram in freq_dicts[candidate]:
            num_containing[ngram] += 1

    # calculate tf-idf for each n-gram in each corpus
    for candidate in range(len(candidates)):
        for ngram in freq_dicts[candidate]:
            tf = freq_dicts[candidate][ngram]
            idf = math.log(len(candidates) / num_containing[ngram])

            # weight by the length of the n-gram so longer phrases rank higher
            tfidf_dicts[candidate][ngram] = tf * idf * len(ngram)

            # zero out anything ending in "and", "or", "of", "with"
            if ngram[-1] in ["and", "or", "of", "with"]:
                tfidf_dicts[candidate][ngram] = 0

    return tfidf_dicts

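# note the idf term above: an n-gram that appears in all four corpora gets
# idf = log(4/4) = 0, so phrases every candidate uses (e.g. "thank you")
# automatically score zero
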

# zeroes out (sets tfidf to 0) any phrase contained inside a higher-scoring phrase
def prune_substrings(tfidf_dicts, prune_thru=1000):

    pruned = tfidf_dicts

    for candidate in range(len(candidates)):

        # growing list of phrases kept so far, each in list form
        so_far = []

        ngrams_sorted = sorted(tfidf_dicts[candidate].items(),
                               key=operator.itemgetter(1), reverse=True)[:prune_thru]
        for ngram, _score in ngrams_sorted:

            # contained in a previous, higher-scoring phrase: prune it
            if any(overlap(better_ngram, list(ngram)) for better_ngram in so_far):
                pruned[candidate][ngram] = 0

            # not contained, so keep it to block future subphrases
            else:
                so_far.append(list(ngram))

    return pruned

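# for example, if ("we", "will", "build", "a", "wall") outranks ("build", "a", "wall"),
# the shorter phrase's score is zeroed so only the longer phrase survives
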

# returns a candidate's top n-grams, sorted by tf-idf
def top_ngrams_for_candidate(tfidf_dicts, candidate, count=20):
    return sorted(tfidf_dicts[candidate].items(),
                  key=operator.itemgetter(1), reverse=True)[:count]


def main():
    corpus_list = corpus_list_from_file("gop_debate_all.txt")
    freq_dicts = freq_dicts_from_corpus_list(corpus_list)
    tfidf_dicts = tfidf_dicts_from_freq_dicts(freq_dicts)
    tfidf_dicts = prune_substrings(tfidf_dicts)

    # print the top n-grams for each candidate, sorted by tf-idf
    for candidate in range(len(candidates)):
        print(candidates[candidate])
        for ngram in top_ngrams_for_candidate(tfidf_dicts, candidate, 400):
            print(ngram)


if __name__ == '__main__':
    main()
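
# usage sketch (assumes gop_debate_all.txt is in the working directory):
#   $ python3 robopol2.py
# prints each candidate's name followed by up to 400 (n-gram, score) tuples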