cleaned up bob ross clustering script

2014-04-23 12:29:03 -04:00
parent 346beabae8
commit 72163aede2
1 changed files with 55 additions and 22 deletions
--- a/bob-ross/cluster-paintings.py
+++ b/bob-ross/cluster-paintings.py
@@ -1,26 +1,59 @@
-from numpy import array
+"""
+Clusters Bob Ross paintings by features.
+
+By Walter Hickey <walter.hickey@fivethirtyeight.com>
+
+See http://fivethirtyeight.com/features/a-statistical-analysis-of-the-work-of-bob-ross/
+"""
+
+import numpy as np
 from scipy.cluster.vq import vq, kmeans, whiten
 import math
+import csv

-# TK: Load data from file as array and assign to bobross
- 
-# Normalizes according to st.dev.                
-whitened = whiten(bobross)
-output = kmeans(whitened,10)
-print output
+def main():

-# Determines distance between each of 403 vectors and each centroid, and finds closest neighbor
-for i in range(0,403):
-	print i+1 
-	# Dist between centroid 0 and vector
-	distance = math.sqrt(sum((whitened[i] - output[0][0]) ** 2)) 
-	# Group is the centroid it is closest to so far, set initally to centroid 0
-	group = 0
-	# Combo combines distance and group into a single entity
-	combo = (distance, group)
-	# Tests the vector i against the 10 centroids, finds nearest neighbor:
-	for x in range (0,10):
-		distance_temp = math.sqrt(sum((whitened[i] - output[0][x]) ** 2))
-		if distance_temp < combo[0]:
-			combo = (distance_temp,x)
-	print combo	
+    # load data into vectors of 1s and 0s for each tag
+    with open('elements-by-episode.csv','r') as csvfile:
+        reader = csv.reader(csvfile)
+        reader.next() # skip header
+        data = []
+        for row in reader:
+            data.append(map(lambda x: int(x), row[2:])) # exclude EPISODE and TITLE columns
+
+    # convert to numpy matrix
+    matrix = np.array(data)
+
+    # remove colums that have been tagged less than 5 times
+    columns_to_remove = []
+    for col in range(np.shape(matrix)[1]):
+        if sum(matrix[:,col]) <= 5:
+            columns_to_remove.append(col) 
+    matrix = np.delete(matrix, columns_to_remove, axis=1)
+
+    # normalize according to stddev
+    whitened = whiten(matrix)
+    output = kmeans(whitened, 10)
+
+    print "episode", "distance", "cluster"
+
+    # determine distance between each of 403 vectors and each centroid, find closest neighbor
+    for i, v in enumerate(whitened):
+
+        # distance between centroid 0 and feature vector
+        distance = math.sqrt(sum((v - output[0][0]) ** 2))
+
+        # group is the centroid it is closest to so far, set initally to centroid 0
+        group = 0
+        closest_match = (distance, group)
+
+        # test the vector i against the 10 centroids, find nearest neighbor
+        for x in range (0, 10):
+            dist_x = math.sqrt(sum((v - output[0][x]) ** 2))
+            if dist_x < closest_match[0]:
+                closest_match = (dist_x, x)
+
+        print i+1, closest_match[0], closest_match[1]
+
+if __name__ == "__main__":
+    main()