From 72163aede21aed35a66239850e7f8eba0dee90e4 Mon Sep 17 00:00:00 2001 From: Andrei Scheinkman Date: Wed, 23 Apr 2014 12:29:03 -0400 Subject: [PATCH] cleaned up bob ross clustering script --- bob-ross/cluster-paintings.py | 77 +++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/bob-ross/cluster-paintings.py b/bob-ross/cluster-paintings.py index fd2245a..9a969dc 100644 --- a/bob-ross/cluster-paintings.py +++ b/bob-ross/cluster-paintings.py @@ -1,26 +1,59 @@ -from numpy import array +""" +Clusters Bob Ross paintings by features. + +By Walter Hickey + +See http://fivethirtyeight.com/features/a-statistical-analysis-of-the-work-of-bob-ross/ +""" + +import numpy as np from scipy.cluster.vq import vq, kmeans, whiten import math +import csv -# TK: Load data from file as array and assign to bobross - -# Normalizes according to st.dev. -whitened = whiten(bobross) -output = kmeans(whitened,10) -print output +def main(): -# Determines distance between each of 403 vectors and each centroid, and finds closest neighbor -for i in range(0,403): - print i+1 - # Dist between centroid 0 and vector - distance = math.sqrt(sum((whitened[i] - output[0][0]) ** 2)) - # Group is the centroid it is closest to so far, set initally to centroid 0 - group = 0 - # Combo combines distance and group into a single entity - combo = (distance, group) - # Tests the vector i against the 10 centroids, finds nearest neighbor: - for x in range (0,10): - distance_temp = math.sqrt(sum((whitened[i] - output[0][x]) ** 2)) - if distance_temp < combo[0]: - combo = (distance_temp,x) - print combo + # load data into vectors of 1s and 0s for each tag + with open('elements-by-episode.csv','r') as csvfile: + reader = csv.reader(csvfile) + reader.next() # skip header + data = [] + for row in reader: + data.append(map(lambda x: int(x), row[2:])) # exclude EPISODE and TITLE columns + + # convert to numpy matrix + matrix = np.array(data) + + # remove columns that have been 
tagged 5 times or fewer + columns_to_remove = [] + for col in range(np.shape(matrix)[1]): + if sum(matrix[:,col]) <= 5: + columns_to_remove.append(col) + matrix = np.delete(matrix, columns_to_remove, axis=1) + + # normalize according to stddev + whitened = whiten(matrix) + output = kmeans(whitened, 10) + + print "episode", "distance", "cluster" + + # determine distance between each of 403 vectors and each centroid, find closest neighbor + for i, v in enumerate(whitened): + + # distance between centroid 0 and feature vector + distance = math.sqrt(sum((v - output[0][0]) ** 2)) + + # group is the centroid it is closest to so far, set initially to centroid 0 + group = 0 + closest_match = (distance, group) + + # test the vector i against the 10 centroids, find nearest neighbor + for x in range (0, 10): + dist_x = math.sqrt(sum((v - output[0][x]) ** 2)) + if dist_x < closest_match[0]: + closest_match = (dist_x, x) + + print i+1, closest_match[0], closest_match[1] + +if __name__ == "__main__": + main() \ No newline at end of file