cleaned up bob ross clustering script

This commit is contained in:
Andrei Scheinkman
2014-04-23 12:29:03 -04:00
parent 346beabae8
commit 72163aede2

View File

@@ -1,26 +1,59 @@
from numpy import array
"""
Clusters Bob Ross paintings by features.
By Walter Hickey <walter.hickey@fivethirtyeight.com>
See http://fivethirtyeight.com/features/a-statistical-analysis-of-the-work-of-bob-ross/
"""
import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten
import math
import csv
# TK (placeholder still to be filled in): load the data from file as an array
# and assign it to `bobross`. No assignment to `bobross` is visible here, so
# the statements below cannot run as written.
# Rescale every feature column by its standard deviation.
whitened = whiten(bobross)
# Run k-means asking for 10 centroids; element 0 of the result is the
# centroid array (codebook), element 1 is the distortion.
output = kmeans(whitened,10)
# Python 2 print statement: dump the raw (codebook, distortion) result.
print output
def main(csv_path='elements-by-episode.csv', n_clusters=10, rare_tag_cutoff=5):
    """Cluster episodes by their tagged features and print each assignment.

    Reads a CSV whose first row is a header and whose first two columns
    (EPISODE and TITLE) are labels; every remaining column is parsed as an
    integer tag indicator.  Tags used ``rare_tag_cutoff`` times or fewer are
    dropped, each remaining column is rescaled by its standard deviation,
    k-means is run, and one line per episode is printed:
    ``<1-based row number> <distance to nearest centroid> <centroid index>``.

    Args:
        csv_path: Path of the CSV file to read.
        n_clusters: Number of centroids requested from k-means.  k-means may
            return fewer (empty clusters are dropped), so centroid indexes
            are only guaranteed to be below this value.
        rare_tag_cutoff: Tag columns whose total is less than or equal to
            this value are removed before clustering.

    Raises:
        ValueError: If the file contains no data rows after the header.
    """
    # load data into vectors of 1s and 0s for each tag
    with open(csv_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip header; the builtin works on Python 2.6+ and 3
        # exclude EPISODE and TITLE columns; ignore blank lines
        data = [[int(cell) for cell in row[2:]] for row in reader if row]

    if not data:
        raise ValueError("no episode rows found in %s" % csv_path)

    # convert to numpy matrix
    matrix = np.array(data)

    # keep only the columns that have been tagged more than the cutoff
    frequent = matrix.sum(axis=0) > rare_tag_cutoff
    matrix = matrix[:, frequent]

    # normalize according to stddev
    whitened = whiten(matrix)
    codebook, _ = kmeans(whitened, n_clusters)

    # vq returns, for every row, the index of the nearest centroid and the
    # Euclidean distance to it.  Using it avoids assuming that k-means
    # really produced n_clusters centroids.
    labels, distances = vq(whitened, codebook)

    print("episode distance cluster")
    for episode, (dist, label) in enumerate(zip(distances, labels), 1):
        print("%d %s %d" % (episode, float(dist), int(label)))


if __name__ == "__main__":
    main()