classic rock data and code

2014-07-06 19:07:25 -04:00
parent b6f652243d
commit a4f0c674e2
5 changed files with 557 additions and 0 deletions
--- a/classic-rock/classic-rock-raw-data.csv
+++ b/classic-rock/classic-rock-raw-data.csv
--- a/classic-rock/classic-rock-song-list.csv
+++ b/classic-rock/classic-rock-song-list.csv
--- a/classic-rock/compiling_radio.py
+++ b/classic-rock/compiling_radio.py
@@ -0,0 +1,110 @@
+def hr_pull(x,y):
+	iteration = 0
+	callsign = x[2]
+	new_filename = callsign + ".txt"
+	songlist = ""
+	while (iteration < y):
+		iteration = iteration + 1
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		readfile = open(filename,"r")
+		newchunk = readfile.read()
+		songlist = newchunk + "\n" + songlist
+		readfile.close()
+	writefile = open(new_filename,"w")
+	writefile.write(songlist)
+	writefile.close()
+		
+def hh_pull(x,y):
+	iteration = 0
+	callsign = x[2]
+	new_filename = callsign + ".txt"
+	songlist = ""
+	while (iteration < y):
+		iteration = iteration + 1
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		readfile = open(filename,"r")
+		newchunk = readfile.read()
+		songlist = newchunk + "\n" + songlist
+		readfile.close()
+	writefile = open(new_filename,"w")
+	writefile.write(songlist)
+	writefile.close()	
+
+def dy_pull(x,y):
+	iteration = 0
+	callsign = x[2]	
+	new_filename = callsign + ".txt"
+	songlist = ""
+	while (iteration < y):
+		iteration = iteration + 1
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		readfile = open(filename,"r")
+		newchunk = readfile.read()
+		songlist = newchunk + "\n" + songlist
+		readfile.close()
+	writefile = open(new_filename,"w")
+	writefile.write(songlist)
+	writefile.close()
+	
+	
+cc1 = ("http://www.q1043.com/services/now_playing.html?streamId=1465&limit=25",0,"WAXQ","")
+cc2 = ("http://www.lonestar925.com/services/now_playing.html?streamId=3379&limit=25",0,"KZPS","")
+cc3 = ("http://www.wbig.com/services/now_playing.html?streamId=2505&limit=25",0,"WBIG","")
+cc4 = ("http://www.big1059.com/services/now_playing.html?streamId=557&limit=25",0,"WBGG","")
+cc5 = ("http://www.thefox.com/services/now_playing.html?streamId=393&limit=25",0,"KRFX","")
+cc6 = ("http://www.dve.com/services/now_playing.html?streamId=2017&limit=25",0,"WDVE","")
+cc7 = ("http://www.wrfx.com/services/now_playing.html?streamId=1613&limit=25",0,"WRFX","")
+cc8 = ("http://www.kzep.com/services/now_playing.html?streamId=4051&limit=25",0,"KZEP","")
+cc9 = ("http://www.101kgb.com/services/now_playing.html?streamId=237&limit=25",0,"KGB","")
+gm1 = ("http://www.wcsx.com/recentlyplayed.aspx",0,"WCSX","")
+gm2 = ("http://www.wmgk.com/broadcasthistory.aspx",0,"WMGK","")
+cx1 = ("http://www.1073theeagle.com/lsp/",0,"WXGL","")
+cx2 = ("http://www.houstonseagle.com/lsp/",0,"KGLK","")
+cx3 = ("http://www.971theriver.com/lsp/",0,"WSRV","")
+cb1 = ("http://wzlx.cbslocal.com/playlist/",0,"WZLX","")
+cb2 = ("http://wncx.cbslocal.com/playlist/",0,"WNCX","")
+cb3 = ("http://kzok.cbslocal.com/playlist/",0,"KZOK","")
+tg1 = ("http://wlup.tunegenie.com/onair/",0,"WLUP","")
+tg2 = ("http://wofx.tunegenie.com/onair/",0,"WOFX","")
+tg3 = ("http://kgon.tunegenie.com/onair/",0,"KGON","")
+tg4 = ("http://kcfx.tunegenie.com/onair/",0,"KCFX","")
+tg5 = ("http://klos.tunegenie.com/onair/",0,"KLOS","")
+tg6 = ("http://kseg.tunegenie.com/onair/",0,"KSEG","")
+tg7 = ("http://kufx.tunegenie.com/onair/",0,"KUFX","")
+ll1 = ("http://player.listenlive.co/24751/en/songhistory",0,"KQRS","")
+ll2 = ("http://player.listenlive.co/25951/en/songhistory",0,"KSAN","")
+ke1 = ("http://www.kshe95.com/broadcasthistory",0,"KSHE","")
+kx1 = ("http://kslx.com/playlist",0,"KSLX","")
+
+hr_pull(cc1,121)
+hr_pull(cc2,121)
+hr_pull(cc3,121)
+hr_pull(cc4,121)
+hr_pull(cc5,121)
+hr_pull(cc6,121)
+hr_pull(cc7,121)
+hr_pull(cc8,121)
+hr_pull(cc9,121)
+hr_pull(tg1,121)
+hr_pull(tg2,121)
+hr_pull(tg3,121)
+hr_pull(tg4,121)
+hr_pull(tg5,121)
+hr_pull(tg6,121)
+hr_pull(tg7,121)
+hr_pull(ke1,121)
+hh_pull(cx1,241)
+hh_pull(cx2,241)
+hh_pull(cx3,241)
+hh_pull(ll1,241)
+hh_pull(ll2,241)
+hh_pull(kx1,241)
+dy_pull(gm1,6)
+dy_pull(gm2,6)
+dy_pull(cb1,6)
+dy_pull(cb2,6)
+dy_pull(cb3,6)
+	
+	
+print "Done"
+	
--- a/classic-rock/radio.py
+++ b/classic-rock/radio.py
@@ -0,0 +1,407 @@
+import time
+import urllib2
+import sys
+
+"""
+next step: make it create new file each time, run cleanup op
+"""
+
+def cc_pull(x):
+	try:
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		offset = 0
+		new_last_song = last_song
+		page = response.read()
+		while (counter < 20):
+			offset = page.find('}},{"track":')
+			song = page[page.find('":"')+3:page.find('","')]
+			artist = page[page.find('artistName":"')+13:page.find('","amgArtistId"')]
+			page = page[offset + 3:]
+			song = song.replace("\/","/")		
+			artist = artist.replace("\/","/")
+			counter = counter + 1
+			entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
+			if (song == last_song):
+				break
+			elif (counter == 1):	
+				new_last_song = song
+				record.write(entry)
+			else:
+				record.write(entry)
+		y = (x[0],iteration,x[2],new_last_song)
+		time.sleep(3)
+		record.close()
+		return y
+	except:
+		time.sleep(3)
+		return x
+
+
+def gm_pull(x):
+	try:	
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")	
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		first = True
+		new_last_song = last_song	
+		while (counter < 10000):
+			line = response.readline()
+			if '" -' in line:
+				song = line[line.find('"')+1:line.find(" -")-1]
+				artist = line[line.find("- ")+1:]
+				artist = artist.strip()
+				entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"		
+				record.write(entry)
+				"""
+				line = response.readline()
+				line = response.readline()
+				line = response.readline()
+				line = response.readline()
+				if "Visit iTunes" in line:
+					itunes_link = line[line.find('href="')+5:line.find('" target="')]
+					TO DO: CREATE FILE
+					WRITE ITUNES LINKS TO IT
+					THEN, LATER, GRAB ALBUM RELEASE YEARS
+				"""
+			counter = counter + 1
+		y = (x[0],iteration,x[2],new_last_song)
+		record.close()	
+		time.sleep(3)
+		return y		
+	except:
+		time.sleep(3)
+		return x
+
+def cx_pull(x):
+	try:	
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")	
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		first = True	
+		new_last_song = last_song
+		while (counter < 10000):
+			line = response.readline()
+			if 'cmPlaylistContent' in line:
+				song = line[line.find('/">')+3:line.find("</a></strong>")]
+				artist = line[line.find("alt=")+5:line.find('" class="')]
+				artist = artist.strip()
+				song = song.replace("&#39;","'")		
+				artist = artist.replace("&#39;","'")			
+				entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
+				if (song == last_song):
+					break
+				elif first:	
+					new_last_song = song
+					record.write(entry)
+					first = False
+				else:
+					record.write(entry)
+				"""	
+				if "Download Song:" in line:
+					line = response.readline()
+					line = response.readline()
+					if "apple" in line:
+						itunes_link = line[line.find('href="')+5:line.find('">iTu')			
+						TO DO: CREATE FILE
+						WRITE ITUNES LINKS TO IT
+						THEN, LATER, GRAB ALBUM RELEASE YEARS
+				"""		
+			counter = counter + 1
+		y = (x[0],iteration,x[2],new_last_song)
+		record.close()	
+		time.sleep(3)
+		return y
+	except:
+		time.sleep(3)
+		return x	
+
+def cb_pull(x):
+	try:
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")	
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		first = True	
+		new_last_song = last_song	
+		while (counter < 10000):
+			line = response.readline()
+			if '<div class="track_title"' in line:
+				song = line[line.find('rel=')+5:line.find('">')]
+				line = response.readline()
+				line = response.readline()
+				artist = line[line.find('rel=')+5:line.find('">')]
+				line = response.readline()
+				line = response.readline()
+				album = line[line.find('rel=')+5:line.find('">')]
+				song = song.replace("&#039;","'")		
+				artist = artist.replace("&#039;","'")
+				album = album.replace("&#039;","'")						
+				entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
+				record.write(entry)
+			counter = counter + 1	
+		y = (x[0],iteration,x[2],new_last_song)
+		time.sleep(3)
+		record.close()	
+		return y	
+	except:
+		time.sleep(3)
+		return x
+	
+
+def tg_pull(x):
+	try:
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")	
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		first = True
+		new_last_song = last_song		
+		while (counter < 10000):
+			line = response.readline()
+			if '<div class="song"><' in line:
+				counter = counter + 1
+			elif '<div class="song">' in line:
+				song = line[line.find('"song">')+7:line.find('</div>')]
+				song = song.replace("&#39;","'")
+				line = response.readline()
+				artist = line[line.find('<div>')+5:line.find(' <span')]
+				song = song.replace("&#39;","'")		
+				artist = artist.replace("&#39;","'")
+				song = song.replace("&amp;","&")		
+				artist = artist.replace("&amp;","&")			
+				entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
+				if (song == last_song):
+					break
+				elif first:	
+					new_last_song = song
+					record.write(entry)
+					first = False
+				else:
+					record.write(entry)
+			counter = counter + 1	
+		y = (x[0],iteration,x[2],new_last_song)
+		time.sleep(3)
+		record.close()	
+		return y		
+	except:
+		time.sleep(3)
+		return x
+
+def ll_pull(x):
+	try:
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")	
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		new_last_song = last_song	
+		while (counter < 10000):
+			line = response.readline()
+			if 'var songs = ' in line:
+				tencount = 0
+				while (tencount < 10):
+					song = line[line.find('"title":"')+9:line.find('","')]
+					line = line[line.find('"artist":')+10:]
+					artist = line[:line.find('"')]
+					line = line[line.find('},{"timestamp":'):]
+					entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
+					new_last_song = song
+					record.write(entry)	
+					tencount = tencount + 1					
+				break	
+			counter = counter + 1	
+		y = (x[0],iteration,x[2],new_last_song)
+		time.sleep(3)
+		record.close()	
+		return y
+	except:
+		time.sleep(3)
+		return x
+
+
+def kx_pull(x):
+	try:
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")	
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		first = True	
+		new_last_song = last_song
+		while (counter < 10000):
+			line = response.readline()
+			if 'play-song' in line:
+				song = line[line.find('>')+1:line.find("</")]
+				line = response.readline()
+				artist = line[line.find('by ')+3:line.find('</')]
+				entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
+				if (song == last_song):
+					break
+				elif first:	
+					new_last_song = song
+					record.write(entry)
+					first = False
+				else:
+					record.write(entry)
+			counter = counter + 1	
+		y = (x[0],iteration,x[2],new_last_song)
+		time.sleep(3)
+		record.close()	
+		return y	
+	except:
+		time.sleep(3)
+		return x
+
+def ke_pull(x):
+	try:
+		iteration = x[1] + 1
+		url = x[0]
+		callsign = x[2]
+		filename = callsign + str(iteration).rjust(3,'0') + ".txt"
+		record = open(filename,"w")	
+		last_song = x[3]
+		response = urllib2.urlopen(url)
+		counter = 0 
+		first = True
+		new_last_song = last_song		
+		while (counter < 10000):
+			line = response.readline()
+			if 'views-field-field-title' in line:
+				song = line[line.find('field-content">')+15:line.find("</div>")]
+				line = response.readline()
+				artist = line[line.find('<span>')+6:line.find('</span>')]
+				song = song.replace("&#039;","'")		
+				artist = artist.replace("&#039;","'")
+				entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
+				if (song == last_song):
+					break
+				elif first:	
+					new_last_song = song
+					record.write(entry)
+					first = False
+				else:
+					record.write(entry)
+			counter = counter + 1
+		y = (x[0],iteration,x[2],new_last_song)
+		record.close()	
+		time.sleep(3)
+		return y	
+	except:
+		time.sleep(3)
+		return x
+
+
+
+
+
+cc1 = ("http://www.q1043.com/services/now_playing.html?streamId=1465&limit=25",0,"WAXQ","")
+cc2 = ("http://www.lonestar925.com/services/now_playing.html?streamId=3379&limit=25",0,"KZPS","")
+cc3 = ("http://www.wbig.com/services/now_playing.html?streamId=2505&limit=25",0,"WBIG","")
+cc4 = ("http://www.big1059.com/services/now_playing.html?streamId=557&limit=25",0,"WBGG","")
+cc5 = ("http://www.thefox.com/services/now_playing.html?streamId=393&limit=25",0,"KRFX","")
+cc6 = ("http://www.dve.com/services/now_playing.html?streamId=2017&limit=25",0,"WDVE","")
+cc7 = ("http://www.wrfx.com/services/now_playing.html?streamId=1613&limit=25",0,"WRFX","")
+cc8 = ("http://www.kzep.com/services/now_playing.html?streamId=4051&limit=25",0,"KZEP","")
+cc9 = ("http://www.101kgb.com/services/now_playing.html?streamId=237&limit=25",0,"KGB","")
+gm1 = ("http://www.wcsx.com/recentlyplayed.aspx",0,"WCSX","")
+gm2 = ("http://www.wmgk.com/broadcasthistory.aspx",0,"WMGK","")
+cx1 = ("http://www.1073theeagle.com/lsp/",0,"WXGL","")
+cx2 = ("http://www.houstonseagle.com/lsp/",0,"KGLK","")
+cx3 = ("http://www.971theriver.com/lsp/",0,"WSRV","")
+cb1 = ("http://wzlx.cbslocal.com/playlist/",0,"WZLX","")
+cb2 = ("http://wncx.cbslocal.com/playlist/",0,"WNCX","")
+cb3 = ("http://kzok.cbslocal.com/playlist/",0,"KZOK","")
+tg1 = ("http://wlup.tunegenie.com/onair/",0,"WLUP","")
+tg2 = ("http://wofx.tunegenie.com/onair/",0,"WOFX","")
+tg3 = ("http://kgon.tunegenie.com/onair/",0,"KGON","")
+tg4 = ("http://kcfx.tunegenie.com/onair/",0,"KCFX","")
+tg5 = ("http://klos.tunegenie.com/onair/",0,"KLOS","")
+tg6 = ("http://kseg.tunegenie.com/onair/",0,"KSEG","")
+tg7 = ("http://kufx.tunegenie.com/onair/",0,"KUFX","")
+ll1 = ("http://player.listenlive.co/24751/en/songhistory",0,"KQRS","")
+ll2 = ("http://player.listenlive.co/25951/en/songhistory",0,"KSAN","")
+ke1 = ("http://www.kshe95.com/broadcasthistory",0,"KSHE","")
+kx1 = ("http://kslx.com/playlist",0,"KSLX","")
+
+
+while True:
+	now = time.time()
+	timer = time.localtime(now)
+	#on the hour
+	if (timer[4] == 58):
+		cc1 = cc_pull(cc1)
+		cc2 = cc_pull(cc2)
+		cc3 = cc_pull(cc3)
+		cc4 = cc_pull(cc4)
+		cc5 = cc_pull(cc5)
+		cc6 = cc_pull(cc6)
+		cc7 = cc_pull(cc7)
+		cc8 = cc_pull(cc8)
+		cc9 = cc_pull(cc9)
+		cx1 = cx_pull(cx1)
+		cx2 = cx_pull(cx2)
+		cx3 = cx_pull(cx3)
+		tg1 = tg_pull(tg1)
+		tg2 = tg_pull(tg2)
+		tg3 = tg_pull(tg3)
+		tg4 = tg_pull(tg4)
+		tg5 = tg_pull(tg5)
+		tg6 = tg_pull(tg6)
+		tg7 = tg_pull(tg7)
+		ll1 = ll_pull(ll1)
+		ll2 = ll_pull(ll2)
+		ke1 = ke_pull(ke1)
+		kx1 = kx_pull(kx1)
+		time.sleep(30)	
+	elif (timer[4] == 28):
+		cx1 = cx_pull(cx1)
+		cx2 = cx_pull(cx2)
+		cx3 = cx_pull(cx3)
+		ll1 = ll_pull(ll1)
+		ll2 = ll_pull(ll2)
+		kx1 = kx_pull(kx1)
+		time.sleep(30)	
+	elif (timer[4] == 54 and timer[3] == 23):
+		gm1 = gm_pull(gm1)
+		gm2 = gm_pull(gm2)
+		cb1 = cb_pull(cb1)
+		cb2 = cb_pull(cb2)
+		cb3 = cb_pull(cb3)
+		time.sleep(30)		
+	time.sleep(30)
+
+
+
+
+
--- a/classic-rock/readme-classicrock.txt
+++ b/classic-rock/readme-classicrock.txt
@@ -0,0 +1,38 @@
+classic-rock-raw-data:
+
+Each line represents a play of a song on a radio station. 
+-The first element, RAW_SONG, is the song text scraped from the radio station
+-The second element, Song Clean, is the song's title. It's been made so that all versions 
+	of the RAW_SONG — be they (live) or spelled differently — point to the same text in this
+	field. So even if we scraped "{Don't Fear} The Reaper" or "(Don't Fear) The Reaper"
+	or merely "The Reaper" by Blue Oyster Cult, the text in Song Clean is always "(Don't Fear) The Reaper"
+-The third element, RAW_ARTIST, is the artist text scraped from the radio station
+-The fourth element, ARTIST CLEAN, is a unified version of Raw Artist. So even if we scraped 
+	"Blue Öyster Cult" or "Blue Oyster Cult" or "Blue ?yster Cult", this field would always 
+	read as "Blue Oyster Cult". 
+-The fifth element is the station callsign of the song play
+-The sixth element is the time the song was pulled. Python measures time as seconds since January 1, 1970.
+-The seventh element is a unique ID assigned to each play, formed by the callsign of the
+	station that played it and a four digit number, where 0001 is the last song played on the station
+	in our set and the highest number is the first song we pulled, if you want to order them.
+-The eighth element combines Song Clean and ARTIST CLEAN. It can be used for connecting
+	this data set to the dataset of unique songs.
+-The ninth element is a zero or one used to find if this is the first mention of a given song, 
+	it's pretty pointless. 
+	
+classic-rock-song-list:
+
+Each line represents one song in the set
+-Song Clean is the name of the song 
+-ARTIST CLEAN is the name of the artist
+-Release Year is the release year, according to SongFacts. If there isn't a listed year, I couldn't
+	find an entry for the song on SongFacts.
+-COMBINED is the combined song and artist and can be used to connect this dataset to classic-rock-raw-data
+-First? is always 1
+-Year? is 1 if there was a found year and 0 if no year was found
+-PlayCount is the number of plays of the song across all stations.
+-F*G is the number of plays of the song across all stations, if a year was found.
+
+radio.py is the program to scrape the data from radio sites
+
+compiling_radio.py is the program to consolidate the output of radio.py into one file per station.