classic rock data and code

This commit is contained in:
andrewflowers
2014-07-06 19:07:25 -04:00
parent b6f652243d
commit a4f0c674e2
5 changed files with 557 additions and 0 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,110 @@
def hr_pull(x,y):
    """Merge a station's numbered pull files into one ``<callsign>.txt``.

    x is a station tuple whose third element (``x[2]``) is the callsign;
    the other elements are not used here.  y is the number of numbered
    files to merge.  Files ``<callsign>001.txt`` .. ``<callsign>NNN.txt``
    (NNN = y, zero-padded to three digits) are read from the current
    directory and written to ``<callsign>.txt`` in reverse numeric order,
    each chunk followed by a newline.

    A missing input file raises the usual IOError/OSError from ``open``.
    """
    callsign = x[2]
    chunks = []
    iteration = 0
    while iteration < y:
        iteration = iteration + 1
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` closes the handle even if read() fails part-way through.
        with open(filename, "r") as readfile:
            chunks.append(readfile.read())
    # The highest-numbered file goes first in the merged output.
    chunks.reverse()
    with open(callsign + ".txt", "w") as writefile:
        writefile.write("".join(chunk + "\n" for chunk in chunks))
def hh_pull(x,y):
    """Merge a station's numbered pull files into one ``<callsign>.txt``.

    x is a station tuple whose third element (``x[2]``) is the callsign;
    the other elements are not used here.  y is the number of numbered
    files to merge.  Files ``<callsign>001.txt`` .. ``<callsign>NNN.txt``
    (NNN = y, zero-padded to three digits) are read from the current
    directory and written to ``<callsign>.txt`` in reverse numeric order,
    each chunk followed by a newline.

    A missing input file raises the usual IOError/OSError from ``open``.
    """
    callsign = x[2]
    chunks = []
    iteration = 0
    while iteration < y:
        iteration = iteration + 1
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` closes the handle even if read() fails part-way through.
        with open(filename, "r") as readfile:
            chunks.append(readfile.read())
    # The highest-numbered file goes first in the merged output.
    chunks.reverse()
    with open(callsign + ".txt", "w") as writefile:
        writefile.write("".join(chunk + "\n" for chunk in chunks))
def dy_pull(x,y):
    """Merge a station's numbered pull files into one ``<callsign>.txt``.

    x is a station tuple whose third element (``x[2]``) is the callsign;
    the other elements are not used here.  y is the number of numbered
    files to merge.  Files ``<callsign>001.txt`` .. ``<callsign>NNN.txt``
    (NNN = y, zero-padded to three digits) are read from the current
    directory and written to ``<callsign>.txt`` in reverse numeric order,
    each chunk followed by a newline.

    A missing input file raises the usual IOError/OSError from ``open``.
    """
    callsign = x[2]
    chunks = []
    iteration = 0
    while iteration < y:
        iteration = iteration + 1
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` closes the handle even if read() fails part-way through.
        with open(filename, "r") as readfile:
            chunks.append(readfile.read())
    # The highest-numbered file goes first in the merged output.
    chunks.reverse()
    with open(callsign + ".txt", "w") as writefile:
        writefile.write("".join(chunk + "\n" for chunk in chunks))
# Station descriptors: (playlist URL, iteration counter, callsign, last song).
# Only the callsign (index 2) is read by the *_pull merge functions.
cc1 = ("http://www.q1043.com/services/now_playing.html?streamId=1465&limit=25", 0, "WAXQ", "")
cc2 = ("http://www.lonestar925.com/services/now_playing.html?streamId=3379&limit=25", 0, "KZPS", "")
cc3 = ("http://www.wbig.com/services/now_playing.html?streamId=2505&limit=25", 0, "WBIG", "")
cc4 = ("http://www.big1059.com/services/now_playing.html?streamId=557&limit=25", 0, "WBGG", "")
cc5 = ("http://www.thefox.com/services/now_playing.html?streamId=393&limit=25", 0, "KRFX", "")
cc6 = ("http://www.dve.com/services/now_playing.html?streamId=2017&limit=25", 0, "WDVE", "")
cc7 = ("http://www.wrfx.com/services/now_playing.html?streamId=1613&limit=25", 0, "WRFX", "")
cc8 = ("http://www.kzep.com/services/now_playing.html?streamId=4051&limit=25", 0, "KZEP", "")
cc9 = ("http://www.101kgb.com/services/now_playing.html?streamId=237&limit=25", 0, "KGB", "")
gm1 = ("http://www.wcsx.com/recentlyplayed.aspx", 0, "WCSX", "")
gm2 = ("http://www.wmgk.com/broadcasthistory.aspx", 0, "WMGK", "")
cx1 = ("http://www.1073theeagle.com/lsp/", 0, "WXGL", "")
cx2 = ("http://www.houstonseagle.com/lsp/", 0, "KGLK", "")
cx3 = ("http://www.971theriver.com/lsp/", 0, "WSRV", "")
cb1 = ("http://wzlx.cbslocal.com/playlist/", 0, "WZLX", "")
cb2 = ("http://wncx.cbslocal.com/playlist/", 0, "WNCX", "")
cb3 = ("http://kzok.cbslocal.com/playlist/", 0, "KZOK", "")
tg1 = ("http://wlup.tunegenie.com/onair/", 0, "WLUP", "")
tg2 = ("http://wofx.tunegenie.com/onair/", 0, "WOFX", "")
tg3 = ("http://kgon.tunegenie.com/onair/", 0, "KGON", "")
tg4 = ("http://kcfx.tunegenie.com/onair/", 0, "KCFX", "")
tg5 = ("http://klos.tunegenie.com/onair/", 0, "KLOS", "")
tg6 = ("http://kseg.tunegenie.com/onair/", 0, "KSEG", "")
tg7 = ("http://kufx.tunegenie.com/onair/", 0, "KUFX", "")
ll1 = ("http://player.listenlive.co/24751/en/songhistory", 0, "KQRS", "")
ll2 = ("http://player.listenlive.co/25951/en/songhistory", 0, "KSAN", "")
ke1 = ("http://www.kshe95.com/broadcasthistory", 0, "KSHE", "")
kx1 = ("http://kslx.com/playlist", 0, "KSLX", "")

# Stations merged from 121 numbered files each.
for station in (cc1, cc2, cc3, cc4, cc5, cc6, cc7, cc8, cc9,
                tg1, tg2, tg3, tg4, tg5, tg6, tg7, ke1):
    hr_pull(station, 121)

# Stations merged from 241 numbered files each.
for station in (cx1, cx2, cx3, ll1, ll2, kx1):
    hh_pull(station, 241)

# Stations merged from 6 numbered files each.
for station in (gm1, gm2, cb1, cb2, cb3):
    dy_pull(station, 6)

print("Done")

407
classic-rock/radio.py Normal file
View File

@@ -0,0 +1,407 @@
import time
import urllib2
import sys
"""
next step: make it create new file each time, run cleanup op
"""
def cc_pull(x):
    """Fetch a "now playing" feed once and log the plays not yet seen.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    The page at ``url`` is downloaded in full and at most 20 tracks are
    parsed from it.  Each track is written as
    ``song|artist|callsign|unix_time`` to ``<callsign><NNN>.txt`` where NNN
    is ``iteration + 1`` zero-padded to three digits.  Parsing stops early
    when a title equals ``last_song`` or when the page has no further
    track.

    Returns ``(url, iteration + 1, callsign, new_last_song)`` on success,
    where ``new_last_song`` is the first title parsed from this page (or
    the old value if that title was already ``last_song``).  On any
    error the original ``x`` is returned so the same iteration number is
    reused by the next call.  Both paths pause three seconds before
    returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # The context manager closes the output file even when the fetch
        # or the parse raises; the handle used to leak on every failure.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            page = urllib2.urlopen(url).read()
            counter = 0
            while counter < 20:
                # Position of the separator in front of the *next* track,
                # or -1 when the current track is the last one on the page.
                offset = page.find('}},{"track":')
                song = page[page.find('":"') + 3:page.find('","')]
                artist = page[page.find('artistName":"') + 13:page.find('","amgArtistId"')]
                # The feed escapes "/" as "\/"; undo that.
                song = song.replace("\\/", "/")
                artist = artist.replace("\\/", "/")
                counter = counter + 1
                entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                if song == last_song:
                    break
                if counter == 1:
                    new_last_song = song
                record.write(entry)
                if offset == -1:
                    # No further track: slicing with -1 would re-parse and
                    # re-write this same track until the counter hit 20.
                    break
                page = page[offset + 3:]
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
def gm_pull(x):
    """Fetch a "recently played" page once and log every song line on it.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    Up to 10000 lines are read from ``url``; every line containing ``" -``
    is parsed as ``"<song>" - <artist>`` and written as
    ``song|artist|callsign|unix_time`` to ``<callsign><NNN>.txt`` where NNN
    is ``iteration + 1`` zero-padded to three digits.  Unlike most of the
    other pullers, no comparison against ``last_song`` is made, so every
    matching line is written.

    Returns ``(url, iteration + 1, callsign, last_song)`` on success and
    the unchanged ``x`` on any error.  Both paths pause three seconds
    before returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` guarantees the output file is closed if the fetch fails.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            response = urllib2.urlopen(url)
            counter = 0
            while counter < 10000:
                line = response.readline()
                if not line:
                    # Empty string means end of the response; nothing more
                    # can match, so stop instead of spinning to 10000.
                    break
                if '" -' in line:
                    song = line[line.find('"') + 1:line.find(" -") - 1]
                    artist = line[line.find("- ") + 1:]
                    artist = artist.strip()
                    entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                    record.write(entry)
                    # TODO: four lines further down, a line containing
                    # "Visit iTunes" carries an href; write those iTunes
                    # links to a separate file and later use them to grab
                    # album release years.
                counter = counter + 1
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
def cx_pull(x):
    """Fetch an "lsp" playlist page once and log the plays not yet seen.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    Up to 10000 lines are read from ``url``; each line containing
    ``cmPlaylistContent`` yields a song and an artist, written as
    ``song|artist|callsign|unix_time`` to ``<callsign><NNN>.txt`` where NNN
    is ``iteration + 1`` zero-padded to three digits.  Reading stops as
    soon as a title equals ``last_song``.

    Returns ``(url, iteration + 1, callsign, new_last_song)`` on success,
    where ``new_last_song`` is the first title written by this call (or
    the old value if nothing was written).  On any error the unchanged
    ``x`` is returned.  Both paths pause three seconds before returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` guarantees the output file is closed if the fetch fails.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            response = urllib2.urlopen(url)
            counter = 0
            first = True
            while counter < 10000:
                line = response.readline()
                if not line:
                    # End of the response: stop instead of spinning on.
                    break
                if 'cmPlaylistContent' in line:
                    song = line[line.find('/">') + 3:line.find("</a></strong>")]
                    artist = line[line.find("alt=") + 5:line.find('" class="')]
                    artist = artist.strip()
                    song = song.replace("&#39;", "'")
                    artist = artist.replace("&#39;", "'")
                    entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                    if song == last_song:
                        break
                    if first:
                        # Remember the first title for the next pull.
                        new_last_song = song
                        first = False
                    record.write(entry)
                # TODO: after a "Download Song:" line, the second following
                # line may hold an Apple/iTunes href; write those links to
                # a separate file and later grab album release years.
                counter = counter + 1
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
def cb_pull(x):
    """Fetch a "playlist" page once and log every track block on it.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    Up to 10000 loop iterations read lines from ``url``.  A line containing
    ``<div class="track_title"`` supplies the song (its ``rel=`` value);
    the artist is taken from the ``rel=`` value two lines further down.
    Each track is written as ``song|artist|callsign|unix_time`` to
    ``<callsign><NNN>.txt`` where NNN is ``iteration + 1`` zero-padded to
    three digits.  No comparison against ``last_song`` is made, so every
    track block is written.

    Returns ``(url, iteration + 1, callsign, last_song)`` on success and
    the unchanged ``x`` on any error.  Both paths pause three seconds
    before returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` guarantees the output file is closed if the fetch fails.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            response = urllib2.urlopen(url)
            counter = 0
            while counter < 10000:
                line = response.readline()
                if not line:
                    # End of the response: stop instead of spinning on.
                    break
                if '<div class="track_title"' in line:
                    song = line[line.find('rel=') + 5:line.find('">')]
                    response.readline()
                    line = response.readline()
                    artist = line[line.find('rel=') + 5:line.find('">')]
                    # Skip the next two lines (the second one holds the
                    # album, which is not part of the logged entry) so the
                    # read position matches the original parser.
                    response.readline()
                    response.readline()
                    song = song.replace("&#039;", "'")
                    artist = artist.replace("&#039;", "'")
                    entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                    record.write(entry)
                counter = counter + 1
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
def tg_pull(x):
    """Fetch an "onair" page once and log the plays not yet seen.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    Lines are read from ``url`` until a counter reaches 10000.  A line
    containing ``<div class="song">`` followed by plain text supplies the
    song; the artist comes from the next line.  Lines where the song div
    is immediately followed by another tag are skipped.  Each play is
    written as ``song|artist|callsign|unix_time`` to
    ``<callsign><NNN>.txt`` where NNN is ``iteration + 1`` zero-padded to
    three digits.  Reading stops as soon as a title equals ``last_song``.

    Returns ``(url, iteration + 1, callsign, new_last_song)`` on success,
    where ``new_last_song`` is the first title written by this call (or
    the old value if nothing was written).  On any error the unchanged
    ``x`` is returned.  Both paths pause three seconds before returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` guarantees the output file is closed if the fetch fails.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            response = urllib2.urlopen(url)
            counter = 0
            first = True
            while counter < 10000:
                line = response.readline()
                if not line:
                    # End of the response: stop instead of spinning on.
                    break
                if '<div class="song"><' in line:
                    # Song div that starts with nested markup: skipped.
                    # It counts twice toward the cap, as in the original.
                    counter = counter + 1
                elif '<div class="song">' in line:
                    song = line[line.find('"song">') + 7:line.find('</div>')]
                    line = response.readline()
                    artist = line[line.find('<div>') + 5:line.find(' <span')]
                    # Undo the HTML entities used by the page.
                    song = song.replace("&#39;", "'")
                    artist = artist.replace("&#39;", "'")
                    song = song.replace("&amp;", "&")
                    artist = artist.replace("&amp;", "&")
                    entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                    if song == last_song:
                        break
                    if first:
                        # Remember the first title for the next pull.
                        new_last_song = song
                        first = False
                    record.write(entry)
                counter = counter + 1
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
def ll_pull(x):
    """Fetch a "songhistory" page once and log up to ten songs from it.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    Up to 10000 lines are read from ``url`` looking for the first line
    containing ``var songs = ``.  At most ten ``"title"``/``"artist"``
    pairs are extracted from that line and written as
    ``song|artist|callsign|unix_time`` to ``<callsign><NNN>.txt`` where NNN
    is ``iteration + 1`` zero-padded to three digits.  ``last_song`` is
    not compared against, so songs already logged by an earlier pull are
    written again.

    Returns ``(url, iteration + 1, callsign, new_last_song)`` on success,
    where ``new_last_song`` is the last title written (or the old value if
    nothing was written).  On any error the unchanged ``x`` is returned.
    Both paths pause three seconds before returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` guarantees the output file is closed if the fetch fails.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            response = urllib2.urlopen(url)
            counter = 0
            while counter < 10000:
                line = response.readline()
                if not line:
                    # End of the response: stop instead of spinning on.
                    break
                if 'var songs = ' in line:
                    tencount = 0
                    while tencount < 10:
                        if '"title":"' not in line:
                            # Fewer than ten songs in the feed.  Without
                            # this guard the failed find() calls produced
                            # empty "||callsign|time" rows.
                            break
                        song = line[line.find('"title":"') + 9:line.find('","')]
                        line = line[line.find('"artist":') + 10:]
                        artist = line[:line.find('"')]
                        # Advance to the next record of the songs array.
                        line = line[line.find('},{"timestamp":'):]
                        entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                        new_last_song = song
                        record.write(entry)
                        tencount = tencount + 1
                    # Only the first "var songs" line is processed.
                    break
                counter = counter + 1
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
def kx_pull(x):
    """Fetch a "playlist" page once and log the plays not yet seen.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    Up to 10000 loop iterations read lines from ``url``.  A line containing
    ``play-song`` supplies the song (text between the first ``>`` and the
    first ``</``); the artist follows ``by `` on the next line.  Each play
    is written as ``song|artist|callsign|unix_time`` to
    ``<callsign><NNN>.txt`` where NNN is ``iteration + 1`` zero-padded to
    three digits.  Reading stops as soon as a title equals ``last_song``.

    Returns ``(url, iteration + 1, callsign, new_last_song)`` on success,
    where ``new_last_song`` is the first title written by this call (or
    the old value if nothing was written).  On any error the unchanged
    ``x`` is returned.  Both paths pause three seconds before returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` guarantees the output file is closed if the fetch fails.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            response = urllib2.urlopen(url)
            counter = 0
            first = True
            while counter < 10000:
                line = response.readline()
                if not line:
                    # End of the response: stop instead of spinning on.
                    break
                if 'play-song' in line:
                    song = line[line.find('>') + 1:line.find("</")]
                    line = response.readline()
                    artist = line[line.find('by ') + 3:line.find('</')]
                    entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                    if song == last_song:
                        break
                    if first:
                        # Remember the first title for the next pull.
                        new_last_song = song
                        first = False
                    record.write(entry)
                counter = counter + 1
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
def ke_pull(x):
    """Fetch a "broadcasthistory" page once and log the plays not yet seen.

    x is a station-state tuple ``(url, iteration, callsign, last_song)``.
    Up to 10000 loop iterations read lines from ``url``.  A line containing
    ``views-field-field-title`` supplies the song; the artist is the
    ``<span>`` content of the next line.  Each play is written as
    ``song|artist|callsign|unix_time`` to ``<callsign><NNN>.txt`` where NNN
    is ``iteration + 1`` zero-padded to three digits.  Reading stops as
    soon as a title equals ``last_song``.

    Returns ``(url, iteration + 1, callsign, new_last_song)`` on success,
    where ``new_last_song`` is the first title written by this call (or
    the old value if nothing was written).  On any error the unchanged
    ``x`` is returned.  Both paths pause three seconds before returning.
    """
    try:
        iteration = x[1] + 1
        url = x[0]
        callsign = x[2]
        filename = callsign + str(iteration).rjust(3, '0') + ".txt"
        # ``with`` guarantees the output file is closed if the fetch fails.
        with open(filename, "w") as record:
            last_song = x[3]
            new_last_song = last_song
            response = urllib2.urlopen(url)
            counter = 0
            first = True
            while counter < 10000:
                line = response.readline()
                if not line:
                    # End of the response: stop instead of spinning on.
                    break
                if 'views-field-field-title' in line:
                    song = line[line.find('field-content">') + 15:line.find("</div>")]
                    line = response.readline()
                    artist = line[line.find('<span>') + 6:line.find('</span>')]
                    song = song.replace("&#039;", "'")
                    artist = artist.replace("&#039;", "'")
                    entry = song + "|" + artist + "|" + callsign + "|" + str(time.time()) + "\n"
                    if song == last_song:
                        break
                    if first:
                        # Remember the first title for the next pull.
                        new_last_song = song
                        first = False
                    record.write(entry)
                counter = counter + 1
        y = (x[0], iteration, x[2], new_last_song)
        time.sleep(3)
        return y
    except Exception:
        # Best effort: keep the old state so the pull is simply retried.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(3)
        return x
# Station state tuples: (playlist URL, iteration counter, callsign, last song).
# Each *_pull call takes one of these and hands back the next state, which
# is stored under the same name.
cc1 = ("http://www.q1043.com/services/now_playing.html?streamId=1465&limit=25", 0, "WAXQ", "")
cc2 = ("http://www.lonestar925.com/services/now_playing.html?streamId=3379&limit=25", 0, "KZPS", "")
cc3 = ("http://www.wbig.com/services/now_playing.html?streamId=2505&limit=25", 0, "WBIG", "")
cc4 = ("http://www.big1059.com/services/now_playing.html?streamId=557&limit=25", 0, "WBGG", "")
cc5 = ("http://www.thefox.com/services/now_playing.html?streamId=393&limit=25", 0, "KRFX", "")
cc6 = ("http://www.dve.com/services/now_playing.html?streamId=2017&limit=25", 0, "WDVE", "")
cc7 = ("http://www.wrfx.com/services/now_playing.html?streamId=1613&limit=25", 0, "WRFX", "")
cc8 = ("http://www.kzep.com/services/now_playing.html?streamId=4051&limit=25", 0, "KZEP", "")
cc9 = ("http://www.101kgb.com/services/now_playing.html?streamId=237&limit=25", 0, "KGB", "")
gm1 = ("http://www.wcsx.com/recentlyplayed.aspx", 0, "WCSX", "")
gm2 = ("http://www.wmgk.com/broadcasthistory.aspx", 0, "WMGK", "")
cx1 = ("http://www.1073theeagle.com/lsp/", 0, "WXGL", "")
cx2 = ("http://www.houstonseagle.com/lsp/", 0, "KGLK", "")
cx3 = ("http://www.971theriver.com/lsp/", 0, "WSRV", "")
cb1 = ("http://wzlx.cbslocal.com/playlist/", 0, "WZLX", "")
cb2 = ("http://wncx.cbslocal.com/playlist/", 0, "WNCX", "")
cb3 = ("http://kzok.cbslocal.com/playlist/", 0, "KZOK", "")
tg1 = ("http://wlup.tunegenie.com/onair/", 0, "WLUP", "")
tg2 = ("http://wofx.tunegenie.com/onair/", 0, "WOFX", "")
tg3 = ("http://kgon.tunegenie.com/onair/", 0, "KGON", "")
tg4 = ("http://kcfx.tunegenie.com/onair/", 0, "KCFX", "")
tg5 = ("http://klos.tunegenie.com/onair/", 0, "KLOS", "")
tg6 = ("http://kseg.tunegenie.com/onair/", 0, "KSEG", "")
tg7 = ("http://kufx.tunegenie.com/onair/", 0, "KUFX", "")
ll1 = ("http://player.listenlive.co/24751/en/songhistory", 0, "KQRS", "")
ll2 = ("http://player.listenlive.co/25951/en/songhistory", 0, "KSAN", "")
ke1 = ("http://www.kshe95.com/broadcasthistory", 0, "KSHE", "")
kx1 = ("http://kslx.com/playlist", 0, "KSLX", "")

# Poll the wall clock forever and run the pulls that are due this minute.
while True:
    now = time.time()
    timer = time.localtime(now)
    minute = timer.tm_min
    hour = timer.tm_hour
    if minute == 58:
        # Two minutes before the hour: every station except the daily ones.
        cc1, cc2, cc3, cc4, cc5, cc6, cc7, cc8, cc9 = [
            cc_pull(state)
            for state in (cc1, cc2, cc3, cc4, cc5, cc6, cc7, cc8, cc9)
        ]
        cx1, cx2, cx3 = [cx_pull(state) for state in (cx1, cx2, cx3)]
        tg1, tg2, tg3, tg4, tg5, tg6, tg7 = [
            tg_pull(state)
            for state in (tg1, tg2, tg3, tg4, tg5, tg6, tg7)
        ]
        ll1, ll2 = [ll_pull(state) for state in (ll1, ll2)]
        ke1 = ke_pull(ke1)
        kx1 = kx_pull(kx1)
        time.sleep(30)
    elif minute == 28:
        # Two minutes before the half hour: the stations pulled twice an hour.
        cx1, cx2, cx3 = [cx_pull(state) for state in (cx1, cx2, cx3)]
        ll1, ll2 = [ll_pull(state) for state in (ll1, ll2)]
        kx1 = kx_pull(kx1)
        time.sleep(30)
    elif hour == 23 and minute == 54:
        # Once a day at 23:54: the stations pulled daily.
        gm1, gm2 = [gm_pull(state) for state in (gm1, gm2)]
        cb1, cb2, cb3 = [cb_pull(state) for state in (cb1, cb2, cb3)]
        time.sleep(30)
    # Base polling interval; a branch above adds another 30 seconds first.
    time.sleep(30)

View File

@@ -0,0 +1,38 @@
classic-rock-raw-data:
Each line represents a play of a song on a radio station.
-The first element, RAW_SONG, is the song text scraped from the radio station
-The second element, Song Clean, is the song's title. It has been normalized so that all versions
of the RAW_SONG — be they marked "(live)" or spelled differently — point to the same text in this
field. So even if we scraped "{Don't Fear} The Reaper" or "(Don't Fear) The Reaper"
or merely "The Reaper" by Blue Oyster Cult, the text in Song Clean is always "(Don't Fear) The Reaper"
-The third element, RAW_ARTIST, is the artist text scraped from the radio station
-The fourth element, ARTIST CLEAN, is a unified version of Raw Artist. So even if we scraped
"Blue Öyster Cult" or "Blue Oyster Cult" or "Blue ?yster Cult", this field would always
read as "Blue Oyster Cult".
-The fifth element is the callsign of the station that played the song
-The sixth element is the time the song was pulled. Python measures time as seconds since January 1, 1970.
-The seventh element is a unique ID assigned to each play, formed by the callsign of the
station that played it and a four digit number, where 0001 is the last song played on the station
in our set and the highest number is the first song we pulled, if you want to order them.
-The eighth element combines Song Clean and ARTIST CLEAN. It can be used for connecting
this data set to the dataset of unique songs.
-The ninth element is a zero or one used to find if this is the first mention of a given song,
it's pretty pointless.
classic-rock-song-list:
Each line represents one song in the set
-Song Clean is the name of the song
-ARTIST CLEAN is the name of the artist
-Release Year is the release year, according to SongFacts. If there isn't a listed year, I couldn't
find an entry for the song on SongFacts
-COMBINED is the combined song and artist and can be used to connect this dataset to classic-rock-raw-data
-First? is always 1
-Year? is 1 if there was a found year and 0 if no year was found
-PlayCount is the number of plays of the song across all stations.
-F*G is the number of plays of the song across all stations, if a year was found.
radio.py is the program to scrape the data from radio sites
compiling_radio.py is the program to consolidate the output of radio.py into one file per station.