diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index b2b634e..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/scripts/ICPSR_id.py b/scripts/ICPSR_id.py
index db84d1d..d602f67 100644
--- a/scripts/ICPSR_id.py
+++ b/scripts/ICPSR_id.py
@@ -12,94 +12,12 @@ import re
 import utils
 import urllib2
 import requests
-from utils import download, load_data, save_data, parse_date
+from utils import download, load_data, save_data, parse_date, states, congress_from_legislative_year, legislative_year
 import json
 import string
 import csv
 import unicodedata
 
-def congress_from_legislative_year(year):
-    return ((year + 1) / 2) - 894
-
-def current_legislative_year(date=None):
-    if not date:
-        date = datetime.datetime.now()
-    year = date.year
-
-    if date.month == 1:
-        if date.day == 1 or date.day == 2:
-            return date.year - 1
-        #yaml has no time data, so can't distinguish between pre/post noon dates. So, since this script is based on start-dates to determine congress numbers, starting anytime on 01-03 is the new congress
-        elif date.day == 3:
-            return date.year
-        else:
-            return date.year
-    else:
-        return date.year
-
-states = {
-    'AK': 'Alaska',
-    'AL': 'Alabama',
-    'AR': 'Arkansas',
-    'AS': 'American Samoa',
-    'AZ': 'Arizona',
-    'CA': 'California',
-    'CO': 'Colorado',
-    'CT': 'Connecticut',
-    'DC': 'District of Columbia',
-    'DE': 'Delaware',
-    'FL': 'Florida',
-    'GA': 'Georgia',
-    'GU': 'Guam',
-    'HI': 'Hawaii',
-    'IA': 'Iowa',
-    'ID': 'Idaho',
-    'IL': 'Illinois',
-    'IN': 'Indiana',
-    'KS': 'Kansas',
-    'KY': 'Kentucky',
-    'LA': 'Louisiana',
-    'MA': 'Massachusetts',
-    'MD': 'Maryland',
-    'ME': 'Maine',
-    'MI': 'Michigan',
-    'MN': 'Minnesota',
-    'MO': 'Missouri',
-    'MP': 'Northern Mariana Islands',
-    'MS': 'Mississippi',
-    'MT': 'Montana',
-    'NA': 'National',
-    'NC': 'North Carolina',
-    'ND': 'North Dakota',
-    'NE': 'Nebraska',
-    'NH': 'New Hampshire',
-    'NJ': 'New Jersey',
-    'NM': 'New Mexico',
-    'NV': 'Nevada',
-    'NY': 'New York',
-    'OH': 'Ohio',
-    'OK': 'Oklahoma',
-    'OR': 'Oregon',
-    'PA': 'Pennsylvania',
-    'PR': 'Puerto Rico',
-    'RI': 'Rhode Island',
-    'SC': 'South Carolina',
-    'SD': 'South Dakota',
-    'TN': 'Tennessee',
-    'TX': 'Texas',
-    'UT': 'Utah',
-    'VA': 'Virginia',
-    'VI': 'Virgin Islands',
-    'VT': 'Vermont',
-    'WA': 'Washington',
-    'WI': 'Wisconsin',
-    'WV': 'West Virginia',
-    'WY': 'Wyoming',
-    'OL': 'Orleans',
-    'DK': 'Dakota',
-    'PI': 'Philippine Islands'
-}
-
 debug = utils.flags().get('debug', False)
 
 # default to caching
@@ -108,11 +26,9 @@ force = not cache
 
 
 only_bioguide = utils.flags().get('bioguide', None)
-
 
 congress = utils.flags().get('congress',None)
 
-
 filename_historical = "legislators-historical.yaml"
 filename_current = "legislators-current.yaml"
 data_files = []
@@ -129,96 +45,96 @@ if congress == None:
     raise Exception("the --congress flag is required")
 elif congress == "113":
     url_senate = "http://amypond.sscnet.ucla.edu/rollcall/static/S113.ord"
-    destination = "icpsr/source/senate_rollcall%s.txt" % congress
-    senate_data = utils.download(url_senate, destination, force)
-
     url_house = "http://amypond.sscnet.ucla.edu/rollcall/static/H113.ord"
-    destination = "icpsr/source/house_rollcall%s.txt" % congress
-    house_data = utils.download(url_house, destination, force)
 elif int(congress) <10 and int(congress) >0:
     url_senate = "ftp://voteview.com/dtaord/sen0%skh.ord" % congress
-    destination = "icpsr/source/senate_rollcall%s.txt" % congress
-    senate_data = utils.download(url_senate, destination, force)
-
     url_house = "ftp://voteview.com/dtaord/hou0%skh.ord" % congress
-    destination = "icpsr/source/house_rollcall%s.txt" % congress
-    house_data = utils.download(url_house, destination, force)
-elif int(congress) < congress_from_legislative_year(current_legislative_year()) and int(congress) >= 10:
+elif int(congress) < 113 and int(congress) >= 10:
     url_senate = "ftp://voteview.com/dtaord/sen%skh.ord" % congress
-    destination = "icpsr/source/senate_rollcall%s.txt" % congress
-    senate_data = utils.download(url_senate, destination, force)
-
     url_house = "ftp://voteview.com/dtaord/hou%skh.ord" % congress
-    destination = "icpsr/source/house_rollcall%s.txt" % congress
-    house_data = utils.download(url_house, destination, force)
 else:
     raise Exception("no data for congress " + congress)
 
-cw = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "wb"))
-cw.writerow(["matches","icpsr_name","icpsr_state","is_territory"])
+senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
+senate_data = utils.download(url_senate, senate_destination, force)
+
+house_destination = "icpsr/source/house_rollcall%s.txt" % congress
+house_data = utils.download(url_house, house_destination, force)
+
+error_log = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "wb"))
+error_log.writerow(["error_type","matches","icpsr_name","icpsr_state","is_territory","old_id","new_id"])
+
+
 read_files = [(senate_data,"sen"),(house_data,"rep")]
 print "Running for congress " + congress
-for r in read_files:
-    for f in data_files:
-        for m in f[0]:
+for read_file in read_files:
+    for data_file in data_files:
+        for legislator in data_file[0]:
             num_matches = 0
             #
             # this can't run unless we've already collected a bioguide for this person
-            bioguide = m["id"].get("bioguide", None)
+            bioguide = legislator["id"].get("bioguide", None)
 
             # if we've limited this to just one bioguide, skip over everyone else
             if only_bioguide and (bioguide != only_bioguide):
-                num_matches += 1
-                continue
-            #skip if icpsr id is currently in data
-            if "icpsr" in m["id"]:
-                num_matches += 1
                 continue
 
             #if not in currently read chamber, skip
-            chamber = m['terms'][len(m['terms'])-1]['type']
-            if chamber != r[1]:
-                num_matches += 1
+            chamber = legislator['terms'][len(legislator['terms'])-1]['type']
+            if chamber != read_file[1]:
                 continue
 
             #only run for selected congress
-            latest_congress = congress_from_legislative_year(current_legislative_year(parse_date(m['terms'][len(m['terms'])-1]['start'])))
+            latest_congress = utils.congress_from_legislative_year(utils.legislative_year(parse_date(legislator['terms'][len(legislator['terms'])-1]['start'])))
             if chamber == "sen":
                 congresses = [latest_congress,latest_congress+1,latest_congress+2]
             else:
                 congresses =[latest_congress]
             if int(congress) not in congresses:
-                num_matches += 1
                 continue
 
             # pull data to match from yaml
-            last_name_unicode = m['name']['last'].upper().strip().replace('\'','')
+            last_name_unicode = legislator['name']['last'].upper().strip().replace('\'','')
             last_name = unicodedata.normalize('NFD', unicode(last_name_unicode)).encode('ascii', 'ignore')
-            state = states[m['terms'][len(m['terms'])-1]['state']].upper()[:7].strip()
+            state = utils.states[legislator['terms'][len(legislator['terms'])-1]['state']].upper()[:7].strip()
 
             # select icpsr source data based on more recent chamber
-
-            lines = r[0].split('\n')
-            for l in lines:
-                disp = False
+
+            write_id = ""
+            lines = read_file[0].split('\n')
+            for line in lines:
                 # parse source data
-                icpsr_state = l[12:20].strip()
-                icpsr_name = l[21:].strip().strip(string.digits).strip()
-                icpsr_id = l[3:8].strip()
+                icpsr_state = line[12:20].strip()
+                icpsr_name = line[21:].strip().strip(string.digits).strip()
+                icpsr_id = line[3:8].strip()
 
                 #ensure unique match
                 if icpsr_name[:8] == last_name[:8] and state == icpsr_state:
                     num_matches += 1
                     write_id = icpsr_id
+            # An ID already on file is only revisited when this congress yields
+            # a different one. Stored IDs are integers while the parsed ID is a
+            # string, so both sides are compared as integers.
+            if "icpsr" in legislator["id"]:
+                if write_id == "" or int(write_id) == int(legislator["id"]["icpsr"]):
+                    continue
+                elif num_matches == 1:
+                    error_log.writerow(["Incorrect_ID","NA",last_name[:8],state,"NA",legislator["id"]["icpsr"],write_id])
+                    print "ID updated for %s" % last_name
             if num_matches == 1:
-                m['id']['icpsr'] = int(write_id)
-            elif num_matches == 0:
-                print "No matches found for " + last_name + ", " + state + "in congress " + str(congress)
-                cw.writerow(["0",last_name,state])
+                legislator['id']['icpsr'] = int(write_id)
             else:
-                if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER":
-                    cw.writerow([str(num_matches),last_name[:8],state,"Y"])
+                if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER" or state == "PHILIPP":
+                    error_log.writerow(["Non_1_match_number",str(num_matches),last_name[:8],state,"Y","NA","NA"])
                 else:
                     print str(num_matches) + " matches found for "+ last_name[:8] + ", " + state + " in congress " + str(congress)
-                    cw.writerow([str(num_matches),last_name,state,"N"])
+                    error_log.writerow(["Non_1_match_number",str(num_matches),last_name,state,"N","NA","NA"])
+
+
-        save_data(f[0], f[1])
\ No newline at end of file
+        save_data(data_file[0], data_file[1])
+
+## the following three lines can be run as a separate script to update icpsr id's for all historical congresses
+# import os
+
+# for i in range(1,114):
+#   os.system("python ICPSR_id.py --congress=" + str(i))
\ No newline at end of file
diff --git a/scripts/utils.py b/scripts/utils.py
index 0edd5ad..b700cd7 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -1,6 +1,69 @@
 # Helpful functions for finding data about members and committees
 
 CURRENT_CONGRESS = 113
+states = {
+    'AK': 'Alaska',
+    'AL': 'Alabama',
+    'AR': 'Arkansas',
+    'AS': 'American Samoa',
+    'AZ': 'Arizona',
+    'CA': 'California',
+    'CO': 'Colorado',
+    'CT': 'Connecticut',
+    'DC': 'District of Columbia',
+    'DE': 'Delaware',
+    'FL': 'Florida',
+    'GA': 'Georgia',
+    'GU': 'Guam',
+    'HI': 'Hawaii',
+    'IA': 'Iowa',
+    'ID': 'Idaho',
+    'IL': 'Illinois',
+    'IN': 'Indiana',
+    'KS': 'Kansas',
+    'KY': 'Kentucky',
+    'LA': 'Louisiana',
+    'MA': 'Massachusetts',
+    'MD': 'Maryland',
+    'ME': 'Maine',
+    'MI': 'Michigan',
+    'MN': 'Minnesota',
+    'MO': 'Missouri',
+    'MP': 'Northern Mariana Islands',
+    'MS': 'Mississippi',
+    'MT': 'Montana',
+    'NA': 'National',
+    'NC': 'North Carolina',
+    'ND': 'North Dakota',
+    'NE': 'Nebraska',
+    'NH': 'New Hampshire',
+    'NJ': 'New Jersey',
+    'NM': 'New Mexico',
+    'NV': 'Nevada',
+    'NY': 'New York',
+    'OH': 'Ohio',
+    'OK': 'Oklahoma',
+    'OR': 'Oregon',
+    'PA': 'Pennsylvania',
+    'PR': 'Puerto Rico',
+    'RI': 'Rhode Island',
+    'SC': 'South Carolina',
+    'SD': 'South Dakota',
+    'TN': 'Tennessee',
+    'TX': 'Texas',
+    'UT': 'Utah',
+    'VA': 'Virginia',
+    'VI': 'Virgin Islands',
+    'VT': 'Vermont',
+    'WA': 'Washington',
+    'WI': 'Wisconsin',
+    'WV': 'West Virginia',
+    'WY': 'Wyoming',
+    'OL': 'Orleans',
+    'DK': 'Dakota',
+    'PI': 'Philippine Islands'
+}
+
 
 
 import os, errno, sys, traceback
@@ -8,6 +71,35 @@ import re, htmlentitydefs
 import pprint
 from datetime import datetime
 
+def current_congress():
+    """Return the number of the Congress for the current legislative year."""
+    year = legislative_year()
+    return congress_from_legislative_year(year)
+
+def congress_from_legislative_year(year):
+    """Return the number of the Congress sitting in legislative year `year`."""
+    # Two legislative years map to one Congress (2013 and 2014 -> 113);
+    # floor division keeps the result an integer.
+    return ((year + 1) // 2) - 894
+
+def legislative_year(date=None):
+    """Return the legislative year that `date` falls in (default: now).
+
+    January 1st and 2nd, and January 3rd before noon, still count towards
+    the previous year. A plain date has no time of day, so January 3rd is
+    then counted as the new year.
+    """
+    if not date:
+        date = datetime.now()
+
+    if date.month == 1:
+        if date.day == 1 or date.day == 2:
+            return date.year - 1
+        elif date.day == 3 and isinstance(date, datetime) and date.hour < 12:
+            return date.year - 1
+
+    return date.year
+
 def parse_date(date):
     return datetime.strptime(date, "%Y-%m-%d").date()
 