mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2026-05-12 03:00:10 -04:00
updated icpsr.py
This commit is contained in:
@@ -12,94 +12,12 @@ import re
|
||||
import utils
|
||||
import urllib2
|
||||
import requests
|
||||
from utils import download, load_data, save_data, parse_date
|
||||
from utils import download, load_data, save_data, parse_date, states, congress_from_legislative_year, legislative_year
|
||||
import json
|
||||
import string
|
||||
import csv
|
||||
import unicodedata
|
||||
|
||||
def congress_from_legislative_year(year):
    """Return the number of the Congress in session during a legislative year.

    Each Congress spans two legislative years, so consecutive odd/even years
    (e.g. 2013 and 2014) map to the same Congress number (113).

    `year` is an integer legislative year; the result is an integer.
    """
    # Floor division keeps the result an integer even when "/" means true
    # division (Python 3, or `from __future__ import division`).
    return ((year + 1) // 2) - 894
|
||||
|
||||
def current_legislative_year(date=None):
    """Return the legislative year that `date` falls in.

    `date` may be a `datetime.date` or `datetime.datetime`; when omitted (or
    falsy) the current local time is used.

    January 1 and January 2 are counted as part of the previous legislative
    year; every other day belongs to its own calendar year.
    """
    if not date:
        date = datetime.datetime.now()

    # yaml has no time data, so pre/post-noon dates can't be distinguished.
    # Since this script uses start dates to determine congress numbers, any
    # time on 01-03 is treated as the new congress.
    if date.month == 1 and date.day in (1, 2):
        return date.year - 1
    return date.year
|
||||
|
||||
# Two-letter jurisdiction codes mapped to their full names. Besides the
# states this includes DC, the territories, and a few non-standard codes
# (NA, OL, DK, PI) -- presumably historical jurisdictions; confirm against
# the legislator data before relying on them.
states = dict([
    ('AK', 'Alaska'),
    ('AL', 'Alabama'),
    ('AR', 'Arkansas'),
    ('AS', 'American Samoa'),
    ('AZ', 'Arizona'),
    ('CA', 'California'),
    ('CO', 'Colorado'),
    ('CT', 'Connecticut'),
    ('DC', 'District of Columbia'),
    ('DE', 'Delaware'),
    ('FL', 'Florida'),
    ('GA', 'Georgia'),
    ('GU', 'Guam'),
    ('HI', 'Hawaii'),
    ('IA', 'Iowa'),
    ('ID', 'Idaho'),
    ('IL', 'Illinois'),
    ('IN', 'Indiana'),
    ('KS', 'Kansas'),
    ('KY', 'Kentucky'),
    ('LA', 'Louisiana'),
    ('MA', 'Massachusetts'),
    ('MD', 'Maryland'),
    ('ME', 'Maine'),
    ('MI', 'Michigan'),
    ('MN', 'Minnesota'),
    ('MO', 'Missouri'),
    ('MP', 'Northern Mariana Islands'),
    ('MS', 'Mississippi'),
    ('MT', 'Montana'),
    ('NA', 'National'),
    ('NC', 'North Carolina'),
    ('ND', 'North Dakota'),
    ('NE', 'Nebraska'),
    ('NH', 'New Hampshire'),
    ('NJ', 'New Jersey'),
    ('NM', 'New Mexico'),
    ('NV', 'Nevada'),
    ('NY', 'New York'),
    ('OH', 'Ohio'),
    ('OK', 'Oklahoma'),
    ('OR', 'Oregon'),
    ('PA', 'Pennsylvania'),
    ('PR', 'Puerto Rico'),
    ('RI', 'Rhode Island'),
    ('SC', 'South Carolina'),
    ('SD', 'South Dakota'),
    ('TN', 'Tennessee'),
    ('TX', 'Texas'),
    ('UT', 'Utah'),
    ('VA', 'Virginia'),
    ('VI', 'Virgin Islands'),
    ('VT', 'Vermont'),
    ('WA', 'Washington'),
    ('WI', 'Wisconsin'),
    ('WV', 'West Virginia'),
    ('WY', 'Wyoming'),
    ('OL', 'Orleans'),
    ('DK', 'Dakota'),
    ('PI', 'Philippine Islands'),
])
|
||||
|
||||
debug = utils.flags().get('debug', False)
|
||||
|
||||
# default to caching
|
||||
@@ -108,11 +26,9 @@ force = not cache
|
||||
|
||||
|
||||
only_bioguide = utils.flags().get('bioguide', None)
|
||||
|
||||
congress = utils.flags().get('congress',None)
|
||||
|
||||
|
||||
|
||||
filename_historical = "legislators-historical.yaml"
|
||||
filename_current = "legislators-current.yaml"
|
||||
data_files = []
|
||||
@@ -129,96 +45,93 @@ if congress == None:
|
||||
raise Exception("the --congress flag is required")
|
||||
elif congress == "113":
|
||||
url_senate = "http://amypond.sscnet.ucla.edu/rollcall/static/S113.ord"
|
||||
destination = "icpsr/source/senate_rollcall%s.txt" % congress
|
||||
senate_data = utils.download(url_senate, destination, force)
|
||||
|
||||
url_house = "http://amypond.sscnet.ucla.edu/rollcall/static/H113.ord"
|
||||
destination = "icpsr/source/house_rollcall%s.txt" % congress
|
||||
house_data = utils.download(url_house, destination, force)
|
||||
elif int(congress) <10 and int(congress) >0:
|
||||
url_senate = "ftp://voteview.com/dtaord/sen0%skh.ord" % congress
|
||||
destination = "icpsr/source/senate_rollcall%s.txt" % congress
|
||||
senate_data = utils.download(url_senate, destination, force)
|
||||
|
||||
url_house = "ftp://voteview.com/dtaord/hou0%skh.ord" % congress
|
||||
destination = "icpsr/source/house_rollcall%s.txt" % congress
|
||||
house_data = utils.download(url_house, destination, force)
|
||||
elif int(congress) < congress_from_legislative_year(current_legislative_year()) and int(congress) >= 10:
|
||||
elif int(congress) < 113 and int(congress) >= 10:
|
||||
url_senate = "ftp://voteview.com/dtaord/sen%skh.ord" % congress
|
||||
destination = "icpsr/source/senate_rollcall%s.txt" % congress
|
||||
senate_data = utils.download(url_senate, destination, force)
|
||||
|
||||
url_house = "ftp://voteview.com/dtaord/hou%skh.ord" % congress
|
||||
destination = "icpsr/source/house_rollcall%s.txt" % congress
|
||||
house_data = utils.download(url_house, destination, force)
|
||||
else:
|
||||
raise Exception("no data for congress " + congress)
|
||||
|
||||
cw = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "wb"))
|
||||
cw.writerow(["matches","icpsr_name","icpsr_state","is_territory"])
|
||||
senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
|
||||
senate_data = utils.download(url_senate, senate_destination, force)
|
||||
|
||||
house_destination = "icpsr/source/house_rollcall%s.txt" % congress
|
||||
house_data = utils.download(url_house, house_destination, force)
|
||||
|
||||
error_log = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "wb"))
|
||||
error_log.writerow(["error_type","matches","icpsr_name","icpsr_state","is_territory","old_id","new_id"])
|
||||
|
||||
|
||||
|
||||
read_files = [(senate_data,"sen"),(house_data,"rep")]
|
||||
print "Running for congress " + congress
|
||||
for r in read_files:
|
||||
for f in data_files:
|
||||
for m in f[0]:
|
||||
for read_file in read_files:
|
||||
for data_file in data_files:
|
||||
for legislator in data_file[0]:
|
||||
num_matches = 0
|
||||
# # this can't run unless we've already collected a bioguide for this person
|
||||
bioguide = m["id"].get("bioguide", None)
|
||||
bioguide = legislator["id"].get("bioguide", None)
|
||||
# if we've limited this to just one bioguide, skip over everyone else
|
||||
if only_bioguide and (bioguide != only_bioguide):
|
||||
num_matches += 1
|
||||
continue
|
||||
#skip if icpsr id is currently in data
|
||||
if "icpsr" in m["id"]:
|
||||
num_matches += 1
|
||||
continue
|
||||
#if not in currently read chamber, skip
|
||||
chamber = m['terms'][len(m['terms'])-1]['type']
|
||||
if chamber != r[1]:
|
||||
num_matches += 1
|
||||
chamber = legislator['terms'][len(legislator['terms'])-1]['type']
|
||||
if chamber != read_file[1]:
|
||||
continue
|
||||
|
||||
#only run for selected congress
|
||||
latest_congress = congress_from_legislative_year(current_legislative_year(parse_date(m['terms'][len(m['terms'])-1]['start'])))
|
||||
latest_congress = utils.congress_from_legislative_year(utils.legislative_year(parse_date(legislator['terms'][len(legislator['terms'])-1]['start'])))
|
||||
if chamber == "sen":
|
||||
congresses = [latest_congress,latest_congress+1,latest_congress+2]
|
||||
else:
|
||||
congresses =[latest_congress]
|
||||
|
||||
if int(congress) not in congresses:
|
||||
num_matches += 1
|
||||
continue
|
||||
|
||||
# pull data to match from yaml
|
||||
|
||||
last_name_unicode = m['name']['last'].upper().strip().replace('\'','')
|
||||
last_name_unicode = legislator['name']['last'].upper().strip().replace('\'','')
|
||||
last_name = unicodedata.normalize('NFD', unicode(last_name_unicode)).encode('ascii', 'ignore')
|
||||
state = states[m['terms'][len(m['terms'])-1]['state']].upper()[:7].strip()
|
||||
state = utils.states[legislator['terms'][len(legislator['terms'])-1]['state']].upper()[:7].strip()
|
||||
# select icpsr source data based on more recent chamber
|
||||
|
||||
lines = r[0].split('\n')
|
||||
for l in lines:
|
||||
disp = False
|
||||
|
||||
write_id = ""
|
||||
lines = read_file[0].split('\n')
|
||||
for line in lines:
|
||||
# parse source data
|
||||
icpsr_state = l[12:20].strip()
|
||||
icpsr_name = l[21:].strip().strip(string.digits).strip()
|
||||
icpsr_id = l[3:8].strip()
|
||||
icpsr_state = line[12:20].strip()
|
||||
icpsr_name = line[21:].strip().strip(string.digits).strip()
|
||||
icpsr_id = line[3:8].strip()
|
||||
|
||||
#ensure unique match
|
||||
if icpsr_name[:8] == last_name[:8] and state == icpsr_state:
|
||||
num_matches += 1
|
||||
write_id = icpsr_id
|
||||
#skip if icpsr id is currently in data
|
||||
if "icpsr" in legislator["id"]:
|
||||
if write_id == legislator["id"]["icpsr"] or write_id == "":
|
||||
continue
|
||||
elif write_id != legislator["id"]["icpsr"] and write_id != "":
|
||||
error_log.writerow(["Incorrect_ID","NA",last_name[:8],state,"NA",legislator["id"]["icpsr"],write_id])
|
||||
print "ID updated for %s" % last_name
|
||||
if num_matches == 1:
|
||||
m['id']['icpsr'] = int(write_id)
|
||||
elif num_matches == 0:
|
||||
print "No matches found for " + last_name + ", " + state + "in congress " + str(congress)
|
||||
cw.writerow(["0",last_name,state])
|
||||
legislator['id']['icpsr'] = int(write_id)
|
||||
else:
|
||||
if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER":
|
||||
cw.writerow([str(num_matches),last_name[:8],state,"Y"])
|
||||
if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER" or state == "PHILIPP":
|
||||
error_log.writerow(["Non_1_match_number",str(num_matches),last_name[:8],state,"Y","NA","NA"])
|
||||
else:
|
||||
print str(num_matches) + " matches found for "+ last_name[:8] + ", " + state + " in congress " + str(congress)
|
||||
cw.writerow([str(num_matches),last_name,state,"N"])
|
||||
error_log.writerow(["Non_1_match_number",str(num_matches),last_name,state,"N","NA","NA"])
|
||||
|
||||
|
||||
save_data(f[0], f[1])
|
||||
save_data(data_file[0], data_file[1])
|
||||
|
||||
## The following three lines can be run as a separate script to update ICPSR IDs for all historical congresses
|
||||
# import os
|
||||
|
||||
# for i in range(1,114):
|
||||
# os.system("python ICPSR_id.py --congress=" + str(i))
|
||||
@@ -1,6 +1,69 @@
|
||||
# Helpful functions for finding data about members and committees
|
||||
|
||||
CURRENT_CONGRESS = 113
|
||||
# Lookup table from two-letter jurisdiction code to full name. Covers the
# states, DC and the territories, plus a few non-standard codes (NA, OL, DK,
# PI) -- presumably historical jurisdictions; verify against the data files.
_STATE_PAIRS = (
    ('AK', 'Alaska'),
    ('AL', 'Alabama'),
    ('AR', 'Arkansas'),
    ('AS', 'American Samoa'),
    ('AZ', 'Arizona'),
    ('CA', 'California'),
    ('CO', 'Colorado'),
    ('CT', 'Connecticut'),
    ('DC', 'District of Columbia'),
    ('DE', 'Delaware'),
    ('FL', 'Florida'),
    ('GA', 'Georgia'),
    ('GU', 'Guam'),
    ('HI', 'Hawaii'),
    ('IA', 'Iowa'),
    ('ID', 'Idaho'),
    ('IL', 'Illinois'),
    ('IN', 'Indiana'),
    ('KS', 'Kansas'),
    ('KY', 'Kentucky'),
    ('LA', 'Louisiana'),
    ('MA', 'Massachusetts'),
    ('MD', 'Maryland'),
    ('ME', 'Maine'),
    ('MI', 'Michigan'),
    ('MN', 'Minnesota'),
    ('MO', 'Missouri'),
    ('MP', 'Northern Mariana Islands'),
    ('MS', 'Mississippi'),
    ('MT', 'Montana'),
    ('NA', 'National'),
    ('NC', 'North Carolina'),
    ('ND', 'North Dakota'),
    ('NE', 'Nebraska'),
    ('NH', 'New Hampshire'),
    ('NJ', 'New Jersey'),
    ('NM', 'New Mexico'),
    ('NV', 'Nevada'),
    ('NY', 'New York'),
    ('OH', 'Ohio'),
    ('OK', 'Oklahoma'),
    ('OR', 'Oregon'),
    ('PA', 'Pennsylvania'),
    ('PR', 'Puerto Rico'),
    ('RI', 'Rhode Island'),
    ('SC', 'South Carolina'),
    ('SD', 'South Dakota'),
    ('TN', 'Tennessee'),
    ('TX', 'Texas'),
    ('UT', 'Utah'),
    ('VA', 'Virginia'),
    ('VI', 'Virgin Islands'),
    ('VT', 'Vermont'),
    ('WA', 'Washington'),
    ('WI', 'Wisconsin'),
    ('WV', 'West Virginia'),
    ('WY', 'Wyoming'),
    ('OL', 'Orleans'),
    ('DK', 'Dakota'),
    ('PI', 'Philippine Islands'),
)
states = dict(_STATE_PAIRS)
|
||||
|
||||
|
||||
|
||||
import os, errno, sys, traceback
|
||||
@@ -8,6 +71,35 @@ import re, htmlentitydefs
|
||||
import pprint
|
||||
from datetime import datetime
|
||||
|
||||
def current_congress():
    """Return the number of the Congress for the current legislative year."""
    # This module defines the year helper as `legislative_year`; the former
    # name `current_legislative_year` is not defined here and would raise
    # NameError.
    year = legislative_year()
    return congress_from_legislative_year(year)
|
||||
|
||||
def congress_from_legislative_year(year):
    """Map an integer legislative year to its Congress number.

    Two consecutive legislative years share one Congress: 2013 and 2014 both
    give 113, and 1789 gives 1. The result is always an integer.
    """
    # Use floor division so the result does not become a float when "/" is
    # true division (Python 3, or `from __future__ import division`).
    return ((year + 1) // 2) - 894
|
||||
|
||||
def legislative_year(date=None):
    """Return the legislative year that `date` belongs to.

    `date` may be a `date` or a `datetime`; when omitted (or falsy) the
    current local time is used.

    The legislative year rolls over at noon on January 3:
      * January 1 and 2 belong to the previous year.
      * January 3 before 12:00 belongs to the previous year, but only when
        `date` is a `datetime` and therefore carries a time of day.
      * A plain `date` of January 3, and everything later, belongs to the
        calendar year of `date`.
    """
    if not date:
        # `datetime` is the class here (from datetime import datetime), so
        # the call is datetime.now(), not datetime.datetime.now().
        date = datetime.now()

    if date.month == 1:
        if date.day in (1, 2):
            return date.year - 1
        # Only a datetime has an hour; a bare date on Jan 3 counts as new year.
        if date.day == 3 and isinstance(date, datetime) and date.hour < 12:
            return date.year - 1

    return date.year
|
||||
|
||||
def parse_date(date):
    """Parse a "YYYY-MM-DD" string and return it as a `date` object."""
    parsed = datetime.strptime(date, "%Y-%m-%d")
    return parsed.date()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user