updated icpsr.py

2026-05-12 03:00:10 -04:00 · 2013-07-30 11:34:07 -04:00
parent 9f489cf1f6
commit 16aa96e350
3 changed files with 139 additions and 134 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/scripts/ICPSR_id.py
+++ b/scripts/ICPSR_id.py
@@ -12,94 +12,12 @@ import re
 import utils
 import urllib2
 import requests
-from utils import download, load_data, save_data, parse_date
+from utils import download, load_data, save_data, parse_date, states, congress_from_legislative_year, legislative_year
 import json
 import string
 import csv
 import unicodedata

-def congress_from_legislative_year(year):
-    return ((year + 1) / 2) - 894
-
-def current_legislative_year(date=None):
-    if not date:
-        date = datetime.datetime.now()
-    year = date.year
-
-    if date.month == 1:
-        if date.day == 1 or date.day == 2:
-            return date.year - 1
-        #yaml has no time data, so can't distinguish between pre/post noon dates. So, since this script is based on start-dates to determine congress numbers, starting anytime on 01-03 is the new congress
-        elif date.day == 3:
-            return date.year
-        else:
-            return date.year
-    else:
-        return date.year
-
-states = {
-        'AK': 'Alaska',
-        'AL': 'Alabama',
-        'AR': 'Arkansas',
-        'AS': 'American Samoa',
-        'AZ': 'Arizona',
-        'CA': 'California',
-        'CO': 'Colorado',
-        'CT': 'Connecticut',
-        'DC': 'District of Columbia',
-        'DE': 'Delaware',
-        'FL': 'Florida',
-        'GA': 'Georgia',
-        'GU': 'Guam',
-        'HI': 'Hawaii',
-        'IA': 'Iowa',
-        'ID': 'Idaho',
-        'IL': 'Illinois',
-        'IN': 'Indiana',
-        'KS': 'Kansas',
-        'KY': 'Kentucky',
-        'LA': 'Louisiana',
-        'MA': 'Massachusetts',
-        'MD': 'Maryland',
-        'ME': 'Maine',
-        'MI': 'Michigan',
-        'MN': 'Minnesota',
-        'MO': 'Missouri',
-        'MP': 'Northern Mariana Islands',
-        'MS': 'Mississippi',
-        'MT': 'Montana',
-        'NA': 'National',
-        'NC': 'North Carolina',
-        'ND': 'North Dakota',
-        'NE': 'Nebraska',
-        'NH': 'New Hampshire',
-        'NJ': 'New Jersey',
-        'NM': 'New Mexico',
-        'NV': 'Nevada',
-        'NY': 'New York',
-        'OH': 'Ohio',
-        'OK': 'Oklahoma',
-        'OR': 'Oregon',
-        'PA': 'Pennsylvania',
-        'PR': 'Puerto Rico',
-        'RI': 'Rhode Island',
-        'SC': 'South Carolina',
-        'SD': 'South Dakota',
-        'TN': 'Tennessee',
-        'TX': 'Texas',
-        'UT': 'Utah',
-        'VA': 'Virginia',
-        'VI': 'Virgin Islands',
-        'VT': 'Vermont',
-        'WA': 'Washington',
-        'WI': 'Wisconsin',
-        'WV': 'West Virginia',
-        'WY': 'Wyoming',
-        'OL': 'Orleans',
-        'DK': 'Dakota',
-        'PI': 'Philippine Islands'
-}
-
 debug = utils.flags().get('debug', False)

 # default to caching
@@ -108,11 +26,9 @@ force = not cache


 only_bioguide = utils.flags().get('bioguide', None)
-
 congress = utils.flags().get('congress',None)


-
 filename_historical = "legislators-historical.yaml"
 filename_current = "legislators-current.yaml"
 data_files = []
@@ -129,96 +45,93 @@ if congress == None:
    raise Exception("the --congress flag is required")
 elif congress == "113":
    url_senate = "http://amypond.sscnet.ucla.edu/rollcall/static/S113.ord"
-    destination = "icpsr/source/senate_rollcall%s.txt" % congress
-    senate_data = utils.download(url_senate, destination, force)
-
    url_house = "http://amypond.sscnet.ucla.edu/rollcall/static/H113.ord"
-    destination = "icpsr/source/house_rollcall%s.txt" % congress
-    house_data = utils.download(url_house, destination, force)
 elif int(congress) <10 and int(congress) >0:
    url_senate = "ftp://voteview.com/dtaord/sen0%skh.ord" % congress
-    destination = "icpsr/source/senate_rollcall%s.txt" % congress
-    senate_data = utils.download(url_senate, destination, force)
-
    url_house = "ftp://voteview.com/dtaord/hou0%skh.ord" % congress
-    destination = "icpsr/source/house_rollcall%s.txt" % congress
-    house_data = utils.download(url_house, destination, force)
-elif int(congress) < congress_from_legislative_year(current_legislative_year()) and int(congress) >= 10:
+elif int(congress) < 113 and int(congress) >= 10:
    url_senate = "ftp://voteview.com/dtaord/sen%skh.ord" % congress
-    destination = "icpsr/source/senate_rollcall%s.txt" % congress
-    senate_data = utils.download(url_senate, destination, force)
-
    url_house = "ftp://voteview.com/dtaord/hou%skh.ord" % congress
-    destination = "icpsr/source/house_rollcall%s.txt" % congress
-    house_data = utils.download(url_house, destination, force)
 else:
    raise Exception("no data for congress " + congress)

-cw = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "wb"))
-cw.writerow(["matches","icpsr_name","icpsr_state","is_territory"])
+senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
+senate_data = utils.download(url_senate, senate_destination, force)
+
+house_destination = "icpsr/source/house_rollcall%s.txt" % congress
+house_data = utils.download(url_house, house_destination, force)
+
+error_log = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "wb"))
+error_log.writerow(["error_type","matches","icpsr_name","icpsr_state","is_territory","old_id","new_id"])
+
+

 read_files = [(senate_data,"sen"),(house_data,"rep")]
 print "Running for congress " + congress
-for r in read_files:
-    for f in data_files:
-        for m in f[0]:
+for read_file in read_files:
+    for data_file in data_files:
+        for legislator in data_file[0]:
            num_matches = 0
            # # this can't run unless we've already collected a bioguide for this person
-            bioguide = m["id"].get("bioguide", None)
+            bioguide = legislator["id"].get("bioguide", None)
            # if we've limited this to just one bioguide, skip over everyone else
            if only_bioguide and (bioguide != only_bioguide):
-                num_matches += 1
-                continue
-            #skip if icpsr id is currently in data
-            if "icpsr" in m["id"]:
-                num_matches += 1
                continue
            #if not in currently read chamber, skip
-            chamber = m['terms'][len(m['terms'])-1]['type']
-            if chamber != r[1]:
-                num_matches += 1
+            chamber = legislator['terms'][len(legislator['terms'])-1]['type']
+            if chamber != read_file[1]:
                continue

            #only run for selected congress
-            latest_congress = congress_from_legislative_year(current_legislative_year(parse_date(m['terms'][len(m['terms'])-1]['start'])))
+            latest_congress = utils.congress_from_legislative_year(utils.legislative_year(parse_date(legislator['terms'][len(legislator['terms'])-1]['start'])))
            if chamber == "sen":
                congresses = [latest_congress,latest_congress+1,latest_congress+2]
            else:
                congresses =[latest_congress]

            if int(congress) not in congresses:
-                num_matches += 1
                continue

            # pull data to match from yaml
            
-            last_name_unicode = m['name']['last'].upper().strip().replace('\'','')
+            last_name_unicode = legislator['name']['last'].upper().strip().replace('\'','')
            last_name = unicodedata.normalize('NFD', unicode(last_name_unicode)).encode('ascii', 'ignore')
-            state = states[m['terms'][len(m['terms'])-1]['state']].upper()[:7].strip()
+            state = utils.states[legislator['terms'][len(legislator['terms'])-1]['state']].upper()[:7].strip()
            # select icpsr source data based on more recent chamber
-     
-            lines = r[0].split('\n')
-            for l in lines:
-                disp = False
+            
+            write_id = ""
+            lines = read_file[0].split('\n')
+            for line in lines:
                # parse source data
-                icpsr_state = l[12:20].strip()
-                icpsr_name = l[21:].strip().strip(string.digits).strip()
-                icpsr_id = l[3:8].strip()
+                icpsr_state = line[12:20].strip()
+                icpsr_name = line[21:].strip().strip(string.digits).strip()
+                icpsr_id = line[3:8].strip()

                #ensure unique match
                if icpsr_name[:8] == last_name[:8] and state == icpsr_state:
                    num_matches += 1
                    write_id = icpsr_id
+            # an ICPSR id is already on file: skip unless the roll call data disagrees with it
+            if "icpsr" in legislator["id"]:
+                # write_id is text sliced from the roll call file, the stored id is an int: compare as ints
+                if write_id == "" or int(write_id) == int(legislator["id"]["icpsr"]):
+                    continue
+                error_log.writerow(["Incorrect_ID","NA",last_name[:8],state,"NA",legislator["id"]["icpsr"],write_id])
+                print "ID updated for %s" % last_name
            if num_matches == 1:
-                m['id']['icpsr'] = int(write_id)
-            elif num_matches == 0:
-                print "No matches found for " + last_name + ", " + state + "in congress " + str(congress)
-                cw.writerow(["0",last_name,state])
+                legislator['id']['icpsr'] = int(write_id)
            else:
-                if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER":
-                    cw.writerow([str(num_matches),last_name[:8],state,"Y"])
+                if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER" or state == "PHILIPP":
+                    error_log.writerow(["Non_1_match_number",str(num_matches),last_name[:8],state,"Y","NA","NA"])
                else:
                    print str(num_matches) + " matches found for "+ last_name[:8] + ", " + state + " in congress " + str(congress)
-                    cw.writerow([str(num_matches),last_name,state,"N"])
+                    error_log.writerow(["Non_1_match_number",str(num_matches),last_name,state,"N","NA","NA"])
+ 

-        save_data(f[0], f[1])
+        save_data(data_file[0], data_file[1])
+
+## the following three lines can be run as a separate script to update icpsr id's for all historical congresses
+# import os
+
+# for i in range(1,114):
+#     os.system("python ICPSR_id.py --congress=" + str(i))
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -1,6 +1,69 @@
 # Helpful functions for finding data about members and committees

 CURRENT_CONGRESS = 113
+states = {  # two-letter code -> full name; ICPSR_id.py upper-cases and truncates these names to match roll call state fields
+        'AK': 'Alaska',
+        'AL': 'Alabama',
+        'AR': 'Arkansas',
+        'AS': 'American Samoa',
+        'AZ': 'Arizona',
+        'CA': 'California',
+        'CO': 'Colorado',
+        'CT': 'Connecticut',
+        'DC': 'District of Columbia',
+        'DE': 'Delaware',
+        'FL': 'Florida',
+        'GA': 'Georgia',
+        'GU': 'Guam',
+        'HI': 'Hawaii',
+        'IA': 'Iowa',
+        'ID': 'Idaho',
+        'IL': 'Illinois',
+        'IN': 'Indiana',
+        'KS': 'Kansas',
+        'KY': 'Kentucky',
+        'LA': 'Louisiana',
+        'MA': 'Massachusetts',
+        'MD': 'Maryland',
+        'ME': 'Maine',
+        'MI': 'Michigan',
+        'MN': 'Minnesota',
+        'MO': 'Missouri',
+        'MP': 'Northern Mariana Islands',
+        'MS': 'Mississippi',
+        'MT': 'Montana',
+        'NA': 'National',
+        'NC': 'North Carolina',
+        'ND': 'North Dakota',
+        'NE': 'Nebraska',
+        'NH': 'New Hampshire',
+        'NJ': 'New Jersey',
+        'NM': 'New Mexico',
+        'NV': 'Nevada',
+        'NY': 'New York',
+        'OH': 'Ohio',
+        'OK': 'Oklahoma',
+        'OR': 'Oregon',
+        'PA': 'Pennsylvania',
+        'PR': 'Puerto Rico',
+        'RI': 'Rhode Island',
+        'SC': 'South Carolina',
+        'SD': 'South Dakota',
+        'TN': 'Tennessee',
+        'TX': 'Texas',
+        'UT': 'Utah',
+        'VA': 'Virginia',
+        'VI': 'Virgin Islands',
+        'VT': 'Vermont',
+        'WA': 'Washington',
+        'WI': 'Wisconsin',
+        'WV': 'West Virginia',
+        'WY': 'Wyoming',
+        'OL': 'Orleans',
+        'DK': 'Dakota',
+        'PI': 'Philippine Islands'
+}
+


 import os, errno, sys, traceback
@@ -8,6 +71,35 @@ import re, htmlentitydefs
 import pprint
 from datetime import datetime

+def current_congress():
+  """Return the number of the Congress in session at the current moment."""
+  return congress_from_legislative_year(legislative_year(datetime.now()))
+
+def congress_from_legislative_year(year):
+  return ((year + 1) // 2) - 894  # each Congress covers two years: 1789-1790 -> 1, 2013-2014 -> 113
+
+def legislative_year(date=None):
+  """Return the legislative year that `date` falls in (default: the current time).
+
+  January 1 and 2 always count toward the previous year. January 3 counts
+  toward the previous year only when `date` is a datetime earlier than 12:00;
+  a plain date carries no time, so January 3 is treated as the new year.
+  """
+  if not date:
+    # `datetime` here is the class brought in by "from datetime import datetime",
+    # so now() is called on it directly (datetime.datetime.now() would fail).
+    date = datetime.now()
+
+  if date.month != 1 or date.day > 3:
+    return date.year
+  if date.day < 3:
+    return date.year - 1
+
+  # January 3: only a datetime can tell the morning from the afternoon.
+  if isinstance(date, datetime) and date.hour < 12:
+    return date.year - 1
+  return date.year
+
 def parse_date(date):
  return datetime.strptime(date, "%Y-%m-%d").date()