congress-legislators/scripts/thomas_ids.py

#!/usr/bin/env python

# Update current THOMAS IDs using beta.congress.gov. Representatives'
# IDs are updated directly (matched by state and district). For Senators,
# we just print out new IDs because name matching is hard.

import lxml.html, io, urllib.request, urllib.parse, urllib.error
import re
import utils
from utils import download, load_data, save_data

def run():
  """Update THOMAS IDs in legislators-current.yaml from beta.congress.gov.

  For each chamber, the member listing page for CONGRESS_ID is fetched and
  each listed member's numeric ID is read from the member's profile link.

  * House: the member is matched to our data by state and district, and
    the ID is written to that legislator's ``id.thomas`` field.
  * Senate: no matching is attempted; any ID not already recorded on a
    current senator is printed so it can be set by hand.

  The loaded data is saved back to legislators-current.yaml at the end.
  Takes no arguments and returns nothing; problems are reported via print().
  """
  CONGRESS_ID = "113th Congress (2013-2014)" # the query string parameter

  # constants
  # Maps the full state/territory name shown on the page to its postal code.
  state_names = {
    "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ",
    "Arkansas": "AR", "California": "CA", "Colorado": "CO", "Connecticut": "CT",
    "Delaware": "DE", "District of Columbia": "DC", "Florida": "FL",
    "Georgia": "GA", "Guam": "GU", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
    "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
    "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC",
    "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
    "Puerto Rico": "PR", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virgin Islands": "VI", "Virginia": "VA",
    "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI",
    "Wyoming": "WY",
  }

  # Raw string so that \d is a regex digit class rather than an (invalid)
  # string escape; compiled once because it is applied to every member.
  member_href_re = re.compile(r"/member/.*/(\d+)$")

  # default to not caching
  use_cache = utils.flags().get('cache', False)
  force = not use_cache

  # load in current members
  y = load_data("legislators-current.yaml")
  by_district = { }  # e.g. "CA12" -> legislator record, current reps only
  existing_senator_ids = set()
  for m in y:
    last_term = m['terms'][-1]
    if last_term['type'] == 'rep':
      full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
      by_district[full_district] = m
    elif last_term['type'] == 'sen':
      if "thomas" in m["id"]:
        existing_senator_ids.add(m["id"]["thomas"])

  # District keys already assigned during this run, used to flag duplicates.
  seen_districts = set()
  for chamber in ("House of Representatives", "Senate"):
    url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % (
      urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber))
    # Destination handed to download(); kept separate from the boolean flag above.
    cache_path = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber)
    try:
      body = download(url, cache_path, force)
      dom = lxml.html.parse(io.StringIO(body)).getroot()
    except lxml.etree.XMLSyntaxError:
      # An unparseable page only skips this chamber; the other is still processed.
      print("Error parsing: ", url)
      continue

    for node in dom.xpath("//ul[@class='results_list']/li"):
      profile_link = node.xpath('h2/a')[0]

      # The ID is the trailing number of the profile URL, zero-padded to 5 digits.
      thomas_id = "%05d" % int(member_href_re.search(profile_link.get('href')).group(1))

      # THOMAS misassigned these 'new' IDs to existing individuals.
      if thomas_id in ('02139', '02132'):
        continue

      name = profile_link.text

      state = node.xpath('div[@class="memberProfile"]/table/tbody/tr[1]/td')[0].text.strip()
      state = state_names[state]

      if chamber == "House of Representatives":
        # There's enough information to easily pick out which Member this refers to, so write it
        # directly to the file.
        district = node.xpath('div[@class="memberProfile"]/table/tbody/tr[2]/td')[0].text.strip()
        if district == "At Large": district = 0
        district = "%02d" % int(district)
        district_key = state + district

        if district_key not in by_district:
          print(state + district + "'s", name, "appears on Congress.gov but the office is vacant in our data.")
          continue

        # A duplicate is only reported; the later entry still overwrites the ID.
        if district_key in seen_districts:
          print("Congress.gov lists two people for %s%s!" % (state, district))
        seen_districts.add(district_key)

        by_district[district_key]["id"]["thomas"] = thomas_id

      elif chamber == "Senate":
        # For senators we'd have to match on name or something else, so that's too difficult.
        # Just look for new IDs.
        if thomas_id not in existing_senator_ids:
          print("Please manually set", thomas_id, "for", name, "from", state)

  save_data(y, "legislators-current.yaml")

# Script entry point: perform the update only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
  run()