congress-legislators/scripts/house_history.py

#!/usr/bin/env python

# gets bioguide id for every member with a house history ID:

# options:
#  --cache: load from cache if present on disk (default: true)
#  --current: do *only* current legislators (default: true)
#  --historical: do *only* historical legislators (default: false)
#  --bioguide: do *only* a single legislator

import lxml.html, io
import utils
import requests
from utils import load_data, save_data

def run():

  # pick either current or historical
  # order is important here, since current defaults to true
  if utils.flags().get('historical', False):
    filename = "legislators-historical.yaml"
  elif utils.flags().get('current', True):
    filename = "legislators-current.yaml"
  else:
    print("No legislators selected.")
    exit(0)

  print("Loading %s..." % filename)
  legislators = load_data(filename)

  # reoriented cache to access by bioguide ID
  by_bioguide = { }
  for m in legislators:
    if "bioguide" in m["id"]:
      by_bioguide[m["id"]["bioguide"]] = m

  count = 0

  for id in range(8245,21131):
    print(id)
    url = "http://history.house.gov/People/Detail/%s" % id
    r = requests.get(url, allow_redirects=False)
    if r.status_code == 200:
        dom = lxml.html.parse(io.StringIO(r.text)).getroot()
        try:
            bioguide_link = dom.cssselect("a.view-in-bioguide")[0].get('href')
            bioguide_id = bioguide_link.split('=')[1]
            by_bioguide[bioguide_id]["id"]["house_history"] = id
            count = count + 1
        except:
            continue
    else:
        continue

  print("Saving data to %s..." % filename)
  save_data(legislators, filename)

  print("Saved %d legislators to %s" % (count, filename))

if __name__ == '__main__':
  run()