congress-legislators/scripts/house_history.py
#!/usr/bin/env python

# Stores a house_history ID for all legislators that don't yet
# have one, by scraping history.house.gov.

import io
import sys

import lxml.html
import requests

from utils import load_data, save_data

def run():
    # Load the legislators YAML files.
    yamlfiles = { }
    for fn in ('historical', 'current'):
        fn = 'legislators-%s.yaml' % fn
        print("Loading %s..." % fn)
        yamlfiles[fn] = load_data(fn)

    # Reorient the data so records can be looked up by bioguide ID, and
    # collect the house_history IDs we already know so they can be skipped.
    by_bioguide = { }
    known_house_history_ids = set()
    for legislators in yamlfiles.values():
        for m in legislators:
            if "bioguide" in m["id"]:
                by_bioguide[m["id"]["bioguide"]] = m
            if "house_history" in m["id"]:
                known_house_history_ids.add(m["id"]["house_history"])

    # Scrape history.house.gov. With no command-line arguments, scan a
    # default range of IDs; otherwise treat each argument as an ID to check.
    count = 0
    if len(sys.argv) == 1:
        id_range = range(22000, 25000)
    else:
        id_range = [int(arg) for arg in sys.argv[1:]]
    for id in id_range:
        # Skip IDs that are already recorded.
        if id in known_house_history_ids:
            continue
        print(id)
        bioguide_id = get_bioguide_for_house_history_id(id)
        if bioguide_id and bioguide_id in by_bioguide:
            print(id, bioguide_id)
            by_bioguide[bioguide_id]["id"]["house_history"] = id
            count += 1

    # Write the updated YAML files back to disk.
    for filename, legislators in yamlfiles.items():
        print("Saving data to %s..." % filename)
        save_data(legislators, filename)

    # How many updates did we make?
    print("Saved %d legislators" % count)

def get_bioguide_for_house_history_id(id):
    # Fetch the detail page for this house_history ID; don't follow
    # redirects and only parse a direct 200 response.
    url = "http://history.house.gov/People/Detail/%s" % id
    r = requests.get(url, allow_redirects=False)
    if r.status_code != 200:
        return None
    dom = lxml.html.parse(io.StringIO(r.text)).getroot()
    try:
        # Pull the bioguide ID out of the "view in bioguide" link's query string.
        bioguide_link = dom.cssselect("a.view-in-bioguide")[0].get('href')
        return bioguide_link.split('=')[1]
    except (IndexError, AttributeError):
        # No usable bioguide link on the page.
        return None

if __name__ == '__main__':
    run()
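
For reference, load_data and save_data are not defined in this file; they come from the repository's utils module. Below is a minimal sketch, under the assumption of straightforward YAML (de)serialization, of how those helpers and a legislator record's "id" block might look. The names and layout are inferred from the calls above; the real utils module may differ.

# Hypothetical sketch only -- the actual helpers in congress-legislators
# may be implemented differently.
import yaml

def load_data(filename):
    # Parse a legislators YAML file into a list of record dicts.
    with open(filename) as f:
        return yaml.safe_load(f)

def save_data(legislators, filename):
    # Write the (possibly updated) records back out as YAML.
    with open(filename, "w") as f:
        yaml.safe_dump(legislators, f, default_flow_style=False)

# Shape of a record's "id" mapping as run() reads and writes it:
example_record = {
    "id": {
        "bioguide": "A000000",    # illustrative bioguide ID
        "house_history": 22000,   # added by this script when matched
    },
    # ...other fields (name, bio, terms) omitted...
}

As the argv handling in run() suggests, the script can be invoked with no arguments to scan the default ID range, or with one or more history.house.gov IDs to check, e.g. python house_history.py 22000 22001.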