mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-19 09:50:37 -05:00
85 lines
4.1 KiB
Python
Executable File
85 lines
4.1 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
# Update current THOMAS IDs using beta.congress.gov. Representatives'
# IDs are updated directly. For Senators, we just print out new
# IDs because name matching is hard.
|
|
|
|
import lxml.html, io, urllib.request, urllib.parse, urllib.error
|
|
import re
|
|
import utils
|
|
from utils import download, load_data, save_data
|
|
|
|
def run():
    """Refresh the THOMAS IDs of current legislators from beta.congress.gov.

    Loads ``legislators-current.yaml``, downloads the member listing for
    each chamber of the Congress named by ``CONGRESS_ID`` and then:

    * House: matches each listed member to our record by state + district
      and writes the scraped ID into that record's ``id.thomas`` field.
    * Senate: only prints IDs that no current senator record already has,
      so they can be entered by hand (a listing entry cannot be matched
      to a record by seat alone).

    The (possibly modified) data is written back to
    ``legislators-current.yaml`` at the end. Progress and problems are
    reported with ``print``; nothing is returned.

    Pages are re-downloaded on every run unless the ``cache`` flag
    reported by ``utils.flags()`` is set.
    """
    CONGRESS_ID = "113th Congress (2013-2014)"  # the query string parameter

    # Raw string on purpose: "\d" inside a plain string literal is an
    # invalid escape sequence. Compiled once because it is used per entry.
    member_href_re = re.compile(r"/member/.*/(\d+)$")

    # constants
    state_names = {
        "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS",
        "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
        "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
        "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA",
        "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL",
        "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY",
        "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
        "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
        "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
        "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH",
        "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
        "North Carolina": "NC", "North Dakota": "ND",
        "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK",
        "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR",
        "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD",
        "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
        "Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA",
        "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
    }

    # default to not caching
    use_cache = utils.flags().get('cache', False)
    force = not use_cache

    # load in current members, indexing representatives by "SSDD"
    # (state + zero-padded district) and collecting senators' known IDs
    y = load_data("legislators-current.yaml")
    by_district = {}
    existing_senator_ids = set()
    for m in y:
        last_term = m['terms'][-1]
        if last_term['type'] == 'rep':
            full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
            by_district[full_district] = m
        elif last_term['type'] == 'sen':
            if "thomas" in m["id"]:
                existing_senator_ids.add(m["id"]["thomas"])

    seen_ids = set()  # districts already matched, to detect duplicates
    for chamber in ("House of Representatives", "Senate"):
        url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % (
            urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber))
        # Kept separate from the cache *flag* above so the two meanings
        # never share one variable.
        cache_path = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber)
        try:
            body = download(url, cache_path, force)
            dom = lxml.html.parse(io.StringIO(body)).getroot()
        except lxml.etree.XMLSyntaxError:
            print("Error parsing: ", url)
            continue

        for node in dom.xpath("//ul[@class='results_list']/li"):
            link = node.xpath('h2/a')[0]
            href = link.get('href') or ""
            id_match = member_href_re.search(href)
            if id_match is None:
                # Entry without a numeric member link: report and skip it
                # rather than dying on None.group().
                print("Could not find a member ID in link:", href)
                continue
            thomas_id = "%05d" % int(id_match.group(1))

            # THOMAS misassigned these 'new' IDs to existing individuals.
            if thomas_id in ('02139', '02132'):
                continue

            name = link.text

            state = node.xpath('div[@class="memberProfile"]/table/tbody/tr[1]/td')[0].text.strip()
            state = state_names[state]

            if chamber == "House of Representatives":
                # There's enough information to easily pick out which Member this refers to, so write it
                # directly to the file.
                district = node.xpath('div[@class="memberProfile"]/table/tbody/tr[2]/td')[0].text.strip()
                if district == "At Large":
                    district = 0
                district = "%02d" % int(district)
                seat = state + district

                if seat not in by_district:
                    print(seat + "'s", name, "appears on Congress.gov but the office is vacant in our data.")
                    continue

                if seat in seen_ids:
                    print("Congress.gov lists two people for %s%s!" % (state, district))
                seen_ids.add(seat)

                by_district[seat]["id"]["thomas"] = thomas_id

            elif chamber == "Senate":
                # For senators we'd have to match on name or something else, so that's too difficult.
                # Just look for new IDs.
                if thomas_id not in existing_senator_ids:
                    print("Please manually set", thomas_id, "for", name, "from", state)

    save_data(y, "legislators-current.yaml")
|
|
|
|
# Only scrape and rewrite the data file when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    run()