Files
congress-legislators/scripts/senate_contacts.py

183 lines
5.7 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# Update current senator's website and address from www.senate.gov.
import lxml.etree, io
import string, re
from datetime import datetime
import utils
from utils import download, load_data, save_data, parse_date
import urllib.request
def run():
today = datetime.now().date()
# default to not caching
cache = utils.flags().get('cache', False)
force = not cache
y = load_data("legislators-current.yaml")
# Map bioguide IDs to dicts. Reference the same dicts
# in y so we are updating y when we update biogiude.
bioguide = { }
by_name = { }
for m in y:
if "bioguide" in m["id"]:
bioguide[m["id"]["bioguide"]] = m
party = m["terms"][-1]["party"][0]
state = m["terms"][-1]["state"]
last_name = m["name"]["last"]
member_full = "%s (%s-%s)" % (last_name, party, state)
by_name[member_full] = m
print("Fetching general Senate information from senators_cfm.xml...")
url = "https://www.senate.gov/general/contact_information/senators_cfm.xml"
body = download(url, "legislators/senate.xml", force, { "binary": True })
dom = lxml.etree.parse(io.BytesIO(body)) # file has an <?xml declaration and so must be parsed as a bytes array
for node in dom.xpath("member"):
bioguide_id = str(node.xpath("string(bioguide_id)")).strip()
member_full = node.xpath("string(member_full)")
if bioguide_id == "":
print("Someone has an empty bioguide ID!")
print(lxml.etree.tostring(node))
continue
print("[%s] Processing Senator %s..." % (bioguide_id, member_full))
# find member record in our YAML, either by bioguide_id or member_full
if bioguide_id in bioguide:
member = bioguide[bioguide_id]
else:
if member_full in by_name:
member = by_name[member_full]
else:
print("Bioguide ID '%s' and full name '%s' not recognized." % (bioguide_id, member_full))
exit(0)
try:
term = member["terms"][-1]
except IndexError:
print("Member has no terms", bioguide_id, member_full)
continue
if today < parse_date(term["start"]) or today > parse_date(term["end"]):
print("Member's last listed term is not current", bioguide_id, member_full, term["start"])
continue
if term["type"] != "sen":
print("Member's last listed term is not a Senate term", bioguide_id, member_full)
continue
if term["state"] != str(node.xpath("string(state)")):
print("Member's last listed term has the wrong state", bioguide_id, member_full)
continue
if "district" in term: del term["district"]
full_name = str(node.xpath("string(first_name)"))
suffix = None
if ", " in full_name: full_name, suffix = full_name.split(", ")
full_name += " " + str(node.xpath("string(last_name)"))
if suffix: full_name += ", " + suffix
member["name"]["official_full"] = re.sub("'", "", full_name)
member["id"]["bioguide"] = bioguide_id
term["class"] = { "Class I": 1, "Class II": 2, "Class III": 3}[ node.xpath("string(class)") ]
term["party"] = { "D": "Democrat", "R": "Republican", "I": "Independent", "ID": "Independent"}[ node.xpath("string(party)") ]
url = str(node.xpath("string(website)")).strip()
if not url.startswith("/"):
# temporary home pages for new senators are relative links?
# hit the URL to resolve any redirects to get the canonical URL,
# since the listing sometimes gives URLs that redirect.
try:
req = urllib.request.Request(url)
req.add_header("User-Agent", "https://github.com/unitedstates/congress-legislators")
resp = urllib.request.urlopen(req)
url = resp.geturl()
except Exception as e:
print(url, e)
# kill trailing slash
url = re.sub("/$", "", url)
term["url"] = url
#contact forms are sometimes listed as the base url, ignore if such case
contact_form = str(node.xpath("string(email)")).strip()
if contact_form and contact_form.rstrip("/") != term['url']:
term['contact_form'] = contact_form
term["address"] = str(node.xpath("string(address)")).strip().replace("\n ", " ")
term["office"] = string.capwords(term["address"].upper().split(" WASHINGTON ")[0])
phone = str(node.xpath("string(phone)")).strip()
term["phone"] = phone.replace("(", "").replace(")", "").replace(" ", "-")
print("\n\nUpdating Senate stateRank and LIS ID from cvc_member_data.xml...")
url = "https://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml"
body = download(url, "legislators/senate_cvc.xml", force)
dom = lxml.etree.parse(io.StringIO(body))
for node in dom.getroot():
if node.tag == "lastUpdate":
date, time = node.getchildren()
print("Last updated: %s, %s" % (date.text, time.text))
continue
bioguide_id = str(node.xpath("string(bioguideId)")).strip()
if bioguide_id == "":
print("Someone has an empty bioguide ID!")
print(lxml.etree.tostring(node))
continue
last_name = node.xpath("string(name/last)")
party = node.xpath("string(party)")
state = node.xpath("string(state)")
member_full = "%s (%s-%s)" % (last_name, party, state)
print("[%s] Processing Senator %s..." % (bioguide_id, member_full))
# find member record in our YAML, either by bioguide_id or member_full
if bioguide_id in bioguide:
member = bioguide[bioguide_id]
else:
if member_full in by_name:
member = by_name[member_full]
else:
print("Bioguide ID '%s' and synthesized official name '%s' not recognized." % (bioguide_id, member_full))
continue # exit(0)
try:
term = member["terms"][-1]
except IndexError:
print("Member has no terms", bioguide_id, member_full)
continue
if "id" not in member:
member["id"] = {}
member["id"]["lis"] = node.attrib["lis_member_id"]
state_rank = node.xpath("string(stateRank)")
if state_rank == '1':
term["state_rank"] = "senior"
elif state_rank == '2':
term["state_rank"] = "junior"
print("Saving data...")
save_data(y, "legislators-current.yaml")
if __name__ == '__main__':
run()