Files
congress/tasks/utils.py

1160 lines
40 KiB
Python

import os
import os.path
import errno
import sys
import traceback
import zipfile
import platform
import re
import htmlentitydefs
import json
from pytz import timezone
import datetime
import time
from lxml import html, etree
import scrapelib
import pprint
import logging
import subprocess
import smtplib
import email.utils
from email.mime.text import MIMEText
import getpass
# Read in an opt-in config file for changing directories and supplying email
# settings. `config` is None when the file is not there, and every consumer
# should handle that gracefully.
path = "config.yml"
if os.path.exists(path):
    # Don't use a cached config file, just in case, and direct_yaml_load is
    # not yet defined at this point in the module.
    import yaml
    # NOTE(review): depending on the PyYAML version, yaml.load may construct
    # arbitrary Python objects. That is tolerable only while config.yml is a
    # local, operator-controlled file -- confirm before loading anything else.
    # The handle is closed explicitly rather than left to garbage collection.
    with open(path) as config_file:
        config = yaml.load(config_file)
else:
    config = None

eastern_time_zone = timezone('US/Eastern')

# scraper should be instantiated at class-load time, so that it can rate limit appropriately
scraper = scrapelib.Scraper(requests_per_minute=120, retry_attempts=3)
scraper.user_agent = "unitedstates/congress (https://github.com/unitedstates/congress)"
def format_datetime(obj):
    """Serialize a date-like value to a string.

    datetime values are truncated to whole seconds, localized to US/Eastern
    and returned in ISO format; dates are returned in ISO format; strings are
    returned unchanged. Anything else yields None.
    """
    # datetime must be tested first: datetime.datetime is a subclass of datetime.date.
    if isinstance(obj, datetime.datetime):
        whole_seconds = obj.replace(microsecond=0)
        return eastern_time_zone.localize(whole_seconds).isoformat()
    if isinstance(obj, datetime.date):
        return obj.isoformat()
    if isinstance(obj, (str, unicode)):
        return obj
    return None
def current_congress():
    """Return the Congress number for the current legislative year."""
    return congress_from_legislative_year(current_legislative_year())
def congress_from_legislative_year(year):
    """Return the Congress number that a legislative year falls in.

    Each Congress covers two legislative years, e.g. 2013 and 2014 both map
    to 113.
    """
    # Floor division keeps the result an integer even where `/` means true
    # division (Python 3, or `from __future__ import division`).
    return ((year + 1) // 2) - 894
def current_legislative_year(date=None):
    """Return the legislative year that `date` belongs to.

    The legislative year rolls over at noon on January 3: anything on
    January 1-2, or on January 3 before 12:00, still counts as the previous
    calendar year. `date` defaults to the current local time.
    """
    if not date:
        date = datetime.datetime.now()

    # `date.hour` is only consulted on January 3, so plain date objects work
    # for every other day.
    in_previous_year = date.month == 1 and (
        date.day in (1, 2) or (date.day == 3 and date.hour < 12))
    return date.year - 1 if in_previous_year else date.year
def get_congress_first_year(congress):
    """Return the first calendar year of `congress` (an int or numeric string)."""
    congress_number = int(congress)
    return 2 * (congress_number + 894) - 1
def get_congress_years(congress):
    """Return the three calendar years a Congress extends through (Jan 3 to Jan 3)."""
    first_year = get_congress_first_year(congress)
    return tuple(first_year + offset for offset in range(3))
# Get a list of Congresses associated with a particular term.
# XXX: This can be highly unreliable and may be deeply flawed.
# XXX: This would be much simpler if we already included Congresses in the data.
def get_term_congresses(term):
    """Return the Congress numbers a term is assumed to cover.

    The first Congress comes from the term's start year. Terms of type "sen"
    are given three Congresses, "prez"/"viceprez" terms and terms for state
    "PR" two, and everything else one. If the term's start/end years do not
    fit inside the years spanned by those Congresses, an empty list is
    returned instead.
    """
    start_year = int(format_datetime(term["start"])[:4])
    end_year = int(format_datetime(term["end"])[:4])

    first_congress = congress_from_legislative_year(start_year)
    earliest_year = get_congress_years(first_congress)[0]

    if term["type"] in ["sen"]:
        span = 3
    elif term["type"] in ["prez", "viceprez"] or term["state"] in ["PR"]:
        span = 2
    else:
        span = 1

    congresses = [first_congress + offset for offset in range(span)]
    latest_year = get_congress_years(congresses[-1])[2]

    if start_year >= earliest_year and end_year <= latest_year:
        return congresses
    return []
def split_bill_id(bill_id):
    """Split a bill ID such as "hr1234-113" into (bill_type, bill_number, congress).

    All three parts are returned as strings. Raises AttributeError when
    `bill_id` does not have that shape (re.match returns None).
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    return re.match(r"^([a-z]+)(\d+)-(\d+)$", bill_id).groups()
# "hjres1234-115"
def build_bill_id(bill_type, bill_number, congress):
    """Assemble a bill ID of the form <bill_type><bill_number>-<congress>."""
    parts = (bill_type, bill_number, congress)
    return "%s%s-%s" % parts
def split_bill_version_id(bill_version_id):
    """Split a bill version ID such as "hjres1234-115-enr".

    Returns (bill_type, bill_number, congress, version_code) as strings.
    Raises AttributeError when the ID does not have that shape (re.match
    returns None).
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    return re.match(r"^([a-z]+)(\d+)-(\d+)-([a-z\d]+)$", bill_version_id).groups()
# "hjres1234-115-enr"
def build_bill_version_id(bill_type, bill_number, congress, version_code):
    """Assemble a bill version ID: <bill_type><bill_number>-<congress>-<version_code>."""
    parts = (bill_type, bill_number, congress, version_code)
    return "%s%s-%s-%s" % parts
def split_vote_id(vote_id):
    """Split a vote ID such as "h202-113.2013" into (chamber, number, congress, session).

    Sessions are either four-digit years for modern day votes or a digit or
    letter for historical votes before sessions were basically calendar
    years. Raises AttributeError when the ID does not match (re.match
    returns None).
    """
    # The separator before the session is a literal dot; left unescaped it
    # would accept any character in that position.
    return re.match(r"^(h|s)(\d+)-(\d+)\.(\d\d\d\d|[0-9A-Z])$", vote_id).groups()
def split_nomination_id(nomination_id):
    """Split a nomination ID into (nomination_type, nomination_number, congress).

    nomination_type is always PN. nomination_number is usually a number, but
    can be hyphenated, e.g. PN64-01-111, which would produce a
    nomination_number of "64-01". When the ID cannot be parsed the failure is
    logged and (None, None, None) is returned.
    """
    try:
        # [A-Za-z], not [A-z]: the latter also matches the punctuation that
        # sits between "Z" and "a" in ASCII ([, \, ], ^, _, `).
        return re.match(r"^([A-Za-z]{2})([\d-]+)-(\d+)$", nomination_id).groups()
    except (AttributeError, TypeError):
        # AttributeError: no match (re.match returned None).
        # TypeError: nomination_id was not a string at all.
        logging.error("Unable to parse %s" % nomination_id)
        return (None, None, None)
def process_set(to_fetch, fetch_func, options, *extra_args):
    """Run `fetch_func` over every ID in `to_fetch` and summarize the outcome.

    `fetch_func(id, options, *extra_args)` must return a dict. A truthy 'ok'
    means success, a truthy 'saved' means new data was written (otherwise the
    item counts as skipped), and 'reason' explains skips and errors.

    An exception raised by `fetch_func` is recorded as an error for that ID,
    unless options['raise'] is truthy, in which case it propagates. When any
    errors occurred, a combined report is passed to admin().

    Returns the IDs that were OK: saved ones first, then skipped ones.
    """
    errors = []
    saved = []
    skips = []

    for item_id in to_fetch:
        try:
            results = fetch_func(item_id, options, *extra_args)
        except Exception as e:
            if options.get('raise', False):
                raise
            errors.append((item_id, e, format_exception(e)))
            continue

        # .get('reason') so a result dict without a reason cannot turn a
        # skip or an error report into a KeyError.
        if results.get('ok', False):
            if results.get('saved', False):
                saved.append(item_id)
                logging.info("[%s] Updated" % item_id)
            else:
                skips.append(item_id)
                logging.warning("[%s] Skipping: %s" % (item_id, results.get('reason')))
        else:
            errors.append((item_id, results, None))
            logging.error("[%s] Error: %s" % (item_id, results.get('reason')))

    if errors:
        report = ["\nErrors for %s items:\n" % len(errors)]
        for item_id, error, formatted in errors:
            report.append("\n\n")
            if isinstance(error, Exception):
                report.append("[%s] Exception:\n\n" % item_id)
                report.append(formatted)
            else:
                report.append("[%s] %s" % (item_id, error))
        admin("".join(report))  # email if possible

    logging.warning("\nErrors for %s." % len(errors))
    logging.warning("Skipped %s." % len(skips))
    logging.warning("Saved data for %s." % len(saved))

    return saved + skips  # all of the OK's
# ZIP archives found in the cache directory, keyed by archive path. They are
# kept open because instantiating a ZipFile is slow.
_download_zip_files = {}


def _wget_available():
    """Return True when a `wget` executable can be located on this system."""
    finder = "where wget" if platform.system() == 'Windows' else "which wget"
    with open(os.devnull, 'w') as devnull:
        return subprocess.call(finder, stdout=devnull, stderr=devnull, shell=True) == 0


def download(url, destination=None, options={}):
    """Download file at `url`, cache to `destination`.

    Options (all optional; the dict is only read, never modified):
      force         -- ignore any cached copy and download again
      to_cache      -- save under the cache directory (default); False saves
                       to `destination` exactly as given
      binary        -- treat the content as bytes; otherwise it is decoded as
                       UTF-8 and HTML entities are unescaped
      test          -- use the test fixture cache directory
      postdata      -- send a POST request with this data
      timeout       -- timeout in seconds (default 30)
      needs_content -- False when the caller only needs success/failure; only
                       honoured for binary, non-POST downloads

    Returns the body, True when the content was not needed and the file is
    available on disk, or None when the download failed or was empty. Raises
    an Exception when `force` is set but the file lives inside a cached ZIP
    archive, which cannot be updated.
    """
    # uses cache by default, override (True) to ignore
    force = options.get('force', False)

    # saves in cache dir by default, override (False) to save to exact destination
    to_cache = options.get('to_cache', True)

    # unescapes HTML encoded characters by default, set this (True) to not do that
    is_binary = options.get('binary', False)

    # used by test suite to use special (versioned) test cache dir
    test = options.get('test', False)

    # if need a POST request with data
    postdata = options.get('postdata', False)

    timeout = float(options.get('timeout', 30))  # The low level socket api requires a float
    urlopen_kwargs = {'timeout': timeout}

    # caller cares about actually bytes or only success/fail
    needs_content = options.get('needs_content', True) or not is_binary or postdata

    # form the path to the file if we intend on saving it to disk
    if destination:
        if to_cache:
            if test:
                cache = test_cache_dir()
            else:
                cache = cache_dir()
            cache_path = os.path.join(cache, destination)
        else:
            cache_path = destination

    # If we are working in the cache directory, look for a zip file
    # anywhere along the path like "cache/93/bills.zip", and see if
    # the file is already cached inside it (e.g. as 'bills/pages/...").
    # If it is, and force is true, then raise an Exception because we
    # can't update the ZIP file with new content (I imagine it would
    # be very slow). If force is false, return the content from the
    # archive.
    if destination and to_cache:
        dparts = destination.split(os.sep)
        for i in xrange(len(dparts) - 1):
            # form the ZIP file name and test if it exists...
            zfn = os.path.join(cache, *dparts[:i + 1]) + ".zip"
            if not os.path.exists(zfn):
                continue

            # load and keep the ZIP file instance in memory because it's slow to instantiate this object
            zf = _download_zip_files.get(zfn)
            if not zf:
                zf = zipfile.ZipFile(zfn, "r")
                _download_zip_files[zfn] = zf
                logging.warning("Loaded: %s" % zfn)

            # see if the inner file exists, and if so read the bytes
            try:
                zfn_inner = os.path.join(*dparts[i:])
                body = zf.read(zfn_inner)
            except KeyError:
                # does not exist
                continue

            if not test:
                logging.info("Cached: (%s, %s)" % (zfn + "#" + zfn_inner, url))
            if force:
                raise Exception("Cannot re-download a file already cached to a ZIP file.")

            if not is_binary:
                body = body.decode("utf8")
                body = unescape(body)

            return body

    # Load the file from disk if it's already been downloaded and force is False.
    if destination and (not force) and os.path.exists(cache_path):
        if not test:
            logging.info("Cached: (%s, %s)" % (cache_path, url))
        if not needs_content:
            return True
        # Read bytes: the cache holds either raw binary content or UTF-8
        # encoded text, and text mode can corrupt binary data on platforms
        # that translate line endings.
        with open(cache_path, 'rb') as f:
            body = f.read()
        if not is_binary:
            body = body.decode("utf8")

    # Download from the network and cache to disk.
    else:
        try:
            logging.info("Downloading: %s" % url)

            if postdata:
                response = scraper.urlopen(url, 'POST', postdata, **urlopen_kwargs)
            else:
                # If we're just downloading the file and the caller doesn't
                # need the response data, then starting wget to download the
                # file is much faster for large files. Don't know why. Something
                # hopefully we can improve in scrapelib in the future.
                #
                # needs_content is currently only set to false when downloading
                # bill text files like PDFs.
                #
                # Skip this fast path if wget is not present, and also when
                # there is no destination for wget to write to. The wget
                # probe spawns a process, so it comes last and only runs
                # when the fast path is otherwise applicable.
                if destination and not needs_content and _wget_available():
                    mkdir_p(os.path.dirname(cache_path))
                    if subprocess.call(["wget", "-q", "-O", cache_path, url]) == 0:
                        return True
                    else:
                        # wget failed. when that happens it leaves a zero-byte file on disk, which
                        # for us means we've created an invalid file, so delete it.
                        if os.path.exists(cache_path):
                            os.unlink(cache_path)
                        return None

                response = scraper.urlopen(url, **urlopen_kwargs)

            if not is_binary:
                body = response  # a subclass of a 'unicode' instance
                if not isinstance(body, unicode):
                    raise ValueError("Content not decoded.")
            else:
                body = response.bytes  # a 'str' instance
                if isinstance(body, unicode):
                    raise ValueError("Binary content improperly decoded.")
        except scrapelib.HTTPError as e:
            logging.error("Error downloading %s:\n\n%s" % (url, format_exception(e)))
            return None

        # don't allow 0-byte files
        if (not body) or (not body.strip()):
            return None

        # cache content to disk
        if destination:
            write(body if is_binary else body.encode("utf8"), cache_path)

    if not is_binary:
        body = unescape(body)

    return body
def write(content, destination, options={}):
    """Write `content` to `destination`, creating parent directories as needed.

    With options['diff'] set, nothing is written to `destination`. Instead
    the content is compared with what is already on disk and, when they
    differ, `diff -u` prints the difference to the console. `options` is
    only read, never modified.
    """
    if options.get("diff"):
        # Instead of writing the file, do a comparison with what's on disk
        # to test any changes. But be nice and replace any update date with
        # what's in the previous file so we avoid spurious changes. Use
        # how updated_at appears in the JSON and in the XML.
        if os.path.exists(destination):
            with open(destination) as f:
                existing_content = f.read()
            for pattern in ('"updated_at": ".*?"', 'updated=".*?"'):
                m1 = re.search(pattern, existing_content)
                m2 = re.search(pattern, content)
                if m1 and m2:
                    content = content.replace(m2.group(0), m1.group(0))

            # Avoid writing to disk and spawning `diff` by checking if
            # the files match in memory.
            if content == existing_content:
                return

        # Run `diff` and let it display output directly to the console.
        # `content` goes to a private temporary file first so diff can see
        # it. A fixed, predictable name in /tmp could collide with a
        # concurrent run or be pre-created by another user.
        import tempfile
        fd, fn = tempfile.mkstemp(prefix="congress-changed-file-")
        try:
            with os.fdopen(fd, 'w') as f:
                f.write(content)
            try:
                # Argument list, not a shell string: paths containing spaces
                # or shell metacharacters are passed through untouched.
                subprocess.call(["diff", "-u", destination, fn])
            except OSError as exc:
                logging.error("Could not run diff: %s" % exc)
        finally:
            os.unlink(fn)
        return

    # Save the content to disk. A bare filename has no directory part, and
    # there is nothing to create in that case.
    directory = os.path.dirname(destination)
    if directory:
        mkdir_p(directory)
    with open(destination, 'w') as f:
        f.write(content)
def write_json(data, destination):
    """Serialize `data` as indented, key-sorted JSON and write it to `destination`.

    Values the json module cannot serialize itself are passed through
    format_datetime.
    """
    serialized = json.dumps(data, sort_keys=True, indent=2, default=format_datetime)
    return write(serialized, destination)
def read(destination):
    """Return the contents of the file at `destination`, or None if it does not exist."""
    if not os.path.exists(destination):
        return None
    with open(destination) as handle:
        return handle.read()
def merge(dict1, dict2):
    """Return a new dict with dict1's items overwritten by anything in dict2.

    Neither input is modified.
    """
    # Copy-and-update instead of `dict1.items() + dict2.items()`, which only
    # works where items() returns a list (Python 2).
    merged = dict(dict1)
    merged.update(dict2)
    return merged
# de-dupe a list, taken from:
# http://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-python-whilst-preserving-order
def uniq(seq):
    """Return the items of `seq` as a list, keeping only the first occurrence of each."""
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
import os
import errno
# mkdir -p in python, from:
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
    """Create directory `path` and any missing parents; do nothing if it already exists.

    An empty `path` (e.g. os.path.dirname() of a bare filename) is a no-op.
    Raises OSError when the directory cannot be created, including when
    `path` already exists but is not a directory.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # EEXIST is only harmless when what exists is actually a directory.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def xpath_regex(doc, element, pattern):
    """Return `doc.xpath()` results for `element` nodes whose text() matches `pattern`.

    The match uses the EXSLT regular-expressions namespace. `element` and
    `pattern` are interpolated into the XPath expression as-is.
    """
    query = "//%s[re:match(text(), '%s')]" % (element, pattern)
    exslt_namespaces = {"re": "http://exslt.org/regular-expressions"}
    return doc.xpath(query, namespaces=exslt_namespaces)
# taken from http://effbot.org/zone/re-sub.htm#unescape-html

# Resolve the Python 2 names once, with Python 3 fallbacks, so the function
# below does not depend on which interpreter runs it.
try:
    from htmlentitydefs import name2codepoint as _name2codepoint
except ImportError:
    from html.entities import name2codepoint as _name2codepoint

try:
    _unichr = unichr
except NameError:
    _unichr = chr

# Compiled once at import time instead of on every call.
_ENTITY_RE = re.compile(r"&#?\w+;")
_CONTROL_CHARS_RE = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]')


def unescape(text):
    """Replace HTML character references and named entities in `text`.

    References that cannot be resolved (unknown entity names, malformed or
    out-of-range numbers) are left as they are. Afterwards the control
    characters U+0000-0008, U+000B-000C, U+000E-001F and U+007F are removed.
    """
    def fixup(m):
        entity = m.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[:3] == "&#x":
                    return _unichr(int(entity[3:-1], 16))
                return _unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the valid code point
                # range. OverflowError: number too large to convert at all.
                pass
        else:
            # named entity
            try:
                return _unichr(_name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return _CONTROL_CHARS_RE.sub('', _ENTITY_RE.sub(fixup, text))
def extract_bills(text, session):
    """Find bill citations such as "H.R. 1234" or "S. 5" in `text`.

    Each citation is lowercased, stripped of whitespace and dots, has "con"
    shortened to "c", and is suffixed with "-<session>". Returns the
    resulting IDs without duplicates, in order of first appearance.
    """
    citation_re = re.compile(
        r'((S\.|H\.)(\s?J\.|\s?R\.|\s?Con\.| ?)(\s?Res\.)*\s?\d+)',
        flags=re.IGNORECASE)

    bill_ids = []
    for match in citation_re.findall(text):
        # The pattern tolerates any whitespace (\s) between the parts, so
        # strip all of it. Removing only literal spaces left tabs and
        # newlines embedded in the ID.
        citation = re.sub(r"\s", "", match[0].lower())
        citation = citation.replace('.', '').replace("con", "c")
        bill_id = "%s-%s" % (citation, session)
        if bill_id not in bill_ids:
            bill_ids.append(bill_id)

    return bill_ids
def cache_dir():
    """Return the cache directory: config['output']['cache'] if set, else "cache"."""
    output = config.get('output', None) if config else None
    configured = output.get('cache', None) if output else None
    return configured if configured else "cache"
def test_cache_dir():
    """Return the cache directory used when the 'test' option is set."""
    fixture_cache = "test/fixtures/cache"
    return fixture_cache
def data_dir():
    """Return the data directory: config['output']['data'] if set, else "data"."""
    output = config.get('output', None) if config else None
    configured = output.get('data', None) if output else None
    return configured if configured else "data"
def admin(body):
    """Report `body` to the administrator.

    An Exception instance is formatted first. The message is always logged
    as an error, and emailed as well when the config has an 'email' section.
    Any failure while reporting is printed rather than raised, so reporting
    an error can never trigger another report.
    """
    try:
        if isinstance(body, Exception):
            body = format_exception(body)

        logging.error(body)  # always print it

        if config and config.get('email', None):
            send_email(body)
    except Exception as exception:
        print("Exception logging message to admin, halting as to avoid loop")
        print(format_exception(exception))
def format_exception(exception):
    """Return a formatted traceback string for an exception.

    While an exception is being handled, the traceback of that active
    exception is used. Outside an `except` block there is no active
    exception, so the function falls back to formatting `exception` itself
    instead of producing an empty report.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    if exc_value is None and exception is not None:
        exc_type = type(exception)
        exc_value = exception
        # Only Python 3 exceptions carry their own traceback.
        exc_traceback = getattr(exception, "__traceback__", None)
    return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
# this should only be called if the settings are definitely there
def send_email(message):
    """Email `message` using the 'email' section of the config.

    Reads hostname, starttls, user_name, password, from, from_name, to and
    subject from config['email'].
    """
    settings = config['email']
    sender = settings['from']
    recipient = settings['to']

    # adapted from http://www.doughellmann.com/PyMOTW/smtplib/
    mail = MIMEText(message)
    mail.set_unixfrom('author')
    mail['To'] = email.utils.formataddr(('Recipient', recipient))
    mail['From'] = email.utils.formataddr((settings['from_name'], sender))
    mail['Subject'] = settings['subject']

    connection = smtplib.SMTP(settings['hostname'])
    try:
        connection.ehlo()
        if settings['starttls'] and connection.has_extn('STARTTLS'):
            connection.starttls()
            connection.ehlo()

        connection.login(settings['user_name'], settings['password'])
        connection.sendmail(sender, [recipient], mail.as_string())
    finally:
        connection.quit()

    logging.info("Sent email to %s" % recipient)
# Maps each bill/amendment type used in this project to a pair of
# (two-letter THOMAS code, display abbreviation).
thomas_types = {
    'hr': ('HR', 'H.R.'),
    'hres': ('HE', 'H.RES.'),
    'hjres': ('HJ', 'H.J.RES.'),
    'hconres': ('HC', 'H.CON.RES.'),
    's': ('SN', 'S.'),
    'sres': ('SE', 'S.RES.'),
    'sjres': ('SJ', 'S.J.RES.'),
    'sconres': ('SC', 'S.CON.RES.'),
    'hamdt': ('HZ', 'H.AMDT.'),
    'samdt': ('SP', 'S.AMDT.'),
    'supamdt': ('SU', 'S.UP.AMDT.'),
}

# The reverse lookup, from THOMAS code to type: map e.g. { SE: sres, ...}
thomas_types_2 = {codes[0]: bill_type for bill_type, codes in thomas_types.items()}
# Cached committee map from names to IDs, filled in by fetch_committee_names().
# It is a single flat dict that is not keyed by Congress:
#   committee_names["<Chamber> <committee name>"] = committee ID
#   committee_names["<committee ID>|<subcommittee name>"] = two-character subcommittee ID
committee_names = {}


def fetch_committee_names(congress, options):
    """Populate `committee_names` from THOMAS's advanced search page.

    Parse the THOMAS advanced search pages for the names that THOMAS uses
    for committees on bill pages, and map those to the IDs for the
    committees that are listed on the advanced search pages (but aren't
    shown on bill pages).

    `congress` may be an int or a numeric string. options['body'] may supply
    the page HTML directly (used by fixtures); otherwise the page is
    downloaded. options['test'] suppresses the progress log line.
    """
    congress = int(congress)

    if not options.get('test', False):
        logging.info("[%d] Fetching committee names..." % congress)

    # allow body to be passed in from fixtures
    if 'body' in options:
        body = options['body']
    else:
        body = download(
            "http://thomas.loc.gov/home/LegislativeData.php?&n=BSS&c=%d" % congress,
            "%s/meta/thomas_committee_names.html" % congress,
            options)

    # The loop variables get their own names so they shadow neither the
    # `options` parameter nor the `id` builtin.
    for chamber, option_html in re.findall('>Choose (House|Senate) Committees</option>(.*?)</select>', body, re.I | re.S):
        for name, committee_id in re.findall(r'<option value="(.*?)\{(.*?)}">', option_html, re.I | re.S):
            committee_id = str(committee_id).upper()
            # NOTE(review): as written this replaces a single space with a
            # single space. Confirm whether the first literal was meant to be
            # a double or non-breaking space.
            name = name.strip().replace(" ", " ")  # weirdness
            if committee_id.endswith("00"):
                # Map chamber + committee name to its ID, minus the 00 at the end. On bill pages,
                # committees appear as e.g. "House Finance." Except the JCSE.
                if committee_id != "JCSE00":
                    name = chamber + " " + name

                # Correct for some oddness on THOMAS (but not on Congress.gov): The House Committee
                # on House Administration appears just as "House Administration" and in the 104th/105th
                # Congresses appears as "House Oversight" (likewise the full name is House Committee
                # on House Oversight --- it's the House Administration committee still).
                if name == "House House Administration":
                    name = "House Administration"
                if name == "House House Oversight":
                    name = "House Oversight"

                committee_names[name] = committee_id[0:-2]
            else:
                # map committee ID + "|" + subcommittee name to the zero-padded subcommittee numeric ID
                committee_names[committee_id[0:-2] + "|" + name] = committee_id[-2:]

    # Correct for a limited number of other ways committees appear, owing probably to the
    # committee name being changed mid-way through a Congress.
    if congress == 95:
        committee_names["House Intelligence (Select)"] = committee_names["House Intelligence (Permanent Select)"]
    if congress == 96:
        committee_names["Senate Human Resources"] = "SSHR"
    if congress == 97:
        committee_names["Senate Small Business (Select)"] = committee_names["Senate Small Business"]
    if congress == 98:
        committee_names["Senate Indian Affairs (Select)"] = committee_names["Senate Indian Affairs (Permanent Select)"]
    if congress == 100:
        committee_names["HSPO|Hoc Task Force on Presidential Pay Recommendation"] = committee_names["HSPO|Ad Hoc Task Force on Presidential Pay Recommendation"]
    if congress == 103:
        committee_names["Senate Indian Affairs (Permanent Select)"] = committee_names["Senate Indian Affairs"]
    if congress == 108:
        # This appears to be a mistake, a subcommittee appearing as a full committee. Map it to
        # the full committee for now.
        committee_names["House Antitrust (Full Committee Task Force)"] = committee_names["House Judiciary"]
        committee_names["House Homeland Security"] = committee_names["House Homeland Security (Select)"]
    if 108 <= congress < 113:
        committee_names["House Intelligence"] = committee_names["House Intelligence (Permanent Select)"]
def make_node(parent, tag, text, **attrs):
    """Make a node in an XML document.

    Creates a `tag` element with the given text, appends it to `parent` and
    returns it. Keyword arguments become attributes: None values are
    skipped, datetimes go through format_datetime, and any "___" in the
    keyword name is removed.
    """
    node = etree.Element(tag)
    parent.append(node)
    node.text = text

    for key, value in attrs.items():
        if value is not None:
            if isinstance(value, datetime.datetime):
                value = format_datetime(value)
            node.set(key.replace("___", ""), value)

    return node
# Correct mistakes on THOMAS
def thomas_corrections(thomas_id):
    """Return the corrected THOMAS ID for the known-bad IDs, else `thomas_id` unchanged."""
    corrections = {
        "02188": "01728",  # C.A. Dutch Ruppersberger
        "01594": "02085",  # Pat Toomey
    }
    return corrections.get(thomas_id, thomas_id)
def slice_map(m, *args):
    """Return a new dict holding only the keys named in `args` that exist in `m`."""
    return {key: m[key] for key in args if key in m}
def direct_yaml_load(filename):
    """Load a YAML file directly, bypassing the pickle cache.

    Uses PyYAML's C loader when it is available and falls back to the pure
    Python loader otherwise.
    """
    import yaml
    try:
        from yaml import CLoader as Loader
    except ImportError:
        from yaml import Loader
    # NOTE(review): the full (non-safe) Loader can construct arbitrary Python
    # objects; only use this on trusted files.
    # Close the file deterministically instead of leaking the handle.
    with open(filename) as yaml_file:
        return yaml.load(yaml_file, Loader=Loader)
def pickle_load(filename):
    """Load and return the object stored in the pickle file `filename`."""
    import pickle
    # NOTE(review): unpickling executes code chosen by whoever wrote the
    # file; only use this on cache files this program wrote itself.
    # Binary mode: pickle data is bytes and must not pass through newline or
    # text decoding. The handle is closed deterministically.
    with open(filename, "rb") as pickle_file:
        return pickle.load(pickle_file)
def pickle_write(data, filename):
    """Pickle `data` into `filename`, creating parent directories as needed."""
    import pickle
    # A bare filename has no directory part; there is nothing to create then.
    directory = os.path.dirname(filename)
    if directory:
        mkdir_p(directory)
    # Binary mode: pickle data is bytes. The handle is closed (and therefore
    # flushed) before returning.
    with open(filename, "wb") as pickle_file:
        return pickle.dump(data, pickle_file)
def get_file_hash(filename):
    """Return the SHA-1 hex digest used to verify the contents of `filename`."""
    import hashlib
    # Hash the raw bytes: text mode can alter the data (newline translation)
    # and hashlib requires bytes on Python 3.
    with open(filename, "rb") as hashed_file:
        return hashlib.sha1(hashed_file.read()).hexdigest()
def get_cache_filename(filename):
    """Return the location of the cached (pickled) version of `filename`."""
    pickled_name = filename + '.pickle'
    return os.path.join(cache_dir(), pickled_name)
def check_cached_file(filename, cache_filename):
    """Return True when `cache_filename` exists and was modified after `filename`."""
    if not os.path.exists(cache_filename):
        return False
    cache_mtime = os.stat(cache_filename).st_mtime
    source_mtime = os.stat(filename).st_mtime
    return cache_mtime > source_mtime
# Problem with finding a cache entry.
def cache_write(file_data, filename, file_hash):
    """Cache `file_data` in the pickle file `filename`, stored together with `file_hash`."""
    return pickle_write({"hash": file_hash, "data": file_data}, filename)
def yaml_load(filename):
    """Load a YAML file, preferring a cached version over parsing it directly.

    The cache entry is looked up under the file's current hash. When
    cache_load raises CacheError, the YAML file is parsed directly and the
    result is cached for next time.
    """
    file_hash = get_file_hash(filename)
    cache_filename = get_cache_filename(filename)

    try:
        cached_data = cache_load(cache_filename, file_hash)
    except CacheError:
        # No usable cached version, so load the YAML file itself...
        logging.warning("Using original YAML file...")
        fresh_data = direct_yaml_load(filename)

        # ...and cache it so we can retrieve it more quickly next time.
        cache_write(fresh_data, cache_filename, file_hash)
        return fresh_data

    logging.info("Using cached YAML file...")
    return cached_data
# Make sure we have the congress-legislators repository available.
has_congress_legislators_repo = False


def _run_git(args, cwd=None):
    """Run `git` with `args` (optionally inside `cwd`); return True on success."""
    try:
        return subprocess.call(["git"] + list(args), cwd=cwd) == 0
    except OSError as exc:
        # git is not installed, or `cwd` does not exist.
        logging.error("Could not run git %s: %s" % (" ".join(args), exc))
        return False


def require_congress_legislators_repo():
    """Clone the congress-legislators repo if it is missing, then update it.

    The update is skipped when the UPDATE_CONGRESS_LEGISLATORS environment
    variable is "NO". The work is done at most once per process. git
    failures are logged but do not raise.
    """
    global has_congress_legislators_repo

    # Once we have the congress-legislators repo, we don't need to keep getting it.
    if has_congress_legislators_repo:
        return

    repo_dir = "congress-legislators"

    # Clone the congress-legislators repo if we don't have it.
    if not os.path.exists(repo_dir):
        logging.warning("Cloning the congress-legislators repo...")
        cloned = _run_git(["clone", "-q", "--depth", "1",
                           "https://github.com/unitedstates/congress-legislators",
                           repo_dir])
        if not cloned:
            logging.error("Failed to clone the congress-legislators repo.")

    if os.environ.get("UPDATE_CONGRESS_LEGISLATORS") != "NO":
        # Update the repo so we have the latest.
        logging.warning("Updating the congress-legislators repo...")
        # these two == git pull, but git pull ignores -q on the merge part so is less quiet
        #
        # Each command runs with cwd set to the repo. With the shell chain
        # `cd repo; git fetch; git merge`, a failed `cd` would leave the git
        # commands running in whatever directory the process started from.
        fetched = _run_git(["fetch", "-pq"], cwd=repo_dir)
        merged = _run_git(["merge", "--ff-only", "-q", "origin/master"], cwd=repo_dir)
        if not (fetched and merged):
            logging.warning("Could not update the congress-legislators repo.")

    # We now have the congress-legislators repo.
    has_congress_legislators_repo = True
# Empty until the first lookup, then a dict from Congress number to a list of
# (moc, term) tuples that might be in that Congress.
lookup_legislator_cache = []


def lookup_legislator(congress, role_type, name, state, party, when, id_requested, exclude=set()):
    """Look up a legislator's ID from name, state, party and a date.

    This is a basic lookup function given the legislator's name, state,
    party, and the date of the vote.

    `name` is "Last" or "Last, First". `when` is a date or datetime.
    `id_requested` is the key of the ID to return from the legislator's "id"
    mapping. Legislators whose requested ID is in `exclude` are skipped
    (`exclude` is only read, never modified).

    Returns the requested ID when exactly one legislator matches. Otherwise
    a warning is logged and None is returned.
    """
    # On the first load, cache all of the legislators' terms in memory.
    # Group by Congress so we can limit our search later to be faster.
    global lookup_legislator_cache
    if not lookup_legislator_cache:
        require_congress_legislators_repo()
        lookup_legislator_cache = {}  # from Congress number to list of (moc,term) tuples that might be in that Congress
        for filename in ("legislators-historical", "legislators-current"):
            for moc in yaml_load("congress-legislators/%s.yaml" % (filename)):
                for term in moc["terms"]:
                    for c in xrange(congress_from_legislative_year(int(term['start'][0:4])) - 1,
                                    congress_from_legislative_year(int(term['end'][0:4])) + 1 + 1):
                        lookup_legislator_cache.setdefault(c, []).append((moc, term))

    def to_ascii(name):
        # Hyphens become spaces; for unicode strings, combining marks are dropped.
        name = name.replace("-", " ")
        if not isinstance(name, unicode):
            return name
        import unicodedata
        return u"".join(c for c in unicodedata.normalize('NFKD', name) if not unicodedata.combining(c))

    # Scan all of the terms that cover 'when' for a match.
    if isinstance(when, datetime.datetime):
        when = when.date()
    when = when.isoformat()
    name_parts = to_ascii(name).split(", ", 1)
    matches = []
    # .get(): a Congress with no cached terms simply yields no candidates
    # (reported below as "could not match") rather than raising KeyError.
    for moc, term in lookup_legislator_cache.get(congress, []):
        # Make sure the date is surrounded by the term start/end dates.
        if term['start'] > when:
            continue  # comparing ISO-formatted date strings
        if term['end'] < when:
            continue  # comparing ISO-formatted date strings

        # Compare the role type, state, and party, except for people who we know changed party.
        if term['type'] != role_type:
            continue
        if term['state'] != state:
            continue
        if term['party'][0] != party and name not in ("Laughlin", "Crenshaw", "Goode", "Martinez", "Parker", "Emerson", "Tauzin", "Hayes", "Deal", "Forbes"):
            continue

        # When doing process-of-elimination matching, don't match on people we've already seen.
        if moc["id"].get(id_requested) in exclude:
            continue

        # Compare the last name. Allow "Chenoweth" to match "Chenoweth Hage", but also
        # allow "Millender McDonald" to match itself.
        for name_info_rec in [moc['name']] + moc.get('other_names', []):
            # for other_names, check that the record covers the right date range
            if 'start' in name_info_rec and name_info_rec['start'] > when:
                continue  # comparing ISO-formatted date strings
            if 'end' in name_info_rec and name_info_rec['end'] < when:
                continue  # comparing ISO-formatted date strings

            # in order to process an other_name we have to go like this...
            name_info = dict(moc['name'])  # clone
            name_info.update(name_info_rec)  # override with the other_name information

            # check last name
            if name_parts[0] != to_ascii(name_info['last']) \
                    and name_parts[0] not in to_ascii(name_info['last']).split(" "):
                continue  # no match

            # Compare the first name. Allow it to match either the first or middle name,
            # and an initialized version of the first name (i.e. "E." matches "Eddie").
            # Test the whole string (so that "Jo Ann" is compared to "Jo Ann") but also
            # the first part of a string split (so "E. B." is compared as "E." to "Eddie").
            first_names = (to_ascii(name_info['first']), to_ascii(name_info.get('nickname', "")), to_ascii(name_info['first'])[0] + ".")
            if len(name_parts) >= 2 and \
                    name_parts[1] not in first_names and \
                    name_parts[1].split(" ")[0] not in first_names:
                continue

            break  # match
        else:
            # no name record matched
            continue

        # This is a possible match.
        matches.append((moc, term))

    # Return if there is a unique match.
    if len(matches) == 0:
        logging.warning("Could not match name %s (%s-%s; %s) to any legislator." % (name, state, party, when))
        return None
    if len(matches) > 1:
        logging.warning("Multiple matches of name %s (%s-%s; %s) to legislators (excludes %s)." % (name, state, party, when, str(exclude)))
        return None
    return matches[0][0]['id'][id_requested]
def create_legislators_map(map_from, map_to, map_function, filename="legislators-current", legislators_map=None):
    """Create a map from one piece of legislators data to another.

    'map_from' and 'map_to' are plain text terms used for the logging output
    and the filenames.
    'map_function' is the function that actually does the mapping from one
    value to another; it is called as map_function(map_so_far, item) for
    every item in the YAML file and must return the updated map.
    'filename' is the source of the data to be mapped.
    (Default: "legislators-current")
    'legislators_map' is the base object to build the map on top of; it's
    primarily used to combine maps using create_combined_legislators_map().
    (Default: a new empty dict)

    NOTE(review): when a cached pickle is newer than the YAML file, the
    cached map is returned as-is and the `legislators_map` passed in is not
    used -- confirm this is intended for combined maps.
    """
    # A fresh dict per call: map functions update the map in place, so a
    # shared default dict would leak entries between unrelated calls.
    if legislators_map is None:
        legislators_map = {}

    # Make sure we have the congress-legislators repo available.
    require_congress_legislators_repo()

    source_path = "congress-legislators/%s.yaml" % (filename)
    cache_filename = get_cache_filename("map-%s-%s-%s" % (map_from.lower().replace(" ", "_"), map_to.lower().replace(" ", "_"), filename))

    # Check if the cached pickle file is newer than the original YAML file.
    if check_cached_file(source_path, cache_filename):
        # The pickle file is newer, so it's probably safe to use the cached map.
        logging.info("Using cached map from %s to %s for %s..." % (map_from, map_to, filename))
        legislators_map = pickle_load(cache_filename)
    else:
        # The YAML file is newer, so we have to generate a new map.
        logging.warning("Generating new map from %s to %s for %s..." % (map_from, map_to, filename))

        # Load the YAML file and create a map based on the provided map function.
        # Because we'll be caching the YAML file in a pickled file, create the cache
        # directory where that will be stored. It lives under the configured
        # cache directory, which is where get_cache_filename() points.
        mkdir_p(os.path.join(cache_dir(), "congress-legislators"))

        for item in yaml_load(source_path):
            legislators_map = map_function(legislators_map, item)

        # Save the new map to a new pickle file.
        pickle_write(legislators_map, cache_filename)

    return legislators_map
def create_combined_legislators_map(map_from, map_to, map_function, filenames=["executive", "legislators-historical", "legislators-current"]):
    """Create a legislators map combining data from multiple legislators files.

    'map_from', 'map_to', 'map_function' are passed directly to
    create_legislators_map().
    'filenames' is the list of the sources of the data to be mapped; the map
    returned for one file is handed on as the base for the next.
    (Default: [ "executive", "legislators-historical", "legislators-current" ])
    """
    merged_map = {}
    for source_name in filenames:
        merged_map = create_legislators_map(map_from, map_to, map_function, source_name, merged_map)
    return merged_map
# Map between a person's many IDs, shaped as
# person_id_map[source_id_type][source_id][target_id_type] = target_id.
person_id_map = {}


def generate_person_id_map():
    """Build the person ID map and store it in the module-level `person_id_map`."""
    global person_id_map

    def map_function(person_id_map, person):
        ids = person["id"]
        for source_id_type, source_value in ids.items():
            # Instantiate this ID type if needed.
            ids_of_type = person_id_map.setdefault(source_id_type, {})

            # Certain ID types have multiple IDs.
            source_ids = source_value if isinstance(source_value, list) else [source_value]

            for source_id in source_ids:
                # Instantiate this value for this ID type if needed.
                targets = ids_of_type.setdefault(source_id, {})

                # Map every other ID type and value to this ID; never an ID type to itself.
                for target_id_type, target_id in ids.items():
                    if target_id_type != source_id_type:
                        targets[target_id_type] = target_id

        return person_id_map

    # Make the person ID map available in the global space.
    person_id_map = create_combined_legislators_map("person", "ID", map_function)
def get_person_id_map():
    """Return the person ID map, generating it first if it is still empty."""
    global person_id_map

    map_is_empty = not person_id_map
    if map_is_empty:
        generate_person_id_map()

    return person_id_map
def get_person_id(source_id_type, source_id, target_id_type):
    """Get a particular ID for a person from another ID.

    'source_id_type' is the ID type provided to identify the person.
    'source_id' is the provided ID of the aforementioned type.
    'target_id_type' is the desired ID type for the aforementioned person.

    Raises KeyError when the ID type, the ID itself, or the desired target
    ID is not known.
    """
    ids_by_type = get_person_id_map()
    if source_id_type not in ids_by_type:
        raise KeyError("'%s' is not a valid ID type." % (source_id_type))

    ids_of_type = ids_by_type[source_id_type]
    if source_id not in ids_of_type:
        raise KeyError("'%s' is not a valid '%s' ID." % (source_id, source_id_type))

    targets = ids_of_type[source_id]
    if target_id_type not in targets:
        raise KeyError("No corresponding '%s' ID for '%s' ID '%s'." % (target_id_type, source_id_type, source_id))

    return targets[target_id_type]
# Map from a person's bioguide ID to the sorted list of Congresses they served during.
person_congresses_map = {}


def generate_person_congresses_map():
    """Build the person-to-Congresses map and store it in `person_congresses_map`."""
    global person_congresses_map

    def map_function(person_congresses_map, person):
        try:
            bioguide_id = person["id"]["bioguide"]
        except KeyError:
            # No bioguide ID to key this person by, so leave the map untouched.
            return person_congresses_map

        served = person_congresses_map.setdefault(bioguide_id, [])
        for term in person["terms"]:
            served.extend(get_term_congresses(term))
        served.sort()

        return person_congresses_map

    # Make the person congresses map available in the global space.
    person_congresses_map = create_combined_legislators_map("person", "Congresses", map_function)
def get_person_congresses_map():
    """Return the person-to-Congresses map, generating it first if it is still empty."""
    global person_congresses_map

    map_is_empty = not person_congresses_map
    if map_is_empty:
        generate_person_congresses_map()

    return person_congresses_map
def get_person_congresses(person_id, person_id_type="bioguide"):
    """Get a list of Congresses that a person served during.

    'person_id' is the ID of the desired person.
    'person_id_type' is the ID type provided. (Default: "bioguide")

    Raises KeyError when no Congresses are known for the person.
    """
    if person_id_type == "bioguide":
        bioguide_id = person_id
    else:
        bioguide_id = get_person_id(person_id_type, person_id, "bioguide")

    congresses_by_person = get_person_congresses_map()
    if bioguide_id not in congresses_by_person:
        raise KeyError("No known Congresses for BioGuide ID '%s'." % (bioguide_id))

    return congresses_by_person[bioguide_id]
# Map from a Congress to the set of bioguide IDs of the persons who served during it.
congress_persons_map = {}


def generate_congress_persons_map():
    """Build the Congress-to-persons map and store it in `congress_persons_map`."""
    global congress_persons_map

    def map_function(congress_persons_map, person):
        try:
            bioguide_id = person["id"]["bioguide"]
        except KeyError:
            # No bioguide ID to record for this person, so leave the map untouched.
            return congress_persons_map

        for term in person["terms"]:
            for congress in get_term_congresses(term):
                congress_persons_map.setdefault(congress, set()).add(bioguide_id)

        return congress_persons_map

    # Make the Congress persons map available in the global space.
    congress_persons_map = create_combined_legislators_map("Congress", "persons", map_function)
def get_congress_persons_map():
    """Return the Congress-to-persons map, generating it first if it is still empty."""
    global congress_persons_map

    map_is_empty = not congress_persons_map
    if map_is_empty:
        generate_congress_persons_map()

    return congress_persons_map
def get_congress_persons(congress):
    """Get the persons who served during a particular Congress.

    'congress' is the desired Congress.

    Raises KeyError when no persons are known for that Congress.
    """
    persons_by_congress = get_congress_persons_map()
    if congress in persons_by_congress:
        return persons_by_congress[congress]
    raise KeyError("No known persons for Congress '%s'." % (congress))
# XXX: This exception is deprecated. (It has a typo.) Only use in relation to get_govtrack_person_id().
class UnmatchedIdentifer(Exception):
    """Raised by get_govtrack_person_id() when no GovTrack ID is known for an identifier."""

    def __init__(self, id_type, id_value, help_url):
        message = "%s=%s %s" % (id_type, str(id_value), help_url)
        super(UnmatchedIdentifer, self).__init__(message)
# XXX: This function is deprecated. Use get_person_id() instead.
def get_govtrack_person_id(source_id_type, source_id):
    """Return the GovTrack ID for the person identified by `source_id`.

    When get_person_id() raises KeyError, the failure is logged and
    UnmatchedIdentifer is raised instead.
    """
    try:
        return get_person_id(source_id_type, source_id, "govtrack")
    except KeyError:
        if source_id_type == "thomas":
            # Suggest a URL on congress.gov to quickly look up who the ID corresponds to.
            # We store the IDs as strings with leading zeroes like on THOMAS, but in
            # Congress.gov URLs it must not be zero-padded.
            see_also = "http://www.congress.gov/member/xxx/%d" % int(source_id)
        else:
            see_also = ""

        logging.error("GovTrack ID not known for %s %s. (%s)" % (source_id_type, str(source_id), see_also))
        raise UnmatchedIdentifer(source_id_type, source_id, see_also)