# congress/tasks/nomination_info.py

import utils
import logging
import re
import json
from datetime import datetime
from lxml.html import fromstring


# can be run on its own, just requires a nomination_id (e.g. PN2094-112)
def run(options):
    nomination_id = options.get('nomination_id', None)
    if nomination_id:
        result = fetch_nomination(nomination_id, options)
        logging.warn("\n%s" % result)
    else:
        logging.error("To run this task directly, supply a nomination_id.")
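
# Example invocation, assuming the repo's usual "./run <task>" entry point
# (the exact command is an assumption -- this file only defines run(options)
# for the task runner):
#
#   ./run nomination_info --nomination_id=PN2094-112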


# download and cache page for nomination
def fetch_nomination(nomination_id, options={}):
    logging.info("\n[%s] Fetching..." % nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch the nomination's information page
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"), options)
    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    # TODO:
    #   detect group nominations, particularly for military promotions
    #   detect when a group nomination is split into subnominations
    #
    # Also, the splitting process is nonsense:
    # http://thomas.loc.gov/home/PN/split.htm
    if "split into two or more parts" in body:
        return {'saved': False, 'ok': True, 'reason': 'was split'}

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)

    return {'ok': True, 'saved': True}


def parse_nomination(nomination_id, body, options):
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # remove (and store) comments, which contain some info for the nomination
    # but also mess up the parser
    facts = re.findall("<!--(.+?)-->", body)
    body = re.sub("<!--.+?-->", "", body)

    doc = fromstring(body)

    # get rid of centered bold labels (e.g. agency names on PN1375-113),
    # they screw up the label/tail heuristic below
    for elem in doc.xpath('//div[@align="center"]'):
        elem.getparent().remove(elem)

    committee_names = []
    committees = []

    info = {
        'nomination_id': nomination_id, 'actions': []
    }

    # the markup on these pages is a disaster, so use a heuristic: find each
    # boldface/inline label tag, and treat the text that trails it as that
    # label's value
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if pair.tail:
            text = pair.text or pair.text_content()
            label, data = text.replace(':', '').strip(), pair.tail.strip()
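
            # e.g. markup like "<strong>Organization:</strong> Department of Justice"
            # yields label "Organization" and data "Department of Justice"
            # (example text is illustrative, not taken from a specific page)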

            # handle actions separately
            if label.split(" ")[-1] == "Action":
                pieces = re.split("\s+\-\s+", data)
                location = label.split(" ")[0].lower()

                # use 'acted_at', even though it's always a date, to be consistent
                # with the acted_at field on bills and amendments
                acted_at = datetime.strptime(pieces[0], "%B %d, %Y").strftime("%Y-%m-%d")

                # join the rest back together (in case the action text itself has a hyphen)
                text = " - ".join(pieces[1:])

                info['actions'].append({
                    "type": "action",
                    "location": location,
                    "acted_at": acted_at,
                    "text": text
                })
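
                # the append above produces entries like (values illustrative):
                #   {"type": "action", "location": "senate",
                #    "acted_at": "1995-01-04", "text": "Received in the Senate."}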
            else:
                # let's handle these cases one by one
                if label == "Organization":
                    info["organization"] = data
                elif label == "Control Number":
                    # this doesn't seem useful
                    pass
                elif label.lower() == "referred to":
                    committee_names.append(data)
                elif label == "Reported by":
                    info["reported_by"] = data
                elif label == "Nomination":
                    # sanity check - verify the page's nomination ID matches ours
                    if nomination_id != data:
                        raise Exception("Whoa! Mismatched nomination ID.")
                elif label == "Date Received":
                    # Note: will break with the 1000th Congress in the year 3789
                    match = re.search("(\d{2,3})[stndhr]{2}", data)
                    if match:
                        info["congress"] = int(match.group(1))
                    else:
                        raise Exception("Choked, couldn't find Congress in \"%s\"" % data)

                    # doc format is: "January 04, 1995 (104th Congress)"
                    info["received_on"] = datetime.strptime(data.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")
elif label == "Nominee":
# ignore any vice suffix
name = data.split(", vice")[0]
try:
name = re.search("(.+?),", name).groups()[0]
except Exception, e:
raise Exception("Couldn't parse nominee entry: %s" % name)
# Some begin "One nomination,...", so 'List of Nominees' will get it
if "nomination" in name:
pass
# and grab the state and position out of the comment facts
if facts[-5]:
position = facts[-5]
else:
raise Exception("Couldn't find the position in the comments.")
info["nominees"] = [{
"name": name,
"position": position,
"state": facts[-6][2:]
}]
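
                    # the facts[-5]/facts[-6] offsets assume THOMAS emits the
                    # position and state as the 5th- and 6th-from-last HTML
                    # comments on the page - an observed pattern, not a
                    # documented format - with facts[-6] carrying a 2-character
                    # prefix before the state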
elif label.lower() == "nominees":
pass
elif label.lower() == "authority date":
pass
elif label.lower() == "list of nominees":
# step through each sibling, collecting each br's stripped tail for names as we go
# stop when we get to a strong or span (next label)
nominees = []
current_position = None
for sibling in pair.itersiblings():
if sibling.tag == "br":
if sibling.tail:
name = sibling.tail.strip()
if (name[0:5].lower() == "to be"):
current_position = name[6:].strip()
elif name:
nominees.append({
"name": sibling.tail.strip(),
"position": current_position
})
elif (sibling.tag == "strong") or (sibling.tag == "span"):
break
info["nominees"] = nominees
                else:
                    # choke - we think we handle all of the labels now
                    raise Exception("Unrecognized label: %s" % label)

    if not info.get("received_on", None):
        raise Exception("Choked, couldn't find received date.")

    if not info.get("nominees", None):
        raise Exception("Choked, couldn't find nominee info.")

    # try to normalize committee names to IDs, and choke if one doesn't
    # resolve - the names should match up
    for committee_name in committee_names:
        committee_id = utils.committee_names[committee_name]
        committees.append(committee_id)
    info["referred_to"] = committees
    info["referred_to_names"] = committee_names

    return info


# directory helpers

def output_for_nomination(nomination_id, format):
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    return "%s/%s/nominations/%s/%s" % (utils.data_dir(), congress, number, "data.%s" % format)


def nomination_url_for(nomination_id):
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # numbers can be either of the form "63" or "64-01";
    # a missing suffix is treated as "00"
    number_pieces = number.split("-")
    if len(number_pieces) == 1:
        number_pieces.append("00")
    url_number = "%05d%s" % (int(number_pieces[0]), number_pieces[1])

    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (int(congress), nomination_type.upper(), url_number)


def nomination_cache_for(nomination_id, file):
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    return "%s/nominations/%s/%s" % (congress, number, file)


def output_nomination(nomination, options):
    logging.info("[%s] Writing to disk..." % nomination['nomination_id'])

    # output JSON - so easy!
    utils.write(
        json.dumps(nomination, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_nomination(nomination['nomination_id'], "json")
    )
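
# the resulting data.json carries the fields parse_nomination() sets above;
# values here are illustrative:
# {
#   "nomination_id": "PN1375-113",
#   "congress": 113,
#   "received_on": "2014-05-01",
#   "organization": "...",
#   "referred_to": ["..."],
#   "referred_to_names": ["..."],
#   "nominees": [{"name": "...", "position": "...", "state": "..."}],
#   "actions": [{"type": "action", "location": "senate",
#                "acted_at": "2014-05-01", "text": "..."}]
# }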