Files
congress/tasks/bill_info.py

1772 lines
70 KiB
Python

import utils
import logging
import re
import json
from lxml import etree
import time
import datetime
from lxml.html import fromstring, HtmlElement
# can be run on its own; only requires a bill_id
def run(options):
    """Fetch a single bill when this task is invoked directly.

    Reads ``bill_id`` from ``options``. When it is present the bill is
    fetched with fetch_bill() and the returned status dict is logged at
    warning level; when it is missing an error is logged instead.
    Always returns None.
    """
    bill_id = options.get('bill_id', None)
    if not bill_id:
        logging.error("To run this task directly, supply a bill_id.")
        return

    result = fetch_bill(bill_id, options)
    # logging.warn() is a deprecated alias (removed in Python 3.13);
    # logging.warning() is the supported spelling and logs identically.
    logging.warning("\n%s" % result)
# download and cache landing page for bill
# can raise an exception under various conditions
def fetch_bill(bill_id, options):
    """Download, parse and write out a single bill.

    The bill's landing page is downloaded (and cached), parsed either from
    that page alone or from several per-section pages, and written out with
    output_bill(). When ``options["formats"]`` is set, the requested GPO
    renditions ("pdf", "html", or "all" for both) are downloaded as well.

    Returns a status dict with 'ok' and 'saved' keys, a 'reason' key when
    nothing was saved, and one boolean per requested format.
    Can raise an exception under various conditions (the parsers raise on
    pages they cannot understand).
    """
    logging.info("\n[%s] Fetching..." % bill_id)

    # fetch committee name map, if it doesn't already exist
    bill_type, number, congress = utils.split_bill_id(bill_id)
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch bill details body
    body = utils.download(
        bill_url_for(bill_id),
        bill_cache_for(bill_id, "information.html"),
        options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    if reserved_bill(body):
        logging.warning("[%s] Reserved bill, not real, skipping..." % bill_id)
        return {'saved': False, 'ok': True, 'reason': "reserved bill"}

    # conditions where we want to parse the bill from multiple pages instead of one:
    if "</html>" not in body:
        # 1) the all info page is truncated (~5-10 bills a congress)
        #    e.g. s1867-112, hr2112-112, s3240-112
        split_reason = "Main page truncated"
    elif too_many_amendments(body):
        # 2) there are > 150 amendments, use undocumented amendments list (~5-10 bills a congress)
        #    e.g. hr3590-111, sconres13-111, s3240-112
        split_reason = "Too many amendments"
    elif options.get('force_split', False):
        # 3) when I feel like it
        split_reason = "Forcing a split"
    else:
        split_reason = None

    if split_reason:
        logging.info("[%s] %s, fetching many pages..." % (bill_id, split_reason))
        bill = parse_bill_split(bill_id, body, options)
    else:
        # Otherwise, get the bill's data from a single All Information page
        bill = parse_bill(bill_id, body, options)

    output_bill(bill, options)

    # output PDF and/or HTML file if requested
    status = {'ok': True, 'saved': True}
    if not options.get("formats", False):
        return status

    # The lower-cased value is written back into options, as before, because
    # the same dict is handed to get_GPO_url_for_bill() below.
    options["formats"] = options["formats"].lower()
    if options["formats"] == "all":
        formats = ["pdf", "html"]
    else:
        # tolerate "pdf, html" and stray commas
        formats = [fmt.strip() for fmt in options["formats"].split(",") if fmt.strip()]

    gpo_urls = get_GPO_url_for_bill(bill_id, options)
    for fmt in formats:
        if not (gpo_urls and fmt in gpo_urls):
            status[fmt] = False
            continue

        content = utils.download(gpo_urls[fmt], bill_cache_for(bill_id, "bill." + fmt), {'binary': True})
        if not content:
            # same convention as the landing page above: a falsy download is a failure,
            # so don't write it out or report the format as saved
            logging.warning("[%s] Failed to download %s format" % (bill_id, fmt))
            status[fmt] = False
            continue

        utils.write(content, output_for_bill(bill_id, fmt))
        logging.info("Saving %s format for %s" % (fmt, bill_id))
        status[fmt] = True

    return status
def parse_bill(bill_id, body, options):
    """Parse a whole bill out of a single All Information page.

    Each section parser is run over ``body`` in turn and the results are
    handed to process_bill(), whose return value is returned unchanged.
    """
    _bill_type, _number, congress = utils.split_bill_id(bill_id)

    # Tuple elements are evaluated left to right, so the parsers run in the
    # same order as the positional parameters of process_bill().
    parsed = (
        introduced_at_for(body),
        parse_by_request(body),
        sponsor_for(body),
        cosponsors_for(body),
        summary_for(body),
        titles_for(body),
        actions_for(body, bill_id),
        related_bills_for(body, congress, bill_id),
        subjects_for(body),
        committees_for(body, bill_id),
        amendments_for(body, bill_id),
    )
    return process_bill(bill_id, options, *parsed)
# parse information pieced together from various pages
def parse_bill_split(bill_id, body, options):
    """Parse a bill from the All Information page plus per-section pages.

    Introduction date, by-request flag, sponsor and subjects come from
    ``body`` (the page we already have). Every other section is downloaded
    from its own page, unescaped, and parsed. The assembled pieces are passed
    to process_bill(), whose return value is returned unchanged.
    """
    def section_page(page_code, cache_name):
        # download (or read from cache) one section page and unescape it
        raw = utils.download(
            bill_url_for(bill_id, page_code),
            bill_cache_for(bill_id, cache_name),
            options)
        return utils.unescape(raw)

    _bill_type, _number, congress = utils.split_bill_id(bill_id)

    # get some info out of the All Info page, since we already have it
    introduced_at = introduced_at_for(body)
    by_request = parse_by_request(body)
    sponsor = sponsor_for(body)
    subjects = subjects_for(body)

    # everything else: one page per section, fetched and parsed in this order
    cosponsors = cosponsors_for(section_page("P", "cosponsors.html"))
    summary = summary_for(section_page("D", "summary.html"))
    titles = titles_for(section_page("T", "titles.html"))
    actions = actions_for(section_page("X", "actions.html"), bill_id)
    related_bills = related_bills_for(section_page("K", "related_bills.html"), congress, bill_id)
    amendments = amendments_for_standalone(section_page("A", "amendments.html"), bill_id)
    committees = committees_for(section_page("C", "committees.html"), bill_id)

    return process_bill(
        bill_id, options, introduced_at, by_request, sponsor, cosponsors,
        summary, titles, actions, related_bills, subjects, committees, amendments)
# take the initial parsed content, extract more information, assemble output data
def process_bill(bill_id, options,
                 introduced_at, by_request, sponsor, cosponsors,
                 summary, titles, actions, related_bills, subjects, committees, amendments):
    """Assemble the output dict for a bill from its parsed pieces.

    Derives the current official/short/popular titles, runs the actions
    through process_actions(), works out the latest status (defaulting to
    "INTRODUCED" as of ``introduced_at``), and extracts the history and
    slip-law information. ``subjects`` is indexed as (top term, list of
    subjects). ``options`` is accepted but not used here.
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)

    # for convenience: extract out current title of each type
    official_title, short_title, popular_title = [
        current_title_for(titles, kind) for kind in ("official", "short", "popular")
    ]

    # add metadata to each action, establish current status
    processed_actions = process_actions(actions, bill_id, official_title, introduced_at)

    # pull out latest status change and the date of it
    status, status_at = latest_status(processed_actions)
    if not status:
        # default to introduced
        status, status_at = "INTRODUCED", introduced_at

    # pull out some very useful history information from the actions
    history = history_from_actions(processed_actions)
    enacted_as = slip_law_from(processed_actions)

    bill = {
        'bill_id': bill_id,
        'bill_type': bill_type,
        'number': number,
        'congress': congress,

        'introduced_at': introduced_at,
        'by_request': by_request,
        'sponsor': sponsor,
        'cosponsors': cosponsors,

        'actions': processed_actions,
        'history': history,
        'status': status,
        'status_at': status_at,
        'enacted_as': enacted_as,

        'titles': titles,
        'official_title': official_title,
        'short_title': short_title,
        'popular_title': popular_title,

        'summary': summary,
        'subjects_top_term': subjects[0],
        'subjects': subjects[1],

        'related_bills': related_bills,
        'committees': committees,
        'amendments': amendments,

        'updated_at': datetime.datetime.fromtimestamp(time.time()),
    }
    return bill
def output_bill(bill, options):
    """Write a processed bill dict to disk as JSON and as GovTrack-style XML.

    ``bill`` is a dict with the keys read below ('bill_id', 'congress',
    'bill_type', 'number', 'updated_at', 'status', 'status_at', 'titles',
    'sponsor', 'cosponsors', 'actions', 'committees', 'related_bills',
    'subjects', 'subjects_top_term', 'amendments', optionally 'sources',
    'original_bill_number' and 'summary'). Both files are written with
    utils.write() to the paths given by output_for_bill().

    NOTE(review): the nesting in this function was reconstructed from a copy
    of the file whose indentation had been lost; confirm it against the
    original before relying on it.
    """
    logging.info("[%s] Writing to disk..." % bill['bill_id'])
    # output JSON - so easy!
    utils.write(
        json.dumps(bill, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_bill(bill['bill_id'], "json"),
        options=options,
    )
    # output XML
    # maps this project's bill type codes to the codes used in the XML "type" attribute
    govtrack_type_codes = {'hr': 'h', 's': 's', 'hres': 'hr', 'sres': 'sr', 'hjres': 'hj', 'sjres': 'sj', 'hconres': 'hc', 'sconres': 'sc'}
    root = etree.Element("bill")
    root.set("session", bill['congress'])
    root.set("type", govtrack_type_codes[bill['bill_type']])
    root.set("number", bill['number'])
    root.set("updated", utils.format_datetime(bill['updated_at']))

    def make_node(parent, tag, text, **attrs):
        # Thin wrapper around utils.make_node(). When the "govtrack" option is
        # set, attributes with falsy values are dropped and "thomas_id"
        # attributes are rewritten as "id" (see below).
        if options.get("govtrack", False):
            # Rewrite thomas_id attributes as just id with GovTrack person IDs.
            attrs2 = {}
            for k, v in attrs.items():
                if v:
                    if k == "thomas_id":
                        # remap "thomas_id" attributes to govtrack "id"
                        k = "id"
                        v = str(utils.get_govtrack_person_id('thomas', v))
                    attrs2[k] = v
            attrs = attrs2
        return utils.make_node(parent, tag, text, **attrs)
    # for American Memory Century of Lawmaking bills...
    for source in bill.get("sources", []):
        n = make_node(root, "source", "")
        # sorted so the attributes are set in a stable (key) order
        for k, v in sorted(source.items()):
            if k == "source":
                n.text = v
            elif k == "source_url":
                n.set("url", v)
            else:
                # NOTE(review): `unicode` is a Python 2 builtin
                n.set(k, unicode(v))
    if "original_bill_number" in bill:
        make_node(root, "bill-number", bill["original_bill_number"])
    make_node(root, "state", bill['status'], datetime=bill['status_at'])
    old_status = make_node(root, "status", None)
    make_node(old_status, "introduced" if bill['status'] in ("INTRODUCED", "REFERRED") else "unknown", None, datetime=bill['status_at']) # dummy for the sake of comparison
    make_node(root, "introduced", None, datetime=bill['introduced_at'])
    titles = make_node(root, "titles", None)
    for title in bill['titles']:
        n = make_node(titles, "title", title['title'])
        n.set("type", title['type'])
        if title['as']:
            n.set("as", title['as'])
        if title['is_for_portion']:
            n.set("partial", "1")
    if bill['sponsor']:
        # TODO: Sponsored by committee?
        # NOTE(review): sponsor_for() returns committee sponsors without a
        # 'thomas_id' key, so this lookup would raise KeyError for them
        make_node(root, "sponsor", None, thomas_id=bill['sponsor']['thomas_id'])
    else:
        make_node(root, "sponsor", None)
    cosponsors = make_node(root, "cosponsors", None)
    for cosp in bill['cosponsors']:
        n = make_node(cosponsors, "cosponsor", None, thomas_id=cosp["thomas_id"])
        if cosp["sponsored_at"]:
            n.set("joined", cosp["sponsored_at"])
        if cosp["withdrawn_at"]:
            n.set("withdrawn", cosp["withdrawn_at"])
    actions = make_node(root, "actions", None)
    for action in bill['actions']:
        # the listed action types become the element name; anything else is a generic <action>
        a = make_node(actions,
                      action['type'] if action['type'] in ("vote", "vote-aux", "calendar", "topresident", "signed", "enacted", "vetoed") else "action",
                      None,
                      datetime=action['acted_at'])
        if action.get("status"):
            a.set("state", action["status"])
        if action['type'] in ('vote', 'vote-aux'):
            a.clear() # re-insert date between some of these attributes
            a.set("how", action["how"])
            a.set("type", action["vote_type"])
            if action.get("roll") != None:
                a.set("roll", action["roll"])
            a.set("datetime", utils.format_datetime(action['acted_at']))
            a.set("where", action["where"])
            a.set("result", action["result"])
            if action.get("suspension"):
                a.set("suspension", "1")
            # clear() above removed the "state" attribute, so set it again
            if action.get("status"):
                a.set("state", action["status"])
        if action['type'] == 'calendar' and "calendar" in action:
            a.set("calendar", action["calendar"])
            if action["under"]:
                a.set("under", action["under"])
            if action["number"]:
                a.set("number", action["number"])
        if action['type'] == 'enacted':
            a.clear() # re-insert date between some of these attributes
            a.set("number", "%s-%s" % (bill['congress'], action["number"]))
            a.set("type", action["law"])
            a.set("datetime", utils.format_datetime(action['acted_at']))
            if action.get("status"):
                a.set("state", action["status"])
        if action['type'] == 'vetoed':
            if action.get("pocket"):
                a.set("pocket", "1")
        if action.get('text'):
            make_node(a, "text", action['text'])
        if action.get('in_committee'):
            make_node(a, "committee", None, name=action['in_committee'])
        for cr in action['references']:
            make_node(a, "reference", None, ref=cr['reference'], label=cr['type'])
    committees = make_node(root, "committees", None)
    for cmt in bill['committees']:
        # code is committee_id, with subcommittee_id appended when present;
        # the "Subcommittee on " prefix is removed from the subcommittee name
        make_node(committees, "committee", None, code=(cmt["committee_id"] + cmt["subcommittee_id"]) if cmt.get("subcommittee_id", None) else cmt["committee_id"], name=cmt["committee"], subcommittee=cmt.get("subcommittee").replace("Subcommittee on ", "") if cmt.get("subcommittee") else "", activity=", ".join(c.title() for c in cmt["activity"]))
    relatedbills = make_node(root, "relatedbills", None)
    for rb in bill['related_bills']:
        # only related *bills* are written; entries of type "amendment" are skipped
        if rb['type'] == "bill":
            rb_bill_type, rb_number, rb_congress = utils.split_bill_id(rb['bill_id'])
            make_node(relatedbills, "bill", None, session=rb_congress, type=govtrack_type_codes[rb_bill_type], number=rb_number, relation="unknown" if rb['reason'] == "related" else rb['reason'])
    subjects = make_node(root, "subjects", None)
    # the top term is written first, then every other subject
    if bill['subjects_top_term']:
        make_node(subjects, "term", None, name=bill['subjects_top_term'])
    for s in bill['subjects']:
        if s != bill['subjects_top_term']:
            make_node(subjects, "term", None, name=s)
    amendments = make_node(root, "amendments", None)
    for amd in bill['amendments']:
        make_node(amendments, "amendment", None, number=amd["chamber"] + str(amd["number"]))
    if bill.get('summary'):
        # Text is "<m/d/Y date>--<as>.\n<summary text>"; the re.sub removes a
        # leading "0" and the "0" right after each "/" in the formatted date.
        # NOTE(review): summary_for() only sets 'date' when it recognises the
        # date prefix, so bill['summary']['date'] can raise KeyError here.
        make_node(root, "summary", re.sub(r"^0|(/)0", lambda m: m.group(1), datetime.datetime.strftime(datetime.datetime.strptime(bill['summary']['date'], "%Y-%m-%d"), "%m/%d/%Y")) + "--" + bill['summary'].get('as', '?') + ".\n" + bill['summary']['text']) # , date=bill['summary'].get('date'), status=bill['summary'].get('as'))
    utils.write(
        etree.tostring(root, pretty_print=True),
        output_for_bill(bill['bill_id'], "xml"),
        options=options
    )
# This routine is also used by amendment processing. One difference is the
# lack of <b> tags on amendment pages but their presence on bill pages.
# Also, amendments can be sponsored by committees.
def sponsor_for(body):
    """Extract the sponsor from a bill or amendment page.

    Returns None when the page says "No Sponsor", a dict with 'type':
    'person' (title, name, thomas_id, state, district) when the sponsor link
    is followed by a [state-district] tag, or a dict with 'type':
    'committee' (name, committee_id) otherwise.

    Raises Exception when no sponsor line is found or when the matched
    pieces do not have the expected shape.
    """
    match = re.search(r"(?:<b>)?Sponsor: (?:</b>)?(No Sponsor|<a href=[^>]+\+(\d{5}|[hs]...\d\d).*>(.+)</a>(?:\s+\[((\w\w)(-(\d+))?)\])?)", body, re.I)
    if not match:
        raise Exception("Choked finding sponsor information.")

    sponsor_id = match.group(2)
    link_text = match.group(3)
    state_district = match.group(4)

    # The pattern is matched case-insensitively, so the "No Sponsor" check has
    # to be case-insensitive too; otherwise a differently-cased match falls
    # through to the committee branch and crashes on a missing link.
    if match.group(1).lower() == "no sponsor" or (link_text or "").lower() == "no sponsor":
        return None

    if state_district:  # has a state/district, so it's a rep
        parts = state_district.split('-')
        if len(parts) == 2:
            state, district = parts
        else:
            state, district = state_district, None

        if not re.match(r"\d{5}$", sponsor_id):
            raise Exception("Choked parsing sponsor.")

        # zero-pad and apply corrections
        thomas_id = utils.thomas_corrections("%05d" % int(sponsor_id))

        name_match = re.search(r"^(Rep|Sen|Del|Com)\.? (.*?)$", link_text.strip())
        if not name_match:
            # previously surfaced as an AttributeError on None
            raise Exception("Choked parsing sponsor name: %s" % link_text.strip())
        title, name = name_match.groups()

        return {
            'type': 'person',
            'title': title,
            'name': name,
            'thomas_id': thomas_id,
            'state': state,
            'district': district
        }

    # no state/district: it's a committee
    name = link_text.strip()
    if not re.match(r"[hs]...\d\d$", sponsor_id):
        raise Exception("Choked parsing apparent committee sponsor.")
    return {
        'type': 'committee',
        'name': name,
        'committee_id': sponsor_id,
    }
def summary_for(body):
    """Extract the bill summary from a page.

    Returns None when the page explicitly says there is no summary.
    Otherwise returns a dict with 'text' (tags stripped, paragraphs separated
    by blank lines) and, when the leading "m/d/yyyy--Status." line is
    recognised, 'date' (YYYY-MM-DD) and 'as' (the status without its
    trailing period).

    Raises Exception when no summary section can be found at all.
    """
    section = re.search('SUMMARY AS OF:</a></b>(.*?)(?:<hr|<div id="footer">)', body, re.S)
    if section is None:
        if not re.search(r"<b>SUMMARY:</b><p>\*\*\*NONE\*\*\*", body, re.I):
            raise Exception("Choked finding summary.")
        return None  # expected when no summary

    summary = {}
    text = section.group(1).strip()

    # strip out the bold explanation of a new summary, if present
    text = re.sub(r"\s*<p><b>\(This measure.*?</b></p>\s*", "", text)

    # the intro line: date, "--", status, an optional "(There are N other
    # summaries)" note preceded by four non-breaking spaces, then a break
    intro_re = (
        u"(\\d+/\\d+/\\d+)--([^\\s].*?)"
        u"(\u00a0\u00a0\u00a0\u00a0\\(There (is|are) \\d+ <a href=\"[^>]+\">other (summary|summaries)</a>\\))?"
        u"(\n|<p>)"
    )
    intro = re.search(intro_re, text)
    if intro:
        raw_date = intro.group(1)
        if raw_date == "7/11/1794":
            raw_date = "7/11/1974"  # THOMAS error
        summary["date"] = datetime.datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")

        status = intro.group(2)
        summary["as"] = status[:-1] if status.endswith(".") else status

        # strip out the intro date thing
        text = re.sub(intro_re, "", text)

    # Preserve paragraph breaks: closing p tags (and surrounding whitespace)
    # become two newlines, then trailing whitespace is dropped.
    text = re.sub(r"\s*</\s*p\s*>\s*", "\n\n", text).strip()

    # naive stripping of tags, should work okay in this limited context
    text = re.sub("<[^>]+>", "", text)

    # compress and strip whitespace artifacts, except for the paragraph breaks
    summary["text"] = re.sub("[ \t\r\f\v]{2,}", " ", text).strip()
    return summary
def parse_committee_rows(rows, bill_id):
    """Turn the rows of a committee table into a list of committee dicts.

    ``rows`` are HTML fragments, one per table row; fragments containing a
    <table> or </table> tag are skipped. Each remaining row yields a dict
    with 'committee', 'activity' (list of lower-cased strings) and
    'committee_id'; subcommittee rows also carry 'subcommittee' and, when it
    can be looked up, 'subcommittee_id'. ``bill_id`` is only used in log
    messages.

    Looks committee ids up in utils.committee_names, which must already be
    loaded. Raises Exception when a row has no committee name or activity
    cell, and KeyError when a top-level committee name is unknown.
    """
    # counts on having been loaded already
    committee_names = utils.committee_names

    committee_info = []
    top_committee = None
    for row in rows:
        # ignore header/end row that contain no committee information
        if re.search("</?table", row):
            continue

        # identifies and pulls out committee name
        # Can handle committee names with letters, white space, dashes, slashes, parens, periods, apostrophes, and ampersands.
        name_match = re.search(r'(?<=">)[-.\w\s,()\'&/]+(?=</a>)', row)
        if not name_match:
            raise Exception("Couldn't find committee name. Line was: " + row)
        # remove excess internal spacing
        committee = re.sub(r"\s{2,}", " ", name_match.group().strip())

        # identifies and pulls out committee activity, a comma-separated list
        activity_match = re.search(r'(?<=<td width="65%">).*?(?=</td>)', row)
        if not activity_match:
            raise Exception("Couldn't find committee activity.")
        activity = [part.strip() for part in activity_match.group().strip().lower().split(",")]

        # identifies subcommittees by change in table cell width
        if '<td width="5%">' not in row:
            top_committee = committee  # saves committee for the next row in case it is a subcommittee
            committee_info.append({"committee": committee, "activity": activity, "committee_id": committee_names[committee]})
            continue

        if not top_committee:
            # Subcommittees are a little finicky, so don't raise an exception if the subcommittee can't be processed.
            # (logging.warn is a deprecated alias, removed in Python 3.13.)
            logging.warning("[%s] Subcommittee specified without a parent committee: %s" % (bill_id, committee))
            continue

        entry = {"committee": top_committee, "activity": activity, "subcommittee": committee, "committee_id": committee_names[top_committee]}
        committee_info.append(entry)

        # Subcommittees are a little finicky, so don't raise an exception if the subcommittee is not found.
        # Just skip writing the id attribute.
        try:
            entry["subcommittee_id"] = committee_names[committee_names[top_committee] + "|" + committee.replace("Subcommittee on ", "")]
        except KeyError:
            logging.warning("[%s] Subcommittee not found in %s: %s" % (bill_id, committee_names[top_committee], committee))

    return committee_info
def committees_for(body, bill_id):
    """Extract committee and subcommittee assignments from a page.

    Finds the "COMMITTEE(S):" section up to the end of its table. Returns an
    empty list when that section contains "***NONE***" (e.g. hr19-112);
    otherwise splits the table into rows and returns what
    parse_committee_rows() makes of them (which reads utils.committee_names,
    so that map must already be loaded).

    Raises Exception when the committees section cannot be found.
    """
    # grabs entire Committee & Subcommittee table
    match = re.search(r"COMMITTEE\(S\):<.*?<ul>.*?</table>", body, re.I | re.S)
    if not match:
        raise Exception("Couldn't find committees section.")

    committee_text = match.group().strip()

    # returns empty array for bills not assigned to a committee; e.g. bill_id=hr19-112
    if "***NONE***" in committee_text:
        return []

    # splits Committee & Subcommittee table up by table row
    return parse_committee_rows(committee_text.split("</tr>"), bill_id)
def titles_for(body):
    """Extract every title listed in the "TITLE(S):" section of a page.

    Returns a list of dicts, in page order, each with 'title', 'type'
    ("popular", "short" or "official"), 'as' (lower-cased stage such as
    "introduced", or None when the heading has no " AS " part) and
    'is_for_portion' (True for titles marked with <I>, i.e. titles for a
    portion of the bill). The list may be empty.

    Raises Exception when the titles section is missing or a heading is of an
    unknown type.
    """
    match = re.search(r'TITLE\(S\):<.*?<ul>.*?<p><li>(.*?)(?:<hr|<div id="footer">)', body, re.I | re.S)
    if not match:
        raise Exception("Couldn't find titles section.")

    # Whitespace runs, including non-breaking spaces. Written as an escaped
    # u"" literal (same pattern text as the old ur"" literal, which is a
    # syntax error on Python 3).
    whitespace_run = re.compile(u"[\\s\u00a0]+")

    titles = []
    for section in match.group(1).strip().split("<p><li>"):
        if section.strip() == "":
            continue

        # move the <I> that indicates subsequent titles are for a portion of the bill
        # to after the <br> that follows it so that it's associated with the right title.
        section = re.sub("<I><br ?/>", "<br/><I>", section)

        # ensure single newlines between each title in the section
        section = re.sub("\n?<br ?/>", "\n", section)

        # first line is the heading ("SHORT TITLE(S) AS INTRODUCED:"), the rest are titles
        pieces = section.split("\n")
        full_type, type_titles = pieces[0], pieces[1:]

        if " AS " in full_type:
            heading, state = full_type.split(" AS ")
            state = state.replace(":", "").lower()
        else:
            heading, state = full_type, None

        if "POPULAR TITLE" in heading:
            title_type = "popular"
        elif "SHORT TITLE" in heading:
            title_type = "short"
        elif "OFFICIAL TITLE" in heading:
            title_type = "official"
        else:
            raise Exception("Unknown title type: " + heading)

        is_for_portion = False
        for title in type_titles:
            if title.startswith("<I>"):
                # This and subsequent titles in this piece are all for a portion of the bill.
                # The <I> tag will be removed below.
                is_for_portion = True

            # Strip, remove tabs, and replace whitespace and nonbreaking spaces with spaces,
            # since occasionally (e.g. s649-113) random \r's etc. appear instead of spaces.
            title = re.sub("<[^>]+>", "", title)  # strip tags
            title = whitespace_run.sub(" ", title.strip())  # strip space and normalize spaces

            if title == "":
                continue

            if title_type == "popular":
                title = re.sub(r" \(identified.+?$", "", title)

            titles.append({
                'title': title,
                'is_for_portion': is_for_portion,
                'as': state,
                'type': title_type,
            })

    # NOTE(review): a "No titles found." check used to sit after this return
    # and could never run; it is removed rather than enabled so that callers
    # keep receiving an empty list, as they always have. This assumes the
    # return sat at function level in the original (indentation was lost).
    return titles
# the most current title of a given type is the first one in the last 'as' subgroup
# of the titles for the whole bill (that is, if there's no title for the whole bill
# in the last 'as' subgroup, use the previous 'as' subgroup and so on) --- we think
# this logic matches THOMAS/Congress.gov.
def current_title_for(titles, type):
    """Return the current title of the given type, or None if there is none.

    Only whole-bill titles of the requested type are considered. Each time
    the 'as' value changes from the one last picked, the first title of that
    run replaces the pick, so the result is the first title of the last such
    run.
    """
    picked_title = None
    picked_as = -1  # not None, cause for popular titles, None is a valid 'as'

    for entry in titles:
        wanted = entry['type'] == type and not (entry['is_for_portion'] == True)
        if wanted and entry['as'] != picked_as:
            # right type, new 'as', store first one
            picked_title, picked_as = entry['title'], entry['as']

    return picked_title
def actions_for(body, bill_id, is_amendment=False):
    """Extract the list of actions from a bill (or amendment) page.

    For bills the ">ALL ACTIONS:<" section is parsed; with
    ``is_amendment=True`` (this function is also used by amendment_info.py)
    the ">STATUS:<" section is parsed instead.

    Returns a list of action dicts sorted by time, each with 'text', 'type'
    ("action"), 'acted_at' and 'references'. 'acted_at' is a datetime when
    the page gave a time of day and a "YYYY-MM-DD" string otherwise. Indented
    (committee / subcommittee) actions additionally carry
    'committee_action_ref' / 'subcommittee_action_ref' pointing at the dicts
    of the enclosing actions. Returns None for an amendment page without a
    status section, and [] for a bill whose action section says
    "***NONE***".

    Raises Exception when the bill's action section is missing, an action
    line cannot be parsed, or the indentation level leaves the range 0-2.
    """
    # cmp_to_key keeps the comparator-based ordering working on Python 3,
    # where list.sort() no longer accepts a comparison function.
    import functools

    if not is_amendment:
        match = re.search(r'>ALL ACTIONS:<.*?<dl>(.*?)(?:<hr|<div id="footer">)', body, re.I | re.S)
    else:
        # This function is also used by amendment_info.py.
        match = re.search(r'>STATUS:<.*?<dl>(.*?)(?:<hr|<div id="footer">)', body, re.I | re.S)

        # The Status section is optional for amendments.
        if not match:
            return None

    if not match:
        if re.search(r"ALL ACTIONS:((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
            return []  # no actions, can happen for bills reserved for the Speaker
        raise Exception("Couldn't find action section.")

    actions = []
    indentation_level = 0
    last_top_level_action = None
    last_committee_level_action = None

    for piece in match.group(1).strip().split("\n"):
        # only lines carrying a bolded timestamp are actions
        if re.search("<strong>", piece) is None:
            continue

        action_pieces = re.search("((?:</?dl>)*)<dt><strong>(.*?):</strong><dd>(.+?)$", piece)
        if not action_pieces:
            raise Exception("Choked on parsing an action: %s" % piece)

        indentation_changes, timestamp, action_text = action_pieces.groups()

        # indentation indicates a committee action, track the indentation level
        for indentation_change in re.findall("</?dl>", indentation_changes):
            if indentation_change == "<dl>":
                indentation_level += 1
            if indentation_change == "</dl>":
                indentation_level -= 1
        # NOTE(review): assumed to be checked once per action, after all of
        # the line's <dl>/</dl> tags are applied (indentation was lost).
        if indentation_level < 0 or indentation_level > 2:
            raise Exception("Action indentation level %d out of bounds." % indentation_level)

        # timestamp of the action: a datetime when a time of day is given,
        # otherwise a plain "YYYY-MM-DD" string
        if re.search("(am|pm)", timestamp):
            action_time = datetime.datetime.strptime(timestamp, "%m/%d/%Y %I:%M%p")
        else:
            action_time = datetime.datetime.strptime(timestamp, "%m/%d/%Y")
            action_time = datetime.datetime.strftime(action_time, "%Y-%m-%d")

        cleaned_text, references = action_for(action_text)

        action = {
            'text': cleaned_text,
            'type': "action",
            'acted_at': action_time,
            'references': references
        }
        actions.append(action)

        # Associate subcommittee actions with the parent committee by including
        # a reference to the last top-level action line's dict, since we haven't
        # yet parsed which committee it is in. Likewise for 2nd-level indentation
        # to the top-level and 1st-level indentation actions. In some cases,
        # 2nd-level indentation occurs without any preceding 1st-level indentation.
        if indentation_level == 0:
            last_top_level_action = action
            last_committee_level_action = None
        elif indentation_level == 1:
            if last_top_level_action:
                action["committee_action_ref"] = last_top_level_action
            else:
                logging.info("[%s] Committee-level action without a preceding top-level action." % bill_id)
            last_committee_level_action = action
        elif indentation_level == 2:
            if last_top_level_action:
                action["committee_action_ref"] = last_top_level_action
                if last_committee_level_action:
                    action["subcommittee_action_ref"] = last_committee_level_action
                else:
                    logging.info("[%s] Sub-committee-level action without a preceding committee-level action." % bill_id)
            else:
                logging.info("[%s] Sub-committee-level action without a preceding top-level action." % bill_id)

    # THOMAS has a funny way of outputting actions. It is sorted by date,
    # except that committee events are grouped together. Once we identify
    # the committees related to events, we should sort the events properly
    # in time order. But (of course there's a but) not all dates have times,
    # meaning we will come to having to compare equal dates and dates with
    # times on those dates. In those cases, preserve the original order
    # of the events as shown on THOMAS.
    #
    # Note that we do this *before* process actions, since we must get
    # this in chronological order before running our status finite state machine.
    def action_comparer(a, b):
        a = a["acted_at"]
        b = b["acted_at"]
        if isinstance(a, str) or isinstance(b, str):
            # If either is a plain date without time, compare them only on the
            # basis of the date parts, meaning the unspecified time is treated
            # as unknown, rather than treated as midnight.
            if isinstance(a, datetime.datetime):
                a = datetime.datetime.strftime(a, "%Y-%m-%d")
            if isinstance(b, datetime.datetime):
                b = datetime.datetime.strftime(b, "%Y-%m-%d")
        # Otherwise both are date+time's and compare normally.
        return (a > b) - (a < b)  # portable spelling of cmp(a, b)

    # .sort() is stable, so original order is preserved where the comparer returns 0
    actions.sort(key=functools.cmp_to_key(action_comparer))

    return actions
# clean text, pull out the action type, any other associated metadata with an action
def action_for(text):
    """Clean one action's text and pull out its trailing references.

    Anchor tags are removed. If the text ends with a parenthesised group,
    that group is cut off and split into references, each a dict with
    'type' (the part before ": ", or None) and 'reference'.

    Returns a (cleaned_text, references) tuple.
    """
    # strip out links
    text = re.sub(r"</?[Aa]( \S.*?)?>", "", text)

    # remove and extract references
    trailing = re.search(r"\s+\(([^)]+)\)\s*$", text)
    if not trailing:
        return text, []

    # remove the matched section
    text = text[:trailing.start()] + text[trailing.end():]
    cited = trailing.group(1)

    # fix use of comma or colon instead of a semi colon between reference types
    # have seen some accidental capitalization combined with accidental comma, thus the 'T'
    # e.g. "text of Title VII as reported in House: CR H3075-3077, Text omission from Title VII:" (hr5384-109)
    cited = re.sub("[,:] ([a-zT])", r"; \1", cited)

    # fix "CR:"
    cited = cited.replace("CR:", "CR")

    # fix a missing semicolon altogether between references
    # e.g. sres107-112, "consideration: CR S1877-1878 text as"
    cited = re.sub(r"(\d+) +([a-z])", r"\1; \2", cited)

    references = []
    for item in re.split("; ?", cited):
        if ": " in item:
            kind, citation = item.split(": ", 1)
        else:
            kind, citation = None, item
        references.append({'type': kind, 'reference': citation})

    return text, references
def introduced_at_for(body):
    """Return the bill's introduction date as a "YYYY-MM-DD" string.

    The date is read from the page's <meta name="dc.date"> tag; when several
    are present the last one wins. Raises Exception when none is found (or
    its content is empty), and strptime's ValueError when the content is not
    in YYYY-MM-DD form.
    """
    dates = [
        meta.get('content')
        for meta in fromstring(body).cssselect('meta')
        if meta.get('name') == 'dc.date'
    ]
    introduced_at = dates[-1] if dates else None

    if not introduced_at:
        raise Exception("Couldn't find an introduction date in the meta tags.")

    # maybe silly to parse and re-serialize, but it makes explicit the format we publish dates in
    return datetime.datetime.strptime(introduced_at, "%Y-%m-%d").strftime("%Y-%m-%d")
def parse_by_request(body):
    """
    Check whether the bill was introduced by request.

    Joins the text nodes that follow the <b>Sponsor: </b> element, up to the
    first <br />, and returns True when the result contains "by request".
    Raises IndexError when the page has no such <b> element.
    """
    sponsor_label = fromstring(body).xpath('//b[normalize-space(text()) = "Sponsor:"]')[0]

    # Collect the text nodes in the range <b>Sponsor: </b> .... <br />
    fragments = []
    for sibling in sponsor_label.xpath('.//following-sibling::node()'):
        if isinstance(sibling, HtmlElement) and sibling.tag == 'br':
            break
        if isinstance(sibling, unicode):
            fragments.append(unicode(sibling))

    return u'by request' in u' '.join(fragments)
def cosponsors_for(body):
    """Extract the list of cosponsors from a page.

    Returns one dict per cosponsor line with 'thomas_id', 'title', 'name',
    'state', 'district' (None when the bracketed tag has no "-" part),
    'sponsored_at' and 'withdrawn_at' (YYYY-MM-DD strings; 'withdrawn_at' is
    None unless a withdrawal date is shown). Returns [] when the page says
    there are none.

    Raises Exception when the section is missing or a line cannot be parsed.
    """
    section = re.search(r'COSPONSORS\((\d+)\).*?<p>(?:</br>)?(.*?)(?:</br>)?(?:<hr|<div id="footer">)', body, re.S)
    if not section:
        if re.search(r"COSPONSOR\(S\):</b></a><p>\*\*\*NONE\*\*\*", body):
            return []  # no cosponsors, it happens, nothing to be ashamed of
        raise Exception("Choked finding cosponsors section.")

    def iso_date(us_date):
        # m/d/yyyy -> yyyy-mm-dd
        return datetime.datetime.strptime(us_date, "%m/%d/%Y").strftime("%Y-%m-%d")

    # fix some bad line breaks
    listing = section.group(2).replace("</br>", "<br/>")

    cosponsors = []
    for line in re.split("<br ?/>", listing):
        # can happen on stand-alone cosponsor pages
        if line.strip() == "</div>":
            continue

        found = re.search(r"<a href=[^>]+(\d{5}).*>(Rep|Sen) (.+?)</a> \[([A-Z\d\-]+)\]\s*- (\d\d?/\d\d?/\d\d\d\d)(?:\(withdrawn - (\d\d?/\d\d?/\d\d\d\d)\))?", line, re.I)
        if not found:
            raise Exception("Choked scanning cosponsor line: %s" % line)
        raw_id, title, name, district, joined, withdrawn = found.groups()

        # zero-pad thomas ID and apply corrections
        thomas_id = utils.thomas_corrections("%05d" % int(raw_id))

        district_parts = district.split('-')
        if len(district_parts) == 2:
            state, district_number = district_parts
        else:
            state, district_number = district, None

        cosponsors.append({
            'thomas_id': thomas_id,
            'title': title,
            'name': name,
            'state': state,
            'district': district_number,
            'sponsored_at': iso_date(joined),
            'withdrawn_at': iso_date(withdrawn) if withdrawn else withdrawn
        })

    return cosponsors
def subjects_for(body):
    """Return (top_term, subjects) read from the page's dc.subject meta tags.

    ``subjects`` is the sorted list of every <meta name="dc.subject">
    content value. ``top_term`` is the first non-empty one in page order
    (None when there are no such tags).
    """
    terms = [
        meta.get('content')
        for meta in fromstring(body).cssselect('meta')
        if meta.get('name') == 'dc.subject'
    ]

    # keep replacing the top term until a truthy one has been seen
    top_term = None
    for term in terms:
        if top_term:
            break
        top_term = term

    return top_term, sorted(terms)
def related_bills_for(body, congress, bill_id):
    """Extract the related bills (and amendments) listed on a page.

    Returns a list of dicts, each with 'type' ("bill" or "amendment"), the
    matching 'bill_id' or 'amendment_id' (built from the linked code plus
    ``congress``), and 'reason', a short code derived from the relationship
    text ("unknown" when the text is not recognised, which is also logged).
    Returns [] when the section says "***NONE***". ``bill_id`` is only used
    in the log message.

    Raises Exception when the section is missing or a row cannot be parsed.
    """
    match = re.search(r'RELATED BILL DETAILS.*?<p>.*?<table border="0">(.*?)(?:<hr|<div id="footer">)', body, re.S)
    if not match:
        if re.search(r"RELATED BILL DETAILS:((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
            return []
        raise Exception("Couldn't find related bills section.")

    # (relationship text pattern, reason code); first match wins
    reasons = (
        ("Identical bill identified by (CRS|House|Senate)", "identical"),
        ("Companion bill", "identical"),
        ("Related bill (as )?identified by (CRS|the House Clerk's office|House committee|Senate)", "related"),
        ("passed in (House|Senate) in lieu of .*", "supersedes"),
        ("Rule related to .* in (House|Senate)", "rule"),
        ("This bill has text inserted from .*", "includes"),
        ("Text from this bill was inserted in .*", "included-in"),
        ("Bill related to rule .* in House", "ruled-by"),
        ("This bill caused other related action on .*", "caused-action"),
        ("Other related action happened to this bill because of .*", "action-caused-by"),
        ("Bill that causes .* to be laid on table in House", "caused-action"),
        ("Bill laid on table by virtue of .* passage in House", "action-caused-by"),
        ("Bill that caused the virtual passage of .* in House", "caused-action"),
        # NOTE(review): "caused-action-by" differs from the "action-caused-by"
        # code used above; left as-is because consumers may depend on it.
        ("Bill passed by virtue of .* passage in House", "caused-action-by"),
        ("Bill on wich enrollment has been corrected by virtue of .* passage in House", "caused-action"),
    )
    # built once per call instead of once per table row
    reason_matchers = [(re.compile(pattern + "$", re.I), code) for pattern, code in reasons]

    related_bills = []
    for line in re.split("<tr><td", match.group(1).strip()):
        # skip blanks and the header row
        if (line.strip() == "") or ("Bill:" in line):
            continue

        m = re.search("<a[^>]+>(.+?)</a>.*?<td>(.+?)</td>", line)
        if not m:
            raise Exception("Choked processing related bill line.")

        bill_code, reason_text = m.groups()

        # e.g. "H.R. 1234" in the 112th congress -> "hr1234-112"
        related_id = "%s-%s" % (bill_code.lower().replace(".", "").replace(" ", ""), congress)

        if "amdt" in related_id:
            details = {"type": "amendment", "amendment_id": related_id}
        else:
            details = {"type": "bill", "bill_id": related_id}

        for matcher, reason_code in reason_matchers:
            if matcher.search(reason_text):
                reason = reason_code
                break
        else:
            logging.error("[%s] Unknown bill relation with %s: %s" % (bill_id, related_id, reason_text.strip()))
            reason = "unknown"

        details['reason'] = reason
        related_bills.append(details)

    return related_bills
# get the public or private law number from any enacted action
def slip_law_from(actions):
    """Return the slip law described by the first "enacted" action.

    actions: iterable of processed action dicts. Every dict has a "type"
        key; "enacted" ones also carry "law", "congress" and "number".

    Returns a dict with the keys 'law_type', 'congress' and 'number', or
    None when no action has type "enacted".
    """
    enacted = next((entry for entry in actions if entry["type"] == "enacted"), None)
    if enacted is None:
        return None

    return {
        'law_type': enacted["law"],
        'congress': enacted["congress"],
        'number': enacted["number"],
    }
# given the parsed list of actions from actions_for, run each action
# through metadata extraction and figure out what current status the bill is in
def process_actions(actions, bill_id, title, introduced_date):
    """Enrich each raw action with parsed metadata and track bill status.

    actions: list of raw action dicts; each is updated in place with the
        attributes returned by parse_bill_action.
    bill_id, title: passed through to parse_bill_action.
    introduced_date: accepted for interface compatibility; it does not
        influence the result.

    Returns the list of actions that parse_bill_action did not opt out of,
    in their original order.
    """
    current_status = "INTRODUCED"  # every bill is at least introduced
    kept = []

    for action in actions:
        parsed, changed_status = parse_bill_action(action, current_status, bill_id, title)

        # only change/reflect status change if there was one
        if changed_status:
            parsed['status'] = changed_status
            current_status = changed_status

        # an action can opt-out of inclusion altogether
        if not parsed:
            continue

        action.update(parsed)
        kept.append(action)

        # flatten the references to parent (sub)committee actions into plain
        # "in_committee"/"in_subcommittee" values, then drop the references
        if "subcommittee_action_ref" in action:
            action["in_committee"] = action["committee_action_ref"].get("committee", None)
            action["in_subcommittee"] = action["subcommittee_action_ref"].get("subcommittee", None)
            for ref_key in ("subcommittee_action_ref", "committee_action_ref"):
                del action[ref_key]
        elif "committee_action_ref" in action:
            action["in_committee"] = action["committee_action_ref"].get("committee", None)
            del action["committee_action_ref"]

    return kept
# find the latest status change in a set of processed actions
def latest_status(actions):
    """Return (status, acted_at) of the last action that carries a status.

    actions: iterable of processed action dicts; those that changed the
        bill's status have a truthy 'status' and an 'acted_at' value.

    Returns (None, None) when no action carries a status.
    """
    changes = [
        (entry['status'], entry['acted_at'])
        for entry in actions
        if entry.get('status', None)
    ]
    if not changes:
        return None, None
    return changes[-1]
# look at the final set of processed actions and pull out the major historical events
def history_from_actions(actions):
    """Summarize the major historical events found in processed actions.

    actions: list of processed action dicts. Each has 'type' and
        'acted_at'; vote actions also have 'where', 'vote_type' and
        'result'.

    Returns a dict that always contains the booleans 'active', 'vetoed',
    'enacted' and 'awaiting_signature'. When the matching event exists it
    also contains 'active_at', 'vetoed_at', 'enacted_at',
    'awaiting_signature_since', and '<event>_result' /
    '<event>_result_at' pairs for house_passage, senate_passage,
    senate_cloture, house_override and senate_override. When an event
    occurs more than once, the last occurrence wins.
    """
    def last_matching(predicate):
        # the last action for which predicate(action) is true, or None
        found = None
        for candidate in actions:
            if predicate(candidate):
                found = candidate
        return found

    def record_vote(prefix, vote):
        # copy a vote's result and date into the history under the given prefix
        if vote:
            history[prefix + '_result'] = vote['result']
            history[prefix + '_result_at'] = vote['acted_at']

    history = {}

    activation = activation_from(actions)
    if activation:
        history['active'] = True
        history['active_at'] = activation['acted_at']
    else:
        history['active'] = False

    record_vote('house_passage', last_matching(
        lambda a: (a['type'] == 'vote') and (a['where'] == 'h') and (a['vote_type'] != "override")))

    record_vote('senate_passage', last_matching(
        lambda a: (a['type'] == 'vote') and (a['where'] == 's') and (a['vote_type'] != "override")))

    record_vote('senate_cloture', last_matching(
        lambda a: (a['type'] == 'vote-aux') and (a['vote_type'] == 'cloture') and (a['where'] == 's')))

    vetoed = last_matching(lambda a: a['type'] == 'vetoed')
    if vetoed:
        history['vetoed'] = True
        history['vetoed_at'] = vetoed['acted_at']
    else:
        history['vetoed'] = False

    record_vote('house_override', last_matching(
        lambda a: (a['type'] == 'vote') and (a['where'] == 'h') and (a['vote_type'] == "override")))

    record_vote('senate_override', last_matching(
        lambda a: (a['type'] == 'vote') and (a['where'] == 's') and (a['vote_type'] == "override")))

    enacted = last_matching(lambda a: a['type'] == 'enacted')
    if enacted:
        history['enacted'] = True
        # date of the enacting action itself, not of whichever action
        # happens to be last in the list
        history['enacted_at'] = enacted['acted_at']
    else:
        history['enacted'] = False

    topresident = last_matching(lambda a: a['type'] == 'topresident')
    if topresident and (not history['vetoed']) and (not history['enacted']):
        history['awaiting_signature'] = True
        # date the bill went to the president, not the date of the last action
        history['awaiting_signature_since'] = topresident['acted_at']
    else:
        history['awaiting_signature'] = False

    return history
# find the first action beyond the standard actions every bill gets.
# - if the bill's first action is "referral", then the first action that is not
#   one of those (most common)
#     e.g. hr3590-111 (active), s1-113 (inactive)
# - if the bill's first action is "action", then the next action, if one is
#   present (resolutions)
#     e.g. sres5-113 (active), sres4-113 (inactive)
# - if the bill's first action is anything else (e.g. "vote"), then that first
#   action (bills that skip committee)
#     e.g. s227-113 (active)
def activation_from(actions):
    """Return the action that made the bill "active", or None.

    actions: list of processed action dicts with 'type' and 'text' keys.

    When the first action's type is something other than "referral",
    "calendar" or "action", the first action itself is returned. Otherwise
    the result is the first later action that is neither a referral, a
    calendar placement, nor sponsor introductory remarks; None if there is
    no such action or the list is empty.
    """
    # there's NOT always at least one :(
    # as of 2013-06-10, hr2272-113 has no actions at all
    if len(actions) == 0:
        return None

    opening = actions[0]
    if opening['type'] not in ("referral", "calendar", "action"):
        return opening

    for later in actions[1:]:
        if later['type'] in ("referral", "calendar"):
            continue
        if "Sponsor introductory remarks" in later['text']:
            continue
        return later

    return None
def parse_bill_action(action_dict, prev_status, bill_id, title):
    """Parse a THOMAS bill action line.

    Returns attributes to be set in the XML file on the action line.

    action_dict: the raw action. Its 'text' is what gets parsed; an optional
        'committee_action_ref' (with a 'committees' list) is consulted when
        guessing which chamber a mentioned committee belongs to.
    prev_status: the bill's status before this action.
    bill_id: the bill's id, e.g. "hr3590-111".
    title: the bill's title, passed through to new_status_after_vote.

    Returns a tuple (action, status):
      - action is a dict of parsed attributes (always with a "type"), or
        None when the line is about an amendment rather than the bill;
      - status is the bill's new status if this action changed it,
        otherwise None.

    Raises Exception when a "Public/Private Law" line is seen for a bill
    that was neither signed nor vetoed and is not in the hard-coded list of
    10-day-rule bills.
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)
    if not utils.committee_names:
        utils.fetch_committee_names(congress, {})

    line = action_dict['text']
    status = None
    action = {
        "type": "action"
    }

    # If a line starts with an amendment number, this action is on the amendment and cannot
    # be parsed yet.
    m = re.search(r"^(H|S)\.Amdt\.(\d+)", line, re.I)
    if m is not None:
        # Process actions specific to amendments separately.
        return None, None

    # Otherwise, parse the action line for key actions.

    # VOTES

    # A House Vote.
    line = re.sub(", the Passed", ", Passed", line)  # 106 h4733 and others
    m = re.search("("
                  + "|".join([
                      "On passage",
                      "Passed House",
                      "Two-thirds of the Members present having voted in the affirmative the bill is passed,?",
                      "On motion to suspend the rules and pass the (?:bill|resolution)",
                      "On agreeing to the (?:resolution|conference report)",
                      "On motion to suspend the rules and agree to the (?:resolution|conference report)",
                      "House Agreed to Senate Amendments.*?",
                      "On motion that the House (?:suspend the rules and )?(?:agree(?: with an amendment)? to|concur in) the Senate amendments?(?: to the House amendments?| to the Senate amendments?)*",
                  ])
                  + ")"
                  + r"(, the objections of the President to the contrary notwithstanding.?)?"
                  + r"(, as amended| \(Amended\))?"
                  + r" (Passed|Failed|Agreed to|Rejected)?"
                  + r" ?(by voice vote|without objection|by (the Yeas and Nays|Yea-Nay Vote|recorded vote)"
                  + r"((:)? \(2/3 required\))?: \d+ - \d+(, \d+ Present)? [ \)]*\((Roll no\.|Record Vote No:) \d+\))",
                  line, re.I)
    if m is not None:
        motion, is_override, as_amended, pass_fail, how = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5)

        # NOTE: pass_fail is an optional group; the third branch assumes it
        # matched whenever the first two branches do not apply.
        if re.search(r"Passed House|House Agreed to", motion, re.I):
            pass_fail = 'pass'
        elif re.search("(ayes|yeas) had prevailed", line, re.I):
            pass_fail = 'pass'
        elif re.search(r"Pass|Agreed", pass_fail, re.I):
            pass_fail = 'pass'
        else:
            pass_fail = 'fail'

        if "Two-thirds of the Members present" in motion:
            is_override = True

        if is_override:
            vote_type = "override"
        elif re.search(r"(agree (with an amendment )?to|concur in) the Senate amendment", line, re.I):
            vote_type = "pingpong"
        elif re.search("conference report", line, re.I):
            vote_type = "conference"
        elif bill_type[0] == "h":
            vote_type = "vote"
        else:
            vote_type = "vote2"

        roll = None
        m = re.search(r"\((Roll no\.|Record Vote No:) (\d+)\)", how, re.I)
        if m is not None:
            how = "roll"  # normalize the ugly how
            roll = m.group(2)

        suspension = None
        if roll and "On motion to suspend the rules" in motion:
            suspension = True

        # alternate form of as amended, e.g. hr3979-113
        if "that the House agree with an amendment" in motion:
            as_amended = True

        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = how
        action['where'] = "h"
        action['result'] = pass_fail
        if roll:
            action["roll"] = roll
        action["suspension"] = suspension

        # get the new status of the bill after this vote
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", "h", bill_type, suspension, as_amended, title, prev_status)
        if new_status:
            status = new_status

    # Passed House, not necessarily by an actual vote (think "deem")
    m = re.search(r"Passed House pursuant to", line, re.I)
    if m is not None:
        vote_type = "vote" if (bill_type[0] == "h") else "vote2"
        pass_fail = "pass"

        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = "by special rule"
        action["where"] = "h"
        action["result"] = pass_fail

        # get the new status of the bill after this vote
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", "h", bill_type, False, False, title, prev_status)
        if new_status:
            status = new_status

    # A Senate Vote
    # (There are some annoying weird cases of double spaces, which are
    # collapsed to single spaces on the text handed to the regex below.)
    m = re.search("("
                  + "|".join([
                      "Passed Senate",
                      "Failed of passage in Senate",
                      "Disagreed to in Senate",
                      "Resolution agreed to in Senate",
                      "Senate (?:agreed to|concurred in) (?:the )?(?:conference report|House amendment(?: to the Senate amendments?| to the House amendments?)*)",
                      r"Cloture \S*\s?on the motion to proceed .*?not invoked in Senate",
                      r"Cloture(?: motion)? on the motion to proceed to the (?:bill|measure) invoked in Senate",
                      "Cloture invoked in Senate",
                      "Cloture on (?:the motion to proceed to )?the bill (?:not )?invoked in Senate",
                      "(?:Introduced|Received|Submitted) in the Senate, (?:read twice, |considered, |read the third time, )+and (?:passed|agreed to)",
                  ])
                  + ")"
                  + "(,?.*,?) "
                  + r"(without objection|by Unanimous Consent|by Voice Vote|(?:by )?Yea-Nay( Vote)?\. \d+\s*-\s*\d+\. Record Vote (No|Number): \d+)",
                  line.replace("  ", " "), re.I)
    if m is not None:
        motion, extra, how = m.group(1), m.group(2), m.group(3)
        roll = None

        # put disagreed check first, cause "agreed" is contained inside it
        if re.search("disagreed", motion, re.I):
            pass_fail = "fail"
        elif re.search("passed|agreed|concurred|bill invoked|measure invoked|cloture invoked", motion, re.I):
            pass_fail = "pass"
        else:
            pass_fail = "fail"

        voteaction_type = "vote"
        if re.search("over veto", extra, re.I):
            vote_type = "override"
        elif re.search("conference report", motion, re.I):
            vote_type = "conference"
        elif re.search("cloture", motion, re.I):
            vote_type = "cloture"
            voteaction_type = "vote-aux"  # because it is not a vote on passage
        elif re.search("Senate agreed to (the )?House amendment|Senate concurred in (the )?House amendment", motion, re.I):
            vote_type = "pingpong"
        elif bill_type[0] == "s":
            vote_type = "vote"
        else:
            vote_type = "vote2"

        m = re.search(r"Record Vote (No|Number): (\d+)", how, re.I)
        if m is not None:
            roll = m.group(2)
            how = "roll"

        as_amended = False
        if re.search(r"with amendments|with an amendment", extra, re.I):
            as_amended = True

        action["type"] = voteaction_type
        action["vote_type"] = vote_type
        action["how"] = how
        action["result"] = pass_fail
        action["where"] = "s"
        if roll:
            action["roll"] = roll

        # get the new status of the bill after this vote
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", "s", bill_type, False, as_amended, title, prev_status)
        if new_status:
            status = new_status

    # OLD-STYLE VOTES (93rd Congress-ish)
    m = re.search(r"Measure passed (House|Senate)(, amended(?: \(.*?\)|, with an amendment to the title)?)?(?:,? in lieu[^,]*)?(?:, roll call #(\d+) \(\d+-\d+\))?", line, re.I)
    if m is not None:
        chamber = m.group(1)[0].lower()  # 'h' or 's'
        as_amended = m.group(2)
        roll_num = m.group(3)
        # GovTrack legacy scraper missed these: if chamber == 's' and (as_amended or roll_num or "lieu" in line): return action, status

        pass_fail = "pass"
        vote_type = "vote" if bill_type[0] == chamber else "vote2"

        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = "(method not recorded)" if not roll_num else "roll"
        if roll_num:
            action["roll"] = roll_num
        action["result"] = pass_fail
        action["where"] = chamber

        new_status = new_status_after_vote(vote_type, pass_fail == "pass", chamber, bill_type, False, as_amended, title, prev_status)
        if new_status:
            status = new_status

    m = re.search(r"(House|Senate) agreed to (?:House|Senate) amendments?( with an amendment)?( under Suspension of the Rules)?(?:, roll call #(\d+) \(\d+-\d+\))?\.", line, re.I)
    if m is not None:
        chamber = m.group(1)[0].lower()  # 'h' or 's'
        as_amended = m.group(2)
        suspension = m.group(3)
        roll_num = m.group(4)
        # GovTrack legacy scraper missed these: if (chamber == 'h' and not roll_num) or (chamber == 's' and rull_num): return action, status # REMOVE ME

        pass_fail = "pass"
        vote_type = "pingpong"

        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = "(method not recorded)" if not roll_num else "roll"
        if roll_num:
            action["roll"] = roll_num
        action["result"] = pass_fail
        action["where"] = chamber
        action["suspension"] = (suspension is not None)

        new_status = new_status_after_vote(vote_type, pass_fail == "pass", chamber, bill_type, False, as_amended, title, prev_status)
        if new_status:
            status = new_status

    # PSEUDO-REPORTING (because GovTrack did this, but should be changed)
    # TODO: Make a new status for this as pre-reported.
    m = re.search(r"Placed on (the )?([\w ]+) Calendar( under ([\w ]+))?[,\.] Calendar No\. (\d+)\.|Committee Agreed to Seek Consideration Under Suspension of the Rules|Ordered to be Reported", line, re.I)
    if m is not None:
        # TODO: This makes no sense.
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"

        action["type"] = "calendar"

        # TODO: Useless. But good for GovTrack compatibility.
        if m.group(2):  # not 'Ordered to be Reported'
            action["calendar"] = m.group(2)
            action["under"] = m.group(4)
            action["number"] = m.group(5)

    # COMMITTEE ACTIONS

    # reported
    m = re.search(r"Committee on (.*)\. Reported by", line, re.I)
    if m is not None:
        action["type"] = "reported"
        action["committee"] = m.group(1)
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"

    m = re.search(r"Reported to Senate from the (.*?)( \(without written report\))?\.", line, re.I)
    if m is not None:  # 93rd Congress
        action["type"] = "reported"
        action["committee"] = m.group(1)
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"

    # hearings held by a committee
    m = re.search(r"(Committee on .*?)\. Hearings held", line, re.I)
    if m is not None:
        action["committee"] = m.group(1)
        action["type"] = "hearings"

    m = re.search(r"Committee on (.*)\. Discharged (by Unanimous Consent)?", line, re.I)
    if m is not None:
        action["committee"] = m.group(1)
        action["type"] = "discharged"
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"

    m = re.search("Cleared for White House|Presented to President", line, re.I)
    if m is not None:
        action["type"] = "topresident"

    m = re.search("Signed by President", line, re.I)
    if m is not None:
        action["type"] = "signed"
        status = "ENACTED:SIGNED"

    m = re.search("Pocket Vetoed by President", line, re.I)
    if m is not None:
        action["type"] = "vetoed"
        action["pocket"] = "1"
        status = "VETOED:POCKET"
    # need to put this in an else, or this regex will match the pocket veto and override it
    else:
        m = re.search("Vetoed by President", line, re.I)
        if m is not None:
            action["type"] = "vetoed"
            status = "PROV_KILL:VETO"

    m = re.search(r"^(?:Became )?(Public|Private) Law(?: No:)? ([\d\-]+)\.", line, re.I)
    if m is not None:
        action["law"] = m.group(1).lower()
        pieces = m.group(2).split("-")
        action["congress"] = pieces[0]
        action["number"] = pieces[1]
        action["type"] = "enacted"
        if prev_status == "ENACTED:SIGNED":
            pass  # this is a final administrative step
        elif prev_status == "PROV_KILL:VETO" or prev_status.startswith("VETOED:"):
            status = "ENACTED:VETO_OVERRIDE"
        elif bill_id in ("hr1589-94", "s2527-100", "hr1677-101", "hr2978-101", "hr2126-104", "s1322-104"):
            status = "ENACTED:TENDAYRULE"
        else:
            raise Exception("Missing Signed by President action? If this is a case of the 10-day rule, hard code the bill number here.")

    # Check for referral type
    m = re.search(r"Referred to (?:the )?(House|Senate)?\s?(?:Committee|Subcommittee)?", line, re.I)
    if m is not None:
        action["type"] = "referral"
        if prev_status == "INTRODUCED":
            status = "REFERRED"

    # Check for committee name, and store committee ids

    # Build a regex to find mentioned committees in the action line.
    cmte_names = []
    for name in utils.committee_names.keys():
        # excluding subcommittee names (they have pipes),
        if name.find('|') == -1:
            # name = re.sub(r"\(.*\)", '', name).strip()
            name = re.sub(r"^(House|Senate) ", "", name)
            cmte_names.append(name)
    cmte_reg = r"(House|Senate)?\s*(?:Committee)?\s*(?:on)?\s*(?:the)?\s*({0})".format("|".join(cmte_names))

    m = re.search(cmte_reg, line, re.I)
    if m:
        committees = []
        chamber = m.groups()[0]  # optional match

        # This could be made to look for multiple committee names.
        cmte_name_candidates = [" ".join([t for t in m.groups() if t is not None]).replace("House House", "House")]

        for cand in cmte_name_candidates:
            # many actions just say "Committee on the Judiciary", without a chamber
            # do our best to assign a chamber if we can be sure
            if ("House" not in cand) and ("Senate" not in cand):
                in_house = utils.committee_names.get("House %s" % cand, False)
                in_senate = utils.committee_names.get("Senate %s" % cand, False)
                if in_house and not in_senate:
                    cand = "House %s" % cand
                elif in_senate and not in_house:
                    cand = "Senate %s" % cand

                # if this action is a committee-level action (indented on THOMAS), look
                # at the parent action to infer the chamber
                elif len(action_dict.get("committee_action_ref", {}).get("committees", [])) > 0:
                    chamber = action_dict["committee_action_ref"]["committees"][0][0]  # H, S, or J
                    if chamber == "H":
                        cand = "House %s" % cand
                    elif chamber == "S":
                        cand = "Senate %s" % cand

                # look at other signals on the action line
                elif re.search("Received in the House|Reported to House", line):
                    cand = "House %s" % cand
                elif re.search("Received in the Senate|Reported to Senate", line):
                    cand = "Senate %s" % cand

                # if a bill is in an early stage where we're pretty sure activity is in the originating
                # chamber, fall back to the bill's originating chamber
                elif prev_status in ("INTRODUCED", "REFERRED", "REPORTED") and bill_id.startswith("h"):
                    cand = "House %s" % cand
                elif prev_status in ("INTRODUCED", "REFERRED", "REPORTED") and bill_id.startswith("s"):
                    cand = "Senate %s" % cand

            try:
                cmte_id = utils.committee_names[cand]
                committees.append(cmte_id)
            except KeyError:
                logging.warning("[%s] Committee id not found for '%s' in action <%s>" % (bill_id, cand, line))

        if committees:
            action['committees'] = committees

    # no matter what it is, sweep the action line for bill IDs of related bills
    # (a real list, so the emptiness check works the same on Python 2 and 3)
    bill_ids = [b for b in utils.extract_bills(line, congress) if b != bill_id]
    if bill_ids:
        action['bill_ids'] = bill_ids

    return action, status
def new_status_after_vote(vote_type, passed, chamber, bill_type, suspension, amended, title, prev_status):
    """Return the bill's status after a vote, or None if it does not change.

    vote_type: "vote" (originating chamber), "vote2" (second chamber),
        "pingpong", "cloture", "override" or "conference".
    passed: truthy if the vote succeeded.
    chamber: "h" for the House; anything else is treated as the Senate.
    bill_type: e.g. "hr", "sres", "hjres", "sconres".
    suspension: truthy if the vote was held under suspension of the rules.
    amended: truthy if the measure passed in amended form.
    title: bill title; used to spot proposed constitutional amendments.
    prev_status: status before this vote; only consulted for conference votes.
    """
    in_house = (chamber == "h")

    def cleared_congress():
        # final status once both chambers have agreed to identical text
        if bill_type in ("hjres", "sjres") and title.startswith("Proposing an amendment to the Constitution of the United States"):
            return 'PASSED:CONSTAMEND'  # joint resolution that looks like an amendment to the constitution
        if bill_type in ("hconres", "sconres"):
            return 'PASSED:CONCURRENTRES'  # end of life for concurrent resolutions
        return 'PASSED:BILL'  # on to the president

    # vote in originating chamber
    if vote_type == "vote":
        if passed:
            if bill_type in ("hres", "sres"):
                return 'PASSED:SIMPLERES'  # end of life for a simple resolution
            # passed by originating chamber, now in second chamber
            return 'PASS_OVER:HOUSE' if in_house else 'PASS_OVER:SENATE'
        if suspension:
            # provisionally killed by failure to pass under suspension of the rules
            return 'PROV_KILL:SUSPENSIONFAILED'
        # outright failure
        return 'FAIL:ORIGINATING:HOUSE' if in_house else 'FAIL:ORIGINATING:SENATE'

    # vote in second chamber or subsequent pingpong votes
    if vote_type in ("vote2", "pingpong"):
        if passed:
            if amended:
                # measure is passed but not in identical form, so the voting
                # chamber sends it back to the other one
                return 'PASS_BACK:HOUSE' if in_house else 'PASS_BACK:SENATE'
            return cleared_congress()
        if vote_type == "pingpong":
            # chamber failed to accept the other chamber's changes, but it can vote again
            return 'PROV_KILL:PINGPONGFAIL'
        if suspension:
            # provisionally killed by failure to pass under suspension of the rules
            return 'PROV_KILL:SUSPENSIONFAILED'
        # outright failure
        return 'FAIL:SECOND:HOUSE' if in_house else 'FAIL:SECOND:SENATE'

    if vote_type == "cloture":
        return None if passed else "PROV_KILL:CLOTUREFAILED"

    if vote_type == "override":
        originating = (bill_type[0] == chamber)
        if not passed:
            if originating:
                return 'VETOED:OVERRIDE_FAIL_ORIGINATING:HOUSE' if in_house else 'VETOED:OVERRIDE_FAIL_ORIGINATING:SENATE'
            return 'VETOED:OVERRIDE_FAIL_SECOND:HOUSE' if in_house else 'VETOED:OVERRIDE_FAIL_SECOND:SENATE'
        if originating:
            return 'VETOED:OVERRIDE_PASS_OVER:HOUSE' if in_house else 'VETOED:OVERRIDE_PASS_OVER:SENATE'
        return None  # just wait for the enacted line

    if vote_type == "conference":
        # This is tricky to integrate into status because we have to wait for both
        # chambers to pass the conference report.
        if passed:
            if prev_status.startswith("CONFERENCE:PASSED:"):
                return cleared_congress()
            return 'CONFERENCE:PASSED:HOUSE' if in_house else 'CONFERENCE:PASSED:SENATE'

    return None
# parse amendments out of undocumented standalone amendments page
def amendments_for_standalone(body, bill_id):
    """Extract amendment references from a standalone amendments page.

    body: HTML of the amendments page.
    bill_id: the bill's id; only its congress is used, to build amendment ids.

    Returns a list of dicts with 'chamber', 'amendment_type', 'number' and
    'amendment_id', one per amendment link found, in page order.

    Raises Exception when no amendment links are found and the page does
    not explicitly say ***NONE*** in its AMENDMENT(S) section.
    """
    _, _, congress = utils.split_bill_id(bill_id)

    link_pattern = r'<a href="/cgi-bin/bdquery/z\?d\d+:(SU|SP|HZ)\d+:">(S|H)\.(?:UP\.)?AMDT\.(\d+)</a>'

    amendments = []
    for code, chamber_letter, amdt_number in re.findall(link_pattern, body, re.I):
        chamber = chamber_letter.lower()

        # there are "senate unprinted amendments" for the 97th and 98th Congresses, with their own numbering scheme
        # make those use 'supamdt' as the type instead of 'samdt'
        amendment_type = "supamdt" if code == "SU" else chamber + "amdt"

        amendments.append({
            'chamber': chamber,
            'amendment_type': amendment_type,
            'number': amdt_number,
            'amendment_id': "%s%s-%s" % (amendment_type, amdt_number, congress)
        })

    # an empty result is only trusted when the page explicitly says so
    if not amendments and not re.search(r"AMENDMENT\(S\):((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
        raise Exception("Couldn't find amendments section.")

    return amendments
def amendments_for(body, bill_id):
    """Extract the numbered amendment list from a bill's information page.

    body: HTML of the bill's page.
    bill_id: the bill's id; only its congress is used, to build amendment ids.

    Returns a list of dicts with 'chamber', 'amendment_type', 'number' and
    'amendment_id'; an empty list when the page has no AMENDMENT(S) header.

    Raises Exception when the header is present but neither amendment
    entries nor an explicit ***NONE*** marker can be found.
    """
    _, _, congress = utils.split_bill_id(bill_id)

    # it is possible in older sessions for the amendments section to not appear at all.
    # if this method is being run, we know the page is not truncated, so if the header
    # is not at all present, assume the page is missing amendments because there are none.
    if not re.search(r"AMENDMENT\(S\):", body):
        return []

    entry_pattern = r'<b>\s*\d+\.</b>\s*<a href="/cgi-bin/bdquery/z\?d\d+:(SU|SP|HZ)\d+:">(S|H)\.(?:UP\.)?AMDT\.(\d+)\s*</a> to '

    amendments = []
    for code, chamber_letter, amdt_number in re.findall(entry_pattern, body, re.I):
        chamber = chamber_letter.lower()

        # there are "senate unprinted amendments" for the 97th and 98th Congresses, with their own numbering scheme
        # make those use 'supamdt' as the type instead of 'samdt'
        amendment_type = "supamdt" if code == "SU" else chamber + "amdt"

        amendments.append({
            'chamber': chamber,
            'amendment_type': amendment_type,
            'number': amdt_number,
            'amendment_id': "%s%s-%s" % (amendment_type, amdt_number, congress)
        })

    # an empty result is only trusted when the page explicitly says so
    if not amendments and not re.search(r"AMENDMENT\(S\):((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
        raise Exception("Couldn't find amendments section.")

    return amendments
# are there at least 150 amendments listed in this body? a quick tally
# not the end of the world if it's wrong once in a great while, it just sparks
# a less efficient way of gathering this bill's data
def too_many_amendments(body):
    """Return True when body lists 150 or more numbered amendment entries.

    Only entries linking to SP or HZ amendment codes are counted.
    """
    # example:
    # "<b>150.</b> <a href="/cgi-bin/bdquery/z?d111:SP02937:">S.AMDT.2937 </a> to <a href="/cgi-bin/bdquery/z?d111:HR03590:">H.R.3590</a>"
    entry_pattern = r'(<b>\s*\d+\.</b>\s*<a href="/cgi-bin/bdquery/z\?d\d+:(SP|HZ)\d+:">(S|H)\.AMDT\.\d+\s*</a> to )'
    tally = sum(1 for _ in re.finditer(entry_pattern, body, re.I))
    return tally >= 150
# bills reserved for the Speaker or Minority Leader are not actual legislation,
# just markers that the number will not be used for ordinary members' bills
def reserved_bill(body):
    """Return True when the bill page's official title marks it as reserved.

    body: HTML of the bill's page. The check looks for "Reserved for the
    Speaker" or "Reserved for the Minority Leader" after the "OFFICIAL TITLE
    AS INTRODUCED:" header and before the next "<hr", ignoring case.
    """
    marker = r"OFFICIAL TITLE AS INTRODUCED:((?:(?!\<hr).)+)Reserved for the (Speaker|Minority Leader)"
    return re.search(marker, body, re.S | re.I) is not None
# fetch GPO URLs for PDF and HTML formats
def get_GPO_url_for_bill(bill_id, options):
    """Look up the GPO URLs of a bill's text via its THOMAS pages.

    bill_id: the bill's id, e.g. "sconres1-113".
    options: passed through to utils.download.

    Returns a dict with 'pdf' and 'html' URLs for the last version listed
    on the THOMAS text page, or False when a page could not be downloaded,
    the "Text of Legislation" link is missing, or no GPO link is present.
    """
    # we need the URL of the pdf on GPO
    # there may be a way to calculate it, but in the meantime we'll get it the old-fashioned way
    # first get the THOMAS landing page. This may be duplicating work.
    # Maybe TODO -- reconcile with fdsys script (ideally without downloading large sitemaps for a single bill)
    bill_type, number, congress = utils.split_bill_id(bill_id)
    thomas_type = utils.thomas_types[bill_type][0]
    congress = int(congress)

    landing_url = "http://thomas.loc.gov/cgi-bin/bdquery/D?d%03d:%s:./list/bss/d%03d%s.lst:" % (congress, number, congress, thomas_type)
    landing_page = utils.download(
        landing_url,
        bill_cache_for(bill_id, "landing_page.html"),
        options)
    if not landing_page:
        logging.info("Could not download THOMAS landing page")
        return False

    # the "?" after "z" is a literal part of the URL, so it must be escaped
    text_link = re.search(r'href="/cgi-bin/query/z\?(.*?)">Text of Legislation', landing_page, re.I | re.S)
    if text_link is None:
        logging.info("No Text of Legislation link discovered")
        return False

    text_landing_page_url = "http://thomas.loc.gov/cgi-bin/query/z?" + text_link.group(1)
    text_landing_page = utils.download(
        text_landing_page_url,
        bill_cache_for(bill_id, "text_landing_page.html"),
        options)
    if not text_landing_page:
        logging.info("Could not download THOMAS text landing page")
        return False

    gpo_urls = re.findall(r'http://www\.gpo\.gov/fdsys/(.*?)\.pdf', text_landing_page, re.I | re.S)
    if not len(gpo_urls):
        logging.info("No GPO link discovered")
        return False

    # get last url on page, in cases where there are several versions of bill
    # THOMAS advises us to use the last one (e.g. http://thomas.loc.gov/cgi-bin/query/z?c113:S.CON.RES.1: )
    return {
        "pdf": "http://www.gpo.gov/fdsys/" + gpo_urls[-1] + ".pdf",
        "html": "http://www.gpo.gov/fdsys/" + gpo_urls[-1].replace("pdf", "html") + ".htm"
    }
# directory helpers
def output_for_bill(bill_id, format, is_data_dot=True):
    """Return the output path for one of a bill's data files.

    bill_id: the bill's id, split into type, number and congress.
    format: a file extension when is_data_dot is true (the file is then
        named "data.<format>"), otherwise the complete file name.
    is_data_dot: whether to prefix the file name with "data.".

    The path has the shape
    <data_dir>/<congress>/bills/<type>/<type><number>/<file name>.
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)
    fn = ("data.%s" % format) if is_data_dot else format
    return "%s/%s/bills/%s/%s%s/%s" % (utils.data_dir(), congress, bill_type, bill_type, number, fn)
# defaults to "All Information" page for a bill
def bill_url_for(bill_id, page="L"):
    """Return the THOMAS URL of one of a bill's pages.

    bill_id: the bill's id, split into type, number and congress.
    page: the THOMAS page code inserted after "@@@"; the default "L" is the
        "All Information" page.
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)
    url_parts = (
        int(congress),                     # zero-padded to three digits by %03d
        utils.thomas_types[bill_type][0],  # THOMAS's own code for the bill type
        number,
        page,
    )
    return "http://thomas.loc.gov/cgi-bin/bdquery/z?d%03d:%s%s:@@@%s&summ2=m&" % url_parts
def bill_cache_for(bill_id, file):
    """Return the cache path of a downloaded file belonging to a bill.

    The path has the shape <congress>/bills/<type>/<type><number>/<file>.
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)
    path_parts = (congress, bill_type, bill_type, number, file)
    return "%s/bills/%s/%s%s/%s" % path_parts