mirror of
https://github.com/unitedstates/congress.git
synced 2025-12-25 01:00:04 -05:00
1772 lines
70 KiB
Python
1772 lines
70 KiB
Python
import utils
|
|
import logging
|
|
import re
|
|
import json
|
|
from lxml import etree
|
|
import time
|
|
import datetime
|
|
from lxml.html import fromstring, HtmlElement
|
|
|
|
# can be run on its own; just requires a bill_id
|
|
|
|
|
|
def run(options):
    """Fetch a single bill when this task is run directly.

    Reads ``bill_id`` from ``options``. When it is present, the bill is
    fetched with ``fetch_bill`` and the returned status dict is logged at
    warning level. Otherwise an error is logged saying a bill_id is required.

    Returns None in both cases.
    """
    bill_id = options.get('bill_id', None)

    if not bill_id:
        logging.error("To run this task directly, supply a bill_id.")
        return

    result = fetch_bill(bill_id, options)
    # logging.warn is a deprecated alias of logging.warning.
    logging.warning("\n%s" % result)
|
|
|
|
|
|
# download and cache landing page for bill
|
|
# can raise an exception under various conditions
|
|
def fetch_bill(bill_id, options):
    """Download, parse and write out one bill, plus optional text formats.

    Steps, in order:
      1. Make sure ``utils.committee_names`` is populated.
      2. Download (and cache) the bill's landing page.
      3. Parse it with ``parse_bill`` or, when the page is truncated, has too
         many amendments, or ``options['force_split']`` is set, with
         ``parse_bill_split``.
      4. Write the bill with ``output_bill``.
      5. If ``options['formats']`` is set, download each requested format
         from the URLs returned by ``get_GPO_url_for_bill`` and write it.

    Returns a dict that always has 'ok' and 'saved'. Early exits also carry a
    'reason'. When formats were requested, the dict has one boolean entry per
    requested format telling whether that format was saved.

    Can raise an exception under various conditions (the parsers raise when
    they cannot find what they expect).
    """
    logging.info("\n[%s] Fetching..." % bill_id)

    # fetch committee name map, if it doesn't already exist
    bill_type, number, congress = utils.split_bill_id(bill_id)
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch bill details body
    body = utils.download(
        bill_url_for(bill_id),
        bill_cache_for(bill_id, "information.html"),
        options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    if reserved_bill(body):
        logging.warning("[%s] Reserved bill, not real, skipping..." % bill_id)
        return {'saved': False, 'ok': True, 'reason': "reserved bill"}

    # Conditions where we want to parse the bill from multiple pages instead
    # of one. The checks run in this order and the first hit wins:
    #
    # 1) the all info page is truncated (~5-10 bills a congress)
    #    e.g. s1867-112, hr2112-112, s3240-112
    # 2) there are > 150 amendments, use undocumented amendments list
    #    (~5-10 bills a congress), e.g. hr3590-111, sconres13-111, s3240-112
    # 3) when I feel like it (force_split option)
    if "</html>" not in body:
        split_reason = "Main page truncated"
    elif too_many_amendments(body):
        split_reason = "Too many amendments"
    elif options.get('force_split', False):
        split_reason = "Forcing a split"
    else:
        split_reason = None

    if split_reason:
        logging.info("[%s] %s, fetching many pages..." % (bill_id, split_reason))
        bill = parse_bill_split(bill_id, body, options)
    else:
        # Otherwise, get the bill's data from a single All Information page
        bill = parse_bill(bill_id, body, options)

    output_bill(bill, options)

    # output PDF and/or HTML file if requested
    if not options.get("formats", False):
        return {'ok': True, 'saved': True}

    status = {'ok': True, 'saved': True}

    # The lowercased value is stored back into options, as before, because
    # options is handed on to get_GPO_url_for_bill below.
    options["formats"] = options["formats"].lower()

    if options["formats"] == "all":
        formats = ["pdf", "html"]
    else:
        formats = options["formats"].split(",")

    gpo_urls = get_GPO_url_for_bill(bill_id, options)

    for fmt in formats:
        if not (gpo_urls and fmt in gpo_urls):
            status[fmt] = False
            continue

        content = utils.download(gpo_urls[fmt], bill_cache_for(bill_id, "bill." + fmt), {'binary': True})
        if not content:
            # A failed download used to be passed straight to utils.write and
            # reported as saved; report it as not saved instead.
            logging.warning("[%s] Failed to download %s format" % (bill_id, fmt))
            status[fmt] = False
            continue

        utils.write(content, output_for_bill(bill_id, fmt))
        logging.info("Saving %s format for %s" % (fmt, bill_id))
        status[fmt] = True

    return status
|
|
|
|
|
|
def parse_bill(bill_id, body, options):
    """Parse a bill entirely from its All Information page.

    Every section parser is run against the same ``body`` and the pieces are
    handed to ``process_bill``, whose return value is returned unchanged.
    """
    _bill_type, _number, congress = utils.split_bill_id(bill_id)

    # parse everything out of the All Information page; the parsers run in
    # the same order as the fields are listed here
    introduced_at = introduced_at_for(body)
    by_request = parse_by_request(body)
    sponsor = sponsor_for(body)
    cosponsors = cosponsors_for(body)
    summary = summary_for(body)
    titles = titles_for(body)
    actions = actions_for(body, bill_id)
    related_bills = related_bills_for(body, congress, bill_id)
    subjects = subjects_for(body)
    committees = committees_for(body, bill_id)
    amendments = amendments_for(body, bill_id)

    return process_bill(
        bill_id,
        options,
        introduced_at,
        by_request,
        sponsor,
        cosponsors,
        summary,
        titles,
        actions,
        related_bills,
        subjects,
        committees,
        amendments,
    )
|
|
|
|
|
|
# parse information pieced together from various pages
|
|
def parse_bill_split(bill_id, body, options):
    """Parse a bill from its per-section pages instead of one page.

    Introduction date, by-request flag, sponsor and subjects are still read
    from the already-downloaded All Information ``body``. Cosponsors,
    summary, titles, actions, related bills, amendments and committees are
    each downloaded from their own page, unescaped, and parsed. The pieces
    are handed to ``process_bill``, whose return value is returned.
    """
    _bill_type, _number, congress = utils.split_bill_id(bill_id)

    def fetch_page(page_code, cache_name):
        # Download (and cache) one sub-page of the bill, then unescape it.
        page = utils.download(
            bill_url_for(bill_id, page_code),
            bill_cache_for(bill_id, cache_name),
            options)
        return utils.unescape(page)

    # get some info out of the All Info page, since we already have it
    introduced_at = introduced_at_for(body)
    by_request = parse_by_request(body)
    sponsor = sponsor_for(body)
    subjects = subjects_for(body)

    # Each page is fetched and parsed before the next one is requested.
    cosponsors = cosponsors_for(fetch_page("P", "cosponsors.html"))
    summary = summary_for(fetch_page("D", "summary.html"))
    titles = titles_for(fetch_page("T", "titles.html"))
    actions = actions_for(fetch_page("X", "actions.html"), bill_id)
    related_bills = related_bills_for(fetch_page("K", "related_bills.html"), congress, bill_id)
    # the stand-alone amendments page has its own parser
    amendments = amendments_for_standalone(fetch_page("A", "amendments.html"), bill_id)
    committees = committees_for(fetch_page("C", "committees.html"), bill_id)

    return process_bill(
        bill_id,
        options,
        introduced_at,
        by_request,
        sponsor,
        cosponsors,
        summary,
        titles,
        actions,
        related_bills,
        subjects,
        committees,
        amendments,
    )
|
|
|
|
|
|
# take the initial parsed content, extract more information, assemble output data
|
|
def process_bill(bill_id, options,
                 introduced_at, by_request, sponsor, cosponsors,
                 summary, titles, actions, related_bills, subjects, committees, amendments):
    """Turn the parsed pieces of a bill into the final output dict.

    Adds derived data on top of what the parsers produced: the current
    official/short/popular title, processed actions, the latest status and
    its date (defaulting to "INTRODUCED" at ``introduced_at``), the history
    summary, and the slip law. ``subjects`` is indexed as a pair: element 0
    becomes 'subjects_top_term' and element 1 becomes 'subjects'.

    Returns the bill dict, stamped with the current local time as
    'updated_at'.
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)

    # for convenience: extract out current title of each type
    official_title = current_title_for(titles, "official")
    short_title = current_title_for(titles, "short")
    popular_title = current_title_for(titles, "popular")

    # add metadata to each action, establish current status
    actions = process_actions(actions, bill_id, official_title, introduced_at)

    # pull out latest status change and the date of it
    status, status_date = latest_status(actions)
    if not status:
        # default to introduced
        status, status_date = "INTRODUCED", introduced_at

    # pull out some very useful history information from the actions
    history = history_from_actions(actions)
    slip_law = slip_law_from(actions)

    bill = {}

    # identity
    bill['bill_id'] = bill_id
    bill['bill_type'] = bill_type
    bill['number'] = number
    bill['congress'] = congress

    # introduction and sponsorship
    bill['introduced_at'] = introduced_at
    bill['by_request'] = by_request
    bill['sponsor'] = sponsor
    bill['cosponsors'] = cosponsors

    # actions and status
    bill['actions'] = actions
    bill['history'] = history
    bill['status'] = status
    bill['status_at'] = status_date
    bill['enacted_as'] = slip_law

    # titles
    bill['titles'] = titles
    bill['official_title'] = official_title
    bill['short_title'] = short_title
    bill['popular_title'] = popular_title

    # summary and subjects
    bill['summary'] = summary
    bill['subjects_top_term'] = subjects[0]
    bill['subjects'] = subjects[1]

    # cross references
    bill['related_bills'] = related_bills
    bill['committees'] = committees
    bill['amendments'] = amendments

    bill['updated_at'] = datetime.datetime.fromtimestamp(time.time())

    return bill
|
|
|
|
|
|
def output_bill(bill, options):
    """Write a bill dict to disk as JSON and as GovTrack-style XML.

    The JSON file is a straight dump of ``bill`` (datetimes are serialized
    with ``utils.format_datetime``). The XML document is built element by
    element below. Attribute and child order follow the statement order, so
    statements here should not be reordered casually.

    When ``options['govtrack']`` is set, ``thomas_id`` attributes are written
    as ``id`` attributes holding the GovTrack person ID, and attributes with
    falsy values are dropped.
    """
    bill_id = bill['bill_id']
    logging.info("[%s] Writing to disk..." % bill_id)

    # output JSON - so easy!
    utils.write(
        json.dumps(bill, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_bill(bill_id, "json"),
        options=options,
    )

    # output XML
    govtrack_type_codes = {'hr': 'h', 's': 's', 'hres': 'hr', 'sres': 'sr', 'hjres': 'hj', 'sjres': 'sj', 'hconres': 'hc', 'sconres': 'sc'}
    root = etree.Element("bill")
    root.set("session", bill['congress'])
    root.set("type", govtrack_type_codes[bill['bill_type']])
    root.set("number", bill['number'])
    root.set("updated", utils.format_datetime(bill['updated_at']))

    def make_node(parent, tag, text, **attrs):
        # Thin wrapper over utils.make_node that applies the govtrack
        # attribute rewriting when that option is on.
        if options.get("govtrack", False):
            rewritten = {}
            for key, value in attrs.items():
                if not value:
                    continue
                if key == "thomas_id":
                    # remap "thomas_id" attributes to govtrack "id"
                    key = "id"
                    value = str(utils.get_govtrack_person_id('thomas', value))
                rewritten[key] = value
            attrs = rewritten

        return utils.make_node(parent, tag, text, **attrs)

    # for American Memory Century of Lawmaking bills...
    for source in bill.get("sources", []):
        source_node = make_node(root, "source", "")
        for key, value in sorted(source.items()):
            if key == "source":
                source_node.text = value
            elif key == "source_url":
                source_node.set("url", value)
            else:
                source_node.set(key, unicode(value))
    if "original_bill_number" in bill:
        make_node(root, "bill-number", bill["original_bill_number"])

    make_node(root, "state", bill['status'], datetime=bill['status_at'])
    old_status = make_node(root, "status", None)
    # dummy for the sake of comparison
    if bill['status'] in ("INTRODUCED", "REFERRED"):
        old_status_tag = "introduced"
    else:
        old_status_tag = "unknown"
    make_node(old_status, old_status_tag, None, datetime=bill['status_at'])

    make_node(root, "introduced", None, datetime=bill['introduced_at'])
    titles_node = make_node(root, "titles", None)
    for title in bill['titles']:
        title_node = make_node(titles_node, "title", title['title'])
        title_node.set("type", title['type'])
        if title['as']:
            title_node.set("as", title['as'])
        if title['is_for_portion']:
            title_node.set("partial", "1")

    if bill['sponsor']:
        # TODO: Sponsored by committee?
        make_node(root, "sponsor", None, thomas_id=bill['sponsor']['thomas_id'])
    else:
        make_node(root, "sponsor", None)

    cosponsors_node = make_node(root, "cosponsors", None)
    for cosp in bill['cosponsors']:
        cosp_node = make_node(cosponsors_node, "cosponsor", None, thomas_id=cosp["thomas_id"])
        if cosp["sponsored_at"]:
            cosp_node.set("joined", cosp["sponsored_at"])
        if cosp["withdrawn_at"]:
            cosp_node.set("withdrawn", cosp["withdrawn_at"])

    # Action types that get their own element name; everything else is
    # written as a generic <action>.
    named_action_types = ("vote", "vote-aux", "calendar", "topresident", "signed", "enacted", "vetoed")

    actions_node = make_node(root, "actions", None)
    for action in bill['actions']:
        action_type = action['type']
        if action_type in named_action_types:
            action_tag = action_type
        else:
            action_tag = "action"

        a = make_node(actions_node, action_tag, None, datetime=action['acted_at'])
        if action.get("status"):
            a.set("state", action["status"])

        if action_type in ('vote', 'vote-aux'):
            a.clear()  # re-insert date between some of these attributes
            a.set("how", action["how"])
            a.set("type", action["vote_type"])
            if action.get("roll") is not None:
                a.set("roll", action["roll"])
            a.set("datetime", utils.format_datetime(action['acted_at']))
            a.set("where", action["where"])
            a.set("result", action["result"])
            if action.get("suspension"):
                a.set("suspension", "1")
            if action.get("status"):
                a.set("state", action["status"])

        if action_type == 'calendar' and "calendar" in action:
            a.set("calendar", action["calendar"])
            if action["under"]:
                a.set("under", action["under"])
            if action["number"]:
                a.set("number", action["number"])

        if action_type == 'enacted':
            a.clear()  # re-insert date between some of these attributes
            a.set("number", "%s-%s" % (bill['congress'], action["number"]))
            a.set("type", action["law"])
            a.set("datetime", utils.format_datetime(action['acted_at']))
            if action.get("status"):
                a.set("state", action["status"])

        if action_type == 'vetoed' and action.get("pocket"):
            a.set("pocket", "1")

        if action.get('text'):
            make_node(a, "text", action['text'])
        if action.get('in_committee'):
            make_node(a, "committee", None, name=action['in_committee'])
        for cr in action['references']:
            make_node(a, "reference", None, ref=cr['reference'], label=cr['type'])

    committees_node = make_node(root, "committees", None)
    for cmt in bill['committees']:
        # code is committee id plus subcommittee id when the latter is known
        if cmt.get("subcommittee_id", None):
            code = cmt["committee_id"] + cmt["subcommittee_id"]
        else:
            code = cmt["committee_id"]
        committee_name = cmt["committee"]
        if cmt.get("subcommittee"):
            subcommittee_name = cmt.get("subcommittee").replace("Subcommittee on ", "")
        else:
            subcommittee_name = ""
        activity = ", ".join(c.title() for c in cmt["activity"])
        make_node(committees_node, "committee", None, code=code, name=committee_name, subcommittee=subcommittee_name, activity=activity)

    relatedbills_node = make_node(root, "relatedbills", None)
    for rb in bill['related_bills']:
        if rb['type'] != "bill":
            continue
        rb_bill_type, rb_number, rb_congress = utils.split_bill_id(rb['bill_id'])
        rb_type_code = govtrack_type_codes[rb_bill_type]
        if rb['reason'] == "related":
            relation = "unknown"
        else:
            relation = rb['reason']
        make_node(relatedbills_node, "bill", None, session=rb_congress, type=rb_type_code, number=rb_number, relation=relation)

    subjects_node = make_node(root, "subjects", None)
    top_term = bill['subjects_top_term']
    if top_term:
        make_node(subjects_node, "term", None, name=top_term)
    for subject in bill['subjects']:
        # the top term was already written first; don't repeat it
        if subject != bill['subjects_top_term']:
            make_node(subjects_node, "term", None, name=subject)

    amendments_node = make_node(root, "amendments", None)
    for amd in bill['amendments']:
        make_node(amendments_node, "amendment", None, number=amd["chamber"] + str(amd["number"]))

    if bill.get('summary'):
        summary = bill['summary']
        # Re-render the ISO date as m/d/YYYY without zero padding: the regex
        # drops a leading "0" and any "0" that directly follows a "/".
        padded_date = datetime.datetime.strftime(datetime.datetime.strptime(summary['date'], "%Y-%m-%d"), "%m/%d/%Y")
        short_date = re.sub(r"^0|(/)0", lambda m: m.group(1), padded_date)
        make_node(root, "summary", short_date + "--" + summary.get('as', '?') + ".\n" + summary['text'])  # , date=bill['summary'].get('date'), status=bill['summary'].get('as'))

    utils.write(
        etree.tostring(root, pretty_print=True),
        output_for_bill(bill['bill_id'], "xml"),
        options=options
    )
|
|
|
|
|
|
# This routine is also used by amendment processing. One difference is the
|
|
# lack of <b> tags on amendment pages but their presence on bill pages.
|
|
# Also, amendments can be sponsored by committees.
|
|
def sponsor_for(body):
    """Extract the sponsor from a bill or amendment page.

    Returns:
      * None when the page says "No Sponsor";
      * a dict with 'type': 'person' plus title, name, thomas_id, state and
        district when the sponsor link is followed by a ``[ST]`` or
        ``[ST-NN]`` marker;
      * a dict with 'type': 'committee' plus name and committee_id otherwise.

    Raises Exception when no sponsor line is found, when the ID in the link
    does not have the form expected for its kind, or when a person's name
    does not start with a recognized title.
    """
    match = re.search(r"(?:<b>)?Sponsor: (?:</b>)?(No Sponsor|<a href=[^>]+\+(\d{5}|[hs]...\d\d).*>(.+)</a>(?:\s+\[((\w\w)(-(\d+))?)\])?)", body, re.I)
    if not match:
        raise Exception("Choked finding sponsor information.")

    whole, sponsor_id, link_text, locale = match.group(1, 2, 3, 4)

    # The search is case-insensitive, so compare case-insensitively too.
    # Otherwise e.g. "NO SPONSOR" would fall through to the committee branch
    # with no ID captured.
    if whole.lower() == "no sponsor" or (link_text and link_text.lower() == "no sponsor"):
        return None

    if locale:
        # has a state/district, so it's a rep (or senator: state only)
        state, district = match.group(5), match.group(7)

        thomas_id = sponsor_id
        if not re.match(r"\d{5}$", thomas_id):
            raise Exception("Choked parsing sponsor.")

        # zero-pad and apply corrections
        thomas_id = "%05d" % int(thomas_id)
        thomas_id = utils.thomas_corrections(thomas_id)

        name = link_text.strip()
        name_match = re.search(r"^(Rep|Sen|Del|Com)\.? (.*?)$", name)
        if not name_match:
            # previously surfaced as an AttributeError on None
            raise Exception("Choked parsing sponsor name: %s" % name)
        title, name = name_match.groups()

        return {
            'type': 'person',
            'title': title,
            'name': name,
            'thomas_id': thomas_id,
            'state': state,
            'district': district
        }

    # no state/district marker: it's a committee
    committee_id = sponsor_id
    name = link_text.strip()
    if not re.match(r"[hs]...\d\d$", committee_id):
        raise Exception("Choked parsing apparent committee sponsor.")
    return {
        'type': 'committee',
        'name': name,
        'committee_id': committee_id,
    }
|
|
|
|
|
|
def summary_for(body):
    """Extract the bill summary from a page.

    Returns None when the page explicitly reports no summary. Otherwise
    returns a dict with 'text' (tags stripped, paragraphs separated by blank
    lines) and, when the leading "m/d/yyyy--Status." header is present,
    'date' (as YYYY-MM-DD) and 'as' (the status without its trailing period).

    Raises Exception when no summary section can be found at all.
    """
    match = re.search(r'SUMMARY AS OF:</a></b>(.*?)(?:<hr|<div id="footer">)', body, re.S)
    if not match:
        if re.search(r"<b>SUMMARY:</b><p>\*\*\*NONE\*\*\*", body, re.I):
            return None  # expected when no summary
        raise Exception("Choked finding summary.")

    ret = {}

    text = match.group(1).strip()

    # strip out the bold explanation of a new summary, if present
    text = re.sub(r"\s*<p><b>\(This measure.*?</b></p>\s*", "", text)

    # strip out the intro date thing; the optional middle group is the
    # "There are N other summaries" link, set off by four non-breaking spaces
    sumdate = (
        r"(\d+/\d+/\d+)--([^\s].*?)"
        + u"(\u00a0\u00a0\u00a0\u00a0"
        + r'\(There (is|are) \d+ <a href="[^>]+">other (summary|summaries)</a>\))?(\n|<p>)'
    )
    m = re.search(sumdate, text)
    if m:
        d = m.group(1)
        if d == "7/11/1794":
            d = "7/11/1974"  # THOMAS error
        parsed = datetime.datetime.strptime(d, "%m/%d/%Y")
        ret["date"] = datetime.datetime.strftime(parsed, "%Y-%m-%d")
        ret["as"] = m.group(2)
        if ret["as"].endswith("."):
            ret["as"] = ret["as"][:-1]
        # Remove only the header that was just parsed. Substituting every
        # match would also delete date-like "m/d/yyyy--..." spans from the
        # summary body itself.
        text = text[:m.start()] + text[m.end():]

    # Preserve paragraph breaks. Convert closing p tags (and surrounding
    # whitespace) into two newlines. Strip trailing whitespace
    text = re.sub(r"\s*</\s*p\s*>\s*", "\n\n", text).strip()

    # naive stripping of tags, should work okay in this limited context
    text = re.sub("<[^>]+>", "", text)

    # compress and strip whitespace artifacts, except for the paragraph breaks
    text = re.sub("[ \t\r\f\v]{2,}", " ", text).strip()

    ret["text"] = text

    return ret
|
|
|
|
|
|
def parse_committee_rows(rows, bill_id):
    """Parse the rows of a committee/subcommittee table.

    ``rows`` are HTML fragments, one per table row. Rows containing a
    ``<table`` or ``</table`` tag are skipped as header/end rows. A row with
    a 5%-width cell is treated as a subcommittee of the most recent
    top-level committee row.

    Returns a list of dicts with 'committee', 'activity' (list of lowercased
    strings) and 'committee_id'. Subcommittee entries also have
    'subcommittee' and, when it can be looked up, 'subcommittee_id'.

    Raises Exception when a row has no committee name or no activity cell,
    and KeyError when a top-level committee name is not in
    ``utils.committee_names``.
    """
    # counts on having been loaded already
    committee_names = utils.committee_names

    committee_info = []
    top_committee = None
    for row in rows:
        # ignore header/end row that contain no committee information
        if re.search("</?table", row):
            continue

        # identifies and pulls out committee name
        # Can handle committee names with letters, white space, dashes, slashes, parens, periods, apostrophes, and ampersands.
        name_match = re.search(r"""(?<=">)[-.\w\s,()'&/]+(?=</a>)""", row)
        if not name_match:
            raise Exception("Couldn't find committee name. Line was: " + row)
        # remove excess internal spacing
        committee = re.sub(r"\s{2,}", " ", name_match.group().strip())

        # identifies and pulls out committee activity
        activity_match = re.search(r'(?<=<td width="65%">).*?(?=</td>)', row)
        if not activity_match:
            raise Exception("Couldn't find committee activity.")
        # comma-separated activities, lowercased, each stripped of whitespace
        activity_string = activity_match.group().strip().lower()
        activity = [part.strip() for part in activity_string.split(",")]

        # identifies subcommittees by change in table cell width
        is_subcommittee = re.search('<td width="5%">', row)
        if not is_subcommittee:
            top_committee = committee  # saves committee for the next row in case it is a subcommittee
            committee_info.append({"committee": committee, "activity": activity, "committee_id": committee_names[committee]})
            continue

        if not top_committee:
            # Subcommittees are a little finicky, so don't raise an exception if the subcommittee can't be processed.
            logging.warning("[%s] Subcommittee specified without a parent committee: %s" % (bill_id, committee))
            continue

        entry = {"committee": top_committee, "activity": activity, "subcommittee": committee, "committee_id": committee_names[top_committee]}
        committee_info.append(entry)

        # Subcommittees are a little finicky, so don't raise an exception if the subcommittee is not found.
        # Just skip writing the id attribute.
        try:
            entry["subcommittee_id"] = committee_names[committee_names[top_committee] + "|" + committee.replace("Subcommittee on ", "")]
        except KeyError:
            logging.warning("[%s] Subcommittee not found in %s: %s" % (bill_id, committee_names[top_committee], committee))

    return committee_info
|
|
|
|
|
|
def committees_for(body, bill_id):
    """Extract committee and subcommittee assignments from a page.

    Finds the "COMMITTEE(S):" section up to the first following ``</table>``.
    Returns an empty list when that section contains ``***NONE***``.
    Otherwise the section is split on ``</tr>`` and the rows are parsed by
    ``parse_committee_rows``, which depends on ``utils.committee_names``
    having been loaded already.

    Raises Exception when the committees section cannot be found.
    """
    # grabs entire Committee & Subcommittee table
    match = re.search(r"COMMITTEE\(S\):<.*?<ul>.*?</table>", body, re.I | re.S)
    if not match:
        raise Exception("Couldn't find committees section.")

    committee_text = match.group().strip()

    # returns empty array for bills not assigned to a committee; e.g. bill_id=hr19-112
    if re.search(r"\*\*\*NONE\*\*\*", committee_text):
        return []

    # splits Committee & Subcommittee table up by table row
    rows = committee_text.split("</tr>")
    return parse_committee_rows(rows, bill_id)
|
|
|
|
|
|
def titles_for(body):
    """Extract every title listed in the TITLE(S) section of a page.

    Each ``<p><li>`` section starts with a heading such as
    "SHORT TITLE(S) AS INTRODUCED:" followed by one title per line break.
    Returns a list of dicts, in page order, with:
      'title'          - the title text, tags stripped, whitespace normalized
      'type'           - 'popular', 'short' or 'official'
      'as'             - lowercased text after " AS " in the heading, or None
      'is_for_portion' - True from the first ``<I>``-prefixed title in a
                         section onward

    The list may be empty. Raises Exception when the titles section is
    missing or a heading names an unknown title type.
    """
    match = re.search(r'TITLE\(S\):<.*?<ul>.*?<p><li>(.*?)(?:<hr|<div id="footer">)', body, re.I | re.S)
    if not match:
        raise Exception("Couldn't find titles section.")

    titles = []

    text = match.group(1).strip()
    for section in text.split("<p><li>"):
        if section.strip() == "":
            continue

        # move the <I> that indicates subsequent titles are for a portion of the bill
        # to after the <br> that follows it so that it's associated with the right title.
        section = re.sub("<I><br ?/>", "<br/><I>", section)

        # ensure single newlines between each title in the section
        section = re.sub("\n?<br ?/>", "\n", section)

        pieces = section.split("\n")

        full_type, type_titles = pieces[0], pieces[1:]
        if " AS " in full_type:
            # split once only, so a second " AS " in the heading cannot break
            # the two-way unpacking
            title_type, state = full_type.split(" AS ", 1)
            state = state.replace(":", "").lower()
        else:
            title_type, state = full_type, None

        if "POPULAR TITLE" in title_type:
            title_type = "popular"
        elif "SHORT TITLE" in title_type:
            title_type = "short"
        elif "OFFICIAL TITLE" in title_type:
            title_type = "official"
        else:
            raise Exception("Unknown title type: " + title_type)

        is_for_portion = False
        for title in type_titles:
            if title.startswith("<I>"):
                # This and subsequent titles in this piece are all for a portion of the bill.
                # The <I> tag will be removed below.
                is_for_portion = True

            # Strip, remove tabs, and replace whitespace and nonbreaking spaces with spaces,
            # since occasionally (e.g. s649-113) random \r's etc. appear instead of spaces.
            title = re.sub("<[^>]+>", "", title)  # strip tags
            # u"...\\s..." rather than ur"...": same pattern, but also valid
            # syntax on Python 3
            title = re.sub(u"[\\s\u00a0]+", " ", title.strip())  # strip space and normalize spaces
            if title == "":
                continue

            if title_type == "popular":
                title = re.sub(r" \(identified.+?$", "", title)

            titles.append({
                'title': title,
                'is_for_portion': is_for_portion,
                'as': state,
                'type': title_type,
            })

    # An empty list is returned as-is. The "No titles found" check that used
    # to follow this return was unreachable and has been removed, so behavior
    # is unchanged.
    return titles
|
|
|
|
# the most current title of a given type is the first one in the last 'as' subgroup
|
|
# of the titles for the whole bill (that is, if there's no title for the whole bill
|
|
# in the last 'as' subgroup, use the previous 'as' subgroup and so on) --- we think
|
|
# this logic matches THOMAS/Congress.gov.
|
|
|
|
|
|
def current_title_for(titles, type):
    """Return the current title of the given type, or None if there is none.

    Titles that apply only to a portion of the bill are ignored. Among the
    rest, the result is the first title of the last run of consecutive
    same-'as' titles of the requested type: each time the 'as' value differs
    from the previously stored one, the stored title is replaced.

    ``type`` is 'official', 'short' or 'popular'. The parameter name shadows
    the builtin but is kept for compatibility with callers.
    """
    current_title = None
    current_as = -1  # not None, cause for popular titles, None is a valid 'as'

    for title in titles:
        if title['type'] != type or title['is_for_portion']:
            continue
        if title['as'] == current_as:
            continue
        # right type, new 'as', store first one
        current_title = title['title']
        current_as = title['as']

    return current_title
|
|
|
|
|
|
def actions_for(body, bill_id, is_amendment=False):
    """Extract the list of actions from a bill (or amendment) page.

    For bills the "ALL ACTIONS:" section is read; for amendments
    (``is_amendment=True``; this function is also used by amendment_info.py)
    the "STATUS:" section is read instead.

    Returns:
      * None for an amendment without a Status section (it is optional);
      * [] for a bill whose actions section says ``***NONE***``;
      * otherwise a list of action dicts sorted chronologically, each with
        'text', 'type' ("action"), 'acted_at' and 'references'. 'acted_at' is
        a datetime when the page gave a time and a "YYYY-MM-DD" string when
        it gave only a date. Indented (committee / subcommittee) actions also
        carry 'committee_action_ref' / 'subcommittee_action_ref' pointing at
        the dicts of the enclosing actions.

    Raises Exception when the section is missing, an action line cannot be
    parsed, or the ``<dl>`` nesting goes outside 0..2.
    """
    # Local import so this block does not depend on the file-level imports.
    import functools

    if not is_amendment:
        match = re.search(r'>ALL ACTIONS:<.*?<dl>(.*?)(?:<hr|<div id="footer">)', body, re.I | re.S)
    else:
        match = re.search(r'>STATUS:<.*?<dl>(.*?)(?:<hr|<div id="footer">)', body, re.I | re.S)

        # The Status section is optional for amendments.
        if not match:
            return None

    if not match:
        if re.search(r"ALL ACTIONS:((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
            return []  # no actions, can happen for bills reserved for the Speaker
        raise Exception("Couldn't find action section.")

    actions = []
    indentation_level = 0
    last_top_level_action = None
    last_committee_level_action = None

    for piece in match.group(1).strip().split("\n"):
        # only lines carrying a <strong> timestamp are actions
        if re.search("<strong>", piece) is None:
            continue

        action_pieces = re.search("((?:</?dl>)*)<dt><strong>(.*?):</strong><dd>(.+?)$", piece)
        if not action_pieces:
            raise Exception("Choked on parsing an action: %s" % piece)

        indentation_changes, timestamp, text = action_pieces.groups()

        # indentation indicates a committee action, track the indentation level
        for indentation_change in re.findall("</?dl>", indentation_changes):
            if indentation_change == "<dl>":
                indentation_level += 1
            if indentation_change == "</dl>":
                indentation_level -= 1
        if indentation_level < 0 or indentation_level > 2:
            raise Exception("Action indentation level %d out of bounds." % indentation_level)

        # timestamp of the action: keep a datetime when a time of day is
        # given, otherwise reduce to a plain date string
        if re.search("(am|pm)", timestamp):
            action_time = datetime.datetime.strptime(timestamp, "%m/%d/%Y %I:%M%p")
        else:
            action_time = datetime.datetime.strptime(timestamp, "%m/%d/%Y")
            action_time = datetime.datetime.strftime(action_time, "%Y-%m-%d")

        cleaned_text, references = action_for(text)

        action = {
            'text': cleaned_text,
            'type': "action",
            'acted_at': action_time,
            'references': references
        }
        actions.append(action)

        # Associate subcommittee actions with the parent committee by including
        # a reference to the last top-level action line's dict, since we haven't
        # yet parsed which committee it is in. Likewise for 2nd-level indentation
        # to the top-level and 1st-level indentation actions. In some cases,
        # 2nd-level indentation occurs without any preceding 1st-level indentation.
        if indentation_level == 0:
            last_top_level_action = action
            last_committee_level_action = None
        elif indentation_level == 1:
            if last_top_level_action:
                action["committee_action_ref"] = last_top_level_action
            else:
                logging.info("[%s] Committee-level action without a preceding top-level action." % bill_id)
            last_committee_level_action = action
        elif indentation_level == 2:
            if last_top_level_action:
                action["committee_action_ref"] = last_top_level_action
                if last_committee_level_action:
                    action["subcommittee_action_ref"] = last_committee_level_action
                else:
                    logging.info("[%s] Sub-committee-level action without a preceding committee-level action." % bill_id)
            else:
                logging.info("[%s] Sub-committee-level action without a preceding top-level action." % bill_id)

    # THOMAS has a funny way of outputting actions. It is sorted by date,
    # except that committee events are grouped together. Once we identify
    # the committees related to events, we should sort the events properly
    # in time order. But (of course there's a but) not all dates have times,
    # meaning we will come to having to compare equal dates and dates with
    # times on those dates. In those cases, preserve the original order
    # of the events as shown on THOMAS.
    #
    # Note that we do this *before* process actions, since we must get
    # this in chronological order before running our status finite state machine.
    def action_comparer(a, b):
        a = a["acted_at"]
        b = b["acted_at"]
        if isinstance(a, str) or isinstance(b, str):
            # If either is a plain date without time, compare them only on the
            # basis of the date parts, meaning the unspecified time is treated
            # as unknown, rather than treated as midnight.
            if isinstance(a, datetime.datetime):
                a = datetime.datetime.strftime(a, "%Y-%m-%d")
            if isinstance(b, datetime.datetime):
                b = datetime.datetime.strftime(b, "%Y-%m-%d")
        # Otherwise both are date+time's: do a normal comparison.
        # (a > b) - (a < b) gives the same -1/0/1 result as the Python 2
        # builtin cmp(), which no longer exists on Python 3.
        return (a > b) - (a < b)

    # .sort() is stable, so original order is preserved where the comparer
    # returns 0. cmp_to_key replaces the Python-2-only positional cmp argument.
    actions.sort(key=functools.cmp_to_key(action_comparer))

    return actions
|
|
|
|
|
|
# clean text, pull out the action type, any other associated metadata with an action
|
|
def action_for(text):
    """Clean one action's text and pull out its trailing references.

    Anchor tags are removed from ``text``. If the text ends with a
    parenthesized group, that group is cut off and split into references.

    Returns ``(cleaned_text, references)`` where ``references`` is a list of
    ``{'type': ..., 'reference': ...}`` dicts. 'type' is None for a reference
    that has no "label: " prefix.
    """
    # strip out links
    cleaned = re.sub(r"</?[Aa]( \S.*?)?>", "", text)

    # remove and extract references
    trailing = re.search("\s+\(([^)]+)\)\s*$", cleaned)
    if not trailing:
        return cleaned, []

    # remove the matched section
    cleaned = cleaned[0:trailing.start()] + cleaned[trailing.end():]

    raw_refs = trailing.group(1)

    # fix use of comma or colon instead of a semi colon between reference types
    # have seen some accidental capitalization combined with accidental comma, thus the 'T'
    # e.g. "text of Title VII as reported in House: CR H3075-3077, Text omission from Title VII:" (hr5384-109)
    raw_refs = re.sub("[,:] ([a-zT])", r"; \1", raw_refs)
    # fix "CR:"
    raw_refs = re.sub("CR:", "CR", raw_refs)
    # fix a missing semicolon altogether between references
    # e.g. sres107-112, "consideration: CR S1877-1878 text as"
    raw_refs = re.sub("(\d+) +([a-z])", r"\1; \2", raw_refs)

    references = []
    for chunk in re.split("; ?", raw_refs):
        if ": " in chunk:
            label, target = chunk.split(": ", 1)
        else:
            label, target = None, chunk
        references.append({'type': label, 'reference': target})

    return cleaned, references
|
|
|
|
|
|
def introduced_at_for(body):
    """Return the bill's introduction date as a "YYYY-MM-DD" string.

    The date is read from the ``content`` of the ``<meta name="dc.date">``
    tag. If several such tags exist, the last one wins.

    Raises Exception when no such tag (or only empty content) is found, and
    ValueError (from strptime) when the content is not in YYYY-MM-DD form.
    """
    doc = fromstring(body)

    dc_dates = [
        meta.get('content')
        for meta in doc.cssselect('meta')
        if meta.get('name') == 'dc.date'
    ]
    introduced_at = dc_dates[-1] if dc_dates else None

    if not introduced_at:
        raise Exception("Couldn't find an introduction date in the meta tags.")

    # maybe silly to parse and re-serialize, but I'd like to make explicit the format we publish dates in
    as_date = datetime.datetime.strptime(introduced_at, "%Y-%m-%d")
    return datetime.datetime.strftime(as_date, "%Y-%m-%d")
|
|
|
|
|
|
def parse_by_request(body):
    """
    Check whether the bill was introduced by request.

    Collects the text nodes that follow the ``<b>Sponsor:</b>`` element, up
    to the next ``<br>`` sibling, and looks for the phrase "by request".

    Return boolean value. Raises IndexError when the page has no
    ``<b>Sponsor:</b>`` element.
    """
    doc = fromstring(body)

    # Extract all text nodes from the range
    # <b>Sponsor: </b> .... <br />
    sponsor_label = doc.xpath('//b[normalize-space(text()) = "Sponsor:"]')[0]

    fragments = []
    for sibling in sponsor_label.xpath('.//following-sibling::node()'):
        # the first <br> ends the sponsor line
        if isinstance(sibling, HtmlElement) and sibling.tag == 'br':
            break
        if isinstance(sibling, unicode):
            fragments.append(unicode(sibling))

    return u'by request' in u' '.join(fragments)
|
|
|
|
|
|
def cosponsors_for(body):
    """Extract the list of cosponsors from a bill page.

    Returns a list of dicts, one per cosponsor line, with the keys
    thomas_id, title, name, state, district, sponsored_at and withdrawn_at.
    Dates are "%Y-%m-%d" strings; district and withdrawn_at are None when the
    line does not carry them. Returns [] when the page explicitly says there
    are no cosponsors.

    Raises an Exception when the cosponsor section cannot be found, or when a
    line inside it does not look like a cosponsor entry.
    """

    def iso_date(us_date):
        # THOMAS prints month/day/year; we publish year-month-day
        return datetime.datetime.strptime(us_date, "%m/%d/%Y").strftime("%Y-%m-%d")

    section = re.search(r'COSPONSORS\((\d+)\).*?<p>(?:</br>)?(.*?)(?:</br>)?(?:<hr|<div id="footer">)', body, re.S)
    if not section:
        if re.search(r"COSPONSOR\(S\):</b></a><p>\*\*\*NONE\*\*\*", body):
            return []  # no cosponsors, it happens, nothing to be ashamed of
        raise Exception("Choked finding cosponsors section.")

    # fix some bad line breaks
    listing = re.sub("</br>", "<br/>", section.group(2))

    entry_re = r"<a href=[^>]+(\d{5}).*>(Rep|Sen) (.+?)</a> \[([A-Z\d\-]+)\]\s*- (\d\d?/\d\d?/\d\d\d\d)(?:\(withdrawn - (\d\d?/\d\d?/\d\d\d\d)\))?"

    cosponsors = []
    for entry in re.split("<br ?/>", listing):
        # can happen on stand-alone cosponsor pages
        if entry.strip() == "</div>":
            continue

        found = re.search(entry_re, entry, re.I)
        if not found:
            raise Exception("Choked scanning cosponsor line: %s" % entry)

        thomas_id, title, name, district, join_date, withdrawn_date = found.groups()

        # zero-pad thomas ID and apply corrections
        thomas_id = utils.thomas_corrections("%05d" % int(thomas_id))

        # "[XX-N]" carries a district number, "[XX]" is a state only
        pieces = district.split('-')
        if len(pieces) == 2:
            state, district_number = pieces
        else:
            state, district_number = district, None

        join_date = iso_date(join_date)
        if withdrawn_date:
            withdrawn_date = iso_date(withdrawn_date)

        cosponsors.append({
            'thomas_id': thomas_id,
            'title': title,
            'name': name,
            'state': state,
            'district': district_number,
            'sponsored_at': join_date,
            'withdrawn_at': withdrawn_date
        })

    return cosponsors
|
|
|
|
|
|
def subjects_for(body):
    """Return (top_term, subjects) read from the page's ``dc.subject`` meta tags.

    ``subjects`` is the sorted list of the content of every dc.subject tag.
    ``top_term`` is taken from the tags in document order: it is assigned from
    each tag until it holds a truthy value, so it ends up as the first truthy
    content (None when there are no dc.subject tags).
    """
    contents = [
        tag.get('content')
        for tag in fromstring(body).cssselect('meta')
        if tag.get('name') == 'dc.subject'
    ]

    top_term = None
    for content in contents:
        if top_term:
            break
        top_term = content

    return top_term, sorted(contents)
|
|
|
|
|
|
def related_bills_for(body, congress, bill_id):
    """Extract the related bills (and amendments) listed on a bill page.

    Args:
        body: HTML of the bill page.
        congress: congress number, appended to each related ID.
        bill_id: ID of the bill being parsed; only used in log messages.

    Returns:
        A list of dicts. Each has "type" ("bill" or "amendment"), the related
        ID under "bill_id" or "amendment_id", and a "reason" code. A
        relationship text that matches none of the known patterns is logged as
        an error and gets the reason "unknown". Returns [] when the page says
        there are no related bills.

    Raises:
        Exception: when the related bills section cannot be found, or when a
            table row does not contain a link and a relationship cell.
    """
    match = re.search(r'RELATED BILL DETAILS.*?<p>.*?<table border="0">(.*?)(?:<hr|<div id="footer">)', body, re.S)
    if not match:
        if re.search(r"RELATED BILL DETAILS:((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
            return []
        raise Exception("Couldn't find related bills section.")

    text = match.group(1).strip()

    # (pattern, code) pairs tried in order against the relationship text;
    # the first match wins. Built once, not once per table row.
    # NOTE(review): "caused-action-by" differs from the "action-caused-by"
    # code used elsewhere in this table -- confirm whether that is intended.
    reasons = (
        ("Identical bill identified by (CRS|House|Senate)", "identical"),
        ("Companion bill", "identical"),
        ("Related bill (as )?identified by (CRS|the House Clerk's office|House committee|Senate)", "related"),
        ("passed in (House|Senate) in lieu of .*", "supersedes"),
        ("Rule related to .* in (House|Senate)", "rule"),
        ("This bill has text inserted from .*", "includes"),
        ("Text from this bill was inserted in .*", "included-in"),
        ("Bill related to rule .* in House", "ruled-by"),
        ("This bill caused other related action on .*", "caused-action"),
        ("Other related action happened to this bill because of .*", "action-caused-by"),
        ("Bill that causes .* to be laid on table in House", "caused-action"),
        ("Bill laid on table by virtue of .* passage in House", "action-caused-by"),
        ("Bill that caused the virtual passage of .* in House", "caused-action"),
        ("Bill passed by virtue of .* passage in House", "caused-action-by"),
        ("Bill on wich enrollment has been corrected by virtue of .* passage in House", "caused-action"),
    )

    related_bills = []

    for line in re.split("<tr><td", text):
        # skip the empty leading piece and the table's header row
        if (line.strip() == "") or ("Bill:" in line):
            continue

        m = re.search("<a[^>]+>(.+?)</a>.*?<td>(.+?)</td>", line)
        if not m:
            raise Exception("Choked processing related bill line.")

        bill_code, reason = m.groups()

        # "H.R. 1234" -> "hr1234-<congress>"
        related_id = "%s-%s" % (bill_code.lower().replace(".", "").replace(" ", ""), congress)

        if "amdt" in related_id:
            details = {"type": "amendment", "amendment_id": related_id}
        else:
            details = {"type": "bill", "bill_id": related_id}

        for reason_re, reason_code in reasons:
            if re.search(reason_re + "$", reason, re.I):
                reason = reason_code
                break
        else:
            # no pattern matched
            logging.error("[%s] Unknown bill relation with %s: %s" % (bill_id, related_id, reason.strip()))
            reason = "unknown"

        details['reason'] = reason

        related_bills.append(details)

    return related_bills
|
|
|
|
# get the public or private law number from any enacted action
|
|
|
|
|
|
def slip_law_from(actions):
    """Return the law type, congress and number from the first "enacted" action.

    Returns None when no action has type "enacted".
    """
    enacted = next((entry for entry in actions if entry["type"] == "enacted"), None)
    if enacted is None:
        return None

    return {
        'law_type': enacted["law"],
        'congress': enacted["congress"],
        'number': enacted["number"]
    }
|
|
|
|
# given the parsed list of actions from actions_for, run each action
|
|
# through metadata extraction and figure out what current status the bill is in
|
|
|
|
|
|
def process_actions(actions, bill_id, title, introduced_date):
    """Run every action through parse_bill_action, tracking the bill's status.

    Each action dict is updated in place with the parsed attributes. When an
    action changes the bill's status, the new status is stored on that action
    under 'status' and becomes the previous status for the following actions.
    The temporary "committee_action_ref" / "subcommittee_action_ref" entries
    are replaced by "in_committee" / "in_subcommittee".

    ``introduced_date`` is accepted but not used by this function.

    Returns the list of actions that parse_bill_action did not opt out of.
    """
    current_status = "INTRODUCED"  # every bill is at least introduced
    kept = []

    for action in actions:
        parsed, changed_status = parse_bill_action(action, current_status, bill_id, title)

        # only change/reflect status change if there was one
        if changed_status:
            parsed['status'] = changed_status
            current_status = changed_status

        # an action can opt-out of inclusion altogether
        if parsed:
            action.update(parsed)
            kept.append(action)

        # swap the parent-action references for the plain committee names
        if "subcommittee_action_ref" in action:
            committee_ref = action["committee_action_ref"]
            action["in_committee"] = committee_ref.get("committee", None)
            action["in_subcommittee"] = action["subcommittee_action_ref"].get("subcommittee", None)
            del action["subcommittee_action_ref"]
            del action["committee_action_ref"]
        elif "committee_action_ref" in action:
            committee_ref = action["committee_action_ref"]
            action["in_committee"] = committee_ref.get("committee", None)
            del action["committee_action_ref"]

    return kept
|
|
|
|
# find the latest status change in a set of processed actions
|
|
|
|
|
|
def latest_status(actions):
    """Return (status, acted_at) of the last action that carries a truthy 'status'.

    Returns (None, None) when no action has one.
    """
    latest = (None, None)
    for entry in actions:
        entry_status = entry.get('status', None)
        if entry_status:
            latest = (entry_status, entry['acted_at'])
    return latest
|
|
|
|
# look at the final set of processed actions and pull out the major historical events
|
|
|
|
|
|
def _last_action(actions, wanted):
    """Return the last action for which ``wanted(action)`` is true, or None."""
    found = None
    for action in actions:
        if wanted(action):
            found = action
    return found


def history_from_actions(actions):
    """Summarize the major historical events from a list of processed actions.

    For each kind of event the LAST matching action in ``actions`` is used.

    Returns a dict with:
      - 'active' (bool), plus 'active_at' when activation_from finds an action
      - 'house_passage_result' / '_at' and 'senate_passage_result' / '_at':
        from the last non-override "vote" action in that chamber, if any
      - 'senate_cloture_result' / '_at': from the last Senate cloture
        "vote-aux" action, if any
      - 'vetoed' (bool), plus 'vetoed_at'
      - 'house_override_result' / '_at' and 'senate_override_result' / '_at':
        from the last override "vote" action in that chamber, if any
      - 'enacted' (bool), plus 'enacted_at'
      - 'awaiting_signature' (bool), plus 'awaiting_signature_since': set
        when there is a "topresident" action and the bill was neither vetoed
        nor enacted
    """

    def chamber_vote(where, override):
        # last "vote" action in the given chamber, either override votes only
        # or everything but override votes
        return _last_action(
            actions,
            lambda a: (a['type'] == 'vote') and (a['where'] == where)
            and ((a['vote_type'] == "override") == override))

    history = {}

    activation = activation_from(actions)
    if activation:
        history['active'] = True
        history['active_at'] = activation['acted_at']
    else:
        history['active'] = False

    house_vote = chamber_vote('h', False)
    if house_vote:
        history['house_passage_result'] = house_vote['result']
        history['house_passage_result_at'] = house_vote['acted_at']

    senate_vote = chamber_vote('s', False)
    if senate_vote:
        history['senate_passage_result'] = senate_vote['result']
        history['senate_passage_result_at'] = senate_vote['acted_at']

    cloture_vote = _last_action(
        actions,
        lambda a: (a['type'] == 'vote-aux') and (a['vote_type'] == 'cloture') and (a['where'] == 's'))
    if cloture_vote:
        history['senate_cloture_result'] = cloture_vote['result']
        history['senate_cloture_result_at'] = cloture_vote['acted_at']

    vetoed = _last_action(actions, lambda a: a['type'] == 'vetoed')
    if vetoed:
        history['vetoed'] = True
        history['vetoed_at'] = vetoed['acted_at']
    else:
        history['vetoed'] = False

    house_override_vote = chamber_vote('h', True)
    if house_override_vote:
        history['house_override_result'] = house_override_vote['result']
        history['house_override_result_at'] = house_override_vote['acted_at']

    senate_override_vote = chamber_vote('s', True)
    if senate_override_vote:
        history['senate_override_result'] = senate_override_vote['result']
        history['senate_override_result_at'] = senate_override_vote['acted_at']

    enacted = _last_action(actions, lambda a: a['type'] == 'enacted')
    if enacted:
        history['enacted'] = True
        # take the date from the enacted action itself, not from whichever
        # action happens to be last in the list
        history['enacted_at'] = enacted['acted_at']
    else:
        history['enacted'] = False

    topresident = _last_action(actions, lambda a: a['type'] == 'topresident')
    if topresident and (not history['vetoed']) and (not history['enacted']):
        history['awaiting_signature'] = True
        # likewise: the date the bill went to the president
        history['awaiting_signature_since'] = topresident['acted_at']
    else:
        history['awaiting_signature'] = False

    return history


# find the first action beyond the standard actions every bill gets.
# - if the bill's first action is "referral" then the first action not those
#     most common
#     e.g. hr3590-111 (active), s1-113 (inactive)
# - if the bill's first action is "action", then the next action, if one is present
#     resolutions
#     e.g. sres5-113 (active), sres4-113 (inactive)
# - if the bill's first action is anything else (e.g. "vote"), then that first action
#     bills that skip committee
#     e.g. s227-113 (active)
def activation_from(actions):
    """Return the action that marks the bill as active, or None.

    When the first action's type is "referral", "calendar" or "action", the
    result is the first LATER action whose type is neither "referral" nor
    "calendar" and whose text does not mention "Sponsor introductory remarks"
    (None when there is no such action). For any other first action, that
    first action is returned. An empty list gives None.
    """
    # there's NOT always at least one :(
    # as of 2013-06-10, hr2272-113 has no actions at all
    if len(actions) == 0:
        return None

    first = actions[0]

    if first['type'] not in ["referral", "calendar", "action"]:
        return first

    for action in actions[1:]:
        if (action['type'] != "referral") and (action['type'] != "calendar") and ("Sponsor introductory remarks" not in action['text']):
            return action
    return None
|
|
|
|
|
|
def parse_bill_action(action_dict, prev_status, bill_id, title):
    """Parse a THOMAS bill action line into action attributes and a new bill status.

    The action text (``action_dict['text']``) is run through a fixed sequence
    of regex checks. Every check that matches adds keys to the returned action
    dict and may set the new status. Later checks can overwrite what earlier
    ones set, so the order of the checks is significant.

    Args:
        action_dict: the raw action. 'text' is required; 'committee_action_ref'
            (with a 'committees' list) is consulted when inferring the chamber
            of a committee named on the line.
        prev_status: the bill's status string before this action.
        bill_id: bill ID; split with utils.split_bill_id into type, number
            and congress.
        title: the bill's title, passed on to new_status_after_vote.

    Returns:
        (action, status). ``action`` is a dict that always has a "type" key
        ("action" unless a check recognizes the line). ``status`` is the new
        status string, or None when this action does not change the status.
        Lines that start with an amendment number return (None, None).

    Raises:
        Exception: on a "Became Public/Private Law" line when prev_status is
            neither "ENACTED:SIGNED" nor a veto status and the bill is not in
            the hard-coded ten-day-rule list.
    """

    bill_type, number, congress = utils.split_bill_id(bill_id)
    if not utils.committee_names:
        utils.fetch_committee_names(congress, {})

    line = action_dict['text']

    status = None
    action = {
        "type": "action"
    }

    # If a line starts with an amendment number, this action is on the amendment and cannot
    # be parsed yet.
    m = re.search(r"^(H|S)\.Amdt\.(\d+)", line, re.I)
    if m is not None:
        # Process actions specific to amendments separately.
        return None, None

    # Otherwise, parse the action line for key actions.

    # VOTES

    # A House Vote.
    line = re.sub(", the Passed", ", Passed", line)
    # 106 h4733 and others

    m = re.search("("
                  + "|".join([
                      "On passage",
                      "Passed House",
                      "Two-thirds of the Members present having voted in the affirmative the bill is passed,?",
                      "On motion to suspend the rules and pass the (?:bill|resolution)",
                      "On agreeing to the (?:resolution|conference report)",
                      "On motion to suspend the rules and agree to the (?:resolution|conference report)",
                      "House Agreed to Senate Amendments.*?",
                      "On motion that the House (?:suspend the rules and )?(?:agree(?: with an amendment)? to|concur in) the Senate amendments?(?: to the House amendments?| to the Senate amendments?)*",
                  ])
                  + ")"
                  + "(, the objections of the President to the contrary notwithstanding.?)?"
                  + r"(, as amended| \(Amended\))?"
                  + " (Passed|Failed|Agreed to|Rejected)?"
                  + " ?(by voice vote|without objection|by (the Yeas and Nays|Yea-Nay Vote|recorded vote)"
                  + r"((:)? \(2/3 required\))?: \d+ - \d+(, \d+ Present)? [ \)]*\((Roll no\.|Record Vote No:) \d+\))",
                  line, re.I)
    if m is not None:
        motion, is_override, as_amended, pass_fail, how = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5)

        # The outcome word (group 4) is optional in the pattern, so pass_fail
        # may be None here; treat a missing outcome as an empty string so it
        # falls through to the 'fail' default instead of raising a TypeError.
        if re.search(r"Passed House|House Agreed to", motion, re.I):
            pass_fail = 'pass'
        elif re.search("(ayes|yeas) had prevailed", line, re.I):
            pass_fail = 'pass'
        elif re.search(r"Pass|Agreed", pass_fail or "", re.I):
            pass_fail = 'pass'
        else:
            pass_fail = 'fail'

        if "Two-thirds of the Members present" in motion:
            is_override = True

        if is_override:
            vote_type = "override"
        elif re.search(r"(agree (with an amendment )?to|concur in) the Senate amendment", line, re.I):
            vote_type = "pingpong"
        elif re.search("conference report", line, re.I):
            vote_type = "conference"
        elif bill_type[0] == "h":
            vote_type = "vote"
        else:
            vote_type = "vote2"

        roll = None
        m = re.search(r"\((Roll no\.|Record Vote No:) (\d+)\)", how, re.I)
        if m is not None:
            how = "roll"  # normalize the ugly how
            roll = m.group(2)

        suspension = None
        if roll and "On motion to suspend the rules" in motion:
            suspension = True

        # alternate form of as amended, e.g. hr3979-113
        if "that the House agree with an amendment" in motion:
            as_amended = True

        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = how
        action['where'] = "h"
        action['result'] = pass_fail
        if roll:
            action["roll"] = roll
        action["suspension"] = suspension

        # get the new status of the bill after this vote
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", "h", bill_type, suspension, as_amended, title, prev_status)
        if new_status:
            status = new_status

    # Passed House, not necessarily by an actual vote (think "deem")
    m = re.search(r"Passed House pursuant to", line, re.I)
    if m is not None:
        vote_type = "vote" if (bill_type[0] == "h") else "vote2"
        pass_fail = "pass"

        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = "by special rule"
        action["where"] = "h"
        action["result"] = pass_fail

        # get the new status of the bill after this vote
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", "h", bill_type, False, False, title, prev_status)

        if new_status:
            status = new_status

    # A Senate Vote
    # (There are some annoying weird cases of double spaces which are taken care of
    # at the end: the line is searched with each double space collapsed to one.)
    m = re.search("("
                  + "|".join([
                      "Passed Senate",
                      "Failed of passage in Senate",
                      "Disagreed to in Senate",
                      "Resolution agreed to in Senate",
                      "Senate (?:agreed to|concurred in) (?:the )?(?:conference report|House amendment(?: to the Senate amendments?| to the House amendments?)*)",
                      r"Cloture \S*\s?on the motion to proceed .*?not invoked in Senate",
                      r"Cloture(?: motion)? on the motion to proceed to the (?:bill|measure) invoked in Senate",
                      "Cloture invoked in Senate",
                      "Cloture on (?:the motion to proceed to )?the bill (?:not )?invoked in Senate",
                      "(?:Introduced|Received|Submitted) in the Senate, (?:read twice, |considered, |read the third time, )+and (?:passed|agreed to)",
                  ])
                  + ")"
                  + "(,?.*,?) "
                  + r"(without objection|by Unanimous Consent|by Voice Vote|(?:by )?Yea-Nay( Vote)?\. \d+\s*-\s*\d+\. Record Vote (No|Number): \d+)",
                  line.replace("  ", " "), re.I)
    if m is not None:
        motion, extra, how = m.group(1), m.group(2), m.group(3)
        roll = None

        # put disagreed check first, cause "agreed" is contained inside it
        if re.search("disagreed", motion, re.I):
            pass_fail = "fail"
        elif re.search("passed|agreed|concurred|bill invoked|measure invoked|cloture invoked", motion, re.I):
            pass_fail = "pass"
        else:
            pass_fail = "fail"

        voteaction_type = "vote"
        if re.search("over veto", extra, re.I):
            vote_type = "override"
        elif re.search("conference report", motion, re.I):
            vote_type = "conference"
        elif re.search("cloture", motion, re.I):
            vote_type = "cloture"
            voteaction_type = "vote-aux"  # because it is not a vote on passage
        elif re.search("Senate agreed to (the )?House amendment|Senate concurred in (the )?House amendment", motion, re.I):
            vote_type = "pingpong"
        elif bill_type[0] == "s":
            vote_type = "vote"
        else:
            vote_type = "vote2"

        m = re.search(r"Record Vote (No|Number): (\d+)", how, re.I)
        if m is not None:
            roll = m.group(2)
            how = "roll"

        as_amended = False
        if re.search(r"with amendments|with an amendment", extra, re.I):
            as_amended = True

        action["type"] = voteaction_type
        action["vote_type"] = vote_type
        action["how"] = how
        action["result"] = pass_fail
        action["where"] = "s"
        if roll:
            action["roll"] = roll

        # get the new status of the bill after this vote
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", "s", bill_type, False, as_amended, title, prev_status)

        if new_status:
            status = new_status

    # OLD-STYLE VOTES (93rd Congress-ish)

    m = re.search(r"Measure passed (House|Senate)(, amended(?: \(.*?\)|, with an amendment to the title)?)?(?:,? in lieu[^,]*)?(?:, roll call #(\d+) \(\d+-\d+\))?", line, re.I)
    if m is not None:
        chamber = m.group(1)[0].lower()  # 'h' or 's'
        as_amended = m.group(2)
        roll_num = m.group(3)
        # GovTrack legacy scraper missed these: if chamber == 's' and (as_amended or roll_num or "lieu" in line): return action, status
        pass_fail = "pass"
        vote_type = "vote" if bill_type[0] == chamber else "vote2"
        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = "(method not recorded)" if not roll_num else "roll"
        if roll_num:
            action["roll"] = roll_num
        action["result"] = pass_fail
        action["where"] = chamber
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", chamber, bill_type, False, as_amended, title, prev_status)
        if new_status:
            status = new_status

    m = re.search(r"(House|Senate) agreed to (?:House|Senate) amendments?( with an amendment)?( under Suspension of the Rules)?(?:, roll call #(\d+) \(\d+-\d+\))?\.", line, re.I)
    if m is not None:
        chamber = m.group(1)[0].lower()  # 'h' or 's'
        as_amended = m.group(2)
        suspension = m.group(3)
        roll_num = m.group(4)
        # GovTrack legacy scraper missed these: if (chamber == 'h' and not roll_num) or (chamber == 's' and rull_num): return action, status # REMOVE ME
        pass_fail = "pass"
        vote_type = "pingpong"
        action["type"] = "vote"
        action["vote_type"] = vote_type
        action["how"] = "(method not recorded)" if not roll_num else "roll"
        if roll_num:
            action["roll"] = roll_num
        action["result"] = pass_fail
        action["where"] = chamber
        action["suspension"] = (suspension is not None)
        new_status = new_status_after_vote(vote_type, pass_fail == "pass", chamber, bill_type, False, as_amended, title, prev_status)
        if new_status:
            status = new_status

    # PSEUDO-REPORTING (because GovTrack did this, but should be changed)

    # TODO: Make a new status for this as pre-reported.
    m = re.search(r"Placed on (the )?([\w ]+) Calendar( under ([\w ]+))?[,\.] Calendar No\. (\d+)\.|Committee Agreed to Seek Consideration Under Suspension of the Rules|Ordered to be Reported", line, re.I)
    if m is not None:
        # TODO: This makes no sense.
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"

        action["type"] = "calendar"

        # TODO: Useless. But good for GovTrack compatibility.
        if m.group(2):  # not 'Ordered to be Reported'
            action["calendar"] = m.group(2)
            action["under"] = m.group(4)
            action["number"] = m.group(5)

    # COMMITTEE ACTIONS

    # reported
    m = re.search(r"Committee on (.*)\. Reported by", line, re.I)
    if m is not None:
        action["type"] = "reported"
        action["committee"] = m.group(1)
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"
    m = re.search(r"Reported to Senate from the (.*?)( \(without written report\))?\.", line, re.I)
    if m is not None:  # 93rd Congress
        action["type"] = "reported"
        action["committee"] = m.group(1)
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"

    # hearings held by a committee
    m = re.search(r"(Committee on .*?)\. Hearings held", line, re.I)
    if m is not None:
        action["committee"] = m.group(1)
        action["type"] = "hearings"

    m = re.search(r"Committee on (.*)\. Discharged (by Unanimous Consent)?", line, re.I)
    if m is not None:
        action["committee"] = m.group(1)
        action["type"] = "discharged"
        if prev_status in ("INTRODUCED", "REFERRED"):
            status = "REPORTED"

    m = re.search("Cleared for White House|Presented to President", line, re.I)
    if m is not None:
        action["type"] = "topresident"

    m = re.search("Signed by President", line, re.I)
    if m is not None:
        action["type"] = "signed"
        status = "ENACTED:SIGNED"

    m = re.search("Pocket Vetoed by President", line, re.I)
    if m is not None:
        action["type"] = "vetoed"
        action["pocket"] = "1"
        status = "VETOED:POCKET"

    # need to put this in an else, or this regex will match the pocket veto and override it
    else:
        m = re.search("Vetoed by President", line, re.I)
        if m is not None:
            action["type"] = "vetoed"
            status = "PROV_KILL:VETO"

    m = re.search(r"^(?:Became )?(Public|Private) Law(?: No:)? ([\d\-]+)\.", line, re.I)
    if m is not None:
        action["law"] = m.group(1).lower()
        pieces = m.group(2).split("-")
        action["congress"] = pieces[0]
        action["number"] = pieces[1]
        action["type"] = "enacted"
        if prev_status == "ENACTED:SIGNED":
            pass  # this is a final administrative step
        elif prev_status == "PROV_KILL:VETO" or prev_status.startswith("VETOED:"):
            status = "ENACTED:VETO_OVERRIDE"
        elif bill_id in ("hr1589-94", "s2527-100", "hr1677-101", "hr2978-101", "hr2126-104", "s1322-104"):
            status = "ENACTED:TENDAYRULE"
        else:
            raise Exception("Missing Signed by President action? If this is a case of the 10-day rule, hard code the bill number here.")

    # Check for referral type
    m = re.search(r"Referred to (?:the )?(House|Senate)?\s?(?:Committee|Subcommittee)?", line, re.I)
    if m is not None:
        action["type"] = "referral"
        if prev_status == "INTRODUCED":
            status = "REFERRED"

    # Check for committee name, and store committee ids

    # Build a regex to find mentioned committees in the action line.
    cmte_names = []
    for name in utils.committee_names.keys():
        # excluding subcommittee names (they have pipes),
        if '|' not in name:
            # name = re.sub(r"\(.*\)", '', name).strip()
            name = re.sub(r"^(House|Senate) ", "", name)
            cmte_names.append(name)
    cmte_reg = r"(House|Senate)?\s*(?:Committee)?\s*(?:on)?\s*(?:the)?\s*({0})".format("|".join(cmte_names))

    m = re.search(cmte_reg, line, re.I)
    if m:
        committees = []

        # This could be made to look for multiple committee names.
        cmte_name_candidates = [" ".join([t for t in m.groups() if t is not None]).replace("House House", "House")]

        for cand in cmte_name_candidates:
            # many actions just say "Committee on the Judiciary", without a chamber
            # do our best to assign a chamber if we can be sure
            if ("House" not in cand) and ("Senate" not in cand):
                in_house = utils.committee_names.get("House %s" % cand, False)
                in_senate = utils.committee_names.get("Senate %s" % cand, False)
                if in_house and not in_senate:
                    cand = "House %s" % cand
                elif in_senate and not in_house:
                    cand = "Senate %s" % cand

                # if this action is a committee-level action (indented on THOMAS), look
                # at the parent action to infer the chamber
                elif len(action_dict.get("committee_action_ref", {}).get("committees", [])) > 0:
                    chamber = action_dict["committee_action_ref"]["committees"][0][0]  # H, S, or J
                    if chamber == "H":
                        cand = "House %s" % cand
                    elif chamber == "S":
                        cand = "Senate %s" % cand

                # look at other signals on the action line
                elif re.search("Received in the House|Reported to House", line):
                    cand = "House %s" % cand
                elif re.search("Received in the Senate|Reported to Senate", line):
                    cand = "Senate %s" % cand

                # if a bill is in an early stage where we're pretty sure activity is in the originating
                # chamber, fall back to the bill's originating chamber
                elif prev_status in ("INTRODUCED", "REFERRED", "REPORTED") and bill_id.startswith("h"):
                    cand = "House %s" % cand
                elif prev_status in ("INTRODUCED", "REFERRED", "REPORTED") and bill_id.startswith("s"):
                    cand = "Senate %s" % cand

            try:
                cmte_id = utils.committee_names[cand]
                committees.append(cmte_id)
            except KeyError:
                # an unknown committee name is only logged; the action is kept
                logging.warning("[%s] Committee id not found for '%s' in action <%s>" % (bill_id, cand, line))
        if committees:
            action['committees'] = committees

    # no matter what it is, sweep the action line for bill IDs of related bills
    # (a real list, so the emptiness check does not depend on what filter() returns)
    bill_ids = [b for b in utils.extract_bills(line, congress) if b != bill_id]
    if bill_ids:
        action['bill_ids'] = bill_ids

    return action, status
|
|
|
|
|
|
def new_status_after_vote(vote_type, passed, chamber, bill_type, suspension, amended, title, prev_status):
    """Return the bill's status after a vote, or None when the vote does not change it.

    Args:
        vote_type: "vote" (originating chamber), "vote2" (second chamber),
            "pingpong", "cloture", "override" or "conference"; any other
            value yields None.
        passed: truthy when the vote succeeded.
        chamber: "h" for the House; anything else is reported as the Senate.
        bill_type: e.g. "hr", "sres", "hjres", "sconres".
        suspension: truthy when the vote was under suspension of the rules.
        amended: truthy when the measure passed in amended form.
        title: bill title, used to spot constitutional amendments.
        prev_status: status before the vote; only read for passed
            conference votes.
    """
    side = "HOUSE" if chamber == "h" else "SENATE"

    def cleared_both_chambers():
        # the measure has now passed both chambers in identical form
        if bill_type in ("hjres", "sjres") and title.startswith("Proposing an amendment to the Constitution of the United States"):
            return 'PASSED:CONSTAMEND'  # joint resolution that looks like an amendment to the constitution
        if bill_type in ("hconres", "sconres"):
            return 'PASSED:CONCURRENTRES'  # end of life for concurrent resolutions
        return 'PASSED:BILL'  # now on to the president

    if vote_type == "vote":  # vote in originating chamber
        if passed:
            if bill_type in ("hres", "sres"):
                return 'PASSED:SIMPLERES'  # end of life for a simple resolution
            return 'PASS_OVER:' + side  # passed by originating chamber, now in second chamber
        if suspension:
            return 'PROV_KILL:SUSPENSIONFAILED'  # provisionally killed by failure to pass under suspension of the rules
        return 'FAIL:ORIGINATING:' + side  # outright failure

    if vote_type in ("vote2", "pingpong"):  # vote in second chamber or subsequent pingpong votes
        if passed:
            if amended:
                # measure is passed but not in identical form, so it goes back to the other chamber
                return 'PASS_BACK:' + side
            return cleared_both_chambers()
        if vote_type == "pingpong":
            # chamber failed to accept the other chamber's changes, but it can vote again
            return 'PROV_KILL:PINGPONGFAIL'
        if suspension:
            return 'PROV_KILL:SUSPENSIONFAILED'  # provisionally killed by failure to pass under suspension of the rules
        return 'FAIL:SECOND:' + side  # outright failure

    if vote_type == "cloture":
        return None if passed else "PROV_KILL:CLOTUREFAILED"

    if vote_type == "override":
        in_originating_chamber = (bill_type[0] == chamber)
        if not passed:
            stage = "ORIGINATING" if in_originating_chamber else "SECOND"
            return 'VETOED:OVERRIDE_FAIL_%s:%s' % (stage, side)
        if in_originating_chamber:
            return 'VETOED:OVERRIDE_PASS_OVER:' + side
        return None  # just wait for the enacted line

    if vote_type == "conference" and passed:
        # This is tricky to integrate into status because we have to wait for both
        # chambers to pass the conference report.
        if prev_status.startswith("CONFERENCE:PASSED:"):
            return cleared_both_chambers()
        return 'CONFERENCE:PASSED:' + side

    return None
|
|
|
|
# parse amendments out of undocumented standalone amendments page
|
|
|
|
|
|
def amendments_for_standalone(body, bill_id):
    """Parse amendments out of the standalone amendments page for a bill.

    Every amendment link found in ``body`` yields a dict with 'chamber',
    'amendment_type', 'number' and 'amendment_id' (type + number + "-" +
    the congress taken from ``bill_id``).

    Raises an Exception when no amendment link is found and the page does not
    carry the explicit "***NONE***" marker in its amendments section.
    """
    _bill_type, _bill_number, congress = utils.split_bill_id(bill_id)

    link_re = r'<a href="/cgi-bin/bdquery/z\?d\d+:(SU|SP|HZ)\d+:">(S|H)\.(?:UP\.)?AMDT\.(\d+)</a>'

    amendments = []
    for code, chamber_letter, amdt_number in re.findall(link_re, body, re.I):
        chamber = chamber_letter.lower()

        # there are "senate unprinted amendments" for the 97th and 98th Congresses, with their own
        # numbering scheme; those get the type 'supamdt' instead of 'samdt'
        amendment_type = "supamdt" if code == "SU" else chamber + "amdt"

        amendments.append({
            'chamber': chamber,
            'amendment_type': amendment_type,
            'number': amdt_number,
            'amendment_id': "%s%s-%s" % (amendment_type, amdt_number, congress)
        })

    # nothing found: that is only acceptable when the page says so explicitly
    if not amendments and not re.search(r"AMENDMENT\(S\):((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
        raise Exception("Couldn't find amendments section.")

    return amendments
|
|
|
|
|
|
def amendments_for(body, bill_id):
    """Parse the numbered amendment list out of a bill's main page.

    Returns [] when the page has no "AMENDMENT(S):" header at all. Otherwise
    every numbered amendment entry yields a dict with 'chamber',
    'amendment_type', 'number' and 'amendment_id' (type + number + "-" + the
    congress taken from ``bill_id``).

    Raises an Exception when the header is present but neither an amendment
    entry nor the explicit "***NONE***" marker is found.
    """
    _bill_type, _bill_number, congress = utils.split_bill_id(bill_id)

    # it is possible in older sessions for the amendments section to not appear at all.
    # if this method is being run, we know the page is not truncated, so if the header
    # is not at all present, assume the page is missing amendments because there are none.
    if re.search(r"AMENDMENT\(S\):", body) is None:
        return []

    entry_re = r'<b>\s*\d+\.</b>\s*<a href="/cgi-bin/bdquery/z\?d\d+:(SU|SP|HZ)\d+:">(S|H)\.(?:UP\.)?AMDT\.(\d+)\s*</a> to '

    amendments = []
    for code, chamber_letter, amdt_number in re.findall(entry_re, body, re.I):
        chamber = chamber_letter.lower()

        # there are "senate unprinted amendments" for the 97th and 98th Congresses, with their own
        # numbering scheme; those get the type 'supamdt' instead of 'samdt'
        amendment_type = "supamdt" if code == "SU" else chamber + "amdt"

        amendments.append({
            'chamber': chamber,
            'amendment_type': amendment_type,
            'number': amdt_number,
            'amendment_id': "%s%s-%s" % (amendment_type, amdt_number, congress)
        })

    # nothing found: that is only acceptable when the page says so explicitly
    if not amendments and not re.search(r"AMENDMENT\(S\):((?:(?!\<hr).)+)\*\*\*NONE\*\*\*", body, re.S):
        raise Exception("Couldn't find amendments section.")

    return amendments
|
|
|
|
|
|
# are there at least 150 amendments listed in this body? a quick tally
|
|
# not the end of the world if it's wrong once in a great while, it just sparks
|
|
# a less efficient way of gathering this bill's data
|
|
def too_many_amendments(body):
    """Return True when at least 150 amendment entries are listed in body.

    body -- HTML text of a bill page.

    This is a quick tally only: it counts numbered S.AMDT / H.AMDT links and
    compares the count against 150.
    """
    # example of one entry:
    # "<b>150.</b> <a href="/cgi-bin/bdquery/z?d111:SP02937:">S.AMDT.2937 </a> to <a href="/cgi-bin/bdquery/z?d111:HR03590:">H.R.3590</a>"
    #
    # raw string so the regex escapes (\s, \d, \?, \.) are not treated as
    # (invalid) Python string escapes; groups are non-capturing because only
    # the number of matches matters.
    entry_pattern = (
        r'<b>\s*\d+\.</b>\s*<a href="/cgi-bin/bdquery/z\?d\d+:(?:SP|HZ)\d+:">'
        r'(?:S|H)\.AMDT\.\d+\s*</a> to '
    )
    listed = re.findall(entry_pattern, body, re.I)
    return len(listed) >= 150
|
|
|
|
# bills reserved for the Speaker or Minority Leader are not actual legislation,
|
|
# just markers that the number will not be used for ordinary members' bills
|
|
|
|
|
|
def reserved_bill(body):
    """Return True when body is the page of a reserved (placeholder) bill.

    body -- HTML text of a bill page.

    A page counts as reserved when the text "Reserved for the Speaker" or
    "Reserved for the Minority Leader" (any letter case) appears after
    "OFFICIAL TITLE AS INTRODUCED:" and before the next "<hr".
    """
    # raw string keeps the regex text intact; the (?!<hr) lookahead stops the
    # scan at the end of the title section
    pattern = (
        r"OFFICIAL TITLE AS INTRODUCED:((?:(?!<hr).)+)"
        r"Reserved for the (Speaker|Minority Leader)"
    )
    return re.search(pattern, body, re.S | re.I) is not None
|
|
|
|
# fetch GPO URLs for PDF and HTML formats
|
|
|
|
|
|
def get_GPO_url_for_bill(bill_id, options):
    """Find the GPO (FDsys) PDF and HTML URLs for a bill's text via THOMAS.

    bill_id -- bill identifier, split with utils.split_bill_id.
    options -- passed through unchanged to utils.download.

    Downloads (and caches) the bill's THOMAS landing page, follows its
    "Text of Legislation" link, and scans that page for gpo.gov PDF links.

    Returns a dict with "pdf" and "html" keys, or False when a page could not
    be downloaded or no link could be found.
    """
    # we need the URL of the pdf on GPO
    # there may be a way to calculate it, but in the meantime we'll get it the old-fashioned way:
    # first get the THOMAS landing page. This may be duplicating work done elsewhere.
    # Maybe TODO -- reconcile with fdsys script (ideally without downloading large sitemaps for a single bill)
    bill_type, number, congress = utils.split_bill_id(bill_id)
    thomas_type = utils.thomas_types[bill_type][0]
    congress = int(congress)
    landing_url = "http://thomas.loc.gov/cgi-bin/bdquery/D?d%03d:%s:./list/bss/d%03d%s.lst:" % (congress, number, congress, thomas_type)
    landing_page = utils.download(
        landing_url,
        bill_cache_for(bill_id, "landing_page.html"),
        options)
    if not landing_page:
        logging.info("Could not download THOMAS landing page")
        return False

    # the capture includes the leading "?" of the query string; restricting it to
    # [^"]* keeps the match inside a single href attribute
    text_link = re.search(r'href="/cgi-bin/query/z(\?[^"]*)">Text of Legislation', landing_page, re.I)
    if text_link is None:
        logging.info("No Text of Legislation link discovered")
        return False

    text_landing_page_url = "http://thomas.loc.gov/cgi-bin/query/z" + text_link.group(1)
    text_landing_page = utils.download(
        text_landing_page_url,
        bill_cache_for(bill_id, "text_landing_page.html"),
        options)
    if not text_landing_page:
        logging.info("Could not download THOMAS text landing page")
        return False

    gpo_urls = re.findall(r'http://www\.gpo\.gov/fdsys/(.*?)\.pdf', text_landing_page, re.I | re.S)
    if not gpo_urls:
        logging.info("No GPO link discovered")
        return False

    # get last url on page, in cases where there are several versions of bill
    # THOMAS advises us to use the last one (e.g. http://thomas.loc.gov/cgi-bin/query/z?c113:S.CON.RES.1: )
    latest = gpo_urls[-1]
    return {
        "pdf": "http://www.gpo.gov/fdsys/" + latest + ".pdf",
        # the HTML rendition is derived by swapping "pdf" for "html" in the path
        "html": "http://www.gpo.gov/fdsys/" + latest.replace("pdf", "html") + ".htm"
    }
|
|
|
|
|
|
# directory helpers
|
|
|
|
def output_for_bill(bill_id, format, is_data_dot=True):
    """Build the output path for a bill's file under utils.data_dir().

    bill_id     -- bill identifier, split with utils.split_bill_id.
    format      -- file extension when is_data_dot is true (the file is then
                   named "data.<format>"), otherwise the complete file name.
    is_data_dot -- whether to prefix the file name with "data.".

    Returns "<data_dir>/<congress>/bills/<type>/<type><number>/<file name>".
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)
    filename = ("data.%s" % format) if is_data_dot else format
    template = "%s/%s/bills/%s/%s%s/%s"
    return template % (utils.data_dir(), congress, bill_type, bill_type, number, filename)
|
|
|
|
# defaults to "All Information" page for a bill
|
|
|
|
|
|
def bill_url_for(bill_id, page="L"):
    """Build the THOMAS bdquery URL for a bill.

    bill_id -- bill identifier, split with utils.split_bill_id.
    page    -- page code placed after "@@@" in the URL (default "L").

    The congress number is zero-padded to three digits and the bill type is
    translated through utils.thomas_types.
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)
    thomas_prefix = utils.thomas_types[bill_type][0]
    template = "http://thomas.loc.gov/cgi-bin/bdquery/z?d%03d:%s%s:@@@%s&summ2=m&"
    return template % (int(congress), thomas_prefix, number, page)
|
|
|
|
|
|
def bill_cache_for(bill_id, file):
    """Build the relative cache path for one of a bill's downloaded files.

    bill_id -- bill identifier, split with utils.split_bill_id.
    file    -- name of the file inside the bill's cache directory.

    Returns "<congress>/bills/<type>/<type><number>/<file>".
    """
    bill_type, number, congress = utils.split_bill_id(bill_id)
    bill_dir = "%s%s" % (bill_type, number)
    return "%s/bills/%s/%s/%s" % (congress, bill_type, bill_dir, file)
|