mirror of
https://github.com/unitedstates/congress.git
synced 2026-03-25 14:00:05 -04:00
Few more prefab patterns for amendment_code.py
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
import json, re
|
||||
import utils
|
||||
import logging
|
||||
|
||||
|
||||
'''
|
||||
the goal here is to convert the natural language of the amendment into code
|
||||
@@ -12,8 +14,10 @@ tremendous NLP chops.
|
||||
|
||||
#certain amendment structures are so common that it's wisest -- that is, easiest -- to define them explicitly
|
||||
prefabs = [
|
||||
("(On page (\d+), line (\d+), ([a-z]+) the amount by \$([\d,]+)\.)", ["verbatim", "page", "line", "action", "content"]),
|
||||
("(At the (end) of (.*?), ([a-z]+) the following:(.*))", ["verbatim", "direction", "location", "action", "content"])
|
||||
("(On page (\d+), lines? (\d+), ([a-z]+) the amount by \$([\d,]+)\.)", ["verbatim", "page", "line", "action", "content"]),
|
||||
("(On page (\d+), (.*?)lines? (\d+)(.*?)(?:\.|;))", ["verbatim", "page", "action", "line", "content"]),
|
||||
("(At the (end) of (.*?), ([a-z]+) the following:?(.*))", ["verbatim", "direction", "location", "action", "content"]),
|
||||
("(At the (appropriate place),? ([a-z]+) the following:?(.*))", ["verbatim", "location", "action", "content"])
|
||||
]
|
||||
|
||||
#a list of verbs that will translate to functions
|
||||
@@ -23,6 +27,12 @@ actions = ["strike", "insert", "delete", "increase", "decrease"]
|
||||
directions = ["after", "at the end"]
|
||||
|
||||
def parse_amendment_text(amendment, bill):
|
||||
if amendment["info"]["amends"]["document_type"] != "bill":
|
||||
logging.info("amendment %s does not amend the bill itself." % amendment["info"]["amendment_id"])
|
||||
return amendment
|
||||
|
||||
amendment["text"] = re.sub("\s+", " ", amendment["text"], re.S)
|
||||
|
||||
# parse the intention of the amendment
|
||||
amendment["commands"] = []
|
||||
commands = []
|
||||
@@ -31,14 +41,18 @@ def parse_amendment_text(amendment, bill):
|
||||
#check for prefab patterns
|
||||
for prefab in prefabs:
|
||||
temp = re.findall(prefab[0], amendment["text"], re.I | re.S)
|
||||
if temp:
|
||||
if temp:
|
||||
for match in temp:
|
||||
command = dict([(x[1], match[x[0]]) for x in enumerate(prefab[1])])
|
||||
commands.append(command)
|
||||
#remove the matched command to prevent double counting
|
||||
amendment["text"] = amendment["text"].replace(command["verbatim"], "")
|
||||
|
||||
|
||||
# for amendments that reference a place in the legislation instead of a line number, resolve it to a location
|
||||
for command in commands:
|
||||
if len(command["verbatim"]) > 1000:
|
||||
command["verbatim"] = "<em>Text of amendment too long (" + str(len(command["verbatim"].split(" "))) + " words).</em>"
|
||||
if "line" in command and "page" in command:
|
||||
amendment["commands"].append(command)
|
||||
elif "location" in command:
|
||||
|
||||
@@ -50,10 +50,11 @@ def fetch_amendment_text(body, amdt, options):
|
||||
#attempt to retrieve sponsor info
|
||||
try:
|
||||
info = json.load(open("data/%s/amendments/samdt/samdt%s/data.json" % (amdt['congress'], amdt['number']), 'r'))
|
||||
data["sponsor"] = info["sponsor"]
|
||||
data["sponsor"]["name"] = members[info["sponsor"]["thomas_id"]]["name"]["official_full"]
|
||||
data["sponsor"]["party"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["party"][0]
|
||||
data["sponsor"]["state"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["state"]
|
||||
#data["sponsor"] = info["sponsor"]
|
||||
info["sponsor"]["name"] = members[info["sponsor"]["thomas_id"]]["name"]["official_full"]
|
||||
info["sponsor"]["party"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["party"][0]
|
||||
#data["sponsor"]["state"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["state"]
|
||||
data["info"] = info
|
||||
except Exception, e:
|
||||
print e
|
||||
|
||||
|
||||
@@ -115,7 +115,12 @@ def get_text_from_pdf(filename):
|
||||
temp = re.search("Subtitle [A-Z]", lines[line])
|
||||
if temp:
|
||||
roadmap[temp.group(0).upper() + " OF " + current_title] = [page, line]
|
||||
|
||||
|
||||
#add a final catch-all page after the last one to hold amendments targeting the "appropriate place"
|
||||
last_page = sorted(text.keys())[-1]
|
||||
text[int(last_page) + 1] = { '0' : "<em>Amendments with unspecified locations.</em>" }
|
||||
roadmap["APPROPRIATE PLACE"] = [last_page + 1, 0]
|
||||
|
||||
return {
|
||||
"text": text,
|
||||
"roadmap": roadmap
|
||||
|
||||
@@ -43,11 +43,12 @@ def run(options):
|
||||
print "Couldn't find parsed text for amendment %d" % a
|
||||
continue
|
||||
amendment = parse_amendment_text(amendment, data)
|
||||
if len(amendment["commands"]):
|
||||
if "commands" in amendment and len(amendment["commands"]):
|
||||
logging.info("Found %d commands in amendment %d" % (len(amendment["commands"]), a))
|
||||
all_amendments[str(a)] = amendment
|
||||
else:
|
||||
logging.info("Didn't find any commands in amendment %d" % a)
|
||||
logging.info(amendment["text"])
|
||||
|
||||
write(json.dumps(all_amendments, indent=2), "data/%s/amendments/samdt/combined/%i_%i.json" % (congress, start, end))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user