From 22045725ce945dda1928dffbdff9a48fc0500a27 Mon Sep 17 00:00:00 2001 From: wilson428 Date: Mon, 25 Mar 2013 23:41:27 -0400 Subject: [PATCH] Few more prefab patterns for amendment_code.py --- tasks/amendment_code.py | 20 +++++++++++++++++--- tasks/amendment_text.py | 9 +++++---- tasks/extract.py | 7 ++++++- tasks/parse.py | 3 ++- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/tasks/amendment_code.py b/tasks/amendment_code.py index 18e1445..e05025b 100644 --- a/tasks/amendment_code.py +++ b/tasks/amendment_code.py @@ -1,5 +1,7 @@ import json, re import utils +import logging + ''' the goal here is to convert the natural language of the amendment into code @@ -12,8 +14,10 @@ tremendous NLP chops. #certain amendment structures are so common that it's wisest -- that is, easiest -- to define explicitly prefabs = [ - ("(On page (\d+), line (\d+), ([a-z]+) the amount by \$([\d,]+)\.)", ["verbatim", "page", "line", "action", "content"]), - ("(At the (end) of (.*?), ([a-z]+) the following:(.*))", ["verbatim", "direction", "location", "action", "content"]) + ("(On page (\d+), lines? (\d+), ([a-z]+) the amount by \$([\d,]+)\.)", ["verbatim", "page", "line", "action", "content"]), + ("(On page (\d+), (.*?)lines? (\d+)(.*?)(?:\.|;))", ["verbatim", "page", "action", "line", "content"]), + ("(At the (end) of (.*?), ([a-z]+) the following:?(.*))", ["verbatim", "direction", "location", "action", "content"]), + ("(At the (appropriate place),? ([a-z]+) the following:?(.*))", ["verbatim", "location", "action", "content"]) ] #a list of verbs that will translate to functions @@ -23,6 +27,12 @@ actions = ["strike", "insert", "delete", "increase", "decrease"] directions = ["after", "at the end"] def parse_amendment_text(amendment, bill): + if amendment["info"]["amends"]["document_type"] != "bill": + logging.info("amendment %s does not amend the bill itself." % amendment["info"]["amendment_id"]) + return amendment + + amendment["text"] = re.sub("\s+", " ", amendment["text"], re.S) + # parse the intention of the amendment amendment["commands"] = [] commands = [] @@ -31,14 +41,18 @@ def parse_amendment_text(amendment, bill): #check for prefab patterns for prefab in prefabs: temp = re.findall(prefab[0], amendment["text"], re.I | re.S) - if temp: + if temp: for match in temp: command = dict([(x[1], match[x[0]]) for x in enumerate(prefab[1])]) commands.append(command) + #remove the matched command to prevent double counting + amendment["text"] = amendment["text"].replace(command["verbatim"], "") # for amendments that reference a place in the legislation instead of line number, resolve to location for command in commands: + if len(command["verbatim"]) > 1000: + command["verbatim"] = "Text of amendment too long (" + str(len(command["verbatim"].split(" "))) + " words)." if "line" in command and "page" in command: amendment["commands"].append(command) elif "location" in command: diff --git a/tasks/amendment_text.py b/tasks/amendment_text.py index 7c50d50..3dbc334 100644 --- a/tasks/amendment_text.py +++ b/tasks/amendment_text.py @@ -50,10 +50,11 @@ def fetch_amendment_text(body, amdt, options): #attempt to retrieve sponsor info try: info = json.load(open("data/%s/amendments/samdt/samdt%s/data.json" % (amdt['congress'], amdt['number']), 'r')) - data["sponsor"] = info["sponsor"] - data["sponsor"]["name"] = members[info["sponsor"]["thomas_id"]]["name"]["official_full"] - data["sponsor"]["party"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["party"][0] - data["sponsor"]["state"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["state"] + #data["sponsor"] = info["sponsor"] + info["sponsor"]["name"] = members[info["sponsor"]["thomas_id"]]["name"]["official_full"] + info["sponsor"]["party"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["party"][0] + #data["sponsor"]["state"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["state"] + data["info"] = info except Exception, e: print e diff --git a/tasks/extract.py b/tasks/extract.py index 5a61e18..d8dfcb7 100644 --- a/tasks/extract.py +++ b/tasks/extract.py @@ -115,7 +115,12 @@ def get_text_from_pdf(filename): temp = re.search("Subtitle [A-Z]", lines[line]) if temp: roadmap[temp.group(0).upper() + " OF " + current_title] = [page, line] - + + #add last repository for "appropriate place" + last_page = sorted(text.keys())[-1] + text[int(last_page) + 1] = { '0' : "Amendments with unspecified locations." } + roadmap["APPROPRIATE PLACE"] = [last_page + 1, 0] + return { "text": text, "roadmap": roadmap diff --git a/tasks/parse.py b/tasks/parse.py index c75c2d6..18925d8 100644 --- a/tasks/parse.py +++ b/tasks/parse.py @@ -43,11 +43,12 @@ def run(options): print "Couldn't find parsed text for amendment %d" % a continue amendment = parse_amendment_text(amendment, data) - if len(amendment["commands"]): + if "commands" in amendment and len(amendment["commands"]): logging.info("Found %d commands in amendment %d" % (len(amendment["commands"]), a)) all_amendments[str(a)] = amendment else: logging.info("Didn't find any commands in amendment %d" % a) + logging.info(amendment["text"]) write(json.dumps(all_amendments, indent=2), "data/%s/amendments/samdt/combined/%i_%i.json" % (congress, start, end))