From 22045725ce945dda1928dffbdff9a48fc0500a27 Mon Sep 17 00:00:00 2001
From: wilson428 <christopher.e.wilson@gmail.com>
Date: Mon, 25 Mar 2013 23:41:27 -0400
Subject: [PATCH] Few more prefab patterns for amendment_code.py

---
 tasks/amendment_code.py | 20 +++++++++++++++++---
 tasks/amendment_text.py |  9 +++++----
 tasks/extract.py        |  7 ++++++-
 tasks/parse.py          |  3 ++-
 4 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/tasks/amendment_code.py b/tasks/amendment_code.py
index 18e1445..e05025b 100644
--- a/tasks/amendment_code.py
+++ b/tasks/amendment_code.py
@@ -1,5 +1,7 @@
 import json, re
 import utils
+import logging
+
 
 '''
 the goal here is to convert the natural language of the amendment into code
@@ -12,8 +14,10 @@ tremendous NLP chops.
 
 #certain amendment structures are so common that it's wisest -- that is, easiest -- to define explicitly
 prefabs = [
-    ("(On page (\d+), line (\d+), ([a-z]+) the amount by \$([\d,]+)\.)", ["verbatim", "page", "line", "action", "content"]),
-    ("(At the (end) of (.*?), ([a-z]+) the following:(.*))", ["verbatim", "direction", "location", "action", "content"])
+    (r"(On page (\d+), lines? (\d+), ([a-z]+) the amount by \$([\d,]+)\.)", ["verbatim", "page", "line", "action", "content"]), #verb applied to a dollar amount at a page/line
+    (r"(On page (\d+), (.*?)lines? (\d+)(.*?)(?:\.|;))", ["verbatim", "page", "action", "line", "content"]), #looser page/line form; stops at the first "." or ";"
+    (r"(At the (end) of (.*?), ([a-z]+) the following:?(.*))", ["verbatim", "direction", "location", "action", "content"]), #verb applied at the end of a named location
+    (r"(At the (appropriate place),? ([a-z]+) the following:?(.*))", ["verbatim", "location", "action", "content"]) #no explicit location given
 ]
 
 #a list of verbs that will translate to functions
@@ -23,6 +27,12 @@ actions = ["strike", "insert", "delete", "increase", "decrease"]
 directions = ["after", "at the end"]
 
 def parse_amendment_text(amendment, bill):
+  if amendment["info"]["amends"]["document_type"] != "bill":
+    logging.info("amendment %s does not amend the bill itself." % amendment["info"]["amendment_id"])
+    return amendment
+
+  #collapse every whitespace run to one space; re.sub's 4th positional argument is count, so no flag may be passed there
+  amendment["text"] = re.sub(r"\s+", " ", amendment["text"])
   # parse the intention of the amendment
   amendment["commands"] = []
   commands = []
@@ -31,14 +41,18 @@ def parse_amendment_text(amendment, bill):
   #check for prefab patterns
   for prefab in prefabs:
       temp = re.findall(prefab[0], amendment["text"], re.I | re.S)
-      if temp:          
+      if temp:
           for match in temp:
               command = dict([(x[1], match[x[0]]) for x in enumerate(prefab[1])])
               commands.append(command)
+              #strip the matched text from the amendment so prefabs tried later in this loop cannot match it again
+              amendment["text"] = amendment["text"].replace(command["verbatim"], "")
 
 
   # for amendments that reference a place in the legislation instead of line number, resolve to location
   for command in commands:
+      if len(command["verbatim"]) > 1000: #over 1000 characters: store a word-count notice instead of the full match
+        command["verbatim"] = "<em>Text of amendment too long (" + str(len(command["verbatim"].split(" "))) + " words).</em>"
       if "line" in command and "page" in command:
           amendment["commands"].append(command)
       elif "location" in command:
diff --git a/tasks/amendment_text.py b/tasks/amendment_text.py
index 7c50d50..3dbc334 100644
--- a/tasks/amendment_text.py
+++ b/tasks/amendment_text.py
@@ -50,10 +50,11 @@ def fetch_amendment_text(body, amdt, options):
   #attempt to retrieve sponsor info
   try:
     info = json.load(open("data/%s/amendments/samdt/samdt%s/data.json" % (amdt['congress'], amdt['number']), 'r'))
-    data["sponsor"] = info["sponsor"]
-    data["sponsor"]["name"] = members[info["sponsor"]["thomas_id"]]["name"]["official_full"]
-    data["sponsor"]["party"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["party"][0]
-    data["sponsor"]["state"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["state"]
+    #sponsor name and party are written onto info["sponsor"] itself; the whole info dict is attached to data below
+    info["sponsor"]["name"] = members[info["sponsor"]["thomas_id"]]["name"]["official_full"]
+    info["sponsor"]["party"] = members[info["sponsor"]["thomas_id"]]["terms"][-1]["party"][0]
+    #NOTE(review): sponsor state (members[...]["terms"][-1]["state"]) is no longer copied here -- confirm that is intended
+    data["info"] = info
   except Exception, e:
     print e
 
diff --git a/tasks/extract.py b/tasks/extract.py
index 5a61e18..d8dfcb7 100644
--- a/tasks/extract.py
+++ b/tasks/extract.py
@@ -115,7 +115,12 @@ def get_text_from_pdf(filename):
             temp = re.search("Subtitle [A-Z]", lines[line])
             if temp:
                 roadmap[temp.group(0).upper() + " OF " + current_title] = [page, line]
-        
+
+    #add a catch-all page after the last real one to hold "appropriate place" amendments
+    #page keys are compared numerically: string keys would mis-sort ("10" < "9") and could not take the + 1
+    overflow_page = max(int(page_key) for page_key in text) + 1
+    text[overflow_page] = { '0' : "<em>Amendments with unspecified locations.</em>" }
+    roadmap["APPROPRIATE PLACE"] = [overflow_page, 0]
     return {
         "text": text,
         "roadmap": roadmap
diff --git a/tasks/parse.py b/tasks/parse.py
index c75c2d6..18925d8 100644
--- a/tasks/parse.py
+++ b/tasks/parse.py
@@ -43,11 +43,12 @@ def run(options):
         print "Couldn't find parsed text for amendment %d" % a
         continue
     amendment = parse_amendment_text(amendment, data)
-    if len(amendment["commands"]):
+    if "commands" in amendment and len(amendment["commands"]):
         logging.info("Found %d commands in amendment %d" % (len(amendment["commands"]), a))
         all_amendments[str(a)] = amendment
     else:
         logging.info("Didn't find any commands in amendment %d" % a)
+        logging.info(amendment["text"])
                      
   write(json.dumps(all_amendments, indent=2), "data/%s/amendments/samdt/combined/%i_%i.json" % (congress, start, end))