Implemented parsing of the "by request" feature

2025-12-25 01:00:04 -05:00 · 2014-07-17 01:06:24 +07:00
parent 46156ff4d5
commit 0191deb2c7
2 changed files with 34 additions and 5 deletions
--- a/tasks/bill_info.py
+++ b/tasks/bill_info.py
@@ -5,7 +5,7 @@ import json
 from lxml import etree
 import time
 import datetime
-from lxml.html import fromstring
+from lxml.html import fromstring, HtmlElement

 # can be run on its own, just require a bill_id

@@ -103,6 +103,7 @@ def parse_bill(bill_id, body, options):

    # parse everything out of the All Information page
    introduced_at = introduced_at_for(body)
+    by_request = parse_by_request(body)
    sponsor = sponsor_for(body)
    cosponsors = cosponsors_for(body)
    summary = summary_for(body)
@@ -113,7 +114,7 @@ def parse_bill(bill_id, body, options):
    committees = committees_for(body, bill_id)
    amendments = amendments_for(body, bill_id)

-    return process_bill(bill_id, options, introduced_at, sponsor, cosponsors,
+    return process_bill(bill_id, options, introduced_at, by_request, sponsor, cosponsors,
                        summary, titles, actions, related_bills, subjects, committees, amendments)


@@ -123,6 +124,7 @@ def parse_bill_split(bill_id, body, options):

    # get some info out of the All Info page, since we already have it
    introduced_at = introduced_at_for(body)
+    by_request = parse_by_request(body)
    sponsor = sponsor_for(body)
    subjects = subjects_for(body)

@@ -179,13 +181,13 @@ def parse_bill_split(bill_id, body, options):
    committees_body = utils.unescape(committees_body)
    committees = committees_for(committees_body, bill_id)

-    return process_bill(bill_id, options, introduced_at, sponsor, cosponsors,
+    return process_bill(bill_id, options, introduced_at, by_request, sponsor, cosponsors,
                        summary, titles, actions, related_bills, subjects, committees, amendments)


 # take the initial parsed content, extract more information, assemble output data
 def process_bill(bill_id, options,
-                 introduced_at, sponsor, cosponsors,
+                 introduced_at, by_request, sponsor, cosponsors,
                 summary, titles, actions, related_bills, subjects, committees, amendments):

    bill_type, number, congress = utils.split_bill_id(bill_id)
@@ -216,6 +218,7 @@ def process_bill(bill_id, options,
        'congress': congress,

        'introduced_at': introduced_at,
+        'by_request': by_request,
        'sponsor': sponsor,
        'cosponsors': cosponsors,

@@ -823,6 +826,28 @@ def introduced_at_for(body):
    return datetime.datetime.strftime(parsed, "%Y-%m-%d")


+def parse_by_request(body):
+    """
+    Check whether the bill was introduced by request.
+
+    Return True when the sponsor line's text contains "by request", otherwise False.
+    """
+    doc = fromstring(body)
+
+    # Collect the sibling text nodes that follow <b>Sponsor: </b>,
+    # stopping at the first <br /> element.
+    b_node = doc.xpath('//b[normalize-space(text()) = "Sponsor:"]')[0]
+    text_items = []
+    for node in b_node.xpath('.//following-sibling::node()'):
+        if isinstance(node, HtmlElement):
+            if node.tag == 'br':
+                break
+        if isinstance(node, unicode):
+            text_items.append(unicode(node))
+    text = u' '.join(text_items)
+    return u'by request' in text
+
+
 def cosponsors_for(body):
    match = re.search("COSPONSORS\((\d+)\).*?<p>(?:</br>)?(.*?)(?:</br>)?(?:<hr|<div id=\"footer\">)", body, re.S)
    if not match: