From f5b510a5516bfcf1dd0ffe1b3cb70a4705ace36c Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 21 Dec 2022 22:05:47 +0000 Subject: [PATCH] GPO BILLSTATUS XML schema 3.0.0 changes It's hard to know if this is complete. Not all of the apparent changes were noticed. The scraper is compatible with older files too since past Congresses' data are probably not yet available with the new schema. See https://github.com/usgpo/bill-status/issues/200. --- congress/tasks/bill_info.py | 15 ++++++---- congress/tasks/bills.py | 57 ++++++++++++++++++++++++++++--------- setup.py | 1 + 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/congress/tasks/bill_info.py b/congress/tasks/bill_info.py index 3c61655..6515b8c 100644 --- a/congress/tasks/bill_info.py +++ b/congress/tasks/bill_info.py @@ -58,6 +58,8 @@ def create_govtrack_xml(bill, options): n.set("type", title['type']) if title['as']: n.set("as", title['as']) + if title['textVersionCode']: + n.set("textVersionCode", title['textVersionCode']) if title['is_for_portion']: n.set("partial", "1") @@ -77,7 +79,7 @@ def create_govtrack_xml(bill, options): n = make_node(cosponsors, "cosponsor", None, **get_legislator_id_attr(cosp)) if cosp["sponsored_at"]: n.set("joined", cosp["sponsored_at"]) - if cosp["withdrawn_at"]: + if cosp.get("withdrawn_at"): # no longer present in GPO BILLSTATUS XML schema 3.0.0 n.set("withdrawn", cosp["withdrawn_at"]) actions = make_node(root, "actions", None) @@ -185,13 +187,13 @@ def summary_for(summaries): return None # Take the most recent summary, by looking at the lexicographically last updateDate. - summaries = summaries['item'] summary = sorted(summaries, key = lambda s: s['updateDate'])[-1] # Build dict. return { "date": summary['updateDate'], - "as": summary['name'], + "as": summary['actionDesc'], + "asOf": summary['actionDate'], "text": strip_tags(summary['text']), } @@ -303,6 +305,7 @@ def titles_for(title_list): 'title': item['title'], 'is_for_portion': is_for_portion, 'as': state, + 'textVersionCode': item.get('TextVersionCode'), 'type': title_type } @@ -394,7 +397,7 @@ def actions_for(action_list, bill_id, title): keep = True if closure['prev']: - if item['sourceSystem']['code'] == "9": + if item['sourceSystem'].get('code') == "9": # Date must match previous action.. # If both this and previous have a time, the times must match. # The text must approximately match. Sometimes the LOC text has a prefix @@ -455,7 +458,7 @@ def action_for(item): # text & references # (amendment actions don't always have text?) - text = item['text'] if item['text'] is not None else '' + text = item['text'] if item.get('text') is not None else '' # strip out links text = re.sub(r"", "", text) @@ -533,7 +536,7 @@ def cosponsors_for(cosponsors_list): del cosponsor_dict["type"] # always 'person' cosponsor_dict.update({ 'sponsored_at': item['sponsorshipDate'], - 'withdrawn_at': item['sponsorshipWithdrawnDate'], + # 'withdrawn_at': item['sponsorshipWithdrawnDate'], # no longer present in GPO BILLSTATUS XML schema 3.0.0? 'original_cosponsor': item['isOriginalCosponsor'] == 'True' }) return cosponsor_dict diff --git a/congress/tasks/bills.py b/congress/tasks/bills.py index 92aec56..2457176 100644 --- a/congress/tasks/bills.py +++ b/congress/tasks/bills.py @@ -178,8 +178,17 @@ def form_bill_json_dict(xml_as_dict): @rtype: dict """ + from packaging.version import parse as parse_version + try: + schema_version = parse_version(xml_as_dict['billStatus']['version']) + except KeyError: # no 'version' attribute is present before 2022-12-20 + schema_version = parse_version('1.0.0') + bill_dict = xml_as_dict['billStatus']['bill'] - bill_id = build_bill_id(bill_dict['billType'].lower(), bill_dict['billNumber'], bill_dict['congress']) + if schema_version >= parse_version('3.0.0'): + bill_id = build_bill_id(bill_dict['type'].lower(), bill_dict['number'], bill_dict['congress']) + else: + bill_id = build_bill_id(bill_dict['billType'].lower(), bill_dict['billNumber'], bill_dict['congress']) titles = bill_info.titles_for(bill_dict['titles']['item']) actions = bill_info.actions_for(bill_dict['actions']['item'], bill_id, bill_info.current_title_for(titles, 'official')) status, status_date = bill_info.latest_status(actions, bill_dict.get('introducedDate', '')) @@ -188,18 +197,38 @@ def form_bill_json_dict(xml_as_dict): logging.info("[%s] Skipping reserved bill number with no sponsor (%s)" % (bill_id, bill_dict['titles']['item'][0]['title'])) return bill_dict['titles']['item'][0]['title'] # becomes the 'reason' + if schema_version >= parse_version('3.0.0'): + by_request = bill_dict['sponsors']['item'][0]['isByRequest'] == 'Y' + else: + by_request = bill_dict['sponsors']['item'][0]['byRequestType'] is not None + + billCommittees = bill_dict.get('committees') + if schema_version < parse_version('3.0.0'): + billCommittees = (billCommittees or {})['billCommittees'] + + if schema_version >= parse_version('3.0.0'): + legislativeSubjects = bill_dict.get('subjects', {}).get('legislativeSubjects') + else: + legislativeSubjects = bill_dict['subjects']['billSubjects']['legislativeSubjects'] + + if schema_version >= parse_version('3.0.0'): + billSummaries = bill_dict.get('summaries', {}).get('summary') + else: + billSummaries = bill_dict['summaries']['billSummaries']['item'] + if billSummaries and not isinstance(billSummaries, list): billSummaries = [billSummaries] + bill_data = { 'bill_id': bill_id, - 'bill_type': bill_dict.get('billType').lower(), - 'number': bill_dict.get('billNumber'), + 'bill_type': bill_dict.get('type' if schema_version >= parse_version('3.0.0') else 'billType').lower(), + 'number': bill_dict.get('number' if schema_version >= parse_version('3.0.0') else 'billNumber'), 'congress': bill_dict.get('congress'), 'url': billstatus_url_for(bill_id), 'introduced_at': bill_dict.get('introducedDate', ''), - 'by_request': bill_dict['sponsors']['item'][0]['byRequestType'] is not None, + 'by_request': by_request, 'sponsor': bill_info.sponsor_for(bill_dict['sponsors']['item'][0]), - 'cosponsors': bill_info.cosponsors_for(bill_dict['cosponsors']), + 'cosponsors': bill_info.cosponsors_for(bill_dict.get('cosponsors')), 'actions': actions, 'history': bill_info.history_from_actions(actions), @@ -212,22 +241,22 @@ def form_bill_json_dict(xml_as_dict): 'short_title': bill_info.current_title_for(titles, 'short'), 'popular_title': bill_info.current_title_for(titles, 'popular'), - 'summary': bill_info.summary_for(bill_dict['summaries']['billSummaries']), + 'summary': bill_info.summary_for(billSummaries), # The top term's case has changed with the new bulk data. It's now in # Title Case. For backwards compatibility, the top term is run through # '.capitalize()' so it matches the old string. TODO: Remove one day? - 'subjects_top_term': _fixup_top_term_case(bill_dict['policyArea']['name']) if bill_dict['policyArea'] else None, + 'subjects_top_term': _fixup_top_term_case(bill_dict['policyArea']['name']) if bill_dict.get('policyArea') else None, 'subjects': sorted( - ([_fixup_top_term_case(bill_dict['policyArea']['name'])] if bill_dict['policyArea'] else []) + - ([item['name'] for item in bill_dict['subjects']['billSubjects']['legislativeSubjects']['item']] if bill_dict['subjects']['billSubjects']['legislativeSubjects'] else []) + ([_fixup_top_term_case(bill_dict['policyArea']['name'])] if bill_dict.get('policyArea') else []) + + ([item['name'] for item in legislativeSubjects['item']] if legislativeSubjects else []) ), - 'related_bills': bill_info.related_bills_for(bill_dict['relatedBills']), - 'committees': bill_info.committees_for(bill_dict['committees']['billCommittees']), - 'amendments': bill_info.amendments_for(bill_dict['amendments']), - 'committee_reports': bill_info.committee_reports_for(bill_dict['committeeReports']), + 'related_bills': bill_info.related_bills_for(bill_dict.get('relatedBills')), + 'committees': bill_info.committees_for(billCommittees), + 'amendments': bill_info.amendments_for(bill_dict.get('amendments')), + 'committee_reports': bill_info.committee_reports_for(bill_dict.get('committeeReports')), 'updated_at': bill_dict.get('updateDate', ''), } @@ -255,7 +284,7 @@ def output_for_bill(bill_id, format, is_data_dot=True): return "%s/%s/bills/%s/%s%s/%s" % (utils.data_dir(), congress, bill_type, bill_type, number, fn) def process_amendments(bill_id, bill_amendments, options): - amdt_list = bill_amendments['billStatus']['bill']['amendments'] + amdt_list = bill_amendments['billStatus']['bill'].get('amendments') if amdt_list is None: # many bills don't have amendments return diff --git a/setup.py b/setup.py index 97b405f..9fc1a01 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ setuptools.setup( 'pyyaml', 'scrapelib', 'xmltodict', + 'packaging', ], entry_points={ 'console_scripts': [