GPO BILLSTATUS XML schema 3.0.0 changes

It's hard to know if this is complete; some of the schema changes may not have been noticed. The scraper remains compatible with older files too, since past Congresses' data are probably not yet available in the new schema.

See https://github.com/usgpo/bill-status/issues/200.
This commit is contained in:
Joshua Tauberer
2022-12-21 22:05:47 +00:00
parent 4bedc84c6d
commit f5b510a551
3 changed files with 53 additions and 20 deletions

View File

@@ -58,6 +58,8 @@ def create_govtrack_xml(bill, options):
n.set("type", title['type'])
if title['as']:
n.set("as", title['as'])
if title['textVersionCode']:
n.set("textVersionCode", title['textVersionCode'])
if title['is_for_portion']:
n.set("partial", "1")
@@ -77,7 +79,7 @@ def create_govtrack_xml(bill, options):
n = make_node(cosponsors, "cosponsor", None, **get_legislator_id_attr(cosp))
if cosp["sponsored_at"]:
n.set("joined", cosp["sponsored_at"])
if cosp["withdrawn_at"]:
if cosp.get("withdrawn_at"): # no longer present in GPO BILLSTATUS XML schema 3.0.0
n.set("withdrawn", cosp["withdrawn_at"])
actions = make_node(root, "actions", None)
@@ -185,13 +187,13 @@ def summary_for(summaries):
return None
# Take the most recent summary, by looking at the lexicographically last updateDate.
summaries = summaries['item']
summary = sorted(summaries, key = lambda s: s['updateDate'])[-1]
# Build dict.
return {
"date": summary['updateDate'],
"as": summary['name'],
"as": summary['actionDesc'],
"asOf": summary['actionDate'],
"text": strip_tags(summary['text']),
}
@@ -303,6 +305,7 @@ def titles_for(title_list):
'title': item['title'],
'is_for_portion': is_for_portion,
'as': state,
'textVersionCode': item.get('TextVersionCode'),
'type': title_type
}
@@ -394,7 +397,7 @@ def actions_for(action_list, bill_id, title):
keep = True
if closure['prev']:
if item['sourceSystem']['code'] == "9":
if item['sourceSystem'].get('code') == "9":
# Date must match previous action..
# If both this and previous have a time, the times must match.
# The text must approximately match. Sometimes the LOC text has a prefix
@@ -455,7 +458,7 @@ def action_for(item):
# text & references
# (amendment actions don't always have text?)
text = item['text'] if item['text'] is not None else ''
text = item['text'] if item.get('text') is not None else ''
# strip out links
text = re.sub(r"</?[Aa]( \S.*?)?>", "", text)
@@ -533,7 +536,7 @@ def cosponsors_for(cosponsors_list):
del cosponsor_dict["type"] # always 'person'
cosponsor_dict.update({
'sponsored_at': item['sponsorshipDate'],
'withdrawn_at': item['sponsorshipWithdrawnDate'],
# 'withdrawn_at': item['sponsorshipWithdrawnDate'], # no longer present in GPO BILLSTATUS XML schema 3.0.0?
'original_cosponsor': item['isOriginalCosponsor'] == 'True'
})
return cosponsor_dict

View File

@@ -178,7 +178,16 @@ def form_bill_json_dict(xml_as_dict):
@rtype: dict
"""
from packaging.version import parse as parse_version
try:
schema_version = parse_version(xml_as_dict['billStatus']['version'])
except KeyError: # no 'version' attribute is present before 2022-12-20
schema_version = parse_version('1.0.0')
bill_dict = xml_as_dict['billStatus']['bill']
if schema_version >= parse_version('3.0.0'):
bill_id = build_bill_id(bill_dict['type'].lower(), bill_dict['number'], bill_dict['congress'])
else:
bill_id = build_bill_id(bill_dict['billType'].lower(), bill_dict['billNumber'], bill_dict['congress'])
titles = bill_info.titles_for(bill_dict['titles']['item'])
actions = bill_info.actions_for(bill_dict['actions']['item'], bill_id, bill_info.current_title_for(titles, 'official'))
@@ -188,18 +197,38 @@ def form_bill_json_dict(xml_as_dict):
logging.info("[%s] Skipping reserved bill number with no sponsor (%s)" % (bill_id, bill_dict['titles']['item'][0]['title']))
return bill_dict['titles']['item'][0]['title'] # becomes the 'reason'
if schema_version >= parse_version('3.0.0'):
by_request = bill_dict['sponsors']['item'][0]['isByRequest'] == 'Y'
else:
by_request = bill_dict['sponsors']['item'][0]['byRequestType'] is not None
billCommittees = bill_dict.get('committees')
if schema_version < parse_version('3.0.0'):
billCommittees = (billCommittees or {})['billCommittees']
if schema_version >= parse_version('3.0.0'):
legislativeSubjects = bill_dict.get('subjects', {}).get('legislativeSubjects')
else:
legislativeSubjects = bill_dict['subjects']['billSubjects']['legislativeSubjects']
if schema_version >= parse_version('3.0.0'):
billSummaries = bill_dict.get('summaries', {}).get('summary')
else:
billSummaries = bill_dict['summaries']['billSummaries']['item']
if billSummaries and not isinstance(billSummaries, list): billSummaries = [billSummaries]
bill_data = {
'bill_id': bill_id,
'bill_type': bill_dict.get('billType').lower(),
'number': bill_dict.get('billNumber'),
'bill_type': bill_dict.get('type' if schema_version >= parse_version('3.0.0') else 'billType').lower(),
'number': bill_dict.get('number' if schema_version >= parse_version('3.0.0') else 'billNumber'),
'congress': bill_dict.get('congress'),
'url': billstatus_url_for(bill_id),
'introduced_at': bill_dict.get('introducedDate', ''),
'by_request': bill_dict['sponsors']['item'][0]['byRequestType'] is not None,
'by_request': by_request,
'sponsor': bill_info.sponsor_for(bill_dict['sponsors']['item'][0]),
'cosponsors': bill_info.cosponsors_for(bill_dict['cosponsors']),
'cosponsors': bill_info.cosponsors_for(bill_dict.get('cosponsors')),
'actions': actions,
'history': bill_info.history_from_actions(actions),
@@ -212,22 +241,22 @@ def form_bill_json_dict(xml_as_dict):
'short_title': bill_info.current_title_for(titles, 'short'),
'popular_title': bill_info.current_title_for(titles, 'popular'),
'summary': bill_info.summary_for(bill_dict['summaries']['billSummaries']),
'summary': bill_info.summary_for(billSummaries),
# The top term's case has changed with the new bulk data. It's now in
# Title Case. For backwards compatibility, the top term is run through
# '.capitalize()' so it matches the old string. TODO: Remove one day?
'subjects_top_term': _fixup_top_term_case(bill_dict['policyArea']['name']) if bill_dict['policyArea'] else None,
'subjects_top_term': _fixup_top_term_case(bill_dict['policyArea']['name']) if bill_dict.get('policyArea') else None,
'subjects':
sorted(
([_fixup_top_term_case(bill_dict['policyArea']['name'])] if bill_dict['policyArea'] else []) +
([item['name'] for item in bill_dict['subjects']['billSubjects']['legislativeSubjects']['item']] if bill_dict['subjects']['billSubjects']['legislativeSubjects'] else [])
([_fixup_top_term_case(bill_dict['policyArea']['name'])] if bill_dict.get('policyArea') else []) +
([item['name'] for item in legislativeSubjects['item']] if legislativeSubjects else [])
),
'related_bills': bill_info.related_bills_for(bill_dict['relatedBills']),
'committees': bill_info.committees_for(bill_dict['committees']['billCommittees']),
'amendments': bill_info.amendments_for(bill_dict['amendments']),
'committee_reports': bill_info.committee_reports_for(bill_dict['committeeReports']),
'related_bills': bill_info.related_bills_for(bill_dict.get('relatedBills')),
'committees': bill_info.committees_for(billCommittees),
'amendments': bill_info.amendments_for(bill_dict.get('amendments')),
'committee_reports': bill_info.committee_reports_for(bill_dict.get('committeeReports')),
'updated_at': bill_dict.get('updateDate', ''),
}
@@ -255,7 +284,7 @@ def output_for_bill(bill_id, format, is_data_dot=True):
return "%s/%s/bills/%s/%s%s/%s" % (utils.data_dir(), congress, bill_type, bill_type, number, fn)
def process_amendments(bill_id, bill_amendments, options):
amdt_list = bill_amendments['billStatus']['bill']['amendments']
amdt_list = bill_amendments['billStatus']['bill'].get('amendments')
if amdt_list is None: # many bills don't have amendments
return

View File

@@ -30,6 +30,7 @@ setuptools.setup(
'pyyaml',
'scrapelib',
'xmltodict',
'packaging',
],
entry_points={
'console_scripts': [