mirror of
https://github.com/unitedstates/congress.git
synced 2025-12-19 17:16:58 -05:00
GPO BILLSTATUS XML schema 3.0.0 changes
It's hard to know whether this is complete: some of the schema changes may not have been noticed. The scraper remains compatible with older files too, since data for past Congresses is probably not yet available in the new schema. See https://github.com/usgpo/bill-status/issues/200.
This commit is contained in:
@@ -58,6 +58,8 @@ def create_govtrack_xml(bill, options):
|
||||
n.set("type", title['type'])
|
||||
if title['as']:
|
||||
n.set("as", title['as'])
|
||||
if title['textVersionCode']:
|
||||
n.set("textVersionCode", title['textVersionCode'])
|
||||
if title['is_for_portion']:
|
||||
n.set("partial", "1")
|
||||
|
||||
@@ -77,7 +79,7 @@ def create_govtrack_xml(bill, options):
|
||||
n = make_node(cosponsors, "cosponsor", None, **get_legislator_id_attr(cosp))
|
||||
if cosp["sponsored_at"]:
|
||||
n.set("joined", cosp["sponsored_at"])
|
||||
if cosp["withdrawn_at"]:
|
||||
if cosp.get("withdrawn_at"): # no longer present in GPO BILLSTATUS XML schema 3.0.0
|
||||
n.set("withdrawn", cosp["withdrawn_at"])
|
||||
|
||||
actions = make_node(root, "actions", None)
|
||||
@@ -185,13 +187,13 @@ def summary_for(summaries):
|
||||
return None
|
||||
|
||||
# Take the most recent summary, by looking at the lexicographically last updateDate.
|
||||
summaries = summaries['item']
|
||||
summary = sorted(summaries, key = lambda s: s['updateDate'])[-1]
|
||||
|
||||
# Build dict.
|
||||
return {
|
||||
"date": summary['updateDate'],
|
||||
"as": summary['name'],
|
||||
"as": summary['actionDesc'],
|
||||
"asOf": summary['actionDate'],
|
||||
"text": strip_tags(summary['text']),
|
||||
}
|
||||
|
||||
@@ -303,6 +305,7 @@ def titles_for(title_list):
|
||||
'title': item['title'],
|
||||
'is_for_portion': is_for_portion,
|
||||
'as': state,
|
||||
'textVersionCode': item.get('TextVersionCode'),
|
||||
'type': title_type
|
||||
}
|
||||
|
||||
@@ -394,7 +397,7 @@ def actions_for(action_list, bill_id, title):
|
||||
|
||||
keep = True
|
||||
if closure['prev']:
|
||||
if item['sourceSystem']['code'] == "9":
|
||||
if item['sourceSystem'].get('code') == "9":
|
||||
# Date must match previous action..
|
||||
# If both this and previous have a time, the times must match.
|
||||
# The text must approximately match. Sometimes the LOC text has a prefix
|
||||
@@ -455,7 +458,7 @@ def action_for(item):
|
||||
# text & references
|
||||
# (amendment actions don't always have text?)
|
||||
|
||||
text = item['text'] if item['text'] is not None else ''
|
||||
text = item['text'] if item.get('text') is not None else ''
|
||||
|
||||
# strip out links
|
||||
text = re.sub(r"</?[Aa]( \S.*?)?>", "", text)
|
||||
@@ -533,7 +536,7 @@ def cosponsors_for(cosponsors_list):
|
||||
del cosponsor_dict["type"] # always 'person'
|
||||
cosponsor_dict.update({
|
||||
'sponsored_at': item['sponsorshipDate'],
|
||||
'withdrawn_at': item['sponsorshipWithdrawnDate'],
|
||||
# 'withdrawn_at': item['sponsorshipWithdrawnDate'], # no longer present in GPO BILLSTATUS XML schema 3.0.0?
|
||||
'original_cosponsor': item['isOriginalCosponsor'] == 'True'
|
||||
})
|
||||
return cosponsor_dict
|
||||
|
||||
@@ -178,7 +178,16 @@ def form_bill_json_dict(xml_as_dict):
|
||||
@rtype: dict
|
||||
"""
|
||||
|
||||
from packaging.version import parse as parse_version
|
||||
try:
|
||||
schema_version = parse_version(xml_as_dict['billStatus']['version'])
|
||||
except KeyError: # no 'version' attribute is present before 2022-12-20
|
||||
schema_version = parse_version('1.0.0')
|
||||
|
||||
bill_dict = xml_as_dict['billStatus']['bill']
|
||||
if schema_version >= parse_version('3.0.0'):
|
||||
bill_id = build_bill_id(bill_dict['type'].lower(), bill_dict['number'], bill_dict['congress'])
|
||||
else:
|
||||
bill_id = build_bill_id(bill_dict['billType'].lower(), bill_dict['billNumber'], bill_dict['congress'])
|
||||
titles = bill_info.titles_for(bill_dict['titles']['item'])
|
||||
actions = bill_info.actions_for(bill_dict['actions']['item'], bill_id, bill_info.current_title_for(titles, 'official'))
|
||||
@@ -188,18 +197,38 @@ def form_bill_json_dict(xml_as_dict):
|
||||
logging.info("[%s] Skipping reserved bill number with no sponsor (%s)" % (bill_id, bill_dict['titles']['item'][0]['title']))
|
||||
return bill_dict['titles']['item'][0]['title'] # becomes the 'reason'
|
||||
|
||||
if schema_version >= parse_version('3.0.0'):
|
||||
by_request = bill_dict['sponsors']['item'][0]['isByRequest'] == 'Y'
|
||||
else:
|
||||
by_request = bill_dict['sponsors']['item'][0]['byRequestType'] is not None
|
||||
|
||||
billCommittees = bill_dict.get('committees')
|
||||
if schema_version < parse_version('3.0.0'):
|
||||
billCommittees = (billCommittees or {})['billCommittees']
|
||||
|
||||
if schema_version >= parse_version('3.0.0'):
|
||||
legislativeSubjects = bill_dict.get('subjects', {}).get('legislativeSubjects')
|
||||
else:
|
||||
legislativeSubjects = bill_dict['subjects']['billSubjects']['legislativeSubjects']
|
||||
|
||||
if schema_version >= parse_version('3.0.0'):
|
||||
billSummaries = bill_dict.get('summaries', {}).get('summary')
|
||||
else:
|
||||
billSummaries = bill_dict['summaries']['billSummaries']['item']
|
||||
if billSummaries and not isinstance(billSummaries, list): billSummaries = [billSummaries]
|
||||
|
||||
bill_data = {
|
||||
'bill_id': bill_id,
|
||||
'bill_type': bill_dict.get('billType').lower(),
|
||||
'number': bill_dict.get('billNumber'),
|
||||
'bill_type': bill_dict.get('type' if schema_version >= parse_version('3.0.0') else 'billType').lower(),
|
||||
'number': bill_dict.get('number' if schema_version >= parse_version('3.0.0') else 'billNumber'),
|
||||
'congress': bill_dict.get('congress'),
|
||||
|
||||
'url': billstatus_url_for(bill_id),
|
||||
|
||||
'introduced_at': bill_dict.get('introducedDate', ''),
|
||||
'by_request': bill_dict['sponsors']['item'][0]['byRequestType'] is not None,
|
||||
'by_request': by_request,
|
||||
'sponsor': bill_info.sponsor_for(bill_dict['sponsors']['item'][0]),
|
||||
'cosponsors': bill_info.cosponsors_for(bill_dict['cosponsors']),
|
||||
'cosponsors': bill_info.cosponsors_for(bill_dict.get('cosponsors')),
|
||||
|
||||
'actions': actions,
|
||||
'history': bill_info.history_from_actions(actions),
|
||||
@@ -212,22 +241,22 @@ def form_bill_json_dict(xml_as_dict):
|
||||
'short_title': bill_info.current_title_for(titles, 'short'),
|
||||
'popular_title': bill_info.current_title_for(titles, 'popular'),
|
||||
|
||||
'summary': bill_info.summary_for(bill_dict['summaries']['billSummaries']),
|
||||
'summary': bill_info.summary_for(billSummaries),
|
||||
|
||||
# The top term's case has changed with the new bulk data. It's now in
|
||||
# Title Case. For backwards compatibility, the top term is run through
|
||||
# '.capitalize()' so it matches the old string. TODO: Remove one day?
|
||||
'subjects_top_term': _fixup_top_term_case(bill_dict['policyArea']['name']) if bill_dict['policyArea'] else None,
|
||||
'subjects_top_term': _fixup_top_term_case(bill_dict['policyArea']['name']) if bill_dict.get('policyArea') else None,
|
||||
'subjects':
|
||||
sorted(
|
||||
([_fixup_top_term_case(bill_dict['policyArea']['name'])] if bill_dict['policyArea'] else []) +
|
||||
([item['name'] for item in bill_dict['subjects']['billSubjects']['legislativeSubjects']['item']] if bill_dict['subjects']['billSubjects']['legislativeSubjects'] else [])
|
||||
([_fixup_top_term_case(bill_dict['policyArea']['name'])] if bill_dict.get('policyArea') else []) +
|
||||
([item['name'] for item in legislativeSubjects['item']] if legislativeSubjects else [])
|
||||
),
|
||||
|
||||
'related_bills': bill_info.related_bills_for(bill_dict['relatedBills']),
|
||||
'committees': bill_info.committees_for(bill_dict['committees']['billCommittees']),
|
||||
'amendments': bill_info.amendments_for(bill_dict['amendments']),
|
||||
'committee_reports': bill_info.committee_reports_for(bill_dict['committeeReports']),
|
||||
'related_bills': bill_info.related_bills_for(bill_dict.get('relatedBills')),
|
||||
'committees': bill_info.committees_for(billCommittees),
|
||||
'amendments': bill_info.amendments_for(bill_dict.get('amendments')),
|
||||
'committee_reports': bill_info.committee_reports_for(bill_dict.get('committeeReports')),
|
||||
|
||||
'updated_at': bill_dict.get('updateDate', ''),
|
||||
}
|
||||
@@ -255,7 +284,7 @@ def output_for_bill(bill_id, format, is_data_dot=True):
|
||||
return "%s/%s/bills/%s/%s%s/%s" % (utils.data_dir(), congress, bill_type, bill_type, number, fn)
|
||||
|
||||
def process_amendments(bill_id, bill_amendments, options):
|
||||
amdt_list = bill_amendments['billStatus']['bill']['amendments']
|
||||
amdt_list = bill_amendments['billStatus']['bill'].get('amendments')
|
||||
if amdt_list is None: # many bills don't have amendments
|
||||
return
|
||||
|
||||
|
||||
Reference in New Issue
Block a user