improve the error messages of test/validate.py

This commit is contained in:
Joshua Tauberer
2018-12-16 17:41:11 -05:00
parent f11cb2c27d
commit ced1a983c8

View File

@@ -10,9 +10,9 @@ import utils
from office_validator import run as validate_offices
ok = True
def error(message):
def error(context, message):
global ok
print(message)
print(context + ": " + message)
ok = False
# Current apportionment of the U.S. House, so that we can report if there
@@ -78,77 +78,85 @@ def check_legislators_file(fn, seen_ids, current=None, current_mocs=None):
with open(fn) as f:
legislators = rtyaml.load(f)
for legislator in legislators:
# Create a string for error messages to tell us where problems are occurring.
context = "{} in {}".format(fn, repr(legislator))
# Check the IDs.
if "id" not in legislator:
error(repr(legislator) + " is missing 'id'.")
error(context, "Missing 'id' mapping.")
else:
# Check that the IDs are valid.
check_id_types(legislator, seen_ids, True)
check_id_types(legislator, seen_ids, True, context)
# Create a string for error messages to tell us where problems are occurring.
context = "{}:{}".format(fn, legislator['id']['bioguide'])
# Check the name.
if "name" not in legislator:
error(repr(legislator) + " is missing 'name'.")
error(context, "Missing 'name' mapping.")
else:
check_name(legislator["name"])
check_name(legislator["name"], context)
for name in legislator.get("other_names", []):
check_name(name, is_other_names=True)
check_name(name, context + ":other_names", is_other_names=True)
# Check the biographical fields.
if "bio" not in legislator:
error(repr(legislator) + " is missing 'bio'.")
error(context, "Missing 'bio' mapping.")
else:
check_bio(legislator["bio"], current)
check_bio(legislator["bio"], current, context)
# Check the terms.
if "terms" not in legislator:
error(repr(legislator) + " is missing 'terms'.")
error(context, "Missing 'terms' list.")
elif not isinstance(legislator["terms"], list):
error(repr(legislator) + " terms has an invalid data type.")
error(context, "'terms' has an invalid data type.")
elif len(legislator["terms"]) == 0:
error(repr(legislator) + " terms is empty.")
error(context, "'terms' is empty.")
else:
prev_term = None
for i, term in enumerate(legislator["terms"]):
check_term(term, prev_term,
check_term(term, prev_term, context+":terms[{}]".format(i),
current=(current and i==len(legislator["terms"])-1),
current_mocs=current_mocs)
prev_term = term
# Check the leadership roles.
for role in legislator.get("leadership_roles", []):
# All of these fields must be strings.
for key, value in role.items():
if not isinstance(value, str):
error(rtyaml.dump({ key: value }) + " has an invalid data type.")
check_leadership_roles(legislator.get("leadership_roles", []), current, context)
# Check required fields.
if "title" not in role:
error(rtyaml.dump(role) + " is missing title.")
if role.get("chamber") not in ("house", "senate"):
error(rtyaml.dump(role) + " has an invalid chamber.")
if "start" not in role:
error(rtyaml.dump(role) + " is missing start.")
if "end" not in role and not current:
# end is required only in the historical file
error(rtyaml.dump(role) + " is missing end.")
def check_leadership_roles(roles, current, context):
for role in roles:
# All of these fields must be strings.
for key, value in role.items():
if not isinstance(value, str):
error(context, rtyaml.dump({ key: value }) + " has an invalid data type.")
# Check dates.
start = check_date(role['start'])
if "end" in role:
end = check_date(role['end'])
if start and end and end < start:
error(rtyaml.dump(role) + " has end before start.")
# Check required fields.
if "title" not in role:
error(context, rtyaml.dump(role) + " is missing title.")
if role.get("chamber") not in ("house", "senate"):
error(context, rtyaml.dump(role) + " has an invalid chamber.")
if "start" not in role:
error(context, rtyaml.dump(role) + " is missing start.")
if "end" not in role and not current:
# end is required only in the historical file
error(context, rtyaml.dump(role) + " is missing end.")
def check_id_types(legislator, seen_ids, is_legislator):
# Check dates.
start = check_date(role['start'], context)
if "end" in role:
end = check_date(role['end'], context)
if start and end and end < start:
error(context, rtyaml.dump(role) + " has end before start.")
def check_id_types(legislator, seen_ids, is_legislator, context):
for key, value in legislator["id"].items():
# Check that the id key is one we know about.
if key not in id_types:
error(rtyaml.dump({ key: value }) + " is not a valid id.")
error(context, rtyaml.dump({ key: value }) + " is not a valid id.")
# Check that the data type is correct.
elif not isinstance(value, id_types[key]):
error(rtyaml.dump({ key: value }) + " has an invalid data type.")
error(context, rtyaml.dump({ key: value }) + " has an invalid data type.")
else:
# Check that the ID isn't duplicated across legislators.
@@ -163,92 +171,94 @@ def check_id_types(legislator, seen_ids, is_legislator):
# Check that every legislator has ids of the required types.
for id_type in id_required:
if id_type not in legislator["id"]:
error("Missing %s id in:\n%s" % (id_type, rtyaml.dump(legislator['id'])))
error(context, "Missing %s id." % id_type)
def check_name(name, is_other_names=False):
def check_name(name, context, is_other_names=False):
# Check for required keys and data types of the values.
for key, value in name.items():
if key in ("start", "end") and is_other_names:
if not isinstance(value, str):
error(rtyaml.dump({ key: value }) + " has an invalid data type.")
error(context, rtyaml.dump({ key: value }) + " has an invalid data type.")
elif key not in name_keys:
error("%s is not a valid key in name." % key)
error(context, "%s is not a valid key in name." % key)
elif key in ("first", "last"):
# These are required.
if not isinstance(value, str):
error(rtyaml.dump({ key: value }) + " has an invalid data type.")
error(context, rtyaml.dump({ key: value }) + " has an invalid data type.")
else:
# These can be set explicitly to None, but maybe we should just remove
# those keys then.
if not isinstance(value, (str, type(None))):
error(rtyaml.dump({ key: value }) + " has an invalid data type.")
error(context, rtyaml.dump({ key: value }) + " has an invalid data type.")
# If a person has a first initial only, they should also have a middle name.
# (GovTrack relies on this to generate name strings.)
if isinstance(name.get("first"), str) and len(name["first"]) == 2 and name["first"].endswith(".") and not name.get("middle"):
error(rtyaml.dump(name) + " is missing a middle name to go with its first initial.")
error(context, rtyaml.dump(name) + " is missing a middle name to go with its first initial.")
def check_bio(bio, is_current_legislator):
def check_bio(bio, is_current_legislator, context):
for key, value in bio.items():
if key not in (bio_keys | old_allowed_other_bio_keys):
error("%s is not a valid key in bio." % key)
error(context, "%s is not a valid key in bio." % key)
elif not isinstance(value, str):
error(rtyaml.dump({ key: value }) + " has an invalid data type.")
error(context, rtyaml.dump({ key: value }) + " has an invalid data type.")
if is_current_legislator:
# These keys are required only for current legislators.
# We don't always have the information for historical members of Congress or presidents.
for key in bio_keys:
if key not in bio:
error("Missing bio->{}.".format(key))
error(context, "Missing bio->{}.".format(key))
def check_term(term, prev_term, current=None, current_mocs=None):
def check_term(term, prev_term, context, current=None, current_mocs=None):
# Check type.
if term.get("type") not in ("rep", "sen"):
error(rtyaml.dump(term) + " has invalid type.")
error(context, "Term has invalid 'type'.")
# Check date range.
start = check_date(term.get('start'))
end = check_date(term.get('end'))
start = check_date(term.get('start'), context)
end = check_date(term.get('end'), context)
if start and end:
context += "({} to {})".format(start, end)
if end < start:
error(rtyaml.dump(term) + " has end before start.")
error(context, "Term has end before start.")
# TODO: Remove 'and end > "2000-"'. I'm just adding it because
# lots of historical data fails this test.
if prev_term and end > date(2000,1,1):
prev_end = check_date(prev_term.get("end"))
prev_end = check_date(prev_term.get("end"), context)
if prev_end:
if start < prev_end:
error(rtyaml.dump(term) + " has start before previous term's end.")
error(context, "Term has start before previous term's end.")
if not current and (end > now):
error(rtyaml.dump(term) + " has an end date in the future but is in the historical file.")
error(context, "Term has an end date in the future but is a past term.")
if current and (end < now):
error(rtyaml.dump(term) + " has an end date in the past but is in the current file.")
error(context, "Term has an end date in the past but is a most recent term in the current file.")
# Check how.
if term.get("how") not in (None, "appointment",):
error(rtyaml.dump(term) + " has invalid 'how'.")
error(context, "Term has invalid 'how'.")
# Check end-type.
if term.get("end-type") not in (None, "special-election",):
error(rtyaml.dump(term) + " has invalid 'end-type'.")
error(context, "Term has invalid 'end-type'.")
if term.get("end-type") == "special-election" and term.get("how") != "appointment":
error(rtyaml.dump(term) + " can't have an 'end-type' without being an appointed senator.")
error(context, "Term can't have an 'end-type' without being an appointed senator.")
# Check state, district, class, state_rank.
if term.get("state") not in utils.states:
error(rtyaml.dump(term) + " has invalid state.")
error(context, "Term has invalid state.")
if term.get("type") == "rep":
if not isinstance(term.get("district"), int):
error(rtyaml.dump(term) + " has invalid district.")
error(context, "Term has invalid district.")
if term.get("type") == "sen":
if term.get("class") not in (1, 2, 3):
error(rtyaml.dump(term) + " has invalid class.")
error(context, "Term has invalid class.")
if term.get("state_rank") not in ("junior", "senior", None):
error(rtyaml.dump(term) + " has invalid senator state_rank.")
error(context, "Term has invalid senator state_rank.")
elif current and term.get("state_rank") is None:
error(rtyaml.dump(term) + " is missing senator state_rank.")
error(context, "Term is missing senator state_rank.")
if current:
# Check uniqueness of office for current members.
@@ -256,27 +266,27 @@ def check_term(term, prev_term, current=None, current_mocs=None):
# Check office.
office = (term.get("type"), term.get("state"), term.get("district") if term.get("type") == "rep" else term.get("class"))
if office in current_mocs:
error(rtyaml.dump(term) + " duplicates an office.")
error(context, "Term duplicates an office.")
current_mocs.add(office)
# Check senator rank isn't duplicated.
if term.get("type") == "sen":
office = (term.get("state"), term.get("state_rank"))
if office in current_mocs:
error(rtyaml.dump(term) + " duplicates state_rank in a state.")
error(context, "Term duplicates state_rank in a state.")
current_mocs.add(office)
# Check party of current members (historical is too difficult).
if term.get("party") not in ("Republican", "Democrat", "Independent"):
error(rtyaml.dump({ "party": term.get("party") }) + " is invalid.")
error(context, rtyaml.dump({ "party": term.get("party") }) + " is invalid.")
# Check caucus of Independent members.
if term.get("party") == "Independent" and term.get("caucus") not in ("Republican", "Democrat"):
error(rtyaml.dump({ "caucus": term.get("caucus") }) + " is invalid when party is Independent.")
error(context, rtyaml.dump({ "caucus": term.get("caucus") }) + " is invalid when party is Independent.")
# Check website -- optional.
# Check website -- it's optional, so warn.
if not term.get("url"):
print(rtyaml.dump(term) + " is missing a website url.")
print(context, "Term is missing a website url.")
# TODO: Check party_affiliations and office information.
@@ -303,79 +313,81 @@ def check_executive_file(fn):
with open(fn) as f:
people = rtyaml.load(f)
for person in people:
# Create a string for error messages to tell us where problems are occurring.
context = "{} in {}".format(fn, repr(person))
# Check the IDs.
if "id" not in person:
error(repr(person) + " is missing 'id'.")
error(context, "Missing 'id' mapping.")
else:
# Check that the IDs are valid.
check_id_types(person, {}, False)
check_id_types(person, {}, False, context)
# Check the name.
if "name" not in person:
error(repr(person) + " is missing 'name'.")
error(context, "Missing 'name' mapping.")
else:
check_name(person["name"])
check_name(person["name"], context)
# Check the biographical fields.
if "bio" not in person:
error(repr(person) + " is missing 'bio'.")
error(context, "Missing 'bio' mapping.")
else:
check_bio(person["bio"], False)
check_bio(person["bio"], False, repr(person))
# Check the terms.
if "terms" not in person:
error(repr(person) + " is missing 'terms'.")
error(context, "Missing 'terms' list.")
elif not isinstance(person["terms"], list):
error(repr(person) + " terms has an invalid data type.")
error(context, "'terms' has an invalid data type.")
elif len(person["terms"]) == 0:
error(repr(person) + " terms is empty.")
error(context, "'terms' is empty.")
else:
for i, term in enumerate(person["terms"]):
check_executive_term(term)
check_executive_term(term, context+":term[{}]".format(i))
def check_executive_term(term):
def check_executive_term(term, context):
# Check type.
if term.get("type") not in ("prez", "viceprez"):
error(rtyaml.dump(term) + " has invalid type.")
error(context, "Term has invalid 'type'.")
# Check how.
if term.get("how") not in ("election", "succession", "appointment"):
error(rtyaml.dump(term) + " has invalid 'how'.")
error(context, "Term has invalid 'how'.")
# Check date range.
start = check_date(term.get('start'))
end = check_date(term.get('end'))
start = check_date(term.get('start'), context)
end = check_date(term.get('end'), context)
if start and end:
if end < start:
error(rtyaml.dump(term) + " has end before start.")
error(context, "Term has end before start.")
if end.year > 2000:
# Check party of current members (historical is too difficult and even recent ones incorrectly have Democratic instead of Democrat, which is inconsistent with the legislators files).
if term.get("party") not in ("Republican", "Democrat"):
error(rtyaml.dump({ "party": term.get("party") }) + " is invalid.")
error(context, rtyaml.dump({ "party": term.get("party") }) + " is invalid.")
def check_date(d):
def check_date(d, context):
if not isinstance(d, str):
error(str(d) + ": invalid data type")
error(context, str(d) + ": invalid data type")
return None
try:
return utils.parse_date(d)
except Exception as e:
error(d + ": " + str(e))
error(context, d + ": " + str(e))
return None
def check_id_uniqueness(seen_ids):
for (id_type, id_value), occurrences in seen_ids.items():
if len(occurrences) > 1:
error("%s %s is duplicated: %s" % (id_type, id_value,
error("", "%s %s is duplicated: %s" % (id_type, id_value,
" ".join(legislator['id']['bioguide'] for legislator in occurrences)))
def check_district_offices():
has_errors = validate_offices(skip_warnings=True)
if has_errors:
error("District offices have errors")
error("", "District offices have errors")
if __name__ == "__main__":