mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-19 18:05:51 -05:00
Fix tests by moving the new script into a function since all scripts are checked for formatting in tests
This commit is contained in:
@@ -11,77 +11,80 @@ import json
|
|||||||
import rtyaml
|
import rtyaml
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
def parse_birthday_from_text(text, bioguide_id=None):
    """Extract a full birth date from a Bioguide profile's biography text.

    Args:
        text: The biography text (the profile's "profileText" value).
        bioguide_id: Optional identifier, used only to label the diagnostic
            printed when a matched date is not a valid calendar date.

    Returns:
        A ``(birthday, original_text)`` tuple:

        * ``("YYYY-MM-DD", matched_text)`` when a month/day/year date is
          found after "born" and parses successfully;
        * ``(None, matched_text)`` when a date-shaped string is found but
          is not a real date (e.g. "February 30, 1900");
        * ``(None, None)`` when no month/day/year date follows "born".
    """
    # Exceptions for not-nicely-placed semicolons: these biographies put a
    # semicolon between "born" and the date, which the pattern below
    # (deliberately) refuses to cross.
    semicolon_fixups = (
        ("born in Cresskill, Bergen County, N. J.; April", "born April"),
        ("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802"),
        ("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967"),
        ("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962"),
        ("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947"),
        ('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968"),
    )
    for needle, replacement in semicolon_fixups:
        text = text.replace(needle, replacement)

    # Look for a date: "born", then anything up to (but not past) the next
    # semicolon, then "<Month> <day>, <year>" with optional commas.
    pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
    match = re.search(pattern, text, re.I)
    if not match or not match.group(1):
        # Unknown, approximate, or year-only birth dates are expected in the
        # source data; they are silently reported as "no date found".
        return None, None
    original_text = match.group(1).strip()

    try:
        birthday = datetime.datetime.strptime(original_text.replace(",", ""), "%B %d %Y")
    except ValueError:
        # The text looked like a date but is not one (e.g. day out of range).
        print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide_id, original_text))
        return None, original_text

    birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
    return birthday, original_text


def run():
    """Update legislator birthdays from a ZIP file of Bioguide JSON profiles.

    Reads ``../legislators-current.yaml`` and ``../legislators-historical.yaml``,
    reads the ZIP archive named by ``sys.argv[1]``, and for each profile whose
    text-derived birthday agrees with the profile's ``birthDate`` metadata,
    stores the birthday under the legislator's ``bio.birthday``. Both YAML
    files are then written back.
    """
    # Load existing legislators and map bioguide IDs
    # to their entries.
    legislator_data = {}
    legislators = {}
    for ft in ("current", "historical"):
        with open("../legislators-{}.yaml".format(ft)) as f:
            data = rtyaml.load(f)
        legislator_data[ft] = data
        for p in data:
            legislators[p["id"]["bioguide"]] = p

    # Process all profile data in the bioguide ZIP file.
    with zipfile.ZipFile(sys.argv[1]) as zf:
        for profile_fn in zf.namelist():
            # Skip archive members that are not "<bioguide id>.json"
            # instead of crashing on a failed match.
            name_match = re.match(r"^([A-Z]\d+)\.json", profile_fn)
            if not name_match:
                continue
            bioguide_id = name_match.group(1)
            if bioguide_id not in legislators:
                #print("No legislator for", bioguide_id)
                continue

            with zf.open(profile_fn) as zff:
                profile = json.load(zff)
            if "profileText" not in profile:
                continue

            legislator = legislators[bioguide_id]

            # Get birthday from text.
            birthday, original_text = parse_birthday_from_text(profile["profileText"], bioguide_id)
            if not birthday:
                continue

            # Check birthday from metadata --- not as reliable.
            # Since the metadata may only have a year, only match
            # as much of the date string as it has.
            metadata_date = profile.get("birthDate")
            if not metadata_date or profile.get("birthCirca"):
                continue
            if metadata_date != birthday[0:len(metadata_date)]:
                print(bioguide_id, "metadata", repr(metadata_date), "doesn't match profile text", repr(original_text))
            else:
                # They match, so update. setdefault must be called on the
                # individual legislator (not the ID map) so that a missing
                # "bio" section is created rather than raising KeyError.
                legislator.setdefault("bio", {})["birthday"] = birthday

    # Write out updated data files. The path is built from the loop variable
    # so each data set goes back to the file it was loaded from.
    for fn in legislator_data:
        with open("../legislators-{}.yaml".format(fn), "w") as f:
            rtyaml.dump(legislator_data[fn], f)


if __name__ == "__main__":
    run()
|
||||||
|
|||||||
Reference in New Issue
Block a user