Fix tests by moving the new script's top-level code into a run() function, since the tests check all scripts for formatting

2025-12-19 18:05:51 -05:00 · 2023-01-04 05:55:31 -05:00
parent 0d79e7c3f6
commit 8b54aaa1e5
1 changed files with 65 additions and 62 deletions
--- a/scripts/bioguide_xml.py
+++ b/scripts/bioguide_xml.py
@@ -11,77 +11,80 @@ import json
 import rtyaml
 import datetime
-# Load existing legislators and map bioguide IDs
+def run():
-# to their entries.
+    # Load existing legislators and map bioguide IDs
-legislator_data = { }
+    # to their entries.
-legislators = { }
+    legislator_data = { }
-for ft in ("current", "historical"):
+    legislators = { }
-    with open("../legislators-{}.yaml".format(ft)) as f:
+    for ft in ("current", "historical"):
-        data = rtyaml.load(f)
+        with open("../legislators-{}.yaml".format(ft)) as f:
-        legislator_data[ft] = data
+            data = rtyaml.load(f)
-        for p in data:
+            legislator_data[ft] = data
-            legislators[p["id"]["bioguide"]] = p
+            for p in data:
                legislators[p["id"]["bioguide"]] = p
-def parse_birthday_from_text(text):
+    def parse_birthday_from_text(text):
-    # exceptions for not-nicely-placed semicolons
+        # exceptions for not-nicely-placed semicolons
-    text = text.replace("born in Cresskill, Bergen County, N. J.; April", "born April")
+        text = text.replace("born in Cresskill, Bergen County, N. J.; April", "born April")
-    text = text.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
+        text = text.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
-    text = text.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
+        text = text.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
-    text = text.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
+        text = text.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
-    text = text.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
+        text = text.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
-    text = text.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")
+        text = text.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")
-    # look for a date
+        # look for a date
-    pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
+        pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
-    match = re.search(pattern, text, re.I)
+        match = re.search(pattern, text, re.I)
-    if not match or not match.group(1):
+        if not match or not match.group(1):
-      # specifically detect cases that we can't handle to avoid unnecessary warnings
+          # specifically detect cases that we can't handle to avoid unnecessary warnings
-      if re.search("birth dates? unknown|date of birth is unknown", text, re.I): return None, None
+          if re.search("birth dates? unknown|date of birth is unknown", text, re.I): return None, None
-      if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", text, re.I): return None, None
+          if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", text, re.I): return None, None
-      return None, None
+          return None, None
-    original_text = match.group(1).strip()
+        original_text = match.group(1).strip()
-    try:
+        try:
-      birthday = datetime.datetime.strptime(original_text.replace(",", ""), "%B %d %Y")
+          birthday = datetime.datetime.strptime(original_text.replace(",", ""), "%B %d %Y")
-    except ValueError:
+        except ValueError:
-      print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main))
+          print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide_id, original_text))
-      return None, original_text
+          return None, original_text
-    birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
+        birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
-    return birthday, original_text
+        return birthday, original_text
-# Process all profile data in the bioguide ZIP file.
+    # Process all profile data in the bioguide ZIP file.
-with zipfile.ZipFile(sys.argv[1]) as zf:
+    with zipfile.ZipFile(sys.argv[1]) as zf:
-    for profile_fn in zf.namelist():
+        for profile_fn in zf.namelist():
-        bioguide_id = re.match(r"^([A-Z]\d+)\.json", profile_fn).group(1)
+            bioguide_id = re.match(r"^([A-Z]\d+)\.json", profile_fn).group(1)
-        if bioguide_id not in legislators:
+            if bioguide_id not in legislators:
-            #print("No legislator for", bioguide_id)
+                #print("No legislator for", bioguide_id)
            continue
        with zf.open(profile_fn) as zff:
            profile = json.load(zff)
            if "profileText" not in profile:
                continue
            with zf.open(profile_fn) as zff:
                profile = json.load(zff)
                if "profileText" not in profile:
                    continue
-            legislator = legislators[bioguide_id]
+                legislator = legislators[bioguide_id]
-            # Get birthday from text.
+                # Get birthday from text.
-            birthday, original_text = parse_birthday_from_text(profile["profileText"])
+                birthday, original_text = parse_birthday_from_text(profile["profileText"])
-            if birthday:
+                if birthday:
-                # Check birthday from metadata --- not as reliable.
+                    # Check birthday from metadata --- not as reliable.
-                # Since the metadata may only have a year, only match
+                    # Since the metadata may only have a year, only match
-                # as much of the date string as it has.
+                    # as much of the date string as it has.
-                if profile.get("birthDate") and not profile.get("birthCirca"):
+                    if profile.get("birthDate") and not profile.get("birthCirca"):
-                    if profile["birthDate"] != birthday[0:len(profile["birthDate"])]:
+                        if profile["birthDate"] != birthday[0:len(profile["birthDate"])]:
-                         print(bioguide_id, "metadata", repr(profile["birthDate"]), "doesn't match profile text", repr(original_text))
+                             print(bioguide_id, "metadata", repr(profile["birthDate"]), "doesn't match profile text", repr(original_text))
-                    else:
+                        else:
-                        # They match, so update.
+                            # They match, so update.
-                        legislators.setdefault("bio", {})
+                            legislators.setdefault("bio", {})
-                        legislator["bio"]["birthday"] = birthday
+                            legislator["bio"]["birthday"] = birthday
-# Write out updated data files.
+    # Write out updated data files.
-for fn in legislator_data:
+    for fn in legislator_data:
-    with open("../legislators-{}.yaml".format(ft), "w") as f:
+        with open("../legislators-{}.yaml".format(ft), "w") as f:
-        rtyaml.dump(legislator_data[fn], f)
+            rtyaml.dump(legislator_data[fn], f)
 if __name__ == "__main__":
    run()