mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-19 09:50:37 -05:00
Adds ids mapping to GPO's Pictorial Member Guide pictorialapi.gpo.gov This PR includes ids for all members of 118th Congress. It may also work for historical files -- at least going back to 110 -- but I have not included that work here, as it requires manual fixes and I imagine most interest is in current membership. Closes #942
208 lines
7.2 KiB
Python
208 lines
7.2 KiB
Python
#!/usr/bin/env python
|
|
|
|
import csv
|
|
import json
|
|
import unicodedata
|
|
import utils
|
|
from utils import load_data, mkdir_p, save_data, parse_date
|
|
|
|
# Update legislators' pictorial ids (GPO Pictorial Member Guide)
|
|
# https://pictorialapi.gpo.gov/index.html
|
|
#
|
|
# options:
|
|
# --cache: load from cache if present on disk (default: false)
|
|
# --bioguide: load only one legislator, by their bioguide ID
|
|
# --congress: (required) only update legislators whose latest term began in the given congress (110 or later)
|
|
#
|
|
# example:
|
|
# python pictorial_ids.py --congress=118
|
|
|
|
|
|
def run():
    """
    Add GPO Pictorial Member Guide ids to the legislators of one congress.

    Reads the --cache, --bioguide and --congress flags, fetches the member
    list for the requested congress through utils.download, and stores the
    matched id under legislator["id"]["pictorial"] for every legislator whose
    latest term began in that congress and who has no pictorial id yet.
    Both legislators-current.yaml and legislators-historical.yaml are
    processed and saved.

    Legislators that cannot be matched to exactly one pictorial member are
    listed in cache/errors/pictorial/mismatch_<congress>.csv.

    Raises:
        Exception: if --congress is missing or is earlier than 110.
    """
    # default to not caching
    cache = utils.flags().get("cache", False)
    # passed to utils.download below; presumably forces a fresh fetch
    force = not cache

    only_bioguide = utils.flags().get("bioguide", None)
    congress = utils.flags().get("congress", None)

    data_files = []
    for source_name in ("legislators-current.yaml", "legislators-historical.yaml"):
        print("Loading %s..." % source_name)
        data_files.append((load_data(source_name), source_name))

    if congress is None:
        raise Exception("the --congress flag is required")
    congress_number = int(congress)
    if congress_number < 110:
        # Pictorial seems to go back to 110th Congress
        raise Exception(f"no data for congress {congress}")

    url = f"https://pictorialapi.gpo.gov/api/GuideMember/GetMembers/{congress}"
    pictorial_destination = f"pictorial/source/GetMembers/{congress}.json"
    pictorial_data = json.loads(utils.download(url, pictorial_destination, force))

    # Filter out non-legislators and the vacant placeholders
    pictorial_members = [
        member
        for member in pictorial_data["memberCollection"]
        if member["memberType"] in ("Senator", "Representative", "Delegate")
        and member["name"] != "Vacant, Vacant"
    ]

    error_filename = f"cache/errors/pictorial/mismatch_{congress}.csv"
    mkdir_p("cache/errors/pictorial")
    error_count = 0

    # The context manager guarantees the log is flushed and closed even if
    # matching or saving raises; newline="" is what the csv module expects.
    with open(error_filename, "w", newline="") as error_file:
        error_log = csv.writer(error_file)
        error_log.writerow(
            [
                "message",
                "bioguide_id",
                "name_first",
                "name_last",
            ]
        )

        print(f"Running for congress {congress}")
        for legislators, filename in data_files:
            for legislator in legislators:
                # the bioguide id is only used for the --bioguide filter and
                # for identifying the legislator in the error log
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue

                # only run for selected congress
                latest_term = legislator["terms"][-1]
                latest_congress = utils.congress_from_legislative_year(
                    utils.legislative_year(parse_date(latest_term["start"]))
                )
                if congress_number != latest_congress:
                    continue

                # skip if we already have it
                if legislator["id"].get("pictorial"):
                    continue

                try:
                    pictorial_id = match_pictorial_id(legislator, pictorial_members)
                except ValueError as e:
                    error_count += 1
                    error_log.writerow(
                        [
                            str(e),
                            bioguide,
                            legislator["name"]["first"],
                            legislator["name"]["last"],
                        ]
                    )
                else:
                    legislator["id"]["pictorial"] = pictorial_id

            save_data(legislators, filename)

    if error_count:
        print(f"{error_count} error details written to {error_filename}")
|
|
|
|
|
|
def to_ascii(s):
    """
    Return s reduced to plain ASCII.

    The string is NFKD-normalized first, so accented letters decompose into
    a base letter plus combining marks; every character that still is not
    ASCII is then dropped.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_bytes = decomposed.encode("ASCII", "ignore")
    return ascii_bytes.decode("ASCII")
|
|
|
|
|
|
def reverse_name(name):
    """
    Turn a "Last, First" name into "First Last".

    The name is split on ", " and the pieces are joined with single spaces
    in reverse order, so a name without that separator comes back unchanged.
    """
    pieces = name.split(", ")
    pieces.reverse()
    return " ".join(pieces)
|
|
|
|
|
|
def _pictorial_name_matches(legislator, member_pictorial, first, last, nicknames):
    """
    Decide whether a pictorial entry's name plausibly refers to the legislator.

    first and last are the legislator's ASCII-folded, space-free first and
    last names; nicknames maps a pictorial first name to the legislator
    first name it should be treated as equal to.
    """
    names = legislator["name"]
    pictorial_first = member_pictorial["firstName"]
    pictorial_last = member_pictorial["lastName"]
    nickname_matches = (
        "nickname" in names and names["nickname"] == pictorial_first
    )

    if last == pictorial_last:
        if first == pictorial_first or nickname_matches:
            return True
        # Sometimes the nickname is encoded in the first name
        if pictorial_first in first:
            return True
        # Sometimes the nickname is encoded in the middle name
        if "middle" in names and pictorial_first in names["middle"]:
            return True
        # Sometimes the nickname is not encoded
        if nicknames.get(pictorial_first) == first:
            return True

    # Sometimes matching the official full name is best.
    # official_full may be absent; reverse_name always returns a string, so
    # a missing value simply never matches.
    if names.get("official_full") == reverse_name(member_pictorial["name"]):
        return True

    # The GPO has some first and last names swapped, so check those too
    if first == pictorial_last:
        return last == pictorial_first or nickname_matches

    return False


def match_pictorial_id(legislator, pictorial_members):
    """
    Attempt to find the corresponding pictorial id for the given member.

    A pictorial member is a candidate when its name matches the legislator's
    (directly, via nickname, via the official full name, or with first and
    last names swapped) and its state and chamber agree with the
    legislator's most recent term.

    There are many odd cases -- see tests/test_gpo_member_photos.py for
    examples.

    Returns:
        The "memberId" of the single matching pictorial member.

    Raises:
        ValueError: if no pictorial member matches, or more than one does.
    """
    names = legislator["name"]
    # official_full is not guaranteed to be present; fall back to
    # "first last" so the error messages below can still be built.
    name = names.get("official_full") or f"{names['first']} {names['last']}"

    # Map pictorial first names (common nicknames and GPO typos) to the
    # first name used in the legislators data
    common_nicknames = {
        "Nick": "Nicolas",
        "Daniel": "Dan",
        "Mike": "Michael",
        "Michael": "Mike",
        "Richard": "Rich",
        "Christopher": "Chris",
        "JOhn": "John",
    }

    # These depend only on the legislator, so compute them once rather than
    # once per pictorial member.
    legislator_name_last = to_ascii(names["last"].replace(" ", ""))
    legislator_name_first = to_ascii(names["first"].replace(" ", ""))

    matches = []
    for member_pictorial in pictorial_members:
        # First check whether the name matches
        if not _pictorial_name_matches(
            legislator,
            member_pictorial,
            legislator_name_first,
            legislator_name_last,
            common_nicknames,
        ):
            continue

        # If the name matches, check the office and state
        # Note: Assumes we're matching against most recent term
        most_recent_term = legislator["terms"][-1]
        member_type = "sen" if member_pictorial["memberType"] == "Senator" else "rep"
        if (
            most_recent_term["state"] == member_pictorial["stateId"]
            and most_recent_term["type"] == member_type
        ):
            matches.append(member_pictorial)

    if len(matches) == 1:
        return matches[0]["memberId"]
    if matches:
        raise ValueError(f"Multiple pictorial id matches found for {name}")
    raise ValueError(f"No pictorial id match found for {name}")
|
|
|
|
|
|
# Run the update only when this file is executed as a script, not on import
if __name__ == "__main__":
    run()
|