congress-legislators/scripts/pictorial_ids.py

#!/usr/bin/env python

import csv
import json
import unicodedata
import utils
from utils import load_data, mkdir_p, save_data, parse_date

# Update legislators' current pictorial ids
# https://pictorialapi.gpo.gov/index.html
#
# options:
#  --cache: load from cache if present on disk (default: false)
#  --bioguide: load only one legislator, by their bioguide ID
#  --congress: (required) do *only* updates for legislators whose most recent term started in this congress (110 or later)
#
# example:
#  python pictorial_ids.py --congress=118


def run():
    """
    Fill in missing pictorial ids for the legislators of one Congress.

    Reads the "cache", "bioguide" and "congress" flags from utils.flags(),
    downloads the member list for that Congress from the GPO pictorial API
    and, for every legislator in legislators-current.yaml and
    legislators-historical.yaml whose most recent term started in that
    Congress and who has no pictorial id yet, stores the id returned by
    match_pictorial_id(). Both data files are saved back, and legislators
    that could not be matched are logged to
    cache/errors/pictorial/mismatch_<congress>.csv.

    Raises:
        Exception: if the --congress flag is missing, or names a Congress
            earlier than the 110th.
    """
    flags = utils.flags()

    # default to not caching
    cache = flags.get("cache", False)
    force = not cache

    only_bioguide = flags.get("bioguide", None)
    congress = flags.get("congress", None)

    # Validate the flag up front so a bad invocation fails before the
    # (slow) data files are loaded.
    if congress is None:
        raise Exception("the --congress flag is required")
    congress_number = int(congress)
    if congress_number < 110:
        # Pictorial seems to go back to 110th Congress
        raise Exception("no data for congress " + congress)
    url = f"https://pictorialapi.gpo.gov/api/GuideMember/GetMembers/{congress}"

    data_files = []
    for data_filename in ("legislators-current.yaml", "legislators-historical.yaml"):
        print("Loading %s..." % data_filename)
        data_files.append((load_data(data_filename), data_filename))

    pictorial_destination = f"pictorial/source/GetMembers/{congress}.json"
    pictorial_data = json.loads(utils.download(url, pictorial_destination, force))

    # Filter out non-legislators and the vacant placeholders
    pictorial_members = [
        member
        for member in pictorial_data["memberCollection"]
        if member["memberType"] in ("Senator", "Representative", "Delegate")
        and member["name"] != "Vacant, Vacant"
    ]

    error_filename = f"cache/errors/pictorial/mismatch_{congress}.csv"
    mkdir_p("cache/errors/pictorial")
    error_count = 0

    # The context manager guarantees the log is flushed and closed even if
    # matching or saving raises; newline="" is what the csv module expects
    # so that it controls line endings itself.
    with open(error_filename, "w", newline="") as error_file:
        error_log = csv.writer(error_file)
        error_log.writerow(
            [
                "message",
                "bioguide_id",
                "name_first",
                "name_last",
            ]
        )

        print("Running for congress " + congress)
        for legislators, filename in data_files:
            for legislator in legislators:
                # this can't run unless we've already collected a bioguide for this person
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue

                # only run for selected congress
                latest_term = legislator["terms"][-1]
                latest_congress = utils.congress_from_legislative_year(
                    utils.legislative_year(parse_date(latest_term["start"]))
                )
                if congress_number != latest_congress:
                    continue

                # skip if we already have it
                if legislator["id"].get("pictorial"):
                    continue

                try:
                    pictorial_id = match_pictorial_id(legislator, pictorial_members)
                except ValueError as e:
                    # No match or an ambiguous match: record it and move on
                    error_count += 1
                    error_log.writerow(
                        [
                            str(e),
                            bioguide,
                            legislator["name"]["first"],
                            legislator["name"]["last"],
                        ]
                    )
                else:
                    legislator["id"]["pictorial"] = pictorial_id

            save_data(legislators, filename)

    # Reported after the log file has been closed, so it is complete on disk
    if error_count:
        print(f"{error_count} error details written to {error_filename}")


def to_ascii(s):
    """
    Return `s` reduced to plain ASCII.

    The string is NFKD-normalized, which separates accented characters into
    a base character plus combining marks; every character that still has
    no ASCII encoding is then dropped.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_bytes = decomposed.encode("ASCII", "ignore")
    return ascii_bytes.decode("ASCII")


def reverse_name(name):
    """
    Turn a "Last, First" name into "First Last".

    The name is split on ", ", the pieces are put in reverse order and
    joined with single spaces; a name without ", " comes back unchanged.
    """
    pieces = name.split(", ")
    pieces.reverse()
    return " ".join(pieces)


def match_pictorial_id(legislator, pictorial_members):
    """
    Attempt to find the corresponding pictorial id for the given member.

    A pictorial member is a candidate when its name matches the legislator
    (by first/last name, nickname, middle name, a known nickname or typo,
    the official full name, or swapped first/last names) and its state and
    chamber agree with the legislator's most recent term.

    There are many odd cases -- see tests/test_gpo_member_photos.py for
    examples.

    Args:
        legislator: dict with a "name" dict ("first" and "last" are read;
            "middle", "nickname" and "official_full" are used when present)
            and a "terms" list whose last entry has "state" and "type".
        pictorial_members: iterable of dicts with "firstName", "lastName",
            "name" ("Last, First"), "memberType", "stateId" and "memberId".

    Returns:
        The "memberId" of the single matching pictorial member.

    Raises:
        ValueError: if no pictorial member, or more than one, matches.
    """
    legislator_name = legislator["name"]

    # "official_full" may be absent from a record; a plain lookup would
    # raise KeyError, which callers handling ValueError would not catch.
    # Fall back to "first last" for the error messages instead.
    official_full = legislator_name.get("official_full")
    name = official_full or f'{legislator_name["first"]} {legislator_name["last"]}'

    # Map pictorial first names (common nicknames and GPO typos) to the
    # first name used in the legislators data
    common_nicknames = {
        "Nick": "Nicolas",
        "Daniel": "Dan",
        "Mike": "Michael",
        "Michael": "Mike",
        "Richard": "Rich",
        "Christopher": "Chris",
        "JOhn": "John",
    }

    # These depend only on the legislator, so compute them once rather than
    # once per pictorial member.
    legislator_name_last = to_ascii(legislator_name["last"].replace(" ", ""))
    legislator_name_first = to_ascii(legislator_name["first"].replace(" ", ""))

    matches = []
    for member_pictorial in pictorial_members:
        # First check whether the name matches
        name_matches = False

        if legislator_name_last == member_pictorial["lastName"]:
            pictorial_first = member_pictorial["firstName"]
            if legislator_name_first == pictorial_first or (
                "nickname" in legislator_name
                and legislator_name["nickname"] == pictorial_first
            ):
                name_matches = True
            # Sometimes the nickname is encoded in the first name
            elif pictorial_first in legislator_name_first:
                name_matches = True
            # Sometimes the nickname is encoded in the middle name
            elif (
                "middle" in legislator_name
                and pictorial_first in legislator_name["middle"]
            ):
                name_matches = True
            # Sometimes the nickname is not encoded
            elif (
                pictorial_first in common_nicknames
                and common_nicknames[pictorial_first] == legislator_name_first
            ):
                name_matches = True

        # Sometimes matching the official full name is best
        # (never true when the legislator has no official full name)
        if official_full == reverse_name(member_pictorial["name"]):
            name_matches = True

        # The GPO has some first and last names swapped, so check those too
        if not name_matches and legislator_name_first == member_pictorial["lastName"]:
            pictorial_first = member_pictorial["firstName"]
            if legislator_name_last == pictorial_first or (
                "nickname" in legislator_name
                and legislator_name["nickname"] == pictorial_first
            ):
                name_matches = True

        # If the name matches, check the office and state
        # Note: Assumes we're matching against most recent term
        if name_matches:
            most_recent_term = legislator["terms"][-1]
            # Anything that is not a Senator is compared against "rep"
            mType = "sen" if member_pictorial["memberType"] == "Senator" else "rep"
            if (
                most_recent_term["state"] == member_pictorial["stateId"]
                and most_recent_term["type"] == mType
            ):
                matches.append(member_pictorial)

    if len(matches) == 1:
        return matches[0]["memberId"]
    if matches:
        raise ValueError(f"Multiple pictorial id matches found for {name}")
    raise ValueError(f"No pictorial id match found for {name}")


# Run the update only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()