#!/usr/bin/python

# Fill in cross-reference IDs on legislator records using the English
# Wikipedia API and the Wikidata SPARQL query service.

import re
import urllib.request
import json
from urllib.parse import quote, unquote
from utils import load_data, save_data
from SPARQLWrapper import SPARQLWrapper, JSON


def get_wikidata_ids(legislators):
    """Look up Wikidata IDs for legislators with English Wikipedia IDs.

    Every record that has ``id.wikipedia`` but no ``id.wikidata`` is
    resolved through the English Wikipedia ``pageprops`` API and updated in
    place. Records whose page has no associated Wikidata item (or does not
    exist) are reported and left unchanged instead of aborting the run.
    """
    for p in legislators:
        if p["id"].get("wikidata") or not p["id"].get("wikipedia"):
            continue

        w = quote(p["id"]["wikipedia"].replace(" ", "_"))
        query_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles={w}&format=json"

        # Close the HTTP response deterministically.
        with urllib.request.urlopen(query_url) as http_response:
            response = json.load(http_response)

        # A missing page comes back without "pageprops"; a page with no
        # Wikidata item comes back without "wikibase_item".
        pages = response.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), {})
        wikidata_id = page.get("pageprops", {}).get("wikibase_item")
        if not wikidata_id:
            print("no wikidata item for", p["id"]["wikipedia"])
            continue

        p["id"]["wikidata"] = wikidata_id


def get_ids_from_wikidata(legislators):
    """Update legislator IDs from Wikidata entities that have a bioguide ID.

    Runs a single query for all entities carrying a bioguide ID (P1157) and
    merges the result row into ``p["id"]`` of every legislator whose
    bioguide ID appears in the result. Values from Wikidata overwrite
    existing values for the same keys.
    """
    # Query to fetch information for entities that have a bioguide ID.
    # Selecting on bioguide ID efficiently gets wikidata entries that
    # we are interested in.
    table = run_query("""
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX schema: <http://schema.org/>
        SELECT ?subject ?bioguide ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
        WHERE {
          ?subject wdt:P1157 ?bioguide .
          OPTIONAL { ?subject wdt:P2671 ?google_entity_id }
          OPTIONAL { ?subject wdt:P2686 ?opensecrets }
          OPTIONAL { ?subject wdt:P3344 ?votesmart }
          OPTIONAL { ?subject wdt:P2390 ?ballotpedia }
          OPTIONAL {
            ?wikipedia schema:about ?subject .
            ?wikipedia schema:inLanguage "en" .
            ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
          }
        }
    """)

    # make a mapping from bioguide ID to query result
    mapping = {r["bioguide"]: r for r in table}

    # update legislators
    for p in legislators:
        row = mapping.get(p["id"].get("bioguide"))
        if row is not None:
            p["id"].update(row)


def get_ids_from_wikidata_without_bioguide(legislators):
    """Update IDs for legislators that have a Wikidata ID but no bioguide ID.

    Each such legislator is queried individually and the first result row
    is merged into ``p["id"]``.
    """
    # The SPARQL server doesn't seem to support VALUES or
    # FILTER(?subject IN (...)) so in order to fill in values for
    # legislators without bioguide IDs but with wikidata IDs, we can just
    # query them one by one. This probably is only useful at the start of a
    # new Congress when bioguide IDs are not yet available.
    for p in legislators:
        if "bioguide" in p["id"] or "wikidata" not in p["id"]:
            continue

        # The ?subject variable is substituted textually with the concrete
        # entity (wd:Q...) before the query is sent.
        table = run_query("""
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX schema: <http://schema.org/>
            SELECT ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
            WHERE {
              OPTIONAL { ?subject wdt:P2671 ?google_entity_id }
              OPTIONAL { ?subject wdt:P2686 ?opensecrets }
              OPTIONAL { ?subject wdt:P3344 ?votesmart }
              OPTIONAL { ?subject wdt:P2390 ?ballotpedia }
              OPTIONAL {
                ?wikipedia schema:about ?subject .
                ?wikipedia schema:inLanguage "en" .
                ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
              }
            }
        """.replace("?subject", "wd:" + p["id"]["wikidata"]))

        # Guard against an empty result set rather than raising IndexError.
        if table:
            p["id"].update(table[0])


def run_query(query):
    """Run a SPARQL query against Wikidata and return cleaned-up rows.

    Returns a list of dicts, one per result binding, mapping each bound
    variable name to its value. The following clean-ups are applied:

    * ``subject`` is replaced by ``wikidata`` holding the bare ``Q...`` ID.
    * ``google_entity_id`` is prefixed with ``kg:``.
    * ``wikipedia`` is URL-unquoted, stripped of the English Wikipedia URL
      prefix, and has underscores turned into spaces.
    * ``ballotpedia`` is stripped and has underscores turned into spaces.
    * ``votesmart`` is converted to ``int``; a non-numeric value is reported
      and left as the original string.
    """
    sparql_endpoint = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    s = SPARQLWrapper(sparql_endpoint)

    # run the query
    s.setQuery(query)
    s.setReturnFormat(JSON)
    results = s.query().convert()
    bindings = results['results']['bindings']

    for row in bindings:
        if "subject" in row:
            # replace the ?subject variable with the wikidata id
            row['wikidata'] = {
                "value": re.search(r'/(Q\d+)', row['subject']['value']).group(1)
            }
            del row["subject"]

        # clean up the google entity id
        if 'google_entity_id' in row:
            row['google_entity_id']["value"] = 'kg:' + row['google_entity_id']["value"]

        # clean up the wikipedia and ballotpedia results
        if "wikipedia" in row:
            row["wikipedia"]["value"] = (
                unquote(row["wikipedia"]["value"])
                .replace("https://en.wikipedia.org/wiki/", "")
                .strip()
                .replace('_', ' ')
            )
        if "ballotpedia" in row:
            row["ballotpedia"]["value"] = row["ballotpedia"]["value"].strip().replace('_', ' ')

        # clean up the votesmart id
        if "votesmart" in row:
            try:
                row["votesmart"]["value"] = int(row["votesmart"]["value"])
            except ValueError:
                print("invalid value", row["votesmart"]["value"])

    # return a simple list of dicts of results
    return [{k: row[k]['value'] for k in row} for row in bindings]


def run():
    """Load both legislator files, fill in IDs, and write the files back."""
    p1 = load_data("legislators-current.yaml")
    p2 = load_data("legislators-historical.yaml")

    # The combined list holds the same record objects as p1 and p2, so the
    # in-place updates below are reflected in what gets saved.
    everyone = p1 + p2
    get_wikidata_ids(everyone)
    get_ids_from_wikidata(everyone)
    get_ids_from_wikidata_without_bioguide(everyone)

    save_data(p1, "legislators-current.yaml")
    save_data(p2, "legislators-historical.yaml")


if __name__ == '__main__':
    run()