mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-19 09:50:37 -05:00
From https://clerk.house.gov/xml/lists/unofficial-119-member-elect-data.xml, birthdays from Wikipedia, and review by the GovTrack team. I think everyone was sworn in today except Sen. Justice who plans to take office on Jan 13, and consequently his bioguide ID is not available, so I've left him out.
159 lines
5.3 KiB
Python
Executable File
159 lines
5.3 KiB
Python
Executable File
#!/usr/bin/python
|
|
|
|
import re
|
|
import urllib.request
|
|
import json
|
|
from urllib.parse import quote, unquote
|
|
from utils import load_data, save_data
|
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
|
|
|
def get_wikidata_ids(legislators):
|
|
# Look up wikidata IDs for legislators with English Wikipedia IDs.
|
|
for p in legislators:
|
|
if not p["id"].get("wikidata") and p["id"].get("wikipedia"):
|
|
w = quote(p["id"]["wikipedia"].replace(" ", "_"))
|
|
query_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles={w}&format=json"
|
|
response = json.load(urllib.request.urlopen(query_url))
|
|
wikidata_id = list(response["query"]["pages"].values())[0]["pageprops"]["wikibase_item"]
|
|
p["id"]["wikidata"] = wikidata_id
|
|
|
|
|
|
def get_ids_from_wikidata(legislators):
|
|
# Query to fetch information for entities that have a bioguide ID.
|
|
# Selecting on bioguide ID efficiently gets wikidata entries that
|
|
# we are interested in.
|
|
|
|
table = run_query("""
|
|
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT ?subject ?bioguide ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
|
|
WHERE {
|
|
?subject wdt:P1157 ?bioguide .
|
|
OPTIONAL {
|
|
?subject wdt:P2671 ?google_entity_id
|
|
}
|
|
OPTIONAL {
|
|
?subject wdt:P2686 ?opensecrets
|
|
}
|
|
OPTIONAL {
|
|
?subject wdt:P3344 ?votesmart
|
|
}
|
|
OPTIONAL {
|
|
?subject wdt:P2390 ?ballotpedia
|
|
}
|
|
OPTIONAL {
|
|
?wikipedia schema:about ?subject .
|
|
?wikipedia schema:inLanguage "en" .
|
|
?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
|
|
}
|
|
}
|
|
""")
|
|
|
|
# make a mapping from bioguide ID to query result
|
|
mapping = { r["bioguide"]: r for r in table }
|
|
|
|
# update legislators
|
|
for p in legislators:
|
|
if p["id"].get("bioguide") in mapping:
|
|
p["id"].update(mapping[p["id"]["bioguide"]])
|
|
|
|
|
|
def get_ids_from_wikidata_without_bioguide(legislators):
|
|
# The SQPARL server doesn't seem to suppor VALUES or FILTER(?subject IN (...))
|
|
# so in order to fill in values for legislators without bioguide IDs but with
|
|
# wikidata IDs, we can just query them one by one. This probably is only useful
|
|
# at the start of a new Congress when bioguide IDs are not yet available.
|
|
for p in legislators:
|
|
if not ("bioguide" not in p["id"] and "wikidata" in p["id"]): continue
|
|
|
|
table = run_query("""
|
|
PREFIX wd: <http://www.wikidata.org/entity/>
|
|
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
|
|
WHERE {
|
|
OPTIONAL {
|
|
?subject wdt:P2671 ?google_entity_id
|
|
}
|
|
OPTIONAL {
|
|
?subject wdt:P2686 ?opensecrets
|
|
}
|
|
OPTIONAL {
|
|
?subject wdt:P3344 ?votesmart
|
|
}
|
|
OPTIONAL {
|
|
?subject wdt:P2390 ?ballotpedia
|
|
}
|
|
OPTIONAL {
|
|
?wikipedia schema:about ?subject .
|
|
?wikipedia schema:inLanguage "en" .
|
|
?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
|
|
}
|
|
}
|
|
""".replace("?subject", "wd:" + p["id"]["wikidata"]))
|
|
|
|
|
|
p["id"].update(table[0])
|
|
|
|
|
|
def run_query(query):
|
|
sparql_endpoint = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
|
|
s = SPARQLWrapper(sparql_endpoint)
|
|
|
|
# run the query
|
|
s.setQuery(query)
|
|
s.setReturnFormat(JSON)
|
|
results = s.query().convert()
|
|
|
|
for row in results['results']['bindings']:
|
|
if "subject" in row:
|
|
# replace the ?subject variable with the wikidata id
|
|
row['wikidata'] = { "value": re.search(r'/(Q\d+)', row['subject']['value']).group(1) }
|
|
del row["subject"]
|
|
|
|
# clean up the google entity id
|
|
if 'google_entity_id' in row:
|
|
row['google_entity_id']["value"] = 'kg:' + row['google_entity_id']["value"]
|
|
|
|
# clean up the wikipedia and ballotpedia results
|
|
if "wikipedia" in row:
|
|
row["wikipedia"]["value"] = \
|
|
unquote(row["wikipedia"]["value"])\
|
|
.replace("https://en.wikipedia.org/wiki/", "")\
|
|
.strip().replace('_',' ')
|
|
if "ballotpedia" in row:
|
|
row["ballotpedia"]["value"] = row["ballotpedia"]["value"].strip().replace('_',' ')
|
|
|
|
# clean up the votesmart id
|
|
if "votesmart" in row:
|
|
try:
|
|
row["votesmart"]["value"] = int(row["votesmart"]["value"])
|
|
except ValueError:
|
|
print("invalid value", row["votesmart"]["value"])
|
|
continue
|
|
# return a simple list of dicts of results
|
|
return [
|
|
{
|
|
k: row[k]['value']
|
|
for k in row
|
|
}
|
|
for row in results['results']['bindings']
|
|
]
|
|
|
|
|
|
def run():
|
|
p1 = load_data("legislators-current.yaml")
|
|
p2 = load_data("legislators-historical.yaml")
|
|
get_wikidata_ids(p1+p2)
|
|
get_ids_from_wikidata(p1+p2)
|
|
get_ids_from_wikidata_without_bioguide(p1+p2)
|
|
save_data(p1, "legislators-current.yaml")
|
|
save_data(p2, "legislators-historical.yaml")
|
|
|
|
if __name__ == '__main__':
|
|
run()
|
|
|
|
|