# congress-legislators/scripts/wikipedia_ids.py

# Scans Wikipedia for pages using the CongBio and CongLinks
# templates, which carry Bioguide IDs. Updates the 'wikipedia'
# ID field for matching Members of Congress, and for pages
# using the CongLinks template also updates a variety of
# other IDs as found in the template.

import lxml.etree, re, urllib.request, urllib.parse, urllib.error
import utils, os.path
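
# Typical invocation, assuming utils.flags() parses command-line switches
# the way the other scripts in this repository do (an assumption; check
# utils.py for the exact flag handling):
#
#   python wikipedia_ids.py          # re-fetch everything from Wikipedia
#   python wikipedia_ids.py --cache  # reuse previously downloaded pages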

def run():
    # Field mapping, and which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec", # handled specially (and currently disabled)...
        "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")
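
    # For reference, the two templates appear in wiki markup roughly like
    # the sketch below (all identifier values here are fabricated for
    # illustration, not real):
    #
    #   {{CongBio|P000000}}
    #   {{CongLinks | congbio=P000000 | govtrack=400000
    #     | opensecrets=N00000000 | votesmart=12345 | cspan=67890 }}
    #
    # CongBio carries only the Bioguide ID; CongLinks carries the whole set
    # of identifiers mapped in fieldmap above.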

    # Default to not caching.
    cache = utils.flags().get('cache', False)

    # Load the legislator files and map bioguide IDs to their records.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = { }
    for y in y1 + y2:
        bioguides[y["id"]["bioguide"]] = y
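
    # The result is an index from Bioguide ID to the full YAML record,
    # shaped roughly like this (illustrative; see legislators-current.yaml
    # for real entries):
    #
    #   bioguides["P000000"] == { "id": { "bioguide": "P000000", ... },
    #                             "name": { ... }, "terms": [ ... ] }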

    # Okay, now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the page titles.
        page_titles = set()
        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # Construct the query URL, using the "eicontinue" value from
                # the last query to get the next batch of results.
                url = 'https://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue: url += "&eicontinue=" + eicontinue

                # Load the XML. Force a fresh download: paginated eicontinue
                # results can't be cached meaningfully.
                print("Getting %s pages (%d...)" % (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(url, None, True))
                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # Get the next eicontinue value and loop.
                eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue: break
        return page_titles
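
    # The embeddedin query returns XML shaped roughly like the sketch below,
    # which is what the two XPath expressions above pick apart (attribute
    # values made up for illustration):
    #
    #   <api>
    #     <query>
    #       <embeddedin>
    #         <ei pageid="..." title="Some Member of Congress"/>
    #         ...
    #       </embeddedin>
    #     </query>
    #     <query-continue>
    #       <embeddedin eicontinue="..."/>
    #     </query-continue>
    #   </api>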

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query the Wikipedia API and save to the cache.
        matching_pages = get_matching_pages()
        utils.write("\n".join(matching_pages), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera;
    # anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
        # Skip pages that transclude the templates but aren't member biographies.
        if " campaign" in p: continue
        if " (surname)" in p: continue
        if "career of " in p: continue
        if "for Congress" in p: continue
        if p.startswith("List of "): continue
        if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "https://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p
        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={ "mw": "http://www.mediawiki.org/xml/export-0.8/" })
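
        # The export XML wraps the page's wiki markup in a <text> element,
        # roughly as sketched below (illustrative; the export-0.8 namespace
        # is pinned above and may need updating if Wikipedia bumps its
        # export schema version):
        #
        #   <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
        #     <page><revision><text>...wiki markup...</text></revision></page>
        #   </mediawiki>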

        # Build a dict of the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores)
        }

        if "CongLinks" in page_content:
            # Parse the key/value pairs in the template.
            m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
            if not m: continue # no template?
            for arg in m.group(1).split("|"):
                if "=" not in arg: continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields: val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue
                    if key == "opensecrets":
                        # Strip stray query-string junk seen in some OpenSecrets links.
                        val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
                    new_ids[fieldmap[key]] = val
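
            # At this point new_ids looks something like the following
            # (values fabricated for illustration):
            #
            #   { "wikipedia": "Some Member", "bioguide": "P000000",
            #     "govtrack": 400000, "opensecrets": "N00000000",
            #     "votesmart": 12345, "cspan": 67890 }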
if "bioguide" not in new_ids: continue
new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m: continue # no template?
            bioguide = m.group(1).upper()
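
            # e.g. a page containing "{{CongBio|p000000}}" (a hypothetical
            # ID) yields bioguide == "P000000".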

        if bioguide not in bioguides:
            print("Member not found: " + bioguide, p, "(Might have been a delegate to the Constitutional Convention.)")
            continue

        # Handle FEC IDs specially because they are stored in an array.
        # (With the "fec" fieldmap entry commented out above, this branch is
        # currently inactive; it is kept for when that mapping is re-enabled.)
        fec_id = new_ids.get("fec")
        if fec_id: del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC ID, appending it only if it isn't already recorded.
        if fec_id:
            if fec_id not in member["id"].get("fec", []):
                member["id"].setdefault("fec", []).append(fec_id)
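
        # When active, this would leave a YAML id block shaped like the
        # following (all values fabricated for illustration):
        #
        #   id:
        #     bioguide: P000000
        #     fec:
        #       - H0XX00000
        #     wikipedia: Some Member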

        #print(p, new_ids)

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")

if __name__ == '__main__':
    run()