# congress/tasks/fdsys.py
#
# Downloads documents from GPO FDSys, using their sitemaps
# to efficiently determine what needs to be updated.
#
# ./run fdsys --list
# Dumps a list of the names of GPO's collections and the years
# they have data in (since most collections are divided by year
# of document publication).
#
# ./run fdsys --collections=BILLS --bulkdata=False
# Download bill text (from the primary FDSys BILLS collection;
# there's also a bulk data BILLS collection, but it has less in
# it).
#
# Options:
#
# --collections=BILLS,BILLSTATUS,STATUTE,...
# Restricts the downloads to just the named collections. For
# BILLS, you should probably also specify --bulkdata=True/False.
# If omitted, downloads files from all collections.
#
# --bulkdata=True|False
# Download regular document collections or bulk data collections.
# If omitted, downloads both. Note that the BILLS collection
# occurs both as a regular document collection (bill text in
# multiple formats) and as a bulk data collection (just XML,
# starting recently). This flag is how you distinguish which
# one you want.
#
# --years=2001[,2002,2004]
# Comma-separated list of years to download from (does not
# apply to bulk data collections, which are not divided by
# year).
#
# --congress=113[,114]
# Comma-separated list of congresses to download from (only for
# BILLSTATUS). Alternate format:
#
# --congress=">113"
# Specify a number to get all congresses *after* the value (only for
# BILLSTATUS). The quotes are necessary for this format.
#
# --store=mods,pdf,text,xml,premis
# Save the MODS, PDF, text, XML, or PREMIS file associated
# with each package. If omitted, stores every file for each
# package.
#
# --filter="regex"
# Only stores files that match the regex. Regular collections
# are matched against the package name (e.g. BILLS-113hconres66ih)
# while bulk data items are matched against their file path
# (e.g. 113/1/hconres/BILLS-113hconres66ih.xml).
#
# --granules
# Some collections, like STATUTE, have "granules" inside each
# package (a package is a volume of the Statutes at Large, while
# a granule is an extracted portion for a particular public law).
# With --granules, saves the individual granules instead of the
# main package files.
#
# --cached|--force
# Always/never use the cache.
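#
# For example, to fetch just the MODS and text files for recent bill text
# packages, an invocation (illustrative values) might look like:
#
# ./run fdsys --collections=BILLS --years=2015,2016 --store=mods,text --bulkdata=False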
from lxml import etree, html
import glob
import json
import re
import logging
import os.path
import zipfile
import utils
# globals
fdsys_baseurl = "https://www.gpo.gov/smap/"
BULKDATA_BASE_URL = "https://www.gpo.gov/fdsys/bulkdata/"
FDSYS_BILLSTATUS_FILENAME = "fdsys_billstatus.xml"
# for xpath
ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}
# Main entry point
def run(options):
# GPO FDSys organizes its sitemaps by publication year (the date of
# original print publication) and then by collection (bills, statutes,
# etc.). There are additional unconnected sitemaps for each bulk
# data collection.
# Update our cache of the complete FDSys sitemap and download package
# files as requested in the command-line options.
listing = []
update_sitemap_cache(options, listing)
# With --list, just output all of the available data on FDSys
# (the collection names, and the years each collection is available in, etc.).
if options.get("list", False):
listing = map(format_item_for_listing, listing)
listing.sort()
for item in listing:
print item
# Processing the Sitemaps
def update_sitemap_cache(options, listing):
"""Updates the local cache of the complete FDSys sitemap trees,
only downloading changed sitemap files."""
# with --bulkdata=False, or not specified
if options.get("bulkdata", None) in (None, False):
# Process the main sitemap index for all of the document collections.
update_sitemap(fdsys_baseurl + "fdsys/sitemap.xml", None, [], options, listing)
# with --bulkdata=True, or not specified
if options.get("bulkdata", None) in (None, True):
# Process the bulk data sitemap index.
update_sitemap(fdsys_baseurl + "bulkdata/sitemapindex.xml", None, [], options, listing)
def update_sitemap(url, current_lastmod, how_we_got_here, options, listing):
"""Updates the local cache of a sitemap file."""
# Return a list of files we downloaded.
results = []
# What is this sitemap for?
subject = extract_sitemap_subject_from_url(url, how_we_got_here)
# For debugging, remember what URLs we are stepping through.
how_we_got_here = how_we_got_here + [url]
# Does the user want to process this sitemap?
if skip_sitemap(subject, options):
return
# Where to cache the sitemap and a file where we store its current <lastmod> date
# (which comes from a parent sitemap)?
(cache_file, lastmod_cache_file) = get_sitemap_cache_files(subject)
lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)
# Download anew if the current_lastmod doesn't match the stored lastmod
# in our cache and --cached is not specified, or if --force is given.
# If we're not downloading it, load it from disk because we still have
# to process each sitemap to ensure we've downloaded all of the package
# files the user wants.
download = should_download_sitemap(lastmod_cache_file, current_lastmod, options)
# Download, or just retrieve from cache.
if download:
logging.warn("Downloading: %s" % url)
body = utils.download(
url,
cache_file,
utils.merge(options, {
'force': download,
'binary': True
}))
if not body:
raise Exception("Failed to download %s" % url)
# Write the current last modified date to disk so we know the next time whether
# we need to fetch the file --- if we just downloaded it.
if download and current_lastmod:
utils.write(current_lastmod, lastmod_cache_file)
# Load the XML.
try:
sitemap = etree.fromstring(body)
except etree.XMLSyntaxError as e:
raise Exception("XML syntax error in %s: %s" % (url, str(e)))
# Process the entries.
if sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
# This is a sitemap index. Process the sitemaps listed in this
# sitemapindex recursively.
for node in sitemap.xpath("x:sitemap", namespaces=ns):
# Get URL and lastmod date of the sitemap.
url = str(node.xpath("string(x:loc)", namespaces=ns))
lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
sitemap_results = update_sitemap(url, lastmod, how_we_got_here, options, listing)
if sitemap_results is not None:
results = results + sitemap_results
elif sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
# This is a regular sitemap with content items listed.
# For the --list command, remember that this sitemap had some data.
# And then return --- don't download any package files.
if options.get("list"):
listing.append(subject)
return
# Process the items.
for node in sitemap.xpath("x:url", namespaces=ns):
url = str(node.xpath("string(x:loc)", namespaces=ns))
lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
if not subject.get("bulkdata"):
# This is a regular collection item.
#
# Get the "package" name, i.e. a particular document (which has
# one or more file formats within it).
m = re.match("https://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
if not m:
raise Exception("Unmatched package URL (%s) at %s." % (url, "->".join(how_we_got_here)))
package_name = m.group(1)
if options.get("filter") and not re.search(options["filter"], package_name): continue
mirror_results = mirror_package(subject, package_name, lastmod, url, options)
if mirror_results is not None and len(mirror_results) > 0:
results = results + mirror_results
else:
# This is a bulk data item. Extract components of the URL.
m = re.match(re.escape(BULKDATA_BASE_URL) + re.escape(subject["collection"]) + "/(.+)", url)
if not m:
raise Exception("Unmatched bulk data file URL (%s) at %s." % (url, "->".join(how_we_got_here)))
item_path = m.group(1)
if options.get("filter") and not re.search(options["filter"], item_path): continue
mirror_results = mirror_bulkdata_file(subject, url, item_path, lastmod, options)
if mirror_results is not None and len(mirror_results) > 0:
results = results + mirror_results
else:
raise Exception("Unknown sitemap type (%s) at the root sitemap of %s." % (sitemap.tag, url))
return results
def extract_sitemap_subject_from_url(url, how_we_got_here):
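# Maps a sitemap URL onto a dict describing what it covers, based on the
# patterns matched below. Illustrative examples (not verbatim GPO URLs):
#
#   .../fdsys/sitemap_2013/sitemap_2013.xml       -> {"year": "2013"}
#   .../fdsys/sitemap_2013/2013_BILLS_sitemap.xml -> {"year": "2013", "collection": "BILLS"}
#   .../bulkdata/BILLSTATUS/113s/sitemap.xml      -> {"bulkdata": True, "collection": "BILLSTATUS",
#                                                     "grouping": "113s", "congress": "113"}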
# The root of the main documents collections sitemap.
if url == fdsys_baseurl + "fdsys/sitemap.xml":
return { }
# A year sitemap under the main documents root.
m = re.match(re.escape(fdsys_baseurl) + r"fdsys/sitemap_(\d+)/sitemap_\d+.xml$", url)
if m:
return { "year": m.group(1) }
# A regular collection sitemap.
m = re.match(re.escape(fdsys_baseurl) + r"fdsys/sitemap_(\d+)/\d+_(.*)_sitemap.xml$", url)
if m:
return { "year": m.group(1), "collection": m.group(2) }
if url == fdsys_baseurl + "bulkdata/sitemapindex.xml":
return { "bulkdata": True }
# The root of a bulkdata collection. Bulk data sitemaps
# aren't grouped by year in the same way the regular
# collections are.
m = re.match(re.escape(fdsys_baseurl) + r"bulkdata/(.*)/sitemapindex.xml$", url)
if m:
return { "bulkdata": True, "collection": m.group(1) }
# Bulk data collections have subdivisions; for example, BILLS is
# subdivided by Congress+bill-type strings (like "113s" for
# 113th Congress Senate bills).
m = re.match(re.escape(fdsys_baseurl) + r"bulkdata/(.*)/([^/]+)/sitemap.xml$", url)
if m:
return_data = { "bulkdata": True, "collection": m.group(1), "grouping": m.group(2) }
congress_match = re.match(r"^([0-9]+)", m.group(2))
if return_data["collection"] == "BILLSTATUS" and congress_match:
return_data['congress'] = congress_match.group(1)
return return_data
raise ValueError("Unrecognized sitemap URL: " + url + " (" + "->".join(how_we_got_here) + ")")
def skip_sitemap(subject, options):
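# Returns True if the command-line options exclude this sitemap. For example
# (illustrative values), --years=2013 skips a subject with {"year": "2012"},
# and --congress=">113" skips BILLSTATUS groupings for the 113th Congress and
# earlier.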
# Which years should we download? All if none is specified.
if "year" in subject and options.get("years", "").strip() != "":
only_years = set(options.get("years").split(","))
if subject["year"] not in only_years:
return True
# Which collections should we download? All if none is specified.
if "collection" in subject and options.get("collections", "").strip() != "":
only_collections = set(options.get("collections").split(","))
if subject["collection"] not in only_collections:
return True
# Which congresses should we download? All if none is specified.
if "congress" in subject and options.get("congress", "").strip() != "":
# If we're looking for congresses after a certain one.
if options.get("congress")[0] == '>':
if int(subject["congress"]) <= int(options.get("congress")[1:]):
return True
else:
only_congress = set(options.get("congress").split(","))
if subject["congress"] not in only_congress:
return True
return False
def get_sitemap_cache_files(subject):
# Where should we store the local cache of the sitemap XML and a file
# that stores its <lastmod> date for when we last downloaded it? Returns
# a path relative to the cache root.
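# For example (illustrative), the 2013 BILLS sitemap is cached at
# fdsys/sitemap/2013/BILLS/sitemap.xml (lastmod in .../sitemap-lastmod.txt),
# while a bulk data BILLSTATUS grouping lands at
# fdsys/sitemap/BILLSTATUS/113s/sitemap.xml.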
cache_file = "fdsys/sitemap"
if "year" in subject:
# The main document collections have years, but the bulk data
# collections don't.
cache_file = os.path.join(cache_file, subject["year"])
if "collection" in subject:
# The root sitemap for the main collections doesn't have a "collection" name.
cache_file = os.path.join(cache_file, subject["collection"])
if "grouping" in subject:
# Some bulk data sitemaps have what we're calling groupings.
cache_file = os.path.join(cache_file, subject["grouping"])
cache_file = os.path.join(cache_file, "sitemap.xml")
lastmod_cache_file = cache_file.replace(".xml", "-lastmod.txt")
return (cache_file, lastmod_cache_file)
def should_download_sitemap(lastmod_cache_file, current_lastmod, options):
# Download a sitemap or just read from our cache?
if not current_lastmod:
# No lastmod is known for this file (it's the root of a sitemap
# tree - this is the first web request).
return True
elif options.get("force", False):
# User requests downloading everything.
return True
elif options.get("cached", False):
# User requests downloading nothing.
return False
else:
# Download if the lastmod from the parent sitemap doesn't agree with
# the lastmod stored on disk.
return current_lastmod != utils.read(lastmod_cache_file)
def format_item_for_listing(item):
# Helper function for the --list command.
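# Produces lines like "BILLS 2013" for regular collections and
# "BILLSTATUS (bulkdata) 113s" for bulk data collections (illustrative values).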
ret = item["collection"]
if item.get("bulkdata"):
ret += " (bulkdata)"
if item.get("year"):
# for regular collections
ret += " " + item["year"]
if item.get("grouping"):
# for bulk data collections
ret += " " + item["grouping"]
return ret
# Downloading Packages
def mirror_package(sitemap, package_name, lastmod, content_detail_url, options):
"""Create a local mirror of a FDSys package."""
# Return a list of files we downloaded.
results = []
if not options.get("granules", False):
# Most packages are just a package. This is the usual case.
results = mirror_package_or_granule(sitemap, package_name, None, lastmod, options)
else:
# In some collections, like STATUTE, each document has subparts which are not
# described in the sitemap. Load the main HTML page and scrape for the sub-files.
# In the STATUTE collection, the MODS information in granules is redundant with
# information in the top-level package MODS file. But the only way to get granule-
# level PDFs is to go through the granules.
content_index = utils.download(content_detail_url,
"fdsys/package/%s/%s/%s.html" % (sitemap["year"], sitemap["collection"], package_name),
utils.merge(options, {
'binary': True,
}))
if not content_index:
raise Exception("Failed to download %s" % content_detail_url)
for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
if link.text == "More":
m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
if not m or m.group(1) != package_name:
raise Exception("Unmatched granule URL %s" % link.get("href"))
granule_name = m.group(2)
results = mirror_package_or_granule(sitemap, package_name, granule_name, lastmod, options)
return results
def mirror_package_or_granule(sitemap, package_name, granule_name, lastmod, options):
# Return a list of files we downloaded.
results = []
# Where should we store the file? Each collection has a different
# file system layout (for BILLS, we put bill text alongside where the
# bills scraper puts bills).
path = get_output_path(sitemap, package_name, granule_name, options)
if not path:
return # should skip
# Get the lastmod times of the files previously saved for this package.
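# lastmod.json maps each file type to the sitemap lastmod that was in effect
# when we last downloaded that file, e.g. (illustrative)
# {"pdf": "2014-07-31T12:00:00Z", "mods": "2014-07-31T12:00:00Z"}.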
file_lastmod_changed = False
file_lastmod = { }
lastmod_cache_file = path + "/lastmod.json"
if os.path.exists(lastmod_cache_file):
file_lastmod = json.load(open(lastmod_cache_file))
# Try downloading files for each file type.
targets = get_package_files(package_name, granule_name)
for file_type, (file_url, relpath) in targets.items():
# Does the user want to save this file type? If the user didn't
# specify --store, save everything. Otherwise only save the
# file types asked for.
if options.get("store", "") and file_type not in options["store"].split(","):
continue
# Do we already have this file updated? The file_lastmod JSON
# stores the lastmod from the sitemap at the time we downloaded
# the individual file.
if file_lastmod.get(file_type) == lastmod:
if not options.get("force", False):
continue
# With --cached, skip if the file is already downloaded.
file_path = os.path.join(path, relpath)
if os.path.exists(file_path) and options.get("cached", False):
continue
# Download.
logging.warn("Downloading: " + file_path)
data = utils.download(file_url, file_path, utils.merge(options, {
'binary': True,
'force': True, # decision to cache was made above
'to_cache': False,
'return_status_code_on_error': True,
'needs_content': (file_type == "text" and file_path.endswith(".html")),
}))
results.append(file_path)
# Download failed?
if data == 404:
# Not all packages have all file types. Just check the ones we know
# must be there.
if file_type in ("pdf", "zip"):
# expected to be present for all packages
raise Exception("Failed to download %s %s (404)" % (package_name, file_type))
elif sitemap["collection"] == "BILLS" and file_type in ("text", "mods"):
# expected to be present for bills
raise Exception("Failed to download %s %s (404)" % (package_name, file_type))
elif data is True:
# Download was successful but needs_content was False, so we don't have the
# file content. Instead, True is returned. Strangely, isinstance(True, int) is
# True (!!!), so we have to test for True separately from testing whether we
# got a status code integer.
pass
elif not data or isinstance(data, int):
# There was some other error - skip the rest. Don't
# update file_lastmod!
continue
# Update the lastmod of the downloaded file. If the download failed
# because of a 404, we still update this to indicate that the file
# definitively does not exist. We won't try fetching it again.
file_lastmod[file_type] = lastmod
file_lastmod_changed = True
# The "text" format files are put in an HTML container. Unwrap it into a .txt file.
# TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
# html.fromstring does auto-detection.
if file_type == "text" and file_path.endswith(".html"):
file_path_text = file_path[0:-4] + "txt"
logging.info("Unwrapping HTML to: " + file_path_text)
with open(file_path_text, "w") as f:
f.write(unwrap_text_in_html(data))
if sitemap["collection"] == "BILLS" and file_type == "mods":
# When we download bill files, also create the text-versions/data.json file
# which extracts commonly used components of the MODS XML, whenever we update
# that MODS file.
extract_bill_version_metadata(package_name, path)
# Write the current last modified date back to disk so we know the next time whether
# we need to fetch the files for this sitemap item. Assuming we fetched anything.
# If nothing new was fetched, then there is no reason to update the file.
if file_lastmod and file_lastmod_changed:
utils.write(json.dumps(file_lastmod), lastmod_cache_file)
return results
def get_bill_id_for_package(package_name, with_version=True, restrict_to_congress=None):
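# Parses a bill package name into a bill id. For example, "BILLS-113hconres66ih"
# yields "hconres66-113-ih", or the tuple ("hconres66-113", "ih") when
# with_version=False.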
m = re.match(r"BILL(?:S|STATUS)-(\d+)([a-z]+)(\d+)([a-z][a-z0-9]*|)$", package_name)
if not m:
raise Exception("Unmatched bill document package name: " + package_name)
congress, bill_type, bill_number, version_code = m.groups()
if restrict_to_congress and int(congress) != int(restrict_to_congress):
return None
if not with_version:
return ("%s%s-%s" % (bill_type, bill_number, congress), version_code)
else:
return "%s%s-%s-%s" % (bill_type, bill_number, congress, version_code)
def get_output_path(sitemap, package_name, granule_name, options):
# Where to store the document files?
# The path will depend a bit on the collection.
if sitemap["collection"] == "BILLS":
# Store with the other bill data ([congress]/bills/[billtype]/[billtype][billnumber]).
bill_and_ver = get_bill_id_for_package(package_name, with_version=False, restrict_to_congress=options.get("congress"))
if not bill_and_ver:
return None # congress number does not match options["congress"]
from bills import output_for_bill
bill_id, version_code = bill_and_ver
return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)
elif sitemap["collection"] == "CRPT":
# Store committee reports in [congress]/crpt/[reporttype].
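# (e.g. a hypothetical CRPT-113hrpt100 would be stored under
# [data_dir]/113/crpt/hrpt/hrpt100)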
m = re.match(r"CRPT-(\d+)([hse]rpt)(\d+)$", package_name)
if not m:
raise ValueError(package_name)
congress, report_type, report_number = m.groups()
if options.get("congress") and congress != options.get("congress"):
return None # congress number does not match options["congress"]
return "%s/%s/%s/%s/%s" % (utils.data_dir(), congress, sitemap["collection"].lower(), report_type, report_type + report_number)
else:
# Store in fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), sitemap["collection"], sitemap["year"], package_name)
if granule_name:
path += "/" + granule_name
return path
def get_package_files(package_name, granule_name):
# What URL are the package files at? Return a tuple of the remote
# URL and a relative filename for storing it locally.
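# For example (illustrative), for the package BILLS-113hconres66ih the PDF
# URL is https://www.gpo.gov/fdsys/pkg/BILLS-113hconres66ih/pdf/BILLS-113hconres66ih.pdf
# and it is stored locally under the generic name document.pdf.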
baseurl = "https://www.gpo.gov/fdsys/pkg/%s" % package_name
if not granule_name:
# For regular packages, the URL layout is...
baseurl2 = baseurl
file_name = package_name
else:
# For granules, the URL layout is...
baseurl2 = "https://www.gpo.gov/fdsys/granule/%s/%s" % (package_name, granule_name)
file_name = granule_name
ret = {
'mods': (baseurl2 + "/mods.xml", "mods.xml"),
'pdf': (baseurl + "/pdf/" + file_name + ".pdf", "document.pdf"),
'xml': (baseurl + "/xml/" + file_name + ".xml", "document.xml"),
'text': (baseurl + "/html/" + file_name + ".htm", "document.html"), # text wrapped in HTML!
'premis': (baseurl + "/premis.xml", "premis.xml")
}
if granule_name:
# Granules don't have PREMIS files.
del ret['premis']
if package_name.startswith("STATUTE-"):
# Statutes at Large don't have XML.
del ret['xml']
return ret
def unwrap_text_in_html(data):
text_content = unicode(html.fromstring(data).text_content())
return text_content.encode("utf8")
# Downloading bulk data files
def mirror_bulkdata_file(sitemap, url, item_path, lastmod, options):
# Return a list of files we downloaded.
results = []
# Where should we store the file?
path = "%s/fdsys/%s/%s" % (utils.data_dir(), sitemap["collection"], item_path)
# For BILLSTATUS, store this where the rest of the bill status data is
# stored.
if sitemap["collection"] == "BILLSTATUS":
from bills import output_for_bill
bill_id, version_code = get_bill_id_for_package(os.path.splitext(os.path.basename(item_path))[0], with_version=False)
path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)
# Where should we store the lastmod found in the sitemap so that
# we can tell later if the file has changed?
lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"
# Do we already have this file up to date?
if os.path.exists(lastmod_cache_file) and not options.get("force", False):
if lastmod == utils.read(lastmod_cache_file):
return
# With --cached, skip if the file is already downloaded.
if os.path.exists(path) and options.get("cached", False):
return
# Download.
logging.warn("Downloading: " + path)
data = utils.download(url, path, utils.merge(options, {
'binary': True,
'force': True, # decision to cache was made above
'to_cache': False,
}))
results.append(path)
if not data:
# Something failed.
return
# Write the current last modified date back to disk so we know the next time whether
# we need to fetch the file again.
utils.write(lastmod, lastmod_cache_file)
return results
def extract_bill_version_metadata(package_name, text_path):
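# Writes a small data.json summarizing the MODS file. The output looks roughly
# like this (illustrative values):
# {"bill_version_id": "hconres66-113-ih", "version_code": "ih",
#  "issued_on": "2013-...", "urls": {"html": "...", "pdf": "...", "xml": "..."}}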
bill_version_id = get_bill_id_for_package(package_name)
bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)
bill_version = {
'bill_version_id': bill_version_id,
'version_code': version_code,
'urls': {},
}
mods_ns = {"mods": "http://www.loc.gov/mods/v3"}
doc = etree.parse(os.path.join(text_path, "mods.xml"))
locations = doc.xpath("//mods:location/mods:url", namespaces=mods_ns)
for location in locations:
label = location.attrib['displayLabel']
if "HTML" in label:
format = "html"
elif "PDF" in label:
format = "pdf"
elif "XML" in label:
format = "xml"
else:
format = "unknown"
bill_version["urls"][format] = location.text
bill_version["issued_on"] = doc.xpath("string(//mods:dateIssued)", namespaces=mods_ns)
utils.write(
json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
output_for_bill_version(bill_version_id)
)
def output_for_bill_version(bill_version_id):
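# e.g. (illustrative) the bill version id "hconres66-113-ih" maps to
# [data_dir]/113/bills/hconres/hconres66/text-versions/ih/data.json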
bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)
return "%s/%s/bills/%s/%s%s/text-versions/%s/data.json" % (utils.data_dir(), congress, bill_type, bill_type, number, version_code)