mirror of
https://github.com/unitedstates/congress.git
synced 2026-03-26 17:00:04 -04:00
475 lines
20 KiB
Python
475 lines
20 KiB
Python
# Cache FDSys sitemaps to get a list of available documents.
|
|
#
|
|
# ./run fdsys [--year=XXXX]
|
|
# Caches the complete FDSys sitemap. Uses lastmod times in
|
|
# sitemaps to only download new files. Use --year to only
|
|
# update a particular year (for testing, I guess).
|
|
#
|
|
# ./run fdsys --list-collections
|
|
# Dumps a list of the names of GPO's collections.
|
|
#
|
|
# ./run fdsys --collections=BILLS,STATUTE
|
|
# Only fetch sitemaps for these collections.
|
|
#
|
|
# ./run fdsys --cached|force
|
|
# Always/never use the cache.
|
|
#
|
|
# # ./run fdsys --collections=BILLS --congress=XXX
|
|
# Updates the sitemaps for the years of the indicated Congress
|
|
# and then outputs text-versions.json next to each bill data.json
|
|
# file from the bills scraper.
|
|
#
|
|
# ./run fdsys ... --store mods,pdf
|
|
# When downloading, also locally mirror the MODS and PDF documents
|
|
# associated with each package. Update as the sitemap indicates.
|
|
# Pass --granules to locally cache only granule files (e.g. the
|
|
# individual statute files w/in a volume).
|
|
|
|
from lxml import etree, html
|
|
import glob, json, re, logging, os.path
|
|
import utils
|
|
|
|
# for xpath
|
|
ns = { "x": "http://www.sitemaps.org/schemas/sitemap/0.9" }
|
|
|
|
def run(options):
|
|
# GPO FDSys organizes its sitemaps by publication year (the date of
|
|
# original print publication) and then by colletion (bills, statutes,
|
|
# etc.).
|
|
|
|
# Which collections should we download? All if none is specified.
|
|
fetch_collections = None
|
|
if options.get("collections", "").strip() != "":
|
|
fetch_collections = set(options.get("collections").split(","))
|
|
|
|
# Update our cache of the complete FDSys sitemap.
|
|
update_sitemap_cache(fetch_collections, options)
|
|
if options.get("list-collections", False): return
|
|
|
|
# Locally store MODS, PDF, etc.
|
|
if "store" in options:
|
|
mirror_files(fetch_collections, options)
|
|
|
|
# Create a JSON file listing all available bill text documents.
|
|
# Only if --collections is omitted or specifies BILLS, and if
|
|
# --congress is specified.
|
|
if (not fetch_collections or "BILLS" in fetch_collections) and options.get('congress', None):
|
|
update_bill_version_list(int(options.get('congress')))
|
|
|
|
def update_sitemap_cache(fetch_collections, options):
|
|
"""Updates a local cache of the complete FDSys sitemap tree.
|
|
Pass fetch_collections as None, or to restrict the update to
|
|
particular FDSys collections a set of collection names. Only
|
|
downloads changed sitemap files."""
|
|
|
|
seen_collections = set()
|
|
|
|
# Load the root sitemap.
|
|
master_sitemap = get_sitemap(None, None, None, options)
|
|
if master_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex": raise Exception("Mismatched sitemap type at the root sitemap.")
|
|
|
|
# Process the year-by-year sitemaps.
|
|
for year_node in master_sitemap.xpath("x:sitemap", namespaces=ns):
|
|
# Get year and lastmod date.
|
|
url = str(year_node.xpath("string(x:loc)", namespaces=ns))
|
|
lastmod = str(year_node.xpath("string(x:lastmod)", namespaces=ns))
|
|
m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/sitemap_(\d+).xml", url)
|
|
if not m or m.group(1) != m.group(2): raise ValueError("Unmatched sitemap URL: %s" % url)
|
|
year = m.group(1)
|
|
|
|
# Should we process this year's sitemaps?
|
|
if options.get("congress", None) and int(year) not in utils.get_congress_years(int(options.get("congress"))): continue
|
|
if options.get("year", None) and int(year) != int(options.get("year")): continue
|
|
|
|
# Get the sitemap.
|
|
year_sitemap = get_sitemap(year, None, lastmod, options)
|
|
if year_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex": raise Exception("Mismatched sitemap type in %s sitemap." % year)
|
|
|
|
# Process the collection sitemaps.
|
|
for collection_node in year_sitemap.xpath("x:sitemap", namespaces=ns):
|
|
# Get collection and lastmod date.
|
|
url = str(collection_node.xpath("string(x:loc)", namespaces=ns))
|
|
lastmod = str(collection_node.xpath("string(x:lastmod)", namespaces=ns))
|
|
m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/(\d+)_(.*)_sitemap.xml", url)
|
|
if not m or m.group(1) != year or m.group(2) != year: raise ValueError("Unmatched sitemap URL: %s" % url)
|
|
collection = m.group(3)
|
|
|
|
# To help the user find a collection name, record this collection but don't download it.
|
|
if options.get("list-collections", False):
|
|
seen_collections.add(collection)
|
|
continue
|
|
|
|
# Should we download the sitemap?
|
|
if fetch_collections and collection not in fetch_collections:
|
|
continue
|
|
|
|
# Get the sitemap.
|
|
collection_sitemap = get_sitemap(year, collection, lastmod, options)
|
|
if collection_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type in %s_%s sitemap." % (year, collection))
|
|
|
|
if options.get("list-collections", False):
|
|
print "\n".join(sorted(seen_collections))
|
|
|
|
def get_sitemap(year, collection, lastmod, options):
|
|
"""Gets a single sitemap, downloading it if the sitemap has changed.
|
|
|
|
Downloads the root sitemap (year==None, collection==None), or
|
|
the sitemap for a year (collection==None), or the sitemap for
|
|
a particular year and collection. Pass lastmod which is the current
|
|
modification time of the file according to its parent sitemap, which
|
|
is how it knows to return a cached copy.
|
|
|
|
Returns the sitemap parsed into a DOM.
|
|
"""
|
|
|
|
# Construct the URL and the path to where to cache the file on disk.
|
|
if year == None:
|
|
url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
|
|
path = "fdsys/sitemap/sitemap.xml"
|
|
elif collection == None:
|
|
url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
|
|
path = "fdsys/sitemap/%s/sitemap.xml" % year
|
|
else:
|
|
url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
|
|
path = "fdsys/sitemap/%s/%s.xml" % (year, collection)
|
|
|
|
# Should we re-download the file?
|
|
lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
|
|
if options.get("cached", False):
|
|
# If --cached is used, don't hit the network.
|
|
force = False
|
|
elif not lastmod:
|
|
# No *current* lastmod date is known for this file (because it is the master
|
|
# sitemap file, probably), so always download.
|
|
force = True
|
|
else:
|
|
# If the file is out of date or --force is used, download the file.
|
|
cache_lastmod = utils.read(lastmod_cache_file)
|
|
force = (lastmod != cache_lastmod) or options.get("force", False)
|
|
|
|
if force:
|
|
logging.warn("Downloading: %s" % url)
|
|
|
|
body = utils.download(url, path, utils.merge(options, {
|
|
'force': force,
|
|
'xml': True
|
|
}))
|
|
|
|
if not body:
|
|
raise Exception("Failed to download %s" % url)
|
|
|
|
# Write the current last modified date to disk so we know the next time whether
|
|
# we need to fetch the file.
|
|
if lastmod and not options.get("cached", False):
|
|
utils.write(lastmod, lastmod_cache_file)
|
|
|
|
return etree.fromstring(body)
|
|
|
|
|
|
# uses get_sitemap, but returns a list of tuples of date and url
|
|
def entries_from_collection(year, collection, lastmod, options):
|
|
if (not collection) or (not year):
|
|
raise Exception("This method requires a specific year and collection.")
|
|
|
|
sitemap = get_sitemap(year, collection, lastmod, options)
|
|
|
|
entries = []
|
|
|
|
for entry_node in sitemap.xpath("x:url", namespaces=ns):
|
|
url = str(entry_node.xpath("string(x:loc)", namespaces=ns))
|
|
lastmod = str(entry_node.xpath("string(x:lastmod)", namespaces=ns))
|
|
entries.append((url, lastmod))
|
|
|
|
return entries
|
|
|
|
|
|
|
|
def mirror_files(fetch_collections, options):
|
|
"""Create a local mirror of FDSys document files. Only downloads
|
|
changed files, according to the sitemap. Run update_sitemap_cache first.
|
|
|
|
Pass fetch_collections as None, or to restrict the update to
|
|
particular FDSys collections a set of collection names.
|
|
|
|
Set options["store"] to a comma-separated list of file types (pdf,
|
|
mods, text, xml).
|
|
"""
|
|
|
|
# For determining whether we need to process a sitemap file again on a later
|
|
# run, we need to make a key out of the command line arguments that affect
|
|
# which files we are downloading.
|
|
cache_options_key = repr(tuple(sorted(kv for kv in options.items() if kv[0] in ("store", "year", "congress", "granules", "cached"))))
|
|
|
|
file_types = options["store"].split(",")
|
|
|
|
# Process each FDSys sitemap...
|
|
for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
|
|
# Should we process this file?
|
|
year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
|
|
if "year" in options and year != options["year"]: continue
|
|
if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])): continue
|
|
if fetch_collections and collection not in fetch_collections: continue
|
|
|
|
# Has this sitemap changed since the last successful mirror?
|
|
#
|
|
# The sitemap's last modification time is stored in ...-lastmod.txt,
|
|
# which comes from the sitemap's parent sitemap's lastmod listing for
|
|
# the file.
|
|
#
|
|
# Compare that to the lastmod value of when we last did a successful mirror.
|
|
# This function can be run to fetch different sets of files, so get the
|
|
# lastmod value corresponding to the current run arguments.
|
|
sitemap_store_state_file = re.sub(r"\.xml$", "-store-state.json", sitemap)
|
|
sitemap_last_mod = open(re.sub(r"\.xml$", "-lastmod.txt", sitemap)).read()
|
|
if os.path.exists(sitemap_store_state_file):
|
|
sitemap_store_state = json.load(open(sitemap_store_state_file))
|
|
if sitemap_store_state.get(cache_options_key) == sitemap_last_mod:
|
|
# sitemap hasn't changed since the last time
|
|
continue
|
|
|
|
logging.info("scanning " + sitemap + "...")
|
|
|
|
# Load the sitemap for this year & collection, and loop through each document.
|
|
for package_name, lastmod in get_sitemap_entries(sitemap):
|
|
# Add this package to the download list.
|
|
file_list = []
|
|
|
|
if not options.get("granules", False):
|
|
# Doing top-level package files (granule==None).
|
|
file_list.append(None)
|
|
|
|
else:
|
|
# In some collections, like STATUTE, each document has subparts which are not
|
|
# described in the sitemap. Load the main HTML page and scrape for the sub-files.
|
|
# In the STATUTE collection, the MODS information in granules is redudant with
|
|
# information in the top-level package MODS file. But the only way to get granule-
|
|
# level PDFs is to go through the granules.
|
|
content_detail_url = "http://www.gpo.gov/fdsys/pkg/%s/content-detail.html" % package_name
|
|
content_index = utils.download(content_detail_url,
|
|
"fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
|
|
utils.merge(options, {
|
|
'xml': True, # it's not XML but this avoid unescaping HTML which fails if there are unicode characters
|
|
}))
|
|
if not content_index: raise Exception("Failed to download %s" % content_detail_url)
|
|
for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
|
|
if link.text == "More":
|
|
m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
|
|
if not m or m.group(1) != package_name: raise Exception("Unmatched granule URL %s" % link.get("href"))
|
|
granule_name = m.group(2)
|
|
file_list.append(granule_name)
|
|
|
|
# Download the files of the desired types.
|
|
for granule_name in file_list:
|
|
mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options)
|
|
|
|
# If we got this far, we successfully downloaded all of the files in this year/collection.
|
|
# To speed up future updates, save the lastmod time of this sitemap in a file indicating
|
|
# what we downloaded. The store-state file contains a JSON mapping of command line options
|
|
# to the most recent lastmod value for this sitemap.
|
|
sitemap_store_state = { }
|
|
if os.path.exists(sitemap_store_state_file):
|
|
sitemap_store_state = json.load(open(sitemap_store_state_file))
|
|
sitemap_store_state[cache_options_key] = sitemap_last_mod
|
|
json.dump(sitemap_store_state, open(sitemap_store_state_file, "w"))
|
|
|
|
def get_sitemap_entries(sitemap_filename):
|
|
# Load the XML file.
|
|
dom = etree.parse(sitemap_filename).getroot()
|
|
if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type.")
|
|
|
|
# Loop through entries.
|
|
for file_node in dom.xpath("x:url", namespaces=ns):
|
|
# Get URL and last modified timestamp.
|
|
url = str(file_node.xpath("string(x:loc)", namespaces=ns))
|
|
lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
|
|
if not url.endswith("/content-detail.html"): raise Exception("Unrecognized file pattern.")
|
|
|
|
# Get the package name.
|
|
m = re.match("http://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
|
|
if not m: raise Exception("Unmatched document URL")
|
|
package_name = m.group(1)
|
|
|
|
yield package_name, lastmod
|
|
|
|
def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options):
|
|
# Where should we store the file?
|
|
path = get_output_path(year, collection, package_name, granule_name, options)
|
|
if not path: return # should skip
|
|
|
|
# Do we need to update this record?
|
|
lastmod_cache_file = path + "/lastmod.txt"
|
|
cache_lastmod = utils.read(lastmod_cache_file)
|
|
force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)
|
|
|
|
# Try downloading files for each file type.
|
|
targets = get_package_files(package_name, granule_name, path)
|
|
for file_type in file_types:
|
|
if file_type not in targets: raise Exception("Invalid file type: %s" % file_type)
|
|
f_url, f_path = targets[file_type]
|
|
|
|
if (not force) and os.path.exists(f_path): continue # we already have the current file
|
|
logging.warn("Downloading: " + f_path)
|
|
data = utils.download(f_url, f_path, utils.merge(options, {
|
|
'xml': True,
|
|
'force': force,
|
|
'to_cache': False
|
|
}))
|
|
|
|
if not data:
|
|
if file_type == "pdf":
|
|
# expected to be present for all packages
|
|
raise Exception("Failed to download %s" % package_name)
|
|
else:
|
|
# not all packages have all file types, but assume this is OK
|
|
logging.error("file not found: " + f_url)
|
|
|
|
if file_type == "text" and f_path.endswith(".html"):
|
|
# The "text" format files are put in an HTML container. Unwrap it into a .txt file.
|
|
# TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
|
|
# html.fromstring does auto-detection.
|
|
with open(f_path[0:-4] + "txt", "w") as f:
|
|
text_content = unicode(html.fromstring(data).text_content())
|
|
f.write(text_content.encode("utf8"))
|
|
|
|
# Write the current last modified date to disk so we know the next time whether
|
|
# we need to fetch the files for this sitemap item.
|
|
if lastmod and not options.get("cached", False):
|
|
utils.write(lastmod, lastmod_cache_file)
|
|
|
|
def get_output_path(year, collection, package_name, granule_name, options):
|
|
# Where to store the document files?
|
|
# The path will depend a bit on the collection.
|
|
if collection == "BILLS":
|
|
# Store with the other bill data.
|
|
m = re.match(r"BILLS-(\d+)([a-z]+)(\d+)(\D.*)", package_name)
|
|
if not m: raise Exception("Unmatched bill document package name: " + package_name)
|
|
congress, bill_type, bill_number, version_code = m.groups()
|
|
congress = int(congress)
|
|
if "congress" in options and congress != int(options["congress"]): return None
|
|
return output_for_bill(congress, bill_type, bill_number, "text-versions/" + version_code)
|
|
else:
|
|
# Store in fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
|
|
path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)
|
|
if granule_name: path += "/" + granule_name
|
|
return path
|
|
|
|
def get_package_files(package_name, granule_name, path):
|
|
baseurl = "http://www.gpo.gov/fdsys/pkg/%s/" % package_name
|
|
baseurl_mods = baseurl
|
|
|
|
if not granule_name:
|
|
file_name = package_name
|
|
else:
|
|
file_name = granule_name
|
|
baseurl_mods = "http://www.gpo.gov/fdsys/granule/%s/%s/" % (package_name, granule_name)
|
|
|
|
ret = {
|
|
'mods': (baseurl_mods + "mods.xml", path + "/mods.xml"),
|
|
'pdf': (baseurl + "pdf/" + file_name + ".pdf", path + "/document.pdf"),
|
|
'xml': (baseurl + "xml/" + file_name + ".xml", path + "/document.xml"),
|
|
'text': (baseurl + "html/" + file_name + ".htm", path + "/document.html"), # text wrapped in HTML
|
|
}
|
|
if not granule_name:
|
|
# granules don't have PREMIS files?
|
|
ret['premis'] = (baseurl + "premis.xml", path + "/premis.xml")
|
|
|
|
return ret
|
|
|
|
def update_bill_version_list(only_congress):
|
|
bill_versions = { }
|
|
|
|
# Which sitemap years should we look at?
|
|
if not only_congress:
|
|
sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
|
|
else:
|
|
# If --congress=X is specified, only look at the relevant years.
|
|
sitemap_files = [utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml" for year in utils.get_congress_years(only_congress)]
|
|
sitemap_files = [f for f in sitemap_files if os.path.exists(f)]
|
|
|
|
# For each year-by-year BILLS sitemap...
|
|
for year_sitemap in sitemap_files:
|
|
dom = etree.parse(year_sitemap).getroot()
|
|
if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type.")
|
|
|
|
# Loop through each bill text version...
|
|
for file_node in dom.xpath("x:url", namespaces=ns):
|
|
# get URL and last modified date
|
|
url = str(file_node.xpath("string(x:loc)", namespaces=ns))
|
|
lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
|
|
|
|
# extract bill congress, type, number, and version from the URL
|
|
m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
|
|
if not m: raise Exception("Unmatched bill document URL: " + url)
|
|
congress, bill_type, bill_number, version_code = m.groups()
|
|
congress = int(congress)
|
|
if bill_type not in utils.thomas_types: raise Exception("Invalid bill type: " + url)
|
|
|
|
# If --congress=XXX is specified, only look at those bills.
|
|
if only_congress and congress != only_congress:
|
|
continue
|
|
|
|
# Track the documents by congress, bill type, etc.
|
|
bill_versions\
|
|
.setdefault(congress, { })\
|
|
.setdefault(bill_type, { })\
|
|
.setdefault(bill_number, { })\
|
|
[version_code] = {
|
|
"url": url,
|
|
"lastmod": lastmod,
|
|
}
|
|
|
|
# Output the bill version info. We can't do this until the end because we need to get
|
|
# the complete list of versions for a bill before we write the file, and the versions
|
|
# may be split across multiple sitemap files.
|
|
|
|
for congress in bill_versions:
|
|
for bill_type in bill_versions[congress]:
|
|
for bill_number in bill_versions[congress][bill_type]:
|
|
utils.write(
|
|
json.dumps(bill_versions[congress][bill_type][bill_number],
|
|
sort_keys=True, indent=2, default=utils.format_datetime),
|
|
output_for_bill(congress, bill_type, bill_number, "text-versions.json")
|
|
)
|
|
|
|
|
|
def output_for_bill(congress, bill_type, number, fn):
|
|
# Similar to bills.output_for_bill
|
|
return "%s/%d/bills/%s/%s%s/%s" % (utils.data_dir(), congress, bill_type, bill_type, number, fn)
|
|
|
|
# given a FDsys filename (e.g. BILLS-113hr302ih), fetch the MODS doc, and return:
|
|
# issued_on: the date the referenced document was issued (<dateIssued>)
|
|
# urls: a dict of forms of this doc (<location>)
|
|
def document_info_for(filename, cache, options):
|
|
mods_url = mods_for(filename)
|
|
mods_cache = ""
|
|
body = utils.download(mods_url,
|
|
cache,
|
|
utils.merge(options, {'xml': True})
|
|
)
|
|
|
|
doc = etree.fromstring(body)
|
|
mods_ns = {"mods": "http://www.loc.gov/mods/v3"}
|
|
|
|
locations = doc.xpath("//mods:location/mods:url", namespaces=mods_ns)
|
|
|
|
urls = {}
|
|
for location in locations:
|
|
label = location.attrib['displayLabel']
|
|
if "HTML" in label:
|
|
format = "html"
|
|
elif "PDF" in label:
|
|
format = "pdf"
|
|
elif "XML" in label:
|
|
format = "xml"
|
|
else:
|
|
format = "unknown"
|
|
urls[format] = location.text
|
|
|
|
issued_on = doc.xpath("string(//mods:dateIssued)", namespaces=mods_ns)
|
|
|
|
return issued_on, urls
|
|
|
|
|
|
def mods_for(filename):
|
|
return "http://www.gpo.gov/fdsys/pkg/%s/mods.xml" % filename
|