mirror of https://github.com/unitedstates/congress.git
synced 2026-03-26 17:00:04 -04:00
Merge pull request #229 from unitedstates/govinfogov
switch fdsys scraper from fdsys to govinfo.gov
@@ -8,7 +8,7 @@ Includes:

 * Scrapers for House and Senate roll call votes.

-* A scraper for GPO FDSys, the official repository for most legislative documents.
+* A document fetcher for GovInfo.gov, which holds bill text, bill status, and other official documents.

 * A defunct THOMAS scraper for presidential nominations in Congress.
@@ -74,13 +74,13 @@ where data-type is one of:

 * `votes` (see [Votes](https://github.com/unitedstates/congress/wiki/votes))
 * `nominations` (see [Nominations](https://github.com/unitedstates/congress/wiki/nominations))
 * `committee_meetings` (see [Committee Meetings](https://github.com/unitedstates/congress/wiki/committee-meetings))
-* `fdsys` (see [Bill Text](https://github.com/unitedstates/congress/wiki/bill-text))
+* `govinfo` (see [Bill Text](https://github.com/unitedstates/congress/wiki/bill-text))
 * `statutes` (see [Bills](https://github.com/unitedstates/congress/wiki/bills) and [Bill Text](https://github.com/unitedstates/congress/wiki/bill-text))

-To scrape bills, resolutions, and amendments from THOMAS, run:
+To get data for bills, resolutions, and amendments, run:

 ```bash
-./run fdsys --collections=BILLSTATUS
+./run govinfo --bulkdata=BILLSTATUS
 ./run bills
 ```
@@ -1,6 +1,6 @@
 #!/bin/sh
 # Refresh the bulk data collection.
-./run fdsys --bulkdata=True --collections=BILLSTATUS
+./run govinfo --bulkdata=BILLSTATUS

 # Turn into JSON and GovTrack-XML.
 ./run bills --govtrack $@
@@ -1,3 +1,3 @@
-./run fdsys --collections=STATUTE --store=mods,pdf
+./run govinfo --collections=STATUTE --extract=mods,pdf
 ./run statutes --volumes=65-86 --govtrack # bill status
 ./run statutes --volumes=65-106 --textversions --extracttext # bill text
@@ -6,7 +6,7 @@ import xmltodict

 import bill_info
 import amendment_info
-import fdsys
+import govinfo
 import utils

@@ -34,7 +34,7 @@ def get_bills_to_process(options):
     # Return a generator over bill_ids that need to be processed.
     # Every time we process a bill we copy the fdsys_billstatus-lastmod.txt
     # file to data-fromfdsys-lastmod.txt, next to data.json. This way we
-    # know when the FDSys XML file has changed.
+    # know when the GovInfo (formerly FDSys) XML file has changed.

     def get_data_path(*args):
         # Utility function to generate a part of the path
@@ -83,9 +83,9 @@ def get_bills_to_process(options):
             key = lambda x : int(x.replace(bill_type, ""))
             ):

-            fn = get_data_path(congress, bill_type, bill_type_and_number, fdsys.FDSYS_BILLSTATUS_FILENAME)
+            fn = get_data_path(congress, bill_type, bill_type_and_number, govinfo.FDSYS_BILLSTATUS_FILENAME)
             if os.path.exists(fn):
-                # The FDSys bulk data file exists. Does our JSON data
+                # The GovInfo.gov bulk data file exists. Does our JSON data
                 # file need to be updated?
                 bulkfile_lastmod = utils.read(fn.replace(".xml", "-lastmod.txt"))
                 parse_lastmod = utils.read(get_data_path(congress, bill_type, bill_type_and_number, "data-fromfdsys-lastmod.txt"))
@@ -125,7 +125,7 @@ def process_bill(bill_id, options):
     }

 def _path_to_billstatus_file(bill_id):
-    return output_for_bill(bill_id, fdsys.FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)
+    return output_for_bill(bill_id, govinfo.FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)

 def read_fdsys_bulk_bill_status_file(fn, bill_id):
     fdsys_billstatus = utils.read(fn)
@@ -203,7 +203,7 @@ def build_bill_id(bill_type, bill_number, congress):

 def billstatus_url_for(bill_id):
     bill_type, bill_number, congress = utils.split_bill_id(bill_id)
-    return fdsys.BULKDATA_BASE_URL + 'BILLSTATUS/{0}/{1}/BILLSTATUS-{0}{1}{2}.xml'.format(congress, bill_type, bill_number)
+    return govinfo.BULKDATA_BASE_URL + 'BILLSTATUS/{0}/{1}/BILLSTATUS-{0}{1}{2}.xml'.format(congress, bill_type, bill_number)

 def output_for_bill(bill_id, format, is_data_dot=True):
     bill_type, number, congress = utils.split_bill_id(bill_id)
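For orientation (not part of the diff): with the new base URL, `billstatus_url_for` now points at GovInfo.gov bulk data. A minimal sketch of the URL it builds, where `split_bill_id` is a simplified stand-in for `utils.split_bill_id` and the bill id is just an example:

```python
import re

# Assumed to match govinfo.BULKDATA_BASE_URL in the new task below.
BULKDATA_BASE_URL = "https://www.govinfo.gov/bulkdata/"

def split_bill_id(bill_id):
    # Simplified stand-in for utils.split_bill_id: "hr1-115" -> ("hr", "1", "115").
    return re.match(r"^([a-z]+)(\d+)-(\d+)$", bill_id).groups()

bill_type, bill_number, congress = split_bill_id("hr1-115")
url = BULKDATA_BASE_URL + 'BILLSTATUS/{0}/{1}/BILLSTATUS-{0}{1}{2}.xml'.format(congress, bill_type, bill_number)
print(url)  # https://www.govinfo.gov/bulkdata/BILLSTATUS/115/hr/BILLSTATUS-115hr1.xml
```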
tasks/fdsys.py (deleted, 688 lines)
@@ -1,688 +0,0 @@
# Downloads documents from GPO FDSys, using their sitemaps
# to efficiently determine what needs to be updated.
#
# ./run fdsys --list
#   Dumps a list of the names of GPO's collections and the years
#   they have data in (since most collections are divided by year
#   of document publication).
#
# ./run fdsys --collections=BILLS --bulkdata=False
#   Download bill text (from the primary FDSys BILLS collection;
#   there's also a bulk data BILLS collection but it has less in
#   it).
#
# Options:
#
# --collections=BILLS,BILLSTATUS,STATUTE,...
#   Restricts the downloads to just the named collections. For
#   BILLS, you should probably also specify --bulkdata=True/False.
#   If omitted, downloads files from all collections.
#
# --bulkdata=True|False
#   Download regular document collections or bulk data collections.
#   If omitted, downloads all. But there's a problem:
#   The BILLS collection occurs both as a regular documents
#   collection (bill text in multiple formats) and as a bulk
#   data collection (just XML starting recently). This flag is
#   how you can distinguish which one you want.
#
# --years=2001[,2002,2004]
#   Comma-separated list of years to download from (does not
#   apply to bulk data collections, which are not divided by
#   year).
#
# --congress=113[,114]
#   Comma-separated list of congresses to download from (only for
#   BILLSTATUS). Alternate format:
#
# --congress=">113"
#   Specify a number to get all congresses *after* the value (only for
#   BILLSTATUS). The quotes are necessary for this format.
#
# --store=mods,pdf,text,xml,premis
#   Save the MODS, PDF, text, XML, or PREMIS file associated
#   with each package. If omitted, stores every file for each
#   package.
#
# --filter="regex"
#   Only stores files that match the regex. Regular collections
#   are matched against the package name (i.e. BILLS-113hconres66ih)
#   while bulk data items are matched against their file path
#   (i.e. 113/1/hconres/BILLS-113hconres66ih.xml).
#
# --granules
#   Some collections, like STATUTE, have "granules" inside each
#   package (a package is a volume of the Statutes at Large, while
#   a granule is an extracted portion for a particular public law).
#   With --granules, saves the individual granules instead of the
#   main package files.
#
# --cached|--force
#   Always/never use the cache.

from lxml import etree, html
import glob
import json
import re
import logging
import os.path
import zipfile
import utils

import rtyaml

# globals
fdsys_baseurl = "https://www.gpo.gov/smap/"
BULKDATA_BASE_URL = "https://www.gpo.gov/fdsys/bulkdata/"
FDSYS_BILLSTATUS_FILENAME = "fdsys_billstatus.xml"

# for xpath
ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}


# Main entry point

def run(options):
    # GPO FDSys organizes its sitemaps by publication year (the date of
    # original print publication) and then by collection (bills, statutes,
    # etc.). There are additional unconnected sitemaps for each bulk
    # data collection.

    # Update our cache of the complete FDSys sitemap and download package
    # files as requested in the command-line options.
    listing = []
    update_sitemap_cache(options, listing)

    # With --list, just output all of the available data on FDSys
    # (the collection names, and the years each collection is available in, etc.).
    if options.get("list", False):
        listing = map(format_item_for_listing, listing)
        listing.sort()
        for item in listing:
            print item


# Processing the Sitemaps

def update_sitemap_cache(options, listing):
    """Updates the local cache of the complete FDSys sitemap trees,
    only downloading changed sitemap files."""

    # with --bulkdata=False, or not specified
    if options.get("bulkdata", None) in (None, False):
        # Process the main sitemap index for all of the document collections.
        update_sitemap(fdsys_baseurl + "fdsys/sitemap.xml", None, [], options, listing)

    # with --bulkdata=True, or not specified
    if options.get("bulkdata", None) in (None, True):
        # Process the bulk data sitemap index.
        update_sitemap(fdsys_baseurl + "bulkdata/sitemapindex.xml", None, [], options, listing)

def update_sitemap(url, current_lastmod, how_we_got_here, options, listing):
    """Updates the local cache of a sitemap file."""

    # Return a list of files we downloaded.
    results = []

    # What is this sitemap for?
    subject = extract_sitemap_subject_from_url(url, how_we_got_here)

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Does the user want to process this sitemap?
    if skip_sitemap(subject, options):
        return

    # Get the file paths to cache:
    # * the sitemap XML for future runs
    # * its <lastmod> date (which comes from the parent sitemap) so we know if we need to re-download it now
    # * the <lastmod> dates of the packages listed in this sitemap so we know if we need to re-download any package files
    (cache_file, lastmod_cache_file) = get_sitemap_cache_files(subject)
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)
    if not os.path.exists(lastmod_cache_file):
        lastmod_cache = { }

        # Migrate from old flat file format.
        if os.path.exists(lastmod_cache_file.replace(".yaml", ".txt")):
            lastmod_cache["lastmod"] = utils.read(lastmod_cache_file.replace(".yaml", ".txt"))
    else:
        with open(lastmod_cache_file) as f:
            lastmod_cache = rtyaml.load(f)

    # Download anew if the current_lastmod doesn't match the stored lastmod
    # in our cache, and if --cache is not specified. Or if --force is given.
    # If we're not downloading it, load it from disk because we still have
    # to process each sitemap to ensure we've downloaded all of the package
    # files the user wants.
    download = should_download_sitemap(lastmod_cache.get("lastmod"), current_lastmod, options)

    # Download, or just retrieve from cache.
    if download:
        logging.warn("Downloading: %s" % url)
    body = utils.download(
        url,
        cache_file,
        utils.merge(options, {
            'force': download,
            'binary': True
        }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # If we downloaded a new file, update the lastmod for our cache.
    if download and current_lastmod:
        lastmod_cache["lastmod"] = current_lastmod

    # Load the XML.
    try:
        sitemap = etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))

    # Process the entries.
    if sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":

        # This is a sitemap index. Process the sitemaps listed in this
        # sitemapindex recursively.
        for node in sitemap.xpath("x:sitemap", namespaces=ns):
            # Get URL and lastmod date of the sitemap.
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
            sitemap_results = update_sitemap(url, lastmod, how_we_got_here, options, listing)
            if sitemap_results is not None:
                results = results + sitemap_results

    elif sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":

        # This is a regular sitemap with content items listed.

        # For the --list command, remember that this sitemap had some data.
        # And then return --- don't download any package files.
        if options.get("list"):
            listing.append(subject)
            return

        # Process the items.
        for node in sitemap.xpath("x:url", namespaces=ns):
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))

            if not subject.get("bulkdata"):
                # This is a regular collection item.
                #
                # Get the "package" name, i.e. a particular document (which has
                # one or more file formats within it).
                m = re.match("https://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched package URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                package_name = m.group(1)
                if options.get("filter") and not re.search(options["filter"], package_name): continue
                mirror_results = mirror_package(subject, package_name, lastmod, lastmod_cache.setdefault("packages", {}), url, options)
                if mirror_results is not None and len(mirror_results) > 0:
                    results = results + mirror_results

            else:
                # This is a bulk data item. Extract components of the URL.
                m = re.match(re.escape(BULKDATA_BASE_URL) + re.escape(subject["collection"]) + "/(.+)", url)
                if not m:
                    raise Exception("Unmatched bulk data file URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                item_path = m.group(1)
                if options.get("filter") and not re.search(options["filter"], item_path): continue
                mirror_results = mirror_bulkdata_file(subject, url, item_path, lastmod, options)
                if mirror_results is not None and len(mirror_results) > 0:
                    results = results + mirror_results

    else:
        raise Exception("Unknown sitemap type (%s) at the root sitemap of %s." % (sitemap.tag, url))

    # Write the updated last modified dates to disk so we know the next time whether
    # we need to fetch the files. If we didn't download anything, no need to write an
    # empty file.
    if lastmod_cache:
        with open(lastmod_cache_file, "w") as f:
            rtyaml.dump(lastmod_cache, f)

    return results


def extract_sitemap_subject_from_url(url, how_we_got_here):
    # The root of the main documents collections sitemap.
    if url == fdsys_baseurl + "fdsys/sitemap.xml":
        return { }

    # A year sitemap under the main documents root.
    m = re.match(re.escape(fdsys_baseurl) + r"fdsys/sitemap_(\d+)/sitemap_\d+.xml$", url)
    if m:
        return { "year": m.group(1) }

    # A regular collection sitemap.
    m = re.match(re.escape(fdsys_baseurl) + r"fdsys/sitemap_(\d+)/\d+_(.*)_sitemap.xml$", url)
    if m:
        return { "year": m.group(1), "collection": m.group(2) }

    if url == fdsys_baseurl + "bulkdata/sitemapindex.xml":
        return { "bulkdata": True }

    # The root of a bulkdata collection. Bulk data sitemaps
    # aren't grouped by year in the same way the regular
    # collections are.
    m = re.match(re.escape(fdsys_baseurl) + r"bulkdata/(.*)/sitemapindex.xml$", url)
    if m:
        return { "bulkdata": True, "collection": m.group(1) }

    # Bulk data collections have subdivisions, like for BILLS it's
    # subdivided by Congress+bill-type strings (like "113s" for
    # 113th Congress, "S." (senate) bills).
    m = re.match(re.escape(fdsys_baseurl) + r"bulkdata/(.*)/([^/]+)/sitemap.xml$", url)
    if m:
        return_data = { "bulkdata": True, "collection": m.group(1), "grouping": m.group(2) }
        congress_match = re.match(r"^([0-9]+)", m.group(2))
        if return_data["collection"] == "BILLSTATUS" and congress_match:
            return_data['congress'] = congress_match.group(1)

        return return_data

    raise ValueError("Unrecognized sitemap URL: " + url + " (" + "->".join(how_we_got_here) + ")")


def skip_sitemap(subject, options):
    # Which years should we download? All if none is specified.
    if "year" in subject and options.get("years", "").strip() != "":
        only_years = set(options.get("years").split(","))
        if subject["year"] not in only_years:
            return True

    # Which collections should we download? All if none is specified.
    if "collection" in subject and options.get("collections", "").strip() != "":
        only_collections = set(options.get("collections").split(","))
        if subject["collection"] not in only_collections:
            return True

    # Which congresses should we download? All if none is specified.
    if "congress" in subject and options.get("congress", "").strip() != "":
        # If we're looking for congresses after a certain one.
        if options.get("congress")[0] == '>':
            if int(subject["congress"]) <= int(options.get("congress")[1:]):
                return True
        else:
            only_congress = set(options.get("congress").split(","))
            if subject["congress"] not in only_congress:
                return True

    return False


def get_sitemap_cache_files(subject):
    # Where should we store the local cache of the sitemap XML and a file
    # that stores its <lastmod> date for when we last downloaded it? Returns
    # a path relative to the cache root.

    cache_file = "fdsys/sitemap"

    if "year" in subject:
        # The main document collections have years, but the bulk data
        # collections don't.
        cache_file = os.path.join(cache_file, subject["year"])

    if "collection" in subject:
        # The root sitemap for the main collections doesn't have a "collection" name.
        cache_file = os.path.join(cache_file, subject["collection"])

    if "grouping" in subject:
        # Some bulk data sitemaps have what we're calling groupings.
        cache_file = os.path.join(cache_file, subject["grouping"])

    cache_file = os.path.join(cache_file, "sitemap.xml")

    lastmod_cache_file = cache_file.replace(".xml", "-lastmod.yaml")

    return (cache_file, lastmod_cache_file)


def should_download_sitemap(lastmod_cache, current_lastmod, options):
    # Download a sitemap or just read from our cache?

    if not current_lastmod:
        # No lastmod is known for this file (it's the root of a sitemap
        # tree - this is the first web request).
        return True

    elif options.get("force", False):
        # User requests downloading everything.
        return True

    elif options.get("cached", False):
        # User requests downloading nothing.
        return False

    else:
        # Download if the lastmod from the parent sitemap doesn't agree with
        # the lastmod stored on disk.
        return current_lastmod != lastmod_cache


def format_item_for_listing(item):
    # Helper function for the --list command.

    ret = item["collection"]
    if item.get("bulkdata"):
        ret += " (bulkdata)"

    if item.get("year"):
        # for regular collections
        ret += " " + item["year"]

    if item.get("grouping"):
        # for bulk data collections
        ret += " " + item["grouping"]

    return ret


# Downloading Packages

def mirror_package(sitemap, package_name, lastmod, lastmod_cache, content_detail_url, options):
    """Create a local mirror of a FDSys package."""

    # Return a list of files we downloaded.
    results = []

    if not options.get("granules", False):
        # Most packages are just a package. This is the usual case.
        results = mirror_package_or_granule(sitemap, package_name, None, lastmod, lastmod_cache, options)

    else:
        # In some collections, like STATUTE, each document has subparts which are not
        # described in the sitemap. Load the main HTML page and scrape for the sub-files.
        # In the STATUTE collection, the MODS information in granules is redundant with
        # information in the top-level package MODS file. But the only way to get granule-
        # level PDFs is to go through the granules.
        content_index = utils.download(content_detail_url,
            "fdsys/package/%s/%s/%s.html" % (sitemap["year"], sitemap["collection"], package_name),
            utils.merge(options, {
                'binary': True,
            }))
        if not content_index:
            raise Exception("Failed to download %s" % content_detail_url)
        for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
            if link.text == "More":
                m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                if not m or m.group(1) != package_name:
                    raise Exception("Unmatched granule URL %s" % link.get("href"))
                granule_name = m.group(2)
                results = mirror_package_or_granule(sitemap, package_name, granule_name, lastmod, lastmod_cache, options)

    return results


def mirror_package_or_granule(sitemap, package_name, granule_name, lastmod, lastmod_cache, options):
    # Return a list of files we downloaded.
    results = []

    # Where should we store the file? Each collection has a different
    # file system layout (for BILLS, we put bill text alongside where the
    # bills scraper puts bills).
    path = get_output_path(sitemap, package_name, granule_name, options)
    if not path:
        return  # should skip

    # Go to the part of the lastmod_cache for this package.
    lastmod_cache = lastmod_cache.setdefault(package_name, {})
    if granule_name: lastmod_cache = lastmod_cache.setdefault(granule_name, {})
    lastmod_cache = lastmod_cache.setdefault("files", {})

    # Migrate old cache storage:
    # Get the lastmod times of the files previously saved for this package.
    lastmod_cache_file = path + "/lastmod.json"
    if not lastmod_cache and os.path.exists(lastmod_cache_file):
        lastmod_cache.update(json.load(open(lastmod_cache_file)))

    # Try downloading files for each file type.
    targets = get_package_files(package_name, granule_name)
    for file_type, (file_url, relpath) in targets.items():
        # Does the user want to save this file type? If the user didn't
        # specify --store, save everything. Otherwise only save the
        # file types asked for.
        if options.get("store", "") and file_type not in options["store"].split(","):
            continue

        # Do we already have this file updated?
        if lastmod_cache.get(file_type) == lastmod:
            if not options.get("force", False):
                continue

        # With --cached, skip if the file is already downloaded.
        file_path = os.path.join(path, relpath)
        if os.path.exists(file_path) and options.get("cached", False):
            continue

        # Download.
        logging.warn("Downloading: " + file_path)
        data = utils.download(file_url, file_path, utils.merge(options, {
            'binary': True,
            'force': True, # decision to cache was made above
            'to_cache': False,
            'return_status_code_on_error': True,
            'needs_content': (file_type == "text" and file_path.endswith(".html")),
        }))
        results.append(file_path)

        # Download failed?
        if data == 404:
            # Not all packages have all file types. Just check the ones we know
            # must be there.
            if file_type in ("pdf", "zip"):
                # expected to be present for all packages
                raise Exception("Failed to download %s %s (404)" % (package_name, file_type))
            elif sitemap["collection"] == "BILLS" and file_type in ("text", "mods"):
                # expected to be present for bills
                raise Exception("Failed to download %s %s (404)" % (package_name, file_type))
        elif data is True:
            # Download was successful but needs_content was False so we don't have the
            # file content. Instead, True is returned. Strangely isinstance(True, int) is
            # True (!!!) so we have to test for True separately from testing if we got a
            # return code integer.
            pass
        elif not data or isinstance(data, int):
            # There was some other error - skip the rest. Don't
            # update lastmod_cache!
            continue

        # Update the lastmod of the downloaded file. If the download failed
        # because of a 404, we still update this to indicate that the file
        # definitively does not exist. We won't try fetching it again.
        lastmod_cache[file_type] = lastmod

        # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
        # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
        # html.fromstring does auto-detection.
        if file_type == "text" and file_path.endswith(".html"):
            file_path_text = file_path[0:-4] + "txt"
            logging.info("Unwrapping HTML to: " + file_path_text)
            with open(file_path_text, "w") as f:
                f.write(unwrap_text_in_html(data))

        if sitemap["collection"] == "BILLS" and file_type == "mods":
            # When we download bill files, also create the text-versions/data.json file
            # which extracts commonly used components of the MODS XML, whenever we update
            # that MODS file.
            extract_bill_version_metadata(package_name, path)

    return results


def get_bill_id_for_package(package_name, with_version=True, restrict_to_congress=None):
    m = re.match(r"BILL(?:S|STATUS)-(\d+)([a-z]+)(\d+)([a-z][a-z0-9]*|)$", package_name)
    if not m:
        raise Exception("Unmatched bill document package name: " + package_name)
    congress, bill_type, bill_number, version_code = m.groups()

    if restrict_to_congress and int(congress) != int(restrict_to_congress):
        return None

    if not with_version:
        return ("%s%s-%s" % (bill_type, bill_number, congress), version_code)
    else:
        return "%s%s-%s-%s" % (bill_type, bill_number, congress, version_code)


def get_output_path(sitemap, package_name, granule_name, options):
    # Where to store the document files?

    # The path will depend a bit on the collection.
    if sitemap["collection"] == "BILLS":
        # Store with the other bill data ([congress]/bills/[billtype]/[billtype][billnumber]).
        bill_and_ver = get_bill_id_for_package(package_name, with_version=False, restrict_to_congress=options.get("congress"))
        if not bill_and_ver:
            return None  # congress number does not match options["congress"]
        from bills import output_for_bill
        bill_id, version_code = bill_and_ver
        return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)

    elif sitemap["collection"] == "CRPT":
        # Store committee reports in [congress]/crpt/[reporttype].
        m = re.match(r"CRPT-(\d+)([hse]rpt)(\d+)$", package_name)
        if not m:
            raise ValueError(package_name)
        congress, report_type, report_number = m.groups()
        if options.get("congress") and congress != options.get("congress"):
            return None  # congress number does not match options["congress"]
        return "%s/%s/%s/%s/%s" % (utils.data_dir(), congress, sitemap["collection"].lower(), report_type, report_type + report_number)

    else:
        # Store in fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
        path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), sitemap["collection"], sitemap["year"], package_name)
        if granule_name:
            path += "/" + granule_name
        return path


def get_package_files(package_name, granule_name):
    # What URL are the package files at? Return a tuple of the remote
    # URL and a relative filename for storing it locally.

    baseurl = "https://www.gpo.gov/fdsys/pkg/%s" % package_name

    if not granule_name:
        # For regular packages, the URL layout is...
        baseurl2 = baseurl
        file_name = package_name
    else:
        # For granules, the URL layout is...
        baseurl2 = "https://www.gpo.gov/fdsys/granule/%s/%s" % (package_name, granule_name)
        file_name = granule_name

    ret = {
        'mods': (baseurl2 + "/mods.xml", "mods.xml"),
        'pdf': (baseurl + "/pdf/" + file_name + ".pdf", "document.pdf"),
        'xml': (baseurl + "/xml/" + file_name + ".xml", "document.xml"),
        'text': (baseurl + "/html/" + file_name + ".htm", "document.html"), # text wrapped in HTML!
        'premis': (baseurl + "/premis.xml", "premis.xml")
    }

    if granule_name:
        # Granules don't have PREMIS files.
        del ret['premis']

    if package_name.startswith("STATUTE-"):
        # Statutes at Large don't have XML.
        del ret['xml']

    return ret


def unwrap_text_in_html(data):
    text_content = unicode(html.fromstring(data).text_content())
    return text_content.encode("utf8")


# Downloading bulk data files

def mirror_bulkdata_file(sitemap, url, item_path, lastmod, options):
    # Return a list of files we downloaded.
    results = []

    # Where should we store the file?
    path = "%s/fdsys/%s/%s" % (utils.data_dir(), sitemap["collection"], item_path)

    # For BILLSTATUS, store this along with where we store the rest of bill
    # status data.
    if sitemap["collection"] == "BILLSTATUS":
        from bills import output_for_bill
        bill_id, version_code = get_bill_id_for_package(os.path.splitext(os.path.basename(item_path))[0], with_version=False)
        path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)

    # Where should we store the lastmod found in the sitemap so that
    # we can tell later if the file has changed?
    lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"

    # Do we already have this file up to date?
    if os.path.exists(lastmod_cache_file) and not options.get("force", False):
        if lastmod == utils.read(lastmod_cache_file):
            return

    # With --cached, skip if the file is already downloaded.
    if os.path.exists(path) and options.get("cached", False):
        return

    # Download.
    logging.warn("Downloading: " + path)
    data = utils.download(url, path, utils.merge(options, {
        'binary': True,
        'force': True, # decision to cache was made above
        'to_cache': False,
    }))
    results.append(path)

    if not data:
        # Something failed.
        return

    # Write the current last modified date back to disk so we know the next time whether
    # we need to fetch the file again.
    utils.write(lastmod, lastmod_cache_file)

    return results


def extract_bill_version_metadata(package_name, text_path):
    bill_version_id = get_bill_id_for_package(package_name)

    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)

    bill_version = {
        'bill_version_id': bill_version_id,
        'version_code': version_code,
        'urls': {},
    }

    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}
    doc = etree.parse(os.path.join(text_path, "mods.xml"))
    locations = doc.xpath("//mods:location/mods:url", namespaces=mods_ns)

    for location in locations:
        label = location.attrib['displayLabel']
        if "HTML" in label:
            format = "html"
        elif "PDF" in label:
            format = "pdf"
        elif "XML" in label:
            format = "xml"
        else:
            format = "unknown"
        bill_version["urls"][format] = location.text

    bill_version["issued_on"] = doc.xpath("string(//mods:dateIssued)", namespaces=mods_ns)

    utils.write(
        json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_bill_version(bill_version_id)
    )

def output_for_bill_version(bill_version_id):
    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)
    return "%s/%s/bills/%s/%s%s/text-versions/%s/data.json" % (utils.data_dir(), congress, bill_type, bill_type, number, version_code)
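For comparison with the replacement task below: the deleted task fetched each format at its own URL. A runnable sketch (not part of the diff) of the per-file URL layout that `get_package_files` produced for a regular, non-granule package, using the example package name from the `--filter` documentation:

```python
# Reconstructs the URL layout from get_package_files above.
package_name = "BILLS-113hconres66ih"  # example package from the --filter docs
baseurl = "https://www.gpo.gov/fdsys/pkg/%s" % package_name

targets = {
    'mods': baseurl + "/mods.xml",
    'pdf': baseurl + "/pdf/" + package_name + ".pdf",
    'xml': baseurl + "/xml/" + package_name + ".xml",
    'text': baseurl + "/html/" + package_name + ".htm",  # text wrapped in HTML
    'premis': baseurl + "/premis.xml",
}
for file_type, url in sorted(targets.items()):
    print(file_type, url)
```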
tasks/govinfo.py (new file, 544 lines)
@@ -0,0 +1,544 @@
# Downloads documents from GPO's GovInfo.gov site, using sitemaps
# to efficiently determine what needs to be updated. See
# https://www.govinfo.gov/sitemaps for a list of collections.
# This service was formerly called "FDSys."
#
# ./run govinfo --collections=BILLS,STATUTE,...
#   Download bill text (from the BILLS collection; there's also a bulk
#   data BILLS collection but it has less in it), the Statutes at Large,
#   and other documents from GovInfo.gov's non-bulk-data collections.
#
# ./run govinfo --bulkdata=BILLSTATUS,FR,...
#   Download bill status, the Federal Register, and other documents
#   from GovInfo.gov's bulk data collections. (The BILLS collection occurs
#   both as a regular collection (bill text in multiple formats) and as
#   a bulk data collection (just XML starting recently). Use --bulkdata=BILLS
#   to get the bulk data collection.)
#
# Options:
#
# --years=2001[,2002,2004]
#   Comma-separated list of years to download from. Applies to collections
#   that are divided by year.
#
# --congress=113[,114]
#   Comma-separated list of congresses to download from. Applies to bulk
#   data collections like BILLSTATUS that are grouped by Congress + Bill Type.
#
# --extract=mods,pdf,text,xml,premis
#   Extract the MODS, PDF, text, XML, or PREMIS file associated
#   with each package from the downloaded package ZIP file.
#
# --filter="regex"
#   Only stores files that match the regex. Regular collections
#   are matched against the package name (i.e. BILLS-113hconres66ih)
#   while bulk data items are matched against their file path
#   (i.e. 113/1/hconres/BILLS-113hconres66ih.xml).
#
# --cached|--force
#   Always/never use the cache.

from lxml import etree, html
import glob
import json
import re
import logging
import os.path
import zipfile
import utils

import rtyaml


# globals
GOVINFO_BASE_URL = "https://www.govinfo.gov/"
COLLECTION_BASE_URL = GOVINFO_BASE_URL + "app/details/"
BULKDATA_BASE_URL = GOVINFO_BASE_URL + "bulkdata/"
COLLECTION_SITEMAPINDEX_PATTERN = GOVINFO_BASE_URL + "sitemap/{collection}_sitemap_index.xml"
BULKDATA_SITEMAPINDEX_PATTERN = GOVINFO_BASE_URL + "sitemap/bulkdata/{collection}/sitemapindex.xml"
FDSYS_BILLSTATUS_FILENAME = "fdsys_billstatus.xml"

# for xpath
ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}


# Main entry point

def run(options):
    # Process sitemaps.
    for collection in sorted(options.get("collections", "").split(",")):
        if collection != "":
            update_sitemap(COLLECTION_SITEMAPINDEX_PATTERN.format(collection=collection), None, [], options)
    for collection in sorted(options.get("bulkdata", "").split(",")):
        if collection != "":
            update_sitemap(BULKDATA_SITEMAPINDEX_PATTERN.format(collection=collection), None, [], options)

def update_sitemap(url, current_lastmod, how_we_got_here, options):
    """Updates the local cache of a sitemap file."""

    # Skip if the year or congress flags are set and this sitemap is
    # not for that year or Congress.
    if should_skip_sitemap(url, options):
        return []

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Get the file paths to cache:
    # * the sitemap XML for future runs
    # * its <lastmod> date (which comes from the parent sitemap) so we know if we need to re-download it now
    # * the <lastmod> dates of the packages listed in this sitemap so we know if we need to re-download any package files
    cache_file = get_sitemap_cache_file(url)
    cache_file = os.path.join("govinfo/sitemap", cache_file, "sitemap.xml")
    lastmod_cache_file = cache_file.replace(".xml", "-lastmod.yaml")
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)
    if not os.path.exists(lastmod_cache_file):
        lastmod_cache = { }
    else:
        with open(lastmod_cache_file) as f:
            lastmod_cache = rtyaml.load(f)

    try:
        return update_sitemap2(url, current_lastmod, how_we_got_here, options, lastmod_cache, cache_file)
    finally:
        # Write the updated last modified dates to disk so we know the next time whether
        # we need to fetch the files. If we didn't download anything, no need to write an
        # empty file.
        with utils.NoInterrupt():
            with open(lastmod_cache_file, "w") as f:
                rtyaml.dump(lastmod_cache, f)


def update_sitemap2(url, current_lastmod, how_we_got_here, options, lastmod_cache, cache_file):
    # Return a list of files we downloaded.
    results = []

    # Download anew if the current_lastmod doesn't match the stored lastmod
    # in our cache, and if --cache is not specified. Or if --force is given.
    # If we're not downloading it, load it from disk because we still have
    # to process each sitemap to ensure we've downloaded all of the package
    # files the user wants.
    download = should_download_sitemap(lastmod_cache.get("lastmod"), current_lastmod, options)

    # Download, or just retrieve from cache.
    if download:
        logging.warn("Downloading: %s" % url)
    body = utils.download(
        url,
        cache_file,
        utils.merge(options, {
            'force': download,
            'binary': True
        }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # If we downloaded a new file, update the lastmod for our cache.
    if download and current_lastmod:
        lastmod_cache["lastmod"] = current_lastmod

    # Load the XML.
    try:
        sitemap = etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))

    # Process the entries.
    if sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":

        # This is a sitemap index. Process the sitemaps listed in this
        # sitemapindex recursively.
        for node in sitemap.xpath("x:sitemap", namespaces=ns):
            # Get URL and lastmod date of the sitemap.
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
            sitemap_results = update_sitemap(url, lastmod, how_we_got_here, options)
            if sitemap_results is not None:
                results = results + sitemap_results

    elif sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":

        # This is a regular sitemap with content items listed.

        # Process the items.
        for node in sitemap.xpath("x:url", namespaces=ns):
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))

            m = re.match(COLLECTION_BASE_URL + r"([^-]+)-(.*)", url)
            if m:
                collection = m.group(1)
                package_name = m.group(2)
                if options.get("filter") and not re.search(options["filter"], package_name): continue
                mirror_results = mirror_package(collection, package_name, lastmod, lastmod_cache.setdefault("packages", {}), options)
                results.extend(mirror_results)

            else:
                # This is a bulk data item. Extract components of the URL.
                m = re.match(BULKDATA_BASE_URL + r"([^/]+)/(.*)", url)
                if not m:
                    raise Exception("Unmatched bulk data file URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                collection = m.group(1)
                item_path = m.group(2)
                if options.get("filter") and not re.search(options["filter"], item_path): continue
                mirror_results = mirror_bulkdata_file(collection, url, item_path, lastmod, options)
                if mirror_results is not None and len(mirror_results) > 0:
                    results = results + mirror_results

    else:
        raise Exception("Unknown sitemap type (%s) at the root sitemap of %s." % (sitemap.tag, url))

    return results

def should_skip_sitemap(url, options):
    # Don't skip sitemap indexes.
    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/(\w+)_sitemap_index.xml", url)
    if m:
        return False
    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/bulkdata/(\w+)/sitemapindex.xml", url)
    if m:
        return False

    year_filter = options.get("years", "").strip()
    congress_filter = options.get("congress", "").strip()

    # Regular collections are grouped by publication year.
    # Which years should we download? All if none is specified.
    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/(\w+)_(\d+)_sitemap.xml", url)
    if m:
        year = m.group(2)
        if year_filter != "" and year not in year_filter.split(","):
            return True

    # Bulk data collections are grouped into subdirectories that can
    # represent years (as in the FR collection) or other types of groupings
    # like Congress + Bill Type for the BILLSTATUS collection.
    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/bulkdata/(\w+)/(\d+)(.*)/sitemap.xml", url)
    if m:
        numeric_grouping = m.group(2)
        if year_filter != "" and numeric_grouping not in year_filter.split(","):
            return True
        if congress_filter != "" and numeric_grouping not in congress_filter.split(","):
            return True

    return False

def get_sitemap_cache_file(url):
    # Where should we store the local cache of the sitemap XML and a file
    # that stores its <lastmod> date for when we last downloaded it? Returns
    # a path relative to the cache root.

    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/(\w+)_sitemap_index.xml", url)
    if m:
        return m.group(1)

    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/(\w+)_(\d+)_sitemap.xml", url)
    if m:
        return m.group(1) + "/" + m.group(2)

    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/bulkdata/(\w+)/sitemapindex.xml", url)
    if m:
        return m.group(1) + "-bulkdata"

    m = re.match(re.escape(GOVINFO_BASE_URL) + r"sitemap/bulkdata/(\w+)/(.+)/sitemap.xml", url)
    if m:
        return m.group(1) + "-bulkdata/" + m.group(2)

    raise ValueError(url)

def should_download_sitemap(lastmod_cache, current_lastmod, options):
    # Download a sitemap or just read from our cache?

    if not current_lastmod:
        # No lastmod is known for this file (it's the root of a sitemap
        # tree - this is the first web request).
        return True

    elif options.get("force", False):
        # User requests downloading everything.
        return True

    elif options.get("cached", False):
        # User requests downloading nothing.
        return False

    else:
        # Download if the lastmod from the parent sitemap doesn't agree with
        # the lastmod stored on disk.
        return current_lastmod != lastmod_cache


# Downloading Packages

def mirror_package(collection, package_name, lastmod, lastmod_cache, options):
    """Create a local mirror of a GovInfo.gov package."""

    # Where should we store the file? Each collection has a different
    # file system layout (for BILLS, we put bill text alongside where the
    # bills scraper puts bills).
    path = get_output_path(collection, package_name, options)
    if not path: # should skip
        return []

    # Go to the part of the lastmod_cache for this package.
    lastmod_cache = lastmod_cache.setdefault(package_name, {})
    lastmod_cache = lastmod_cache.setdefault("files", {})

    # Download the package ZIP file. We don't know what formats are available
    # until we download this file, and when we hit particular package files
    # we get 302 HTTP responses, which is uninformative about whether the
    # file is supposed to exist or not. But we can reliably download the ZIP
    # package.
    file_path = os.path.join(path, "package.zip")

    # Download the package ZIP file if it's updated.
    downloaded_files = []
    if mirror_package_zipfile(collection, package_name, file_path, lastmod, lastmod_cache, options):
        downloaded_files.append(file_path)

    # Extract files from the package ZIP file depending on the --extract
    # command-line arguments. We do this even if the package ZIP file has
    # not changed because the --extract arguments might have changed and
    # the caller may want to extract files after having already gotten the
    # package ZIP file.
    extracted_files = extract_package_files(collection, package_name, file_path, lastmod_cache, options)
    downloaded_files.extend(extracted_files)

    return downloaded_files

def mirror_package_zipfile(collection, package_name, file_path, lastmod, lastmod_cache, options):
    # Do we already have this file updated?
    if lastmod_cache.get("package") == lastmod:
        if not options.get("force", False):
            return

    # With --cached, skip if the file is already downloaded.
    if os.path.exists(file_path) and options.get("cached", False):
        return

    # Download.
    file_url = GOVINFO_BASE_URL + "content/pkg/{}-{}.zip".format(collection, package_name)
    logging.warn("Downloading: " + file_path)
    data = utils.download(file_url, file_path, utils.merge(options, {
        'binary': True,
        'force': True, # decision to cache was made above
        'to_cache': False,
        'needs_content': False,
    }))

    # Update the lastmod of the downloaded file.
    lastmod_cache['package'] = lastmod
    return True

def extract_package_files(collection, package_name, package_file, lastmod_cache, options):
    # Extract files from the package ZIP file depending on the --extract
    # command-line argument. When extracting a file, mark the extracted
    # file's lastmod as the same as the package's lastmod.

    # Get the formats that the user wants to extract.
    extract_formats = set(format for format in options.get("extract", "").split(",") if format.strip())

    # Make a mapping from file formats to a tuple of the filename found in the package ZIP
    # file and the filename that we will use to store the extracted format locally.
    format_paths = {
        'pdf': ("{collection}-{package_name}/pdf/{collection}-{package_name}.pdf", "document.pdf"),
        'text': ("{collection}-{package_name}/html/{collection}-{package_name}.htm", "document.html"), # text wrapped in HTML!
        'xml': ("{collection}-{package_name}/xml/{collection}-{package_name}.xml", "document.xml"),
        'mods': ("{collection}-{package_name}/mods.xml", "mods.xml"),
        'premis': ("{collection}-{package_name}/premis.xml", "premis.xml")
    }

    # Extract files only if the package lastmod is newer than the file's lastmod.
    extract_formats = { format for format in extract_formats
                        if lastmod_cache.get(format) is None or lastmod_cache[format] < lastmod_cache['package'] }

    # Don't even bother opening the ZIP file if there are no new files to extract.
    if not extract_formats:
        return []

    # Open the package ZIP file and try to extract files with names
    # we recognize.
    extracted_files = []
    with zipfile.ZipFile(package_file) as package:
        for format in extract_formats:
            if format not in format_paths:
                raise ValueError("invalid format: " + format)

            # Construct the expected path in the package ZIP file and the desired local filename.
            package_path, local_path = format_paths[format]
            package_path = package_path.format(collection=collection, package_name=package_name)
            local_path = os.path.join(os.path.dirname(package_file), local_path)

            # Extract it.
            try:
                with package.open(package_path) as f1:
                    with open(local_path, 'w') as f2:
                        f2.write(f1.read())
            except KeyError:
                # No file of this format is present in this package.
                continue
            finally:
                # Even if the file didn't exist, which is NOT an error condition
                # because not all packages have documents of all formats, update
                # the format's file's lastmod in our cache so that we don't try
                # to extract it again later, unless the package is updated.
                lastmod_cache[format] = lastmod_cache['package']

            logging.warn("Extracted: " + local_path)
            extracted_files.append(local_path)

            # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
            if format == "text":
                file_path_text = local_path.replace(".html", ".txt")
                logging.info("Unwrapping HTML to: " + file_path_text)
                with open(local_path) as f1:
                    with open(file_path_text, "w") as f2:
                        f2.write(unwrap_text_in_html(f1.read()))
                extracted_files.append(file_path_text)

            if collection == "BILLS" and format == "mods":
                # When we download bill files, also create the text-versions/data.json file
                # which extracts commonly used components of the MODS XML, whenever we update
                # that MODS file.
                extract_bill_version_metadata(package_name, os.path.dirname(package_file))

    return extracted_files


def get_bill_id_for_package(package_name, with_version=True, restrict_to_congress=None):
    m = re.match(r"(\d+)([a-z]+)(\d+)([a-z][a-z0-9]*|)$", package_name)
    if not m:
        raise Exception("Unmatched bill document package name: " + package_name)
    congress, bill_type, bill_number, version_code = m.groups()

    if restrict_to_congress and int(congress) != int(restrict_to_congress):
        return None

    if not with_version:
        return ("%s%s-%s" % (bill_type, bill_number, congress), version_code)
    else:
        return "%s%s-%s-%s" % (bill_type, bill_number, congress, version_code)


def get_output_path(collection, package_name, options):
    # Where to store the document files?

    # The path will depend a bit on the collection.
    if collection == "BILLS":
        # Store with the other bill data ([congress]/bills/[billtype]/[billtype][billnumber]).
        bill_and_ver = get_bill_id_for_package(package_name, with_version=False, restrict_to_congress=options.get("congress"))
        if not bill_and_ver:
            return None # congress number does not match options["congress"]
        from bills import output_for_bill
        bill_id, version_code = bill_and_ver
        return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)

    elif collection == "CRPT":
        # Store committee reports in [congress]/crpt/[reporttype].
        m = re.match(r"(\d+)([hse]rpt)(\d+)$", package_name)
        if not m:
            raise ValueError(package_name)
        congress, report_type, report_number = m.groups()
        if options.get("congress") and congress != options.get("congress"):
            return None # congress number does not match options["congress"]
        return "%s/%s/%s/%s/%s" % (utils.data_dir(), congress, collection.lower(), report_type, report_type + report_number)

    else:
        # Store in govinfo/COLLECTION/PKGNAME.
        path = "%s/govinfo/%s/%s" % (utils.data_dir(), collection, package_name)
        return path


def unwrap_text_in_html(data):
    text_content = unicode(html.fromstring(data).text_content())
    return text_content.encode("utf8")


# Downloading bulk data files

def mirror_bulkdata_file(collection, url, item_path, lastmod, options):
    # Return a list of files we downloaded.
    results = []

    # Where should we store the file?
    path = "%s/govinfo/%s/%s" % (utils.data_dir(), collection, item_path)

    # For BILLSTATUS, store this along with where we store the rest of bill
    # status data.
    if collection == "BILLSTATUS":
        from bills import output_for_bill
        bill_id, version_code = get_bill_id_for_package(os.path.splitext(os.path.basename(item_path.replace("BILLSTATUS-", "")))[0], with_version=False)
        path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)

    # Where should we store the lastmod found in the sitemap so that
    # we can tell later if the file has changed?
    lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"

    # Do we already have this file up to date?
    if os.path.exists(lastmod_cache_file) and not options.get("force", False):
        if lastmod == utils.read(lastmod_cache_file):
            return

    # With --cached, skip if the file is already downloaded.
    if os.path.exists(path) and options.get("cached", False):
        return

    # Download.
    logging.warn("Downloading: " + path)
    data = utils.download(url, path, utils.merge(options, {
        'binary': True,
        'force': True, # decision to cache was made above
        'to_cache': False,
    }))
    results.append(path)

    if not data:
        # Something failed.
        return

    # Write the current last modified date back to disk so we know the next time whether
    # we need to fetch the file again.
    utils.write(lastmod, lastmod_cache_file)

    return results


def extract_bill_version_metadata(package_name, text_path):
    bill_version_id = get_bill_id_for_package(package_name)

    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)

    bill_version = {
        'bill_version_id': bill_version_id,
        'version_code': version_code,
        'urls': {},
    }

    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}
    doc = etree.parse(os.path.join(text_path, "mods.xml"))
    locations = doc.xpath("//mods:location/mods:url", namespaces=mods_ns)

    for location in locations:
        label = location.attrib['displayLabel']
        if "HTML" in label:
            format = "html"
        elif "PDF" in label:
            format = "pdf"
        elif "XML" in label:
            format = "xml"
        else:
            format = "unknown"
        bill_version["urls"][format] = location.text

    bill_version["issued_on"] = doc.xpath("string(//mods:dateIssued)", namespaces=mods_ns)

    utils.write(
        json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_bill_version(bill_version_id)
    )

def output_for_bill_version(bill_version_id):
    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)
    return "%s/%s/bills/%s/%s%s/text-versions/%s/data.json" % (utils.data_dir(), congress, bill_type, bill_type, number, version_code)
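The structural shift in the new task: rather than one HTTP request per format, it downloads a single package ZIP and extracts formats locally. A small sketch (not part of the diff) of the ZIP URL and archive-entry paths involved, following `mirror_package_zipfile` and the `format_paths` mapping above; the package name is illustrative:

```python
collection = "BILLS"
package_name = "113hconres66ih"  # illustrative
GOVINFO_BASE_URL = "https://www.govinfo.gov/"

# One ZIP per package, as in mirror_package_zipfile:
zip_url = GOVINFO_BASE_URL + "content/pkg/{}-{}.zip".format(collection, package_name)
print(zip_url)  # https://www.govinfo.gov/content/pkg/BILLS-113hconres66ih.zip

# Entries inside the ZIP follow the format_paths patterns in extract_package_files:
mods_entry = "{collection}-{package_name}/mods.xml".format(
    collection=collection, package_name=package_name)
print(mods_entry)  # BILLS-113hconres66ih/mods.xml
```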
@@ -16,6 +16,7 @@ import scrapelib
 import pprint
 import logging
 import subprocess
+import signal

 import smtplib
 import email.utils
@@ -845,3 +846,31 @@ def translate_legislator_id(source_id_type, source_id, dest_id_type):
         return _translate_legislator_id_cache[(source_id_type, source_id)][dest_id_type]
     except KeyError:
         raise UnmatchedIdentifer(source_id_type, source_id, dest_id_type)
+
+# adapted from https://gist.github.com/tcwalther/ae058c64d5d9078a9f333913718bba95,
+# which was based on http://stackoverflow.com/a/21919644/487556.
+# This provides a with-block object that prevents Ctrl+C (SIGINT)
+# or the TERM signal from interrupting program flow until the
+# with-block exits. This is useful to ensure that file write
+# operations aren't killed mid-write resulting in a corrupt file.
+class NoInterrupt(object):
+    def __init__(self, *signals):
+        if not signals: signals = [signal.SIGTERM, signal.SIGINT]
+        self.sigs = signals
+    def __enter__(self):
+        self.signal_received = {}
+        self.old_handlers = {}
+        for sig in self.sigs:
+            def handler(s, frame, sig=sig): # sig=sig ensures the variable is captured by value
+                self.signal_received[sig] = (s, frame)
+                # Note: in Python 3.5, you can use signal.Signals(sig).name
+                logging.info('Signal %s received. Delaying KeyboardInterrupt.' % sig)
+            self.old_handlers[sig] = signal.signal(sig, handler)
+    def __exit__(self, type, value, traceback):
+        # Restore signal handlers that were in place before entering the with-block.
+        for sig in self.sigs:
+            signal.signal(sig, self.old_handlers[sig])
+        # Issue the signals caught during the with-block.
+        for sig, args in self.signal_received.items():
+            if self.old_handlers[sig]:
+                self.old_handlers[sig](*args)
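A minimal usage sketch of the new `NoInterrupt` helper, mirroring how `update_sitemap` in tasks/govinfo.py wraps its cache write; the file name and payload here are hypothetical:

```python
import utils

# Any SIGINT/SIGTERM received inside the block is deferred and
# re-delivered on exit, so the write cannot be killed mid-stream.
with utils.NoInterrupt():
    with open("sitemap-lastmod.yaml", "w") as f:  # hypothetical path
        f.write("lastmod: '2018-01-01'\n")        # hypothetical payload
```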