# congress/tasks/fdsys.py
#
# Cache FDSys sitemaps to get a list of available documents.
#
# ./run fdsys [--year=XXXX]
# Caches the complete FDSys sitemap. Uses lastmod times in
# sitemaps to only download new files. Use --year to only
# update a particular year (for testing, I guess).
#
# ./run fdsys --list-collections
# Dumps a list of the names of GPO's collections.
#
# ./run fdsys --collections=BILLS,STATUTE
# Only fetch sitemaps for these collections.
#
# ./run fdsys --cached|force
# Always/never use the cache.
#
# ./run fdsys --collections=BILLS --congress=XXX
# Updates the sitemaps for the years of the indicated Congress
# and then outputs text-versions.json next to each bill data.json
# file from the bills scraper.
#
# ./run fdsys ... --store mods,pdf
# When downloading, also locally mirror the MODS and PDF documents
# associated with each package. Update as the sitemap indicates.
# Pass --granules to locally cache only granule files (e.g. the
# individual statute files w/in a volume).
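#
# Mirrored files end up under the data directory, e.g. (illustrative)
# data/fdsys/STATUTE/2011/STATUTE-125/mods.xml, except for the BILLS
# collection, which is stored alongside the bill data produced by the
# bills scraper.
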
from lxml import etree, html
import glob, json, re, logging, os.path
import utils

# for xpath
ns = { "x": "http://www.sitemaps.org/schemas/sitemap/0.9" }
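
# Each sitemap is a standard sitemaps.org XML file. A collection sitemap
# looks roughly like this (illustrative):
#
# <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#   <url>
#     <loc>http://www.gpo.gov/fdsys/pkg/BILLS-113hr302ih/content-detail.html</loc>
#     <lastmod>2013-01-16T03:29:08Z</lastmod>
#   </url>
# </urlset>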

def run(options):
    # GPO FDSys organizes its sitemaps by publication year (the date of
    # original print publication) and then by collection (bills, statutes,
    # etc.).
# Which collections should we download? All if none is specified.
fetch_collections = None
if options.get("collections", "").strip() != "":
fetch_collections = set(options.get("collections").split(","))
# Update our cache of the complete FDSys sitemap.
update_sitemap_cache(fetch_collections, options)
if options.get("list-collections", False): return
# Locally store MODS, PDF, etc.
if "store" in options:
mirror_files(fetch_collections, options)
# Create a JSON file listing all available bill text documents.
# Only if --collections is omitted or specifies BILLS, and if
# --congress is specified.
if (not fetch_collections or "BILLS" in fetch_collections) and options.get('congress', None):
update_bill_version_list(int(options.get('congress')))

def update_sitemap_cache(fetch_collections, options):
    """Updates a local cache of the complete FDSys sitemap tree.
    Pass fetch_collections as None to update all collections, or a
    set of collection names to restrict the update to those
    collections. Only downloads changed sitemap files."""
seen_collections = set()
# Load the root sitemap.
master_sitemap = get_sitemap(None, None, None, options)
if master_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex": raise Exception("Mismatched sitemap type at the root sitemap.")
# Process the year-by-year sitemaps.
for year_node in master_sitemap.xpath("x:sitemap", namespaces=ns):
# Get year and lastmod date.
url = str(year_node.xpath("string(x:loc)", namespaces=ns))
lastmod = str(year_node.xpath("string(x:lastmod)", namespaces=ns))
m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/sitemap_(\d+).xml", url)
if not m or m.group(1) != m.group(2): raise ValueError("Unmatched sitemap URL: %s" % url)
year = m.group(1)
# Should we process this year's sitemaps?
if options.get("congress", None) and int(year) not in utils.get_congress_years(int(options.get("congress"))): continue
if options.get("year", None) and int(year) != int(options.get("year")): continue
# Get the sitemap.
year_sitemap = get_sitemap(year, None, lastmod, options)
if year_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex": raise Exception("Mismatched sitemap type in %s sitemap." % year)
# Process the collection sitemaps.
for collection_node in year_sitemap.xpath("x:sitemap", namespaces=ns):
# Get collection and lastmod date.
url = str(collection_node.xpath("string(x:loc)", namespaces=ns))
lastmod = str(collection_node.xpath("string(x:lastmod)", namespaces=ns))
m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/(\d+)_(.*)_sitemap.xml", url)
if not m or m.group(1) != year or m.group(2) != year: raise ValueError("Unmatched sitemap URL: %s" % url)
collection = m.group(3)
# To help the user find a collection name, record this collection but don't download it.
if options.get("list-collections", False):
seen_collections.add(collection)
continue
# Should we download the sitemap?
if fetch_collections and collection not in fetch_collections:
continue
# Get the sitemap.
collection_sitemap = get_sitemap(year, collection, lastmod, options)
if collection_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type in %s_%s sitemap." % (year, collection))
if options.get("list-collections", False):
print "\n".join(sorted(seen_collections))

def get_sitemap(year, collection, lastmod, options):
    """Gets a single sitemap, downloading it if the sitemap has changed.
    Downloads the root sitemap (year==None, collection==None), the
    sitemap for a year (collection==None), or the sitemap for a
    particular year and collection. lastmod is the file's current
    modification time according to its parent sitemap; it is compared
    against the cached lastmod to decide whether the cached copy is
    still fresh.
    Returns the sitemap parsed into a DOM.
    """
# Construct the URL and the path to where to cache the file on disk.
if year == None:
url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
path = "fdsys/sitemap/sitemap.xml"
elif collection == None:
url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
path = "fdsys/sitemap/%s/sitemap.xml" % year
else:
url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
path = "fdsys/sitemap/%s/%s.xml" % (year, collection)
# Should we re-download the file?
lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
if options.get("cached", False):
# If --cached is used, don't hit the network.
force = False
elif not lastmod:
# No *current* lastmod date is known for this file (because it is the master
# sitemap file, probably), so always download.
force = True
else:
# If the file is out of date or --force is used, download the file.
cache_lastmod = utils.read(lastmod_cache_file)
force = (lastmod != cache_lastmod) or options.get("force", False)
    if force:
        logging.warn("Downloading: %s" % url)
    # Always call download: when force is False it serves the cached copy.
    body = utils.download(url, path, utils.merge(options, {
        'force': force,
        'xml': True
    }))
if not body:
raise Exception("Failed to download %s" % url)
# Write the current last modified date to disk so we know the next time whether
# we need to fetch the file.
if lastmod and not options.get("cached", False):
utils.write(lastmod, lastmod_cache_file)
return etree.fromstring(body)

# Uses get_sitemap, but returns a list of (url, lastmod) tuples.
def entries_from_collection(year, collection, lastmod, options):
if (not collection) or (not year):
raise Exception("This method requires a specific year and collection.")
sitemap = get_sitemap(year, collection, lastmod, options)
entries = []
for entry_node in sitemap.xpath("x:url", namespaces=ns):
url = str(entry_node.xpath("string(x:loc)", namespaces=ns))
lastmod = str(entry_node.xpath("string(x:lastmod)", namespaces=ns))
entries.append((url, lastmod))
return entries

def mirror_files(fetch_collections, options):
    """Create a local mirror of FDSys document files. Only downloads
    changed files, according to the sitemap. Run update_sitemap_cache first.
    Pass fetch_collections as None to mirror all collections, or a set
    of collection names to restrict the mirror to those collections.
    Set options["store"] to a comma-separated list of file types (pdf,
    mods, text, xml).
    """
# For determining whether we need to process a sitemap file again on a later
# run, we need to make a key out of the command line arguments that affect
# which files we are downloading.
cache_options_key = repr(tuple(sorted(kv for kv in options.items() if kv[0] in ("store", "year", "congress", "granules", "cached"))))
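    # For example (illustrative), running with --store=mods,pdf --congress=113
    # produces a key like:
    #   "(('congress', '113'), ('store', 'mods,pdf'))"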
file_types = options["store"].split(",")
# Process each FDSys sitemap...
for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
# Should we process this file?
year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
if "year" in options and year != options["year"]: continue
if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])): continue
if fetch_collections and collection not in fetch_collections: continue
# Has this sitemap changed since the last successful mirror?
#
# The sitemap's last modification time is stored in ...-lastmod.txt,
# which comes from the sitemap's parent sitemap's lastmod listing for
# the file.
#
# Compare that to the lastmod value of when we last did a successful mirror.
# This function can be run to fetch different sets of files, so get the
# lastmod value corresponding to the current run arguments.
sitemap_store_state_file = re.sub(r"\.xml$", "-store-state.json", sitemap)
sitemap_last_mod = open(re.sub(r"\.xml$", "-lastmod.txt", sitemap)).read()
if os.path.exists(sitemap_store_state_file):
sitemap_store_state = json.load(open(sitemap_store_state_file))
if sitemap_store_state.get(cache_options_key) == sitemap_last_mod:
# sitemap hasn't changed since the last time
continue
logging.info("scanning " + sitemap + "...")
# Load the sitemap for this year & collection, and loop through each document.
for package_name, lastmod in get_sitemap_entries(sitemap):
# Add this package to the download list.
file_list = []
if not options.get("granules", False):
# Doing top-level package files (granule==None).
file_list.append(None)
else:
                # In some collections, like STATUTE, each document has subparts which are not
                # described in the sitemap. Load the main HTML page and scrape for the sub-files.
                # In the STATUTE collection, the MODS information in granules is redundant with
                # information in the top-level package MODS file. But the only way to get granule-
                # level PDFs is to go through the granules.
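                # Granule detail pages live at relative URLs like (illustrative):
                #   granule/STATUTE-125/STATUTE-125-Pg1/content-detail.html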
content_detail_url = "http://www.gpo.gov/fdsys/pkg/%s/content-detail.html" % package_name
content_index = utils.download(content_detail_url,
"fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
utils.merge(options, {
                    'xml': True, # it's not XML, but this avoids HTML unescaping, which fails if there are unicode characters
}))
if not content_index: raise Exception("Failed to download %s" % content_detail_url)
for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
if link.text == "More":
m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
if not m or m.group(1) != package_name: raise Exception("Unmatched granule URL %s" % link.get("href"))
granule_name = m.group(2)
file_list.append(granule_name)
# Download the files of the desired types.
for granule_name in file_list:
mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options)
# If we got this far, we successfully downloaded all of the files in this year/collection.
# To speed up future updates, save the lastmod time of this sitemap in a file indicating
# what we downloaded. The store-state file contains a JSON mapping of command line options
# to the most recent lastmod value for this sitemap.
sitemap_store_state = { }
if os.path.exists(sitemap_store_state_file):
sitemap_store_state = json.load(open(sitemap_store_state_file))
sitemap_store_state[cache_options_key] = sitemap_last_mod
json.dump(sitemap_store_state, open(sitemap_store_state_file, "w"))

def get_sitemap_entries(sitemap_filename):
# Load the XML file.
dom = etree.parse(sitemap_filename).getroot()
if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type.")
# Loop through entries.
for file_node in dom.xpath("x:url", namespaces=ns):
# Get URL and last modified timestamp.
url = str(file_node.xpath("string(x:loc)", namespaces=ns))
lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
if not url.endswith("/content-detail.html"): raise Exception("Unrecognized file pattern.")
# Get the package name.
m = re.match("http://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
if not m: raise Exception("Unmatched document URL")
package_name = m.group(1)
yield package_name, lastmod

def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options):
# Where should we store the file?
path = get_output_path(year, collection, package_name, granule_name, options)
if not path: return # should skip
# Do we need to update this record?
lastmod_cache_file = path + "/lastmod.txt"
cache_lastmod = utils.read(lastmod_cache_file)
force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)
# Try downloading files for each file type.
targets = get_package_files(package_name, granule_name, path)
for file_type in file_types:
if file_type not in targets: raise Exception("Invalid file type: %s" % file_type)
f_url, f_path = targets[file_type]
if (not force) and os.path.exists(f_path): continue # we already have the current file
logging.warn("Downloading: " + f_path)
data = utils.download(f_url, f_path, utils.merge(options, {
'xml': True,
'force': force,
'to_cache': False
}))
        if not data:
            if file_type == "pdf":
                # expected to be present for all packages
                raise Exception("Failed to download %s" % package_name)
            else:
                # not all packages have all file types, but assume this is OK
                logging.error("file not found: " + f_url)
                continue # no data was fetched, so skip the text unwrapping below
if file_type == "text" and f_path.endswith(".html"):
# The "text" format files are put in an HTML container. Unwrap it into a .txt file.
# TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
# html.fromstring does auto-detection.
with open(f_path[0:-4] + "txt", "w") as f:
text_content = unicode(html.fromstring(data).text_content())
f.write(text_content.encode("utf8"))
# Write the current last modified date to disk so we know the next time whether
# we need to fetch the files for this sitemap item.
if lastmod and not options.get("cached", False):
utils.write(lastmod, lastmod_cache_file)

def get_output_path(year, collection, package_name, granule_name, options):
# Where to store the document files?
# The path will depend a bit on the collection.
if collection == "BILLS":
# Store with the other bill data.
m = re.match(r"BILLS-(\d+)([a-z]+)(\d+)(\D.*)", package_name)
if not m: raise Exception("Unmatched bill document package name: " + package_name)
congress, bill_type, bill_number, version_code = m.groups()
congress = int(congress)
if "congress" in options and congress != int(options["congress"]): return None
return output_for_bill(congress, bill_type, bill_number, "text-versions/" + version_code)
else:
# Store in fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)
if granule_name: path += "/" + granule_name
return path

def get_package_files(package_name, granule_name, path):
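    # Returns a dict mapping file type to a (remote URL, local path) pair.
    # For example (illustrative), the top-level PDF of package BILLS-113hr302ih
    # lives at http://www.gpo.gov/fdsys/pkg/BILLS-113hr302ih/pdf/BILLS-113hr302ih.pdf.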
baseurl = "http://www.gpo.gov/fdsys/pkg/%s/" % package_name
baseurl_mods = baseurl
if not granule_name:
file_name = package_name
else:
file_name = granule_name
baseurl_mods = "http://www.gpo.gov/fdsys/granule/%s/%s/" % (package_name, granule_name)
ret = {
'mods': (baseurl_mods + "mods.xml", path + "/mods.xml"),
'pdf': (baseurl + "pdf/" + file_name + ".pdf", path + "/document.pdf"),
'xml': (baseurl + "xml/" + file_name + ".xml", path + "/document.xml"),
'text': (baseurl + "html/" + file_name + ".htm", path + "/document.html"), # text wrapped in HTML
}
if not granule_name:
# granules don't have PREMIS files?
ret['premis'] = (baseurl + "premis.xml", path + "/premis.xml")
return ret

def update_bill_version_list(only_congress):
bill_versions = { }
# Which sitemap years should we look at?
if not only_congress:
sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
else:
# If --congress=X is specified, only look at the relevant years.
sitemap_files = [utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml" for year in utils.get_congress_years(only_congress)]
sitemap_files = [f for f in sitemap_files if os.path.exists(f)]
# For each year-by-year BILLS sitemap...
for year_sitemap in sitemap_files:
dom = etree.parse(year_sitemap).getroot()
if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type.")
# Loop through each bill text version...
for file_node in dom.xpath("x:url", namespaces=ns):
# get URL and last modified date
url = str(file_node.xpath("string(x:loc)", namespaces=ns))
lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
# extract bill congress, type, number, and version from the URL
m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
if not m: raise Exception("Unmatched bill document URL: " + url)
congress, bill_type, bill_number, version_code = m.groups()
congress = int(congress)
if bill_type not in utils.thomas_types: raise Exception("Invalid bill type: " + url)
# If --congress=XXX is specified, only look at those bills.
if only_congress and congress != only_congress:
continue
# Track the documents by congress, bill type, etc.
bill_versions\
.setdefault(congress, { })\
.setdefault(bill_type, { })\
.setdefault(bill_number, { })\
[version_code] = {
"url": url,
"lastmod": lastmod,
}
# Output the bill version info. We can't do this until the end because we need to get
# the complete list of versions for a bill before we write the file, and the versions
# may be split across multiple sitemap files.
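    # Each file maps version codes to their sitemap entries, roughly (illustrative):
    #   { "ih": { "url": "http://www.gpo.gov/fdsys/pkg/BILLS-113hr302ih/content-detail.html",
    #             "lastmod": "2013-01-16T03:29:08Z" } }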
for congress in bill_versions:
for bill_type in bill_versions[congress]:
for bill_number in bill_versions[congress][bill_type]:
utils.write(
json.dumps(bill_versions[congress][bill_type][bill_number],
sort_keys=True, indent=2, default=utils.format_datetime),
output_for_bill(congress, bill_type, bill_number, "text-versions.json")
)

def output_for_bill(congress, bill_type, number, fn):
# Similar to bills.output_for_bill
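    # e.g. (illustrative): data/113/bills/hr/hr302/text-versions.json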
return "%s/%d/bills/%s/%s%s/%s" % (utils.data_dir(), congress, bill_type, bill_type, number, fn)

# given an FDSys filename (e.g. BILLS-113hr302ih), fetch the MODS doc, and return:
# issued_on: the date the referenced document was issued (<dateIssued>)
# urls: a dict of forms of this doc (<location>)
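# e.g. (illustrative) for BILLS-113hr302ih:
#   ("2013-01-15", {"html": "http://www.gpo.gov/fdsys/pkg/BILLS-113hr302ih/html/BILLS-113hr302ih.htm",
#                   "pdf": "http://www.gpo.gov/fdsys/pkg/BILLS-113hr302ih/pdf/BILLS-113hr302ih.pdf"})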
def document_info_for(filename, cache, options):
mods_url = mods_for(filename)
body = utils.download(mods_url,
cache,
utils.merge(options, {'xml': True})
)
doc = etree.fromstring(body)
mods_ns = {"mods": "http://www.loc.gov/mods/v3"}
locations = doc.xpath("//mods:location/mods:url", namespaces=mods_ns)
urls = {}
for location in locations:
label = location.attrib['displayLabel']
if "HTML" in label:
format = "html"
elif "PDF" in label:
format = "pdf"
elif "XML" in label:
format = "xml"
else:
format = "unknown"
urls[format] = location.text
issued_on = doc.xpath("string(//mods:dateIssued)", namespaces=mods_ns)
return issued_on, urls

def mods_for(filename):
return "http://www.gpo.gov/fdsys/pkg/%s/mods.xml" % filename