# congress/tasks/utils.py
import os, os.path, errno, sys, traceback, zipfile
import re, htmlentitydefs
import yaml, json
from pytz import timezone
import datetime, time
from lxml import html, etree
import scrapelib
import pprint
import logging
import smtplib
import email.utils
from email.mime.text import MIMEText
import getpass
# read in an opt-in config file for changing directories and supplying email settings
# returns None if it's not there, and this should always be handled gracefully
path = "config.yml"
if os.path.exists(path):
    config = yaml.load(open(path, 'r'))
else:
    config = None

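# A hypothetical config.yml illustrating the keys consulted here and in
# cache_dir(), data_dir(), and send_email() below (all values are placeholders):
#
#   output:
#     cache: cache
#     data: data
#   email:
#     to: admin@example.com
#     from: scraper@example.com
#     from_name: Congress Scraper
#     subject: congress scraper errors
#     hostname: smtp.example.com
#     starttls: true
#     user_name: scraper
#     password: secret
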
eastern_time_zone = timezone('US/Eastern')
# scraper should be instantiated at module-load time, so that it can rate limit appropriately
scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False, retry_attempts=3)
scraper.user_agent = "unitedstates/congress (https://github.com/unitedstates/congress)"
govtrack_person_id_map = None
class UnmatchedIdentifer(Exception):
    def __init__(self, id_type, id_value, help_url):
        super(UnmatchedIdentifer, self).__init__("%s=%s %s" % (id_type, str(id_value), help_url))

def format_datetime(obj):
    if isinstance(obj, datetime.datetime):
        return eastern_time_zone.localize(obj.replace(microsecond=0)).isoformat()
    elif isinstance(obj, str):
        return obj
    else:
        return None

def current_congress():
    year = current_legislative_year()
    return congress_from_legislative_year(year)

def congress_from_legislative_year(year):
    return ((year + 1) / 2) - 894

def current_legislative_year(date=None):
    if not date:
        date = datetime.datetime.now()

    if date.month == 1:
        if date.day == 1 or date.day == 2:
            return date.year - 1
        elif date.day == 3 and date.hour < 12:
            return date.year - 1
        else:
            return date.year
    else:
        return date.year

def get_congress_first_year(congress):
    return (((int(congress)+894)*2) - 1)

# get the three calendar years that the Congress extends through (Jan 3 to Jan 3).
def get_congress_years(congress):
    y1 = get_congress_first_year(congress)
    return (y1, y1+1, y1+2)

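# A quick sanity check of the arithmetic above (the 113th Congress convened in
# January 2013):
#
#   >>> congress_from_legislative_year(2013)
#   113
#   >>> get_congress_years(113)
#   (2013, 2014, 2015)
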
# bill_type, bill_number, congress
def split_bill_id(bill_id):
    return re.match("^([a-z]+)(\d+)-(\d+)$", bill_id).groups()

# bill_type, bill_number, congress, version_code
def split_bill_version_id(bill_version_id):
    return re.match("^([a-z]+)(\d+)-(\d+)-([a-z\d]+)$", bill_version_id).groups()

# chamber, vote_number, congress, year
def split_vote_id(vote_id):
    return re.match("^(h|s)(\d+)-(\d+)\.(\d\d\d\d)$", vote_id).groups()

# nomination_type, congress, nomination_number
# I think it's always PN, but might as well include
def split_nomination_id(nomination_id):
    try:
        return re.match("^([A-Za-z]{2})(\d+)-(\d+)$", nomination_id).groups()
    except Exception, e:
        logging.error("Unable to parse %s" % nomination_id)
        return (None, None, None)

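# e.g. (hypothetical ID):
#
#   >>> split_nomination_id("PN4-113")
#   ('PN', '4', '113')
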
def process_set(to_fetch, fetch_func, options, *extra_args):
    errors = []
    saved = []
    skips = []

    for id in to_fetch:
        try:
            results = fetch_func(id, options, *extra_args)
        except Exception, e:
            if options.get('raise', False):
                raise
            else:
                errors.append((id, e, format_exception(e)))
                continue

        if results.get('ok', False):
            if results.get('saved', False):
                saved.append(id)
                logging.info("[%s] Updated" % id)
            else:
                skips.append(id)
                logging.warn("[%s] Skipping: %s" % (id, results['reason']))
        else:
            errors.append((id, results, None))
            logging.error("[%s] Error: %s" % (id, results['reason']))

    if len(errors) > 0:
        message = "\nErrors for %s items:\n" % len(errors)
        for id, error, msg in errors:
            message += "\n\n"
            if isinstance(error, Exception):
                message += "[%s] Exception:\n\n" % id
                message += msg
            else:
                message += "[%s] %s" % (id, error)

        admin(message)  # email if possible
        logging.warning("\nErrors for %s." % len(errors))

    logging.warning("Skipped %s." % len(skips))
    logging.warning("Saved data for %s." % len(saved))

    return saved + skips  # all of the OK's

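# A minimal sketch of how a task drives process_set() (fetch_bill here is
# hypothetical; real task modules supply their own fetch function). The fetch
# function must return a dict with 'ok', plus 'saved' or 'reason':
#
#   def fetch_bill(bill_id, options):
#       # ...download and write data for bill_id...
#       return {'ok': True, 'saved': True}
#
#   ok_ids = process_set(['hr1-113', 'hr2-113'], fetch_bill, {})
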
# Download file at `url`, cache to `destination`.
# Takes many options to customize behavior.
_download_zip_files = { }

def download(url, destination=None, options={}):
    # uses cache by default, override (True) to ignore
    force = options.get('force', False)

    # saves in cache dir by default, override (False) to save to exact destination
    to_cache = options.get('to_cache', True)

    # treat the response as raw bytes: skip decoding and HTML-entity unescaping
    is_binary = options.get('binary', False)

    # used by test suite to use special (versioned) test cache dir
    test = options.get('test', False)

    # POST request data, if a POST request is needed
    postdata = options.get('postdata', False)

    # whether the caller needs the actual bytes, or only success/failure
    needs_content = options.get('needs_content', True) or not is_binary or postdata

    # form the path to the file if we intend on saving it to disk
    if destination:
        if to_cache:
            if test:
                cache = test_cache_dir()
            else:
                cache = cache_dir()
            cache_path = os.path.join(cache, destination)
        else:
            cache_path = destination

    # If we are working in the cache directory, look for a ZIP file
    # anywhere along the path, like "cache/93/bills.zip", and see if
    # the file is already cached inside it (e.g. as "bills/pages/...").
    # If it is, and force is true, then raise an Exception because we
    # can't update the ZIP file with new content (I imagine it would
    # be very slow). If force is false, return the content from the
    # archive.
    if destination and to_cache:
        dparts = destination.split(os.sep)
        for i in xrange(len(dparts)-1):
            # form the ZIP file name and test if it exists...
            zfn = os.path.join(cache, *dparts[:i+1]) + ".zip"
            if not os.path.exists(zfn): continue

            # load and keep the ZIP file instance in memory because it's slow to instantiate this object
            zf = _download_zip_files.get(zfn)
            if not zf:
                zf = zipfile.ZipFile(zfn, "r")
                _download_zip_files[zfn] = zf
                logging.warn("Loaded: %s" % zfn)

            # see if the inner file exists, and if so read the bytes
            try:
                zfn_inner = os.path.join(*dparts[i:])
                body = zf.read(zfn_inner)
            except KeyError:
                # does not exist
                continue

            if not test: logging.info("Cached: (%s, %s)" % (zfn + "#" + zfn_inner, url))

            if force: raise Exception("Cannot re-download a file already cached to a ZIP file.")

            if not is_binary:
                body = body.decode("utf8")
                body = unescape(body)

            return body

    # Load the file from disk if it's already been downloaded and force is False.
    if destination and (not force) and os.path.exists(cache_path):
        if not test: logging.info("Cached: (%s, %s)" % (cache_path, url))
        if not needs_content: return True

        with open(cache_path, 'r') as f:
            body = f.read()
        if not is_binary:
            body = body.decode("utf8")

    # Download from the network and cache to disk.
    else:
        try:
            logging.info("Downloading: %s" % url)

            if postdata:
                response = scraper.urlopen(url, 'POST', postdata)
            else:
                if not needs_content:
                    import subprocess
                    mkdir_p(os.path.dirname(cache_path))
                    return True if (subprocess.call(["wget", "-q", "-O", cache_path, url]) == 0) else None
                response = scraper.urlopen(url)

            if not is_binary:
                body = response  # a subclass of a 'unicode' instance
                if not isinstance(body, unicode): raise ValueError("Content not decoded.")
            else:
                body = response.bytes  # a 'str' instance
                if isinstance(body, unicode): raise ValueError("Binary content improperly decoded.")
        except scrapelib.HTTPError as e:
            logging.error("Error downloading %s:\n\n%s" % (url, format_exception(e)))
            return None

        # don't allow 0-byte files
        if (not body) or (not body.strip()):
            return None

        # cache content to disk
        if destination:
            write(body if is_binary else body.encode("utf8"), cache_path)

    if not is_binary:
        body = unescape(body)

    return body

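# Typical call (hypothetical URL and destination). The default returns
# unescaped unicode text and caches it under the cache directory:
#
#   body = download(
#       "http://thomas.loc.gov/some/page.html",
#       "113/meta/page.html",
#       options)
#
# Pass {'binary': True} in options for raw bytes, or {'force': True} to
# re-download and bypass the cache.
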
def write(content, destination):
    mkdir_p(os.path.dirname(destination))
    with open(destination, 'w') as f:
        f.write(content)

def read(destination):
    if os.path.exists(destination):
        with open(destination) as f:
            return f.read()

# dict1 gets overwritten with anything in dict2
def merge(dict1, dict2):
    return dict(dict1.items() + dict2.items())

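# e.g. merge({'a': 1}, {'a': 2, 'b': 3}) == {'a': 2, 'b': 3}
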
# de-dupe a list while preserving order, taken from:
# http://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-python-whilst-preserving-order
def uniq(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if x not in seen and not seen_add(x)]

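# e.g. uniq([3, 1, 3, 2, 1]) == [3, 1, 2]
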
# mkdir -p in python, from:
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST:
            pass
        else:
            raise

def xpath_regex(doc, element, pattern):
    return doc.xpath(
        "//%s[re:match(text(), '%s')]" % (element, pattern),
        namespaces={"re": "http://exslt.org/regular-expressions"})

# taken from http://effbot.org/zone/re-sub.htm#unescape-html
def unescape(text):

    def remove_unicode_control(s):
        remove_re = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]')
        return remove_re.sub('', s)

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass

        return text  # leave as is

    text = re.sub("&#?\w+;", fixup, text)
    text = remove_unicode_control(text)
    return text

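# e.g. (doctest-style):
#
#   >>> unescape(u"Caf&#233; &amp; Bar")
#   u'Caf\xe9 & Bar'
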
def extract_bills(text, session):
    bill_ids = []

    p = re.compile('((S\.|H\.)(\s?J\.|\s?R\.|\s?Con\.| ?)(\s?Res\.)*\s?\d+)', flags=re.IGNORECASE)
    bill_matches = p.findall(text)

    if bill_matches:
        for b in bill_matches:
            bill_text = "%s-%s" % (b[0].lower().replace(" ", '').replace('.', '').replace("con", "c"), session)
            if bill_text not in bill_ids:
                bill_ids.append(bill_text)

    return bill_ids

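# e.g. (hypothetical text):
#
#   >>> extract_bills("Passed H.R. 1234 and S. 22 today.", 113)
#   ['hr1234-113', 's22-113']
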
# uses config values if present
def cache_dir():
    cache = None

    if config:
        output = config.get('output', None)
        if output:
            cache = output.get('cache', None)

    if not cache:
        cache = "cache"

    return cache

def test_cache_dir():
    return "test/fixtures/cache"

# uses config values if present
def data_dir():
    data = None

    if config:
        output = config.get('output', None)
        if output:
            data = output.get('data', None)

    if not data:
        data = "data"

    return data

# if email settings are supplied, email the text - otherwise, just print it
def admin(body):
    try:
        if isinstance(body, Exception):
            body = format_exception(body)

        logging.error(body)  # always print it

        if config:
            details = config.get('email', None)
            if details:
                send_email(body)
    except Exception as exception:
        print "Exception logging message to admin, halting so as to avoid a loop"
        print format_exception(exception)

def format_exception(exception):
    exc_type, exc_value, exc_traceback = sys.exc_info()
    return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))

# this should only be called if the settings are definitely there
def send_email(message):
    settings = config['email']

    # adapted from http://www.doughellmann.com/PyMOTW/smtplib/
    msg = MIMEText(message)
    msg.set_unixfrom('author')
    msg['To'] = email.utils.formataddr(('Recipient', settings['to']))
    msg['From'] = email.utils.formataddr((settings['from_name'], settings['from']))
    msg['Subject'] = settings['subject']

    server = smtplib.SMTP(settings['hostname'])
    try:
        server.ehlo()
        if settings['starttls'] and server.has_extn('STARTTLS'):
            server.starttls()
            server.ehlo()

        server.login(settings['user_name'], settings['password'])
        server.sendmail(settings['from'], [settings['to']], msg.as_string())
    finally:
        server.quit()

    logging.info("Sent email to %s" % settings['to'])

thomas_types = {
    'hr': ('HR', 'H.R.'),
    'hres': ('HE', 'H.RES.'),
    'hjres': ('HJ', 'H.J.RES.'),
    'hconres': ('HC', 'H.CON.RES.'),
    's': ('SN', 'S.'),
    'sres': ('SE', 'S.RES.'),
    'sjres': ('SJ', 'S.J.RES.'),
    'sconres': ('SC', 'S.CON.RES.'),
    'hamdt': ('HZ', 'H.AMDT.'),
    'samdt': ('SP', 'S.AMDT.'),
    'supamdt': ('SU', 'S.UP.AMDT.'),
}
thomas_types_2 = dict((v[0], k) for (k, v) in thomas_types.items())  # map e.g. { 'SE': 'sres', ... }

# cached committee map to map names to IDs
committee_names = {}

# Get the mapping from THOMAS's committee names to THOMAS's committee IDs,
# found on the advanced search page: committee_names[name] = ID for full
# committees, and committee_names[committee_id + "|" + subcommittee_name] =
# zero-padded subcommittee number for subcommittees.
def fetch_committee_names(congress, options):
    congress = int(congress)

    # Parse the THOMAS advanced search pages for the names that THOMAS uses for
    # committees on bill pages, and map those to the IDs for the committees that are
    # listed on the advanced search pages (but aren't shown on bill pages).
    if not options.get('test', False): logging.info("[%d] Fetching committee names..." % congress)

    # allow body to be passed in from fixtures
    if 'body' in options:
        body = options['body']
    else:
        body = download(
            "http://thomas.loc.gov/home/LegislativeData.php?&n=BSS&c=%d" % congress,
            "%s/meta/thomas_committee_names.html" % congress,
            options)

    for chamber, committee_list in re.findall('>Choose (House|Senate) Committees</option>(.*?)</select>', body, re.I | re.S):
        for name, id in re.findall(r'<option value="(.*?)\{(.*?)}">', committee_list, re.I | re.S):
            id = str(id).upper()
            name = name.strip().replace("  ", " ")  # weirdness

            if id.endswith("00"):
                # Map chamber + committee name to its ID, minus the 00 at the end. On bill pages,
                # committees appear as e.g. "House Finance." Except the JCSE.
                if id != "JCSE00":
                    name = chamber + " " + name

                # Correct for some oddness on THOMAS (but not on Congress.gov): the House Committee
                # on House Administration appears just as "House Administration".
                if name == "House House Administration": name = "House Administration"

                committee_names[name] = id[0:-2]
            else:
                # map committee ID + "|" + subcommittee name to the zero-padded subcommittee numeric ID
                committee_names[id[0:-2] + "|" + name] = id[-2:]

    # Correct for a limited number of other ways committees appear, owing probably to the
    # committee name being changed mid-way through a Congress.
    if congress == 95:
        committee_names["House Intelligence (Select)"] = committee_names["House Intelligence (Permanent Select)"]
    if congress == 96:
        committee_names["Senate Human Resources"] = "SSHR"
    if congress == 97:
        committee_names["Senate Small Business (Select)"] = committee_names["Senate Small Business"]
    if congress == 98:
        committee_names["Senate Indian Affairs (Select)"] = committee_names["Senate Indian Affairs (Permanent Select)"]
    if congress == 100:
        committee_names["HSPO|Hoc Task Force on Presidential Pay Recommendation"] = committee_names["HSPO|Ad Hoc Task Force on Presidential Pay Recommendation"]
    if congress == 103:
        committee_names["Senate Indian Affairs (Permanent Select)"] = committee_names["Senate Indian Affairs"]
    if congress == 108:
        # This appears to be a mistake, a subcommittee appearing as a full committee. Map it to
        # the full committee for now.
        committee_names["House Antitrust (Full Committee Task Force)"] = committee_names["House Judiciary"]

def make_node(parent, tag, text, **attrs):
    """Make a node in an XML document."""
    n = etree.Element(tag)
    parent.append(n)
    n.text = text
    for k, v in attrs.items():
        if v is None: continue
        if isinstance(v, datetime.datetime):
            v = format_datetime(v)
        n.set(k.replace("___", ""), v)
    return n

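# A minimal sketch of make_node() on a scratch document (hypothetical tag and
# attribute names). The "___" replacement appears intended to let callers pass
# attribute names that collide with Python keywords (e.g. class___ emits class):
#
#   root = etree.Element("bill")
#   make_node(root, "title", "An example title", type="official")
#   # etree.tostring(root) -> '<bill><title type="official">An example title</title></bill>'
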
# Correct mistakes on THOMAS
def thomas_corrections(thomas_id):
    # C.A. Dutch Ruppersberger
    if thomas_id == "02188": thomas_id = "01728"

    # Pat Toomey
    if thomas_id == "01594": thomas_id = "02085"

    return thomas_id

def yaml_load(file_name):
    # Use the C-accelerated YAML loader when available, since the pure-Python
    # loader is very slow on large files.
    try:
        from yaml import CLoader as Loader
    except ImportError:
        from yaml import Loader

    with open(file_name) as f:
        return yaml.load(f, Loader=Loader)

def require_congress_legislators_repo():
    # Clone the congress-legislators repo if we don't have it.
    if not os.path.exists("cache/congress-legislators"):
        logging.warn("Cloning the congress-legislators repo into the cache directory...")
        os.system("git clone -q --depth 1 https://github.com/unitedstates/congress-legislators cache/congress-legislators")

    # Update the repo so we have the latest. These two commands == git pull,
    # but git pull ignores -q on the merge part so is less quiet.
    logging.warn("Updating the congress-legislators repo...")
    os.system("cd cache/congress-legislators; git fetch -pq")
    os.system("cd cache/congress-legislators; git merge --ff-only -q origin/master")

lookup_legislator_cache = []

def lookup_legislator(congress, role_type, name, state, party, when, id_requested, exclude=set()):
    # This is a basic lookup function given the legislator's name, state, party,
    # and the date of the vote.

    # On the first load, cache all of the legislators' terms in memory.
    # Group by Congress so we can limit our search later to be faster.
    global lookup_legislator_cache
    if not lookup_legislator_cache:
        require_congress_legislators_repo()
        lookup_legislator_cache = { }  # from Congress number to list of (moc, term) tuples that might be in that Congress
        for fn in ('legislators-historical', 'legislators-current'):
            for moc in yaml_load("cache/congress-legislators/" + fn + ".yaml"):
                for term in moc["terms"]:
                    for c in xrange(congress_from_legislative_year(int(term['start'][0:4]))-1,
                                    congress_from_legislative_year(int(term['end'][0:4]))+1+1):
                        lookup_legislator_cache.setdefault(c, []).append((moc, term))

    def to_ascii(name):
        name = name.replace("-", " ")
        if not isinstance(name, unicode): return name
        import unicodedata
        return u"".join(c for c in unicodedata.normalize('NFKD', name) if not unicodedata.combining(c))

    # Scan all of the terms that cover 'when' for a match.
    if isinstance(when, datetime.datetime): when = when.date()
    when = when.isoformat()
    name_parts = to_ascii(name).split(", ", 1)
    matches = []
    for moc, term in lookup_legislator_cache[congress]:
        # Make sure the date falls within the term's start/end dates
        # (comparing ISO-formatted date strings).
        if term['start'] > when: continue
        if term['end'] < when: continue

        # Compare the role type, state, and party, except for people who we know changed party.
        if term['type'] != role_type: continue
        if term['state'] != state: continue
        if term['party'][0] != party and name not in ("Laughlin", "Crenshaw", "Goode", "Martinez"): continue

        # When doing process-of-elimination matching, don't match on people we've already seen.
        if moc["id"].get(id_requested) in exclude: continue

        # Compare the last name. Allow "Chenoweth" to match "Chenoweth Hage", but also
        # allow "Millender McDonald" to match itself.
        if name_parts[0] != to_ascii(moc['name']['last']) and \
           name_parts[0] not in to_ascii(moc['name']['last']).split(" "): continue

        # Compare the first name. Allow it to match the first name or the nickname,
        # or an initialized version of the first name (i.e. "E." matches "Eddie").
        # Test the whole string (so that "Jo Ann" is compared to "Jo Ann") but also
        # the first part of a string split (so "E. B." is compared as "E." to "Eddie").
        first_names = (to_ascii(moc['name']['first']), to_ascii(moc['name'].get('nickname', "")), to_ascii(moc['name']['first'])[0] + ".")
        if len(name_parts) >= 2 and \
           name_parts[1] not in first_names and \
           name_parts[1].split(" ")[0] not in first_names: continue

        # This is a possible match.
        matches.append((moc, term))

    # Return if there is a unique match.
    if len(matches) == 0:
        logging.warn("Could not match name %s (%s-%s; %s) to any legislator." % (name, state, party, when))
        return None
    if len(matches) > 1:
        logging.warn("Multiple matches of name %s (%s-%s; %s) to legislators (excludes %s)." % (name, state, party, when, str(exclude)))
        return None

    return matches[0][0]['id'][id_requested]

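# A hedged usage sketch (hypothetical name, state, and party): find the
# bioguide ID of a senator recorded as "Smith, John" (R-NY) on a vote taken
# June 1, 2013:
#
#   bioguide = lookup_legislator(113, 'sen', 'Smith, John', 'NY', 'R',
#                                datetime.date(2013, 6, 1), 'bioguide')
#
# Returns None (with a warning logged) unless exactly one legislator matches.
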
def get_govtrack_person_id(source_id_type, source_id):
    # Load the legislators database to map various IDs to GovTrack IDs.
    # Cache in a pickled file because loading the whole YAML db is super slow.
    global govtrack_person_id_map
    import pickle

    # On the first call to this function...
    if not govtrack_person_id_map:
        require_congress_legislators_repo()
        govtrack_person_id_map = { }
        for fn in ('legislators-historical', 'legislators-current'):
            # Check if the pickled file is older than the YAML file.
            cachefn = os.path.join(cache_dir(), fn + '-id-map')
            if os.path.exists(cachefn) and os.stat(cachefn).st_mtime > os.stat("cache/congress-legislators/%s.yaml" % fn).st_mtime:
                # Pickled file is newer, so use it.
                m = pickle.load(open(cachefn))
            else:
                # Make a new mapping. Load the YAML file and create
                # a master map from (id-type, id) to GovTrack ID,
                # where id-type is e.g. thomas, lis, bioguide. Then
                # save it to a pickled file.
                logging.warn("Making %s ID map..." % fn)
                m = { }
                for moc in yaml_load("cache/congress-legislators/" + fn + ".yaml"):
                    if "govtrack" in moc["id"]:
                        for k, v in moc["id"].items():
                            if k in ('bioguide', 'lis', 'thomas'):
                                m[(k, v)] = moc["id"]["govtrack"]
                pickle.dump(m, open(cachefn, "w"))

            # Combine the mappings from the historical and current files.
            govtrack_person_id_map.update(m)

    # Now do the lookup.
    if (source_id_type, source_id) not in govtrack_person_id_map:
        see_also = ""
        if source_id_type == "thomas":
            see_also = "http://beta.congress.gov/member/xxx/" + source_id
        logging.error("GovTrack ID not known for %s %s. (%s)" % (source_id_type, str(source_id), see_also))
        raise UnmatchedIdentifer(source_id_type, source_id, see_also)

    return govtrack_person_id_map[(source_id_type, source_id)]

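# Usage sketch (hypothetical Bioguide ID): map an ID from one scheme to the
# member's numeric GovTrack ID; raises UnmatchedIdentifer if the legislator
# isn't in the congress-legislators database:
#
#   govtrack_id = get_govtrack_person_id('bioguide', 'A000055')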