# congress-legislators/scripts/utils.py

# Helpful functions for finding data about members and committees

# Hard-coded Congress number (presumably the Congress in session when this
# file was last updated -- verify before relying on it).
CURRENT_CONGRESS = 115

# Maps two-letter abbreviations to full jurisdiction names. Besides the states
# it contains DC, territories, and a few extra codes ('NA', 'OL', 'DK', 'PI')
# that look like special or historical jurisdictions -- TODO confirm their use.
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming',
        'OL': 'Orleans',
        'DK': 'Dakota',
        'PI': 'Philippine Islands'
}


import urllib.request, urllib.error, urllib.parse
import os, errno, sys, traceback
import re, html.entities
import pprint
import rtyaml
from datetime import datetime, date
import time
import json

import lxml.html # for meta redirect parsing
import yaml

import smtplib
import email.utils
from email.mime.text import MIMEText


# Read the opt-in config file that supplies email settings.
# email_settings is None when the file is missing, is empty, or has no
# 'email' key, and callers must always handle that gracefully.
path = "email/config.yml"
if os.path.exists(path):
  # safe_load: the config is plain data, so arbitrary object construction is
  # never needed (and yaml.load() without an explicit Loader is rejected by
  # PyYAML >= 6). The `or {}` covers an empty file, which parses to None.
  with open(path, 'r') as config_file:
    email_settings = (yaml.safe_load(config_file) or {}).get('email', None)
else:
  email_settings = None


def congress_from_legislative_year(year):
  """Return the number of the Congress that legislative year `year` belongs to.

  Uses the same integer arithmetic as get_congress_from_date: floor division
  is required so that both years of a Congress map to the same whole number
  (true division would yield e.g. 115.0 for 2017 and 115.5 for 2018).
  """
  return ((year + 1) // 2) - 894

def legislative_year(date=None):
  """Return the legislative year that `date` falls in.

  `date` may be a date or a datetime; when it is omitted (or falsy) the
  current time is used. Everything before noon on January 3rd counts toward
  the previous year. A plain date on January 3rd carries no time of day and
  is assigned to its own calendar year.
  """
  when = date if date else datetime.now()

  # Anything outside January 1st-3rd belongs to the calendar year itself.
  if when.month != 1 or when.day > 3:
    return when.year

  # January 1st and 2nd still belong to the previous legislative year.
  if when.day < 3:
    return when.year - 1

  # January 3rd: only a datetime can tell us whether noon has passed.
  if isinstance(when, datetime) and when.hour < 12:
    return when.year - 1
  return when.year

def congress_start_end_dates(congress):
  """Return (start_date, end_date) for the given Congress (e.g. 1 for the 1st).

  The end date of one Congress is the same as the start date of the next
  because the switchover happens at noon (at least since 1935).
  Also see get_congress_from_date.
  """
  first_year = 1789 + 2 * (congress - 1)
  last_year = first_year + 2

  if congress > 73:
    # Starting with the 74th Congress, Congresses begin and end on
    # January 3rds at noon.
    return (date(first_year, 1, 3), date(last_year, 1, 3))

  # Through the 73rd Congress, terms began on March 4: the 1st Congress met
  # on March 4, 1789, per an act of the Continental Congress adopted
  # Sept 13, 1788.
  opening = date(first_year, 3, 4)

  if congress == 73:
    # The 20th Amendment moved the end of the 73rd Congress up to the
    # January 3rd (at noon) preceding the usual March 3 (1935).
    # (Congress adjourned in 1934 anyway.)
    return (opening, date(last_year, 1, 3))

  if congress == 69:
    # The 69th Congress (and only that one) adjourned on a March 4, and our
    # data uses March 4 as its term end date, so mirror that here.
    return (opening, date(last_year, 3, 4))

  # Judging by actual adjournment dates, every other early Congress seems to
  # have treated March 3rd as the end of its term.
  return (opening, date(last_year, 3, 3))

def get_congress_from_date(d, range_type=None):
  """Return the number of the Congress that the date `d` occurs in.

  This is the inverse of congress_start_end_dates. The date is first mapped
  to its 'legislative year'; counting back to 1789 (the first legislative
  year) and halving (Congresses last two years) then gives the number.

  Congresses start and end on the same date at noon (at least since 1935,
  and earlier years are treated the same way), so a transition date is
  ambiguous. Pass range_type='end' to assign such a date to the outgoing
  Congress or range_type='start' to assign it to the incoming one.

  Raises ValueError when `d` is a transition date and range_type is neither
  'start' nor 'end'.
  """
  year = d.year

  # Even years lie entirely within one Congress; only odd years contain a
  # transition date that can push `d` back into the previous legislative year.
  if year % 2 == 1:
    # Through 1933 the transition date was March 4 (although most Congresses
    # adjourned on March 3 or earlier); since 1935 it is January 3.
    transition = date(year, 3, 4) if year < 1935 else date(year, 1, 3)

    if d < transition:
      year -= 1
    elif d == transition:
      if range_type == "end":
        # Assign this date to the previous Congress.
        year -= 1
      elif range_type != "start":
        raise ValueError("Date {} is ambiguous; must pass range_type='start' or 'end'.".format(d))
      # range_type == "start": the date stays in the next Congress.

  # Now do some simple integer math to compute the Congress number.
  return ((year + 1) // 2) - 894

def parse_date(date):
  """Parse a 'YYYY-MM-DD' string and return the corresponding date object."""
  parsed = datetime.strptime(date, "%Y-%m-%d")
  return parsed.date()

def log(object):
  """Print `object`: strings verbatim, anything else pretty-printed."""
  if isinstance(object, str):
    print(object)
  else:
    # `pprint` is imported as a module, so the function must be qualified;
    # calling the module itself raises TypeError.
    pprint.pprint(object)

def uniq(seq):
  """Return the items of `seq` as a list with duplicates removed.

  The first occurrence of each item is kept, in its original order. Items
  must be hashable.
  """
  already_seen = set()
  ordered = []
  for item in seq:
    if item in already_seen:
      continue
    already_seen.add(item)
    ordered.append(item)
  return ordered

def args():
  """Return the command-line arguments that do not start with '--'."""
  return [token for token in sys.argv[1:] if not token.startswith("--")]

def flags():
  """Return the '--key[=value]' command-line options as a dict.

  Keys are lower-cased with the leading '--' removed. A flag without '='
  maps to True; the literal values 'True' and 'False' become booleans; any
  other value is kept as a string.
  """
  options = {}
  for token in sys.argv[1:]:
    if not token.startswith("--"):
      continue

    # Split on the first '=' only, so values that themselves contain '='
    # (URLs, expressions) do not crash the unpacking. Slicing off the prefix
    # (instead of splitting on '--') keeps keys with an embedded '--' intact.
    key, has_value, value = token[2:].partition("=")

    if not has_value:
      value = True
    elif value == 'True':
      value = True
    elif value == 'False':
      value = False

    options[key.lower()] = value
  return options

##### Data management

def data_dir():
  """Return the directory holding the data files: the parent of the
  current working directory."""
  return ".."

def load_data(path):
  """Load the YAML file at `path` (relative to data_dir()) via yaml_load."""
  full_path = os.path.join(data_dir(), path)
  return yaml_load(full_path)

def save_data(data, path):
  """Write `data` as YAML under data_dir() and as JSON under alternate_formats.

  The JSON copy uses the same relative path with '.yaml' replaced by '.json';
  values json cannot serialize natively go through format_datetime.
  """
  yaml_dump(data, os.path.join(data_dir(), path))

  json_text = json.dumps(data, default=format_datetime)
  json_path = "../alternate_formats/%s" % path.replace(".yaml", ".json")
  write(json_text, json_path)

##### Downloading

import scrapelib
# Shared HTTP client used by download(); constructed with a limit of 60
# requests per minute and 3 retry attempts.
scraper = scrapelib.Scraper(requests_per_minute=60, retry_attempts=3)
# Identify this project (with a contact URL) in the client's user agent string.
scraper.user_agent = "the @unitedstates project (https://github.com/unitedstates/congress-legislators)"

def cache_dir():
  """Return the directory (relative to the current working directory) under
  which download() caches fetched content."""
  return "cache"

def download(url, destination=None, force=False, options=None):
  """Fetch `url` and return its body, caching it on disk when possible.

  destination: path, relative to cache_dir(), where the body is cached.
    May only be None when `force` is True (nothing is cached then).
  force: when True, skip any cached copy and always hit the network.
  options: optional dict of switches (never modified by this function):
    'debug'           -- log cache hits and downloads
    'urllib'          -- fetch with urllib instead of the shared scraper
    'binary'          -- return bytes instead of decoded text
    'check_redirects' -- parse the body as HTML and follow one meta refresh

  Returns the body (str, or bytes with 'binary'), or None when the download
  fails with a scrapelib HTTP error, the body is empty, or the HTML cannot
  be parsed. Raises TypeError if `destination` is missing and `force` is
  False.
  """
  if not destination and not force:
    raise TypeError("destination must not be None if force is False.")

  if not options:
    options = {}

  debug = options.get('debug', False)
  binary = options.get('binary', False)

  # get the path to cache the file, or None if destination is None
  cache = os.path.join(cache_dir(), destination) if destination else None

  if not force and os.path.exists(cache):
    if debug:
      log("Cached: (%s, %s)" % (cache, url))

    # write() stores bytes in binary mode, so binary content has to be read
    # back in binary mode too instead of being decoded as text.
    with open(cache, 'rb' if binary else 'r') as f:
      return f.read()

  try:
    if debug:
      log("Downloading: %s" % url)

    if options.get('urllib', False):
      # Close the connection as soon as the body has been read.
      with urllib.request.urlopen(url) as response:
        body = response.read()
      if not binary:
        body = body.decode("utf-8") # guessing encoding
    else:
      response = scraper.get(url)
      body = response.content if binary else response.text
  except scrapelib.HTTPError:
    log("Error downloading %s" % url)
    return None

  # don't allow 0-byte files
  if (not body) or (not body.strip()):
    return None

  # the downloader can optionally parse the body as HTML
  # and look for meta redirects. a bit expensive, so opt-in.
  if options.get('check_redirects', False):
    try:
      html_tree = lxml.html.fromstring(body)
    except ValueError:
      log("Error parsing source from url {0}".format(url))
      return None

    meta = html_tree.xpath("//meta[translate(@http-equiv, 'REFSH', 'refsh') = 'refresh']/@content")
    if meta:
      # The content attribute looks like "<seconds>; url=<target>". Split on
      # the first ';' only (the target may contain more) and tolerate both a
      # missing target and whitespace around it.
      _wait, _sep, text = meta[0].partition(";")
      text = text.strip()
      if text.lower().startswith("url="):

        # the target is sometimes wrapped in quotes
        new_url = text[4:].strip().strip("'\"")
        if not new_url.startswith(url): #dont print if a local redirect
          print("Found redirect for {}, downloading {} instead..".format(url, new_url))

        # Follow the redirect once, with redirect checking switched off to
        # avoid loops. Work on a copy so the caller's dict is left untouched.
        redirect_options = {k: v for k, v in options.items() if k != 'check_redirects'}
        body = download(new_url, None, True, redirect_options)
        if body is None:
          # the redirect target could not be fetched; nothing to cache
          return None

  # cache content to disk
  if cache: write(body, cache)

  return body

from pytz import timezone
# The 'US/Eastern' time zone; format_datetime() uses it to localize datetimes.
eastern_time_zone = timezone('US/Eastern')
def format_datetime(obj):
  """Convert `obj` to a string form suitable for JSON output.

  Strings pass through unchanged. Datetimes are truncated to whole seconds,
  localized to US/Eastern and returned in ISO 8601 format. Anything else
  yields None. save_data() passes this function as the `default` hook of
  json.dumps.
  """
  if isinstance(obj, str):
    return obj
  if not isinstance(obj, datetime):
    return None
  whole_seconds = obj.replace(microsecond=0)
  return eastern_time_zone.localize(whole_seconds).isoformat()

def write(content, destination):
  """Write `content` to the file `destination`, creating parent directories.

  str content is written in text mode (using open()'s default encoding);
  any other content, i.e. bytes, is written in binary mode.
  """
  # A bare filename has no directory part, and os.makedirs('') would raise.
  directory = os.path.dirname(destination)
  if directory:
    os.makedirs(directory, exist_ok=True)

  mode = 'w' if isinstance(content, str) else 'wb'
  # `with` guarantees the handle is closed even when the write fails.
  with open(destination, mode) as f:
    f.write(content)

# Equivalent of `mkdir -p`, adapted from:
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
  """Create `path` and any missing parents; an existing path is not an error."""
  try:
    os.makedirs(path)
  except OSError as err:
    # "Already exists" is the one failure that is tolerated; everything else
    # (permissions, invalid path, ...) propagates to the caller.
    if err.errno != errno.EEXIST:
      raise

def format_exception(exception):
  """Return the formatted traceback of `exception` as a single string.

  The traceback is taken from the exception object itself, so this also
  works after the `except` block that caught it has finished. If `exception`
  is not an exception instance, the exception currently being handled (per
  sys.exc_info()) is formatted instead.
  """
  if isinstance(exception, BaseException):
    exc_type = type(exception)
    exc_value = exception
    exc_traceback = exception.__traceback__
  else:
    exc_type, exc_value, exc_traceback = sys.exc_info()
  return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))

# adapted from http://effbot.org/zone/re-sub.htm#unescape-html
def unescape(text, encoding=None):
  """Replace HTML character references in `text` and strip control characters.

  Numeric references (&#65; / &#x41;) and named entities (&amp;) are turned
  into the characters they stand for. With `encoding` set, a numeric
  reference is treated as a single byte value in that encoding rather than a
  Unicode code point. References that cannot be resolved (unknown name,
  malformed or out-of-range number) are left in the text unchanged.
  """

  def fixup(m):
    entity = m.group(0)
    if entity[:2] == "&#":
      # character reference
      try:
        if entity[:3].lower() == "&#x":
          number = int(entity[3:-1], 16)
        else:
          number = int(entity[2:-1])
        if encoding is None:
          return chr(number)
        return bytes([number]).decode(encoding)
      except (ValueError, OverflowError):
        # ValueError: malformed digits, value out of range, undecodable byte.
        # OverflowError: chr() on a number too large for a C int.
        pass
    else:
      # named entity
      try:
        return chr(html.entities.name2codepoint[entity[1:-1]])
      except KeyError:
        pass
    return entity # leave as is

  text = re.sub(r"&#?\w+;", fixup, text)
  # drop control characters, keeping tab, newline and carriage return
  return re.sub('[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', text)

##### YAML serialization ######

# Apply some common settings for loading/dumping YAML and cache the
# data in pickled format which is a LOT faster than YAML.

def yaml_load(path, use_cache=True):
    """Load the YAML file at `path`, going through a pickle cache.

    Loading YAML is ridiculously slow, so the parsed data is cached next to
    the file in `path + ".pickle"` together with a SHA-1 hash of the YAML
    file's bytes. The cached copy is returned only when `use_cache` is true
    and the stored hash matches the current file; otherwise the YAML is
    parsed and the cache file is (re)written.
    """
    import hashlib
    import os.path
    import pickle

    with open(path, 'rb') as f:
        h = hashlib.sha1(f.read()).hexdigest()

    pickle_path = path + ".pickle"
    if use_cache and os.path.exists(pickle_path):
        # NOTE(security): unpickling executes code embedded in the file, so
        # the .pickle cache must only ever come from a trusted location.
        try:
            with open(pickle_path, 'rb') as f:
                store = pickle.load(f)
            if store["hash"] == h:
                return store["data"]
        except (EOFError, pickle.UnpicklingError, AttributeError,
                ImportError, IndexError, KeyError, TypeError):
            pass # bad .pickle file, pretend it doesn't exist

    # No usable cached pickled data exists, so load the YAML file.
    with open(path) as f:
        data = rtyaml.load(f)

    # Store in a pickled file for fast access later.
    with open(pickle_path, "wb") as f:
        pickle.dump({ "hash": h, "data": data }, f)

    return data

def yaml_dump(data, path):
    """Write `data` as YAML to `path` and refresh its pickle cache.

    The cache file `path + ".pickle"` stores the data together with a SHA-1
    hash of the YAML file just written, which is what yaml_load checks.
    """
    import hashlib
    import pickle

    # write file; it must be closed (and therefore flushed) before it is
    # read back for hashing below.
    with open(path, "w") as f:
        rtyaml.dump(data, f)

    # Store in a pickled file for fast access later.
    with open(path, 'rb') as f:
        h = hashlib.sha1(f.read()).hexdigest()
    with open(path + ".pickle", "wb") as f:
        pickle.dump({ "hash": h, "data": data }, f)

# Report `body` to the administrator: it is always printed, and also emailed
# when email settings are supplied.
def admin(body):
  """Print `body` (an exception is formatted first) and email it if configured.

  Any failure while reporting is itself only printed, never re-reported, so
  that a broken email setup cannot cause a reporting loop.
  """
  try:
    message = format_exception(body) if isinstance(body, Exception) else body

    print(message) # always print it

    if email_settings:
      send_email(message)

  except Exception as exception:
    print("Exception logging message to admin, halting as to avoid loop")
    print(format_exception(exception))

# Only call this when email_settings is definitely populated.
def send_email(message):
  """Send `message` as a plain-text email using the loaded email_settings.

  Reads the 'to', 'from', 'from_name', 'subject', 'hostname', 'starttls',
  'user_name' and 'password' settings. The subject gets the current Unix
  timestamp appended. The SMTP session is always quit, even on failure.
  """
  settings = email_settings
  print("Sending email to %s..." % settings['to'])

  # adapted from http://www.doughellmann.com/PyMOTW/smtplib/
  mail = MIMEText(message)
  mail.set_unixfrom('author')
  mail['To'] = email.utils.formataddr(('Recipient', settings['to']))
  mail['From'] = email.utils.formataddr((settings['from_name'], settings['from']))
  mail['Subject'] = "%s - %i" % (settings['subject'], int(time.time()))

  smtp = smtplib.SMTP(settings['hostname'])
  try:
    smtp.ehlo()
    # Upgrade to TLS only when it is requested and the server offers it.
    use_tls = settings['starttls'] and smtp.has_extn('STARTTLS')
    if use_tls:
      smtp.starttls()
      smtp.ehlo()

    smtp.login(settings['user_name'], settings['password'])
    smtp.sendmail(settings['from'], [settings['to']], mail.as_string())
  finally:
    smtp.quit()

  print("Sent email to %s." % settings['to'])