Files
congress/tasks/utils.py
2013-03-21 18:09:20 -04:00

518 lines
16 KiB
Python

import os, os.path, errno, sys, traceback
import re, htmlentitydefs
import yaml, json
from pytz import timezone
import datetime, time
from lxml import html, etree
import scrapelib
import pprint
import logging
import smtplib
import email.utils
from email.mime.text import MIMEText
import getpass
# Read in an opt-in config file for changing directories and supplying email
# settings. `config` is None if the file is not there, and this should always
# be handled gracefully by the code that reads it.
path = "config.yml"
if os.path.exists(path):
    # Use a context manager so the file handle is closed as soon as the
    # settings have been parsed.
    # NOTE(review): yaml.load can construct arbitrary Python objects; that is
    # acceptable only because config.yml is a local, operator-controlled file.
    with open(path, 'r') as _config_file:
        config = yaml.load(_config_file)
else:
    config = None

eastern_time_zone = timezone('US/Eastern')

# scraper should be instantiated at class-load time, so that it can rate limit appropriately
scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False, retry_attempts=3)

# Lazily filled in by get_govtrack_person_id() on its first call.
govtrack_person_id_map = None
class UnmatchedIdentifer(Exception):
    """Raised when an identifier cannot be matched to a known record.

    The exception message has the form "<id_type>=<id_value> <help_url>".
    """

    def __init__(self, id_type, id_value, help_url):
        message = "%s=%s %s" % (id_type, str(id_value), help_url)
        super(UnmatchedIdentifer, self).__init__(message)
def format_datetime(obj):
    """Render a timestamp as an ISO 8601 string.

    - `str` values are returned unchanged.
    - `datetime.datetime` values have their microseconds dropped, are
      localized to US/Eastern and returned in ISO format.
    - Anything else yields None.
    """
    if isinstance(obj, str):
        return obj
    if not isinstance(obj, datetime.datetime):
        return None

    whole_seconds = obj.replace(microsecond=0)
    return eastern_time_zone.localize(whole_seconds).isoformat()
def current_congress():
    """Return the number of the Congress for the current legislative year.

    The result is computed as ((year + 1) // 2) - 894, so for example both
    2013 and 2014 map to 113.
    """
    year = current_legislative_year()
    # Floor division is spelled out so the result is an int even where `/`
    # means true division.
    return ((year + 1) // 2) - 894
def current_legislative_year(date=None):
    """Return the legislative year that `date` falls in.

    `date` defaults to the current local time. January 1st, January 2nd and
    January 3rd before 12:00 still count as the previous calendar year;
    every other moment belongs to its own calendar year.
    """
    if not date:
        date = datetime.datetime.now()

    if date.month == 1:
        # Only look at the hour on January 3rd.
        if date.day in (1, 2) or (date.day == 3 and date.hour < 12):
            return date.year - 1

    return date.year
def get_congress_first_year(congress):
    """Return the first calendar year of the given Congress.

    `congress` may be an int or anything `int()` accepts (e.g. "113").
    """
    congress_number = int(congress)
    return 2 * (congress_number + 894) - 1
# Get the three calendar years that the Congress extends through (Jan 3 to Jan 3).
def get_congress_years(congress):
    """Return a 3-tuple: the Congress's first year and the two years after it."""
    first_year = get_congress_first_year(congress)
    return tuple(first_year + offset for offset in range(3))
# Returns (bill_type, bill_number, congress), all as strings.
def split_bill_id(bill_id):
    """Split a bill ID such as "hr1234-113" into its three parts.

    Raises AttributeError if `bill_id` does not have that shape.
    """
    match = re.compile(r"^([a-z]+)(\d+)-(\d+)$").match(bill_id)
    return match.groups()
# Returns (bill_type, bill_number, congress, version_code), all as strings.
def split_bill_version_id(bill_version_id):
    """Split a bill version ID such as "hr1234-113-ih" into its four parts.

    Raises AttributeError if `bill_version_id` does not have that shape.
    """
    match = re.compile(r"^([a-z]+)(\d+)-(\d+)-([a-z\d]+)$").match(bill_version_id)
    return match.groups()
def split_vote_id(bill_id):
    """Split a vote ID such as "h123-113.2013" into its four parts.

    Returns a tuple of strings: the chamber letter ("h" or "s"), the digits
    before the dash, the digits between the dash and the period, and the
    four digits after the period.

    Raises AttributeError if the ID does not have that shape. The separator
    before the last four digits must be a literal period; the pattern
    previously used an unescaped `.`, which accepted any character there.
    """
    return re.match(r"^(h|s)(\d+)-(\d+)\.(\d\d\d\d)$", bill_id).groups()
# Splits a nomination ID into three string parts.
# I think the type is always PN, but might as well include it.
def split_nomination_id(nomination_id):
    """Split a nomination ID such as "PN123-113" into three strings.

    Returns (two-letter type, digits before the dash, digits after the dash).
    NOTE(review): the original comment listed the parts as "nomination_type,
    congress, nomination_number"; confirm against callers which numeric group
    is the congress.

    If the ID cannot be parsed, the failure is logged and (None, None, None)
    is returned instead of raising.
    """
    try:
        # [A-Za-z] rather than [A-z]: the latter also matches the punctuation
        # characters that sit between "Z" and "a" in ASCII ([ \ ] ^ _ `).
        return re.match(r"^([A-Za-z]{2})(\d+)-(\d+)$", nomination_id).groups()
    except (AttributeError, TypeError):
        # AttributeError: no match (re.match returned None).
        # TypeError: nomination_id was not a string at all.
        logging.error("Unable to parse %s" % nomination_id)
        return (None, None, None)
def process_set(to_fetch, fetch_func, options, *extra_args):
    """Run `fetch_func` over every ID in `to_fetch` and summarize the outcome.

    `fetch_func(id, options, *extra_args)` must return a dict with:
      - 'ok': truthy when the item was handled without error,
      - 'saved': truthy when data was written (otherwise it counts as a skip),
      - 'reason': explanation, read for skips and for non-ok results.

    If `fetch_func` raises, the exception is re-raised when options['raise']
    is truthy; otherwise it is recorded and processing continues with the
    next ID. When any errors were recorded, a combined report is passed to
    admin(). A summary of counts is logged at warning level.

    Returns the list of IDs that were OK: saved IDs first, then skipped IDs.
    """
    errors = []  # (id, error, formatted traceback or None)
    saved = []
    skips = []

    for item_id in to_fetch:
        try:
            results = fetch_func(item_id, options, *extra_args)
        except Exception as e:
            if options.get('raise', False):
                raise
            # Format the traceback now, while this exception is the one being
            # handled; by the time the report is assembled that state is gone
            # (or belongs to a later failure).
            errors.append((item_id, e, format_exception(e)))
            continue

        if results.get('ok', False):
            if results.get('saved', False):
                saved.append(item_id)
                logging.info("[%s] Updated" % item_id)
            else:
                skips.append(item_id)
                logging.warning("[%s] Skipping: %s" % (item_id, results['reason']))
        else:
            errors.append((item_id, results, None))
            logging.error("[%s] Error: %s" % (item_id, results['reason']))

    if len(errors) > 0:
        message = "\nErrors for %s items:\n" % len(errors)
        for item_id, error, trace in errors:
            if trace is not None:
                message += "[%s] Exception:\n\n" % item_id
                message += trace
            else:
                message += "[%s] %s" % (item_id, error)

        admin(message)  # email if possible

    logging.warning("\nErrors for %s." % len(errors))
    logging.warning("Skipped %s." % len(skips))
    logging.warning("Saved data for %s." % len(saved))

    return saved + skips  # all of the OK's
# Download file at `url`, cache to `destination`.
# Takes many options to customize behavior.
def download(url, destination=None, options=None):
    """Fetch `url`, going through an on-disk cache when `destination` is given.

    Recognized keys in `options` (all optional):
      - 'force': ignore an existing cached copy and download again.
      - 'to_cache': when True (default) `destination` is relative to the
        cache directory; when False it is used as the exact path.
      - 'xml': when True, skip unescaping of HTML entities in the body.
      - 'test': use the test fixture cache directory and suppress the
        "Cached" log line.
      - 'postdata': if truthy, issue a POST request with this data.

    Returns the body, or None when the download fails with an HTTP error or
    the downloaded body is empty / whitespace only.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if options is None:
        options = {}

    # uses cache by default, override (True) to ignore
    force = options.get('force', False)

    # saves in cache dir by default, override (False) to save to exact destination
    to_cache = options.get('to_cache', True)

    # unescapes HTML encoded characters by default, set this (True) to not do that
    xml = options.get('xml', False)

    # used by test suite to use special (versioned) test cache dir
    test = options.get('test', False)

    # if need a POST request with data
    postdata = options.get('postdata', False)

    if test:
        cache = test_cache_dir()
    else:
        cache = cache_dir()

    if destination:
        if to_cache:
            cache_path = os.path.join(cache, destination)
        else:
            cache_path = destination

    # `cache_path` is only defined when `destination` is set, which the
    # short-circuiting `destination and ...` below relies on.
    if destination and (not force) and os.path.exists(cache_path):
        if not test:
            logging.info("Cached: (%s, %s)" % (cache, url))
        with open(cache_path, 'r') as f:
            body = f.read()
    else:
        try:
            logging.info("Downloading: %s" % url)
            if postdata:
                response = scraper.urlopen(url, 'POST', postdata)
            else:
                response = scraper.urlopen(url)
            body = response.bytes  # str(...) tries to encode as ASCII the already-decoded unicode content
        except scrapelib.HTTPError as e:
            logging.error("Error downloading %s:\n\n%s" % (url, format_exception(e)))
            return None

        # don't allow 0-byte files
        if (not body) or (not body.strip()):
            return None

        # cache content to disk
        if destination:
            write(body, cache_path)

    if not xml:
        body = unescape(body)

    return body
def write(content, destination):
    """Write `content` to the file at `destination`, replacing any old file.

    Missing parent directories are created first.
    """
    directory = os.path.dirname(destination)
    # A bare filename has no directory component, and os.makedirs('') fails,
    # so only create directories when there is something to create.
    if directory:
        mkdir_p(directory)

    # The context manager closes the file even if the write itself raises.
    with open(destination, 'w') as f:
        f.write(content)
def read(destination):
    """Return the contents of the file at `destination`, or None if no such path exists."""
    if not os.path.exists(destination):
        return None

    with open(destination) as f:
        return f.read()
# dict1 gets overwritten with anything in dict2
def merge(dict1, dict2):
    """Return a new dict holding dict1's items updated with dict2's items.

    On a key collision the value from `dict2` wins. Neither input is modified.
    """
    # Copy-then-update avoids building two intermediate item lists and does
    # not depend on items() returning concatenable lists.
    merged = dict(dict1)
    merged.update(dict2)
    return merged
# de-dupe a list while preserving order, idea taken from:
# http://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-python-whilst-preserving-order
def uniq(seq):
    """Return a list of the items of `seq` with later duplicates removed.

    The first occurrence of each item keeps its position. Items must be
    hashable.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
import os, errno
# mkdir -p in python, from:
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
    """Create directory `path` and any missing parents.

    Does nothing if the directory already exists. Raises OSError for any
    other failure, including the case where `path` exists but is not a
    directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # EEXIST is only harmless when what already exists is a directory;
        # an existing regular file at `path` is a real error.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def xpath_regex(doc, element, pattern):
    """Return `doc.xpath(...)` for `element` nodes whose text() matches `pattern`.

    The query uses the EXSLT regular-expressions extension via the `re`
    namespace prefix. `element` and `pattern` are interpolated into the
    XPath expression as-is.
    """
    expression = "//%s[re:match(text(), '%s')]" % (element, pattern)
    exslt_namespaces = {"re": "http://exslt.org/regular-expressions"}
    return doc.xpath(expression, namespaces=exslt_namespaces)
# taken from http://effbot.org/zone/re-sub.htm#unescape-html
def unescape(text):
def remove_unicode_control(str):
remove_re = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]')
return remove_re.sub('', str)
def fixup(m):
text = m.group(0)
if text[:2] == "&#":
# character reference
try:
if text[:3] == "&#x":
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
except KeyError:
pass
return text # leave as is
#TEMP FIX
#getting error in this page on byte 0xd0
#http://thomas.loc.gov/cgi-bin/query/R?r113:FLD001:S01599
try:
text = re.sub("&#?\w+;", fixup, text)
except:
text = re.sub("&#?\w+;", fixup, text.decode('latin-1'))
text = remove_unicode_control(text)
return text
def extract_bills(text, session):
    """Find bill citations in `text` and return them as "<bill>-<session>" IDs.

    Each citation (e.g. "H.R. 1234", "S. 5", "H.J.Res. 12") is lowercased,
    stripped of spaces and periods, and has "con" shortened to "c". The
    result preserves first-seen order and contains no duplicates.
    """
    citation_re = re.compile(r'((S\.|H\.)(\s?J\.|\s?R\.|\s?Con\.| ?)(\s?Res\.)*\s?\d+)', flags=re.IGNORECASE)

    bill_ids = []
    for groups in citation_re.findall(text):
        # groups[0] is the whole citation; the remaining groups are its pieces.
        citation = groups[0].lower()
        for old, new in ((" ", ''), ('.', ''), ("con", "c")):
            citation = citation.replace(old, new)

        bill_id = "%s-%s" % (citation, session)
        if bill_id not in bill_ids:
            bill_ids.append(bill_id)
    return bill_ids
# uses config values if present
def cache_dir():
    """Return the cache directory.

    This is config['output']['cache'] when the config file, its 'output'
    section and a non-empty 'cache' value are all present; otherwise "cache".
    """
    output = config.get('output', None) if config else None
    cache = output.get('cache', None) if output else None
    return cache if cache else "cache"
def test_cache_dir():
    """Return the fixed, relative path of the cache directory used by the test fixtures."""
    fixture_cache = "test/fixtures/cache"
    return fixture_cache
# uses config values if present
def data_dir():
    """Return the data directory.

    This is config['output']['data'] when the config file, its 'output'
    section and a non-empty 'data' value are all present; otherwise "data".
    """
    output = config.get('output', None) if config else None
    data = output.get('data', None) if output else None
    return data if data else "data"
# if email settings are supplied, email the text - otherwise, just log it
def admin(body):
    """Report `body` to the administrator.

    `body` may be a message string or an Exception (which is formatted
    first). The message is always logged at error level, and additionally
    emailed when the config file has an 'email' section. Any failure while
    reporting is printed rather than raised, so that reporting an error can
    never trigger another round of error reporting.
    """
    try:
        if isinstance(body, Exception):
            body = format_exception(body)

        logging.error(body)  # always print it

        email_settings = config.get('email', None) if config else None
        if email_settings:
            send_email(body)
    except Exception as exception:
        print("Exception logging message to admin, halting as to avoid loop")
        print(format_exception(exception))
def format_exception(exception):
    """Return a printable report for `exception`.

    When `exception` is the one currently being handled, the report includes
    the full traceback from sys.exc_info(). Otherwise the report is built
    from the exception object itself (plus its `__traceback__` attribute when
    it has one), instead of from whatever unrelated state sys.exc_info()
    happens to hold.

    The formatted lines are joined with newlines, as before.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()

    # sys.exc_info() only describes `exception` while its handler is running.
    # Outside of it, describe the object that was actually passed in.
    if isinstance(exception, BaseException) and exc_value is not exception:
        exc_type = type(exception)
        exc_value = exception
        exc_traceback = getattr(exception, '__traceback__', None)

    return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
# this should only be called if the settings are definitely there
def send_email(message):
    """Email `message` using the 'email' section of the config file.

    Reads these settings: 'to', 'from', 'from_name', 'subject', 'hostname',
    'starttls', 'user_name' and 'password'. The subject gets the current
    Unix timestamp appended. The SMTP connection is always closed, whether
    or not sending succeeds.
    """
    email_settings = config['email']

    # adapted from http://www.doughellmann.com/PyMOTW/smtplib/
    mime_message = MIMEText(message)
    mime_message.set_unixfrom('author')
    mime_message['To'] = email.utils.formataddr(('Recipient', email_settings['to']))
    mime_message['From'] = email.utils.formataddr((email_settings['from_name'], email_settings['from']))
    mime_message['Subject'] = "%s - %i" % (email_settings['subject'], int(time.time()))

    connection = smtplib.SMTP(email_settings['hostname'])
    try:
        connection.ehlo()
        # Upgrade to TLS only when it is both requested and offered.
        if email_settings['starttls'] and connection.has_extn('STARTTLS'):
            connection.starttls()
            connection.ehlo()  # identify again over the TLS connection

        connection.login(email_settings['user_name'], email_settings['password'])
        connection.sendmail(email_settings['from'], [email_settings['to']], mime_message.as_string())
    finally:
        connection.quit()

    logging.info("Sent email to %s" % email_settings['to'])
# Maps each bill/amendment type to a pair: (two-letter THOMAS code, display prefix).
thomas_types = {
    'hr':      ('HR', 'H.R.'),
    'hres':    ('HE', 'H.RES.'),
    'hjres':   ('HJ', 'H.J.RES.'),
    'hconres': ('HC', 'H.CON.RES.'),
    's':       ('SN', 'S.'),
    'sres':    ('SE', 'S.RES.'),
    'sjres':   ('SJ', 'S.J.RES.'),
    'sconres': ('SC', 'S.CON.RES.'),
    'hamdt':   ('HZ', 'H.AMDT.'),
    'samdt':   ('SP', 'S.AMDT.'),
}

# Reverse lookup from the two-letter code to the type, e.g. { SE: sres, ...}
thomas_types_2 = dict(
    (codes[0], type_name) for (type_name, codes) in thomas_types.items()
)
# Cached committee map to map names to IDs. This is a single flat dict that
# every call to fetch_committee_names() adds to.
committee_names = {}

# Get the mapping from THOMAS's committee names to THOMAS's committee IDs
# found on the advanced search page: committee_names[name] = ID.
# Subcommittees are keyed by the parent committee ID plus a pipe plus the
# subcommittee name, and map to the two-character subcommittee number.
def fetch_committee_names(congress, options):
    """Populate the module-level `committee_names` map for `congress`.

    `congress` may be an int or a numeric string. `options` may carry:
      - 'body': the HTML of the advanced search page (used by fixtures);
        when absent the page is downloaded through download().
      - 'test': when truthy, the progress log line is suppressed.

    Returns None; the result is the mutation of `committee_names`.
    """
    congress = int(congress)

    # Parse the THOMAS advanced search pages for the names that THOMAS uses for
    # committees on bill pages, and map those to the IDs for the committees that are
    # listed on the advanced search pages (but aren't shown on bill pages).
    if not options.get('test', False):
        logging.info("[%d] Fetching committee names..." % congress)

    # allow body to be passed in from fixtures
    if 'body' in options:
        body = options['body']
    else:
        body = download(
            "http://thomas.loc.gov/home/LegislativeData.php?&n=BSS&c=%d" % congress,
            "%s/meta/thomas_committee_names.html" % congress,
            options)

    # The loop variables get their own names so that neither the `options`
    # parameter nor the builtin `id` is overwritten.
    for chamber, option_html in re.findall('>Choose (House|Senate) Committees</option>(.*?)</select>', body, re.I | re.S):
        for name, committee_id in re.findall(r'<option value="(.*?)\{(.*?)}">', option_html, re.I | re.S):
            committee_id = str(committee_id).upper()
            # NOTE(review): the first argument below was presumably a
            # non-breaking space originally - confirm the literal survived
            # copy/paste, since replacing " " with " " does nothing.
            name = name.strip().replace(" ", " ")  # weirdness
            if committee_id.endswith("00"):
                # Map chamber + committee name to its ID, minus the 00 at the end. On bill pages,
                # committees appear as e.g. "House Finance." Except the JCSE.
                if committee_id != "JCSE00":
                    name = chamber + " " + name

                # Correct for some oddness on THOMAS (but not on Congress.gov): The House Committee
                # on House Administration appears just as "House Administration".
                if name == "House House Administration":
                    name = "House Administration"

                committee_names[name] = committee_id[0:-2]
            else:
                # map committee ID + "|" + subcommittee name to the zero-padded subcommittee numeric ID
                committee_names[committee_id[0:-2] + "|" + name] = committee_id[-2:]

    # Correct for a limited number of other ways committees appear, owing probably to the
    # committee name being changed mid-way through a Congress.
    if congress == 95:
        committee_names["House Intelligence (Select)"] = committee_names["House Intelligence (Permanent Select)"]
    if congress == 96:
        committee_names["Senate Human Resources"] = "SSHR"
    if congress == 97:
        committee_names["Senate Small Business (Select)"] = committee_names["Senate Small Business"]
    if congress == 98:
        committee_names["Senate Indian Affairs (Select)"] = committee_names["Senate Indian Affairs (Permanent Select)"]
    if congress == 100:
        committee_names["HSPO|Hoc Task Force on Presidential Pay Recommendation"] = committee_names["HSPO|Ad Hoc Task Force on Presidential Pay Recommendation"]
    if congress == 103:
        committee_names["Senate Indian Affairs (Permanent Select)"] = committee_names["Senate Indian Affairs"]
    if congress == 108:
        # This appears to be a mistake, a subcommittee appearing as a full committee. Map it to
        # the full committee for now.
        committee_names["House Antitrust (Full Committee Task Force)"] = committee_names["House Judiciary"]
def make_node(parent, tag, text, **attrs):
    """Append a new `tag` element with content `text` to `parent` and return it.

    Keyword arguments become attributes on the new element:
      - attributes whose value is None are skipped,
      - datetime values are converted with format_datetime(),
      - any "___" in an attribute name is removed, which lets callers pass
        names that would otherwise clash with Python keywords (e.g. ___class).
    """
    node = etree.Element(tag)
    parent.append(node)
    node.text = text

    for attr_name, attr_value in attrs.items():
        if attr_value is None:
            continue
        if isinstance(attr_value, datetime.datetime):
            attr_value = format_datetime(attr_value)
        node.set(attr_name.replace("___", ""), attr_value)

    return node
# Correct mistakes on THOMAS
def thomas_corrections(thomas_id):
    """Return `thomas_id` with known-wrong THOMAS IDs replaced by the right one.

    IDs without a known correction are returned unchanged.
    """
    # C.A. Dutch Ruppersberger
    return "01728" if thomas_id == "02188" else thomas_id
def get_govtrack_person_id(source_id_type, source_id):
    """Map a legislator ID of another type to the matching GovTrack ID.

    `source_id_type` is one of 'bioguide', 'lis' or 'thomas' (the only ID
    types that are indexed); `source_id` is the ID of that type.

    On the first call the congress-legislators repository is cloned (if
    missing) and updated under cache/congress-legislators, and an in-memory
    map is built from its historical and current YAML files. Because loading
    the YAML is slow, each file's map is also pickled in the cache directory
    and reused for as long as the pickle is newer than the YAML file.

    Raises UnmatchedIdentifer when no GovTrack ID is known for the given ID.
    """
    global govtrack_person_id_map

    import os, os.path, pickle, yaml

    # On the first call to this function...
    if not govtrack_person_id_map:
        # Clone the congress-legislators repo if we don't have it.
        if not os.path.exists("cache/congress-legislators"):
            logging.warning("Cloning the congress-legislators repo into the cache directory...")
            os.system("git clone -q --depth 1 https://github.com/unitedstates/congress-legislators cache/congress-legislators")

        # Update the repo so we have the latest.
        logging.warning("Updating the congress-legislators repo...")
        os.system("cd cache/congress-legislators; git fetch -pq")  # these two == git pull, but git pull ignores -q on the merge part so is less quiet
        os.system("cd cache/congress-legislators; git merge --ff-only -q origin/master")

        govtrack_person_id_map = {}
        for fn in ('legislators-historical', 'legislators-current'):
            # Check if the pickled file is older than the YAML files.
            yaml_fn = "cache/congress-legislators/%s.yaml" % fn
            cachefn = os.path.join(cache_dir(), fn + '-id-map')
            if os.path.exists(cachefn) and os.stat(cachefn).st_mtime > os.stat(yaml_fn).st_mtime:
                # Pickled file is newer, so use it. The file modes are kept
                # as they were so that caches written earlier stay readable;
                # only the missing close() is fixed here.
                with open(cachefn) as cache_file:
                    m = pickle.load(cache_file)
            else:
                # Make a new mapping. Load the YAML file and create
                # a master map from (id-type, id) to GovTrack ID,
                # where id-type is e.g. thomas, lis, bioguide. Then
                # save it to a pickled file.
                try:
                    from yaml import CLoader as Loader
                except ImportError:
                    from yaml import Loader
                logging.warning("Making %s ID map..." % fn)
                m = {}
                with open(yaml_fn) as yaml_file:
                    legislators = yaml.load(yaml_file, Loader=Loader)
                for moc in legislators:
                    if "govtrack" in moc["id"]:
                        for k, v in moc["id"].items():
                            if k in ('bioguide', 'lis', 'thomas'):
                                m[(k, v)] = moc["id"]["govtrack"]
                # Closing the file explicitly guarantees the pickle is fully
                # flushed to disk before it is relied upon.
                with open(cachefn, "w") as cache_file:
                    pickle.dump(m, cache_file)

            # Combine the mappings from the historical and current files.
            govtrack_person_id_map.update(m)

    # Now do the lookup.
    if (source_id_type, source_id) not in govtrack_person_id_map:
        see_also = ""
        if source_id_type == "thomas":
            # str() so that a non-string ID cannot turn this into a TypeError.
            see_also = "http://beta.congress.gov/member/xxx/" + str(source_id)
        logging.error("GovTrack ID not known for %s %s. (%s)" % (source_id_type, str(source_id), see_also))
        raise UnmatchedIdentifer(source_id_type, source_id, see_also)

    return govtrack_person_id_map[(source_id_type, source_id)]