mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-19 18:05:51 -05:00
333 lines
10 KiB
Python
333 lines
10 KiB
Python
# Helpful functions for finding data about members and committees
|
|
|
|
CURRENT_CONGRESS = 113
|
|
|
|
import urllib2
|
|
import os, errno, sys, traceback
|
|
import re, htmlentitydefs
|
|
import pprint
|
|
from datetime import datetime
|
|
import time
|
|
|
|
import lxml.html # for meta redirect parsing
|
|
|
|
import yaml
|
|
|
|
import smtplib
|
|
import email.utils
|
|
from email.mime.text import MIMEText
|
|
import getpass
|
|
|
|
|
|
# read in an opt-in config file for supplying email settings
|
|
# returns None if it's not there, and this should always be handled gracefully
|
|
path = "email/config.yml"
|
|
if os.path.exists(path):
|
|
email_settings = yaml.load(open(path, 'r')).get('email', None)
|
|
else:
|
|
email_settings = None
|
|
|
|
|
|
def parse_date(date):
|
|
return datetime.strptime(date, "%Y-%m-%d").date()
|
|
|
|
def log(object):
|
|
if isinstance(object, (str, unicode)):
|
|
print object
|
|
else:
|
|
pprint(object)
|
|
|
|
def uniq(seq):
|
|
seen = set()
|
|
seen_add = seen.add
|
|
return [ x for x in seq if x not in seen and not seen_add(x)]
|
|
|
|
def flags():
|
|
options = {}
|
|
for arg in sys.argv[1:]:
|
|
if arg.startswith("--"):
|
|
|
|
if "=" in arg:
|
|
key, value = arg.split('=')
|
|
else:
|
|
key, value = arg, True
|
|
|
|
key = key.split("--")[1]
|
|
if value == 'True': value = True
|
|
elif value == 'False': value = False
|
|
options[key.lower()] = value
|
|
return options
|
|
|
|
##### Data management
|
|
|
|
def data_dir():
|
|
return ".."
|
|
|
|
def load_data(path):
|
|
return yaml_load(os.path.join(data_dir(), path))
|
|
|
|
def save_data(data, path):
|
|
return yaml_dump(data, os.path.join(data_dir(), path))
|
|
|
|
|
|
##### Downloading
|
|
|
|
import scrapelib
|
|
scraper = scrapelib.Scraper(requests_per_minute=60, follow_robots=False, retry_attempts=3)
|
|
scraper.user_agent = "github.com/unitedstates/congress-legislators"
|
|
|
|
def cache_dir():
|
|
return "cache"
|
|
|
|
def download(url, destination=None, force=False, options=None):
|
|
if not destination and not force:
|
|
raise TypeError("destination must not be None if force is False.")
|
|
|
|
if not options:
|
|
options = {}
|
|
|
|
# get the path to cache the file, or None if destination is None
|
|
cache = os.path.join(cache_dir(), destination) if destination else None
|
|
|
|
if not force and os.path.exists(cache):
|
|
if options.get('debug', False):
|
|
log("Cached: (%s, %s)" % (cache, url))
|
|
|
|
with open(cache, 'r') as f:
|
|
body = f.read()
|
|
else:
|
|
try:
|
|
if options.get('debug', False):
|
|
log("Downloading: %s" % url)
|
|
|
|
if options.get('urllib', False):
|
|
response = urllib2.urlopen(url)
|
|
body = response.read()
|
|
else:
|
|
response = scraper.urlopen(url)
|
|
body = response.encode('utf-8')
|
|
except scrapelib.HTTPError as e:
|
|
log("Error downloading %s" % url)
|
|
return None
|
|
|
|
# don't allow 0-byte files
|
|
if (not body) or (not body.strip()):
|
|
return None
|
|
|
|
# the downloader can optionally parse the body as HTML
|
|
# and look for meta redirects. a bit expensive, so opt-in.
|
|
if options.get('check_redirects', False):
|
|
html_tree = lxml.html.fromstring(body)
|
|
meta = html_tree.xpath("//meta[translate(@http-equiv, 'REFSH', 'refsh') = 'refresh']/@content")
|
|
if meta:
|
|
attr = meta[0]
|
|
wait, text = attr.split(";")
|
|
if text.lower().startswith("url="):
|
|
|
|
new_url = text[4:]
|
|
print "Found redirect, downloading %s instead.." % new_url
|
|
|
|
options.pop('check_redirects')
|
|
body = download(new_url, None, True, options)
|
|
|
|
# cache content to disk
|
|
if cache: write(body, cache)
|
|
|
|
|
|
return body
|
|
|
|
def write(content, destination):
|
|
mkdir_p(os.path.dirname(destination))
|
|
f = open(destination, 'w')
|
|
f.write(content)
|
|
f.close()
|
|
|
|
# mkdir -p in python, from:
|
|
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
|
|
def mkdir_p(path):
|
|
try:
|
|
os.makedirs(path)
|
|
except OSError as exc: # Python >2.5
|
|
if exc.errno == errno.EEXIST:
|
|
pass
|
|
else:
|
|
raise
|
|
|
|
def format_exception(exception):
|
|
exc_type, exc_value, exc_traceback = sys.exc_info()
|
|
return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
|
|
|
|
# taken from http://effbot.org/zone/re-sub.htm#unescape-html
|
|
def unescape(text):
|
|
|
|
def remove_unicode_control(str):
|
|
remove_re = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]')
|
|
return remove_re.sub('', str)
|
|
|
|
def fixup(m):
|
|
text = m.group(0)
|
|
if text[:2] == "&#":
|
|
# character reference
|
|
try:
|
|
if text[:3] == "&#x":
|
|
return unichr(int(text[3:-1], 16))
|
|
else:
|
|
return unichr(int(text[2:-1]))
|
|
except ValueError:
|
|
pass
|
|
else:
|
|
# named entity
|
|
try:
|
|
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
|
|
except KeyError:
|
|
pass
|
|
return text # leave as is
|
|
|
|
text = re.sub("&#?\w+;", fixup, text)
|
|
text = remove_unicode_control(text)
|
|
return text
|
|
|
|
##### YAML serialization ######
|
|
|
|
# In order to preserve the order of attributes, YAML must be
|
|
# hooked to load mappings as OrderedDicts. Adapted from:
|
|
# https://gist.github.com/317164
|
|
|
|
try:
|
|
from yaml import CSafeLoader as Loader, CDumper as Dumper
|
|
except ImportError:
|
|
from yaml import SafeLoader as Loader, Dumper
|
|
from collections import OrderedDict
|
|
|
|
def construct_odict(load, node):
|
|
omap = OrderedDict()
|
|
yield omap
|
|
if not isinstance(node, yaml.MappingNode):
|
|
raise yaml.constructor.ConstructorError(
|
|
"while constructing an ordered map",
|
|
node.start_mark,
|
|
"expected a map, but found %s" % node.id, node.start_mark
|
|
)
|
|
for key, value in node.value:
|
|
key = load.construct_object(key)
|
|
value = load.construct_object(value)
|
|
omap[key] = value
|
|
|
|
Loader.add_constructor(u'tag:yaml.org,2002:map', construct_odict)
|
|
def ordered_dict_serializer(self, data):
|
|
return self.represent_mapping('tag:yaml.org,2002:map', data.items())
|
|
Dumper.add_representer(OrderedDict, ordered_dict_serializer)
|
|
|
|
# Likewise, when we store unicode objects make sure we don't write
|
|
# them with weird YAML tags indicating the Python data type. The
|
|
# standard string type is fine. We should do this:
|
|
# Dumper.add_representer(unicode, lambda dumper, value: dumper.represent_scalar(u'tag:yaml.org,2002:str', value))
|
|
#
|
|
# However, the standard PyYAML representer for strings does something
|
|
# weird: if a value cannot be parsed as an integer quotes are omitted.
|
|
#
|
|
# This is incredibly odd when the value is an integer with a leading
|
|
# zero. These values are typically parsed as octal integers, meaning
|
|
# quotes would normally be required (that's good). But when the value
|
|
# has an '8' or '9' in it, this would make it an invalid octal number
|
|
# and so quotes would no longer be required (that's confusing).
|
|
# We will override str and unicode output to choose the quotation
|
|
# style with our own logic. (According to PyYAML, style can be one of
|
|
# the empty string, ', ", |, or >, or None to, presumably, choose
|
|
# automatically.
|
|
def our_string_representer(dumper, value):
|
|
# If it looks like an octal number, force '-quote style.
|
|
style = None
|
|
if re.match(r"^0\d*$", value): style = "'"
|
|
return dumper.represent_scalar(u'tag:yaml.org,2002:str', value, style=style)
|
|
Dumper.add_representer(str, our_string_representer)
|
|
Dumper.add_representer(unicode, our_string_representer)
|
|
|
|
# Add a representer for nulls too. YAML accepts "~" for None, but the
|
|
# default output converts that to "null".
|
|
Dumper.add_representer(type(None), lambda dumper, value : \
|
|
dumper.represent_scalar(u'tag:yaml.org,2002:null', u"~"))
|
|
|
|
# Apply some common settings for loading/dumping YAML and cache the
|
|
# data in pickled format which is a LOT faster than YAML.
|
|
|
|
def yaml_load(path, use_cache=True):
|
|
# Loading YAML is ridiculously slow, so cache the YAML data
|
|
# in a pickled file which loads much faster.
|
|
|
|
# Check if the .pickle file exists and a hash stored inside it
|
|
# matches the hash of the YAML file, and if so unpickle it.
|
|
import cPickle as pickle, os.path, hashlib
|
|
h = hashlib.sha1(open(path).read()).hexdigest()
|
|
if use_cache and os.path.exists(path + ".pickle"):
|
|
|
|
try:
|
|
store = pickle.load(open(path + ".pickle"))
|
|
if store["hash"] == h:
|
|
return store["data"]
|
|
except EOFError:
|
|
pass # bad .pickle file, pretend it doesn't exist
|
|
|
|
# No cached pickled data exists, so load the YAML file.
|
|
data = yaml.load(open(path), Loader=Loader)
|
|
|
|
# Store in a pickled file for fast access later.
|
|
pickle.dump({ "hash": h, "data": data }, open(path+".pickle", "w"))
|
|
|
|
return data
|
|
|
|
def yaml_dump(data, path):
|
|
yaml.dump(data, open(path, "w"), default_flow_style=False, allow_unicode=True, Dumper=Dumper)
|
|
|
|
# Store in a pickled file for fast access later.
|
|
import cPickle as pickle, hashlib
|
|
h = hashlib.sha1(open(path).read()).hexdigest()
|
|
pickle.dump({ "hash": h, "data": data }, open(path+".pickle", "w"))
|
|
|
|
def pprint(data):
|
|
yaml.dump(data, sys.stdout, default_flow_style=False, allow_unicode=True)
|
|
|
|
|
|
# if email settings are supplied, email the text - otherwise, just print it
|
|
def admin(body):
|
|
try:
|
|
if isinstance(body, Exception):
|
|
body = format_exception(body)
|
|
|
|
print body # always print it
|
|
|
|
if email_settings:
|
|
send_email(body)
|
|
|
|
except Exception as exception:
|
|
print "Exception logging message to admin, halting as to avoid loop"
|
|
print format_exception(exception)
|
|
|
|
def format_exception(exception):
|
|
exc_type, exc_value, exc_traceback = sys.exc_info()
|
|
return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
|
|
|
|
# this should only be called if the settings are definitely there
|
|
def send_email(message):
|
|
print "Sending email to %s..." % email_settings['to']
|
|
|
|
# adapted from http://www.doughellmann.com/PyMOTW/smtplib/
|
|
msg = MIMEText(message)
|
|
msg.set_unixfrom('author')
|
|
msg['To'] = email.utils.formataddr(('Recipient', email_settings['to']))
|
|
msg['From'] = email.utils.formataddr((email_settings['from_name'], email_settings['from']))
|
|
msg['Subject'] = "%s - %i" % (email_settings['subject'], int(time.time()))
|
|
|
|
server = smtplib.SMTP(email_settings['hostname'])
|
|
try:
|
|
server.ehlo()
|
|
if email_settings['starttls'] and server.has_extn('STARTTLS'):
|
|
server.starttls()
|
|
server.ehlo()
|
|
|
|
server.login(email_settings['user_name'], email_settings['password'])
|
|
server.sendmail(email_settings['from'], [email_settings['to']], msg.as_string())
|
|
finally:
|
|
server.quit()
|
|
|
|
print "Sent email to %s." % email_settings['to'] |