import os, errno, sys, traceback
import time
import re, htmlentitydefs
from dateutil import tz
import yaml
from pytz import timezone
import datetime, time
from lxml import html
import scrapelib
import pprint
import logging

import smtplib
import email.utils
from email.mime.text import MIMEText
import getpass


# read in an opt-in config file for changing directories and supplying email settings
# returns None if it's not there, and this should always be handled gracefully
path = "config.yml"
if os.path.exists(path):
    # NOTE(review): yaml.load() without an explicit Loader can construct
    # arbitrary Python objects, and newer PyYAML releases require the Loader
    # argument. config.yml is a local file, but consider yaml.safe_load().
    with open(path, 'r') as config_file:
        config = yaml.load(config_file)
else:
    config = None


# scraper should be instantiated at class-load time, so that it can rate limit appropriately
scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False, retry_attempts=3)


# Patterns are compiled once at import time instead of on every call.
_BILL_ID_RE = re.compile(r"^([a-z]+)(\d+)-(\d+)$")
_ENTITY_RE = re.compile(r"&#?\w+;")
_CONTROL_CHARS_RE = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]')
_BILL_CITATION_RE = re.compile(
    r'((S\.|H\.)(\s?J\.|\s?R\.|\s?Con\.| ?)(\s?Res\.)*\s?\d+)',
    flags=re.IGNORECASE)


def format_datetime(obj):
    """Return obj as an ISO 8601 string stamped with US/Eastern time.

    datetime objects lose their microseconds and are interpreted as Eastern
    wall-clock time (any tzinfo already attached is discarded, not converted).
    str objects are returned unchanged; anything else yields None.
    """
    if isinstance(obj, datetime.datetime):
        # pytz zones have to be attached with localize(): handing one to
        # replace(tzinfo=...) uses the zone's first recorded offset (local
        # mean time) rather than the EST/EDT offset in force on that date.
        naive = obj.replace(microsecond=0, tzinfo=None)
        return timezone("US/Eastern").localize(naive).isoformat()
    elif isinstance(obj, str):
        return obj
    else:
        return None


def EST():
    """Return the dateutil tzinfo for America/New_York."""
    return tz.gettz("America/New_York")


def in_est(dt):
    """Convert dt to America/New_York time via dt.astimezone()."""
    return dt.astimezone(EST())


def current_congress(year=None):
    """Return the Congress number for year (defaults to the current year).

    Computed as ((year + 1) // 2) - 894, e.g. 2013 -> 113.
    """
    if not year:
        year = datetime.datetime.now().year
    # floor division keeps the result an integer on every Python version
    return ((year + 1) // 2) - 894


def split_bill_id(bill_id):
    """Split an id such as "hr1234-112" into ("hr", "1234", "112").

    Raises AttributeError when bill_id does not have that shape.
    """
    return _BILL_ID_RE.match(bill_id).groups()


def download(url, destination, force=False, options=None):
    """Return the unescaped body of url, using an on-disk cache.

    destination is the cache path relative to the cache directory (the test
    fixture cache when options['test'] is set). A cached copy is used unless
    force is true; otherwise the URL is fetched and the body written to the
    cache. Returns None when the download fails or the body is blank.
    """
    if options is None:
        options = {}

    test = options.get('test', False)
    cache = test_cache_dir() if test else cache_dir()
    cache_path = os.path.join(cache, destination)

    if not force and os.path.exists(cache_path):
        if not test:
            logging.info("Cached: (%s, %s)" % (cache, url))
        with open(cache_path, 'r') as f:
            body = f.read()
    else:
        try:
            logging.info("Downloading: %s" % url)
            response = scraper.urlopen(url)
            body = str(response)
        except scrapelib.HTTPError as e:
            logging.error("Error downloading %s:\n\n%s" % (url, format_exception(e)))
            return None

        # don't allow 0-byte files
        if (not body) or (not body.strip()):
            return None

        # cache content to disk
        write(body, cache_path)

    return unescape(body)


def write(content, destination):
    """Write content to destination, creating parent directories as needed."""
    directory = os.path.dirname(destination)
    # a bare filename has no directory part, and os.makedirs('') would raise
    if directory:
        mkdir_p(directory)
    with open(destination, 'w') as f:
        f.write(content)


# de-dupe a list, taken from:
# http://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-python-whilst-preserving-order
def uniq(seq):
    """Return the items of seq with duplicates removed, first occurrence kept."""
    seen = set()
    seen_add = seen.add
    # seen_add() returns None, so "not seen_add(x)" records x and stays true
    return [x for x in seq if x not in seen and not seen_add(x)]


import os, errno

# mkdir -p in python, from:
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
    """Create path and any missing parents; an existing directory is fine.

    Raises OSError for any other failure, including path existing as a
    non-directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # only an already-existing *directory* counts as success
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def xpath_regex(doc, element, pattern):
    """Return the element nodes in doc whose text() matches pattern.

    Uses the EXSLT regular-expressions extension. pattern is interpolated
    into the XPath string between single quotes without escaping, so it must
    not itself contain a single quote.
    """
    return doc.xpath(
        "//%s[re:match(text(), '%s')]" % (element, pattern),
        namespaces={"re": "http://exslt.org/regular-expressions"})


# taken from http://effbot.org/zone/re-sub.htm#unescape-html
def unescape(text):
    """Replace HTML entities in text with characters and drop control chars.

    Numeric (&#38; / &#x26;) and named (&amp;) references are decoded;
    references that cannot be decoded are left as they are. The control
    characters matched by _CONTROL_CHARS_RE are then removed.
    """
    def fixup(m):
        entity = m.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[:3] == "&#x":
                    return unichr(int(entity[3:-1], 16))
                else:
                    return unichr(int(entity[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                entity = unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    text = _ENTITY_RE.sub(fixup, text)
    return _CONTROL_CHARS_RE.sub('', text)


def extract_bills(text, session):
    """Return the bill ids cited in text, in order of first appearance.

    Citations such as "H.R. 1234" or "S. Con. Res. 5" become "hr1234-<session>"
    and "scres5-<session>". Each id appears once.
    """
    bill_ids = []
    for match in _BILL_CITATION_RE.findall(text):
        # match[0] is the whole citation. The pattern allows any whitespace
        # between its parts, so strip all of it (not just spaces) with the dots.
        compact = re.sub(r"[\s.]", "", match[0].lower()).replace("con", "c")
        bill_ids.append("%s-%s" % (compact, session))
    return uniq(bill_ids)


def _output_dir(key, default):
    """Return config['output'][key] when it is set and truthy, else default."""
    output = config.get('output', None) if config else None
    configured = output.get(key, None) if output else None
    return configured or default


# uses config values if present
def cache_dir():
    """Return the cache directory: config output.cache, or "cache"."""
    return _output_dir('cache', "cache")


def test_cache_dir():
    """Return the fixture cache directory used when running in test mode."""
    return "test/fixtures/cache"


# uses config values if present
def data_dir():
    """Return the data directory: config output.data, or "data"."""
    return _output_dir('data', "data")


# if email settings are supplied, email the text - otherwise, just print it
def admin(body):
    """Log body as an error and email it when email settings are configured.

    body may be a message or an Exception (which is formatted first). Any
    failure while reporting is printed rather than raised, to avoid a loop.
    """
    try:
        if isinstance(body, Exception):
            body = format_exception(body)

        logging.error(body)  # always print it

        if config:
            details = config.get('email', None)
            if details:
                send_email(body)

    except Exception as exception:
        print("Exception logging message to admin, halting as to avoid loop")
        print(format_exception(exception))


def format_exception(exception):
    """Return exception formatted as text, with its traceback when available.

    When exception is the one currently being handled, the active traceback
    is used. Otherwise (e.g. an exception object passed around outside an
    except block) the exception itself is formatted, with its __traceback__
    if it has one, instead of whatever sys.exc_info() happens to hold.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    if exception is not None and exc_value is not exception:
        exc_type = type(exception)
        exc_value = exception
        exc_traceback = getattr(exception, '__traceback__', None)
    return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))


# this should only be called if the settings are definitely there
def send_email(message):
    """Send message as a plain-text email using config['email'].

    Reads the keys to, from, from_name, subject, hostname, starttls,
    user_name and password; a missing key raises KeyError.
    """
    settings = config['email']

    # adapted from http://www.doughellmann.com/PyMOTW/smtplib/
    msg = MIMEText(message)
    msg.set_unixfrom('author')
    msg['To'] = email.utils.formataddr(('Recipient', settings['to']))
    msg['From'] = email.utils.formataddr((settings['from_name'], settings['from']))
    # the timestamp makes every subject line distinct
    msg['Subject'] = "%s - %i" % (settings['subject'], int(time.time()))

    server = smtplib.SMTP(settings['hostname'])
    try:
        server.ehlo()
        if settings['starttls'] and server.has_extn('STARTTLS'):
            server.starttls()
            # identify again after upgrading the connection
            server.ehlo()

        server.login(settings['user_name'], settings['password'])
        server.sendmail(settings['from'], [settings['to']], msg.as_string())
    finally:
        server.quit()

    logging.info("Sent email to %s" % settings['to'])


# bill type -> (two-letter code, citation prefix)
# NOTE(review): presumably the codes and prefixes THOMAS uses - confirm against callers
thomas_types = {
    'hr': ('HR', 'H.R.'),
    'hres': ('HE', 'H.RES.'),
    'hjres': ('HJ', 'H.J.RES.'),
    'hconres': ('HC', 'H.CON.RES.'),
    's': ('SN', 'S.'),
    'sres': ('SE', 'S.RES.'),
    'sjres': ('SJ', 'S.J.RES.'),
    'sconres': ('SC', 'S.CON.RES.'),
}

# cached committee map to map names to IDs
committee_names = {}

# get the mapping from THOMAS's committee names to THOMAS's committee IDs
# found on the advanced search page.
committee_names[congress][name] = ID # with subcommittee names as the committee name plus a pipe plus the subcommittee # name. def fetch_committee_names(congress, options): congress = int(congress) # Parse the THOMAS advanced search pages for the names that THOMAS uses for # committees on bill pages, and map those to the IDs for the committees that are # listed on the advanced search pages (but aren't shown on bill pages). if not options.get('test', False): log("[%d] Fetching committee names..." % congress) # allow body to be passed in from fixtures if options.has_key('body'): body = options['body'] else: body = download( "http://thomas.loc.gov/home/LegislativeData.php?&n=BSS&c=%d" % congress, "%s/meta/thomas_committee_names.html" % congress, options.get('force', False), options) for chamber, options in re.findall('>Choose (House|Senate) Committees(.*?)', body, re.I | re.S): for name, id in re.findall(r'