import os, errno, sys, traceback
import re, htmlentitydefs
import datetime, time
import logging
import pprint
import getpass

from dateutil import tz
from pytz import timezone
from lxml import html

import yaml
import scrapelib

import smtplib
import email.utils
from email.mime.text import MIMEText

# read in an opt-in config file for changing directories and supplying email settings.
# config is None if the file is absent, and callers should always handle that gracefully.
path = "config.yml"
if os.path.exists(path):
    config = yaml.load(open(path, 'r'))
else:
    config = None

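# For reference, an illustrative config.yml covering the keys this module reads
# (output.cache, output.data, and the email settings used by send_email); the
# exact values here are assumptions:
#
#   output:
#     cache: cache
#     data: data
#   email:
#     to: admin@example.com
#     from: bot@example.com
#     from_name: congress scraper
#     subject: scraper error
#     hostname: smtp.example.com
#     starttls: true
#     user_name: bot
#     password: secret
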
# the scraper is instantiated at module-load time, so that it can rate limit appropriately
scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False, retry_attempts=3)

def format_datetime(obj):
    if isinstance(obj, datetime.datetime):
        # use pytz's localize() rather than replace(tzinfo=...), which would attach
        # the zone's raw LMT offset instead of the correct EST/EDT offset
        return timezone("US/Eastern").localize(obj.replace(microsecond=0, tzinfo=None)).isoformat()
    elif isinstance(obj, str):
        return obj
    else:
        return None

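# Example (illustrative): in July, US/Eastern observes daylight time, so
#   format_datetime(datetime.datetime(2012, 7, 4, 12, 30, 0))
# returns "2012-07-04T12:30:00-04:00".
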
def EST():
    return tz.gettz("America/New_York")

def in_est(dt):
    return dt.astimezone(EST())

def current_congress(year=None):
    if not year:
        year = datetime.datetime.now().year
    # each Congress covers two calendar years; the 1st Congress began in 1789
    return ((year + 1) // 2) - 894

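# Example: current_congress(2012) returns 112, since the 112th Congress
# covers 2011-2012.
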
def split_bill_id(bill_id):
    return re.match(r"^([a-z]+)(\d+)-(\d+)$", bill_id).groups()

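# Example: split_bill_id("hr3590-111") returns ("hr", "3590", "111"),
# i.e. (bill type, bill number, congress number).
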
def download(url, destination, force=False, options={}):
    test = options.get('test', False)

    if test:
        cache = test_cache_dir()
    else:
        cache = cache_dir()

    cache_path = os.path.join(cache, destination)

    if not force and os.path.exists(cache_path):
        if not test: logging.info("Cached: (%s, %s)" % (cache, url))
        with open(cache_path, 'r') as f:
            body = f.read()
    else:
        try:
            logging.info("Downloading: %s" % url)
            response = scraper.urlopen(url)
            body = str(response)
        except scrapelib.HTTPError as e:
            logging.error("Error downloading %s:\n\n%s" % (url, format_exception(e)))
            return None

        # don't allow 0-byte files
        if (not body) or (not body.strip()):
            return None

        # cache content to disk
        write(body, cache_path)

    return unescape(body)

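# Example (illustrative): fetch a page and cache it under a relative path;
# subsequent calls without force=True read the cached copy instead.
#   body = download(
#       "http://thomas.loc.gov/home/LegislativeData.php?&n=BSS&c=112",
#       "112/meta/thomas_committee_names.html")
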
def write(content, destination):
    mkdir_p(os.path.dirname(destination))
    with open(destination, 'w') as f:
        f.write(content)

# de-dupe a list while preserving order, taken from:
# http://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-python-whilst-preserving-order
def uniq(seq):
    seen = set()
    seen_add = seen.add
    # seen_add() returns None, so "not seen_add(x)" is always True and simply
    # records x as seen; binding the method locally avoids a lookup per element
    return [x for x in seq if x not in seen and not seen_add(x)]

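# Example: uniq(["hr1-112", "s2-112", "hr1-112"]) returns ["hr1-112", "s2-112"].
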
# mkdir -p in python, from:
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST:
            pass
        else:
            raise

def xpath_regex(doc, element, pattern):
    return doc.xpath(
        "//%s[re:match(text(), '%s')]" % (element, pattern),
        namespaces={"re": "http://exslt.org/regular-expressions"})

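# Example (illustrative): find <p> elements whose text matches a bill number,
# using lxml's support for EXSLT regular expressions.
#   paragraphs = xpath_regex(doc, "p", r"H\.R\. \d+")
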
# unescape HTML entities, taken from http://effbot.org/zone/re-sub.htm#unescape-html
def unescape(text):

    def remove_unicode_control(s):
        remove_re = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]')
        return remove_re.sub('', s)

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is

    text = re.sub(r"&#?\w+;", fixup, text)
    text = remove_unicode_control(text)
    return text

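# Example: unescape("AT&amp;T says &#x27;hi&#x27;") returns u"AT&T says 'hi'",
# decoding both named and numeric entities.
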
def extract_bills(text, session):
    bill_ids = []

    p = re.compile(r'((S\.|H\.)(\s?J\.|\s?R\.|\s?Con\.| ?)(\s?Res\.)*\s?\d+)', flags=re.IGNORECASE)
    bill_matches = p.findall(text)

    if bill_matches:
        for b in bill_matches:
            bill_text = "%s-%s" % (b[0].lower().replace(" ", '').replace('.', '').replace("con", "c"), session)
            if bill_text not in bill_ids:
                bill_ids.append(bill_text)

    return bill_ids

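# Example: extract_bills("H.R. 3590 and S. 1813 were considered.", 111)
# returns ['hr3590-111', 's1813-111'].
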
# uses config values if present
def cache_dir():
    cache = None

    if config:
        output = config.get('output', None)
        if output:
            cache = output.get('cache', None)

    if not cache:
        cache = "cache"

    return cache

def test_cache_dir():
    return "test/fixtures/cache"

# uses config values if present
def data_dir():
    data = None

    if config:
        output = config.get('output', None)
        if output:
            data = output.get('data', None)

    if not data:
        data = "data"

    return data

# if email settings are supplied, email the text - otherwise, just log it
def admin(body):
    try:
        if isinstance(body, Exception):
            body = format_exception(body)

        logging.error(body)  # always log it

        if config:
            details = config.get('email', None)
            if details:
                send_email(body)

    except Exception as exception:
        print "Exception logging message to admin, halting so as to avoid a loop"
        print format_exception(exception)

def format_exception(exception):
    # formats the exception currently being handled; traceback.format_exception
    # returns newline-terminated lines, so join them with the empty string
    exc_type, exc_value, exc_traceback = sys.exc_info()
    return "".join(traceback.format_exception(exc_type, exc_value, exc_traceback))

# this should only be called if the settings are definitely there
def send_email(message):
    settings = config['email']

    # adapted from http://www.doughellmann.com/PyMOTW/smtplib/
    msg = MIMEText(message)
    msg.set_unixfrom('author')
    msg['To'] = email.utils.formataddr(('Recipient', settings['to']))
    msg['From'] = email.utils.formataddr((settings['from_name'], settings['from']))
    msg['Subject'] = "%s - %i" % (settings['subject'], int(time.time()))

    server = smtplib.SMTP(settings['hostname'])
    try:
        server.ehlo()
        if settings['starttls'] and server.has_extn('STARTTLS'):
            server.starttls()
            server.ehlo()

        server.login(settings['user_name'], settings['password'])
        server.sendmail(settings['from'], [settings['to']], msg.as_string())
    finally:
        server.quit()

    logging.info("Sent email to %s" % settings['to'])

# maps each bill type to (two-letter type code, THOMAS-style display abbreviation)
thomas_types = {
    'hr': ('HR', 'H.R.'),
    'hres': ('HE', 'H.RES.'),
    'hjres': ('HJ', 'H.J.RES.'),
    'hconres': ('HC', 'H.CON.RES.'),
    's': ('SN', 'S.'),
    'sres': ('SE', 'S.RES.'),
    'sjres': ('SJ', 'S.J.RES.'),
    'sconres': ('SC', 'S.CON.RES.'),
}

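# Example: thomas_types[split_bill_id("hconres10-112")[0]][1] returns
# 'H.CON.RES.', turning a bill id back into its printed abbreviation.
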
# cached committee map to map names to IDs
committee_names = {}

# get the mapping from THOMAS's committee names to THOMAS's committee IDs, as
# found on the advanced search page: committee_names[name] = ID, with
# subcommittees keyed as the parent committee's ID plus a pipe plus the
# subcommittee name.
def fetch_committee_names(congress, options):
    congress = int(congress)

    # Parse the THOMAS advanced search pages for the names that THOMAS uses for
    # committees on bill pages, and map those to the IDs for the committees that are
    # listed on the advanced search pages (but aren't shown on bill pages).
    if not options.get('test', False): logging.info("[%d] Fetching committee names..." % congress)

    # allow body to be passed in from fixtures
    if 'body' in options:
        body = options['body']
    else:
        body = download(
            "http://thomas.loc.gov/home/LegislativeData.php?&n=BSS&c=%d" % congress,
            "%s/meta/thomas_committee_names.html" % congress,
            options.get('force', False), options)

    # chamber_body holds the <option> markup for each chamber (named so it
    # doesn't shadow the options parameter)
    for chamber, chamber_body in re.findall('>Choose (House|Senate) Committees</option>(.*?)</select>', body, re.I | re.S):
        for name, id in re.findall(r'<option value="(.*?)\{(.*?)}">', chamber_body, re.I | re.S):
            id = str(id).upper()
            name = name.strip().replace("  ", " ")  # weirdness
            if id.endswith("00"):
                # Map chamber + committee name to its ID, minus the 00 at the end. On bill pages,
                # committees appear as e.g. "House Finance." Except the JCSE.
                if id != "JCSE00":
                    name = chamber + " " + name

                # Correct for some oddness on THOMAS (but not on Congress.gov): the House Committee
                # on House Administration appears just as "House Administration".
                if name == "House House Administration": name = "House Administration"

                committee_names[name] = id[0:-2]

            else:
                # map committee ID + "|" + subcommittee name to the zero-padded subcommittee numeric ID
                committee_names[id[0:-2] + "|" + name] = id[-2:]

    # Correct for a limited number of other ways committees appear, owing probably to the
    # committee name being changed mid-way through a Congress.
    if congress == 95:
        committee_names["House Intelligence (Select)"] = committee_names["House Intelligence (Permanent Select)"]
    if congress == 96:
        committee_names["Senate Human Resources"] = "SSHR"
    if congress == 97:
        committee_names["Senate Small Business (Select)"] = committee_names["Senate Small Business"]
    if congress == 98:
        committee_names["Senate Indian Affairs (Select)"] = committee_names["Senate Indian Affairs (Permanent Select)"]
    if congress == 100:
        committee_names["HSPO|Hoc Task Force on Presidential Pay Recommendation"] = committee_names["HSPO|Ad Hoc Task Force on Presidential Pay Recommendation"]
    if congress == 103:
        committee_names["Senate Indian Affairs (Permanent Select)"] = committee_names["Senate Indian Affairs"]
    if congress == 108:
        # This appears to be a mistake, a subcommittee appearing as a full committee. Map it to
        # the full committee for now.
        committee_names["House Antitrust (Full Committee Task Force)"] = committee_names["House Judiciary"]

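# Example (illustrative; the actual IDs come from THOMAS): after calling
# fetch_committee_names(112, {}), full committees are keyed by chamber + name,
# e.g. committee_names["House Judiciary"] might be "HSJU", and subcommittees by
# parent ID + "|" + name, e.g. committee_names["HSJU|Crime"] might be "08".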