Files
congress/tasks/votes.py

141 lines
4.7 KiB
Python

import utils, json, iso8601, datetime
import os, os.path
import re, urlparse
import time, datetime
from lxml import html, etree
import logging
import vote_info
def run(options):
    """Work out which votes to download and hand them to the vote fetcher.

    Option keys read from ``options``:
      vote_id    -- fetch exactly this vote; congress and session are taken
                    from utils.split_vote_id(vote_id).
      congress   -- congress to scan; when given, ``session`` is required.
      session    -- session year; defaults to the current calendar year when
                    ``congress`` is not given.
      chamber    -- "house" or "senate"; any other value scans both.
      fast       -- only changes which message is logged when nothing is
                    found here (it is also passed on to the id listers).
      limit      -- keep only the first ``int(limit)`` ids found by scanning.
      pages_only -- stop after the index pages have been processed.

    Always returns None; the selected ids are passed to utils.process_set
    together with vote_info.fetch_vote.
    """
    vote_id = options.get('vote_id', None)

    if vote_id:
        # Only congress and session are needed from the parsed id.
        _chamber, _number, congress, session_year = utils.split_vote_id(vote_id)
        to_fetch = [vote_id]
    else:
        congress = options.get('congress', None)
        if congress:
            session_year = options.get('session', None)
            if not session_year:
                logging.error("If you provide a --congress, provide a --session year.")
                return None
        else:
            congress = utils.current_congress()
            session_year = options.get('session', str(datetime.datetime.now().year))

        chamber = options.get('chamber', None)

        if chamber == "house":
            to_fetch = vote_ids_for_house(congress, session_year, options)
        elif chamber == "senate":
            to_fetch = vote_ids_for_senate(congress, session_year, options)
        else:
            # Either lister returns None when its index download fails;
            # treat that as "no ids" so the concatenation cannot raise
            # TypeError and the other chamber's votes are still used.
            house_ids = vote_ids_for_house(congress, session_year, options) or []
            senate_ids = vote_ids_for_senate(congress, session_year, options) or []
            to_fetch = house_ids + senate_ids

        if not to_fetch:
            if not options.get("fast", False):
                logging.error("Error figuring out which votes to download, aborting.")
            else:
                logging.warning("No new or recent votes.")
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    if options.get('pages_only', False):
        return None

    logging.warning(
        "Going to fetch %i votes from congress #%s session %s",
        len(to_fetch), congress, session_year)

    utils.process_set(to_fetch, vote_info.fetch_vote, options)
# page through listing of House votes of a particular congress and session
def vote_ids_for_house(congress, session_year, options):
    """List the ids of House roll call votes for one session year.

    Downloads the clerk.house.gov index page for ``session_year``, follows
    every link on it whose href matches ``ROLL_<digits>.asp`` and collects
    the roll numbers from the vote links found on those pages. Each page is
    fetched through utils.download with a cache path under
    ``<congress>/votes/<session_year>/pages/``.

    Returns the ids, formatted ``h<roll>-<congress>.<session_year>`` and
    filtered through should_process, after passing them through utils.uniq.
    Returns None if the index page cannot be downloaded; a group page that
    cannot be downloaded is logged and skipped.
    """
    index_page = "http://clerk.house.gov/evs/%s/index.asp" % session_year
    group_page = r"ROLL_(\d+)\.asp"
    link_pattern = r"http://clerk.house.gov/cgi-bin/vote.asp\?year=%s&rollnumber=(\d+)" % session_year
    exslt_ns = {"re": "http://exslt.org/regular-expressions"}

    # download index page, find the matching links to the paged listing of votes
    page = utils.download(
        index_page,
        "%s/votes/%s/pages/house.html" % (congress, session_year),
        options)

    if not page:
        logging.error("Couldn't download House vote index page, aborting")
        return None

    vote_ids = []

    # extract matching links
    doc = html.document_fromstring(page)
    links = doc.xpath(
        "//a[re:match(@href, '%s')]" % group_page,
        namespaces=exslt_ns)

    for link in links:
        href = link.get("href")

        # get some identifier for this inside page for caching.
        # The XPath re:match() above is not anchored to the start of the
        # href, so use re.search here: re.match would return None for an
        # href that has anything in front of "ROLL_".
        grp = re.search(group_page, href).group(1)

        # download inside page, find the matching links
        page = utils.download(
            urlparse.urljoin(index_page, href),
            "%s/votes/%s/pages/house_%s.html" % (congress, session_year, grp),
            options)

        if not page:
            # Despite the wording, only this group page is skipped.
            logging.error("Couldn't download House vote group page (%s), aborting" % grp)
            continue

        doc = html.document_fromstring(page)
        votelinks = doc.xpath(
            "//a[re:match(@href, '%s')]" % link_pattern,
            namespaces=exslt_ns)

        for votelink in votelinks:
            # Same unanchored-match reasoning as for the group links.
            num = re.search(link_pattern, votelink.get("href")).group(1)
            vote_id = "h" + num + "-" + str(congress) + "." + str(session_year)
            if not should_process(vote_id, options):
                continue
            vote_ids.append(vote_id)

    return utils.uniq(vote_ids)
def vote_ids_for_senate(congress, session_year, options):
    """List the ids of Senate roll call votes for one session year.

    Downloads the senate.gov ``vote_menu`` XML index (through
    utils.download, with the ``xml`` option switched on) and reads the
    ``vote_number`` child of every ``vote`` element in it.

    Returns a list of ids formatted ``s<number>-<congress>.<session_year>``
    for the votes that should_process accepts, in document order, or None
    if the index cannot be downloaded.
    """
    # The index URL is keyed by a session number: the offset of session_year
    # from utils.get_congress_first_year(congress), counted from 1.
    year = int(session_year)
    first_year = utils.get_congress_first_year(int(congress))
    session_num = year - first_year + 1

    index_url = "http://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_%s_%d.xml" % (congress, session_num)
    cache_path = "%s/votes/%s/pages/senate.xml" % (congress, session_year)
    xml_options = utils.merge(options, {'xml': True})

    body = utils.download(index_url, cache_path, xml_options)
    if not body:
        logging.error("Couldn't download Senate vote XML index, aborting")
        return None

    found = []
    for node in etree.fromstring(body).xpath("//vote"):
        # int() round-trip normalises the number text (e.g. drops zero padding).
        number = int(node.xpath("vote_number")[0].text)
        candidate = "".join(["s", str(number), "-", str(congress), ".", session_year])
        if should_process(candidate, options):
            found.append(candidate)

    return found
def should_process(vote_id, options):
    """Decide whether a vote should be (re)downloaded.

    Without the ``fast`` option every vote is processed. With it, a vote is
    processed only when its JSON output file (located through
    vote_info.output_for_vote) does not exist yet, or when the "date" stored
    in that file is less than three days old.

    Returns True to process the vote, False to skip it.
    """
    if not options.get("fast", False):
        return True

    # If --fast is used, only download new votes or votes taken in the last
    # three days (when most vote changes and corrections should occur).
    path = vote_info.output_for_vote(vote_id, "json")
    if not os.path.exists(path):
        return True

    # Close the file deterministically; this runs once per vote id.
    with open(path) as fh:
        vote = json.load(fh)

    # NOTE(review): this labels the machine's local wall-clock time as
    # Eastern, which presumably assumes the host clock is set to Eastern
    # time -- confirm before relying on the exact cut-off.
    now = utils.eastern_time_zone.localize(datetime.datetime.now())
    age = now - iso8601.parse_date(vote["date"])
    return age < datetime.timedelta(days=3)