mirror of
https://github.com/unitedstates/congress.git
synced 2025-12-19 17:16:58 -05:00
103 lines
2.8 KiB
Python
103 lines
2.8 KiB
Python
import utils
|
|
import os
|
|
import os.path
|
|
import re
|
|
from lxml import html, etree
|
|
import logging
|
|
|
|
import nomination_info
|
|
|
|
|
|
def run(options):
|
|
nomination_id = options.get('nomination_id', None)
|
|
|
|
if nomination_id:
|
|
nomination_type, number, congress = utils.split_nomination_id(nomination_id)
|
|
to_fetch = [nomination_id]
|
|
else:
|
|
congress = options.get('congress', utils.current_congress())
|
|
to_fetch = nomination_ids_for(congress, options)
|
|
if not to_fetch:
|
|
if options.get("fast", False):
|
|
logging.warn("No nominations changed.")
|
|
else:
|
|
logging.error("Error figuring out which nominations to download, aborting.")
|
|
return None
|
|
|
|
limit = options.get('limit', None)
|
|
if limit:
|
|
to_fetch = to_fetch[:int(limit)]
|
|
|
|
logging.warn("Going to fetch %i nominations from congress #%s" % (len(to_fetch), congress))
|
|
|
|
saved_nominations = utils.process_set(to_fetch, nomination_info.fetch_nomination, options)
|
|
|
|
# page through listings for bills of a particular congress
|
|
|
|
|
|
def nomination_ids_for(congress, options={}):
|
|
nomination_ids = []
|
|
|
|
page = page_for(congress, options)
|
|
if not page:
|
|
logging.error("Couldn't download page for %d congress" % congress)
|
|
return None
|
|
|
|
# extract matching links
|
|
doc = html.document_fromstring(page)
|
|
raw_nomination_ids = doc.xpath('//div[@id="content"]/p[2]/a/text()')
|
|
nomination_ids = []
|
|
|
|
for raw_id in raw_nomination_ids:
|
|
pieces = raw_id.split(' ')
|
|
|
|
# ignore these
|
|
if raw_id in ["PDF", "Text", "split into two or more parts"]:
|
|
pass
|
|
elif len(pieces) < 2:
|
|
logging.error("Bad nomination ID detected: %s" % raw_id)
|
|
return None
|
|
else:
|
|
nomination_ids.append(pieces[1])
|
|
|
|
return utils.uniq(nomination_ids)
|
|
|
|
|
|
def page_cache_for(congress):
|
|
return "%s/nominations/pages/search.html" % congress
|
|
|
|
# unlike bills.py, we're going to fetch the page instead of producing the URL,
|
|
# since a POST is required.
|
|
|
|
|
|
def page_for(congress, options):
|
|
congress = int(congress)
|
|
postdata = {
|
|
"database": "nominations",
|
|
"MaxDocs": '5000',
|
|
"submit": "SEARCH",
|
|
"querytype": "phrase",
|
|
"query": "",
|
|
"Stemming": "No",
|
|
"congress": "%d" % congress,
|
|
"CIVcategory": "on",
|
|
"LSTcategory": "on",
|
|
"committee": "",
|
|
"LBDateSel": "FLD606",
|
|
"EBSDate": "",
|
|
"EBEDate": "",
|
|
"sort": "sh_docid_rc",
|
|
}
|
|
|
|
post_options = {'postdata': postdata}
|
|
post_options.update(options)
|
|
|
|
# unused: never cache search listing
|
|
cache = page_cache_for(congress)
|
|
|
|
page = utils.download("http://thomas.loc.gov/cgi-bin/thomas",
|
|
None,
|
|
post_options
|
|
)
|
|
return page
|