# congress/tasks/votes.py

import json
import iso8601
import os
import os.path
import re
import urlparse
import time
import datetime
from lxml import html, etree
import logging

from tasks import Task, current_congress, uniq, get_congress_first_year, merge, format_datetime, make_node


class Votes(Task):

    def __init__(self, options, config):
        """
        Create the task; no Votes-specific state is set up here.

        Both arguments are handed unchanged to the ``Task`` base-class
        initializer.

        @param options: task options; other methods of this class read it
            through ``self.options`` using dict-style ``.get()`` lookups
        @param config: passed straight through to ``Task.__init__``
        """
        super(Votes, self).__init__(options, config)

    def run(self):
        vote_id = self.options.get('vote_id', None)

        if vote_id:
            vote_chamber, vote_number, congress, session_year = self.split_vote_id(vote_id)
            to_fetch = [vote_id]
        else:
            congress = self.options.get('congress', None)
            if congress:
                session_year = self.options.get('session', None)
                if not session_year:
                    logging.error("If you provide a --congress, provide a --session year.")
                    return None
            else:
                congress = current_congress()
                session_year = self.options.get('session', str(datetime.datetime.now().year))

            chamber = self.options.get('chamber', None)

            if chamber == "house":
                to_fetch = self.vote_ids_for_house(congress, session_year)
            elif chamber == "senate":
                to_fetch = self.vote_ids_for_senate(congress, session_year)
            else:
                to_fetch = (self.vote_ids_for_house(congress, session_year) or []) + (self.vote_ids_for_senate(congress, session_year) or [])

            if not to_fetch:
                if not self.options.get("fast", False):
                    logging.error("Error figuring out which votes to download, aborting.")
                else:
                    logging.warn("No new or recent votes.")
                return None

            limit = self.options.get('limit', None)
            if limit:
                to_fetch = to_fetch[:int(limit)]

        if self.options.get('pages_only', False):
            return None

        logging.warn("Going to fetch %i votes from congress #%s session %s" % (len(to_fetch), congress, session_year))

        self.process_set(to_fetch, self.fetch_vote)

    def vote_ids_for_house(self, congress, session_year):
        """
        Page through the listing of House votes of a particular congress and session.

        Downloads the Clerk's index page for the year, follows every
        ``ROLL_nnn.asp`` group page linked from it, and builds a vote ID of the
        form ``h<number>-<congress>.<session_year>`` for each roll-call link
        that passes ``self.should_process``.

        @param congress: congress number; used only in download destination paths and vote IDs
        @type congress: int or str
        @param session_year: session year as it appears in the Clerk's URLs
        @type session_year: str
        @return: the collected vote IDs passed through ``uniq``, or None if the
            index page could not be downloaded. A group page that fails to
            download is logged and skipped.
        @rtype: list or None
        """
        index_page = "http://clerk.house.gov/evs/%s/index.asp" % session_year
        group_page = r"ROLL_(\d+)\.asp"
        link_pattern = r"http://clerk.house.gov/cgi-bin/vote.asp\?year=%s&rollnumber=(\d+)" % session_year
        exslt_re = {"re": "http://exslt.org/regular-expressions"}

        # download index page, find the matching links to the paged listing of votes
        page = self.download(
            index_page,
            "%s/votes/%s/pages/house.html" % (congress, session_year),
            self.options)

        if not page:
            logging.error("Couldn't download House vote index page, aborting")
            return None

        vote_ids = []

        # extract matching links
        doc = html.document_fromstring(page)
        links = doc.xpath("//a[re:match(@href, '%s')]" % group_page, namespaces=exslt_re)

        for link in links:
            href = link.get("href")

            # Get some identifier for this inside page for caching.
            # re.search rather than re.match: the XPath filter above matches the
            # pattern anywhere in the href, so an href with a leading path would
            # be selected but then fail an anchored match.
            grp = re.search(group_page, href).group(1)

            # download inside page, find the matching links
            page = self.download(
                urlparse.urljoin(index_page, href),
                "%s/votes/%s/pages/house_%s.html" % (congress, session_year, grp),
                self.options)

            if not page:
                logging.error("Couldn't download House vote group page (%s), aborting" % grp)
                continue

            doc = html.document_fromstring(page)
            votelinks = doc.xpath("//a[re:match(@href, '%s')]" % link_pattern, namespaces=exslt_re)

            for votelink in votelinks:
                # Same unanchored-vs-anchored consideration as for group pages.
                num = re.search(link_pattern, votelink.get("href")).group(1)
                vote_id = "h" + num + "-" + str(congress) + "." + session_year
                if not self.should_process(vote_id):
                    continue
                vote_ids.append(vote_id)

        return uniq(vote_ids)

    def vote_ids_for_senate(self, congress, session_year):
        """
        List the Senate votes of a particular congress and session.

        Downloads the Senate's XML vote menu, checks that it reports the
        requested congress and year, and builds a vote ID of the form
        ``s<number>-<congress>.<session_year>`` for each ``vote`` element that
        passes ``self.should_process``.

        @param congress: congress number
        @type congress: int or str
        @param session_year: four-digit session year
        @type session_year: str
        @return: vote IDs in document order, or None if the menu could not be
            downloaded or reports a different congress/year
        @rtype: list or None
        """
        # The Senate numbers sessions within a congress (1, 2, ...) instead of by year.
        year = int(session_year)
        session_num = year - get_congress_first_year(int(congress)) + 1

        menu_url = "http://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_%s_%d.xml" % (congress, session_num)
        menu_dest = "%s/votes/%s/pages/senate.xml" % (congress, session_year)
        page = self.download(menu_url, menu_dest, merge(self.options, {'binary': True}))

        if not page:
            logging.error("Couldn't download Senate vote XML index, aborting")
            return None

        dom = etree.fromstring(page)

        # Sanity checks.
        reported_congress = dom.xpath("congress")[0].text
        if int(congress) != int(reported_congress):
            logging.error("Senate vote XML returns the wrong Congress: %s" % reported_congress)
            return None
        reported_year = dom.xpath("congress_year")[0].text
        if int(session_year) != int(reported_year):
            logging.error("Senate vote XML returns the wrong session: %s" % reported_year)
            return None

        # Get vote list.
        collected = []
        for node in dom.xpath("//vote"):
            number = int(node.xpath("vote_number")[0].text)
            candidate = "s" + str(number) + "-" + str(congress) + "." + session_year
            if self.should_process(candidate):
                collected.append(candidate)
        return collected

    def output_for_vote(self, vote_id, format):
        """

        @param vote_id:
        @type vote_id:
        @param format:
        @type format:
        @return:
        @rtype:
        """
        vote_chamber, vote_number, vote_congress, vote_session_year = self.split_vote_id(vote_id)
        return "%s/%s/votes/%s/%s%s/%s" % (self.storage.data_dir, vote_congress, vote_session_year, vote_chamber, vote_number, "data.%s" % format)

    def should_process(self, vote_id):
        if not self.options.get("fast", False):
            return True

        # If --fast is used, only download new votes or votes taken in the last
        # three days (when most vote changes and corrections should occur).
        f = self.output_for_vote(vote_id, "json")
        if not os.path.exists(f):
            return True

        v = json.load(open(f))
        now = self.EASTERN_TIME_ZONE.localize(datetime.datetime.now())
        return (now - iso8601.parse_date(v["date"])) < datetime.timedelta(days=3)

    def fetch_vote(self, vote_id):
        """
        Download one roll-call vote, parse it and write the output files.

        @param vote_id: vote ID accepted by ``split_vote_id``
        @type vote_id: str
        @return: a status dict with boolean ``ok`` and ``saved`` keys, plus a
            ``reason`` string whenever nothing was saved (download failure,
            the ``download_only`` option, or a vacated vote)
        @rtype: dict
        """
        logging.info("\n[%s] Fetching..." % vote_id)

        chamber, number, congress, session_year = self.split_vote_id(vote_id)

        if chamber != "h":
            # Senate URLs use the session number within the congress, not the year.
            session_num = int(session_year) - get_congress_first_year(int(congress)) + 1
            url = "http://www.senate.gov/legislative/LIS/roll_call_votes/vote%d%d/vote_%d_%d_%05d.xml" % (int(congress), session_num, int(congress), session_num, int(number))
        else:
            url = "http://clerk.house.gov/evs/%s/roll%03d.xml" % (session_year, int(number))

        # fetch vote XML page
        slug = chamber + number
        body = self.download(
            url,
            "%s/votes/%s/%s/%s.xml" % (congress, session_year, slug, slug),
            merge(self.options, {'binary': True}),
        )

        if not body:
            return {'saved': False, 'ok': False, 'reason': "failed to download"}

        if self.options.get("download_only", False):
            return {'saved': False, 'ok': True, 'reason': "requested download only"}

        if "This vote was vacated" in body:
            # Vacated votes: 2011-484, 2012-327, ...
            # Remove file, since it may previously have existed with data.
            for extension in ("json", "xml"):
                stale = self.output_for_vote(vote_id, extension)
                if self.storage.exists(stale):
                    self.storage.remove(stale)
            return {'saved': False, 'ok': True, 'reason': "vote was vacated"}

        dom = etree.fromstring(body)

        # Fields common to both chambers; the chamber-specific parser adds the rest.
        vote = {
            'vote_id': vote_id,
            'chamber': chamber,
            'congress': int(congress),
            'session': session_year,
            'number': int(number),
            'updated_at': datetime.datetime.fromtimestamp(time.time()),
            'source_url': url,
        }

        # do the heavy lifting
        if chamber == "h":
            self.parse_house_vote(dom, vote)
        elif chamber == "s":
            self.parse_senate_vote(dom, vote)

        # output and return
        self.output_vote(vote)

        return {'ok': True, 'saved': True}

    def parse_house_vote(self, dom, vote):
        """
        Fill ``vote`` in place from a parsed House roll-call XML document.

        Sets the keys ``date``, ``question``, ``type``, ``category``,
        ``requires``, ``result_text``, ``result`` and ``votes``, plus
        ``subject``, ``bill`` and ``amendment`` when the document provides them.
        Voters whose bioguide ID is missing ("" or "0000000") are resolved
        through ``self.lookup_legislator``.

        @param dom: root element of the House vote XML; must support ``xpath()``
        @param vote: vote dict to update; ``congress`` and ``vote_id`` are read from it
        @type vote: dict
        @return: None; ``vote`` is mutated
        @raise Exception: if the legis-num field cannot be split into a bill
            type and number, or if a missing bioguide ID cannot be looked up
        """
        def meta(field):
            # Text of vote-metadata/<field>; the XPath string() function yields "" for a missing node.
            return dom.xpath("string(vote-metadata/%s)" % field)

        def parse_date(d):
            d = d.strip()
            if " " in d:
                return datetime.datetime.strptime(d, "%d-%b-%Y %I:%M %p")
            # Some votes have no times; note it in the log rather than dumping
            # the whole vote dict to stdout.
            logging.warn("[%s] Vote has a date but no time: %s" % (vote.get("vote_id"), d))
            return datetime.datetime.strptime(d, "%d-%b-%Y")

        vote["date"] = parse_date(str(meta("action-date")) + " " + str(meta("action-time")))

        # These fields are consulted several times below; evaluate each query once.
        raw_question = unicode(meta("vote-question"))
        description = unicode(meta("vote-desc"))
        bill_num = unicode(meta("legis-num"))

        vote["question"] = raw_question
        vote["type"] = self.normalize_vote_type(raw_question)
        if description.startswith("Impeaching "):
            vote["category"] = "impeachment"
        else:
            vote["category"] = self.get_vote_category(vote["question"])
        vote["subject"] = description
        if not vote["subject"]:
            del vote["subject"]

        vote_types = {"YEA-AND-NAY": "1/2", "2/3 YEA-AND-NAY": "2/3", "3/5 YEA-AND-NAY": "3/5", "1/2": "1/2", "2/3": "2/3", "QUORUM": "QUORUM", "RECORDED VOTE": "1/2", "2/3 RECORDED VOTE": "2/3", "3/5 RECORDED VOTE": "3/5"}
        vote["requires"] = vote_types.get(str(meta("vote-type")), "unknown")

        vote["result_text"] = unicode(meta("vote-result"))
        vote["result"] = unicode(meta("vote-result"))

        if bill_num not in ("", "QUORUM", "JOURNAL", "MOTION", "ADJOURN") and not re.match(r"QUORUM \d+$", bill_num):
            bill_types = {"S": "s", "S CON RES": "sconres", "S J RES": "sjres", "S RES": "sres", "H R": "hr", "H CON RES": "hconres", "H J RES": "hjres", "H RES": "hres"}
            try:
                bill_type, bill_number = bill_num.rsplit(" ", 1)
                vote["bill"] = {
                    "congress": vote["congress"],
                    "type": bill_types[bill_type],
                    "number": int(bill_number)
                }
            except ValueError:  # rsplit failed, i.e. there is no space in the legis-num field
                raise Exception("Unhandled bill number in the legis-num field")

        amendment_num = str(meta("amendment-num"))
        if amendment_num:
            vote["amendment"] = {
                "type": "h-bill",
                "number": int(amendment_num),
                "author": unicode(meta("amendment-author")),
            }

        # Assemble a complete question from the vote type, amendment, and bill number.
        if "amendment" in vote and "bill" in vote:
            vote["question"] += ": Amendment %s to %s" % (vote["amendment"]["number"], bill_num)
        elif "amendment" in vote:
            vote["question"] += ": Amendment %s to [unknown bill]" % vote["amendment"]["number"]
        elif "bill" in vote:
            vote["question"] += ": " + bill_num
            if "subject" in vote:
                vote["question"] += " " + vote["subject"]
        elif "subject" in vote:
            vote["question"] += ": " + vote["subject"]

        # Count up the votes.
        vote["votes"] = {}  # by vote type

        def add_vote(vote_option, voter):
            vote["votes"].setdefault(vote_option, []).append(voter)

        # Ensure the options are noted, even if no one votes that way.
        if raw_question in ("Election of the Speaker", "Call of the House"):
            # For these questions the options are listed as candidates in the totals.
            for n in dom.xpath('vote-metadata/vote-totals/totals-by-candidate/candidate'):
                vote["votes"][n.text] = []
        elif "YEA-AND-NAY" in meta("vote-type"):
            vote["votes"]['Yea'] = []
            vote["votes"]['Nay'] = []
            vote["votes"]['Present'] = []
            vote["votes"]['Not Voting'] = []
        else:
            vote["votes"]['Aye'] = []
            vote["votes"]['No'] = []
            vote["votes"]['Present'] = []
            vote["votes"]['Not Voting'] = []

        for member in dom.xpath("vote-data/recorded-vote"):
            display_name = unicode(member.xpath("string(legislator)"))
            state = str(member.xpath("string(legislator/@state)"))
            party = str(member.xpath("string(legislator/@party)"))
            vote_cast = str(member.xpath("string(vote)"))
            bioguideid = str(member.xpath("string(legislator/@name-id)"))
            add_vote(vote_cast, {
                "id": bioguideid,
                "state": state,
                "party": party,
                "display_name": display_name,
            })

        # Through the 107th Congress and sporadically in more recent data, the bioguide field
        # is not present. Look up the Members' bioguide IDs by name/state/party/date. This works
        # reasonably well, but there are many gaps. When there's a gap, it raises an exception
        # and the file is not saved.
        #
        # Take into account that the vote may list both a "Smith" and a "Smith, John". Resolve
        # "Smith" by process of elimination, i.e. he must not be whoever "Smith, John" resolved
        # to. To do that, process the voters from longest specified display name to shortest.
        #
        # One example of a sporadic case is 108th Congress, 2nd session (2004), votes 405 through
        # 544, where G.K. Butterfield's bioguide ID is 000000. It should have been B001251.
        # See https://github.com/unitedstates/congress/issues/46.

        seen_ids = set()
        all_voters = [voter for voters in vote["votes"].values() for voter in voters]
        all_voters.sort(key=lambda v: len(v["display_name"]), reverse=True)  # process longer names first
        for v in all_voters:
            if v["id"] not in ("", "0000000"):
                continue

            # here are weird cases from h610-103.1993 that confound our name lookup since it has the wrong state abbr
            if v["state"] == "XX":
                for st in ("PR", "AS", "GU", "VI", "DC"):
                    if v["display_name"].endswith(" (%s)" % st):
                        v["state"] = st

            # get the last name without the state abbreviation in parentheses, if it is present
            display_name = v["display_name"].strip()
            ss = " (%s)" % v["state"]
            if display_name.endswith(ss):
                display_name = display_name[:-len(ss)].strip()

            # wrong party in upstream data
            if vote["vote_id"] == "h2-106.1999" and display_name == "Hastert":
                v["id"] = "H000323"
                continue

            # dead man recorded as Not Voting (he died the day before, so none of our roles match the vote date)
            if vote["vote_id"] == "h306-106.1999" and display_name == "Brown" and v["state"] == "CA":
                v["id"] = "B000918"
                continue

            # look up ID
            v["id"] = self.lookup_legislator(vote["congress"], "rep", display_name, v["state"], v["party"], vote["date"], "bioguide", exclude=seen_ids)

            if v["id"] is None:
                logging.error("[%s] Missing bioguide ID and name lookup failed for %s (%s-%s on %s)" % (vote["vote_id"], display_name, v["state"], v["party"], vote["date"]))
                raise Exception("No bioguide ID for %s (%s-%s)" % (display_name, v["state"], v["party"]))
            else:
                if vote["congress"] > 107:
                    logging.warn("[%s] Used name lookup for %s because bioguide ID was missing." % (vote["vote_id"], v["display_name"]))
                seen_ids.add(v["id"])

    def parse_senate_vote(self, dom, vote):
        """
        Fill ``vote`` in place from a parsed Senate roll-call XML document.

        Sets the keys ``date``, ``question``, ``type``, ``category``,
        ``subject``, ``requires``, ``result_text``, ``result`` and ``votes``,
        plus ``record_modified``, ``nomination``, ``treaty``, ``bill`` and
        ``amendment`` when the document provides them. A tie-breaking Vice
        President is recorded in ``votes`` as the plain string "VP"; every
        other voter is a dict.

        @param dom: root element of the Senate vote XML; must support ``xpath()``
        @param vote: vote dict to update; ``congress`` and ``vote_id`` are read from it
        @type vote: dict
        @return: None; ``vote`` is mutated
        @raise Exception: if a member without a lis_member_id cannot be looked up
        """
        def text(path):
            # Unicode text of the node at `path`; the XPath string() function yields "" for a missing node.
            return unicode(dom.xpath("string(%s)" % path))

        def parse_date(d):
            return datetime.datetime.strptime(d, "%B %d, %Y, %I:%M %p")

        vote["date"] = parse_date(dom.xpath("string(vote_date)"))
        if len(dom.xpath("modify_date")) > 0:
            vote["record_modified"] = parse_date(dom.xpath("string(modify_date)"))  # some votes like s1-110.2008 don't have a modify_date
        vote["question"] = text("vote_question_text")
        if vote["question"] == "":
            vote["question"] = text("question")  # historical votes?
        vote["type"] = text("vote_question")
        if vote["type"] == "":
            vote["type"] = vote["question"]
        vote["type"] = self.normalize_vote_type(vote["type"])
        vote["category"] = self.get_vote_category(vote["type"])
        vote["subject"] = text("vote_title")
        vote["requires"] = text("majority_requirement")
        vote["result_text"] = text("vote_result_text")
        vote["result"] = text("vote_result")

        bill_types = {"S.": "s", "S.Con.Res.": "sconres", "S.J.Res.": "sjres", "S.Res.": "sres", "H.R.": "hr", "H.Con.Res.": "hconres", "H.J.Res.": "hjres", "H.Res.": "hres"}

        # The vote may concern a nomination, a treaty, or (anything else) a bill.
        document_type = text("document/document_type")
        if document_type:
            if document_type == "PN":
                vote["nomination"] = {
                    "number": text("document/document_number"),
                    "title": text("document/document_title"),
                }
                vote["question"] += ": " + vote["nomination"]["title"]
            elif document_type == "Treaty Doc.":
                vote["treaty"] = {
                    "title": text("document/document_title"),
                }
            else:
                vote["bill"] = {
                    "congress": int(dom.xpath("number(document/document_congress|congress)")),  # some historical files don't have document/document_congress so take the first of document/document_congress or the top-level congress element as a fall-back
                    "type": bill_types[document_type],
                    "number": int(dom.xpath("number(document/document_number)")),
                    "title": text("document/document_title"),
                }

        amendment_number = text("amendment/amendment_number")
        if amendment_number:
            m = re.match(r"^S.Amdt. (\d+)", amendment_number)
            if m:
                vote["amendment"] = {
                    "type": "s",
                    "number": int(m.group(1)),
                    "purpose": text("amendment/amendment_purpose"),
                }

            # What the amendment amends: a treaty ("...Treaty...-<number>") or a bill ("<type> <number>").
            amendment_to = text("amendment/amendment_to_document_number")
            if "Treaty" in amendment_to:
                _, number = amendment_to.split("-")
                vote["treaty"] = {
                    "congress": vote["congress"],
                    "number": number,
                }
            elif " " in amendment_to:
                bill_type, bill_number = amendment_to.split(" ")
                vote["bill"] = {
                    "congress": vote["congress"],
                    "type": bill_types[bill_type],
                    "number": int(bill_number),
                    "title": text("amendment/amendment_to_document_short_title"),
                }
            else:
                # Senate votes:
                # 102nd Congress, 2nd session (1992): 247, 248, 250; 105th Congress, 2nd session (1998): 106 through 116; 108th Congress, 1st session (2003): 41, 42
                logging.warn("Amendment without corresponding bill info in %s " % vote["vote_id"])

        # Count up the votes.
        vote["votes"] = {}

        def add_vote(vote_option, voter):
            if vote_option == "Present, Giving Live Pair":
                vote_option = "Present"
            vote["votes"].setdefault(vote_option, []).append(voter)

            # In the 101st Congress, 1st session (1989), votes 133 through 136 lack lis_member_id nodes.
            # The voter dict was appended above by reference, so filling in its ID here updates the stored entry.
            if voter != "VP" and voter["id"] == "":
                voter["id"] = self.lookup_legislator(vote["congress"], "sen", voter["last_name"], voter["state"], voter["party"], vote["date"], "lis")
                if voter["id"] is None:
                    logging.error("[%s] Missing lis_member_id and name lookup failed for %s" % (vote["vote_id"], voter["last_name"]))
                    raise Exception("Could not find ID for %s (%s-%s)" % (voter["last_name"], voter["state"], voter["party"]))
                else:
                    logging.info("[%s] Missing lis_member_id, falling back to name lookup for %s" % (vote["vote_id"], voter["last_name"]))

        # Ensure the options are noted, even if no one votes that way.
        if text("question") == "Guilty or Not Guilty":
            vote["votes"]['Guilty'] = []
            vote["votes"]['Not Guilty'] = []
        else:
            vote["votes"]['Yea'] = []
            vote["votes"]['Nay'] = []
        vote["votes"]['Present'] = []
        vote["votes"]['Not Voting'] = []

        # VP tie-breaker?
        if str(dom.xpath("string(tie_breaker/by_whom)")):
            add_vote(str(dom.xpath("string(tie_breaker/tie_breaker_vote)")), "VP")

        for member in dom.xpath("members/member"):
            add_vote(str(member.xpath("string(vote_cast)")), {
                "id": str(member.xpath("string(lis_member_id)")),
                "state": str(member.xpath("string(state)")),
                "party": str(member.xpath("string(party)")),
                "display_name": unicode(member.xpath("string(member_full)")),
                "first_name": str(member.xpath("string(first_name)")),
                "last_name": str(member.xpath("string(last_name)")),
            })

    def output_vote(self, vote, id_type=None):
        """
        Write a parsed vote to storage as both JSON and XML.

        @param vote: fully populated vote dict; ``vote_id`` selects the output paths
        @type vote: dict
        @param id_type: forwarded unchanged to ``generate_xml``
        @return: None
        """
        vote_id = vote['vote_id']
        logging.info("[%s] Writing to disk..." % vote_id)

        # output JSON - so easy!
        as_json = json.dumps(vote, sort_keys=True, indent=2, default=format_datetime)
        self.storage.write(as_json, self.output_for_vote(vote_id, "json"), options=self.options)

        # then the XML rendering of the same data
        as_xml = self.generate_xml(vote, id_type)
        self.storage.write(as_xml, self.output_for_vote(vote_id, "xml"), options=self.options)

    def generate_xml(self, vote, id_type=None):
        """
        Render a vote dict as a ``<roll>`` XML document.

        The root element carries the vote's metadata and tallies as attributes,
        followed by category/type/question/required/result nodes, optional bill
        and amendment nodes, one ``option`` node per vote option and one
        ``voter`` node per recorded vote.

        @param vote: fully populated vote dict
        @type vote: dict
        @param id_type: kind of member ID stored in the vote; when empty it is
            inferred from the chamber. Only the currently disabled GovTrack ID
            path would use it.
        @return: the serialized document, with two hard line breaks inserted
            in the root tag after the ``source`` and ``updated`` attributes
        @rtype: str
        """
        from_house = vote["chamber"] == "h"

        # What kind of IDs are we passed for Members of Congress?
        # For current data, we infer from the chamber. For historical data from voteview,
        # we're passed the type in id_type, which is set to "bioguide".
        if not id_type:
            id_type = "bioguide" if from_house else "lis"

        def tally(*names):
            # Combined number of voters recorded under any of the given options.
            return str(sum(len(vote["votes"].get(name, [])) for name in names))

        # output XML
        root = etree.Element("roll")

        root.set("where", "house" if from_house else "senate")
        root.set("session", str(vote["congress"]))
        root.set("year", str(vote["date"].year))
        root.set("roll", str(vote["number"]))
        if "voteview" in vote["source_url"]:
            source = "keithpoole"
        elif from_house:
            source = "house.gov"
        else:
            source = "senate.gov"
        root.set("source", source)

        root.set("datetime", format_datetime(vote['date']))
        root.set("updated", format_datetime(vote['updated_at']))

        root.set("aye", tally("Yea", "Aye"))
        root.set("nay", tally("Nay", "No"))
        root.set("nv", tally("Not Voting"))
        root.set("present", tally("Present"))

        for tag, field in (("category", "category"), ("type", "type"), ("question", "question"), ("required", "requires"), ("result", "result")):
            make_node(root, tag, vote[field])

        if vote.get("bill"):
            govtrack_type_codes = {'hr': 'h', 's': 's', 'hres': 'hr', 'sres': 'sr', 'hjres': 'hj', 'sjres': 'sj', 'hconres': 'hc', 'sconres': 'sc'}
            bill = vote["bill"]
            make_node(root, "bill", None, session=str(bill["congress"]), type=govtrack_type_codes[bill["type"]], number=str(bill["number"]))

        if "amendment" in vote:
            amendment_node = make_node(root, "amendment", None)
            amendment = vote["amendment"]
            if amendment["type"] == "s":
                amendment_node.set("ref", "regular")
                amendment_node.set("session", str(vote["congress"]))
                amendment_node.set("number", "s" + str(amendment["number"]))
            elif amendment["type"] == "h-bill":
                amendment_node.set("ref", "bill-serial")
                amendment_node.set("session", str(vote["congress"]))
                amendment_node.set("number", str(amendment["number"]))

        # well-known keys for certain vote types: +/-/P/0
        option_keys = {"Aye": "+", "Yea": "+", "Nay": "-", "No": "-", "Present": "P", "Not Voting": "0", "Guilty": "+", "Not Guilty": "-" }

        # preferred order of output: ayes, nays, present, then not voting, and similarly for guilty/not-guilty
        # and handling other options like people's names for votes for the Speaker.
        option_sort_order = ('Aye', 'Yea', 'Guilty', 'No', 'Nay', 'Not Guilty', 'OTHER', 'Present', 'Not Voting')
        other_rank = option_sort_order.index("OTHER")

        def rank(name):
            if name in option_sort_order:
                return option_sort_order.index(name)
            return other_rank

        options_list = sorted(vote["votes"].keys(), key=rank)
        for option in options_list:
            # Options without a well-known key (e.g. a candidate's name) act as their own key.
            option_keys.setdefault(option, option)
            make_node(root, "option", option, key=option_keys[option])

        use_govtrack_ids = self.options.get("govtrack", False)
        for option in options_list:
            for voter in vote["votes"][option]:
                is_vp = voter == "VP"
                node = make_node(root, "voter", None)
                if is_vp:
                    node.set("id", "0")
                    node.set("VP", "1")
                elif not use_govtrack_ids:
                    node.set("id", str(voter["id"]))
                # With the govtrack option no id attribute is written at all.
                # TODO: this is ridiculously complicated id creation schema, does anybody even use it?
                #node.set("id", str(utils.get_govtrack_person_id(id_type, voter["id"])))
                node.set("vote", option_keys[option])
                node.set("value", option)
                if not is_vp:
                    node.set("state", voter["state"])
                    if voter.get("voteview_votecode_extra") is not None:
                        node.set("voteview_votecode_extra", voter["voteview_votecode_extra"])

        xmloutput = etree.tostring(root, pretty_print=True, encoding="utf8")

        # mimic two hard line breaks in GovTrack's legacy output to ease running diffs
        for attribute_pattern in ('(source=".*?") ', '(updated=".*?") '):
            xmloutput = re.sub(attribute_pattern, r"\1\n  ", xmloutput)

        return xmloutput

    @staticmethod
    def normalize_vote_type(vote_type):
        """
        Takes the "type" field of a House or Senate vote and returns a
        normalized version of the same, as best as possible.

        Note that these allow .* after each pattern, so some things look like
        no-ops but they are really truncating the type after the specified text.

        @param vote_type:
        @type vote_type:
        @return:
        @rtype:
        """
        mapping = (
            (r"^On the Resolution of Ratification.*", "On the Resolution of Ratification"), # order matters so must go before other resolutions
            (r"On (Agreeing to )?the (Joint |Concurrent )?Resolution", "On the $2Resolution"),
            (r"On (Agreeing to )?the Conference Report", "On the Conference Report"),
            (r"On (Agreeing to )?the (En Bloc )?Amendments?", "On the Amendment"),
            (r"On (?:the )?Motion to Recommit", "On the Motion to Recommit"),
            (r"(On Motion to )?(Concur in|Concurring|On Concurring|Agree to|On Agreeing to) (the )?Senate (Amendment|amdt|Adt)s?", "Concurring in the Senate Amendment"),
            (r"(On Motion to )?Suspend (the )?Rules and (Agree|Concur|Pass)(, As Amended)", "On Motion to Suspend the Rules and $3$4"),
            (r"Will the House Now Consider the Resolution|On (Question of )?Consideration of the Resolution", "On Consideration of the Resolution"),
            (r"On (the )?Motion to Adjourn", "On the Motion to Adjourn"),
            (r"On (the )?Cloture Motion", "On the Cloture Motion"),
            (r"On Cloture on the Motion to Proceed", "On the Cloture Motion"),
            (r"On (the )?Nomination", "On the Nomination"),
            (r"On Passage( of the Bill|$)", "On Passage of the Bill"),
            (r"On (the )?Motion to Proceed", "On the Motion to Proceed"),
        )

        for regex, replacement in mapping:
            m = re.match(regex, vote_type, re.I)
            if m:
                if m.groups():
                    for i, val in enumerate(m.groups()):
                        replacement = replacement.replace("$%d" % (i + 1), val if val else "")
                return replacement

        return vote_type

    @staticmethod
    def get_vote_category(vote_question):
        """
        Takes the type/question field of a House or Senate vote and returns a normalized
        category for the vote type.

        Based on Eric's vote_type_for function in sunlightlabs/congress.

        @param vote_question:
        @type vote_question:
        @return:
        @rtype:
        """

        mapping = (
            # empty text (historical data)
            (r"^$", "unknown"),

            # common
            (r"^On Overriding the Veto", "veto-override"),
            (r"^On Presidential Veto", "veto-override"),
            (r"Objections of the President Not ?Withstanding", "veto-override"),  # order matters so must go before bill passage
            (r"^On Passage", "passage"),
            (r"^On the Resolution of Ratification.*", "treaty"), # order matters so must go before other resolutions
            (r"^On (Agreeing to )?the (Joint |Concurrent )?Resolution", "passage"),
            (r"^On (Agreeing to )?the Conference Report", "passage"),
            (r"^On (Agreeing to )?the (En Bloc )?Amendments?", "amendment"),

            # senate only
            (r"cloture", "cloture"),
            (r"^On the Nomination", "nomination"),
            (r"^Guilty or Not Guilty", "conviction"),  # was "impeachment" in sunlightlabs/congress but that's not quite right
            (r"^On (?:the )?Motion to Recommit", "recommit"),
            (r"^On the Motion \(Motion to Concur", "passage"),

            # house only
            (r"^(On Motion (to|that the House) )?(Concur in|Concurring|Concurring in|On Concurring|Agree to|On Agreeing to) (the )?Senate (Amendment|amdt|Adt)s?", "passage"),
            (r"^(On Motion to )?Suspend (the )?Rules and (Agree|Concur|Pass)", "passage-suspension"),
            (r"^Call of the House$", "quorum"),
            (r"^Call by States$", "quorum"),
            (r"^Election of the Speaker$", "leadership"),

            # various procedural things
            # order matters, so these must go last
            (r"^On Ordering the Previous Question", "procedural"),
            (r"^On Approving the Journal", "procedural"),
            (r"^Will the House Now Consider the Resolution|On (Question of )?Consideration of the Resolution", "procedural"),
            (r"^On (the )?Motion to Adjourn", "procedural"),
            (r"Authoriz(e|ing) Conferees", "procedural"),
            (r"On the Point of Order|Sustaining the Ruling of the Chair", "procedural"),
            (r"^On .*Motion ", "procedural"),  # $1 is a name like "Broun of Georgia"
            (r"^On the Decision of the Chair", "procedural"),
            (r"^Whether the Amendment is Germane", "procedural"),
        )

        for regex, category in mapping:
            if re.search(regex, vote_question, re.I):
                return category

        # unhandled
        logging.warn("Unhandled vote question: %s" % vote_question)
        return "unknown"

    @staticmethod
    def split_vote_id(vote_id):
        # Sessions are either four-digit years for modern day votes or a digit or letter
        # for historical votes before sessions were basically calendar years.
        return re.match("^(h|s)(\d+)-(\d+).(\d\d\d\d|[0-9A-Z])$", vote_id).groups()