make congress into a python package (#267)

change directory structure to make python package conventional
add setup.py file to specify deps

guide users to use the installed `usc-run` command
associated changes to other scripts
make scripts installable when package is installed

add a symlink for congress/run.py to run for backwards compat

remove redundant requirements file
This commit is contained in:
Akash Patel
2022-02-27 20:13:50 -05:00
committed by GitHub
parent 659b293b8e
commit c10772e3f3
31 changed files with 198 additions and 166 deletions

View File

@@ -5,12 +5,12 @@ python:
os:
- linux
install: pip install -r requirements.txt
install: pip install .
script: python test/run
after_success:
- pip install pyflakes
- pyflakes tasks/*.py | tee >(wc -l)
- pyflakes congress/tasks/*.py | tee >(wc -l)
- pyflakes test/*.py | tee >(wc -l)
notifications:

View File

@@ -40,7 +40,7 @@ RUN mkdir -p /opt/theunitedstates.io/
ADD . /opt/theunitedstates.io/congress/
WORKDIR /opt/theunitedstates.io/congress/
RUN pip install -r requirements.txt
RUN pip install .
RUN echo "/opt/theunitedstates.io/congress/" > /usr/lib/python3.6/dist-packages/congress.pth
@@ -48,4 +48,4 @@ RUN mkdir -p /congress
WORKDIR /congress
CMD []
ENTRYPOINT ["/opt/theunitedstates.io/congress/run"]
ENTRYPOINT ["/opt/theunitedstates.io/congress/congress/run.py"]

View File

@@ -43,17 +43,18 @@ It's recommended you use a `virtualenv` (virtual environment) for development. C
python3 -m venv congress
source congress/bin/activate
```
Finally, with your virtual environment activated, install Python packages:
Finally, with your virtual environment activated, install the package, which
will automatically pull in the Python dependencies:
```bash
pip3 install -r requirements.txt
pip install .
```
### Collecting the data
The general form to start the scraping process is:
./run <data-type> [--force] [other options]
usc-run <data-type> [--force] [other options]
where data-type is one of:
@@ -67,8 +68,8 @@ where data-type is one of:
To get data for bills, resolutions, and amendments, run:
```bash
./run govinfo --bulkdata=BILLSTATUS
./run bills
usc-run govinfo --bulkdata=BILLSTATUS
usc-run bills
```
The bills script will output bulk data into a top-level `data` directory, then organized by Congress number, bill type, and bill number. Two data output files will be generated for each bill: a JSON version (data.json) and an XML version (data.xml).

View File

View File

@@ -4,7 +4,7 @@ A module that monkey-patches the output_bill method to push the bill identifier
onto a task queue after the data file has been written to disk. To use this
module, invoke the bills scraper with the --patch option like so:
./run bills --patch=contrib.beanstalkd
usc-run bills --patch=contrib.beanstalkd
You must include a 'beakstalk' section in config.yml with this structure
(though the values are up to you):
@@ -34,9 +34,7 @@ import beanstalkc
# The patch module is loaded after the task module is loaded, so all task
# modules are on the import path.
import bills
import amendment_info
import vote_info
from congress.tasks import bills, amendment_info, vote_info
__all__ = [

78
congress/run.py Executable file
View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python
import sys
import os
import traceback
import pprint as pp
import logging
import importlib
# set global HTTP timeouts to 10 seconds
import socket
def main():
socket.setdefaulttimeout(10)
CONGRESS_ROOT = os.path.dirname(os.path.abspath(__file__))
# name of the task comes first
task_name = sys.argv[1]
# parse any command line flags off
options = {}
for arg in sys.argv[2:]:
if arg.startswith("--"):
if "=" in arg:
key, value = arg.split('=')
else:
key, value = arg, True
key = key.split("--")[1]
if value == 'True':
value = True
elif value == 'False':
value = False
options[key.lower()] = value
# configure logging
if options.get('debug', False):
log_level = "debug"
else:
log_level = options.get("log", "warn")
if log_level not in ["debug", "info", "warn", "error"]:
print("Invalid log level (specify: debug, info, warn, error).")
sys.exit(1)
if options.get('timestamps', False):
logging.basicConfig(format='%(asctime)s %(message)s', level=log_level.upper())
else:
logging.basicConfig(format='%(message)s', level=log_level.upper())
sys.path.append(os.path.join(CONGRESS_ROOT, "tasks"))
import utils
try:
task_mod = __import__(task_name)
if 'patch' in options:
patch_mod = importlib.import_module(options['patch'])
patch_func = getattr(patch_mod, 'patch', None)
if patch_func is None:
logging.error("You specified a --patch argument but the {} module does not contain a 'patch' function.".format(options['patch']))
sys.exit(1)
elif not callable(patch_func):
logging.error("You specified a --patch argument but {}.patch is not callable".format(options['patch']))
sys.exit(1)
else:
patch_mod.patch(task_name)
task_mod.run(options)
except Exception as exception:
utils.admin(exception)
if __name__ == "__main__":
main()

View File

View File

@@ -6,7 +6,7 @@ import csv
import zipfile
import datetime
import utils
from congress.tasks import utils
def run(options):

View File

@@ -5,9 +5,9 @@ import time
import json
from lxml import etree
import utils
from congress.tasks import utils
from bill_info import sponsor_for as sponsor_for_bill, action_for
from congress.tasks.bill_info import sponsor_for as sponsor_for_bill, action_for
def process_amendment(amdt_data, bill_id, options):
amdt = build_amendment_json_dict(amdt_data, options)

View File

@@ -1,4 +1,4 @@
import utils
from congress.tasks import utils
import logging
import re
import json

View File

@@ -4,10 +4,7 @@ import os
import re
import xmltodict
import bill_info
import amendment_info
import govinfo
import utils
from congress.tasks import bill_info, amendment_info, govinfo, utils
def run(options):

View File

@@ -1,4 +1,4 @@
import utils
from congress.tasks import utils
import os.path
import os
import re

View File

@@ -3,12 +3,12 @@
# https://www.govinfo.gov/sitemaps for a list of collections.
# This service was formerly called "Fdsys."
#
# ./run govinfo--collections=BILLS,STATUTE,...
# usc-run govinfo --collections=BILLS,STATUTE,...
# Download bill text (from the BILLS collection; there's also a bulk
# data BILLS collection but it has less in it), the Statues at Large,
# and other documents from GovInfo.gov's non-bulk-data collections.
#
# ./run govinfo --bulkdata=BILLSTATUS,FR,...
# usc-run govinfo --bulkdata=BILLSTATUS,FR,...
# Download bill status, the Federal Register, and other documents
# from GovInfo.gov's bulk data collections. (The BILLS collection occurs
# both as a regular collection (bill text in multiple formats) and as
@@ -46,7 +46,7 @@ import logging
import os
import os.path
import zipfile
import utils
from congress.tasks import utils
import rtyaml

View File

@@ -1,4 +1,4 @@
import utils
from congress.tasks import utils
import logging
import re
import json

View File

@@ -1,11 +1,11 @@
import utils
from congress.tasks import utils
import os
import os.path
import re
from lxml import html, etree
import logging
import nomination_info
from congress.tasks import nomination_info
def run(options):

View File

@@ -12,15 +12,15 @@
#
# First download the Statutes at Large from GPO:
#
# ./run fdsys --collections=STATUTE --store=mods
# usc-run fdsys --collections=STATUTE --store=mods
#
# To process statute text, get the text PDFs:
#
# ./run fdsys --collections=STATUTE --store=pdfs --granules
# usc-run fdsys --collections=STATUTE --store=pdfs --granules
#
# Then run this script:
#
# ./run statutes
# usc-run statutes
#
# Processes all downloaded statutes files and saves bill files:
# data/82/bills/hr/hr1/data.json and
@@ -41,10 +41,10 @@
# UTF-8 encoded and have form-feed characters marking page breaks.
#
# Examples:
# ./run statutes --volume=65
# ./run statutes --volumes=65-86
# ./run statutes --year=1951
# ./run statutes --years=1951-1972
# usc-run statutes --volume=65
# usc-run statutes --volumes=65-86
# usc-run statutes --year=1951
# usc-run statutes --years=1951-1972
# Processes just the indicated volume or range of volumes.
# Starting with the 93rd Congress (1973-1974, corresponding
# to volume 78 of the Statutes of Large), we have bill
@@ -52,7 +52,7 @@
#
# With bill text missing from THOMAS/GPO from the 93rd to
# 102nd Congresses, fill in the text-versions files like so:
# ./run statutes --volumes=87-106 --textversions
# usc-run statutes --volumes=87-106 --textversions
import logging
import time
@@ -63,9 +63,7 @@ import json
import os.path
import subprocess
import utils
import bill_info
import bill_versions
from congress.tasks import utils, bill_info, bill_versions
import fdsys

View File

@@ -1,4 +1,4 @@
import utils
from congress.tasks import utils
import logging
import sys
import os
@@ -13,7 +13,7 @@ import subprocess
from bs4 import BeautifulSoup
from bills import output_for_bill
from congress.tasks.bills import output_for_bill
# Parsing data from the House' upcoming floor feed, at
# https://docs.house.gov/floor/

View File

@@ -1,4 +1,4 @@
import utils
from congress.tasks import utils
import logging
import re
import json

View File

@@ -1,4 +1,4 @@
import utils
from congress.tasks import utils
import json
from iso8601 import iso8601
import datetime
@@ -11,7 +11,7 @@ import datetime
from lxml import html, etree
import logging
import vote_info
from congress.tasks import vote_info
def run(options):

View File

@@ -5,8 +5,8 @@ import datetime
import time
import logging
import utils
from vote_info import output_vote
from congress.tasks import utils
from congress.tasks.vote_info import output_vote
# load some hard-coded codes
special_vote_options = { }

View File

@@ -1,12 +0,0 @@
pyyaml
iso8601
python-dateutil
lxml>=2.2
pytz
cssselect
scrapelib
mechanize
BeautifulSoup4
mock
xmltodict
rtyaml

74
run
View File

@@ -1,74 +0,0 @@
#!/usr/bin/env python
import sys
import os
import traceback
import pprint as pp
import logging
import importlib
# set global HTTP timeouts to 10 seconds
import socket
socket.setdefaulttimeout(10)
CONGRESS_ROOT = os.path.dirname(os.path.abspath(__file__))
# name of the task comes first
task_name = sys.argv[1]
# parse any command line flags off
options = {}
for arg in sys.argv[2:]:
if arg.startswith("--"):
if "=" in arg:
key, value = arg.split('=')
else:
key, value = arg, True
key = key.split("--")[1]
if value == 'True':
value = True
elif value == 'False':
value = False
options[key.lower()] = value
# configure logging
if options.get('debug', False):
log_level = "debug"
else:
log_level = options.get("log", "warn")
if log_level not in ["debug", "info", "warn", "error"]:
print("Invalid log level (specify: debug, info, warn, error).")
sys.exit(1)
if options.get('timestamps', False):
logging.basicConfig(format='%(asctime)s %(message)s', level=log_level.upper())
else:
logging.basicConfig(format='%(message)s', level=log_level.upper())
sys.path.append(os.path.join(CONGRESS_ROOT, "tasks"))
import utils
try:
task_mod = __import__(task_name)
if 'patch' in options:
patch_mod = importlib.import_module(options['patch'])
patch_func = getattr(patch_mod, 'patch', None)
if patch_func is None:
logging.error("You specified a --patch argument but the {} module does not contain a 'patch' function.".format(options['patch']))
sys.exit(1)
elif not callable(patch_func):
logging.error("You specified a --patch argument but {}.patch is not callable".format(options['patch']))
sys.exit(1)
else:
patch_mod.patch(task_name)
task_mod.run(options)
except Exception as exception:
utils.admin(exception)

1
run Symbolic link
View File

@@ -0,0 +1 @@
congress/run.py

View File

@@ -1,6 +1,6 @@
#!/bin/sh
# Refresh the bulk data collection.
./run govinfo --bulkdata=BILLSTATUS
usc-run govinfo --bulkdata=BILLSTATUS
# Turn into JSON and GovTrack-XML.
./run bills --govtrack $@
usc-run bills --govtrack $@

View File

@@ -1,4 +1,4 @@
#!/bin/sh
./run govinfo --collections=STATUTE --extract=mods,pdf
./run statutes --volumes=65-86 --govtrack # bill status
./run statutes --volumes=65-106 --textversions --extracttext # bill text
usc-run govinfo --collections=STATUTE --extract=mods,pdf
usc-run statutes --volumes=65-86 --govtrack # bill status
usc-run statutes --volumes=65-106 --textversions --extracttext # bill text

View File

@@ -1,27 +1,27 @@
#!/bin/sh
./run votes --govtrack --congress=101 --session=1989 --chamber=senate $@
./run votes --govtrack --congress=101 --session=1990 $@
./run votes --govtrack --congress=102 --session=1991 $@
./run votes --govtrack --congress=102 --session=1992 $@
./run votes --govtrack --congress=103 --session=1993 $@
./run votes --govtrack --congress=103 --session=1994 $@
./run votes --govtrack --congress=104 --session=1995 $@
./run votes --govtrack --congress=104 --session=1996 $@
./run votes --govtrack --congress=105 --session=1997 $@
./run votes --govtrack --congress=105 --session=1998 $@
./run votes --govtrack --congress=106 --session=1999 $@
./run votes --govtrack --congress=106 --session=2000 $@
./run votes --govtrack --congress=107 --session=2001 $@
./run votes --govtrack --congress=107 --session=2002 $@
./run votes --govtrack --congress=108 --session=2003 $@
./run votes --govtrack --congress=108 --session=2004 $@
./run votes --govtrack --congress=109 --session=2005 $@
./run votes --govtrack --congress=109 --session=2006 $@
./run votes --govtrack --congress=110 --session=2007 $@
./run votes --govtrack --congress=110 --session=2008 $@
./run votes --govtrack --congress=111 --session=2009 $@
./run votes --govtrack --congress=111 --session=2010 $@
./run votes --govtrack --congress=112 --session=2011 $@
./run votes --govtrack --congress=112 --session=2012 $@
./run votes --govtrack --congress=113 --session=2013 $@
./run votes --govtrack --congress=113 --session=2014 $@
usc-run votes --govtrack --congress=101 --session=1989 --chamber=senate $@
usc-run votes --govtrack --congress=101 --session=1990 $@
usc-run votes --govtrack --congress=102 --session=1991 $@
usc-run votes --govtrack --congress=102 --session=1992 $@
usc-run votes --govtrack --congress=103 --session=1993 $@
usc-run votes --govtrack --congress=103 --session=1994 $@
usc-run votes --govtrack --congress=104 --session=1995 $@
usc-run votes --govtrack --congress=104 --session=1996 $@
usc-run votes --govtrack --congress=105 --session=1997 $@
usc-run votes --govtrack --congress=105 --session=1998 $@
usc-run votes --govtrack --congress=106 --session=1999 $@
usc-run votes --govtrack --congress=106 --session=2000 $@
usc-run votes --govtrack --congress=107 --session=2001 $@
usc-run votes --govtrack --congress=107 --session=2002 $@
usc-run votes --govtrack --congress=108 --session=2003 $@
usc-run votes --govtrack --congress=108 --session=2004 $@
usc-run votes --govtrack --congress=109 --session=2005 $@
usc-run votes --govtrack --congress=109 --session=2006 $@
usc-run votes --govtrack --congress=110 --session=2007 $@
usc-run votes --govtrack --congress=110 --session=2008 $@
usc-run votes --govtrack --congress=111 --session=2009 $@
usc-run votes --govtrack --congress=111 --session=2010 $@
usc-run votes --govtrack --congress=112 --session=2011 $@
usc-run votes --govtrack --congress=112 --session=2012 $@
usc-run votes --govtrack --congress=113 --session=2013 $@
usc-run votes --govtrack --congress=113 --session=2014 $@

View File

@@ -1,8 +1,8 @@
for congress in {1..100}; do
./run voteview --congress=$congress --govtrack $@
usc-run voteview --congress=$congress --govtrack $@
# After the first run, no need to update legislator info.
export UPDATE_CONGRESS_LEGISLATORS=NO
done
./run voteview --govtrack --congress=101 --session=1989 --chamber=h $@
usc-run voteview --govtrack --congress=101 --session=1989 --chamber=h $@

45
setup.py Normal file
View File

@@ -0,0 +1,45 @@
"""Setup file for using congress as a python package."""
from os import path
import setuptools
# Obtain long_description from README.md
here = path.abspath(path.dirname(__file__))
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
setuptools.setup(
name='united-states-congress',
version='0.0.1',
author='The unitedstates organization on GitHub',
long_description=long_description,
description='Public domain data collectors for the work of Congress, '
'including legislation, amendments, and votes.',
license='CC0-1.0',
packages=setuptools.find_packages(),
install_requires=[
'beautifulsoup4',
'cssselect',
'iso8601',
'lxml',
'mechanize',
'mock',
'rtyaml',
'python-dateutil',
'pytz',
'pyyaml',
'scrapelib',
'xmltodict',
],
entry_points={
'console_scripts': [
'usc-run=congress.run:main'
],
},
scripts=[
'scripts/bills.sh',
'scripts/statutes.sh',
'scripts/votes.sh',
'scripts/voteview.sh'
],
)

View File

@@ -2,7 +2,7 @@
import sys
import unittest
sys.path.append("tasks") # allow test classes to easily load tasks
sys.path.append("congress/tasks") # allow test classes to easily load tasks
sys.path.append("test") # allow fixtures.py to be loaded
tests = unittest.TestLoader().discover("test")