mirror of
https://github.com/unitedstates/congress.git
synced 2025-12-19 09:06:59 -05:00
make congress into a python package (#267)
change directory structure to make python package conventional; add setup.py file to specify deps; guide users to use the installed `usc-run` command; associated changes to other scripts; make scripts installable when package is installed; add a symlink for congress/run.py to run for backwards compat; remove redundant requirements file
This commit is contained in:
@@ -5,12 +5,12 @@ python:
|
||||
os:
|
||||
- linux
|
||||
|
||||
install: pip install -r requirements.txt
|
||||
install: pip install .
|
||||
script: python test/run
|
||||
|
||||
after_success:
|
||||
- pip install pyflakes
|
||||
- pyflakes tasks/*.py | tee >(wc -l)
|
||||
- pyflakes congress/tasks/*.py | tee >(wc -l)
|
||||
- pyflakes test/*.py | tee >(wc -l)
|
||||
|
||||
notifications:
|
||||
|
||||
@@ -40,7 +40,7 @@ RUN mkdir -p /opt/theunitedstates.io/
|
||||
ADD . /opt/theunitedstates.io/congress/
|
||||
WORKDIR /opt/theunitedstates.io/congress/
|
||||
|
||||
RUN pip install -r requirements.txt
|
||||
RUN pip install .
|
||||
|
||||
RUN echo "/opt/theunitedstates.io/congress/" > /usr/lib/python3.6/dist-packages/congress.pth
|
||||
|
||||
@@ -48,4 +48,4 @@ RUN mkdir -p /congress
|
||||
WORKDIR /congress
|
||||
|
||||
CMD []
|
||||
ENTRYPOINT ["/opt/theunitedstates.io/congress/run"]
|
||||
ENTRYPOINT ["/opt/theunitedstates.io/congress/congress/run.py"]
|
||||
|
||||
11
README.md
11
README.md
@@ -43,17 +43,18 @@ It's recommended you use a `virtualenv` (virtual environment) for development. C
|
||||
python3 -m venv congress
|
||||
source congress/bin/activate
|
||||
```
|
||||
Finally, with your virtual environment activated, install Python packages:
|
||||
Finally, with your virtual environment activated, install the package, which
|
||||
will automatically pull in the Python dependencies:
|
||||
|
||||
```bash
|
||||
pip3 install -r requirements.txt
|
||||
pip install .
|
||||
```
|
||||
|
||||
### Collecting the data
|
||||
|
||||
The general form to start the scraping process is:
|
||||
|
||||
./run <data-type> [--force] [other options]
|
||||
usc-run <data-type> [--force] [other options]
|
||||
|
||||
where data-type is one of:
|
||||
|
||||
@@ -67,8 +68,8 @@ where data-type is one of:
|
||||
To get data for bills, resolutions, and amendments, run:
|
||||
|
||||
```bash
|
||||
./run govinfo --bulkdata=BILLSTATUS
|
||||
./run bills
|
||||
usc-run govinfo --bulkdata=BILLSTATUS
|
||||
usc-run bills
|
||||
```
|
||||
|
||||
The bills script will output bulk data into a top-level `data` directory, then organized by Congress number, bill type, and bill number. Two data output files will be generated for each bill: a JSON version (data.json) and an XML version (data.xml).
|
||||
|
||||
0
congress/contrib/__init__.py
Normal file
0
congress/contrib/__init__.py
Normal file
@@ -4,7 +4,7 @@ A module that monkey-patches the output_bill method to push the bill identifier
|
||||
onto a task queue after the data file has been written to disk. To use this
|
||||
module, invoke the bills scraper with the --patch option like so:
|
||||
|
||||
./run bills --patch=contrib.beanstalkd
|
||||
usc-run bills --patch=contrib.beanstalkd
|
||||
|
||||
You must include a 'beanstalk' section in config.yml with this structure
|
||||
(though the values are up to you):
|
||||
@@ -34,9 +34,7 @@ import beanstalkc
|
||||
|
||||
# The patch module is loaded after the task module is loaded, so all task
|
||||
# modules are on the import path.
|
||||
import bills
|
||||
import amendment_info
|
||||
import vote_info
|
||||
from congress.tasks import bills, amendment_info, vote_info
|
||||
|
||||
|
||||
__all__ = [
|
||||
78
congress/run.py
Executable file
78
congress/run.py
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
import os
|
||||
import traceback
|
||||
import pprint as pp
|
||||
import logging
|
||||
import importlib
|
||||
|
||||
# set global HTTP timeouts to 10 seconds
|
||||
import socket
|
||||
|
||||
def main():
    """Command-line entry point: run the scraper task named by the first argument.

    Usage: usc-run <data-type> [--force] [other options]

    Every argument after the task name that starts with ``--`` becomes an
    entry in the ``options`` dict handed to the task's ``run`` function:
    ``--key=value`` stores the string value (the literal strings ``True`` and
    ``False`` are converted to booleans), and a bare ``--key`` stores ``True``.
    Option names are lower-cased. Arguments that do not start with ``--`` are
    ignored.

    Exits with status 1 when no task name is given, when the log level is
    invalid, or when ``--patch`` names a module without a callable ``patch``.
    Any other exception raised while importing or running the task is passed
    to ``utils.admin``.
    """
    # set the global socket timeout to 10 seconds
    socket.setdefaulttimeout(10)

    CONGRESS_ROOT = os.path.dirname(os.path.abspath(__file__))

    # name of the task comes first; fail with a usage message rather than
    # an IndexError traceback when it is missing
    if len(sys.argv) < 2:
        print("Usage: usc-run <data-type> [--force] [other options]")
        sys.exit(1)
    task_name = sys.argv[1]

    # parse any command line flags off
    options = {}
    for arg in sys.argv[2:]:
        if not arg.startswith("--"):
            continue

        # Split on the first "=" only, so values that themselves contain
        # "=" (e.g. --url=http://host/?a=b) are kept intact.
        key, has_value, value = arg[2:].partition("=")
        if not has_value:
            value = True
        elif value == 'True':
            value = True
        elif value == 'False':
            value = False
        options[key.lower()] = value

    # configure logging
    if options.get('debug', False):
        log_level = "debug"
    else:
        log_level = options.get("log", "warn")

    if log_level not in ["debug", "info", "warn", "error"]:
        print("Invalid log level (specify: debug, info, warn, error).")
        sys.exit(1)

    if options.get('timestamps', False):
        log_format = '%(asctime)s %(message)s'
    else:
        log_format = '%(message)s'
    logging.basicConfig(format=log_format, level=log_level.upper())

    # Task modules are looked up by bare name, so the tasks directory has to
    # be on the import path before anything from it is imported.
    sys.path.append(os.path.join(CONGRESS_ROOT, "tasks"))
    import utils

    try:
        task_mod = __import__(task_name)

        if 'patch' in options:
            patch_mod = importlib.import_module(options['patch'])
            patch_func = getattr(patch_mod, 'patch', None)
            if patch_func is None:
                logging.error("You specified a --patch argument but the {} module does not contain a 'patch' function.".format(options['patch']))
                sys.exit(1)
            elif not callable(patch_func):
                logging.error("You specified a --patch argument but {}.patch is not callable".format(options['patch']))
                sys.exit(1)
            else:
                patch_func(task_name)

        task_mod.run(options)
    except Exception as exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exits above are not swallowed here.
        utils.admin(exception)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
0
congress/tasks/__init__.py
Normal file
0
congress/tasks/__init__.py
Normal file
@@ -6,7 +6,7 @@ import csv
|
||||
import zipfile
|
||||
import datetime
|
||||
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
|
||||
|
||||
def run(options):
|
||||
@@ -5,9 +5,9 @@ import time
|
||||
import json
|
||||
from lxml import etree
|
||||
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
|
||||
from bill_info import sponsor_for as sponsor_for_bill, action_for
|
||||
from congress.tasks.bill_info import sponsor_for as sponsor_for_bill, action_for
|
||||
|
||||
def process_amendment(amdt_data, bill_id, options):
|
||||
amdt = build_amendment_json_dict(amdt_data, options)
|
||||
@@ -1,4 +1,4 @@
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
@@ -4,10 +4,7 @@ import os
|
||||
import re
|
||||
import xmltodict
|
||||
|
||||
import bill_info
|
||||
import amendment_info
|
||||
import govinfo
|
||||
import utils
|
||||
from congress.tasks import bill_info, amendment_info, govinfo, utils
|
||||
|
||||
|
||||
def run(options):
|
||||
@@ -1,4 +1,4 @@
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
import os.path
|
||||
import os
|
||||
import re
|
||||
@@ -3,12 +3,12 @@
|
||||
# https://www.govinfo.gov/sitemaps for a list of collections.
|
||||
# This service was formerly called "Fdsys."
|
||||
#
|
||||
# ./run govinfo--collections=BILLS,STATUTE,...
|
||||
# usc-run govinfo --collections=BILLS,STATUTE,...
|
||||
# Download bill text (from the BILLS collection; there's also a bulk
|
||||
# data BILLS collection but it has less in it), the Statutes at Large,
|
||||
# and other documents from GovInfo.gov's non-bulk-data collections.
|
||||
#
|
||||
# ./run govinfo --bulkdata=BILLSTATUS,FR,...
|
||||
# usc-run govinfo --bulkdata=BILLSTATUS,FR,...
|
||||
# Download bill status, the Federal Register, and other documents
|
||||
# from GovInfo.gov's bulk data collections. (The BILLS collection occurs
|
||||
# both as a regular collection (bill text in multiple formats) and as
|
||||
@@ -46,7 +46,7 @@ import logging
|
||||
import os
|
||||
import os.path
|
||||
import zipfile
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
|
||||
import rtyaml
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
@@ -1,11 +1,11 @@
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
import os
|
||||
import os.path
|
||||
import re
|
||||
from lxml import html, etree
|
||||
import logging
|
||||
|
||||
import nomination_info
|
||||
from congress.tasks import nomination_info
|
||||
|
||||
|
||||
def run(options):
|
||||
@@ -12,15 +12,15 @@
|
||||
#
|
||||
# First download the Statutes at Large from GPO:
|
||||
#
|
||||
# ./run fdsys --collections=STATUTE --store=mods
|
||||
# usc-run fdsys --collections=STATUTE --store=mods
|
||||
#
|
||||
# To process statute text, get the text PDFs:
|
||||
#
|
||||
# ./run fdsys --collections=STATUTE --store=pdfs --granules
|
||||
# usc-run fdsys --collections=STATUTE --store=pdfs --granules
|
||||
#
|
||||
# Then run this script:
|
||||
#
|
||||
# ./run statutes
|
||||
# usc-run statutes
|
||||
#
|
||||
# Processes all downloaded statutes files and saves bill files:
|
||||
# data/82/bills/hr/hr1/data.json and
|
||||
@@ -41,10 +41,10 @@
|
||||
# UTF-8 encoded and have form-feed characters marking page breaks.
|
||||
#
|
||||
# Examples:
|
||||
# ./run statutes --volume=65
|
||||
# ./run statutes --volumes=65-86
|
||||
# ./run statutes --year=1951
|
||||
# ./run statutes --years=1951-1972
|
||||
# usc-run statutes --volume=65
|
||||
# usc-run statutes --volumes=65-86
|
||||
# usc-run statutes --year=1951
|
||||
# usc-run statutes --years=1951-1972
|
||||
# Processes just the indicated volume or range of volumes.
|
||||
# Starting with the 93rd Congress (1973-1974, corresponding
|
||||
# to volume 78 of the Statutes at Large), we have bill
|
||||
@@ -52,7 +52,7 @@
|
||||
#
|
||||
# With bill text missing from THOMAS/GPO from the 93rd to
|
||||
# 102nd Congresses, fill in the text-versions files like so:
|
||||
# ./run statutes --volumes=87-106 --textversions
|
||||
# usc-run statutes --volumes=87-106 --textversions
|
||||
|
||||
import logging
|
||||
import time
|
||||
@@ -63,9 +63,7 @@ import json
|
||||
import os.path
|
||||
import subprocess
|
||||
|
||||
import utils
|
||||
import bill_info
|
||||
import bill_versions
|
||||
from congress.tasks import utils, bill_info, bill_versions
|
||||
import fdsys
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
@@ -13,7 +13,7 @@ import subprocess
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from bills import output_for_bill
|
||||
from congress.tasks.bills import output_for_bill
|
||||
|
||||
# Parsing data from the House's upcoming floor feed, at
|
||||
# https://docs.house.gov/floor/
|
||||
@@ -1,4 +1,4 @@
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
@@ -1,4 +1,4 @@
|
||||
import utils
|
||||
from congress.tasks import utils
|
||||
import json
|
||||
from iso8601 import iso8601
|
||||
import datetime
|
||||
@@ -11,7 +11,7 @@ import datetime
|
||||
from lxml import html, etree
|
||||
import logging
|
||||
|
||||
import vote_info
|
||||
from congress.tasks import vote_info
|
||||
|
||||
|
||||
def run(options):
|
||||
@@ -5,8 +5,8 @@ import datetime
|
||||
import time
|
||||
import logging
|
||||
|
||||
import utils
|
||||
from vote_info import output_vote
|
||||
from congress.tasks import utils
|
||||
from congress.tasks.vote_info import output_vote
|
||||
|
||||
# load some hard-coded codes
|
||||
special_vote_options = { }
|
||||
@@ -1,12 +0,0 @@
|
||||
pyyaml
|
||||
iso8601
|
||||
python-dateutil
|
||||
lxml>=2.2
|
||||
pytz
|
||||
cssselect
|
||||
scrapelib
|
||||
mechanize
|
||||
BeautifulSoup4
|
||||
mock
|
||||
xmltodict
|
||||
rtyaml
|
||||
74
run
74
run
@@ -1,74 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
import os
|
||||
import traceback
|
||||
import pprint as pp
|
||||
import logging
|
||||
import importlib
|
||||
|
||||
# set global HTTP timeouts to 10 seconds
|
||||
import socket
|
||||
socket.setdefaulttimeout(10)
|
||||
|
||||
CONGRESS_ROOT = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# name of the task comes first
|
||||
task_name = sys.argv[1]
|
||||
|
||||
# parse any command line flags off
|
||||
options = {}
|
||||
for arg in sys.argv[2:]:
|
||||
if arg.startswith("--"):
|
||||
|
||||
if "=" in arg:
|
||||
key, value = arg.split('=')
|
||||
else:
|
||||
key, value = arg, True
|
||||
|
||||
key = key.split("--")[1]
|
||||
if value == 'True':
|
||||
value = True
|
||||
elif value == 'False':
|
||||
value = False
|
||||
options[key.lower()] = value
|
||||
|
||||
|
||||
# configure logging
|
||||
if options.get('debug', False):
|
||||
log_level = "debug"
|
||||
else:
|
||||
log_level = options.get("log", "warn")
|
||||
|
||||
if log_level not in ["debug", "info", "warn", "error"]:
|
||||
print("Invalid log level (specify: debug, info, warn, error).")
|
||||
sys.exit(1)
|
||||
|
||||
if options.get('timestamps', False):
|
||||
logging.basicConfig(format='%(asctime)s %(message)s', level=log_level.upper())
|
||||
else:
|
||||
logging.basicConfig(format='%(message)s', level=log_level.upper())
|
||||
|
||||
|
||||
sys.path.append(os.path.join(CONGRESS_ROOT, "tasks"))
|
||||
import utils
|
||||
|
||||
try:
|
||||
task_mod = __import__(task_name)
|
||||
|
||||
if 'patch' in options:
|
||||
patch_mod = importlib.import_module(options['patch'])
|
||||
patch_func = getattr(patch_mod, 'patch', None)
|
||||
if patch_func is None:
|
||||
logging.error("You specified a --patch argument but the {} module does not contain a 'patch' function.".format(options['patch']))
|
||||
sys.exit(1)
|
||||
elif not callable(patch_func):
|
||||
logging.error("You specified a --patch argument but {}.patch is not callable".format(options['patch']))
|
||||
sys.exit(1)
|
||||
else:
|
||||
patch_mod.patch(task_name)
|
||||
|
||||
task_mod.run(options)
|
||||
except Exception as exception:
|
||||
utils.admin(exception)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
# Refresh the bulk data collection.
|
||||
./run govinfo --bulkdata=BILLSTATUS
|
||||
usc-run govinfo --bulkdata=BILLSTATUS
|
||||
|
||||
# Turn into JSON and GovTrack-XML.
|
||||
./run bills --govtrack $@
|
||||
usc-run bills --govtrack $@
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/sh
|
||||
./run govinfo --collections=STATUTE --extract=mods,pdf
|
||||
./run statutes --volumes=65-86 --govtrack # bill status
|
||||
./run statutes --volumes=65-106 --textversions --extracttext # bill text
|
||||
usc-run govinfo --collections=STATUTE --extract=mods,pdf
|
||||
usc-run statutes --volumes=65-86 --govtrack # bill status
|
||||
usc-run statutes --volumes=65-106 --textversions --extracttext # bill text
|
||||
|
||||
@@ -1,27 +1,27 @@
|
||||
#!/bin/sh
|
||||
./run votes --govtrack --congress=101 --session=1989 --chamber=senate $@
|
||||
./run votes --govtrack --congress=101 --session=1990 $@
|
||||
./run votes --govtrack --congress=102 --session=1991 $@
|
||||
./run votes --govtrack --congress=102 --session=1992 $@
|
||||
./run votes --govtrack --congress=103 --session=1993 $@
|
||||
./run votes --govtrack --congress=103 --session=1994 $@
|
||||
./run votes --govtrack --congress=104 --session=1995 $@
|
||||
./run votes --govtrack --congress=104 --session=1996 $@
|
||||
./run votes --govtrack --congress=105 --session=1997 $@
|
||||
./run votes --govtrack --congress=105 --session=1998 $@
|
||||
./run votes --govtrack --congress=106 --session=1999 $@
|
||||
./run votes --govtrack --congress=106 --session=2000 $@
|
||||
./run votes --govtrack --congress=107 --session=2001 $@
|
||||
./run votes --govtrack --congress=107 --session=2002 $@
|
||||
./run votes --govtrack --congress=108 --session=2003 $@
|
||||
./run votes --govtrack --congress=108 --session=2004 $@
|
||||
./run votes --govtrack --congress=109 --session=2005 $@
|
||||
./run votes --govtrack --congress=109 --session=2006 $@
|
||||
./run votes --govtrack --congress=110 --session=2007 $@
|
||||
./run votes --govtrack --congress=110 --session=2008 $@
|
||||
./run votes --govtrack --congress=111 --session=2009 $@
|
||||
./run votes --govtrack --congress=111 --session=2010 $@
|
||||
./run votes --govtrack --congress=112 --session=2011 $@
|
||||
./run votes --govtrack --congress=112 --session=2012 $@
|
||||
./run votes --govtrack --congress=113 --session=2013 $@
|
||||
./run votes --govtrack --congress=113 --session=2014 $@
|
||||
usc-run votes --govtrack --congress=101 --session=1989 --chamber=senate $@
|
||||
usc-run votes --govtrack --congress=101 --session=1990 $@
|
||||
usc-run votes --govtrack --congress=102 --session=1991 $@
|
||||
usc-run votes --govtrack --congress=102 --session=1992 $@
|
||||
usc-run votes --govtrack --congress=103 --session=1993 $@
|
||||
usc-run votes --govtrack --congress=103 --session=1994 $@
|
||||
usc-run votes --govtrack --congress=104 --session=1995 $@
|
||||
usc-run votes --govtrack --congress=104 --session=1996 $@
|
||||
usc-run votes --govtrack --congress=105 --session=1997 $@
|
||||
usc-run votes --govtrack --congress=105 --session=1998 $@
|
||||
usc-run votes --govtrack --congress=106 --session=1999 $@
|
||||
usc-run votes --govtrack --congress=106 --session=2000 $@
|
||||
usc-run votes --govtrack --congress=107 --session=2001 $@
|
||||
usc-run votes --govtrack --congress=107 --session=2002 $@
|
||||
usc-run votes --govtrack --congress=108 --session=2003 $@
|
||||
usc-run votes --govtrack --congress=108 --session=2004 $@
|
||||
usc-run votes --govtrack --congress=109 --session=2005 $@
|
||||
usc-run votes --govtrack --congress=109 --session=2006 $@
|
||||
usc-run votes --govtrack --congress=110 --session=2007 $@
|
||||
usc-run votes --govtrack --congress=110 --session=2008 $@
|
||||
usc-run votes --govtrack --congress=111 --session=2009 $@
|
||||
usc-run votes --govtrack --congress=111 --session=2010 $@
|
||||
usc-run votes --govtrack --congress=112 --session=2011 $@
|
||||
usc-run votes --govtrack --congress=112 --session=2012 $@
|
||||
usc-run votes --govtrack --congress=113 --session=2013 $@
|
||||
usc-run votes --govtrack --congress=113 --session=2014 $@
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
for congress in {1..100}; do
|
||||
./run voteview --congress=$congress --govtrack $@
|
||||
usc-run voteview --congress=$congress --govtrack $@
|
||||
|
||||
# After the first run, no need to update legislator info.
|
||||
export UPDATE_CONGRESS_LEGISLATORS=NO
|
||||
done
|
||||
./run voteview --govtrack --congress=101 --session=1989 --chamber=h $@
|
||||
usc-run voteview --govtrack --congress=101 --session=1989 --chamber=h $@
|
||||
|
||||
|
||||
45
setup.py
Normal file
45
setup.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Setup file for using congress as a python package."""
from os import path

import setuptools

# Obtain long_description from README.md
here = path.abspath(path.dirname(__file__))
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setuptools.setup(
    name='united-states-congress',
    version='0.0.1',
    author='The unitedstates organization on GitHub',
    long_description=long_description,
    # README.md is Markdown; without an explicit content type, package
    # indexes treat the long description as reStructuredText.
    long_description_content_type='text/markdown',
    description='Public domain data collectors for the work of Congress, '
                'including legislation, amendments, and votes.',
    license='CC0-1.0',
    packages=setuptools.find_packages(),
    install_requires=[
        'beautifulsoup4',
        'cssselect',
        'iso8601',
        'lxml',
        'mechanize',
        'mock',
        'rtyaml',
        'python-dateutil',
        'pytz',
        'pyyaml',
        'scrapelib',
        'xmltodict',
    ],
    # Installs the `usc-run` command, which calls main() in congress/run.py.
    entry_points={
        'console_scripts': [
            'usc-run=congress.run:main'
        ],
    },
    # Helper shell scripts installed alongside the package; they invoke
    # `usc-run`.
    scripts=[
        'scripts/bills.sh',
        'scripts/statutes.sh',
        'scripts/votes.sh',
        'scripts/voteview.sh'
    ],
)
|
||||
2
test/run
2
test/run
@@ -2,7 +2,7 @@
|
||||
|
||||
import sys
|
||||
import unittest
|
||||
sys.path.append("tasks") # allow test classes to easily load tasks
|
||||
sys.path.append("congress/tasks") # allow test classes to easily load tasks
|
||||
sys.path.append("test") # allow fixtures.py to be loaded
|
||||
|
||||
tests = unittest.TestLoader().discover("test")
|
||||
|
||||
Reference in New Issue
Block a user