mirror of
https://github.com/unitedstates/congress.git
synced 2026-03-25 14:00:05 -04:00
128 lines
4.9 KiB
Python
128 lines
4.9 KiB
Python
from pdfminer.pdfparser import PDFParser, PDFDocument
|
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
|
from pdfminer.converter import TextConverter, HTMLConverter
|
|
from pdfminer.layout import LAParams, LTText, LTTextBoxHorizontal
|
|
from pdfminer.converter import PDFPageAggregator
|
|
from pdfminer.pdfdevice import PDFDevice
|
|
import re, math
|
|
from collections import defaultdict
|
|
import logging
|
|
|
|
rsrcmgr = PDFResourceManager()
|
|
laparams = LAParams()
|
|
|
|
#divide between page number boxes and text
|
|
XDIVIDE = 149
|
|
|
|
#maximum acceptable distance in y values between two corresponding boxes.
|
|
MARGIN = 3
|
|
|
|
#pixels per
|
|
SPACE = 5
|
|
|
|
def serialize_pdf_item(item):
|
|
return {
|
|
'x0': item.x0,
|
|
'y0': item.y0,
|
|
'x1': item.x1,
|
|
'y1': item.y1,
|
|
'text': item.get_text()
|
|
}
|
|
|
|
#see http://www.unixuser.org/~euske/python/pdfminer/programming.html#layout
|
|
#take the PDF of the bill and return a generator for each page
|
|
def get_text_from_pdf(filename):
|
|
fp = file(filename, 'rb')
|
|
parser = PDFParser(fp)
|
|
doc = PDFDocument()
|
|
parser.set_document(doc)
|
|
doc.set_parser(parser)
|
|
doc.initialize('')
|
|
device = PDFDevice(rsrcmgr)
|
|
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
|
|
|
#page numbers are given in LTTextLineHorizontal objects
|
|
#borrowing some ideas from https://github.com/euske/pdfminer/blob/master/pdfminer/converter.py
|
|
text = {}
|
|
|
|
#roadmap will contain markers to section headers
|
|
roadmap = {}
|
|
for page in enumerate(doc.get_pages()):
|
|
logging.info("Extracting page %d" % (page[0] + 1))
|
|
interpreter.process_page(page[1])
|
|
layout = device.get_result()
|
|
|
|
#we're going to build two lists, one of line numbers and one of text corresponding to those line numbers
|
|
line_numbers = []
|
|
line_text = []
|
|
|
|
for item in [x for x in layout if isinstance(x, LTTextBoxHorizontal)]:
|
|
#print item
|
|
# if it's an LTTextBoxHorizontal, left of 150px and a digit, it's (hopefully) a line number
|
|
if item.x0 < XDIVIDE and re.sub("[\s\d]+", "", item.get_text()) == "":
|
|
line_numbers.append(item)
|
|
# else if it's on the other side of the xdivide, add to candidates for lines of text
|
|
elif item.x0 >= XDIVIDE:
|
|
line_text.append(item)
|
|
#we can optionally catch everything else here
|
|
'''
|
|
else:
|
|
extra.append(item)
|
|
'''
|
|
|
|
# now we need to match them up
|
|
# MARGIN is the acceptable distance in y values between two boxes to still decide they correspond to one another
|
|
lines = defaultdict(list)
|
|
for number in line_numbers:
|
|
c = 0
|
|
while c < len(line_text):
|
|
if abs(number.y0 - line_text[c].y0) <= MARGIN: #and abs(number.y1 - line_text[c].y1) <= MARGIN:
|
|
lines[number.get_text().strip()].append(line_text[c])
|
|
del line_text[c]
|
|
else:
|
|
c += 1
|
|
|
|
# this will contain new lines we discover -- see below
|
|
extras = {}
|
|
|
|
# sort text elements in the line by x pos, and indent according to first element's x position
|
|
for line in lines:
|
|
lines[line] = sorted(lines[line], key=lambda x: x.x0)
|
|
# store multiline nodes
|
|
previous = [x for x in lines[line] if len(x.get_text().split('\n')) > 2]
|
|
|
|
prefix = " " * int(math.floor((lines[line][0].x0 - XDIVIDE) / SPACE))
|
|
lines[line] = prefix + " ".join([x.get_text().strip().split('\n')[-1] for x in lines[line]])
|
|
|
|
# if we don't have anything for the previous line, put multiline nodes here. Mainly useful for TITLE TK \n SUBHEAD \n etc
|
|
if line != '1' and str(int(line) - 1) not in lines:
|
|
extras[str(int(line) - 1)] = " ".join([x.get_text().strip().split('\n')[-2] for x in previous])
|
|
|
|
#print extras
|
|
lines = dict(lines.items() + extras.items())
|
|
text[page[0] + 1] = lines
|
|
|
|
# build roadmap
|
|
current_title = ""
|
|
for page in text:
|
|
lines = text[page]
|
|
for line in sorted(lines.keys()):
|
|
temp = re.search("TITLE [IVXL]+", lines[line])
|
|
if temp:
|
|
roadmap[temp.group(0).upper()] = [page, line]
|
|
current_title = temp.group(0)
|
|
temp = re.search("Subtitle [A-Z]", lines[line])
|
|
if temp:
|
|
roadmap[temp.group(0).upper() + " OF " + current_title] = [page, line]
|
|
|
|
#add last repository for "appropriate place"
|
|
last_page = sorted(text.keys())[-1]
|
|
text[int(last_page) + 1] = { '0' : "<em>Amendments with unspecified locations.</em>" }
|
|
roadmap["APPROPRIATE PLACE"] = [last_page + 1, 0]
|
|
|
|
return {
|
|
"text": text,
|
|
"roadmap": roadmap
|
|
}
|