mirror of
https://github.com/unitedstates/contact-congress.git
synced 2025-12-19 17:37:24 -05:00
all of these forms use generic "Thank" for the body contains: field A000367 (fixes #491) B000013 (fixes #487) B001252 (fixes #484) B001269 (fixes #485) B001270 (fixes #482) H001064 (fixes #11) K000381 (fixes #12) K000385 (fixes #9) P000607 (fixes #10) S001192 (fixes #14) S001195 (fixes #8) V000132 (fixes #15)
114 lines
5.0 KiB
Python
114 lines
5.0 KiB
Python
r"""Draft YAML snippets (fill_in entries and select options) from saved HTML.

HOW TO USE:

This file:
  * Extracts the option "value" attributes, and creates a hash if the
    option's text does not match the option's value.
  * Extracts every input name and id, and outputs a fill_in list (note that
    this will output for ALL <input> tags).

You should try to give this file inputs containing only the forms that you
want to extract from.

  1. Copy this file to a separate location (so as not to send random files
     to github).
  2. Create a folder named "input" in the same location.
  3. Copy the section of the HTML containing the form options that you want
     to extract (does not have to be exact, you can even copy and paste the
     entire HTML source code if you want, only there is less chance for
     errors if you only copy/paste the section containing the form).
  4. Paste into a text file (NOT word) and save in the "input" folder.
  5. Run the .py file with Python 3 from that location, and copy/paste the
     resulting output.

It is recommended to check the output for any errors. If the attributes are
in single quotes you can change the regular expressions below to

    selectoptions:  r'(?:<option.*?value\s*=\s*)(\'.*?\')'
    inputname:      r'(?:<input.*?name\s*=\s*\')(.*?)(?:\')'
    inputselectors: r'(?:<input.*?id\s*=\s*\')(.*?)(?:\')'
"""

import os
import re

# One formatted report per input file, in the order the files were visited.
YAMLselections = []

# Folder (relative to the current working directory) scanned for HTML files.
INPUT_DIR = 'input'

# compile regular expressions
selectoptions = re.compile(r'(?:<option.*?value\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)
selecttext = re.compile(r'(?:<option.*?value.*?>)(.*?)(?:</)', re.IGNORECASE)
inputname = re.compile(r'(?:<input.*?name\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)
inputselectors = re.compile(r'(?:<input.*?id\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)

# Keyword -> placeholder, checked in order.  Order matters: "address2" has to
# be tried before "address", otherwise the second street line is never chosen.
VALUE_HINTS = (
    ("first", "$NAME_FIRST"),
    ("last", "$NAME_LAST"),
    ("zip5", "$ADDRESS_ZIP5"),
    ("zip4", "$ADDRESS_ZIP4"),
    ("prefix", "$NAME_PREFIX"),
    ("address2", "$ADDRESS_STREET_2"),
    ("address", "$ADDRESS_STREET"),
    ("city", "$ADDRESS_CITY"),
    ("email", "$EMAIL"),
    ("phone", "$PHONE"),
    ("subject", "$SUBJECT"),
    ("message", "$MESSAGE"),
)

# NOTE(review): the leading-space literals used in the output below are the
# single spaces found in this copy of the script; confirm the intended YAML
# nesting depth and widen them if needed.
BLANK_ENTRY = " - name: \n" + " selector: \"#\n" + " value: \n" + " required: \n"


def guess_value(name):
    """Return the placeholder suggested by an input's name, or "" if none.

    Matching is a case-insensitive substring test, in line with the
    case-insensitive tag regexes above, so "Email" and "email" both map to
    "$EMAIL".
    """
    lowered = name.lower()
    for keyword, placeholder in VALUE_HINTS:
        if keyword in lowered:
            return placeholder
    return ""


def render_inputs(names, ids):
    """Format the fill_in list for the extracted input names and ids.

    names -- input "name" attribute values, in document order.
    ids   -- input "id" attribute values, in document order.

    The two lists are paired by position.  When there are fewer ids than
    names, each unpaired name is followed by a warning line and a blank
    entry to be completed by hand.
    """
    parts = [" - fill_in:\n"]
    for position, name in enumerate(names):
        parts.append(" - name: " + name + "\n")
        if position >= len(ids):
            # More named inputs than ids: the pairing can no longer be trusted.
            parts.append("\ninputs may not match selectors\n")
            parts.append(BLANK_ENTRY)
            continue
        parts.append(" selector: \"#" + ids[position] + "\"\n")
        parts.append(" value: " + guess_value(name) + "\n")
        required = "Yes" if "required" in name.lower() else ""
        parts.append(" required: " + required + "\n")
    return "".join(parts)


def render_options(values, labels):
    """Format the extracted <option> values, as a list or a label->value hash.

    values -- option "value" attribute values.
    labels -- option display texts.

    * Different lengths: a warning, then the values only.
    * Same length but different contents: a warning, then "label": "value"
      pairs matched by position.
    * Identical sets: a plain list of the values.
    """
    if len(values) != len(labels):
        header = "\nerror with text, printing option values only\n"
        rows = [" - \"" + value + "\"\n" for value in values]
    elif set(values) != set(labels):
        header = "\noptions != text, printing hash\n"
        rows = [" \"" + label + "\": \"" + value + "\"\n"
                for label, value in zip(labels, values)]
    else:
        header = ""
        rows = [" - \"" + value + "\"\n" for value in values]
    return header + "".join(rows)


def render_file(filename, html):
    """Return the full report (inputs section, then options section) for one file."""
    banner = '------------\n' + filename + '\n'
    inputs_section = render_inputs(inputname.findall(html),
                                   inputselectors.findall(html))
    options_section = render_options(selectoptions.findall(html),
                                     selecttext.findall(html))
    return (banner + 'inputs:' + '\n' + inputs_section +
            banner + 'options:' + '\n' + options_section)


def collect_selections(input_dir=INPUT_DIR):
    """Walk input_dir and return one report string per file found.

    Paths are built with os.path.join from the directory os.walk reports, so
    the script works on any platform and also reads files placed in
    sub-folders.  A missing folder simply yields an empty list.
    """
    reports = []
    for root, _dirs, files in os.walk(input_dir):
        for filename in files:
            path = os.path.join(root, filename)
            # errors='replace' keeps one oddly-encoded byte in a pasted page
            # from aborting the whole run.
            with open(path, 'r', errors='replace') as handle:
                html = handle.read()
            reports.append(render_file(filename, html))
    return reports


YAMLselections.extend(collect_selections())

for x in YAMLselections:
    print(x)