Files
contact-congress/support/extractoptions.py
makecakenotwar97@gmail.com 019ac2054a added a lot of files, edits to extractoptions
all of these forms use generic "Thank" for the body contains: field
A000367 (fixes #491)
B000013 (fixes #487)
B001252 (fixes #484)
B001269 (fixes #485)
B001270 (fixes #482)
H001064 (fixes #11)
K000381 (fixes #12)
K000385 (fixes #9)
P000607 (fixes #10)
S001192 (fixes #14)
S001195 (fixes #8)
V000132 (fixes #15)
2013-09-22 18:56:00 -04:00

114 lines
5.0 KiB
Python

'''
HOW TO USE:
This file:
Extracts the option "value" attributes, and creates a hash if the option's text does not match the option's value
Extracts all input name + value, and outputs a fill_in list (note that this will output for ALL <input> tags)
You should try to give this file inputs containing only the forms that you want to extract from.
Copy this file to a separate location (so as not to send random files
to github)
Create a folder named "input" in the same location
Copy the section of the HTML containing the form options that you
want to extract (does not have to be exact, you can even copy and paste
the entire HTML source code if you want, only there is less chance for
errors if you only copy/paste the section containing the form)
Paste into a text file (NOT word) and save in the "input" folder.
Run the .py file in python 3.3, and copy/paste the resulting output.
Check the output for errors. If the HTML attributes are written with single
quotes instead of double quotes, change the regular expressions below to:
selectoptions: r'(?:<option.*?value\s*=\s*)(\'.*?\')'
inputname: r'(?:<input.*?name\s*=\s*\')(.*?)(?:\')'
inputselectors: r'(?:<input.*?id\s*=\s*\')(.*?)(?:\')'
'''
import os
import re
# One report string per input file, in the order the files were visited.
YAMLselections = []

# Folder (relative to the current working directory) holding the saved HTML
# snippets.  Built with os.path.join so the script is not tied to the Windows
# path separator and does not depend on the invalid "\i" escape sequence.
INPUT_DIR = os.path.join('.', 'input')

#compile regular expressions
#select = re.compile(r'<select.*?</select>', re.DOTALL | re.IGNORECASE)
selectoptions = re.compile(r'(?:<option.*?value\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)
selecttext = re.compile(r'(?:<option.*?value.*?>)(.*?)(?:</)', re.IGNORECASE)
inputname = re.compile(r'(?:<input.*?name\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)
inputselectors = re.compile(r'(?:<input.*?id\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)

# (substring of the input name, placeholder written as its value).
# Order matters: the first matching substring wins, so "address2" must be
# tested before the more general "address".
_VALUE_PLACEHOLDERS = (
    ("first", "$NAME_FIRST"),
    ("last", "$NAME_LAST"),
    ("zip5", "$ADDRESS_ZIP5"),
    ("zip4", "$ADDRESS_ZIP4"),
    ("prefix", "$NAME_PREFIX"),
    ("address2", "$ADDRESS_STREET_2"),
    ("address", "$ADDRESS_STREET"),
    ("city", "$ADDRESS_CITY"),
    ("email", "$EMAIL"),
    ("phone", "$PHONE"),
    ("subject", "$SUBJECT"),
    ("message", "$MESSAGE"),
)


def _placeholder_for(name):
    """Return the placeholder guessed from an input *name*, or "" if none fits."""
    for needle, placeholder in _VALUE_PLACEHOLDERS:
        if needle in name:
            return placeholder
    return ""


def _fill_in_section(filename, html):
    """Return the "inputs:" part of the report for one file.

    Input names and input ids are collected independently and paired by
    position, so an <input> lacking one of the two attributes shifts the
    pairing; when the ids run out a warning and a blank entry are emitted.
    """
    names = inputname.findall(html)
    ids = inputselectors.findall(html)
    parts = ['------------\n', filename, '\n', 'inputs:', '\n',
             " ", "- fill_in:\n"]
    for index, name in enumerate(names):
        parts.append(" " + "- name: " + name + "\n")
        if index >= len(ids):
            # More names than ids: same text the original produced from its
            # IndexError handler.
            # NOTE(review): the source indentation was lost; the blank
            # template entry is assumed to belong to this mismatch branch.
            parts.append("\ninputs may not match selectors\n")
            parts.append(" " + "- name: \n" + " " + "selector: \"#\n"
                         + " " + "value: \n" + " " + "required: \n")
            continue
        parts.append(" " + "selector: \"#" + ids[index] + "\"\n")
        parts.append(" " + "value: " + _placeholder_for(name) + "\n")
        # "required" is detected from the input's name only.
        parts.append(" " + "required: "
                     + ("Yes" if "required" in name else "") + "\n")
    return "".join(parts)


def _options_section(filename, html):
    """Return the "options:" part of the report for one file.

    Emits a plain list of option values when every option's text equals its
    value, a text -> value hash when they differ, and values only (with a
    warning) when the number of texts and values found does not agree.
    """
    values = selectoptions.findall(html)
    labels = selecttext.findall(html)
    parts = ['------------\n', filename, '\n', 'options:', '\n']
    if len(values) != len(labels):
        parts.append("\nerror with text, printing option values only\n")
        parts.extend(" " + "- \"" + value + "\"\n" for value in values)
    elif set(values) != set(labels):
        parts.append("\noptions != text, printing hash\n")
        parts.extend(" " + "\"" + label + "\":" + " \"" + value + "\"\n"
                     for label, value in zip(labels, values))
    else:
        parts.extend(" " + "- \"" + value + "\"\n" for value in values)
    return "".join(parts)


def build_report(filename, html):
    """Return the full report (inputs section, then options section).

    filename -- name printed in the section headers (not opened here).
    html     -- text of the HTML snippet to scan.
    """
    return _fill_in_section(filename, html) + _options_section(filename, html)


for root, dirs, files in os.walk(INPUT_DIR):
    for filename in files:
        # Join with ``root`` (not the top folder) so files that os.walk finds
        # in sub-folders can actually be opened.
        with open(os.path.join(root, filename), 'r') as f:
            html = f.read()
        YAMLselections.append(build_report(filename, html))
for x in YAMLselections:
    print(x)