mirror of
https://github.com/unitedstates/contact-congress.git
synced 2025-12-19 17:37:24 -05:00
83 lines
3.5 KiB
Python
83 lines
3.5 KiB
Python
r'''
HOW TO USE:

This file:
    Extracts the option "value" attributes, and creates a hash if the option's text does not match the option's value
    Extracts the name + id of every input, and outputs a fill_in list with the value left blank (note that this will output for ALL <input> tags)

You should try to give this file inputs containing only the forms that you want to extract from.

Copy this file to a separate location (so as not to send random files
to github)

Create a folder named "input" in the same location

Copy the section of the HTML containing the form options that you
want to extract (it does not have to be exact; you can even copy and paste
the entire HTML source code if you want, but there is less chance of
errors if you only copy/paste the section containing the form)

Paste into a plain text file (NOT a Word document) and save it in the "input" folder.

Run the .py file with Python 3 (written for 3.3), and copy/paste the resulting output.

It is recommended to check the output for any errors. If the attributes are
in single quotes, you can change the regular expressions below to
    selectoptions: r'(?:<option.*?value\s*=\s*\')(.*?)(?:\')'
    inputname: r'(?:<input.*?name\s*=\s*\')(.*?)(?:\')'
    inputselectors: r'(?:<input.*?id\s*=\s*\')(.*?)(?:\')'
'''
|
|
|
|
import os
|
|
import re
|
|
|
|
# One report string per processed input file, in the order the files were read.
YAMLselections = []

#compile regular expressions
#select = re.compile(r'<select.*?</select>', re.DOTALL | re.IGNORECASE)
selectoptions = re.compile(r'(?:<option.*?value\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)
selecttext = re.compile(r'(?:<option.*?value.*?>)(.*?)(?:</)', re.IGNORECASE)
inputname = re.compile(r'(?:<input.*?name\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)
inputselectors = re.compile(r'(?:<input.*?id\s*=\s*\")(.*?)(?:\")', re.IGNORECASE)

# Folder scanned for saved HTML snippets. Built with os.path.join so the
# script also works on platforms where "\" is not the path separator.
INPUT_DIR = os.path.join('.', 'input')

# NOTE(review): a single space is the indent found in this copy of the
# script; confirm it matches the layout expected where the output is pasted.
INDENT = " "


def build_report(filename, html):
    '''Return the text report for one HTML snippet.

    The report has two sections, each introduced by a dashed rule and
    *filename*:

    * ``inputs:`` - a ``fill_in`` list with one entry per ``<input>`` tag
      that has a double-quoted ``name`` attribute. The selector is taken
      from the double-quoted ``id`` attributes, paired by position.
    * ``options:`` - the ``<option>`` values, or a text/value hash when the
      option texts differ from the option values.

    filename -- label printed in both section headers.
    html     -- the HTML source to scan.
    '''
    names = inputname.findall(html)
    ids = inputselectors.findall(html)

    parts = ['------------\n' + filename + '\n' + 'inputs:' + '\n',
             INDENT + "- fill_in:\n"]
    for position, name in enumerate(names):
        parts.append(INDENT + "- name: " + name + "\n")
        if position >= len(ids):
            # Fewer ids than names: names and ids are paired purely by
            # position, so flag the entry instead of guessing a selector.
            parts.append("\ninputs may not match selectors\n")
            continue
        parts.append(INDENT + "selector: \"#" + ids[position] + "\"\n")
        parts.append(INDENT + "value: \n")
        parts.append(INDENT + "required: \n")

    values = selectoptions.findall(html)
    labels = selecttext.findall(html)

    parts.append('------------\n' + filename + '\n' + 'options:' + '\n')
    if len(values) != len(labels):
        # Texts and values cannot be paired up reliably; list values only.
        parts.append("\nerror with text, printing option values only\n")
        parts.extend(INDENT + "- \"" + value + "\"\n" for value in values)
    elif set(values) != set(labels):
        # Displayed text differs from the submitted value: emit text -> value.
        parts.append("\noptions != text, printing hash\n")
        parts.extend(
            INDENT + "\"" + label + "\":" + " \"" + value + "\"\n"
            for label, value in zip(labels, values)
        )
    else:
        parts.extend(INDENT + "- \"" + value + "\"\n" for value in values)

    return "".join(parts)


def main(input_dir=INPUT_DIR):
    '''Build a report for every file under *input_dir* and print them all.

    Each report is appended to the module-level ``YAMLselections`` list
    before the whole list is printed.

    input_dir -- folder to walk; defaults to ``input`` in the current
                 working directory.
    '''
    for root, _dirs, filenames in os.walk(input_dir):
        for filename in filenames:
            # Join with root (not the top folder) so files that os.walk
            # finds in sub-folders can actually be opened.
            with open(os.path.join(root, filename), 'r') as handle:
                html = handle.read()
            YAMLselections.append(build_report(filename, html))

    for report in YAMLselections:
        print(report)


if __name__ == "__main__":
    main()
|