mirror of
https://github.com/unitedstates/contact-congress.git
synced 2025-12-19 17:37:24 -05:00
.py file for extracting the options from the dropdown lists
It's written in Python 3.3. I coded it really quickly, but it's been really helpful and I thought I might share it. To use it, create a folder named "input" in the same directory as the .py file, then save the HTML source of the page (just that page, not the entire website) into the input folder. Then run the .py file, and it will print to the shell a list of all of the dropdown menu options. The output needs a bit of formatting, but it's certainly better than manually typing all of them :)
This commit is contained in:
27
support/extractoptions.py
Normal file
27
support/extractoptions.py
Normal file
@@ -0,0 +1,27 @@
|
||||
#regular expression: (?:<option value\s*=\s*)([\'\"].*?[\'\"])
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
# Working folders, relative to the current directory. They are built with
# os.path.join so the script runs on any OS and no longer relies on the
# invalid "\o" / "\i" escape sequences of the old string literals.
INPUT_DIR = os.path.join('.', 'input')
OUTPUT_DIR = os.path.join('.', 'output')

os.makedirs(OUTPUT_DIR, exist_ok=True)

#compile regular expressions
# One whole <select>...</select> element; DOTALL lets "." cross line breaks.
select = re.compile(r'<select.*?</select>', re.DOTALL)
# The quoted value (quotes included) of each <option value=...> attribute.
selectoptions = re.compile(r'(?:<option value\s*=\s*)([\'\"].*?[\'\"])')


def format_selection(filename, options):
    """Return the printable text block for one dropdown.

    The block is a separator line, the file name, an ``options:`` line and
    then one `` - <option>`` line per entry of *options*, each line ending
    in a newline.
    """
    lines = ['------------', filename, 'options:']
    lines.extend(' - ' + option for option in options)
    return '\n'.join(lines) + '\n'


def extract_selections(filename, html):
    """Return one formatted block per <select> element found in *html*."""
    return [
        format_selection(filename, selectoptions.findall(selector))
        for selector in select.findall(html)
    ]


def collect_selections(input_dir):
    """Walk *input_dir* and return the formatted blocks of every file in it."""
    selections = []
    for root, _dirs, files in os.walk(input_dir):
        for filename in files:
            # Join with root, not the top-level folder, so files that live in
            # subfolders can be opened too. errors='replace' keeps a stray
            # undecodable byte from aborting the whole run.
            path = os.path.join(root, filename)
            with open(path, 'r', errors='replace') as f:
                html = f.read()
            selections.extend(extract_selections(filename, html))
    return selections


YAMLselections = collect_selections(INPUT_DIR)

for x in YAMLselections:
    print(x)
|
||||
Reference in New Issue
Block a user