Files
congress-legislators/scripts/export_csv.py

89 lines
2.7 KiB
Python
Executable File

# Converts the specified YAML file to an equivalent-ish CSV file
# (on standard output).
#
# python export_csv.py ../legislators-current.yaml
import sys, csv
from collections import OrderedDict
from utils import yaml_load
def run():
    """Convert the YAML file named on the command line to CSV on stdout.

    The path is read from sys.argv[1] and loaded with yaml_load. Every
    record is flattened into "parent__child" columns (list values are
    left out), a column order is inferred from the key order seen in the
    records, and a header row plus one row per record is written to
    sys.stdout. A field that a record lacks is written as an empty
    string.

    When no path is given, a usage message is printed and the process
    exits with status 0.
    """
    if len(sys.argv) < 2:
        print("Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv")
        sys.exit(0)

    data = yaml_load(sys.argv[1])

    ###############################################

    def flatten_object(obj, path, ret):
        """Flatten the nested mapping obj into the flat dictionary ret.

        Nested keys are joined with "__": { "x": { "y": 123 } } is turned
        into { "x__y": 123 }. path is the already-joined prefix for the
        keys of obj (empty at the top level, otherwise ending in "__").
        List values are skipped. Returns ret.
        """
        for k, v in obj.items():
            if isinstance(v, dict):
                # path already carries its trailing separator, so only the
                # separator that follows k is appended. Adding another one
                # in front would yield "x____y__z" for doubly nested keys.
                flatten_object(v, path + k + "__", ret)
            elif isinstance(v, list):
                # don't peek inside lists
                pass
            else:
                ret[path + k] = v
        return ret

    # Scan through the records recursively to get a list of column names.
    # Attempt to preserve the field order as found in the YAML file. Since
    # any field may be absent, no one record can provide the complete field
    # order. Build the best field order by looking at what each field tends
    # to be preceded by.
    fields = []  # column names in first-seen order, so ties resolve deterministically
    preceding_keys = dict()  # maps keys to a dict of *previous* keys and how often they occurred
    for record in data:
        prev_key = None  # None stands for "start of record"
        for key in flatten_object(record, "", OrderedDict()):
            if key not in preceding_keys:
                fields.append(key)
            counts = preceding_keys.setdefault(key, {})
            counts[prev_key] = counts.get(prev_key, 0) + 1
            prev_key = key

    # Convert to relative frequencies.
    for counts in preceding_keys.values():
        total = float(sum(counts.values()))
        for pk in counts:
            counts[pk] /= total

    # Get a good order for the fields. Greedily add keys from left to right
    # maximizing the conditional probability that an already-placed key
    # would precede the key on the right.
    field_order = [None]  # the leading None lets "start of record" count as a predecessor
    remaining = list(fields)
    # Loop until every field is placed. Comparing len(field_order) with
    # len(fields) would stop one field early because of the None sentinel.
    while remaining:
        # Which key has an already-placed key as its most likely predecessor?
        # We do it this way (and not what is the placed key's most likely
        # follower) because we should be using a probability (of sorts) that
        # is conditional on the key being present. Otherwise we would lose
        # infrequent keys.
        next_key = max(
            remaining,
            key=lambda k: max(preceding_keys[k].get(pk, 0) for pk in field_order))
        field_order.append(next_key)
        remaining.remove(next_key)
    field_order = field_order[1:]  # remove the None at the start

    # Write CSV header.
    w = csv.writer(sys.stdout)
    w.writerow(field_order)

    # Write the objects.
    for record in data:
        obj = flatten_object(record, "", {})
        w.writerow([obj.get(f, "") for f in field_order])
# Only perform the export when executed as a script, not when imported.
if __name__ == "__main__":
    run()