when exporting to CSV, attempt to keep the field order as the fields appear in the YAML file

This commit is contained in:
Joshua Tauberer
2013-11-18 13:49:59 -05:00
parent 26a37fdad5
commit 2ef60a2b7f

View File

@@ -4,6 +4,7 @@
# python export_csv.py ../legislators-current.yaml
import sys, csv
from collections import OrderedDict
from utils import yaml_load
@@ -31,22 +32,53 @@ def flatten_object(obj, path, ret):
return ret
# Scan through the records recursively to get a list of column names.
# Attempt to preserve the field order as found in the YAML file. Since
# any field may be absent, no one record can provide the complete field
# order. Build the best field order by looking at what each field tends
# to be preceded by.
fields = set()
preceding_keys = dict() # maps keys to a dict of *previous* keys and how often they occurred
for record in data:
for key in flatten_object(record, "", {}):
prev_key = None
for key in flatten_object(record, "", OrderedDict()):
fields.add(key)
# Map column indexes to key names.
fields = sorted(fields)
preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
preceding_keys[key][prev_key] += 1
prev_key = key
# Normalize every key's predecessor counts so that they sum to 1.0,
# turning raw occurrence counts into relative frequencies.
for counts in preceding_keys.values():
    total = float(sum(counts.values()))
    for predecessor in counts:
        counts[predecessor] /= total
# Get a good order for the fields. Greedily add keys from left to right,
# each time choosing the not-yet-placed key that is most likely to be
# preceded by one of the keys that has already been placed.
#
# field_order starts with a None sentinel: None is the predecessor that
# the scan above records for the first key of every record, so it stands
# for "start of record".
field_order = [None]
# The sentinel occupies one slot, so the list has to grow to
# len(fields) + 1 entries before every field has been placed. Stopping at
# len(fields) would silently drop the last remaining column.
while len(field_order) < len(fields) + 1:
    # Score each remaining key by the highest relative frequency with
    # which any already-placed key (or the start-of-record sentinel) is
    # its predecessor. We do it this way (and not by asking what the most
    # likely follower of an already-placed key is) because we should be
    # using a probability (of sorts) that is conditional on the key being
    # present. Otherwise we lose infrequent keys.
    next_key = max(
        [f for f in fields if f not in field_order],
        key=lambda k: max(
            preceding_keys[k].get(pk, 0) for pk in field_order))
    field_order.append(next_key)
field_order = field_order[1:]  # remove the None sentinel at the start
# Write CSV header.
w = csv.writer(sys.stdout)
w.writerow(fields)
w.writerow(field_order)
# Write the objects.
for record in data:
obj = flatten_object(record, "", {})
w.writerow([
unicode(obj.get(f, "")).encode("utf8")
for f in fields
])
for f in field_order
])