mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-25 02:14:46 -05:00
when exporting to CSV, attempt to keep the field order as the fields appear in the YAML file
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
# python export_csv.py ../legislators-current.yaml
|
||||
|
||||
import sys, csv
|
||||
from collections import OrderedDict
|
||||
|
||||
from utils import yaml_load
|
||||
|
||||
@@ -31,22 +32,53 @@ def flatten_object(obj, path, ret):
|
||||
return ret
|
||||
|
||||
# Scan through the records recursively to get a list of column names.
|
||||
# Attempt to preserve the field order as found in the YAML file. Since
|
||||
# any field may be absent, no one record can provide the complete field
|
||||
# order. Build the best field order by looking at what each field tends
|
||||
# to be preceded by.
|
||||
fields = set()
|
||||
preceding_keys = dict() # maps keys to a dict of *previous* keys and how often they occurred
|
||||
for record in data:
|
||||
for key in flatten_object(record, "", {}):
|
||||
prev_key = None
|
||||
for key in flatten_object(record, "", OrderedDict()):
|
||||
fields.add(key)
|
||||
|
||||
# Map column indexes to key names.
|
||||
fields = sorted(fields)
|
||||
preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
|
||||
preceding_keys[key][prev_key] += 1
|
||||
prev_key = key
|
||||
|
||||
# Convert to relative frequencies.
|
||||
for k, v in preceding_keys.items():
|
||||
s = float(sum(v.values()))
|
||||
for k2 in v:
|
||||
v[k2] /= s
|
||||
|
||||
# Get a good order for the fields. Greedily add keys from left to right
|
||||
# maximizing the conditional probability that the preceding key would
|
||||
# precede the key on the right.
|
||||
field_order = [None]
|
||||
prev_key = None
|
||||
while len(field_order) < len(fields):
|
||||
# Which key is such that prev_key is its most likely precedessor?
|
||||
# We do it this way (and not what is prev_key's most likely follower)
|
||||
# because we should be using a probability (of sorts) that is
|
||||
# conditional on the key being present. Otherwise we lost infrequent
|
||||
# keys.
|
||||
next_key = max([f for f in fields if f not in field_order], key =
|
||||
lambda k :
|
||||
max(preceding_keys[k].get(pk, 0) for pk in field_order))
|
||||
field_order.append(next_key)
|
||||
prev_key = next_key
|
||||
field_order = field_order[1:] # remove the None at the start
|
||||
|
||||
# Write CSV header.
|
||||
w = csv.writer(sys.stdout)
|
||||
w.writerow(fields)
|
||||
w.writerow(field_order)
|
||||
|
||||
# Write the objects.
|
||||
for record in data:
|
||||
obj = flatten_object(record, "", {})
|
||||
w.writerow([
|
||||
unicode(obj.get(f, "")).encode("utf8")
|
||||
for f in fields
|
||||
])
|
||||
for f in field_order
|
||||
])
|
||||
|
||||
Reference in New Issue
Block a user