# congress-legislators/scripts/export_csv.py

# Converts the specified YAML file to an equivalent-ish CSV file
# (on standard output).
#
# python export_csv.py ../legislators-current.yaml

import sys, csv
from collections import OrderedDict

from utils import yaml_load

def _flatten_object(obj, path, ret):
	"""Flatten the nested dict obj into the flat dict ret and return ret.

	Keys of nested dicts are joined with "__": { "x": { "y": 123 } } is
	turned into { "x__y": 123 } and { "a": { "b": { "c": 1 } } } into
	{ "a__b__c": 1 }. Values that are lists are skipped entirely.

	path is the prefix put in front of every key written by this call:
	"" at the top level, otherwise the parent keys joined by "__" and
	followed by a trailing "__".
	"""
	for k, v in obj.items():
		if isinstance(v, dict):
			# path already ends with its own separator, so only the
			# separator that follows k has to be appended here.
			_flatten_object(v, path + k + "__", ret)
		elif isinstance(v, list):
			# don't peek inside lists
			continue
		else:
			ret[path + k] = v
	return ret


def _field_order(data):
	"""Return every flattened column name found in data, in a good order.

	Attempts to preserve the field order as found in the records. Since
	any field may be absent, no one record can provide the complete field
	order, so the order is built from what each field tends to be
	preceded by.
	"""
	# Scan the records. fields keeps the column names in order of first
	# appearance (a list rather than a set, so that ties below are broken
	# the same way on every run).
	fields = []
	preceding_keys = {} # maps keys to a dict of *previous* keys and how often they occurred
	for record in data:
		prev_key = None # None stands for "start of the record"
		for key in _flatten_object(record, "", OrderedDict()):
			if key not in preceding_keys:
				fields.append(key)
				preceding_keys[key] = {}
			counts = preceding_keys[key]
			counts[prev_key] = counts.get(prev_key, 0) + 1
			prev_key = key

	# Convert to relative frequencies.
	for counts in preceding_keys.values():
		total = float(sum(counts.values()))
		for pk in counts:
			counts[pk] /= total

	# Greedily add keys from left to right. At each step pick the
	# not-yet-placed key whose likelihood of being directly preceded by
	# one of the already-placed keys (or by the start of the record) is
	# highest. We score it this way (and not by which key most often
	# follows a placed key) because the score should be conditional on
	# the key being present. Otherwise we would lose infrequent keys.
	placed = [None] # seeded with the start-of-record marker
	remaining = list(fields)
	while remaining: # runs until *every* field has been placed
		next_key = max(remaining, key =
			lambda k :
				max(preceding_keys[k].get(pk, 0) for pk in placed))
		placed.append(next_key)
		remaining.remove(next_key)
	return placed[1:] # remove the None at the start


def run(data=None, out=None):
	"""Write records as CSV: a header row, then one row per record.

	Each record is flattened with _flatten_object, so nested dicts become
	"parent__child" columns and list values are left out. A column that a
	record does not have is written as an empty string.

	data -- list of (possibly nested) dicts to export. It is iterated
	        twice. When omitted, the YAML file named by sys.argv[1] is
	        loaded with yaml_load; if no file name was given either, a
	        usage message is printed and the process exits.
	out  -- text stream that receives the CSV. Defaults to sys.stdout.
	"""
	if data is None:
		if len(sys.argv) < 2:
			print("Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv")
			sys.exit(0)
		data = yaml_load(sys.argv[1])
	if out is None:
		out = sys.stdout

	field_order = _field_order(data)

	# Write CSV header.
	w = csv.writer(out)
	w.writerow(field_order)

	# Write the objects.
	for record in data:
		obj = _flatten_object(record, "", {})
		w.writerow([obj.get(f, "") for f in field_order])

if __name__ == "__main__":
	# Export only when executed as a script, not when imported.
	run()