# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Utilities for azureml-contrib-fairness notebooks."""

from collections import OrderedDict
from contextlib import closing
import gzip
from time import sleep

import arff
import pandas as pd
from sklearn.utils import Bunch


def _is_gzip_encoded(_fsrc):
    """Return True when the response reports a gzip ``Content-Encoding``.

    ``_fsrc`` must expose ``info()`` returning a mapping-like object with a
    ``get`` method (presumably a urllib response -- confirm against callers).
    A missing header is treated as "not gzip".
    """
    return _fsrc.info().get('Content-Encoding', '') == 'gzip'


# Columns of the census data that are cast to pandas categorical dtype
# by fetch_census_dataset().
_categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country'
]


def fetch_census_dataset():
    """Fetch the Adult Census Dataset.

    This loads a gzip-compressed ARFF copy of the Adult Census dataset from
    the file ``1595261.gz``, resolved relative to the current working
    directory. The code is a simplified version of fetch_openml() in sklearn.

    The data are copied from:
    https://openml.org/data/v1/download/1595261.gz
    (as of 2021-03-31)

    Returns:
        sklearn.utils.Bunch: ``data`` holds a DataFrame of every ARFF
        attribute except ``class``, with the columns listed in
        ``_categorical_columns`` cast to categorical dtype using the
        categories declared in the ARFF header; ``target`` holds the
        ``class`` column as a Series.

    Raises:
        Exception: if the file cannot be opened, decompressed, decoded or
            parsed. The original error is chained as ``__cause__``.
    """
    dataset_path = "1595261.gz"

    try:
        file_stream = gzip.GzipFile(filename=dataset_path, mode='rb')
        with closing(file_stream):
            # The gzip stream yields bytes; the ARFF parser is fed text lines.
            stream = (line.decode('utf-8') for line in file_stream)
            # Parsing must finish inside the ``with`` block because the
            # generator reads lazily from the still-open file.
            data = arff.load(stream)
    except Exception as exc:
        raise Exception(
            "Could not load dataset from {} with exception {}".format(dataset_path, exc)
        ) from exc

    # data['attributes'] is a sequence of (name, type) pairs; keep the
    # declared column order.
    attributes = OrderedDict(data['attributes'])
    arff_columns = list(attributes)

    raw_df = pd.DataFrame(data=data['data'], columns=arff_columns)

    target_column_name = 'class'
    target = raw_df.pop(target_column_name)

    for col_name in _categorical_columns:
        # For these columns the attribute value is used as the category list.
        dtype = pd.api.types.CategoricalDtype(attributes[col_name])
        raw_df[col_name] = raw_df[col_name].astype(dtype, copy=False)

    result = Bunch()
    result.data = raw_df
    result.target = target

    return result