Files
MachineLearningNotebooks/how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/utilities.py

74 lines
1.9 KiB
Python

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""Utilities for azureml-contrib-fairness notebooks."""
import arff
from collections import OrderedDict
from contextlib import closing
import gzip
import pandas as pd
from sklearn.utils import Bunch
from time import sleep
def _is_gzip_encoded(_fsrc):
return _fsrc.info().get('Content-Encoding', '') == 'gzip'
_categorical_columns = [
'workclass',
'education',
'marital-status',
'occupation',
'relationship',
'race',
'sex',
'native-country'
]
def fetch_census_dataset():
"""Fetch the Adult Census Dataset
This uses a particular URL for the Adult Census dataset. The code
is a simplified version of fetch_openml() in sklearn.
The data are copied from:
https://openml.org/data/v1/download/1595261.gz
(as of 2021-03-31)
"""
dataset_path = "1595261.gz"
try:
file_stream = gzip.GzipFile(filename=dataset_path, mode='rb')
with closing(file_stream):
def _stream_generator(response):
for line in response:
yield line.decode('utf-8')
stream = _stream_generator(file_stream)
data = arff.load(stream)
except Exception as exc:
raise Exception("Could not load dataset from {} with exception {}".format(dataset_path, exc))
attributes = OrderedDict(data['attributes'])
arff_columns = list(attributes)
raw_df = pd.DataFrame(data=data['data'], columns=arff_columns)
target_column_name = 'class'
target = raw_df.pop(target_column_name)
for col_name in _categorical_columns:
dtype = pd.api.types.CategoricalDtype(attributes[col_name])
raw_df[col_name] = raw_df[col_name].astype(dtype, copy=False)
result = Bunch()
result.data = raw_df
result.target = target
return result