MachineLearningNotebooks/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/cleanse.py

# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license.

import argparse
import os
from azureml.core import Run


def get_dict(dict_str):
    pairs = dict_str.strip("{}").split("\;")
    new_dict = {}
    for pair in pairs:
        key, value = pair.strip().split(":")
        new_dict[key.strip().strip("'")] = value.strip().strip("'")

    return new_dict


print("Cleans the input data")

# Get the input green_taxi_data. To learn more about how to access dataset in your script, please
# see https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets.
run = Run.get_context()
raw_data = run.input_datasets["raw_data"]


parser = argparse.ArgumentParser("cleanse")
parser.add_argument("--output_cleanse", type=str, help="cleaned taxi data directory")
parser.add_argument("--useful_columns", type=str, help="useful columns to keep")
parser.add_argument("--columns", type=str, help="rename column pattern")

args = parser.parse_args()

print("Argument 1(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;")))
print("Argument 2(columns renaming mapping): %s" % str(args.columns.strip("{}").split("\;")))
print("Argument 3(output cleansed taxi data path): %s" % args.output_cleanse)

# These functions ensure that null data is removed from the dataset,
# which will help increase machine learning model accuracy.

useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;")]
columns = get_dict(args.columns)

new_df = (raw_data.to_pandas_dataframe()
          .dropna(how='all')
          .rename(columns=columns))[useful_columns]

new_df.reset_index(inplace=True, drop=True)

if not (args.output_cleanse is None):
    os.makedirs(args.output_cleanse, exist_ok=True)
    print("%s created" % args.output_cleanse)
    path = args.output_cleanse + "/processed.parquet"
    write_df = new_df.to_parquet(path)