Files
2019-06-19 18:55:32 -04:00

32 lines
1.4 KiB
Python

import argparse
import os
import azureml.dataprep as dprep
import azureml.core
# Separator between column names inside the --useful_columns argument: a
# literal backslash followed by a semicolon. This is the same two-character
# value the previous "\;" literal evaluated to, written with an explicit
# escape so it no longer triggers Python's invalid-escape-sequence warning.
COLUMN_SEPARATOR = "\\;"


def parse_column_list(raw):
    """Split a serialized column list into clean column names.

    The surrounding square brackets are removed, the remainder is split on
    COLUMN_SEPARATOR, and each piece is stripped of surrounding whitespace
    and single quotes.

    Args:
        raw: The --useful_columns string, e.g. "['a'\\; 'b']" (brackets and
            quotes are optional).

    Returns:
        A list of column-name strings, in the order given. An empty input
        yields a list containing one empty string, as before.
    """
    return [
        piece.strip().strip("'")
        for piece in raw.strip("[]").split(COLUMN_SEPARATOR)
    ]


def main():
    """Keep only the requested columns of the prepared data and write them out.

    Command-line arguments:
        --input_featurization: directory whose 'part-*' files are read as CSV.
        --useful_columns: serialized list of the column names to keep.
        --output_featurization: optional directory for the resulting CSV;
            when omitted nothing is written.
    """
    print("Extracts important features from prepared data")

    parser = argparse.ArgumentParser("featurization")
    # The input path and the column list are dereferenced unconditionally
    # below, so let argparse reject a missing value with a usage message
    # instead of failing later on None.
    parser.add_argument("--input_featurization", type=str, required=True, help="input featurization")
    parser.add_argument("--useful_columns", type=str, required=True, help="columns to use")
    parser.add_argument("--output_featurization", type=str, help="output featurization")
    args = parser.parse_args()

    print("Argument 1(input training data path): %s" % args.input_featurization)
    # Logs the raw split (quotes and spaces intact), not the cleaned names.
    print("Argument 2(column features to use): %s" % str(args.useful_columns.strip("[]").split(COLUMN_SEPARATOR)))
    print("Argument 3:(output featurized training data path) %s" % args.output_featurization)

    dflow_prepared = dprep.read_csv(args.input_featurization + '/part-*')

    # Extract the features that are useful for training.
    # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail
    useful_columns = parse_column_list(args.useful_columns)
    dflow = dflow_prepared.keep_columns(useful_columns)

    # Writing is optional: without an output directory there is nowhere to
    # put the result, so the dataflow is never executed.
    if args.output_featurization is not None:
        os.makedirs(args.output_featurization, exist_ok=True)
        print("%s created" % args.output_featurization)
        write_df = dflow.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_featurization))
        write_df.run_local()


if __name__ == "__main__":
    main()