Files

39 lines
1.4 KiB
Python

import argparse
import os
import azureml.core
from azureml.core import Run
from sklearn.model_selection import train_test_split
def write_output(df, path):
    """Write ``df`` as ``processed.parquet`` inside the directory ``path``.

    The directory (including any missing parents) is created first; an
    already existing directory is not an error.

    Args:
        df: Object exposing ``to_parquet(file_path)``, e.g. a pandas DataFrame.
        path: Directory that will contain ``processed.parquet``.
    """
    os.makedirs(path, exist_ok=True)
    print("%s created" % path)
    # Join with os.path.join instead of concatenating "/" by hand, so a
    # trailing separator on ``path`` does not yield a doubled separator and
    # path-like objects are accepted as well as plain strings.
    df.to_parquet(os.path.join(path, "processed.parquet"))
print("Split the data into train and test")

# Load the 'transformed_data' input dataset attached to the current run and
# materialize it as a pandas DataFrame.
run = Run.get_context()
transformed_data = run.input_datasets['transformed_data']
transformed_df = transformed_data.to_pandas_dataframe()

# Both output locations are optional on the command line; argparse leaves an
# omitted one set to None.
parser = argparse.ArgumentParser("split")
parser.add_argument("--output_split_train", type=str, help="output split train data")
parser.add_argument("--output_split_test", type=str, help="output split test data")
args = parser.parse_args()

print("Argument 1(output training data split path): %s" % args.output_split_train)
print("Argument 2(output test data split path): %s" % args.output_split_test)

# This function splits the transformed data into train and test sets
# (20% held out for test); the fixed random_state keeps the split reproducible.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail
output_split_train, output_split_test = train_test_split(transformed_df, test_size=0.2, random_state=223)

# Discard the index carried over from the source frame so each split gets a
# fresh 0-based index.
output_split_train.reset_index(inplace=True, drop=True)
output_split_test.reset_index(inplace=True, drop=True)

# Write each split only when its own output path was supplied. The earlier
# guard, `not (train is None and test is None)`, was also true when exactly
# one path was missing and then handed None to write_output, which fails in
# os.makedirs.
if args.output_split_train is not None:
    write_output(output_split_train, args.output_split_train)
if args.output_split_test is not None:
    write_output(output_split_test, args.output_split_test)