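"""Split training data into folds for backtesting a forecasting model.

For backtest iteration ``i`` the script writes ``backtest_<i>.csv`` to the
output directory, containing the training data with the last
``step_size * i`` observations of each time series removed.

Source repository: https://github.com/Azure/MachineLearningNotebooks
"""
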
import argparse
import os

import pandas as pd

from azureml.core import Run

# Parse the arguments.
args = {
    "step_size": "--step-size",
    "step_number": "--step-number",
    "time_column_name": "--time-column-name",
    "time_series_id_column_names": "--time-series-id-column-names",
    "out_dir": "--output-dir",
}
parser = argparse.ArgumentParser(description="Parse input arguments.")
for argname, arg in args.items():
    parser.add_argument(arg, dest=argname, required=True)
parsed_args, _ = parser.parse_known_args()
step_number = int(parsed_args.step_number)
step_size = int(parsed_args.step_size)
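
# Example invocation (script name and argument values are illustrative):
#   python <this_script>.py --step-size 7 --step-number 3 \
#       --time-column-name date --time-series-id-column-names ts_id \
#       --output-dir ./backtest_folds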

# Create the working directory to store the temporary csv files.
working_dir = parsed_args.out_dir
os.makedirs(working_dir, exist_ok=True)

# Load the training data attached to this run as a pandas DataFrame.
script_run = Run.get_context()
input_dataset = script_run.input_datasets["training_data"]
X_train = input_dataset.to_pandas_dataframe()

# Split the data: backtest iteration i drops the last step_size * i rows
# of every series, so each fold trains on a progressively shorter history
# (iteration 0 keeps the full training set).
for i in range(step_number):
    file_name = os.path.join(working_dir, "backtest_{}.csv".format(i))
    if parsed_args.time_series_id_column_names:
        # Multi-series data: group by the id column (treated as a single
        # column name) and truncate each series separately.
        dfs = []
        for _, one_series in X_train.groupby([parsed_args.time_series_id_column_names]):
            one_series = one_series.sort_values(
                by=[parsed_args.time_column_name], inplace=False
            )
            dfs.append(one_series.iloc[: len(one_series) - step_size * i])
        pd.concat(dfs, sort=False, ignore_index=True).to_csv(file_name, index=False)
    else:
        # Single-series data: sort by time once and truncate the tail.
        X_train.sort_values(by=[parsed_args.time_column_name], inplace=True)
        X_train.iloc[: len(X_train) - step_size * i].to_csv(file_name, index=False)
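
# The output directory now holds one training fold per iteration:
# backtest_0.csv (full history) through backtest_<step_number - 1>.csv
# (shortest history).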