Files

38 lines
1002 B
Python

from pathlib import Path
from azureml.core import Run
import argparse
def main(args):
output = Path(args.output)
output.mkdir(parents=True, exist_ok=True)
run_context = Run.get_context()
dataset = run_context.input_datasets["train_10_models"]
df = dataset.to_pandas_dataframe()
# Drop the column "Revenue" from the dataset to avoid information leak as
# "Quantity" = "Revenue" / "Price". Please modify the logic based on your data.
drop_column_name = "Revenue"
if drop_column_name in df.columns:
df.drop(drop_column_name, axis=1, inplace=True)
# Apply any data pre-processing techniques here
df.to_parquet(output / "data_prepared_result.parquet", compression=None)
def my_parse_args():
parser = argparse.ArgumentParser("Test")
parser.add_argument("--input", type=str)
parser.add_argument("--output", type=str)
args = parser.parse_args()
return args
if __name__ == "__main__":
args = my_parse_args()
main(args)