Files

49 lines
1.8 KiB
Python

import argparse
import os
import pandas as pd
from azureml.core import Run
print("Replace undefined values to relavant values and rename columns to meaningful names")
run = Run.get_context()
# To learn more about how to access dataset in your script, please
# see https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets.
filtered_data = run.input_datasets['filtered_data']
combined_converted_df = filtered_data.to_pandas_dataframe()
parser = argparse.ArgumentParser("normalize")
parser.add_argument("--output_normalize", type=str, help="replaced undefined values and renamed columns")
args = parser.parse_args()
print("Argument (output normalized taxi data path): %s" % args.output_normalize)
# These functions replace undefined values and rename to use meaningful names.
replaced_stfor_vals_df = (combined_converted_df.replace({"store_forward": "0"}, {"store_forward": "N"})
.fillna({"store_forward": "N"}))
replaced_distance_vals_df = (replaced_stfor_vals_df.replace({"distance": ".00"}, {"distance": 0})
.fillna({"distance": 0}))
normalized_df = replaced_distance_vals_df.astype({"distance": 'float64'})
temp = pd.DatetimeIndex(normalized_df["pickup_datetime"])
normalized_df["pickup_date"] = temp.date
normalized_df["pickup_time"] = temp.time
temp = pd.DatetimeIndex(normalized_df["dropoff_datetime"])
normalized_df["dropoff_date"] = temp.date
normalized_df["dropoff_time"] = temp.time
del normalized_df["pickup_datetime"]
del normalized_df["dropoff_datetime"]
normalized_df.reset_index(inplace=True, drop=True)
if not (args.output_normalize is None):
os.makedirs(args.output_normalize, exist_ok=True)
print("%s created" % args.output_normalize)
path = args.output_normalize + "/processed.parquet"
write_df = normalized_df.to_parquet(path)