diff --git a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb b/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb deleted file mode 100644 index 45ab51ca..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb +++ /dev/null @@ -1,515 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Cleaning up New York Taxi Cab data\n", - "Copyright (c) Microsoft Corporation. All rights reserved.
\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's use DataPrep to clean and featurize the data which can then be used to predict taxi trip duration. We will not use the For Hire Vehicle (FHV) datasets as they are not really taxi rides and they don't provide drop-off time and geo-coordinates." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import display\n", - "from os import path\n", - "from tempfile import mkdtemp\n", - "\n", - "import pandas as pd\n", - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a quick peek at yellow cab data and green cab data to see what the data looks like. DataPrep supports globing, so you will notice below that we have added a `*` in the path.\n", - "\n", - "*We are using a small sample of the taxi data for this demo. You can find a bigger sample ~6GB by changing \"green-small\" to \"green-sample\" and \"yellow-small\" to \"yellow-sample\" in the paths below.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None)\n", - "\n", - "cache_location = mkdtemp()\n", - "dataset_root = \"https://dprepdata.blob.core.windows.net/demo\"\n", - "\n", - "green_path = \"/\".join([dataset_root, \"green-small/*\"])\n", - "yellow_path = \"/\".join([dataset_root, \"yellow-small/*\"])\n", - "\n", - "print(\"Retrieving data from the following two sources:\")\n", - "print(green_path)\n", - "print(yellow_path)\n", - "\n", - "green_df = dprep.read_csv(path=green_path, header=dprep.PromoteHeadersMode.GROUPED)\n", - "yellow_df = dprep.auto_read_file(path=yellow_path)\n", - "\n", - "display(green_df.head(5))\n", - "display(yellow_df.head(5))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Cleanup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define some shortcut transforms that will apply to all Dataflows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_columns = dprep.ColumnSelector(term=\".*\", use_regex=True)\n", - "drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)]\n", - "useful_columns = [\n", - " \"cost\", \"distance\"\"distance\", \"dropoff_datetime\", \"dropoff_latitude\", \"dropoff_longitude\",\n", - " \"passengers\", \"pickup_datetime\", \"pickup_latitude\", \"pickup_longitude\", \"store_forward\", \"vendor\"\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's first work with the green taxi data and get it into a good shape that then can be combined with the yellow taxi data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (green_df\n", - " .replace_na(columns=all_columns)\n", - " .drop_nulls(*drop_if_all_null)\n", - " .rename_columns(column_pairs={\n", - " \"VendorID\": \"vendor\",\n", - " \"lpep_pickup_datetime\": \"pickup_datetime\",\n", - " \"Lpep_dropoff_datetime\": \"dropoff_datetime\",\n", - " \"lpep_dropoff_datetime\": \"dropoff_datetime\",\n", - " \"Store_and_fwd_flag\": \"store_forward\",\n", - " \"store_and_fwd_flag\": \"store_forward\",\n", - " \"Pickup_longitude\": \"pickup_longitude\",\n", - " \"Pickup_latitude\": \"pickup_latitude\",\n", - " \"Dropoff_longitude\": \"dropoff_longitude\",\n", - " \"Dropoff_latitude\": \"dropoff_latitude\",\n", - " \"Passenger_count\": \"passengers\",\n", - " \"Fare_amount\": \"cost\",\n", - " \"Trip_distance\": \"distance\"\n", - " })\n", - " .keep_columns(columns=useful_columns))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "green_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's do the same thing to yellow taxi data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (yellow_df\n", - " .replace_na(columns=all_columns)\n", - " .drop_nulls(*drop_if_all_null)\n", - " .rename_columns(column_pairs={\n", - " \"vendor_name\": \"vendor\",\n", - " \"VendorID\": \"vendor\",\n", - " \"vendor_id\": \"vendor\",\n", - " \"Trip_Pickup_DateTime\": \"pickup_datetime\",\n", - " \"tpep_pickup_datetime\": \"pickup_datetime\",\n", - " \"Trip_Dropoff_DateTime\": \"dropoff_datetime\",\n", - " \"tpep_dropoff_datetime\": \"dropoff_datetime\",\n", - " \"store_and_forward\": \"store_forward\",\n", - " \"store_and_fwd_flag\": \"store_forward\",\n", - " \"Start_Lon\": \"pickup_longitude\",\n", - " \"Start_Lat\": \"pickup_latitude\",\n", - " \"End_Lon\": \"dropoff_longitude\",\n", - " \"End_Lat\": \"dropoff_latitude\",\n", - " \"Passenger_Count\": \"passengers\",\n", - " \"passenger_count\": \"passengers\",\n", - " \"Fare_Amt\": \"cost\",\n", - " \"fare_amount\": \"cost\",\n", - " \"Trip_Distance\": \"distance\",\n", - " \"trip_distance\": \"distance\"\n", - " })\n", - " .keep_columns(columns=useful_columns))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "yellow_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now append the rows from the `yellow_df` to `green_df`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = green_df.append_rows(dataflows=[yellow_df])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the pickup and drop-off coordinates' data profile to see how the data is distributed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)\n", - "combined_df = combined_df.set_column_types(type_conversions={\n", - " \"pickup_longitude\": decimal_type,\n", - " \"pickup_latitude\": decimal_type,\n", - " \"dropoff_longitude\": decimal_type,\n", - " \"dropoff_latitude\": decimal_type\n", - "})\n", - "combined_df.keep_columns(columns=[\n", - " \"pickup_longitude\", \"pickup_latitude\", \n", - " \"dropoff_longitude\", \"dropoff_latitude\"\n", - "]).get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the data profile, we can see that there are coordinates that are missing and coordinates that are not in New York. Let's filter out coordinates not in the [city border](https://mapmakerapp.com?map=5b60a055a191245990310739f658)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .drop_nulls(\n", - " columns=[\"pickup_longitude\", \"pickup_latitude\", \"dropoff_longitude\", \"dropoff_latitude\"],\n", - " column_relationship=dprep.ColumnRelationship(dprep.ColumnRelationship.ANY)\n", - " ) \n", - " .filter(dprep.f_and(\n", - " dprep.col(\"pickup_longitude\") <= -73.72,\n", - " dprep.col(\"pickup_longitude\") >= -74.09,\n", - " dprep.col(\"pickup_latitude\") <= 40.88,\n", - " dprep.col(\"pickup_latitude\") >= 40.53,\n", - " dprep.col(\"dropoff_longitude\") <= -73.72,\n", - " dprep.col(\"dropoff_longitude\") >= -74.09,\n", - " dprep.col(\"dropoff_latitude\") <= 40.88,\n", - " dprep.col(\"dropoff_latitude\") >= 40.53\n", - " )))\n", - "tmp_df.keep_columns(columns=[\n", - " \"pickup_longitude\", \"pickup_latitude\", \n", - " \"dropoff_longitude\", \"dropoff_latitude\"\n", - "]).get_profile()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the data profile for the `store_forward` column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df.keep_columns(columns='store_forward').get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the data profile of `store_forward` above, we can see that the data is inconsistent and there are missing values. Let's fix them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = combined_df.replace(columns=\"store_forward\", find=\"0\", replace_with=\"N\").fill_nulls(\"store_forward\", \"N\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now split the pick up and drop off datetimes into a date column and a time column. We will use `split_column_by_example` to perform the split. If the `example` parameter of `split_column_by_example` is omitted, we will automatically try to figure out where to split based on the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .split_column_by_example(source_column=\"pickup_datetime\")\n", - " .split_column_by_example(source_column=\"dropoff_datetime\"))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's rename the columns generated by `split_column_by_example` into meaningful names." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .rename_columns(column_pairs={\n", - " \"pickup_datetime_1\": \"pickup_date\",\n", - " \"pickup_datetime_2\": \"pickup_time\",\n", - " \"dropoff_datetime_1\": \"dropoff_date\",\n", - " \"dropoff_datetime_2\": \"dropoff_time\"\n", - " }))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Feature Engineering" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Datetime features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's split the pickup and drop-off date further into day of week, day of month, and month. For pickup and drop-off time columns, we will split it into hour, minute, and second." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .derive_column_by_example(\n", - " source_columns=\"pickup_date\", \n", - " new_column_name=\"pickup_weekday\", \n", - " example_data=[(\"2009-01-04\", \"Sunday\"), (\"2013-08-22\", \"Thursday\")]\n", - " )\n", - " .derive_column_by_example(\n", - " source_columns=\"dropoff_date\",\n", - " new_column_name=\"dropoff_weekday\",\n", - " example_data=[(\"2013-08-22\", \"Thursday\"), (\"2013-11-03\", \"Sunday\")]\n", - " )\n", - " .split_column_by_example(source_column=\"pickup_date\")\n", - " .split_column_by_example(source_column=\"pickup_time\")\n", - " .split_column_by_example(source_column=\"dropoff_date\")\n", - " .split_column_by_example(source_column=\"dropoff_time\")\n", - " .split_column_by_example(source_column=\"pickup_time_1\")\n", - " .split_column_by_example(source_column=\"dropoff_time_1\")\n", - " .drop_columns(columns=[\n", - " \"pickup_date\", \"pickup_time\", \"dropoff_date\", \"dropoff_time\", \n", - " \"pickup_date_1\", \"dropoff_date_1\", \"pickup_time_1\", \"dropoff_time_1\"\n", - " ])\n", - " .rename_columns(column_pairs={\n", - " \"pickup_date_2\": \"pickup_month\",\n", - " \"pickup_date_3\": \"pickup_monthday\",\n", - " \"pickup_time_1_1\": \"pickup_hour\",\n", - " \"pickup_time_1_2\": \"pickup_minute\",\n", - " \"pickup_time_2\": \"pickup_second\",\n", - " \"dropoff_date_2\": \"dropoff_month\",\n", - " \"dropoff_date_3\": \"dropoff_monthday\",\n", - " \"dropoff_time_1_1\": \"dropoff_hour\",\n", - " \"dropoff_time_1_2\": \"dropoff_minute\",\n", - " \"dropoff_time_2\": \"dropoff_second\"\n", - " }))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the data above, we can see that the pickup and drop-off date and time components produced from the transforms above looks good. Let's drop the `pickup_datetime` and `dropoff_datetime` columns as they are no longer needed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = combined_df.drop_columns(columns=[\"pickup_datetime\", \"dropoff_datetime\"])\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now save the transformation steps into a DataPrep package so we can use it to to run on spark." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_path = path.join(mkdtemp(), \"new_york_taxi.dprep\")\n", - "combined_df.save(file_path=dflow_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.png)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file