Update aml-pipelines-use-databricks-as-compute-target.ipynb

add "how to" guidance for common issue in DatabricksStep
2021-09-13 09:32:32 -07:00 · 2021-09-10 13:20:03 -07:00 · 2021-09-10 12:51:41 -07:00
5 changed files with 995 additions and 921 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1 @@
+{}
--- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-databricks-as-compute-target.ipynb
+++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-databricks-as-compute-target.ipynb
--- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/normalize.py
+++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/normalize.py
@@ -28,21 +28,13 @@ replaced_distance_vals_df = (replaced_stfor_vals_df.replace({"distance": ".00"},

 normalized_df = replaced_distance_vals_df.astype({"distance": 'float64'})

-
-def time_to_us(time_str):
-    hh, mm , ss = map(int, time_str.split(':'))
-    return (ss + 60 * (mm + 60 * hh)) * (10**6)
-
-
 temp = pd.DatetimeIndex(normalized_df["pickup_datetime"])
-normalized_df["pickup_date"] = pd.to_datetime(temp.date)
+normalized_df["pickup_date"] = temp.date
 normalized_df["pickup_time"] = temp.time
-normalized_df["pickup_time"] = normalized_df["pickup_time"].apply(lambda x: time_to_us(str(x)))

 temp = pd.DatetimeIndex(normalized_df["dropoff_datetime"])
-normalized_df["dropoff_date"] = pd.to_datetime(temp.date)
+normalized_df["dropoff_date"] = temp.date
 normalized_df["dropoff_time"] = temp.time
-normalized_df["dropoff_time"] = normalized_df["dropoff_time"].apply(lambda x: time_to_us(str(x)))

 del normalized_df["pickup_datetime"]
 del normalized_df["dropoff_datetime"]
--- a/how-to-use-azureml/work-with-data/datasets-tutorial/pipeline-with-datasets/pipeline-for-image-classification.ipynb
+++ b/how-to-use-azureml/work-with-data/datasets-tutorial/pipeline-with-datasets/pipeline-for-image-classification.ipynb
@@ -272,8 +272,7 @@
        "dependencies:\n",
        "- python=3.6.2\n",
        "- pip:\n",
-        "  - azureml-core\n",
-        "  - azureml-dataset-runtime\n",
+        "  - azureml-defaults\n",
        "  - keras==2.4.3\n",
        "  - tensorflow==2.4.3\n",
        "  - numpy\n",
--- a/tutorials/regression-automl-nyc-taxi-data/regression-automated-ml.ipynb
+++ b/tutorials/regression-automl-nyc-taxi-data/regression-automated-ml.ipynb
@@ -101,7 +101,9 @@
      "cell_type": "markdown",
      "metadata": {},
      "source": [
-        "Remove some of the columns that you won't need for training or additional feature building.  Automate machine learning will automatically handle time-based features such as lpepPickupDatetime."
+        "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day, and will allow the model to factor in time-based seasonality. \n",
+        "\n",
+        "Use the `apply()` function on the dataframe to iteratively apply the `build_time_features()` function to each row in the taxi data."
      ]
    },
    {
@@ -110,7 +112,33 @@
      "metadata": {},
      "outputs": [],
      "source": [
-        "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n",
+        "def build_time_features(vector):\n",
+        "    \"\"\"Return month, day of month, weekday and hour of the row's first (pickup datetime) value as a Series.\"\"\"\n",
+        "    pickup_datetime = vector.iloc[0]  # explicit positional lookup; vector[0] on a label-indexed row is deprecated\n",
+        "    month_num = pickup_datetime.month\n",
+        "    day_of_month = pickup_datetime.day\n",
+        "    day_of_week = pickup_datetime.weekday()\n",
+        "    hour_of_day = pickup_datetime.hour\n",
+        "    return pd.Series((month_num, day_of_month, day_of_week, hour_of_day))\n",
+        "\n",
+        "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n",
+        "green_taxi_df.head(10)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Remove some of the columns that you won't need for training or additional feature building."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "columns_to_remove = [\"lpepPickupDatetime\", \"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n",
        "                     \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n",
        "                     \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"\n",
        "                    ]\n",
Author	SHA1	Message	Date
Sharmeelee Bijlani	279a1ba2c0	Update aml-pipelines-use-databricks-as-compute-target.ipynb	2021-09-13 09:32:32 -07:00
Sharmeelee Bijlani	8233533dcd	Update aml-pipelines-use-databricks-as-compute-target.ipynb	2021-09-10 13:20:03 -07:00
Sharmeelee Bijlani	89f23e6d50	add "how to" guidance for common issue in DatabricksStep	2021-09-10 12:51:41 -07:00