Mirror of https://github.com/Azure/MachineLearningNotebooks.git, synced 2025-12-20 09:37:04 -05:00

update samples from Release-168 as a part of SDK release
@@ -330,7 +330,7 @@
 "- **inputs:** List of input connections for data consumed by this step. Fetch this inside the notebook using dbutils.widgets.get(\"input\")\n",
 "- **outputs:** List of output port definitions for outputs produced by this step. Fetch this inside the notebook using dbutils.widgets.get(\"output\")\n",
 "- **existing_cluster_id:** Cluster ID of an existing Interactive cluster on the Databricks workspace. If you are providing this, do not provide any of the parameters below that are used to create a new cluster such as spark_version, node_type, etc.\n",
-"- **spark_version:** Version of spark for the databricks run cluster. default value: 4.0.x-scala2.11\n",
+"- **spark_version:** Version of spark for the databricks run cluster. You can refer to [DataBricks runtime version](https://learn.microsoft.com/azure/databricks/dev-tools/api/#--runtime-version-strings) to specify the spark version. default value: 4.0.x-scala2.11\n",
 "- **node_type:** Azure vm node types for the databricks run cluster. default value: Standard_D3_v2\n",
 "- **num_workers:** Specifies a static number of workers for the databricks run cluster\n",
 "- **min_workers:** Specifies a min number of workers to use for auto-scaling the databricks run cluster\n",
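For context, the parameters documented in the hunk above correspond to the DatabricksStep constructor arguments in azureml-pipeline-steps (SDK v1). Below is a minimal sketch of how they fit together, assuming an already-attached Databricks compute named "mydbcompute" and a notebook path that exists in the Databricks workspace (both names are hypothetical):

from azureml.core import Workspace, Datastore
from azureml.core.compute import DatabricksCompute
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import DatabricksStep

ws = Workspace.from_config()
def_blob_store = Datastore.get(ws, "workspaceblobstore")
databricks_compute = DatabricksCompute(workspace=ws, name="mydbcompute")  # hypothetical attach name

# Output port; the Databricks notebook reads its path via dbutils.widgets.get(...)
step_output = PipelineData("step_output", datastore=def_blob_store)

# New job cluster: spark_version, node_type and num_workers apply.
db_step = DatabricksStep(
    name="DBNotebookInWS",
    outputs=[step_output],
    notebook_path="/Users/someone@example.com/sample_notebook",  # hypothetical path
    compute_target=databricks_compute,
    spark_version="4.0.x-scala2.11",
    node_type="Standard_D3_v2",
    num_workers=1,
    allow_reuse=True)

# To reuse an existing interactive cluster instead, pass existing_cluster_id and
# omit the new-cluster parameters (spark_version, node_type, num_workers, min_workers, ...).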
@@ -252,7 +252,7 @@
 "# is_directory=None)\n",
 "\n",
 "# Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.\n",
-"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store)\n",
+"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store, is_directory=True)\n",
 "print(\"PipelineData object created\")"
 ]
 },

@@ -347,7 +347,7 @@
 "source": [
 "# step5 to use the intermediate data produced by step4\n",
 "# This step also produces an output processed_data2\n",
-"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store)\n",
+"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store, is_directory=True)\n",
 "source_directory = \"data_dependency_run_extract\"\n",
 "\n",
 "extractStep = PythonScriptStep(\n",

@@ -394,7 +394,7 @@
 "outputs": [],
 "source": [
 "# Now define the compare step which takes two inputs and produces an output\n",
-"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store)\n",
+"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store, is_directory=True)\n",
 "source_directory = \"data_dependency_run_compare\"\n",
 "\n",
 "compareStep = PythonScriptStep(\n",
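The three hunks above make the same one-argument change: each intermediate PipelineData object in this notebook is now created with is_directory=True. A minimal sketch of the surrounding pattern, assuming a workspace default blob store and a compute target named "cpu-cluster" (script names and compute name are hypothetical; the source_directory values come from the diff):

from azureml.core import Workspace, Datastore
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

ws = Workspace.from_config()
def_blob_store = Datastore.get(ws, "workspaceblobstore")

# Intermediate outputs declared as directories up front.
processed_data2 = PipelineData("processed_data2", datastore=def_blob_store, is_directory=True)
processed_data3 = PipelineData("processed_data3", datastore=def_blob_store, is_directory=True)

extractStep = PythonScriptStep(
    script_name="extract.py",                      # hypothetical script name
    arguments=["--output_extract", processed_data2],
    outputs=[processed_data2],
    compute_target="cpu-cluster",                  # hypothetical compute target name
    source_directory="data_dependency_run_extract")

compareStep = PythonScriptStep(
    script_name="compare.py",                      # hypothetical script name
    arguments=["--compare_data", processed_data2, "--output_compare", processed_data3],
    inputs=[processed_data2],
    outputs=[processed_data3],
    compute_target="cpu-cluster",
    source_directory="data_dependency_run_compare")

# The input/output wiring gives the steps their ordering when the pipeline runs.
pipeline = Pipeline(workspace=ws, steps=[extractStep, compareStep])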
@@ -235,7 +235,8 @@
 " path_on_datastore=\"titanic/Titanic.csv\")\n",
 "\n",
 "output_data = PipelineData(name=\"processed_data\",\n",
-" datastore=Datastore.get(ws, \"workspaceblobstore\"))"
+" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
+" is_directory=True)"
 ]
 },
 {

@@ -306,7 +307,8 @@
 "from azureml.pipeline.core import PipelineParameter\n",
 "\n",
 "output_from_notebook = PipelineData(name=\"notebook_processed_data\",\n",
-" datastore=Datastore.get(ws, \"workspaceblobstore\"))\n",
+" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
+" is_directory=True)\n",
 "\n",
 "my_pipeline_param = PipelineParameter(name=\"pipeline_param\", default_value=\"my_param\")\n",
 "\n",
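In the notebook touched by the two hunks above, the directory-typed output and the PipelineParameter are subsequently handed to a DatabricksStep. A minimal sketch of that wiring, assuming an attached Databricks compute and an existing interactive cluster (the compute name, cluster id, and notebook path are hypothetical):

from azureml.core import Workspace, Datastore
from azureml.core.compute import DatabricksCompute
from azureml.pipeline.core import PipelineData, PipelineParameter
from azureml.pipeline.steps import DatabricksStep

ws = Workspace.from_config()

output_from_notebook = PipelineData(name="notebook_processed_data",
                                    datastore=Datastore.get(ws, "workspaceblobstore"),
                                    is_directory=True)

my_pipeline_param = PipelineParameter(name="pipeline_param", default_value="my_param")

notebook_step = DatabricksStep(
    name="DBNotebookWithParams",
    outputs=[output_from_notebook],
    notebook_path="/Users/someone@example.com/process_titanic",          # hypothetical path
    notebook_params={"pipeline_param": my_pipeline_param},               # resolved per pipeline run
    compute_target=DatabricksCompute(workspace=ws, name="mydbcompute"),  # hypothetical attach name
    existing_cluster_id="0331-121817-abcd123",                           # hypothetical cluster id
    allow_reuse=True)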
@@ -1,5 +1,5 @@
 # DisableDockerDetector "Disabled to unblock PRs until the owner can fix the file. Not used in any prod deployments - only as a documentation for the customers"
-FROM rocker/tidyverse:4.0.0-ubuntu18.04
+FROM rocker/tidyverse:4.0.0-ubuntu20.04
 
 # Install python
 RUN apt-get update -qq && \
@@ -363,7 +363,7 @@
 "}).replace(\",\", \";\")\n",
 "\n",
 "# Define output after cleansing step\n",
-"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store).as_dataset()\n",
+"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store, is_directory=True).as_dataset()\n",
 "\n",
 "print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
 "\n",

@@ -414,7 +414,7 @@
 "}).replace(\",\", \";\")\n",
 "\n",
 "# Define output after cleansing step\n",
-"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store).as_dataset()\n",
+"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store, is_directory=True).as_dataset()\n",
 "\n",
 "print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
 "\n",

@@ -452,7 +452,7 @@
 "outputs": [],
 "source": [
 "# Define output after merging step\n",
-"merged_data = PipelineData(\"merged_data\", datastore=default_store).as_dataset()\n",
+"merged_data = PipelineData(\"merged_data\", datastore=default_store, is_directory=True).as_dataset()\n",
 "\n",
 "print('Merge script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
 "\n",

@@ -489,7 +489,7 @@
 "outputs": [],
 "source": [
 "# Define output after merging step\n",
-"filtered_data = PipelineData(\"filtered_data\", datastore=default_store).as_dataset()\n",
+"filtered_data = PipelineData(\"filtered_data\", datastore=default_store, is_directory=True).as_dataset()\n",
 "\n",
 "print('Filter script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
 "\n",

@@ -525,7 +525,7 @@
 "outputs": [],
 "source": [
 "# Define output after normalize step\n",
-"normalized_data = PipelineData(\"normalized_data\", datastore=default_store).as_dataset()\n",
+"normalized_data = PipelineData(\"normalized_data\", datastore=default_store, is_directory=True).as_dataset()\n",
 "\n",
 "print('Normalize script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
 "\n",

@@ -566,7 +566,7 @@
 "outputs": [],
 "source": [
 "# Define output after transform step\n",
-"transformed_data = PipelineData(\"transformed_data\", datastore=default_store).as_dataset()\n",
+"transformed_data = PipelineData(\"transformed_data\", datastore=default_store, is_directory=True).as_dataset()\n",
 "\n",
 "print('Transform script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
 "\n",

@@ -604,8 +604,8 @@
 "train_model_folder = './scripts/trainmodel'\n",
 "\n",
 "# train and test splits output\n",
-"output_split_train = PipelineData(\"output_split_train\", datastore=default_store).as_dataset()\n",
-"output_split_test = PipelineData(\"output_split_test\", datastore=default_store).as_dataset()\n",
+"output_split_train = PipelineData(\"output_split_train\", datastore=default_store, is_directory=True).as_dataset()\n",
+"output_split_test = PipelineData(\"output_split_test\", datastore=default_store, is_directory=True).as_dataset()\n",
 "\n",
 "print('Data spilt script is in {}.'.format(os.path.realpath(train_model_folder)))\n",
 "\n",
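The hunks in this last notebook all apply the same change: each intermediate PipelineData is created with is_directory=True before being promoted to a dataset via .as_dataset(). A minimal sketch of that pattern for one of the outputs, assuming the workspace default datastore and a data-preparation script folder as in the sample (the script and compute names are hypothetical):

from azureml.core import Workspace
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

ws = Workspace.from_config()
default_store = ws.get_default_datastore()

# Declare the step output as a directory, then promote it to a pipeline output dataset.
cleansed_green_data = PipelineData("cleansed_green_data",
                                   datastore=default_store,
                                   is_directory=True).as_dataset()

cleansingStepGreen = PythonScriptStep(
    name="Cleanse Green Taxi Data",
    script_name="cleanse.py",                      # hypothetical script name
    arguments=["--output_cleanse", cleansed_green_data],
    outputs=[cleansed_green_data],
    compute_target="cpu-cluster",                  # hypothetical compute target name
    source_directory="./scripts/prepdata")

# The resulting pipeline output dataset can later be parsed or registered, e.g.:
# cleansed_green_data.parse_delimited_files()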