Mirror of https://github.com/Azure/MachineLearningNotebooks.git (synced 2025-12-20 09:37:04 -05:00)
Compare commits
6 Commits: azureml-sd...shbijlan-u
| Author | SHA1 | Date |
|---|---|---|
| | 279a1ba2c0 | |
| | 8233533dcd | |
| | 89f23e6d50 | |
| | 4cac072fa4 | |
| | aeab6b3e28 | |
| | 015e261f29 | |
.vscode/settings.json (vendored, Normal file, 1 line added)
@@ -0,0 +1 @@
+{}
@@ -103,7 +103,7 @@
 "source": [
 "import azureml.core\n",
 "\n",
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
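Most of the notebook hunks below make the same one-line change: the expected SDK version in the boilerplate version-check cell moves from 1.33.0 to 1.34.0. A minimal standalone sketch of that check, with an illustrative `EXPECTED` constant and minor-version comparison that are my own additions, not part of the diff:

```python
import azureml.core

EXPECTED = "1.34.0"  # version the notebook was authored against (illustrative)

print(f"This notebook was created using version {EXPECTED} of the Azure ML SDK")
print("You are currently using version", azureml.core.VERSION, "of the Azure ML SDK")
if azureml.core.VERSION.split(".")[:2] != EXPECTED.split(".")[:2]:
    # Hypothetical guard: warn when the installed minor version differs.
    print("Warning: this notebook was validated against a different minor version.")
```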
@@ -18,12 +18,13 @@ dependencies:
 - holidays==0.9.11
 - pytorch::pytorch=1.4.0
 - cudatoolkit=10.1.243
 - tornado==6.1.0

 - pip:
   # Required packages for AzureML execution, history, and data preparation.
-  - azureml-widgets~=1.33.0
+  - azureml-widgets~=1.34.0
   - pytorch-transformers==1.0.0
   - spacy==2.1.8
   - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
-  - -r https://automlresources-prod.azureedge.net/validated-requirements/1.33.0/validated_win32_requirements.txt [--no-deps]
+  - -r https://automlresources-prod.azureedge.net/validated-requirements/1.34.0/validated_win32_requirements.txt [--no-deps]
+  - arch==4.14
@@ -18,12 +18,13 @@ dependencies:
 - holidays==0.9.11
 - pytorch::pytorch=1.4.0
 - cudatoolkit=10.1.243
 - tornado==6.1.0

 - pip:
   # Required packages for AzureML execution, history, and data preparation.
-  - azureml-widgets~=1.33.0
+  - azureml-widgets~=1.34.0
   - pytorch-transformers==1.0.0
   - spacy==2.1.8
   - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
-  - -r https://automlresources-prod.azureedge.net/validated-requirements/1.33.0/validated_linux_requirements.txt [--no-deps]
+  - -r https://automlresources-prod.azureedge.net/validated-requirements/1.34.0/validated_linux_requirements.txt [--no-deps]
+  - arch==4.14
@@ -19,12 +19,13 @@ dependencies:
 - holidays==0.9.11
 - pytorch::pytorch=1.4.0
 - cudatoolkit=9.0
 - tornado==6.1.0

 - pip:
   # Required packages for AzureML execution, history, and data preparation.
-  - azureml-widgets~=1.33.0
+  - azureml-widgets~=1.34.0
   - pytorch-transformers==1.0.0
   - spacy==2.1.8
   - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
-  - -r https://automlresources-prod.azureedge.net/validated-requirements/1.33.0/validated_darwin_requirements.txt [--no-deps]
+  - -r https://automlresources-prod.azureedge.net/validated-requirements/1.34.0/validated_darwin_requirements.txt [--no-deps]
+  - arch==4.14
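The three environment-file hunks above make the same bump in each OS-specific file (win32, linux, darwin): `azureml-widgets` moves to ~=1.34.0, the pinned validated-requirements file moves to the 1.34.0 path, and `arch==4.14` is newly pinned. A hedged sketch of how the per-platform URL could be derived programmatically; the mapping logic is my own illustration, only the URLs themselves appear in the diff:

```python
import sys

BASE = "https://automlresources-prod.azureedge.net/validated-requirements"
VERSION = "1.34.0"

# Map the interpreter platform onto the suffix used by the validated files.
suffix = {"win32": "win32", "darwin": "darwin"}.get(sys.platform, "linux")
url = f"{BASE}/{VERSION}/validated_{suffix}_requirements.txt"
print(url)  # pass to: pip install -r <url> --no-deps
```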
@@ -104,7 +104,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -93,7 +93,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -215,7 +215,7 @@
 "source": [
 "automl_settings = {\n",
 " \"n_cross_validations\": 3,\n",
-" \"primary_metric\": 'average_precision_score_weighted',\n",
+" \"primary_metric\": 'AUC_weighted',\n",
 " \"enable_early_stopping\": True,\n",
 " \"max_concurrent_iterations\": 2, # This is a limit for testing purpose, please increase it as per cluster size\n",
 " \"experiment_timeout_hours\": 0.25, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ablity to find the best model possible\n",
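This hunk switches the classification experiment's primary metric from average_precision_score_weighted to AUC_weighted. For context, a hedged sketch of how such a settings dictionary is typically unpacked into AutoMLConfig; the dataset and label names are placeholders, not taken from the diff:

```python
from azureml.train.automl import AutoMLConfig

automl_settings = {
    "n_cross_validations": 3,
    "primary_metric": "AUC_weighted",  # the metric AutoML optimizes and ranks models by
    "enable_early_stopping": True,
    "max_concurrent_iterations": 2,
    "experiment_timeout_hours": 0.25,
}

# 'train_data' and 'y' are illustrative placeholders for a registered
# TabularDataset and its label column in a real run.
automl_config = AutoMLConfig(task="classification",
                             training_data=train_data,
                             label_column_name="y",
                             **automl_settings)
```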
@@ -96,7 +96,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -284,7 +284,7 @@
 "source": [
 "automl_settings = {\n",
 " \"experiment_timeout_minutes\": 30,\n",
-" \"primary_metric\": 'accuracy',\n",
+" \"primary_metric\": 'AUC_weighted',\n",
 " \"max_concurrent_iterations\": num_nodes, \n",
 " \"max_cores_per_iteration\": -1,\n",
 " \"enable_dnn\": True,\n",
@@ -81,7 +81,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -348,7 +348,7 @@
 " \"iteration_timeout_minutes\": 10,\n",
 " \"experiment_timeout_hours\": 0.25,\n",
 " \"n_cross_validations\": 3,\n",
-" \"primary_metric\": 'r2_score',\n",
+" \"primary_metric\": 'normalized_root_mean_squared_error',\n",
 " \"max_concurrent_iterations\": 3,\n",
 " \"max_cores_per_iteration\": -1,\n",
 " \"verbosity\": logging.INFO,\n",
@@ -92,7 +92,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -91,7 +91,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -113,7 +113,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -71,7 +71,8 @@
 "\n",
 "from azureml.core import Workspace, Experiment, Dataset\n",
 "from azureml.train.automl import AutoMLConfig\n",
-"from datetime import datetime"
+"from datetime import datetime\n",
+"from azureml.automl.core.featurization import FeaturizationConfig"
 ]
 },
 {
@@ -87,7 +88,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -303,6 +304,25 @@
 "forecast_horizon = 14"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Convert prediction type to integer\n",
+"The featurization configuration can be used to change the default prediction type from decimal numbers to integer. This customization can be used in the scenario when the target column is expected to contain whole values as the number of rented bikes per day."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"featurization_config = FeaturizationConfig()\n",
+"# Force the target column, to be integer type.\n",
+"featurization_config.add_prediction_transform_type('Integer')"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -327,6 +347,7 @@
 "\n",
 "automl_config = AutoMLConfig(task='forecasting', \n",
 " primary_metric='normalized_root_mean_squared_error',\n",
+" featurization=featurization_config,\n",
 " blocked_models = ['ExtremeRandomTrees'], \n",
 " experiment_timeout_hours=0.3,\n",
 " training_data=train,\n",
@@ -507,7 +528,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"### Download the prediction result for metrics calcuation\n",
+"### Download the prediction result for metrics calculation\n",
 "The test data with predictions are saved in artifact outputs/predictions.csv. You can download it and calculation some error metrics for the forecasts and vizualize the predictions vs. the actuals."
 ]
 },
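Taken together, the forecasting-notebook hunks above import FeaturizationConfig, add a cell that forces integer predictions, and thread the config into AutoMLConfig. A consolidated sketch of the resulting flow; `train` and `label` are placeholders standing in for the notebook's dataset and label column:

```python
from azureml.automl.core.featurization import FeaturizationConfig
from azureml.train.automl import AutoMLConfig

featurization_config = FeaturizationConfig()
# Force the target column to be integer type (e.g. a daily count of rented bikes).
featurization_config.add_prediction_transform_type('Integer')

automl_config = AutoMLConfig(task='forecasting',
                             primary_metric='normalized_root_mean_squared_error',
                             featurization=featurization_config,
                             blocked_models=['ExtremeRandomTrees'],
                             experiment_timeout_hours=0.3,
                             training_data=train,       # placeholder TabularDataset
                             label_column_name=label)   # placeholder label column
```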
@@ -99,7 +99,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -94,7 +94,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -81,7 +81,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -96,7 +96,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -173,7 +173,7 @@
 "source": [
 "automl_settings = {\n",
 " \"n_cross_validations\": 3,\n",
-" \"primary_metric\": 'average_precision_score_weighted',\n",
+" \"primary_metric\": 'AUC_weighted',\n",
 " \"experiment_timeout_hours\": 0.25, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ability to find the best model possible\n",
 " \"verbosity\": logging.INFO,\n",
 " \"enable_stack_ensemble\": False\n",
@@ -95,7 +95,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -92,7 +92,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -213,7 +213,7 @@
 "source": [
 "automl_settings = {\n",
 " \"n_cross_validations\": 3,\n",
-" \"primary_metric\": 'r2_score',\n",
+" \"primary_metric\": 'normalized_root_mean_squared_error',\n",
 " \"enable_early_stopping\": True, \n",
 " \"experiment_timeout_hours\": 0.3, #for real scenarios we reccommend a timeout of at least one hour \n",
 " \"max_concurrent_iterations\": 4,\n",
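The two regression hunks above replace r2_score with normalized_root_mean_squared_error as the primary metric. If you are unsure which metrics a task type supports, the SDK exposes a helper for this; a small sketch, assuming azureml-train-automl is installed:

```python
from azureml.train.automl.utilities import get_primary_metrics

# Valid primary metrics differ by task type.
print(get_primary_metrics('regression'))      # includes normalized_root_mean_squared_error
print(get_primary_metrics('classification'))  # includes AUC_weighted
```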
@@ -19,20 +19,21 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Using Databricks as a Compute Target from Azure Machine Learning Pipeline\n",
-"To use Databricks as a compute target from [Azure Machine Learning Pipeline](https://aka.ms/pl-concept), a [DatabricksStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.databricks_step.databricksstep?view=azure-ml-py) is used. This notebook demonstrates the use of DatabricksStep in Azure Machine Learning Pipeline.\n",
-"\n",
-"The notebook will show:\n",
-"1. Running an arbitrary Databricks notebook that the customer has in Databricks workspace\n",
-"2. Running an arbitrary Python script that the customer has in DBFS\n",
-"3. Running an arbitrary Python script that is available on local computer (will upload to DBFS, and then run in Databricks) \n",
-"4. Running a JAR job that the customer has in DBFS.\n",
-"\n",
-"## Before you begin:\n",
-"\n",
-"1. **Create an Azure Databricks workspace** in the same subscription where you have your Azure Machine Learning workspace. You will need details of this workspace later on to define DatabricksStep. [Click here](https://ms.portal.azure.com/#blade/HubsExtension/Resources/resourceType/Microsoft.Databricks%2Fworkspaces) for more information.\n",
-"2. **Create PAT (access token)**: Manually create a Databricks access token at the Azure Databricks portal. See [this](https://docs.databricks.com/api/latest/authentication.html#generate-a-token) for more information.\n",
-"3. **Add demo notebook to ADB**: This notebook has a sample you can use as is. Launch Azure Databricks attached to your Azure Machine Learning workspace and add a new notebook. \n",
+"# Using Databricks as a Compute Target from Azure Machine Learning Pipeline\r\n",
+"To use Databricks as a compute target from [Azure Machine Learning Pipeline](https://aka.ms/pl-concept), a [DatabricksStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.databricks_step.databricksstep?view=azure-ml-py) is used. This notebook demonstrates the use of DatabricksStep in Azure Machine Learning Pipeline.\r\n",
+"\r\n",
+"The notebook will show:\r\n",
+"1. Running an arbitrary Databricks notebook that the customer has in Databricks workspace\r\n",
+"2. Running an arbitrary Python script that the customer has in DBFS\r\n",
+"3. Running an arbitrary Python script that is available on local computer (will upload to DBFS, and then run in Databricks) \r\n",
+"4. Running a JAR job that the customer has in DBFS.\r\n",
+"5. How to get run context in a Databricks interactive cluster\r\n",
+"\r\n",
+"## Before you begin:\r\n",
+"\r\n",
+"1. **Create an Azure Databricks workspace** in the same subscription where you have your Azure Machine Learning workspace. You will need details of this workspace later on to define DatabricksStep. [Click here](https://ms.portal.azure.com/#blade/HubsExtension/Resources/resourceType/Microsoft.Databricks%2Fworkspaces) for more information.\r\n",
+"2. **Create PAT (access token)**: Manually create a Databricks access token at the Azure Databricks portal. See [this](https://docs.databricks.com/api/latest/authentication.html#generate-a-token) for more information.\r\n",
+"3. **Add demo notebook to ADB**: This notebook has a sample you can use as is. Launch Azure Databricks attached to your Azure Machine Learning workspace and add a new notebook. \r\n",
 "4. **Create/attach a Blob storage** for use from ADB"
 ]
 },
@@ -48,33 +49,33 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"```python\n",
-"# direct access\n",
-"dbutils.widgets.get(\"myparam\")\n",
-"p = getArgument(\"myparam\")\n",
-"print (\"Param -\\'myparam':\")\n",
-"print (p)\n",
-"\n",
-"dbutils.widgets.get(\"input\")\n",
-"i = getArgument(\"input\")\n",
-"print (\"Param -\\'input':\")\n",
-"print (i)\n",
-"\n",
-"dbutils.widgets.get(\"output\")\n",
-"o = getArgument(\"output\")\n",
-"print (\"Param -\\'output':\")\n",
-"print (o)\n",
-"\n",
-"n = i + \"/testdata.txt\"\n",
-"df = spark.read.csv(n)\n",
-"\n",
-"display (df)\n",
-"\n",
-"data = [('value1', 'value2')]\n",
-"df2 = spark.createDataFrame(data)\n",
-"\n",
-"z = o + \"/output.txt\"\n",
-"df2.write.csv(z)\n",
+"```python\r\n",
+"# direct access\r\n",
+"dbutils.widgets.get(\"myparam\")\r\n",
+"p = getArgument(\"myparam\")\r\n",
+"print (\"Param -\\'myparam':\")\r\n",
+"print (p)\r\n",
+"\r\n",
+"dbutils.widgets.get(\"input\")\r\n",
+"i = getArgument(\"input\")\r\n",
+"print (\"Param -\\'input':\")\r\n",
+"print (i)\r\n",
+"\r\n",
+"dbutils.widgets.get(\"output\")\r\n",
+"o = getArgument(\"output\")\r\n",
+"print (\"Param -\\'output':\")\r\n",
+"print (o)\r\n",
+"\r\n",
+"n = i + \"/testdata.txt\"\r\n",
+"df = spark.read.csv(n)\r\n",
+"\r\n",
+"display (df)\r\n",
+"\r\n",
+"data = [('value1', 'value2')]\r\n",
+"df2 = spark.createDataFrame(data)\r\n",
+"\r\n",
+"z = o + \"/output.txt\"\r\n",
+"df2.write.csv(z)\r\n",
 "```"
 ]
 },
@@ -91,18 +92,18 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import os\n",
-"import azureml.core\n",
-"from azureml.core.runconfig import JarLibrary\n",
-"from azureml.core.compute import ComputeTarget, DatabricksCompute\n",
-"from azureml.exceptions import ComputeTargetException\n",
-"from azureml.core import Workspace, Experiment\n",
-"from azureml.pipeline.core import Pipeline, PipelineData\n",
-"from azureml.pipeline.steps import DatabricksStep\n",
-"from azureml.core.datastore import Datastore\n",
-"from azureml.data.data_reference import DataReference\n",
-"\n",
-"# Check core SDK version number\n",
+"import os\r\n",
+"import azureml.core\r\n",
+"from azureml.core.runconfig import JarLibrary\r\n",
+"from azureml.core.compute import ComputeTarget, DatabricksCompute\r\n",
+"from azureml.exceptions import ComputeTargetException\r\n",
+"from azureml.core import Workspace, Experiment\r\n",
+"from azureml.pipeline.core import Pipeline, PipelineData\r\n",
+"from azureml.pipeline.steps import DatabricksStep\r\n",
+"from azureml.core.datastore import Datastore\r\n",
+"from azureml.data.data_reference import DataReference\r\n",
+"\r\n",
+"# Check core SDK version number\r\n",
 "print(\"SDK version:\", azureml.core.VERSION)"
 ]
 },
@@ -121,7 +122,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"ws = Workspace.from_config()\n",
+"ws = Workspace.from_config()\r\n",
 "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
 ]
 },
@@ -149,29 +150,29 @@
 },
 "outputs": [],
 "source": [
-"# Replace with your account info before running.\n",
-" \n",
-"db_compute_name=os.getenv(\"DATABRICKS_COMPUTE_NAME\", \"<my-databricks-compute-name>\") # Databricks compute name\n",
-"db_resource_group=os.getenv(\"DATABRICKS_RESOURCE_GROUP\", \"<my-db-resource-group>\") # Databricks resource group\n",
-"db_workspace_name=os.getenv(\"DATABRICKS_WORKSPACE_NAME\", \"<my-db-workspace-name>\") # Databricks workspace name\n",
-"db_access_token=os.getenv(\"DATABRICKS_ACCESS_TOKEN\", \"<my-access-token>\") # Databricks access token\n",
-" \n",
-"try:\n",
-" databricks_compute = DatabricksCompute(workspace=ws, name=db_compute_name)\n",
-" print('Compute target {} already exists'.format(db_compute_name))\n",
-"except ComputeTargetException:\n",
-" print('Compute not found, will use below parameters to attach new one')\n",
-" print('db_compute_name {}'.format(db_compute_name))\n",
-" print('db_resource_group {}'.format(db_resource_group))\n",
-" print('db_workspace_name {}'.format(db_workspace_name))\n",
-" print('db_access_token {}'.format(db_access_token))\n",
-" \n",
-" config = DatabricksCompute.attach_configuration(\n",
-" resource_group = db_resource_group,\n",
-" workspace_name = db_workspace_name,\n",
-" access_token= db_access_token)\n",
-" databricks_compute=ComputeTarget.attach(ws, db_compute_name, config)\n",
-" databricks_compute.wait_for_completion(True)\n"
+"# Replace with your account info before running.\r\n",
+" \r\n",
+"db_compute_name=os.getenv(\"DATABRICKS_COMPUTE_NAME\", \"<my-databricks-compute-name>\") # Databricks compute name\r\n",
+"db_resource_group=os.getenv(\"DATABRICKS_RESOURCE_GROUP\", \"<my-db-resource-group>\") # Databricks resource group\r\n",
+"db_workspace_name=os.getenv(\"DATABRICKS_WORKSPACE_NAME\", \"<my-db-workspace-name>\") # Databricks workspace name\r\n",
+"db_access_token=os.getenv(\"DATABRICKS_ACCESS_TOKEN\", \"<my-access-token>\") # Databricks access token\r\n",
+" \r\n",
+"try:\r\n",
+" databricks_compute = DatabricksCompute(workspace=ws, name=db_compute_name)\r\n",
+" print('Compute target {} already exists'.format(db_compute_name))\r\n",
+"except ComputeTargetException:\r\n",
+" print('Compute not found, will use below parameters to attach new one')\r\n",
+" print('db_compute_name {}'.format(db_compute_name))\r\n",
+" print('db_resource_group {}'.format(db_resource_group))\r\n",
+" print('db_workspace_name {}'.format(db_workspace_name))\r\n",
+" print('db_access_token {}'.format(db_access_token))\r\n",
+" \r\n",
+" config = DatabricksCompute.attach_configuration(\r\n",
+" resource_group = db_resource_group,\r\n",
+" workspace_name = db_workspace_name,\r\n",
+" access_token= db_access_token)\r\n",
+" databricks_compute=ComputeTarget.attach(ws, db_compute_name, config)\r\n",
+" databricks_compute.wait_for_completion(True)\r\n"
 ]
 },
 {
@@ -303,20 +304,20 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.pipeline.core import PipelineParameter\n",
-"\n",
-"# Use the default blob storage\n",
-"def_blob_store = Datastore(ws, \"workspaceblobstore\")\n",
-"print('Datastore {} will be used'.format(def_blob_store.name))\n",
-"\n",
-"pipeline_param = PipelineParameter(name=\"my_pipeline_param\", default_value=\"pipeline_param1\")\n",
-"\n",
-"# We are uploading a sample file in the local directory to be used as a datasource\n",
-"def_blob_store.upload_files(files=[\"./testdata.txt\"], target_path=\"dbtest\", overwrite=False)\n",
-"\n",
-"step_1_input = DataReference(datastore=def_blob_store, path_on_datastore=\"dbtest\",\n",
-" data_reference_name=\"input\")\n",
-"\n",
+"from azureml.pipeline.core import PipelineParameter\r\n",
+"\r\n",
+"# Use the default blob storage\r\n",
+"def_blob_store = Datastore(ws, \"workspaceblobstore\")\r\n",
+"print('Datastore {} will be used'.format(def_blob_store.name))\r\n",
+"\r\n",
+"pipeline_param = PipelineParameter(name=\"my_pipeline_param\", default_value=\"pipeline_param1\")\r\n",
+"\r\n",
+"# We are uploading a sample file in the local directory to be used as a datasource\r\n",
+"def_blob_store.upload_files(files=[\"./testdata.txt\"], target_path=\"dbtest\", overwrite=False)\r\n",
+"\r\n",
+"step_1_input = DataReference(datastore=def_blob_store, path_on_datastore=\"dbtest\",\r\n",
+" data_reference_name=\"input\")\r\n",
+"\r\n",
 "step_1_output = PipelineData(\"output\", datastore=def_blob_store)"
 ]
 },
@@ -412,7 +413,7 @@
 "metadata": {},
 "source": [
 "### 1. Running the demo notebook already added to the Databricks workspace\n",
-"Create a notebook in the Azure Databricks workspace, and provide the path to that notebook as the value associated with the environment variable \"DATABRICKS_NOTEBOOK_PATH\". This will then set the variable\u00c2\u00a0notebook_path\u00c2\u00a0when you run the code cell below:\n",
+"Create a notebook in the Azure Databricks workspace, and provide the path to that notebook as the value associated with the environment variable \"DATABRICKS_NOTEBOOK_PATH\". This will then set the variable notebook_path when you run the code cell below:\n",
 "\n",
 "your notebook's path in Azure Databricks UI by hovering over to notebook's title. A typical path of notebook looks like this `/Users/example@databricks.com/example`. See [Databricks Workspace](https://docs.azuredatabricks.net/user-guide/workspace.html) to learn about the folder structure.\n",
 "\n",
@@ -425,19 +426,19 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"notebook_path=os.getenv(\"DATABRICKS_NOTEBOOK_PATH\", \"<my-databricks-notebook-path>\") # Databricks notebook path\n",
-"\n",
-"dbNbStep = DatabricksStep(\n",
-" name=\"DBNotebookInWS\",\n",
-" inputs=[step_1_input],\n",
-" outputs=[step_1_output],\n",
-" num_workers=1,\n",
-" notebook_path=notebook_path,\n",
-" notebook_params={'myparam': 'testparam', \n",
-" 'myparam2': pipeline_param},\n",
-" run_name='DB_Notebook_demo',\n",
-" compute_target=databricks_compute,\n",
-" allow_reuse=True\n",
+"notebook_path=os.getenv(\"DATABRICKS_NOTEBOOK_PATH\", \"<my-databricks-notebook-path>\") # Databricks notebook path\r\n",
+"\r\n",
+"dbNbStep = DatabricksStep(\r\n",
+" name=\"DBNotebookInWS\",\r\n",
+" inputs=[step_1_input],\r\n",
+" outputs=[step_1_output],\r\n",
+" num_workers=1,\r\n",
+" notebook_path=notebook_path,\r\n",
+" notebook_params={'myparam': 'testparam', \r\n",
+" 'myparam2': pipeline_param},\r\n",
+" run_name='DB_Notebook_demo',\r\n",
+" compute_target=databricks_compute,\r\n",
+" allow_reuse=True\r\n",
 ")"
 ]
 },
@@ -456,9 +457,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"steps = [dbNbStep]\n",
-"pipeline = Pipeline(workspace=ws, steps=steps)\n",
-"pipeline_run = Experiment(ws, 'DB_Notebook_demo').submit(pipeline)\n",
+"steps = [dbNbStep]\r\n",
+"pipeline = Pipeline(workspace=ws, steps=steps)\r\n",
+"pipeline_run = Experiment(ws, 'DB_Notebook_demo').submit(pipeline)\r\n",
 "pipeline_run.wait_for_completion()"
 ]
 },
@@ -475,7 +476,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.widgets import RunDetails\n",
+"from azureml.widgets import RunDetails\r\n",
 "RunDetails(pipeline_run).show()"
 ]
 },
@@ -503,17 +504,17 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"python_script_path = os.getenv(\"DATABRICKS_PYTHON_SCRIPT_PATH\", \"<my-databricks-python-script-path>\") # Databricks python script path\n",
-"\n",
-"dbPythonInDbfsStep = DatabricksStep(\n",
-" name=\"DBPythonInDBFS\",\n",
-" inputs=[step_1_input],\n",
-" num_workers=1,\n",
-" python_script_path=python_script_path,\n",
-" python_script_params={'arg1', pipeline_param, 'arg2'},\n",
-" run_name='DB_Python_demo',\n",
-" compute_target=databricks_compute,\n",
-" allow_reuse=True\n",
+"python_script_path = os.getenv(\"DATABRICKS_PYTHON_SCRIPT_PATH\", \"<my-databricks-python-script-path>\") # Databricks python script path\r\n",
+"\r\n",
+"dbPythonInDbfsStep = DatabricksStep(\r\n",
+" name=\"DBPythonInDBFS\",\r\n",
+" inputs=[step_1_input],\r\n",
+" num_workers=1,\r\n",
+" python_script_path=python_script_path,\r\n",
+" python_script_params={'arg1', pipeline_param, 'arg2'},\r\n",
+" run_name='DB_Python_demo',\r\n",
+" compute_target=databricks_compute,\r\n",
+" allow_reuse=True\r\n",
 ")"
 ]
 },
@@ -530,9 +531,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"steps = [dbPythonInDbfsStep]\n",
-"pipeline = Pipeline(workspace=ws, steps=steps)\n",
-"pipeline_run = Experiment(ws, 'DB_Python_demo').submit(pipeline)\n",
+"steps = [dbPythonInDbfsStep]\r\n",
+"pipeline = Pipeline(workspace=ws, steps=steps)\r\n",
+"pipeline_run = Experiment(ws, 'DB_Python_demo').submit(pipeline)\r\n",
 "pipeline_run.wait_for_completion()"
 ]
 },
@@ -549,7 +550,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.widgets import RunDetails\n",
+"from azureml.widgets import RunDetails\r\n",
 "RunDetails(pipeline_run).show()"
 ]
 },
@@ -573,18 +574,18 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"python_script_name = \"train-db-local.py\"\n",
-"source_directory = \"./databricks_train\"\n",
-"\n",
-"dbPythonInLocalMachineStep = DatabricksStep(\n",
-" name=\"DBPythonInLocalMachine\",\n",
-" inputs=[step_1_input],\n",
-" num_workers=1,\n",
-" python_script_name=python_script_name,\n",
-" source_directory=source_directory,\n",
-" run_name='DB_Python_Local_demo',\n",
-" compute_target=databricks_compute,\n",
-" allow_reuse=True\n",
+"python_script_name = \"train-db-local.py\"\r\n",
+"source_directory = \"./databricks_train\"\r\n",
+"\r\n",
+"dbPythonInLocalMachineStep = DatabricksStep(\r\n",
+" name=\"DBPythonInLocalMachine\",\r\n",
+" inputs=[step_1_input],\r\n",
+" num_workers=1,\r\n",
+" python_script_name=python_script_name,\r\n",
+" source_directory=source_directory,\r\n",
+" run_name='DB_Python_Local_demo',\r\n",
+" compute_target=databricks_compute,\r\n",
+" allow_reuse=True\r\n",
 ")"
 ]
 },
@@ -601,9 +602,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"steps = [dbPythonInLocalMachineStep]\n",
-"pipeline = Pipeline(workspace=ws, steps=steps)\n",
-"pipeline_run = Experiment(ws, 'DB_Python_Local_demo').submit(pipeline)\n",
+"steps = [dbPythonInLocalMachineStep]\r\n",
+"pipeline = Pipeline(workspace=ws, steps=steps)\r\n",
+"pipeline_run = Experiment(ws, 'DB_Python_Local_demo').submit(pipeline)\r\n",
 "pipeline_run.wait_for_completion()"
 ]
 },
@@ -620,7 +621,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.widgets import RunDetails\n",
+"from azureml.widgets import RunDetails\r\n",
 "RunDetails(pipeline_run).show()"
 ]
 },
@@ -646,19 +647,19 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"main_jar_class_name = \"com.microsoft.aeva.Main\"\n",
-"jar_library_dbfs_path = os.getenv(\"DATABRICKS_JAR_LIB_PATH\", \"<my-databricks-jar-lib-path>\") # Databricks jar library path\n",
-"\n",
-"dbJarInDbfsStep = DatabricksStep(\n",
-" name=\"DBJarInDBFS\",\n",
-" inputs=[step_1_input],\n",
-" num_workers=1,\n",
-" main_class_name=main_jar_class_name,\n",
-" jar_params={'arg1', pipeline_param, 'arg2'},\n",
-" run_name='DB_JAR_demo',\n",
-" jar_libraries=[JarLibrary(jar_library_dbfs_path)],\n",
-" compute_target=databricks_compute,\n",
-" allow_reuse=True\n",
+"main_jar_class_name = \"com.microsoft.aeva.Main\"\r\n",
+"jar_library_dbfs_path = os.getenv(\"DATABRICKS_JAR_LIB_PATH\", \"<my-databricks-jar-lib-path>\") # Databricks jar library path\r\n",
+"\r\n",
+"dbJarInDbfsStep = DatabricksStep(\r\n",
+" name=\"DBJarInDBFS\",\r\n",
+" inputs=[step_1_input],\r\n",
+" num_workers=1,\r\n",
+" main_class_name=main_jar_class_name,\r\n",
+" jar_params={'arg1', pipeline_param, 'arg2'},\r\n",
+" run_name='DB_JAR_demo',\r\n",
+" jar_libraries=[JarLibrary(jar_library_dbfs_path)],\r\n",
+" compute_target=databricks_compute,\r\n",
+" allow_reuse=True\r\n",
 ")"
 ]
 },
@@ -675,9 +676,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"steps = [dbJarInDbfsStep]\n",
-"pipeline = Pipeline(workspace=ws, steps=steps)\n",
-"pipeline_run = Experiment(ws, 'DB_JAR_demo').submit(pipeline)\n",
+"steps = [dbJarInDbfsStep]\r\n",
+"pipeline = Pipeline(workspace=ws, steps=steps)\r\n",
+"pipeline_run = Experiment(ws, 'DB_JAR_demo').submit(pipeline)\r\n",
 "pipeline_run.wait_for_completion()"
 ]
 },
@@ -694,19 +695,19 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.widgets import RunDetails\n",
+"from azureml.widgets import RunDetails\r\n",
 "RunDetails(pipeline_run).show()"
 ]
 },
 {
-"cell_type": "markdown",
-"metadata": {},
 "source": [
 "### 5. Running demo notebook already added to the Databricks workspace using existing cluster\n",
 "First you need register DBFS datastore and make sure path_on_datastore does exist in databricks file system, you can browser the files by refering [this](https://docs.azuredatabricks.net/user-guide/dbfs-databricks-file-system.html).\n",
 "\n",
 "Find existing_cluster_id by opeing Azure Databricks UI with Clusters page and in url you will find a string connected with '-' right after \"clusters/\"."
+],
+"cell_type": "markdown",
+"metadata": {}
-]
 },
 {
 "cell_type": "code",
@@ -714,13 +715,13 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"try:\n",
-" dbfs_ds = Datastore.get(workspace=ws, datastore_name='dbfs_datastore')\n",
-" print('DBFS Datastore already exists')\n",
-"except Exception as ex:\n",
-" dbfs_ds = Datastore.register_dbfs(ws, datastore_name='dbfs_datastore')\n",
-"\n",
-"step_1_input = DataReference(datastore=dbfs_ds, path_on_datastore=\"FileStore\", data_reference_name=\"input\")\n",
+"try:\r\n",
+" dbfs_ds = Datastore.get(workspace=ws, datastore_name='dbfs_datastore')\r\n",
+" print('DBFS Datastore already exists')\r\n",
+"except Exception as ex:\r\n",
+" dbfs_ds = Datastore.register_dbfs(ws, datastore_name='dbfs_datastore')\r\n",
+"\r\n",
+"step_1_input = DataReference(datastore=dbfs_ds, path_on_datastore=\"FileStore\", data_reference_name=\"input\")\r\n",
 "step_1_output = PipelineData(\"output\", datastore=dbfs_ds)"
 ]
 },
@@ -730,26 +731,26 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"dbNbWithExistingClusterStep = DatabricksStep(\n",
-" name=\"DBFSReferenceWithExisting\",\n",
-" inputs=[step_1_input],\n",
-" outputs=[step_1_output],\n",
-" notebook_path=notebook_path,\n",
-" notebook_params={'myparam': 'testparam', \n",
-" 'myparam2': pipeline_param},\n",
-" run_name='DBFS_Reference_With_Existing',\n",
-" compute_target=databricks_compute,\n",
-" existing_cluster_id=\"your existing cluster id\",\n",
-" allow_reuse=True\n",
+"dbNbWithExistingClusterStep = DatabricksStep(\r\n",
+" name=\"DBFSReferenceWithExisting\",\r\n",
+" inputs=[step_1_input],\r\n",
+" outputs=[step_1_output],\r\n",
+" notebook_path=notebook_path,\r\n",
+" notebook_params={'myparam': 'testparam', \r\n",
+" 'myparam2': pipeline_param},\r\n",
+" run_name='DBFS_Reference_With_Existing',\r\n",
+" compute_target=databricks_compute,\r\n",
+" existing_cluster_id=\"your existing cluster id\",\r\n",
+" allow_reuse=True\r\n",
 ")"
 ]
 },
 {
-"cell_type": "markdown",
-"metadata": {},
 "source": [
 "#### Build and submit the Experiment"
+],
+"cell_type": "markdown",
+"metadata": {}
-]
 },
 {
 "cell_type": "code",
@@ -757,18 +758,18 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"steps = [dbNbWithExistingClusterStep]\n",
-"pipeline = Pipeline(workspace=ws, steps=steps)\n",
-"pipeline_run = Experiment(ws, 'DBFS_Reference_With_Existing').submit(pipeline)\n",
+"steps = [dbNbWithExistingClusterStep]\r\n",
+"pipeline = Pipeline(workspace=ws, steps=steps)\r\n",
+"pipeline_run = Experiment(ws, 'DBFS_Reference_With_Existing').submit(pipeline)\r\n",
 "pipeline_run.wait_for_completion()"
 ]
 },
 {
-"cell_type": "markdown",
-"metadata": {},
 "source": [
 "#### View Run Details"
+],
+"cell_type": "markdown",
+"metadata": {}
-]
 },
 {
 "cell_type": "code",
@@ -776,19 +777,19 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.widgets import RunDetails\n",
+"from azureml.widgets import RunDetails\r\n",
 "RunDetails(pipeline_run).show()"
 ]
 },
 {
-"cell_type": "markdown",
-"metadata": {},
 "source": [
-"### 6. Running a Python script in Databricks that currenlty is in local computer with existing cluster\n",
-"When you access azure blob or data lake storage from an existing (interactive) cluster, you need to ensure the Spark configuration is set up correctly to access this storage and this set up may require the cluster to be restarted.\n",
-"\n",
+"### 6. Running a Python script in Databricks that is currently in local computer with existing cluster\r\n",
+"When you access azure blob or data lake storage from an existing (interactive) cluster, you need to ensure the Spark configuration is set up correctly to access this storage and this set up may require the cluster to be restarted.\r\n",
+"\r\n",
 "If you set permit_cluster_restart to True, AML will check if the spark configuration needs to be updated and restart the cluster for you if required. This will ensure that the storage can be correctly accessed from the Databricks cluster."
+],
+"cell_type": "markdown",
+"metadata": {}
-]
 },
 {
 "cell_type": "code",
@@ -796,28 +797,28 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"step_1_input = DataReference(datastore=def_blob_store, path_on_datastore=\"dbtest\",\n",
-" data_reference_name=\"input\")\n",
-"\n",
-"dbPythonInLocalWithExistingStep = DatabricksStep(\n",
-" name=\"DBPythonInLocalMachineWithExisting\",\n",
-" inputs=[step_1_input],\n",
-" python_script_name=python_script_name,\n",
-" source_directory=source_directory,\n",
-" run_name='DB_Python_Local_existing_demo',\n",
-" compute_target=databricks_compute,\n",
-" existing_cluster_id=\"your existing cluster id\",\n",
-" allow_reuse=False,\n",
-" permit_cluster_restart=True\n",
+"step_1_input = DataReference(datastore=def_blob_store, path_on_datastore=\"dbtest\",\r\n",
+" data_reference_name=\"input\")\r\n",
+"\r\n",
+"dbPythonInLocalWithExistingStep = DatabricksStep(\r\n",
+" name=\"DBPythonInLocalMachineWithExisting\",\r\n",
+" inputs=[step_1_input],\r\n",
+" python_script_name=python_script_name,\r\n",
+" source_directory=source_directory,\r\n",
+" run_name='DB_Python_Local_existing_demo',\r\n",
+" compute_target=databricks_compute,\r\n",
+" existing_cluster_id=\"your existing cluster id\",\r\n",
+" allow_reuse=False,\r\n",
+" permit_cluster_restart=True\r\n",
 ")"
 ]
 },
 {
-"cell_type": "markdown",
-"metadata": {},
 "source": [
 "#### Build and submit the Experiment"
+],
+"cell_type": "markdown",
+"metadata": {}
-]
 },
 {
 "cell_type": "code",
@@ -825,18 +826,18 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"steps = [dbPythonInLocalWithExistingStep]\n",
-"pipeline = Pipeline(workspace=ws, steps=steps)\n",
-"pipeline_run = Experiment(ws, 'DB_Python_Local_existing_demo').submit(pipeline)\n",
+"steps = [dbPythonInLocalWithExistingStep]\r\n",
+"pipeline = Pipeline(workspace=ws, steps=steps)\r\n",
+"pipeline_run = Experiment(ws, 'DB_Python_Local_existing_demo').submit(pipeline)\r\n",
 "pipeline_run.wait_for_completion()"
 ]
 },
 {
-"cell_type": "markdown",
-"metadata": {},
 "source": [
 "#### View Run Details"
+],
+"cell_type": "markdown",
+"metadata": {}
-]
 },
 {
 "cell_type": "code",
@@ -844,17 +845,70 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.widgets import RunDetails\n",
+"from azureml.widgets import RunDetails\r\n",
 "RunDetails(pipeline_run).show()"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### How to get run context in a Databricks interactive cluster\r\n",
+"\r\n",
+"Users are used to being able to use Run.get_context() to retrieve the parent_run_id for a given run_id. In DatabricksStep, however, a little more work is required to achieve this.\r\n",
+"\r\n",
+"The solution is to parse the script arguments and set corresponding environment variables to access the run context from within Databricks.\r\n",
+"Note that this workaround is not required for job clusters. \r\n",
+"\r\n",
+"Here is a code sample:"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"```python\r\n",
+"from azureml.core import Run\r\n",
+"import argparse\r\n",
+"import os\r\n",
+"\r\n",
+"\r\n",
+"def populate_environ():\r\n",
+" parser = argparse.ArgumentParser(description='Process arguments passed to script')\r\n",
+" parser.add_argument('--AZUREML_SCRIPT_DIRECTORY_NAME')\r\n",
+" parser.add_argument('--AZUREML_RUN_TOKEN')\r\n",
+" parser.add_argument('--AZUREML_RUN_TOKEN_EXPIRY')\r\n",
+" parser.add_argument('--AZUREML_RUN_ID')\r\n",
+" parser.add_argument('--AZUREML_ARM_SUBSCRIPTION')\r\n",
+" parser.add_argument('--AZUREML_ARM_RESOURCEGROUP')\r\n",
+" parser.add_argument('--AZUREML_ARM_WORKSPACE_NAME')\r\n",
+" parser.add_argument('--AZUREML_ARM_PROJECT_NAME')\r\n",
+" parser.add_argument('--AZUREML_SERVICE_ENDPOINT')\r\n",
+"\r\n",
+" args = parser.parse_args()\r\n",
+" os.environ['AZUREML_SCRIPT_DIRECTORY_NAME'] = args.AZUREML_SCRIPT_DIRECTORY_NAME\r\n",
+" os.environ['AZUREML_RUN_TOKEN'] = args.AZUREML_RUN_TOKEN\r\n",
+" os.environ['AZUREML_RUN_TOKEN_EXPIRY'] = args.AZUREML_RUN_TOKEN_EXPIRY\r\n",
+" os.environ['AZUREML_RUN_ID'] = args.AZUREML_RUN_ID\r\n",
+" os.environ['AZUREML_ARM_SUBSCRIPTION'] = args.AZUREML_ARM_SUBSCRIPTION\r\n",
+" os.environ['AZUREML_ARM_RESOURCEGROUP'] = args.AZUREML_ARM_RESOURCEGROUP\r\n",
+" os.environ['AZUREML_ARM_WORKSPACE_NAME'] = args.AZUREML_ARM_WORKSPACE_NAME\r\n",
+" os.environ['AZUREML_ARM_PROJECT_NAME'] = args.AZUREML_ARM_PROJECT_NAME\r\n",
+" os.environ['AZUREML_SERVICE_ENDPOINT'] = args.AZUREML_SERVICE_ENDPOINT\r\n",
+"\r\n",
+"populate_environ()\r\n",
+"run = Run.get_context(allow_offline=False)\r\n",
+"print(run._run_dto[\"parent_run_id\"])\r\n",
+"```"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "# Next: ADLA as a Compute Target\n",
 "To use ADLA as a compute target from Azure Machine Learning Pipeline, a AdlaStep is used. This [notebook](https://aka.ms/pl-adla) demonstrates the use of AdlaStep in Azure Machine Learning Pipeline."
+],
+"cell_type": "markdown",
+"metadata": {}
-]
 }
 ],
 "metadata": {
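The closing markdown cell points at AdlaStep for Azure Data Lake Analytics; the linked notebook has the full walk-through. As a hedged orientation only, an AdlaStep is declared much like the DatabricksStep above. Every parameter value here is illustrative, and `adla_compute` stands in for an attached Data Lake Analytics compute target:

```python
from azureml.pipeline.steps import AdlaStep
from azureml.pipeline.core import Pipeline
from azureml.core import Experiment

# 'adla_compute' and 'ws' are placeholders for an attached ADLA compute
# target and an AzureML Workspace, respectively.
adla_step = AdlaStep(name="adla_script",
                     script_name="script.usql",          # illustrative U-SQL script
                     source_directory="./adla_scripts",  # illustrative folder
                     compute_target=adla_compute)

pipeline = Pipeline(workspace=ws, steps=[adla_step])
pipeline_run = Experiment(ws, 'ADLA_demo').submit(pipeline)
```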
@@ -298,7 +298,7 @@
 "from azureml.core.compute_target import ComputeTargetException\n",
 "\n",
 "# choose a name for your cluster\n",
-"cluster_name = \"gpu-cluster\"\n",
+"cluster_name = \"hd-cluster\"\n",
 "\n",
 "try:\n",
 " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
@@ -117,7 +117,7 @@
 "from azureml.core.compute_target import ComputeTargetException\n",
 "\n",
 "# choose a name for your cluster\n",
-"cluster_name = \"gpu-cluster\"\n",
+"cluster_name = \"hd-cluster\"\n",
 "\n",
 "try:\n",
 " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
@@ -142,7 +142,7 @@
 "from azureml.core.compute_target import ComputeTargetException\n",
 "\n",
 "# choose a name for your cluster\n",
-"cluster_name = \"cpu-cluster\"\n",
+"cluster_name = \"hd-cluster\"\n",
 "\n",
 "try:\n",
 " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
@@ -295,7 +295,7 @@
 "from azureml.core.compute_target import ComputeTargetException\n",
 "\n",
 "# choose a name for your cluster\n",
-"cluster_name = \"gpu-cluster\"\n",
+"cluster_name = \"hd-cluster\"\n",
 "\n",
 "try:\n",
 " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
@@ -312,7 +312,7 @@
 "from azureml.core.compute_target import ComputeTargetException\n",
 "\n",
 "# choose a name for your cluster\n",
-"cluster_name = \"gpu-cluster\"\n",
+"cluster_name = \"hd-cluster\"\n",
 "\n",
 "try:\n",
 " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
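The five hunks above rename the target cluster to hd-cluster in otherwise identical get-or-create cells; the diff shows only the first lines of each cell. A completed sketch of the usual pattern, where the VM size, node count, and `ws` workspace object are illustrative assumptions rather than values from the diff:

```python
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "hd-cluster"

try:
    # Reuse the cluster if it already exists in the workspace ('ws' is a placeholder).
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    # Not found: provision a new cluster (sizes are illustrative).
    config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, config)
    compute_target.wait_for_completion(show_output=True)
```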
@@ -6,15 +6,15 @@ import mlflow.keras
 import numpy as np
 import warnings

-import keras
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.optimizers import RMSprop
+from tensorflow import keras
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import RMSprop

 print("Keras version:", keras.__version__)

 # Enable auto-logging to MLflow to capture Keras metrics.
-mlflow.keras.autolog()
+mlflow.autolog()

 # Model / data parameters
 n_inputs = 28 * 28
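This script diff moves from the standalone keras package to tensorflow.keras and from mlflow.keras.autolog() to the framework-agnostic mlflow.autolog(). A minimal self-contained sketch of the updated idiom; the model shape is illustrative and the fit call is left commented because data loading is outside the diff:

```python
import mlflow
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop

print("Keras version:", keras.__version__)

# Enable auto-logging to MLflow; this captures params, metrics and the model.
mlflow.autolog()

# Illustrative model for 28x28 flattened inputs (not taken from the diff).
model = Sequential([Dense(64, activation="relu", input_shape=(28 * 28,)),
                    Dense(10, activation="softmax")])
model.compile(loss="categorical_crossentropy", optimizer=RMSprop(), metrics=["accuracy"])
# model.fit(x_train, y_train, epochs=2)  # data loading omitted
```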
@@ -450,6 +450,10 @@
 " # GPU\n",
 " use_gpu=False, \n",
 " \n",
+" # Shared memory size\n",
+" # Uncomment line below to set shm_size for workers (requires Azure Machine Learning SDK 1.33 or greater)\n",
+" # shm_size=1024*1024*1024, \n",
+" \n",
 " # PIP packages to use\n",
 ")"
 ]
@@ -256,7 +256,6 @@
 " dockerfile=f.read()\n",
 "\n",
 " xvfb_env = Environment(name='xvfb-vdisplay')\n",
-" xvfb_env.docker.enabled = True\n",
 " xvfb_env.docker.base_image = None\n",
 " xvfb_env.docker.base_dockerfile = dockerfile\n",
 " \n",

@@ -713,7 +712,6 @@
 " dockerfile=f.read()\n",
 "\n",
 "xvfb_env = Environment(name='xvfb-vdisplay')\n",
-"xvfb_env.docker.enabled = True\n",
 "xvfb_env.docker.base_image = None\n",
 "xvfb_env.docker.base_dockerfile = dockerfile\n",
 " \n",
@@ -14,6 +14,8 @@ RUN conda install -y conda=4.7.12 python=3.7 && conda clean -ay && \
 azureml-dataset-runtime[fuse,pandas] \
 azureml-contrib-reinforcementlearning \
+gputil \
+scipy \
 pyglet \
 cloudpickle==1.3.0 \
 tensorboardX \
 tensorflow==1.14.0 \
@@ -0,0 +1,688 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Automated Machine Learning\n",
|
||||
"_**Regression with Aml Compute**_\n",
|
||||
"\n",
|
||||
"## Contents\n",
|
||||
"1. [Introduction](#Introduction)\n",
|
||||
"1. [Setup](#Setup)\n",
|
||||
"1. [Data](#Data)\n",
|
||||
"1. [Train](#Train)\n",
|
||||
"1. [Results](#Results)\n",
|
||||
"1. [Test](#Test)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"In this example we use the Hardware Performance Dataset to showcase how you can use AutoML for a simple regression problem. The regression goal is to predict the performance of certain combinations of hardware parts.\n",
|
||||
"After training AutoML models for this regression data set, we show how you can compute model explanations on your remote compute using a sample explainer script.\n",
|
||||
"\n",
|
||||
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
|
||||
"\n",
|
||||
"In this notebook you will learn how to:\n",
|
||||
"1. Create an `Experiment` in an existing `Workspace`.\n",
|
||||
"2. Instantiate AutoMLConfig with FeaturizationConfig for customization.\n",
|
||||
"3. Train the model using remote compute.\n",
|
||||
"4. Explore the results and featurization transparency options.\n",
|
||||
"5. Setup remote compute for computing the model explanations for a given AutoML model.\n",
|
||||
"6. Start an AzureML experiment on your remote compute.\n",
|
||||
"7. Submit model analysis, explain runs and counterfactual runs for a specific AutoML model.\n",
|
||||
"8. Download the feature importance for raw features and visualize the explanations for raw features on azure portal. \n",
|
||||
"10. Download counterfactual examples and view them in the notebook.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"As part of the setup you have already created an Azure ML `Workspace` object. For Automated ML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import logging\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"import azureml.core\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"import azureml.dataprep as dprep\n",
|
||||
"from azureml.automl.core.featurization import FeaturizationConfig\n",
|
||||
"from azureml.train.automl import AutoMLConfig\n",
|
||||
"from azureml.core.dataset import Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# Choose a name for the experiment.\n",
|
||||
"experiment_name = 'automl-regression-rai'\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace Name'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Experiment Name'] = experiment.name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create or Attach existing AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your cluster.\n",
|
||||
"amlcompute_cluster_name = \"hardware-rai\"\n",
|
||||
"\n",
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n",
|
||||
" max_nodes=4)\n",
|
||||
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set up Training and Test Data for the AutoML experiment\n",
"\n",
"Load the hardware dataset from a CSV file containing both training features and labels. The features are inputs to the model, while the training labels represent the expected output of the model. Next, we'll split the data using `random_split` and extract the training data for the model. We also register the datasets in your workspace by name so that they can be accessed from the remote compute."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv'\n",
"\n",
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
"\n",
"# Split the dataset into train and test datasets\n",
"train_data, test_data = dataset.random_split(percentage=0.8, seed=223)\n",
"\n",
"# Drop ModelName\n",
"train_data = train_data.drop_columns(['ModelName'])\n",
"test_data = test_data.drop_columns(['ModelName'])\n",
"\n",
"# Register the train dataset with your workspace\n",
"train_data.register(workspace = ws, name = 'rai_machine_train_dataset',\n",
"                    description = 'hardware performance training data',\n",
"                    create_new_version=True)\n",
"\n",
"# Register the test dataset with your workspace\n",
"test_data.register(workspace = ws, name = 'rai_machine_test_dataset', description = 'hardware performance test data', create_new_version=True)\n",
"\n",
"label = \"ERP\"\n",
"\n",
"train_data.to_pandas_dataframe().head()"
]
},
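{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check, the cell below is a minimal sketch comparing the row counts of the two splits; it assumes only the `train_data` and `test_data` objects created above and uses `to_pandas_dataframe()`, which is already used in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: the split above should be roughly 80/20.\n",
"train_rows = len(train_data.to_pandas_dataframe())\n",
"test_rows = len(test_data.to_pandas_dataframe())\n",
"print('train rows:', train_rows)\n",
"print('test rows:', test_rows)\n",
"print('train fraction: {:.2f}'.format(train_rows / (train_rows + test_rows)))"
]
},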
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification, regression or forecasting|\n",
"|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics: <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>|\n",
"|**experiment_timeout_hours**|Maximum amount of time in hours that all iterations combined can take before the experiment terminates.|\n",
"|**enable_early_stopping**|Flag to enable early termination if the score is not improving in the short term.|\n",
"|**featurization**|'auto' / 'off' / FeaturizationConfig. Indicator for whether the featurization step should be done automatically or not, or whether customized featurization should be used. Setting this enables AutoML to perform featurization on the input to handle *missing data* and to perform some common *feature extraction*. Note: if the input data is sparse, featurization cannot be turned on.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**training_data**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**label_column_name**|(sparse) array-like, shape = [n_samples, ], target values.|"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Customization\n",
"\n",
"Supported customization includes:\n",
"\n",
"1. Column purpose update: Override the feature type for the specified column.\n",
"2. Transformer parameter update: Update parameters for the specified transformer. Currently supports Imputer and HashOneHotEncoder.\n",
"3. Drop columns: Columns to drop from being featurized.\n",
"4. Block transformers: Allow/block transformers to be used during the featurization process."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a `FeaturizationConfig` object using API calls."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"sample-featurizationconfig-remarks2"
]
},
"outputs": [],
"source": [
"featurization_config = FeaturizationConfig()\n",
"featurization_config.blocked_transformers = ['LabelEncoder']\n",
"# featurization_config.drop_columns = ['MMIN']\n",
"featurization_config.add_column_purpose('MYCT', 'Numeric')\n",
"featurization_config.add_column_purpose('VendorName', 'CategoricalHash')\n",
"# The default imputation strategy is mean; override it for three columns.\n",
"featurization_config.add_transformer_params('Imputer', ['CACH'], {\"strategy\": \"median\"})\n",
"featurization_config.add_transformer_params('Imputer', ['CHMIN'], {\"strategy\": \"median\"})\n",
"featurization_config.add_transformer_params('Imputer', ['PRP'], {\"strategy\": \"most_frequent\"})\n",
"# featurization_config.add_transformer_params('HashOneHotEncoder', [], {\"number_of_bits\": 3})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"sample-featurizationconfig-remarks3"
]
},
"outputs": [],
"source": [
"automl_settings = {\n",
"    \"enable_early_stopping\": True,\n",
"    \"experiment_timeout_hours\": 0.25,\n",
"    \"max_concurrent_iterations\": 4,\n",
"    \"max_cores_per_iteration\": -1,\n",
"    \"n_cross_validations\": 5,\n",
"    \"primary_metric\": 'normalized_root_mean_squared_error',\n",
"    \"verbosity\": logging.INFO\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task='regression',\n",
"                             debug_log='automl_errors.log',\n",
"                             compute_target=compute_target,\n",
"                             featurization=featurization_config,\n",
"                             training_data=train_data,\n",
"                             label_column_name=label,\n",
"                             **automl_settings)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass it the `AutoMLConfig`. Because the run executes on remote compute, `submit` returns once the run is queued; depending on the data and the number of iterations, the run itself can take a while.\n",
"In this example, we specify `show_output=False` to suppress output for each iteration. You can monitor the run by clicking on the link in the output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To retrieve a previous run instead of submitting a new one, uncomment the cell below and update the `run_id`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#from azureml.train.automl.run import AutoMLRun\n",
"#remote_run = AutoMLRun(experiment=experiment, run_id='AutoML_1723d4fe-c33d-41f7-83ad-c010215583b0')\n",
"#remote_run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.wait_for_completion(wait_post_processing=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generating Responsible AI insights for the AutoML model\n",
"This section walks you through the workflow to compute Responsible AI insights, such as model explanations and counterfactual examples, for an AutoML model on your remote compute using the model analysis workflow.\n",
"\n",
"### Retrieve any AutoML model for explanations\n",
"\n",
"Below we select an AutoML pipeline from our iterations. The `get_best_child` method returns the AutoML run with the best score for the specified metric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_run = remote_run.get_best_child(metric='mean_absolute_error')"
]
},
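{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional, hedged sketch: `get_metrics()` is a standard AzureML `Run` method, so the metrics recorded on the selected child run can be inspected before generating insights. Nothing below is specific to Responsible AI; it assumes only the `automl_run` object from the previous cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: print the metrics recorded on the selected child run.\n",
"for metric_name, metric_value in automl_run.get_metrics().items():\n",
"    print(metric_name, ':', metric_value)"
]
},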
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set up model analysis on the remote compute\n",
"The following section provides details on how to set up an AzureML experiment that runs model analysis for an AutoML model on your remote compute."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create the conda configuration for the model analysis and explanation runs from the `automl_run` object."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfiguration object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"\n",
"# specify CondaDependencies obj\n",
"conda_run_config.environment.python.conda_dependencies = automl_run.get_environment().python.conda_dependencies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Register the AutoML model and create a `PickleModelLoader` so that the model analysis can instantiate the model downloaded from AzureML."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Model\n",
"from azureml.responsibleai.common.pickle_model_loader import PickleModelLoader\n",
"from azureml.responsibleai.tools.model_analysis.model_analysis_config import ModelAnalysisConfig\n",
"from azureml.responsibleai.tools.model_analysis.explain_config import ExplainConfig\n",
"from azureml.automl.core.shared.constants import MODEL_PATH\n",
"\n",
"automl_run.download_file(name=MODEL_PATH, output_file_path='model.pkl')\n",
"\n",
"model = automl_run.register_model(model_name='automl_rai',\n",
"                                  model_path='outputs/model.pkl')\n",
"\n",
"model_loader = PickleModelLoader('model.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Construct the list of feature column names by dropping the label column from the list of all column names."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_column_names = train_data.to_pandas_dataframe().columns.values\n",
"X_column_names = X_column_names[X_column_names != label]\n",
"X_column_names"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the train and test datasets for the model analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dataset = Dataset.get_by_name(workspace=ws, name='rai_machine_train_dataset')\n",
"test_dataset = Dataset.get_by_name(workspace=ws, name='rai_machine_test_dataset')"
]
},
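{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional check, assuming only the two dataset objects just retrieved: print their registered names and versions to confirm the right snapshots are being analyzed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: confirm which registered dataset versions will be analyzed.\n",
"print(train_dataset.name, train_dataset.version)\n",
"print(test_dataset.name, test_dataset.version)"
]
},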
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the `ModelAnalysisConfig` below, `confidential_datastore_name` is the name of the datastore to which the analyses will be uploaded. This example uses the default datastore because the dataset is also in the default datastore. If your dataset contains confidential data, you should specify a different datastore as the `confidential_datastore_name`, because the analysis makes a copy of the data in this datastore."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"categorical_features = ['VendorName']\n",
"\n",
"model_analysis_config = ModelAnalysisConfig(\n",
"    title=\"Model analysis\",\n",
"    model=model,\n",
"    model_type='regression',\n",
"    model_loader=model_loader,\n",
"    train_dataset=train_dataset,\n",
"    test_dataset=test_dataset,\n",
"    X_column_names=X_column_names,\n",
"    target_column_name=label,\n",
"    confidential_datastore_name=ws.get_default_datastore().name,\n",
"    run_configuration=conda_run_config,\n",
"    categorical_column_names=categorical_features\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run model analysis\n",
"\n",
"The model analysis run takes a snapshot of the data in preparation for model explanation, error analysis, causal analysis, and counterfactual generation.\n",
"The model analysis run is the parent run for the model explanation, error analysis, causal, and counterfactual runs.\n",
"In this example we will generate only an explanation and counterfactuals, but causal and error analyses may be performed as well."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_analysis_run = experiment.submit(model_analysis_config)\n",
"model_analysis_run.wait_for_completion(raise_on_error=True, wait_post_processing=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute explanations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run model explanation based on the model analysis.\n",
"The explanation run is a child run of the model analysis run.\n",
"In the future, the `add_request` method will allow extra parameters to configure the explanation generated."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"explain_config = ExplainConfig(model_analysis_run, conda_run_config)\n",
"explain_config.add_request()\n",
"explain_run = model_analysis_run.submit_child(explain_config)\n",
"explain_run.wait_for_completion(raise_on_error=True, wait_post_processing=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `explanation_manager.list` method below returns a list of metadata dictionaries, one per explain run. In this case there is a single explain run, so the list contains a single dictionary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"explanations = model_analysis_run.explanation_manager.list()\n",
"explanation = explanations[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Feature importance and visualizing the explanation dashboard\n",
"In this section we describe how you can download the explanation results from the explanation experiment and visualize the feature importance for your AutoML model in the Azure portal."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"feature_explanations = model_analysis_run.explanation_manager.download_by_id(explanation['id'])\n",
"print(feature_explanations.get_feature_importance_dict())\n",
"print(\"You can visualize the explanations for your features under the 'Explanations (preview)' tab in the explain run at:\\n\" + explain_run.get_portal_url())"
]
},
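{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sketch, the importance dictionary printed above can be sorted into a small table for easier reading. This assumes only pandas and the `feature_explanations` object from the previous cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: render the feature importances as a sorted DataFrame.\n",
"importances = feature_explanations.get_feature_importance_dict()\n",
"importance_df = pd.DataFrame(sorted(importances.items(), key=lambda kv: kv[1], reverse=True),\n",
"                             columns=['feature', 'importance'])\n",
"importance_df"
]
},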
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate counterfactual examples\n",
"\n",
"Generate counterfactuals for all the samples in the `test_dataset` based on the model analysis.\n",
"The counterfactual run is a child run of the model analysis run.\n",
"In the future, the `add_request` method will allow extra parameters to configure the counterfactuals generated."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.responsibleai.tools.model_analysis.counterfactual_config import CounterfactualConfig\n",
"\n",
"cf_config = CounterfactualConfig(model_analysis_run, conda_run_config)\n",
"cf_config.add_request(total_CFs=10, desired_range=[10, 300], feature_importance=False)\n",
"cf_run = model_analysis_run.submit_child(cf_config)\n",
"cf_run.wait_for_completion(raise_on_error=True, wait_post_processing=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Downloading counterfactual examples\n",
"The `counterfactual_manager.list` method below returns a list of metadata dictionaries, one per counterfactual run. In this case there is a single counterfactual run, so the list contains a single dictionary.\n",
"\n",
"The `download_by_id()` method available in the `counterfactual_manager` can be used to download the counterfactual examples."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cf_meta = model_analysis_run.counterfactual_manager.list()\n",
"counterfactual_object = model_analysis_run.counterfactual_manager.download_by_id(cf_meta[0]['id'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualizing the generated counterfactuals\n",
"You can use the `visualize_as_dataframe()` method to view the generated counterfactual examples for the samples in `test_dataset`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"counterfactual_object.visualize_as_dataframe(show_only_changes=True)"
]
}
],
"metadata": {
"authors": [
{
"name": "jeffshep"
}
],
"categories": [
"how-to-use-azureml",
"automated-machine-learning"
],
"category": "tutorial",
"compute": [
"AML"
],
"datasets": [
"MachineData"
],
"deployment": [
"ACI"
],
"exclude_from_index": false,
"framework": [
"None"
],
"friendly_name": "Automated ML run with featurization and model explainability.",
"index_order": 5,
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12"
},
"tags": [
"featurization",
"explainability",
"remote_run",
"AutomatedML"
],
"task": "Regression"
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -0,0 +1,5 @@
name: auto-ml-regression-responsibleai
dependencies:
- pip:
  - azureml-sdk
  - azureml-responsibleai
@@ -100,7 +100,7 @@
"\n",
"# Check core SDK version number\n",
"\n",
"print(\"This notebook was created using SDK version 1.33.0, you are currently running version\", azureml.core.VERSION)"
"print(\"This notebook was created using SDK version 1.34.0, you are currently running version\", azureml.core.VERSION)"
]
},
{
1
index.md
@@ -27,6 +27,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb) | Classification | Creditcard | AML Compute | None | None | remote_run, AutomatedML |
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/experimental/classification-credit-card-fraud-local-managed/auto-ml-classification-credit-card-fraud-local-managed.ipynb) | Classification | Creditcard | AML Compute | None | None | AutomatedML |
| [Automated ML run with featurization and model explainability.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb) | Regression | MachineData | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
| [Automated ML run with featurization and model explainability.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/responsible-ai/auto-ml-regresion-responsibleai/auto-ml-regression-responsibleai.ipynb) | Regression | MachineData | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
| :star:[Azure Machine Learning Pipeline with DataTranferStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb) | Demonstrates the use of DataTransferStep | Custom | ADF | None | Azure ML | None |
| [Getting Started with Azure Machine Learning Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb) | Getting Started notebook for AML Pipelines | Custom | AML Compute | None | Azure ML | None |
| [Azure Machine Learning Pipeline with AzureBatchStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-azurebatch-to-run-a-windows-executable.ipynb) | Demonstrates the use of AzureBatchStep | Custom | Azure Batch | None | Azure ML | None |
@@ -102,7 +102,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.33.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.34.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},