Compare commits

...

11 Commits

Author SHA1 Message Date
amlrelsa-ms
bb11c80b1b update samples from Release-193 as a part of 1.53.0 SDK stable release 2023-08-23 03:24:03 +00:00
Diondra Peck
d0961b98bf Add disclaimer to README 2023-06-28 15:47:49 -07:00
Paul Shealy
302589b7f9 Merge pull request #1915 from Azure/release_update_stablev2/Release-171
Release update stablev2/release 171 for SDK 1.51.0
2023-06-07 19:19:33 -07:00
amlrelsa-ms
cc85949d6d update samples from Release-171 as a part of 1.51 SDK stable release 2023-06-06 21:58:24 +05:30
amlrelsa-sa
3a1824e3ad update samples from Release-170 as a part of 1.51 SDK stable release 2023-06-06 10:50:33 +05:30
Paul Shealy
579643326d Merge pull request #1911 from diondrapeck/add-deprecation-disclaimer
Add repository deprecation disclaimer and pointer to v2 repo
2023-05-25 08:04:29 -07:00
Diondra Peck
14f76f227e Add deprecation disclaimer 2023-05-23 12:48:14 -07:00
Paul Shealy
25baf5203a Merge pull request #1899 from Azure/release_update/Release-177
update samples from Release-177 as a part of  SDK release
2023-04-17 13:01:27 -07:00
amlrelsa-ms
1178fcb0ba update samples from Release-177 as a part of SDK release 2023-04-17 10:22:59 +00:00
Sasidhar Kasturi
e4d84c8e45 update samples from Release-169 as a part of 1.50.0 SDK stable release (#1898)
Co-authored-by: amlrelsa-ms <amlrelsa@microsoft.com>
2023-04-14 10:39:38 -04:00
Harneet Virk
7a3ab1e44c Merge pull request #1895 from Azure/release_update/Release-175
update samples from Release-175 as a part of  SDK release
2023-03-28 10:17:27 -07:00
118 changed files with 1661 additions and 2707 deletions

View File

@@ -103,7 +103,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -329,7 +329,7 @@
" print(\"Creating new gpu-cluster\")\n",
" \n",
" # Specify the configuration for the new cluster\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"Standard_NC6s_v3\",\n",
" min_nodes=0,\n",
" max_nodes=4)\n",
" # Create the cluster with the specified name and configuration\n",

View File

@@ -174,7 +174,7 @@
"else:\n",
" print(\"creating new cluster\")\n",
" # vm_size parameter below could be modified to one of the RAPIDS-supported VM types\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"Standard_NC6s_v2\", min_nodes=1, max_nodes = 1)\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"Standard_NC6s_v3\", min_nodes=1, max_nodes = 1)\n",
"\n",
" # create the cluster\n",
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, provisioning_config)\n",

View File

@@ -3,10 +3,10 @@ dependencies:
- pip:
- azureml-sdk
- azureml-contrib-fairness
- fairlearn>=0.6.2
- fairlearn>=0.6.2,<=0.7.0
- joblib
- liac-arff
- raiwidgets~=0.24.0
- raiwidgets~=0.28.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- protobuf==3.20.0

View File

@@ -3,10 +3,10 @@ dependencies:
- pip:
- azureml-sdk
- azureml-contrib-fairness
- fairlearn>=0.6.2
- fairlearn>=0.6.2,<=0.7.0
- joblib
- liac-arff
- raiwidgets~=0.24.0
- raiwidgets~=0.28.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- protobuf==3.20.0

View File

@@ -7,19 +7,20 @@ dependencies:
# The python interpreter version.
# Azure ML only supports 3.7.0 and later.
- pip==22.3.1
- python>=3.7,<3.9
- python>=3.8,<3.9
- holidays==0.10.3
- conda-forge::fbprophet==0.7.1
- pandas==1.1.5
- scipy==1.5.3
- Cython==0.29.14
- tqdm==4.64.1
- tqdm==4.65.0
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.49.0
- azureml-defaults~=1.49.0
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.49.0/validated_win32_requirements.txt [--no-deps]
- matplotlib==3.6.2
- azureml-widgets~=1.53.0
- azureml-defaults~=1.53.0
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.53.0/validated_win32_requirements.txt [--no-deps]
- matplotlib==3.7.1
- xgboost==1.3.3
- cmdstanpy==0.9.5
- setuptools-git==1.2

View File

@@ -7,13 +7,13 @@ dependencies:
# The python interpreter version.
# Azure ML only supports 3.7 and later.
- pip==22.3.1
- python>=3.7,<3.9
- matplotlib==3.2.1
- python>=3.8,<3.9
- matplotlib==3.7.1
- numpy>=1.21.6,<=1.22.3
- cython==0.29.14
- urllib3==1.26.7
- scipy>=1.4.1,<=1.5.3
- scikit-learn==0.22.1
- scikit-learn==1.1.0
- py-xgboost<=1.3.3
- holidays==0.10.3
- conda-forge::fbprophet==0.7.1
@@ -23,10 +23,10 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.49.0
- azureml-defaults~=1.49.0
- azureml-widgets~=1.53.0
- azureml-defaults~=1.53.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.49.0/validated_linux_requirements.txt [--no-deps]
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.53.0/validated_linux_requirements.txt [--no-deps]

View File

@@ -7,26 +7,22 @@ dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.7 and later.
- pip==22.3.1
- python>=3.7,<3.9
- matplotlib==3.2.1
- python>=3.8,<3.9
- numpy>=1.21.6,<=1.22.3
- cython==0.29.14
- urllib3==1.26.7
- scipy>=1.4.1,<=1.5.3
- scikit-learn==0.22.1
- py-xgboost<=1.3.3
- scikit-learn==1.1.0
- holidays==0.10.3
- conda-forge::fbprophet==0.7.1
- pytorch::pytorch=1.11.0
- cudatoolkit=9.0
- notebook
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.49.0
- azureml-defaults~=1.49.0
- azureml-widgets~=1.53.0
- azureml-defaults~=1.53.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- fbprophet==0.7.1
- xgboost==1.3.3
- matplotlib==3.7.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.49.0/validated_darwin_requirements.txt [--no-deps]
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.53.0/validated_darwin_requirements.txt [--no-deps]

View File

@@ -1,5 +1,21 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -712,7 +728,9 @@
"from azureml.core.model import Model\n",
"from azureml.core.environment import Environment\n",
"\n",
"inference_config = InferenceConfig(entry_script=script_file_name)\n",
"inference_config = InferenceConfig(\n",
" environment=best_run.get_environment(), entry_script=script_file_name\n",
")\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(\n",
" cpu_cores=2,\n",
@@ -828,9 +846,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"%matplotlib notebook\n",

View File

@@ -1,5 +1,21 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -1,5 +1,21 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -139,8 +155,8 @@
" print(\"Found existing cluster, use it.\")\n",
"except ComputeTargetException:\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=\"STANDARD_NC6\", # CPU for BiLSTM, such as \"STANDARD_D2_V2\"\n",
" # To use BERT (this is recommended for best performance), select a GPU such as \"STANDARD_NC6\"\n",
" vm_size=\"Standard_NC6s_v3\", # CPU for BiLSTM, such as \"STANDARD_D2_V2\"\n",
" # To use BERT (this is recommended for best performance), select a GPU such as \"Standard_NC6s_v3\"\n",
" # or similar GPU option\n",
" # available in your workspace\n",
" idle_seconds_before_scaledown=60,\n",
@@ -336,7 +352,7 @@
"metadata": {},
"source": [
"For local inferencing, you can load the model locally via. the method `remote_run.get_output()`. For more information on the arguments expected by this method, you can run `remote_run.get_output??`.\n",
"Note that when the model contains BERT, this step will require pytorch and pytorch-transformers installed in your local environment. The exact versions of these packages can be found in the **automl_env.yml** file located in the local copy of your azureml-examples folder here: \"azureml-examples/python-sdk/tutorials/automl-with-azureml\""
"Note that when the model contains BERT, this step will require pytorch and pytorch-transformers installed in your local environment. The exact versions of these packages can be found in the **automl_env.yml** file located in the local copy of your MachineLearningNotebooks folder here: \"MachineLearningNotebooks\\how-to-use-azureml\\automated-machine-learning\""
]
},
{

View File

@@ -1,3 +1,4 @@
import json
import pandas as pd
from azureml.core import Environment, ScriptRunConfig
from azureml.core.run import Run
@@ -13,7 +14,16 @@ def run_inference(
model_name,
):
inference_env = train_run.get_environment()
try:
inference_env = train_run.get_environment()
except BaseException:
run_details = train_run.get_details()
run_def = run_details.get("runDefinition")
env = run_def.get("environment")
if env is None:
raise
json.dump(env, open("azureml_environment.json", "w"))
inference_env = Environment.load_from_directory(".")
est = ScriptRunConfig(
source_directory=script_folder,

View File

@@ -3,7 +3,7 @@ import argparse
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import joblib
from azureml.automl.runtime.shared.score import scoring, constants
from azureml.core import Run, Dataset

View File

@@ -1,5 +1,21 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -31,7 +31,7 @@ try:
model = Model(ws, args.model_name)
last_train_time = model.created_time
print("Model was last trained on {0}.".format(last_train_time))
except Exception as e:
except Exception:
print("Could not get last model train time.")
last_train_time = datetime.min.replace(tzinfo=pytz.UTC)

View File

@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -454,10 +454,13 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note:** Not all datasets produce a y_transformer. The dataset used in the current notebook requires a transformer as the y column data is categorical."
"**Note:** Not all datasets produce a y_transformer. The dataset used in the current notebook requires a transformer as the y column data is categorical. \n",
"\n",
"We will go ahead and download the mlflow transformer model and use it to transform test data that can be used for further experimentation below. To run the commented code, make sure the environment requirement is satisfied. You can go ahead and create the environment from the `conda.yaml` file under `/outputs/featurization/pipeline/` and run the given code in it."
]
},
{
@@ -466,7 +469,7 @@
"metadata": {},
"outputs": [],
"source": [
"from azureml.automl.core.shared.constants import Transformers\n",
"''' from azureml.automl.core.shared.constants import Transformers\n",
"\n",
"transformers = mlflow.sklearn.load_model(uri) # Using method 1\n",
"data_transformers = transformers.get_transformers()\n",
@@ -474,14 +477,15 @@
"y_transformer = data_transformers[Transformers.Y_TRANSFORMER]\n",
"\n",
"X_test = x_transformer.transform(X_test_data)\n",
"y_test = y_transformer.transform(y_test_data)"
"y_test = y_transformer.transform(y_test_data) '''"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the following cell to see the featurization summary of X and y transformers. "
"Run the following cell to see the featurization summary of X and y transformers. Uncomment to use. "
]
},
{
@@ -490,10 +494,10 @@
"metadata": {},
"outputs": [],
"source": [
"X_data_summary = x_transformer.get_featurization_summary(is_user_friendly=False)\n",
"''' X_data_summary = x_transformer.get_featurization_summary(is_user_friendly=False)\n",
"\n",
"summary_df = pd.DataFrame.from_records(X_data_summary)\n",
"summary_df"
"summary_df '''"
]
},
{
@@ -544,10 +548,11 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Another way to load the data is to go to the above autofeaturization experiment and check for the featurized dataset ids under `Output datasets`. Uncomment and replace them accordingly below to use."
"Another way to load the data is to go to the above autofeaturization experiment and check for the featurized dataset ids under `Output datasets`. Uncomment and replace them accordingly below, to use."
]
},
{
@@ -597,10 +602,20 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Here we are passing our training data to the lightgbm classifier, any custom model can be used with your data."
"Here we are passing our training data to the lightgbm classifier, any custom model can be used with your data. Let us first install lightgbm."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install lightgbm"
]
},
{
@@ -612,11 +627,27 @@
"import lightgbm as lgb\n",
"\n",
"model = lgb.LGBMClassifier(learning_rate=0.08,max_depth=-5,random_state=42)\n",
"model.fit(X_train, y_train, sample_weight=sample_weight, eval_set=[(X_test, y_test),(X_train, y_train)],\n",
" verbose=20,eval_metric='logloss')\n",
"\n",
"model.fit(X_train, y_train, sample_weight=sample_weight)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Once training is done, the test data obtained after transforming from the above downloaded transformer can be used to calculate the accuracy "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Training accuracy {:.4f}'.format(model.score(X_train, y_train)))\n",
"print('Testing accuracy {:.4f}'.format(model.score(X_test, y_test)))"
"\n",
"# Uncomment below to test the model on test data \n",
"# print('Testing accuracy {:.4f}'.format(model.score(X_test, y_test)))"
]
},
{
@@ -654,45 +685,8 @@
"metadata": {},
"outputs": [],
"source": [
"y_pred = model.predict(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import confusion_matrix\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import itertools\n",
"\n",
"cf =confusion_matrix(y_test,y_pred)\n",
"plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')\n",
"plt.colorbar()\n",
"plt.title('Confusion Matrix')\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('Actual')\n",
"class_labels = ['False','True']\n",
"tick_marks = np.arange(len(class_labels))\n",
"plt.xticks(tick_marks,class_labels)\n",
"plt.yticks([-0.5,0,1,1.5],['','False','True',''])\n",
"# plotting text value inside cells\n",
"thresh = cf.max() / 2.\n",
"for i,j in itertools.product(range(cf.shape[0]),range(cf.shape[1])):\n",
" plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')\n",
"plt.show()"
"# Uncomment below to test the model on test data\n",
"# y_pred = model.predict(X_test)"
]
},
{

View File

@@ -92,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -91,7 +91,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -13,7 +13,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.png)"
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.png)"
]
},
{

View File

@@ -7,7 +7,7 @@
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License.\n",
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/automl-forecasting-function.png)"
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/auto-ml-forecasting-backtest-single-model.png)"
]
},
{

View File

@@ -16,6 +16,13 @@
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-bike-share)).</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -61,7 +68,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1680248038565
}
},
"outputs": [],
"source": [
"import json\n",
@@ -170,25 +181,6 @@
"source": [
"## Data\n",
"\n",
"The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with the storage account, which contains the default data store. We will use it to upload the bike share data and create [tabular dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"datastore = ws.get_default_datastore()\n",
"datastore.upload_files(\n",
" files=[\"./bike-no.csv\"], target_path=\"dataset/\", overwrite=True, show_progress=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's set up what we know about the dataset. \n",
"\n",
"**Target column** is what we want to forecast.\n",
@@ -206,25 +198,50 @@
"time_column_name = \"date\""
]
},
{
"cell_type": "markdown",
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
},
"source": [
"You are now ready to load the historical bike share data. We will load the CSV file into a plain pandas DataFrame."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"dataset = Dataset.Tabular.from_delimited_files(\n",
" path=[(datastore, \"dataset/bike-no.csv\")]\n",
").with_timestamp_columns(fine_grain_timestamp=time_column_name)\n",
"all_data = pd.read_csv(\"bike-no.csv\", parse_dates=[time_column_name])\n",
"\n",
"# Drop the columns 'casual' and 'registered' as these columns are a breakdown of the total and therefore a leak.\n",
"dataset = dataset.drop_columns(columns=[\"casual\", \"registered\"])\n",
"\n",
"dataset.take(5).to_pandas_dataframe().reset_index(drop=True)"
"all_data.drop([\"casual\", \"registered\"], axis=1, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
},
"source": [
"### Split the data\n",
"\n",
@@ -234,22 +251,63 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1680247376789
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"# select data that occurs before a specified date\n",
"train = dataset.time_before(datetime(2012, 8, 31), include_boundary=True)\n",
"train.to_pandas_dataframe().tail(5).reset_index(drop=True)"
"train = all_data[all_data[time_column_name] <= pd.Timestamp(\"2012-08-31\")].copy()\n",
"test = all_data[all_data[time_column_name] >= pd.Timestamp(\"2012-09-01\")].copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload data to datastore\n",
"\n",
"The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with the storage account, which contains the default data store. We will use it to upload the bike share data and create [tabular dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"test = dataset.time_after(datetime(2012, 9, 1), include_boundary=True)\n",
"test.to_pandas_dataframe().head(5).reset_index(drop=True)"
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
"\n",
"datastore = ws.get_default_datastore()\n",
"\n",
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
" train, target=(datastore, \"dataset/\"), name=\"bike_no_train\"\n",
")\n",
"\n",
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
" test, target=(datastore, \"dataset/\"), name=\"bike_no_test\"\n",
")"
]
},
{
@@ -360,7 +418,7 @@
" featurization=featurization_config,\n",
" blocked_models=[\"ExtremeRandomTrees\"],\n",
" experiment_timeout_hours=0.3,\n",
" training_data=train,\n",
" training_data=train_dataset,\n",
" label_column_name=target_column_name,\n",
" compute_target=compute_target,\n",
" enable_early_stopping=True,\n",
@@ -546,7 +604,7 @@
"from run_forecast import run_rolling_forecast\n",
"\n",
"remote_run = run_rolling_forecast(\n",
" test_experiment, compute_target, best_run, test, target_column_name\n",
" test_experiment, compute_target, best_run, test_dataset, target_column_name\n",
")\n",
"remote_run"
]
@@ -722,6 +780,9 @@
],
"friendly_name": "Forecasting BikeShare Demand",
"index_order": 1,
"kernel_info": {
"name": "python38-azureml"
},
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
@@ -737,11 +798,19 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
"version": "3.8.10"
},
"microsoft": {
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"mimetype": "text/x-python",
"name": "python",
"npconvert_exporter": "python",
"nteract": {
"version": "nteract-front-end@1.0.0"
},
"pygments_lexer": "ipython3",
"tags": [
"Forecasting"

View File

@@ -1,6 +1,6 @@
import argparse
from azureml.core import Dataset, Run
from sklearn.externals import joblib
import joblib
parser = argparse.ArgumentParser()
parser.add_argument(

View File

@@ -16,6 +16,13 @@
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-energy-demand/automl-forecasting-task-energy-demand-advanced-mlflow.ipynb)).</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -260,8 +267,12 @@
"outputs": [],
"source": [
"# split into train based on time\n",
"train = dataset.time_before(datetime(2017, 8, 8, 5), include_boundary=True)\n",
"train.to_pandas_dataframe().reset_index(drop=True).sort_values(time_column_name).tail(5)"
"train = (\n",
" dataset.time_before(datetime(2017, 8, 8, 5), include_boundary=True)\n",
" .to_pandas_dataframe()\n",
" .reset_index(drop=True)\n",
")\n",
"train.sort_values(time_column_name).tail(5)"
]
},
{
@@ -271,8 +282,39 @@
"outputs": [],
"source": [
"# split into test based on time\n",
"test = dataset.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))\n",
"test.to_pandas_dataframe().reset_index(drop=True).head(5)"
"test = (\n",
" dataset.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))\n",
" .to_pandas_dataframe()\n",
" .reset_index(drop=True)\n",
")\n",
"test.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"jupyter": {
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"# register the splitted train and test data in workspace storage\n",
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
"\n",
"datastore = ws.get_default_datastore()\n",
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
" train, target=(datastore, \"dataset/\"), name=\"nyc_energy_train\"\n",
")\n",
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
" test, target=(datastore, \"dataset/\"), name=\"nyc_energy_test\"\n",
")"
]
},
{
@@ -361,7 +403,7 @@
" primary_metric=\"normalized_root_mean_squared_error\",\n",
" blocked_models=[\"ExtremeRandomTrees\", \"AutoArima\", \"Prophet\"],\n",
" experiment_timeout_hours=0.3,\n",
" training_data=train,\n",
" training_data=train_dataset,\n",
" label_column_name=target_column_name,\n",
" compute_target=compute_target,\n",
" enable_early_stopping=True,\n",
@@ -521,7 +563,7 @@
" test_experiment=test_experiment,\n",
" compute_target=compute_target,\n",
" train_run=best_run,\n",
" test_dataset=test,\n",
" test_dataset=test_dataset,\n",
" target_column_name=target_column_name,\n",
")\n",
"remote_run_infer.wait_for_completion(show_output=False)\n",
@@ -627,7 +669,7 @@
" \"Prophet\",\n",
" ], # These models are blocked for tutorial purposes, remove this for real use cases.\n",
" experiment_timeout_hours=0.3,\n",
" training_data=train,\n",
" training_data=train_dataset,\n",
" label_column_name=target_column_name,\n",
" compute_target=compute_target,\n",
" enable_early_stopping=True,\n",
@@ -698,7 +740,7 @@
" test_experiment=test_experiment_advanced,\n",
" compute_target=compute_target,\n",
" train_run=best_run_lags,\n",
" test_dataset=test,\n",
" test_dataset=test_dataset,\n",
" target_column_name=target_column_name,\n",
" inference_folder=\"./forecast_advanced\",\n",
")\n",
@@ -766,6 +808,9 @@
"how-to-use-azureml",
"automated-machine-learning"
],
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
@@ -781,7 +826,15 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.10"
},
"microsoft": {
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"nteract": {
"version": "nteract-front-end@1.0.0"
},
"vscode": {
"interpreter": {
@@ -790,5 +843,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View File

@@ -6,7 +6,7 @@ compute instance.
import argparse
from azureml.core import Dataset, Run
from sklearn.externals import joblib
import joblib
from pandas.tseries.frequencies import to_offset
parser = argparse.ArgumentParser()

View File

@@ -19,7 +19,14 @@
"hidePrompt": false
},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.png)"
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-github-dau/auto-ml-forecasting-github-dau.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-github-dau)).</font>"
]
},
{
@@ -382,7 +389,7 @@
"automl_config = AutoMLConfig(\n",
" task=\"forecasting\",\n",
" primary_metric=\"normalized_root_mean_squared_error\",\n",
" experiment_timeout_hours=1,\n",
" experiment_timeout_hours=1.5,\n",
" training_data=train_dataset,\n",
" label_column_name=target_column_name,\n",
" validation_data=valid_dataset,\n",
@@ -695,7 +702,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.8.10"
}
},
"nbformat": 4,

View File

@@ -4,7 +4,7 @@ import os
import numpy as np
import pandas as pd
from sklearn.externals import joblib
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
from azureml.automl.runtime.shared.score import scoring, constants

View File

@@ -16,6 +16,13 @@
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-hierarchical-timeseries-in-pipeline)).</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -666,7 +673,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.8.10"
}
},
"nbformat": 4,

View File

@@ -16,6 +16,13 @@
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-many-models-in-pipeline)).</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -306,7 +313,7 @@
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"\n",
"# Name your cluster\n",
"compute_name = \"mm-compute\"\n",
"compute_name = \"mm-compute-v1\"\n",
"\n",
"\n",
"if compute_name in ws.compute_targets:\n",
@@ -316,7 +323,7 @@
"else:\n",
" print(\"Creating a new compute target...\")\n",
" provisioning_config = AmlCompute.provisioning_configuration(\n",
" vm_size=\"STANDARD_D16S_V3\", max_nodes=20\n",
" vm_size=\"STANDARD_D14_V2\", max_nodes=20\n",
" )\n",
" # Create the compute target\n",
" compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)\n",
@@ -878,7 +885,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.10"
},
"vscode": {
"interpreter": {

View File

@@ -16,6 +16,13 @@
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales)).</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -242,7 +249,9 @@
" time_series_id_column_names, group_keys=False\n",
" )\n",
" df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-n])\n",
" df_head.reset_index(inplace=True, drop=True)\n",
" df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-n:])\n",
" df_tail.reset_index(inplace=True, drop=True)\n",
" return df_head, df_tail\n",
"\n",
"\n",
@@ -715,7 +724,7 @@
" description=\"Automl forecasting sample service\",\n",
")\n",
"\n",
"aci_service_name = \"automl-oj-forecast-01\"\n",
"aci_service_name = \"automl-oj-forecast-03\"\n",
"print(aci_service_name)\n",
"aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)\n",
"aci_service.wait_for_deployment(True)\n",
@@ -792,7 +801,7 @@
"metadata": {},
"outputs": [],
"source": [
"serv = Webservice(ws, \"automl-oj-forecast-01\")\n",
"serv = Webservice(ws, \"automl-oj-forecast-03\")\n",
"serv.delete() # don't do it accidentally"
]
}
@@ -835,7 +844,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.10"
},
"tags": [
"None"

View File

@@ -6,7 +6,7 @@ compute instance.
import argparse
from azureml.core import Dataset, Run
from sklearn.externals import joblib
import joblib
from pandas.tseries.frequencies import to_offset
parser = argparse.ArgumentParser()

View File

@@ -1,5 +1,21 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1h_automl_in_pipeline/automl-forecasting-in-pipeline)).</font>\n",
"</br>\n",
"</br>\n",
"<font color=\"red\" size=\"5\">\n",
"For examples illustrating how to build pipelines with components, please use the following links:</font>\n",
"<ul>\n",
" <li><a href=\"https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-many-models-in-pipeline\">Many Models</a></li>\n",
" <li><a href=\"https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-hierarchical-timeseries-in-pipeline\">Hierarchical Time Series</a></li>\n",
" <li><a href=\"https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-distributed-tcn\">Distributed TCN</a></li>\n",
"</ul>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -555,40 +571,15 @@
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Model\n",
"from azureml.train.automl.run import AutoMLRun\n",
"\n",
"model = Model(ws, model_name_str)\n",
"download_path = model.download(model_name_str, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After all the files are downloaded, we can generate the run config for inference runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment, RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"for step in training_pipeline_run.get_steps():\n",
" if step.properties.get(\"StepType\") == \"AutoMLStep\":\n",
" automl_run = AutoMLRun(experiment, step.id)\n",
" break\n",
"\n",
"env_file = os.path.join(download_path, \"conda_env_v_1_0_0.yml\")\n",
"inference_env = Environment(\"oj-inference-env\")\n",
"inference_env.python.conda_dependencies = CondaDependencies(\n",
" conda_dependencies_file_path=env_file\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Optional] The enviroment can also be assessed from the training run using `get_environment()` API."
"best_run = automl_run.get_best_child()\n",
"inference_env = best_run.get_environment()"
]
},
{

View File

@@ -6,7 +6,7 @@ import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset
from sklearn.externals import joblib
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
from azureml.data.dataset_factory import TabularDatasetFactory
@@ -30,7 +30,7 @@ def infer_forecasting_dataset_tcn(
run = Run.get_context()
registered_train = TabularDatasetFactory.register_pandas_dataframe(
TabularDatasetFactory.register_pandas_dataframe(
df_all,
target=(
run.experiment.workspace.get_default_datastore(),

View File

@@ -20,7 +20,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we will explore the univaraite time-series data to determine the settings for an automated ML experiment. We will follow the thought process depicted in the following diagram:<br/>\n",
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate)).</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we will explore the univariate time-series data to determine the settings for an automated ML experiment. We will follow the thought process depicted in the following diagram:<br/>\n",
"![Forecasting after training](figures/univariate_settings_map_20210408.jpg)\n",
"\n",
"The objective is to answer the following questions:\n",
@@ -32,11 +39,11 @@
" </ul>\n",
" <li>Is the data stationary? </li>\n",
" <ul style=\"margin-top:-1px; list-style-type:none\"> \n",
" <li> Importance: In the absense of features that capture trend behavior, ML models (regression and tree based) are not well equiped to predict stochastic trends. Working with stationary data solves this problem. </li>\n",
" <li> Importance: In the absence of features that capture trend behavior, ML models (regression and tree based) are not well equipped to predict stochastic trends. Working with stationary data solves this problem. </li>\n",
" </ul>\n",
" <li>Is there a detectable auto-regressive pattern in the stationary data? </li>\n",
" <ul style=\"margin-top:-1px; list-style-type:none\"> \n",
" <li> Importance: The accuracy of ML models can be improved if serial correlation is modeled by including lags of the dependent/target varaible as features. Including target lags in every experiment by default will result in a regression in accuracy scores if such setting is not warranted. </li>\n",
" <li> Importance: The accuracy of ML models can be improved if serial correlation is modeled by including lags of the dependent/target variable as features. Including target lags in every experiment by default will result in a regression in accuracy scores if such setting is not warranted. </li>\n",
" </ul>\n",
"</ol>\n",
"\n",
@@ -109,7 +116,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The graph plots the alcohol sales in the United States. Because the data is trending, it can be difficult to see cycles, seasonality or other interestng behaviors due to the scaling issues. For example, if there is a seasonal pattern, which we will discuss later, we cannot see them on the trending data. In such case, it is worth plotting the same data in first differences."
"The graph plots the alcohol sales in the United States. Because the data is trending, it can be difficult to see cycles, seasonality or other interesting behaviors due to the scaling issues. For example, if there is a seasonal pattern, which we will discuss later, we cannot see them on the trending data. In such case, it is worth plotting the same data in first differences."
]
},
{
@@ -335,8 +342,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3 Check if there is a clear autoregressive pattern\n",
"We need to determine if we should include lags of the target variable as features in order to improve forecast accuracy. To do this, we will examine the ACF and partial ACF (PACF) plots of the stationary series. In our case, it is a series in first diffrences.\n",
"# 3 Check if there is a clear auto-regressive pattern\n",
"We need to determine if we should include lags of the target variable as features in order to improve forecast accuracy. To do this, we will examine the ACF and partial ACF (PACF) plots of the stationary series. In our case, it is a series in first differences.\n",
"\n",
"<ul>\n",
" <li> Question: What is an Auto-regressive pattern? What are we looking for? </li>\n",
@@ -418,11 +425,11 @@
" </li>\n",
" where $\\sigma_{xzy}$ is the covariance between two random variables $X$ and $Z$; $\\sigma_x$ and $\\sigma_z$ is the variance for $X$ and $Z$, respectively. The correlation coefficient measures the strength of linear relationship between two random variables. This metric can take any value from -1 to 1. <li/>\n",
" <br/>\n",
" <li> The auto-correlation coefficient $\\rho_{Y_{t} Y_{t-k}}$ is the time series equivalent of the correlation coefficient, except instead of measuring linear association between two random variables $X$ and $Z$, it measures the strength of a linear relationship between a random variable $Y_t$ and its lag $Y_{t-k}$ for any positive interger value of $k$. </li> \n",
" <li> The auto-correlation coefficient $\\rho_{Y_{t} Y_{t-k}}$ is the time series equivalent of the correlation coefficient, except instead of measuring linear association between two random variables $X$ and $Z$, it measures the strength of a linear relationship between a random variable $Y_t$ and its lag $Y_{t-k}$ for any positive integer value of $k$. </li> \n",
" <br />\n",
" <li> To visualize the ACF for a particular lag, say lag 2, plot the second lag of a series $y_{t-2}$ on the x-axis, and plot the series itself $y_t$ on the y-axis. The autocorrelation coefficient is the slope of the best fitted regression line and can be interpreted as follows. A one unit increase in the lag of a variable one period ago leads to a $\\rho_{Y_{t} Y_{t-2}}$ units change in the variable in the current period. This interpreation can be applied to any lag. </li> \n",
" <li> To visualize the ACF for a particular lag, say lag 2, plot the second lag of a series $y_{t-2}$ on the x-axis, and plot the series itself $y_t$ on the y-axis. The autocorrelation coefficient is the slope of the best fitted regression line and can be interpreted as follows. A one unit increase in the lag of a variable one period ago leads to a $\\rho_{Y_{t} Y_{t-2}}$ units change in the variable in the current period. This interpretation can be applied to any lag. </li> \n",
" <br />\n",
" <li> In the interpretation posted above we need to be careful not to confuse the word \"leads\" with \"causes\" since these are not the same thing. We do not know the lagged value of the varaible causes it to change. Afterall, there are probably many other features that may explain the movement in $Y_t$. All we are trying to do in this section is to identify situations when the variable contains the strong auto-regressive components that needs to be included in the model to improve forecast accuracy. </li>\n",
" <li> In the interpretation posted above we need to be careful not to confuse the word \"leads\" with \"causes\" since these are not the same thing. We do not know the lagged value of the variable causes it to change. After all, there are probably many other features that may explain the movement in $Y_t$. All we are trying to do in this section is to identify situations when the variable contains the strong auto-regressive components that needs to be included in the model to improve forecast accuracy. </li>\n",
" </ul>\n",
"</ul>"
]
@@ -434,7 +441,7 @@
"<ul>\n",
" <li> Question: What is the PACF? </li>\n",
" <ul style=\"list-style-type:none;\">\n",
" <li> When describing the ACF we essentially running a regression between a partigular lag of a series, say, lag 4, and the series itself. What this implies is the regression coefficient for lag 4 captures the impact of everything that happens in lags 1, 2 and 3. In other words, if lag 1 is the most important lag and we exclude it from the regression, naturally, the regression model will assign the importance of the 1st lag to the 4th one. Partial auto-correlation function fixes this problem since it measures the contribution of each lag accounting for the information added by the intermediary lags. If we were to illustrate ACF and PACF for the fourth lag using the regression analogy, the difference is a follows: \n",
" <li> When describing the ACF we essentially running a regression between a particular lag of a series, say, lag 4, and the series itself. What this implies is the regression coefficient for lag 4 captures the impact of everything that happens in lags 1, 2 and 3. In other words, if lag 1 is the most important lag and we exclude it from the regression, naturally, the regression model will assign the importance of the 1st lag to the 4th one. Partial auto-correlation function fixes this problem since it measures the contribution of each lag accounting for the information added by the intermediary lags. If we were to illustrate ACF and PACF for the fourth lag using the regression analogy, the difference is a follows: \n",
" \\begin{align}\n",
" Y_{t} &= a_{0} + a_{4} Y_{t-4} + e_{t} \\\\\n",
" Y_{t} &= b_{0} + b_{1} Y_{t-1} + b_{2} Y_{t-2} + b_{3} Y_{t-3} + b_{4} Y_{t-4} + \\varepsilon_{t} \\\\\n",
@@ -442,7 +449,7 @@
" </li>\n",
" <br/>\n",
" <li>\n",
" Here, you can think of $a_4$ and $b_{4}$ as the auto- and partial auto-correlation coefficients for lag 4. Notice, in the second equation we explicitely accounting for the intermediate lags by adding them as regrerssors.\n",
" Here, you can think of $a_4$ and $b_{4}$ as the auto- and partial auto-correlation coefficients for lag 4. Notice, in the second equation we explicitly accounting for the intermediate lags by adding them as regressors.\n",
" </li>\n",
" </ul>\n",
"</ul>"
@@ -455,11 +462,11 @@
"<ul>\n",
" <li> Question: Auto-regressive pattern? What are we looking for? </li>\n",
" <ul style=\"list-style-type:none;\">\n",
" <li> We are looking for a classical profiles for an AR(p) process such as an exponential decay of an ACF and a the first $p$ significant lags of the PACF. Let's examine the ACF/PACF profiles of the same simulated AR(2) shown in Section 3, and check if the ACF/PACF explanation are refelcted in these plots. <li/>\n",
" <li> We are looking for a classical profiles for an AR(p) process such as an exponential decay of an ACF and a the first $p$ significant lags of the PACF. Let's examine the ACF/PACF profiles of the same simulated AR(2) shown in Section 3, and check if the ACF/PACF explanation are reflected in these plots. <li/>\n",
" <li><img src=\"figures/ACF_PACF_for_AR2.png\" class=\"img_class\">\n",
" <li> The autocorrelation coefficient for the 3rd lag is 0.6, which can be interpreted that a one unit increase in the value of the target varaible three periods ago leads to 0.6 units increase in the current period. However, the PACF plot shows that the partial autocorrealtion coefficient is zero (from a statistical point of view since it lies within the shaded region). This is happening because the 1st and 2nd lags are good predictors of the target variable. Ommiting these two lags from the regression results in the misleading conclusion that the third lag is a good prediciton. <li/>\n",
" <li> The autocorrelation coefficient for the 3rd lag is 0.6, which can be interpreted that a one unit increase in the value of the target variable three periods ago leads to 0.6 units increase in the current period. However, the PACF plot shows that the partial autocorrelation coefficient is zero (from a statistical point of view since it lies within the shaded region). This is happening because the 1st and 2nd lags are good predictors of the target variable. Omitting these two lags from the regression results in the misleading conclusion that the third lag is a good prediction. <li/>\n",
" <br/>\n",
" <li> This is why it is important to examine both the ACF and the PACF plots when tring to determine the auto regressive order for the variable in question. <li/>\n",
" <li> This is why it is important to examine both the ACF and the PACF plots when trying to determine the auto regressive order for the variable in question. <li/>\n",
" </ul>\n",
"</ul> "
]
@@ -471,6 +478,9 @@
"name": "vlbejan"
}
],
"kernel_info": {
"name": "python38-azureml"
},
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
@@ -486,7 +496,15 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.8.10"
},
"microsoft": {
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,

View File

@@ -16,6 +16,13 @@
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-recipes-univariate/2_run_experiment.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate)).</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -300,27 +307,14 @@
"df_train.to_csv(\"train.csv\", index=False)\n",
"df_test.to_csv(\"test.csv\", index=False)\n",
"\n",
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
"\n",
"datastore = ws.get_default_datastore()\n",
"datastore.upload_files(\n",
" files=[\"./train.csv\"],\n",
" target_path=\"uni-recipe-dataset/tabular/\",\n",
" overwrite=True,\n",
" show_progress=True,\n",
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
" df_train, target=(datastore, \"dataset/\"), name=\"train\"\n",
")\n",
"datastore.upload_files(\n",
" files=[\"./test.csv\"],\n",
" target_path=\"uni-recipe-dataset/tabular/\",\n",
" overwrite=True,\n",
" show_progress=True,\n",
")\n",
"\n",
"from azureml.core import Dataset\n",
"\n",
"train_dataset = Dataset.Tabular.from_delimited_files(\n",
" path=[(datastore, \"uni-recipe-dataset/tabular/train.csv\")]\n",
")\n",
"test_dataset = Dataset.Tabular.from_delimited_files(\n",
" path=[(datastore, \"uni-recipe-dataset/tabular/test.csv\")]\n",
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
" df_test, target=(datastore, \"dataset/\"), name=\"test\"\n",
")\n",
"\n",
"# print the first 5 rows of the Dataset\n",
@@ -571,6 +565,9 @@
"name": "vlbejan"
}
],
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
@@ -586,7 +583,15 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.10"
},
"microsoft": {
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"nteract": {
"version": "nteract-front-end@1.0.0"
},
"vscode": {
"interpreter": {

View File

@@ -7,7 +7,7 @@ compute instance.
import argparse
from azureml.core import Dataset, Run
from azureml.automl.core.shared.constants import TimeSeriesInternal
from sklearn.externals import joblib
import joblib
parser = argparse.ArgumentParser()
parser.add_argument(

View File

@@ -1,5 +1,27 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"hideCode": false,
"hidePrompt": false
},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {
"hideCode": false,
"hidePrompt": false
},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -495,6 +517,30 @@
"#### Create conda configuration for model explanations experiment from automl_run object"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from azureml.core import Environment\n",
"\n",
"\n",
"def get_environment_safe(parent_run):\n",
" \"\"\"Get the environment from parent run\"\"\"\n",
" try:\n",
" return parent_run.get_environment()\n",
" except BaseException:\n",
" run_details = parent_run.get_details()\n",
" run_def = run_details.get(\"runDefinition\")\n",
" env = run_def.get(\"environment\")\n",
" if env is None:\n",
" raise\n",
" json.dump(env, open(\"azureml_environment.json\", \"w\"))\n",
" return Environment.load_from_directory(\".\")"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -502,8 +548,6 @@
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
@@ -513,7 +557,7 @@
"conda_run_config.environment.docker.enabled = True\n",
"\n",
"# specify CondaDependencies obj\n",
"conda_run_config.environment = automl_run.get_environment()"
"conda_run_config.environment = get_environment_safe(automl_run)"
]
},
{
@@ -686,7 +730,7 @@
" description=\"Get local explanations for Machine test data\",\n",
")\n",
"\n",
"myenv = automl_run.get_environment()\n",
"myenv = get_environment_safe(automl_run)\n",
"inference_config = InferenceConfig(entry_script=\"score_explain.py\", environment=myenv)\n",
"\n",
"# Use configs and models generated above\n",
@@ -909,7 +953,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
"version": "3.8.7"
},
"tags": [
"featurization",

View File

@@ -91,7 +91,7 @@
"metadata": {},
"outputs": [],
"source": [
"import joblib\n",
"import dill\n",
"\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.linear_model import Ridge\n",
@@ -101,7 +101,7 @@
"\n",
"model = Ridge().fit(dataset_x, dataset_y)\n",
"\n",
"joblib.dump(model, 'sklearn_regression_model.pkl')"
"dill.dump(model, open('sklearn_regression_model.pkl', 'wb'))"
]
},
{
@@ -285,6 +285,7 @@
" 'azureml-defaults',\n",
" 'inference-schema[numpy-support]',\n",
" 'joblib',\n",
" 'dill==0.3.6',\n",
" 'numpy==1.23',\n",
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
"])"
@@ -486,6 +487,7 @@
" 'azureml-defaults',\n",
" 'inference-schema[numpy-support]',\n",
" 'joblib',\n",
" 'dill==0.3.6',\n",
" 'numpy==1.23',\n",
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
"])\n",

View File

@@ -1,373 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploy models to Azure Kubernetes Service (AKS) using controlled roll out\n",
"This notebook will show you how to deploy mulitple AKS webservices with the same scoring endpoint and how to roll out your models in a controlled manner by configuring % of scoring traffic going to each webservice. If you are using a Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) to install the Azure Machine Learning Python SDK and create an Azure ML Workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check for latest version\n",
"import azureml.core\n",
"print(azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize workspace\n",
"Create a [Workspace](https://docs.microsoft.com/python/api/azureml-core/azureml.core.workspace%28class%29?view=azure-ml-py) object from your persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register the model\n",
"Register a file or folder as a model by calling [Model.register()](https://docs.microsoft.com/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py#register-workspace--model-path--model-name--tags-none--properties-none--description-none--datasets-none--model-framework-none--model-framework-version-none--child-paths-none-).\n",
"In addition to the content of the model file itself, your registered model will also store model metadata -- model description, tags, and framework information -- that will be useful when managing and deploying models in your workspace. Using tags, for instance, you can categorize your models and apply filters when listing models in your workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Model\n",
"\n",
"model = Model.register(workspace=ws,\n",
" model_name='sklearn_regression_model.pkl', # Name of the registered model in your workspace.\n",
" model_path='./sklearn_regression_model.pkl', # Local file to upload and register as a model.\n",
" model_framework=Model.Framework.SCIKITLEARN, # Framework used to create the model.\n",
" model_framework_version='0.19.1', # Version of scikit-learn used to create the model.\n",
" description='Ridge regression model to predict diabetes progression.',\n",
" tags={'area': 'diabetes', 'type': 'regression'})\n",
"\n",
"print('Name:', model.name)\n",
"print('Version:', model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register an environment (for all models)\n",
"\n",
"If you control over how your model is run, or if it has special runtime requirements, you can specify your own environment and scoring method.\n",
"\n",
"Specify the model's runtime environment by creating an [Environment](https://docs.microsoft.com/python/api/azureml-core/azureml.core.environment%28class%29?view=azure-ml-py) object and providing the [CondaDependencies](https://docs.microsoft.com/python/api/azureml-core/azureml.core.conda_dependencies.condadependencies?view=azure-ml-py) needed by your model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"environment=Environment('my-sklearn-environment')\n",
"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
" 'pip==20.2.4'],\n",
" pip_packages=[\n",
" 'azureml-defaults',\n",
" 'inference-schema[numpy-support]',\n",
" 'numpy==1.23',\n",
" 'scikit-learn==0.22.1',\n",
" 'scipy'\n",
"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When using a custom environment, you must also provide Python code for initializing and running your model. An example script is included with this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('score.py') as f:\n",
" print(f.read())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create the InferenceConfig\n",
"Create the inference configuration to reference your environment and entry script during deployment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.model import InferenceConfig\n",
"\n",
"inference_config = InferenceConfig(entry_script='score.py', \n",
" source_directory='.',\n",
" environment=environment)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Provision the AKS Cluster\n",
"If you already have an AKS cluster attached to this workspace, skip the step below and provide the name of the cluster.\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AksCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"# Use the default configuration (can also provide parameters to customize)\n",
"prov_config = AksCompute.provisioning_configuration()\n",
"\n",
"aks_name = 'my-aks' \n",
"# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace = ws, \n",
" name = aks_name, \n",
" provisioning_configuration = prov_config) \n",
"aks_target.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an Endpoint and add a version (AKS service)\n",
"This creates a new endpoint and adds a version behind it. By default the first version added is the default version. You can specify the traffic percentile a version takes behind an endpoint. \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# deploying the model and create a new endpoint\n",
"from azureml.core.webservice import AksEndpoint\n",
"# from azureml.core.compute import ComputeTarget\n",
"\n",
"#select a created compute\n",
"compute = ComputeTarget(ws, 'my-aks')\n",
"namespace_name=\"endpointnamespace\"\n",
"# define the endpoint name\n",
"endpoint_name = \"myendpoint1\"\n",
"# define the service name\n",
"version_name= \"versiona\"\n",
"\n",
"endpoint_deployment_config = AksEndpoint.deploy_configuration(tags = {'modelVersion':'firstversion', 'department':'finance'}, \n",
" description = \"my first version\", namespace = namespace_name, \n",
" version_name = version_name, traffic_percentile = 40)\n",
"\n",
"endpoint = Model.deploy(ws, endpoint_name, [model], inference_config, endpoint_deployment_config, compute)\n",
"endpoint.wait_for_deployment(True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"endpoint.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add another version of the service to an existing endpoint\n",
"This adds another version behind an existing endpoint. You can specify the traffic percentile the new version takes. If no traffic_percentile is specified then it defaults to 0. All the unspecified traffic percentile (in this example 50) across all versions goes to default version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Adding a new version to an existing Endpoint.\n",
"version_name_add=\"versionb\" \n",
"\n",
"endpoint.create_version(version_name = version_name_add, inference_config=inference_config, models=[model], tags = {'modelVersion':'secondversion', 'department':'finance'}, \n",
" description = \"my second version\", traffic_percentile = 10)\n",
"endpoint.wait_for_deployment(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Update an existing version in an endpoint\n",
"There are two types of versions: control and treatment. An endpoint contains one or more treatment versions but only one control version. This categorization helps compare the different versions against the defined control version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"endpoint.update_version(version_name=endpoint.versions[version_name_add].name, description=\"my second version update\", traffic_percentile=40, is_default=True, is_control_version_type=True)\n",
"endpoint.wait_for_deployment(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test the web service using run method\n",
"Test the web sevice by passing in data. Run() method retrieves API keys behind the scenes to make sure that call is authenticated."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Scoring on endpoint\n",
"import json\n",
"test_sample = json.dumps({'data': [\n",
" [1,2,3,4,5,6,7,8,9,10], \n",
" [10,9,8,7,6,5,4,3,2,1]\n",
"]})\n",
"\n",
"test_sample_encoded = bytes(test_sample, encoding='utf8')\n",
"prediction = endpoint.run(input_data=test_sample_encoded)\n",
"print(prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Delete Resources"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# deleting a version in an endpoint\n",
"endpoint.delete_version(version_name=version_name)\n",
"endpoint.wait_for_deployment(True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# deleting an endpoint, this will delete all versions in the endpoint and the endpoint itself\n",
"endpoint.delete()"
]
}
],
"metadata": {
"authors": [
{
"name": "shipatel"
}
],
"category": "deployment",
"compute": [
"None"
],
"datasets": [
"Diabetes"
],
"deployment": [
"Azure Kubernetes Service"
],
"exclude_from_index": false,
"framework": [
"Scikit-learn"
],
"friendly_name": "Deploy models to AKS using controlled roll out",
"index_order": 3,
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
},
"star_tag": [
"featured"
],
"tags": [
"None"
],
"task": "Deploy a model with Azure Machine Learning"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,4 +0,0 @@
name: deploy-aks-with-controlled-rollout
dependencies:
- pip:
- azureml-sdk

View File

@@ -1,28 +0,0 @@
import pickle
import json
import numpy
from sklearn.externals import joblib
from sklearn.linear_model import Ridge
from azureml.core.model import Model
def init():
global model
# note here "sklearn_regression_model.pkl" is the name of the model registered under
# this is a different behavior than before when the code is run locally, even though the code is the same.
model_path = Model.get_model_path('sklearn_regression_model.pkl')
# deserialize the model file back into a sklearn model
model = joblib.load(model_path)
# note you can pass in multiple rows for scoring
def run(raw_data):
try:
data = json.loads(raw_data)['data']
data = numpy.array(data)
result = model.predict(data)
# you can return any data type as long as it is JSON-serializable
return result.tolist()
except Exception as e:
error = str(e)
return error

View File

@@ -123,7 +123,7 @@
"import pickle\n",
"import json\n",
"import numpy\n",
"from sklearn.externals import joblib\n",
"import joblib\n",
"from sklearn.linear_model import Ridge\n",
"import time\n",
"\n",

View File

@@ -105,7 +105,7 @@
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
" max_nodes=6)\n",
"\n",
" # create the cluster\n",
@@ -620,7 +620,7 @@
},
"manual": null
},
"vm_size": "STANDARD_NC6"
"vm_size": "Standard_NC6s_v3"
},
"error": "",
"layout": "IPY_MODEL_c899ddfc2b134ca9b89a4f278ac7c997",

View File

@@ -136,6 +136,9 @@
"# Choose a name for your GPU cluster\n",
"gpu_cluster_name = \"aks-gpu-cluster\"\n",
"\n",
"# Choose a location for your GPU cluster\n",
"gpu_cluster_location = \"eastus\"\n",
"\n",
"# Verify that cluster does not exist already\n",
"try:\n",
" gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
@@ -146,7 +149,8 @@
" # Specify the configuration for the new cluster\n",
" compute_config = AksCompute.provisioning_configuration(cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST,\n",
" agent_count=1,\n",
" vm_size=\"Standard_NV6\")\n",
" vm_size=\"Standard_NC6s_v3\",\n",
" location=gpu_cluster_location)\n",
" # Create the cluster with the specified name and configuration\n",
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
"\n",
@@ -170,7 +174,7 @@
"outputs": [],
"source": [
"%%writefile score.py\n",
"import tensorflow as tf\n",
"import tensorflow.compat.v1 as tf\n",
"import numpy as np\n",
"import json\n",
"import os\n",
@@ -240,9 +244,9 @@
"# Please see [Azure ML Containers repository](https://github.com/Azure/AzureML-Containers#featured-tags)\n",
"# for open-sourced GPU base images.\n",
"env.docker.base_image = DEFAULT_GPU_IMAGE\n",
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.6.2\", pin_sdk_version=False,\n",
" conda_packages=['tensorflow-gpu==1.12.0','numpy'],\n",
" pip_packages=['azureml-contrib-services==1.47.0', 'azureml-defaults==1.47.0'])\n",
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.8\", pin_sdk_version=False,\n",
" conda_packages=['tensorflow-gpu','numpy'],\n",
" pip_packages=['azureml-contrib-services', 'azureml-defaults'])\n",
"\n",
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=env)\n",
"aks_config = AksWebservice.deploy_configuration()\n",

View File

@@ -2,4 +2,3 @@ name: production-deploy-to-aks-gpu
dependencies:
- pip:
- azureml-sdk
- tensorflow

View File

@@ -154,7 +154,7 @@
"import pickle\n",
"import json\n",
"import numpy\n",
"from sklearn.externals import joblib\n",
"import joblib\n",
"from sklearn.linear_model import Ridge\n",
"from inference_schema.schema_decorators import input_schema, output_schema\n",
"from inference_schema.parameter_types.standard_py_parameter_type import StandardPythonParameterType\n",

View File

@@ -154,7 +154,7 @@
"import pickle\n",
"import json\n",
"import numpy\n",
"from sklearn.externals import joblib\n",
"import joblib\n",
"from sklearn.linear_model import Ridge\n",
"\n",
"def init():\n",

View File

@@ -1 +0,0 @@
{"class":"org.apache.spark.ml.classification.LogisticRegressionModel","timestamp":1570147252329,"sparkVersion":"2.4.0","uid":"LogisticRegression_5df3978caaf3","paramMap":{"regParam":0.01},"defaultParamMap":{"aggregationDepth":2,"threshold":0.5,"rawPredictionCol":"rawPrediction","featuresCol":"features","labelCol":"label","predictionCol":"prediction","family":"auto","regParam":0.0,"tol":1.0E-6,"probabilityCol":"probability","standardization":true,"elasticNetParam":0.0,"maxIter":100,"fitIntercept":true}}

View File

@@ -1,349 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Register Spark Model and deploy as Webservice\n",
"\n",
"This example shows how to deploy a Webservice in step-by-step fashion:\n",
"\n",
" 1. Register Spark Model\n",
" 2. Deploy Spark Model as Webservice"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the [configuration](../../../configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create workspace"
]
},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can add tags and descriptions to your Models. Note you need to have a `iris.model` file in the current directory. This model file is generated using [train in spark](../training/train-in-spark/train-in-spark.ipynb) notebook. The below call registers that file as a Model with the same name `iris.model` in the workspace.\n",
"\n",
"Using tags, you can track useful information such as the name and version of the machine learning library used to train the model. Note that tags must be alphanumeric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"from azureml.core.model import Model\n",
"\n",
"model = Model.register(model_path=\"iris.model\",\n",
" model_name=\"iris.model\",\n",
" tags={'type': \"regression\"},\n",
" description=\"Logistic regression model to predict iris species\",\n",
" workspace=ws)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Fetch Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can now create and/or use an Environment object when deploying a Webservice. The Environment can have been previously registered with your Workspace, or it will be registered with it as a part of the Webservice deployment.\n",
"\n",
"More information can be found in our [using environments notebook](../training/using-environments/using-environments.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\r\n",
"from azureml.core.environment import SparkPackage\r\n",
"from azureml.core.conda_dependencies import CondaDependencies\r\n",
"\r\n",
"myenv = Environment('my-pyspark-environment')\r\n",
"myenv.docker.base_image = \"mcr.microsoft.com/mmlspark/release:0.15\"\r\n",
"myenv.inferencing_stack_version = \"latest\"\r\n",
"myenv.python.conda_dependencies = CondaDependencies.create(pip_packages=[\"azureml-core\",\"azureml-defaults\",\"azureml-telemetry\",\"azureml-train-restclients-hyperdrive\",\"azureml-train-core\"], python_version=\"3.7.0\")\r\n",
"myenv.python.conda_dependencies.add_channel(\"conda-forge\")\r\n",
"myenv.spark.packages = [SparkPackage(\"com.microsoft.ml.spark\", \"mmlspark_2.11\", \"0.15\"), SparkPackage(\"com.microsoft.azure\", \"azure-storage\", \"2.0.0\"), SparkPackage(\"org.apache.hadoop\", \"hadoop-azure\", \"2.7.0\")]\r\n",
"myenv.spark.repositories = [\"https://mmlspark.azureedge.net/maven\"]\r\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Inference Configuration\n",
"\n",
"There is now support for a source directory, you can upload an entire folder from your local machine as dependencies for the Webservice.\n",
"Note: in that case, your entry_script is relative path to the source_directory path.\n",
"\n",
"Sample code for using a source directory:\n",
"\n",
"```python\n",
"inference_config = InferenceConfig(source_directory=\"C:/abc\",\n",
" entry_script=\"x/y/score.py\",\n",
" environment=environment)\n",
"```\n",
"\n",
" - source_directory = holds source path as string, this entire folder gets added in image so its really easy to access any files within this folder or subfolder\n",
" - entry_script = contains logic specific to initializing your model and running predictions\n",
" - environment = An environment object to use for the deployment. Doesn't have to be registered"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create image"
]
},
"outputs": [],
"source": [
"from azureml.core.model import InferenceConfig\n",
"\n",
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=myenv)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy Model as Webservice on Azure Container Instance\n",
"\n",
"Note that the service creation can take few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"azuremlexception-remarks-sample"
]
},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice, Webservice\n",
"from azureml.exceptions import WebserviceException\n",
"\n",
"deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)\n",
"aci_service_name = 'aciservice1'\n",
"\n",
"try:\n",
" # if you want to get existing service below is the command\n",
" # since aci name needs to be unique in subscription deleting existing aci if any\n",
" # we use aci_service_name to create azure aci\n",
" service = Webservice(ws, name=aci_service_name)\n",
" if service:\n",
" service.delete()\n",
"except WebserviceException as e:\n",
" print()\n",
"\n",
"service = Model.deploy(ws, aci_service_name, [model], inference_config, deployment_config)\n",
"\n",
"service.wait_for_deployment(True)\n",
"print(service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Test web service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"test_sample = json.dumps({'features':{'type':1,'values':[4.3,3.0,1.1,0.1]},'label':2.0})\n",
"\n",
"test_sample_encoded = bytes(test_sample, encoding='utf8')\n",
"prediction = service.run(input_data=test_sample_encoded)\n",
"print(prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Delete ACI to clean up"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model Profiling\n",
"\n",
"You can also take advantage of the profiling feature to estimate CPU and memory requirements for models.\n",
"\n",
"```python\n",
"profile = Model.profile(ws, \"profilename\", [model], inference_config, test_sample)\n",
"profile.wait_for_profiling(True)\n",
"profiling_results = profile.get_results()\n",
"print(profiling_results)\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model Packaging\n",
"\n",
"If you want to build a Docker image that encapsulates your model and its dependencies, you can use the model packaging option. The output image will be pushed to your workspace's ACR.\n",
"\n",
"You must include an Environment object in your inference configuration to use `Model.package()`.\n",
"\n",
"```python\n",
"package = Model.package(ws, [model], inference_config)\n",
"package.wait_for_creation(show_output=True) # Or show_output=False to hide the Docker build logs.\n",
"package.pull()\n",
"```\n",
"\n",
"Instead of a fully-built image, you can also generate a Dockerfile and download all the assets needed to build an image on top of your Environment.\n",
"\n",
"```python\n",
"package = Model.package(ws, [model], inference_config, generate_dockerfile=True)\n",
"package.wait_for_creation(show_output=True)\n",
"package.save(\"./local_context_dir\")\n",
"```"
]
}
],
"metadata": {
"authors": [
{
"name": "vaidyas"
}
],
"category": "deployment",
"compute": [
"None"
],
"datasets": [
"Iris"
],
"deployment": [
"Azure Container Instance"
],
"exclude_from_index": false,
"framework": [
"PySpark"
],
"friendly_name": "Register Spark model and deploy as webservice",
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,4 +0,0 @@
name: model-register-and-deploy-spark
dependencies:
- pip:
- azureml-sdk

View File

@@ -1,37 +0,0 @@
import traceback
from pyspark.ml.linalg import VectorUDT
from azureml.core.model import Model
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession
input_schema = StructType([StructField("features", VectorUDT()), StructField("label", DoubleType())])
reader = spark.read
reader.schema(input_schema)
def init():
global model
# note here "iris.model" is the name of the model registered under the workspace
# this call should return the path to the model.pkl file on the local disk.
model_path = Model.get_model_path('iris.model')
# Load the model file back into a LogisticRegression model
model = LogisticRegressionModel.load(model_path)
def run(data):
try:
input_df = reader.json(sc.parallelize([data]))
result = model.transform(input_df)
# you can return any datatype as long as it is JSON-serializable
return result.collect()[0]['prediction']
except Exception as e:
traceback.print_exc()
error = str(e)
return error

View File

@@ -1,59 +0,0 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license.
from azureml.core.run import Run
from azureml.interpret import ExplanationClient
from interpret_community.adapter import ExplanationAdapter
import joblib
import os
import shap
import xgboost
OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
run = Run.get_context()
client = ExplanationClient.from_run(run)
# get a dataset on income prediction
X, y = shap.datasets.adult()
features = X.columns.values
# train an XGBoost model (but any other tree model type should work)
model = xgboost.XGBClassifier()
model.fit(X, y)
explainer = shap.explainers.GPUTree(model, X)
X_shap = X[:100]
shap_values = explainer(X_shap)
print("computed shap values:")
print(shap_values)
# Use the explanation adapter to convert the importances into an interpret-community
# style explanation which can be uploaded to AzureML or visualized in the
# ExplanationDashboard widget
adapter = ExplanationAdapter(features, classification=True)
global_explanation = adapter.create_global(shap_values.values, X_shap, expected_values=shap_values.base_values)
# write X_shap out as a pickle file for later visualization
x_shap_pkl = 'x_shap.pkl'
with open(x_shap_pkl, 'wb') as file:
joblib.dump(value=X_shap, filename=os.path.join(OUTPUT_DIR, x_shap_pkl))
run.upload_file('x_shap_adult_census.pkl', os.path.join(OUTPUT_DIR, x_shap_pkl))
model_file_name = 'xgboost_.pkl'
# save model in the outputs folder so it automatically gets uploaded
with open(model_file_name, 'wb') as file:
joblib.dump(value=model, filename=os.path.join(OUTPUT_DIR,
model_file_name))
# register the model
run.upload_file('xgboost_model.pkl', os.path.join('./outputs/', model_file_name))
original_model = run.register_model(model_name='xgboost_with_gpu_tree_explainer',
model_path='xgboost_model.pkl')
# Uploading model explanation data for storage or visualization in webUX
# The explanation can then be downloaded on any compute
comment = 'Global explanation on classification model trained on adult census income dataset'
client.upload_model_explanation(global_explanation, comment=comment, model_id=original_model.id)

View File

@@ -1,517 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Explain tree-based models on GPU using GPUTreeExplainer\n",
"\n",
"\n",
"_**This notebook illustrates how to use shap's GPUTreeExplainer on an Azure GPU machine.**_\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"Problem: Train a tree-based model and explain the model on an Azure GPU machine using the GPUTreeExplainer.\n",
"\n",
"---\n",
"\n",
"## Table of Contents\n",
"\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Run model explainer locally at training time](#Explain)\n",
" 1. Apply feature transformations\n",
" 1. Train a binary classification model\n",
" 1. Explain the model on raw features\n",
" 1. Generate global explanations\n",
" 1. Generate local explanations\n",
"1. [Visualize explanations](#Visualize)\n",
"1. [Deploy model and scoring explainer](#Deploy)\n",
"1. [Next steps](#Next)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"This notebook demonstrates how to use the GPUTreeExplainer on some simple datasets. Like the TreeExplainer, the GPUTreeExplainer is specifically designed for tree-based machine learning models, but it is designed to accelerate the computations using NVIDIA GPUs.\n",
"\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"Notebook synopsis:\n",
"\n",
"1. Creating an Experiment in an existing Workspace\n",
"2. Configuration and remote run with a GPU machine"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import shutil\n",
"\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"from azureml.core.run import Run\n",
"from azureml.core.model import Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As part of the setup you have already created a <b>Workspace</b>. To run the script, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose an experiment name.\n",
"experiment_name = 'gpu-tree-explainer'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create project directory\n",
"\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"\n",
"project_folder = './azureml-shap-gpu-tree-explainer'\n",
"os.makedirs(project_folder, exist_ok=True)\n",
"shutil.copy('gpu_tree_explainer.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up a compute cluster\n",
"This section uses a user-provided compute cluster (named \"gpu-shap-cluster\" in this example). If a cluster with this name does not exist in the user's workspace, the below code will create a new cluster. You can choose the parameters of the cluster as mentioned in the comments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"num_nodes = 1\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"gpu-shap-cluster\"\n",
"\n",
"# Verify that cluster does not exist already\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n",
" print('Found existing cluster, use it.')\n",
"except ComputeTargetException:\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\",\n",
" # To use GPUTreeExplainer, select a GPU such as \"STANDARD_NC6\" \n",
" # or similar GPU option\n",
" # available in your workspace\n",
" max_nodes = num_nodes)\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",
"\n",
"compute_target.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure & Run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# Create a new RunConfig object\n",
"run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute target created in previous step\n",
"run_config.target = amlcompute_cluster_name\n",
"\n",
"from azureml.core import Environment\n",
"\n",
"environment_name = \"shapgpu\"\n",
"env = Environment(environment_name)\n",
"\n",
"env.docker.enabled = True\n",
"env.docker.base_image = None\n",
"\n",
"\n",
"# Note: this is to pin the pandas and xgboost versions to be same as notebook.\n",
"# In production scenario user would choose their dependencies\n",
"import pkg_resources\n",
"from distutils.version import LooseVersion\n",
"available_packages = pkg_resources.working_set\n",
"pandas_ver = None\n",
"numpy_ver = None\n",
"sklearn_ver = None\n",
"for dist in list(available_packages):\n",
" if dist.key == 'pandas':\n",
" pandas_ver = dist.version\n",
" if dist.key == 'numpy':\n",
" if LooseVersion(dist.version) >= LooseVersion('1.20.0'):\n",
" numpy_ver = dist.version\n",
" else:\n",
" numpy_ver = '1.21.6'\n",
" if dist.key == 'scikit-learn':\n",
" sklearn_ver = dist.version\n",
"pandas_dep = 'pandas'\n",
"numpy_dep = 'numpy'\n",
"sklearn_dep = 'scikit-learn'\n",
"if pandas_ver:\n",
" pandas_dep = 'pandas=={}'.format(pandas_ver)\n",
"if numpy_ver:\n",
" numpy_dep = 'numpy=={}'.format(numpy_ver)\n",
"if sklearn_ver:\n",
" sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n",
"\n",
"# Note: we build shap at commit 690245 for Tesla K80 GPUs\n",
"env.docker.base_dockerfile = f\"\"\"\n",
"FROM nvidia/cuda:10.2-devel-ubuntu18.04\n",
"ENV PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
"ARG PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
"RUN apt-get update && \\\n",
"apt-get install -y fuse && \\\n",
"apt-get install -y build-essential && \\\n",
"apt-get install -y python3-dev && \\\n",
"apt-get install -y wget && \\\n",
"apt-get install -y git && \\\n",
"rm -rf /var/lib/apt/lists/* && \\\n",
"wget \\\n",
"https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \\\n",
"mkdir /root/.conda && \\\n",
"bash Miniconda3-latest-Linux-x86_64.sh -b && \\\n",
"rm -f Miniconda3-latest-Linux-x86_64.sh && \\\n",
"conda init bash && \\\n",
". ~/.bashrc && \\\n",
"conda create -n shapgpu python=3.8 && \\\n",
"conda activate shapgpu && \\\n",
"apt-get install -y g++ && \\\n",
"printenv && \\\n",
"echo \"which nvcc: \" && \\\n",
"which nvcc && \\\n",
"pip install azureml-defaults && \\\n",
"pip install azureml-telemetry && \\\n",
"pip install azureml-interpret && \\\n",
"pip install {pandas_dep} && \\\n",
"cd /usr/local/src && \\\n",
"git clone https://github.com/slundberg/shap.git --single-branch && \\\n",
"cd shap && \\\n",
"git reset --hard 690245c6ab043edf40cfce3d8438a62e29ab599f && \\\n",
"mkdir build && \\\n",
"python setup.py install --user && \\\n",
"pip uninstall -y xgboost && \\\n",
"conda install py-xgboost==1.3.3 && \\\n",
"pip uninstall -y numpy && \\\n",
"pip install {numpy_dep} && \\\n",
"pip install {sklearn_dep} && \\\n",
"pip install chardet \\\n",
"\"\"\"\n",
"\n",
"env.python.user_managed_dependencies = True\n",
"env.python.interpreter_path = '/root/miniconda3/envs/shapgpu/bin/python'\n",
"\n",
"from azureml.core import Run\n",
"from azureml.core import ScriptRunConfig\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder, \n",
" script='gpu_tree_explainer.py', \n",
" compute_target=amlcompute_cluster_name,\n",
" environment=env) \n",
"run = experiment.submit(config=src)\n",
"run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: if you need to cancel a run, you can follow [these instructions](https://aka.ms/aml-docs-cancel-run)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# Shows output of the run on stdout.\n",
"run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download \n",
"1. Download model explanation data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.interpret import ExplanationClient\n",
"\n",
"# Get model explanation data\n",
"client = ExplanationClient.from_run(run)\n",
"global_explanation = client.download_model_explanation()\n",
"local_importance_values = global_explanation.local_importance_values\n",
"expected_values = global_explanation.expected_values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the top k (e.g., 4) most important features with their importance values\n",
"global_explanation_topk = client.download_model_explanation(top_k=4)\n",
"global_importance_values = global_explanation_topk.get_ranked_global_values()\n",
"global_importance_names = global_explanation_topk.get_ranked_global_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('global importance values: {}'.format(global_importance_values))\n",
"print('global importance names: {}'.format(global_importance_names))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Download model file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Retrieve model for visualization and deployment\n",
"from azureml.core.model import Model\n",
"import joblib\n",
"original_model = Model(ws, 'xgboost_with_gpu_tree_explainer')\n",
"model_path = original_model.download(exist_ok=True)\n",
"original_model = joblib.load(model_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"3. Download test dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Retrieve x_test for visualization\n",
"x_test_path = './x_shap_adult_census.pkl'\n",
"run.download_file('x_shap_adult_census.pkl', output_file_path=x_test_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x_test = joblib.load('x_shap_adult_census.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualize\n",
"Load the visualization dashboard"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from raiwidgets import ExplanationDashboard"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from interpret_community.common.model_wrapper import wrap_model\n",
"from interpret_community.dataset.dataset_wrapper import DatasetWrapper\n",
"# note we need to wrap the XGBoost model to output predictions and probabilities in the scikit-learn format\n",
"class WrappedXGBoostModel(object):\n",
" \"\"\"A class for wrapping an XGBoost model to output integer predicted classes.\"\"\"\n",
"\n",
" def __init__(self, model):\n",
" self.model = model\n",
"\n",
" def predict(self, dataset):\n",
" return self.model.predict(dataset).astype(int)\n",
"\n",
" def predict_proba(self, dataset):\n",
" return self.model.predict_proba(dataset)\n",
"\n",
"wrapped_model = WrappedXGBoostModel(wrap_model(original_model, DatasetWrapper(x_test), model_task='classification'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ExplanationDashboard(global_explanation, wrapped_model, dataset=x_test)"
]
}
],
"metadata": {
"authors": [
{
"name": "ilmat"
}
],
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,18 +0,0 @@
name: train-explain-model-gpu-tree-explainer
dependencies:
- py-xgboost==1.3.3
- pip:
- azureml-sdk
- azureml-interpret
- flask
- flask-cors
- gevent>=1.3.6
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.24.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3
- protobuf==3.20.0
- jinja2==3.0.3

View File

@@ -8,9 +8,8 @@ dependencies:
- gevent>=1.3.6
- ipython
- matplotlib
- azureml-dataset-runtime
- ipywidgets
- raiwidgets~=0.24.0
- raiwidgets~=0.28.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -9,7 +9,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.24.0
- raiwidgets~=0.28.0
- packaging>=20.9
- itsdangerous==2.0.1
- markupsafe<2.1.0

View File

@@ -9,7 +9,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.24.0
- raiwidgets~=0.28.0
- packaging>=20.9
- itsdangerous==2.0.1
- markupsafe<2.1.0

View File

@@ -8,10 +8,9 @@ dependencies:
- gevent>=1.3.6
- ipython
- matplotlib
- azureml-dataset-runtime
- azureml-core
- ipywidgets
- raiwidgets~=0.24.0
- raiwidgets~=0.28.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -175,7 +175,7 @@
"store_name=os.getenv(\"ADL_STORENAME_62\", \"<my-datastore-name>\") # ADLS account name\n",
"tenant_id=os.getenv(\"ADL_TENANT_62\", \"<my-tenant-id>\") # tenant id of service principal\n",
"client_id=os.getenv(\"ADL_CLIENTID_62\", \"<my-client-id>\") # client id of service principal\n",
"client_secret=os.getenv(\"ADL_CLIENT_SECRET_62\", \"<my-client-secret>\") # the secret of service principal\n",
"client_st=os.getenv(\"ADL_CLIENT_SECRET_62\", \"<my-client-secret>\") # the secret of service principal\n",
"\n",
"try:\n",
" adls_datastore = Datastore.get(ws, datastore_name)\n",
@@ -189,7 +189,7 @@
" store_name=store_name, # ADLS account name\n",
" tenant_id=tenant_id, # tenant id of service principal\n",
" client_id=client_id, # client id of service principal\n",
" client_secret=client_secret) # the secret of service principal\n",
" client_secret=client_st) # the secret of service principal\n",
" print(\"Registered datastore with name: %s\" % datastore_name)\n",
"\n",
"adls_data_ref = DataReference(\n",

View File

@@ -233,7 +233,7 @@
" print('Found existing compute target {}.'.format(cluster_name))\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"Standard_NC6s_v3\",\n",
" max_nodes=4)\n",
"\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",

View File

@@ -133,7 +133,7 @@
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"Standard_NC6s_v3\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 4)\n",
"\n",

View File

@@ -136,7 +136,7 @@
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"Standard_NC6s_v3\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 4)\n",
"\n",

View File

@@ -147,7 +147,7 @@
"store_name = os.getenv(\"ADL_STORENAME_62\", \"<my-datastore-name>\") # ADLS account name\n",
"tenant_id = os.getenv(\"ADL_TENANT_62\", \"<my-tenant-id>\") # tenant id of service principal\n",
"client_id = os.getenv(\"ADL_CLIENTID_62\", \"<my-client-id>\") # client id of service principal\n",
"client_secret = os.getenv(\"ADL_CLIENT_62_SECRET\", \"<my-client-secret>\") # the secret of service principal\n",
"client_st = os.getenv(\"ADL_CLIENT_62_SECRET\", \"<my-client-secret>\") # the secret of service principal\n",
"\n",
"try:\n",
" adls_datastore = Datastore.get(ws, datastore_name)\n",
@@ -161,7 +161,7 @@
" store_name=store_name, # ADLS account name\n",
" tenant_id=tenant_id, # tenant id of service principal\n",
" client_id=client_id, # client id of service principal\n",
" client_secret=client_secret) # the secret of service principal\n",
" client_secret=client_st) # the secret of service principal\n",
" print(\"registered datastore with name: %s\" % datastore_name)"
]
},

View File

@@ -148,7 +148,7 @@
" compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n",
" print('Found existing cluster, use it.')\n",
"except ComputeTargetException:\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',# for GPU, use \"STANDARD_NC6\"\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',# for GPU, use \"Standard_NC6s_v3\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes=4)\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",

View File

@@ -86,7 +86,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `STANDARD_NC6` GPU VMs. This process is broken down into 3 steps:\n",
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `Standard_NC6s_v3` GPU VMs. This process is broken down into 3 steps:\n",
"1. create the configuration (this step is local and only takes a second)\n",
"2. create the cluster (this step will take about **20 seconds**)\n",
"3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell"
@@ -109,7 +109,7 @@
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', max_nodes=4)\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', max_nodes=4)\n",
"\n",
" # create the cluster\n",
" gpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\n",

View File

@@ -176,7 +176,7 @@
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"Standard_NC6s_v3\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 4)\n",
"\n",

View File

@@ -105,7 +105,7 @@
"compute_min_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MIN_NODES\", 0)\n",
"compute_max_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MAX_NODES\", 4)\n",
"\n",
"# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6\n",
"# This example uses CPU VM. For using GPU VM, set SKU to Standard_NC6s_v3\n",
"vm_size = os.environ.get(\"AML_COMPUTE_CLUSTER_SKU\", \"STANDARD_D2_V2\")\n",
"\n",
"\n",

View File

@@ -143,7 +143,7 @@
"compute_min_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MIN_NODES\", 0)\n",
"compute_max_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MAX_NODES\", 2)\n",
"\n",
"# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6\n",
"# This example uses CPU VM. For using GPU VM, set SKU to Standard_NC6s_v3\n",
"vm_size = os.environ.get(\"AML_COMPUTE_CLUSTER_SKU\", \"STANDARD_D2_V2\")\n",
"\n",
"\n",

View File

@@ -103,7 +103,7 @@
"compute_min_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MIN_NODES\", 0)\n",
"compute_max_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MAX_NODES\", 4)\n",
"\n",
"# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6\n",
"# This example uses CPU VM. For using GPU VM, set SKU to Standard_NC6s_v3\n",
"vm_size = os.environ.get(\"AML_COMPUTE_CLUSTER_SKU\", \"STANDARD_D2_V2\")\n",
"\n",
"\n",

View File

@@ -86,7 +86,8 @@
"import requests\n",
"\n",
"oj_sales_path = \"./oj.csv\"\n",
"r = requests.get(\"https://raw.githubusercontent.com/Azure/azureml-examples/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales/data/dominicks_OJ.csv\")\n",
"r = requests.get(\"https://raw.githubusercontent.com/Azure/MachineLearningNotebooks/master/how-to-use-azureml/\n",
" automated-machine-learning/automl-forecasting-orange-juice-sales/data/dominicks_OJ.csv\")\n",
"open(oj_sales_path, \"wb\").write(r.content)"
]
},
@@ -165,7 +166,7 @@
"compute_min_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MIN_NODES\", 0)\n",
"compute_max_nodes = os.environ.get(\"AML_COMPUTE_CLUSTER_MAX_NODES\", 2)\n",
"\n",
"# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6\n",
"# This example uses CPU VM. For using GPU VM, set SKU to Standard_NC6s_v3\n",
"vm_size = os.environ.get(\"AML_COMPUTE_CLUSTER_SKU\", \"STANDARD_D2_V2\")\n",
"\n",
"\n",

View File

@@ -210,7 +210,7 @@
" print(\"found existing cluster.\")\n",
"except ComputeTargetException:\n",
" print(\"creating new cluster\")\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\",\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"Standard_NC6s_v3\",\n",
" max_nodes = 3)\n",
"\n",
" # create the cluster\n",

View File

@@ -246,7 +246,7 @@
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -40,6 +43,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -80,6 +84,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -101,6 +106,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -131,6 +137,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -168,6 +175,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -206,6 +214,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -213,6 +222,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -240,6 +250,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -269,6 +280,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -279,10 +291,11 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `STANDARD_NC6` GPU VMs. This process is broken down into 3 steps:\n",
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `Standard_NC6s_v3` GPU VMs. This process is broken down into 3 steps:\n",
"1. create the configuration (this step is local and only takes a second)\n",
"2. create the cluster (this step will take about **20 seconds**)\n",
"3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell"
@@ -305,7 +318,7 @@
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
@@ -320,6 +333,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -338,6 +352,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -361,6 +376,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -375,6 +391,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -394,6 +411,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -411,6 +429,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -430,12 +449,12 @@
"channels:\n",
"- conda-forge\n",
"dependencies:\n",
"- python=3.7\n",
"- pip=21.3.1\n",
"- python=3.8\n",
"- pip=23.1.2\n",
"- pip:\n",
" - h5py<=2.10.0\n",
" - azureml-defaults\n",
" - tensorflow-gpu==2.0.0\n",
" - tensorflow-gpu==2.2.0\n",
" - keras<=2.3.1\n",
" - matplotlib\n",
" - protobuf==3.20.1"
@@ -457,6 +476,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -501,6 +521,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -518,6 +539,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -547,6 +569,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -572,6 +595,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -579,6 +603,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -619,6 +644,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -626,6 +652,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -649,6 +676,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -657,6 +685,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -668,6 +697,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -675,6 +705,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -691,6 +722,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -698,6 +730,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -712,6 +745,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -719,6 +753,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -726,6 +761,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -753,6 +789,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -775,6 +812,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -791,6 +829,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -813,6 +852,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -829,6 +869,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -863,6 +904,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -890,6 +932,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -906,6 +949,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -922,6 +966,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -970,6 +1015,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -997,6 +1043,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1035,6 +1082,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1051,6 +1099,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1067,6 +1116,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1115,6 +1165,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1133,6 +1184,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1162,6 +1214,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1184,6 +1237,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

View File

@@ -3,6 +3,4 @@ dependencies:
- pip:
- azureml-sdk
- azureml-widgets
- tensorflow
- keras<=2.3.1
- matplotlib

View File

@@ -97,7 +97,7 @@
"metadata": {},
"source": [
"## Create or attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `STANDARD_NC6` GPU cluster that autoscales from `0` to `4` nodes.\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `Standard_NC6s_v3` GPU cluster that autoscales from `0` to `4` nodes.\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
"\n",
@@ -123,7 +123,7 @@
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
@@ -293,7 +293,7 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"pytorch_env = Environment.get(ws, name='AzureML-PyTorch-1.6-GPU')"
"pytorch_env = Environment.get(ws, name='azureml-acpt-pytorch-1.11-cuda11.3')"
]
},
{
@@ -323,7 +323,7 @@
"To use the per-process launch option in which Azure ML will handle launching each of the processes to run your training script,\n",
"\n",
"1. Specify the training script and arguments\n",
"2. Create a `PyTorchConfiguration` and specify `node_count` and `process_count`. The `process_count` is the total number of processes you want to run for the job; this should typically equal the # of GPUs available on each node multiplied by the # of nodes. Since this tutorial uses the `STANDARD_NC6` SKU, which has one GPU, the total process count for a 2-node job is `2`. If you are using a SKU with >1 GPUs, adjust the `process_count` accordingly.\n",
"2. Create a `PyTorchConfiguration` and specify `node_count` and `process_count`. The `process_count` is the total number of processes you want to run for the job; this should typically equal the # of GPUs available on each node multiplied by the # of nodes. Since this tutorial uses the `Standard_NC6s_v3` SKU, which has one GPU, the total process count for a 2-node job is `2`. If you are using a SKU with >1 GPUs, adjust the `process_count` accordingly.\n",
"\n",
"Azure ML will set the `MASTER_ADDR`, `MASTER_PORT`, `NODE_RANK`, `WORLD_SIZE` environment variables on each node, in addition to the process-level `RANK` and `LOCAL_RANK` environment variables, that are needed for distributed PyTorch training."
]

View File

@@ -97,7 +97,7 @@
"metadata": {},
"source": [
"## Create or attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `STANDARD_NC6` GPU cluster that autoscales from `0` to `4` nodes.\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `Standard_NC6s_v3` GPU cluster that autoscales from `0` to `4` nodes.\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
"\n",
@@ -123,7 +123,7 @@
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -28,6 +31,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -48,6 +52,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -71,6 +76,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -94,6 +100,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -124,7 +131,7 @@
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
@@ -137,6 +144,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -144,6 +152,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -152,6 +161,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -172,6 +182,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -180,6 +191,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -204,6 +216,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -222,6 +235,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -242,32 +256,13 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"Define a conda environment YAML file with your training script dependencies and create an Azure ML environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile conda_dependencies.yml\n",
"\n",
"channels:\n",
"- conda-forge\n",
"- pytorch\n",
"dependencies:\n",
"- python=3.8.12\n",
"- pip=21.3.1\n",
"- pytorch::pytorch==1.8.1\n",
"- pytorch::torchvision==0.9.1\n",
"- pip:\n",
" - azureml-defaults"
"Create an Azure ML environment."
]
},
{
@@ -278,14 +273,11 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"pytorch_env = Environment.from_conda_specification(name = 'pytorch-1.6-gpu', file_path = './conda_dependencies.yml')\n",
"\n",
"# Specify a GPU base image\n",
"pytorch_env.docker.enabled = True\n",
"pytorch_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04'"
"pytorch_env = Environment.get(ws, name='azureml-acpt-pytorch-1.11-cuda11.3')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -310,6 +302,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -338,6 +331,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -357,6 +351,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -373,6 +368,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -381,6 +377,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -417,6 +414,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -434,6 +432,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -451,6 +450,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -476,6 +476,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -517,6 +518,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -534,6 +536,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -542,6 +545,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -555,6 +559,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -564,6 +569,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -598,6 +604,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -605,6 +612,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -621,6 +629,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -637,6 +646,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -695,6 +705,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -25,6 +28,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -32,6 +36,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -51,6 +56,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -58,6 +64,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -76,6 +83,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -83,6 +91,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -105,6 +114,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -112,6 +122,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -123,6 +134,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -164,6 +176,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -171,6 +184,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -178,6 +192,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -185,6 +200,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -192,6 +208,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -211,6 +228,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -218,6 +236,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -258,6 +277,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -265,6 +285,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -284,27 +305,13 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"Define a conda environment YAML file with your training script dependencies and create an Azure ML environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile conda_dependencies.yml\n",
"\n",
"dependencies:\n",
"- python=3.6.2\n",
"- scikit-learn\n",
"- pip:\n",
" - azureml-defaults"
"Create an Azure ML environment."
]
},
{
@@ -315,10 +322,11 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"sklearn_env = Environment.from_conda_specification(name = 'sklearn-env', file_path = './conda_dependencies.yml')"
"sklearn_env = Environment.get(ws, name='azureml-sklearn-1.0')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -343,6 +351,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -350,6 +359,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -366,6 +376,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -373,6 +384,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -400,6 +412,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -407,6 +420,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -414,6 +428,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -421,6 +436,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -454,6 +470,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -471,6 +488,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -478,6 +496,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -512,6 +531,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -539,6 +559,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -555,6 +576,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -25,6 +28,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -49,6 +53,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -72,6 +77,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -95,6 +101,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -125,7 +132,7 @@
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
@@ -138,6 +145,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -145,6 +153,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -152,6 +161,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -159,6 +169,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -179,6 +190,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -195,10 +207,11 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.7-ubuntu20.04-py38-cuda11-gpu')"
"tf_env = Environment.get(ws, name='azureml-tensorflow-2.11-cuda11')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -226,6 +239,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -245,6 +259,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -263,6 +278,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -25,6 +28,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -65,6 +69,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -88,6 +93,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -109,6 +115,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -139,6 +146,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -171,6 +179,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -215,6 +224,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -238,6 +248,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -266,6 +277,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -276,10 +288,11 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `STANDARD_NC6` GPU VMs. This process is broken down into 3 steps:\n",
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `Standard_NC6s_v3` GPU VMs. This process is broken down into 3 steps:\n",
"1. create the configuration (this step is local and only takes a second)\n",
"2. create the cluster (this step will take about **20 seconds**)\n",
"3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell"
@@ -302,7 +315,7 @@
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
@@ -317,6 +330,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -335,6 +349,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -358,6 +373,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -372,6 +388,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -391,6 +408,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -408,6 +426,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -424,10 +443,11 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.6-ubuntu20.04-py38-cuda11-gpu')"
"tf_env = Environment.get(ws, name='azureml-tensorflow-2.11-cuda11')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -457,6 +477,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -484,6 +505,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -494,6 +516,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -519,6 +542,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -541,6 +565,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -558,6 +583,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -581,6 +607,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -597,6 +624,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -631,6 +659,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -648,6 +677,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -664,6 +694,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -680,6 +711,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -710,6 +742,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -735,6 +768,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -753,6 +787,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -783,6 +818,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -808,6 +844,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -833,6 +870,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -39,6 +42,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -79,6 +83,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -102,6 +107,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -123,6 +129,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -153,6 +160,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -186,6 +194,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -229,6 +238,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -252,6 +262,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -283,6 +294,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -293,10 +305,11 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `STANDARD_NC6` GPU VMs. This process is broken down into 3 steps:\n",
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `Standard_NC6s_v3` GPU VMs. This process is broken down into 3 steps:\n",
"1. create the configuration (this step is local and only takes a second)\n",
"2. create the cluster (this step will take about **20 seconds**)\n",
"3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell"
@@ -319,7 +332,7 @@
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
@@ -334,6 +347,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -352,6 +366,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -375,6 +390,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"nbpresent": {
@@ -389,6 +405,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -408,6 +425,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -425,6 +443,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -441,10 +460,11 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.6-ubuntu20.04-py38-cuda11-gpu')"
"tf_env = Environment.get(ws, name='azureml-tensorflow-2.11-cuda11')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -475,6 +495,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -492,6 +513,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -522,6 +544,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -547,6 +570,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -587,6 +611,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -616,6 +641,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -623,6 +649,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -639,6 +666,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -649,6 +677,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -657,6 +686,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -669,6 +699,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -676,6 +707,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -683,6 +715,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -710,6 +743,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -732,6 +766,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -748,6 +783,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -770,6 +806,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -786,6 +823,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -820,6 +858,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -846,6 +885,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -862,6 +902,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -878,6 +919,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -924,6 +966,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -950,6 +993,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -987,6 +1031,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1003,6 +1048,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1019,6 +1065,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1067,6 +1114,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1096,6 +1144,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1118,6 +1167,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

View File

@@ -6,7 +6,5 @@ dependencies:
- azureml-sdk
- azureml-widgets
- pandas
- keras
- tensorflow==2.0.0
- matplotlib
- fuse

View File

@@ -5,7 +5,7 @@ import numpy as np
import argparse
import os
import re
import tensorflow as tf
import tensorflow.compat.v1 as tf
import glob
from azureml.core import Run
@@ -41,8 +41,8 @@ y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyt
recursive=True)[0], True).reshape(-1)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')
training_set_size = X_train.shape[0]
tf.disable_v2_behavior()
n_inputs = 28 * 28
n_h1 = 100

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -25,6 +28,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -49,6 +53,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -72,6 +77,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -95,6 +101,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -125,7 +132,7 @@
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
@@ -138,6 +145,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -145,6 +153,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -169,6 +178,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -208,6 +218,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -215,6 +226,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -234,6 +246,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -256,6 +269,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -276,6 +290,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -292,10 +307,11 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"tf_env = Environment.get(ws, name='AzureML-TensorFlow-1.13-GPU')"
"tf_env = Environment.get(ws, name='azureml-tensorflow-2.11-cuda11')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -322,6 +338,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -340,6 +357,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -358,6 +376,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -374,6 +393,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -381,6 +401,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -401,6 +422,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -428,6 +450,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

View File

@@ -164,7 +164,7 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"env = Environment.get(workspace=ws, name=\"AzureML-PyTorch-1.4-GPU\").clone(\"mlflow-env\")\n",
"env = Environment.get(workspace=ws, name=\"azureml-acpt-pytorch-1.11-cuda11.3\").clone(\"mlflow-env\")\n",
"\n",
"env.python.conda_dependencies.add_pip_package(\"azureml-mlflow\")\n",
"env.python.conda_dependencies.add_pip_package(\"Pillow==6.0.0\")"

View File

@@ -44,7 +44,7 @@ To make use of these samples, you need the following.
* A Microsoft Azure subscription.
* A Microsoft Azure resource group.
* An Azure Machine Learning Workspace in the resource group.
* Azure Machine Learning training compute. These samples use the VM sizes `STANDARD_NC6` and `STANDARD_D2_V2`. If these are not available in your region,
* Azure Machine Learning training compute. These samples use the VM sizes `Standard_NC6s_v3` and `STANDARD_D2_V2`. If these are not available in your region,
you can replace them with other sizes.
* A virtual network set up in the resource group for samples that use multiple compute targets. The Cartpole and Multi-agent Particle examples do not need a virtual network. Any network security group defined on the virtual network must allow network traffic on ports used by Azure infrastructure services. Sample instructions are provided in Atari Pong and Minecraft example notebooks.

View File

@@ -1,14 +1,16 @@
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.2-cudnn8-ubuntu20.04:20221010.v1
# NC-series GPUs only support the pytorch-1.11/cuda11.3 combo
# See https://learn.microsoft.com/en-us/azure/machine-learning/resource-curated-environments
FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-cuda11.3
USER root
ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/tensorflow-2.7
# ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/ray-rllib
# Create conda environment
RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
python=3.8 pip=20.2.4
# RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
# python=3.8.5
# Prepend path to AzureML conda environment
ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH
# ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH
# Install necessary packages to support videos in rllib/gym
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -22,51 +24,32 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
RUN pip --version
RUN python --version
# Install ray-on-aml
RUN pip install 'ray-on-aml==0.1.6'
RUN pip install ray-on-aml==0.2.4 \
ray==2.4.0 \
ray[rllib]==2.4.0 \
mlflow==2.3.1 \
azureml-defaults==1.50.0 \
azureml-dataset-runtime[fuse,pandas]==1.50.0 \
azureml-contrib-reinforcementlearning==1.50.0 \
gputil==1.4.0 \
scipy==1.9.1 \
pyglet==2.0.6 \
cloudpickle==2.2.1 \
tensorflow==2.11.0 \
tensorflow-probability==0.19.0 \
tabulate==0.9.0 \
dm_tree==0.1.8 \
lz4==4.3.2 \
psutil==5.9.4 \
setproctitle==1.3.2 \
pygame==2.1.0 \
gymnasium[classic_control]==0.26.3 \
gymnasium[atari]==0.26.3 \
gymnasium[accept-rom-license]==0.26.3 \
gym==0.26.2 \
gym[atari]==0.26.2 \
gym[accept-rom-license]==0.26.2
RUN pip install ray==0.8.7
RUN pip install gym[atari]==0.19.0
RUN pip install gym[accept-rom-license]==0.19.0
# Install pip dependencies for Tensorflow
RUN pip install 'matplotlib~=3.5.0' \
'psutil~=5.8.0' \
'tqdm~=4.62.0' \
'pandas~=1.3.0' \
'scipy~=1.7.0' \
'numpy~=1.21.0' \
'ipykernel~=6.0' \
'azureml-core==1.47.0' \
'azureml-defaults==1.47.0' \
'azureml-mlflow==1.47.0' \
'azureml-telemetry==1.47.0' \
'tensorboard~=2.7.0' \
'tensorflow-gpu~=2.7.0' \
'tensorflow-datasets~=4.5.0' \
'onnxruntime-gpu~=1.9.0' \
'protobuf~=3.20' \
'horovod[tensorflow-gpu]~=0.23.0' \
'debugpy~=1.6.3'
RUN pip install --no-cache-dir \
azureml-defaults \
azureml-dataset-runtime[fuse,pandas] \
azureml-contrib-reinforcementlearning \
gputil \
cloudpickle==1.3.0 \
tabulate \
dm_tree \
lz4 \
psutil \
setproctitle
# This is required for ray 0.8.7
RUN pip install -U aiohttp==3.7.4
RUN pip install 'msrest<0.7.0'
RUN pip install protobuf==3.20.0
# This is needed for mpi to locate libpython
ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH
# Display all versions
RUN pip freeze

View File

@@ -1,237 +0,0 @@
import sys
import csv
from azure.mgmt.network import NetworkManagementClient
def check_port_in_port_range(expected_port: str,
dest_port_range: str):
"""
Check if a port is within a port range
Port range maybe like *, 8080 or 8888-8889
"""
if dest_port_range == '*':
return True
dest_ports = dest_port_range.split('-')
if len(dest_ports) == 1 and \
int(dest_ports[0]) == int(expected_port):
return True
if len(dest_ports) == 2 and \
int(dest_ports[0]) <= int(expected_port) and \
int(dest_ports[1]) >= int(expected_port):
return True
return False
def check_port_in_destination_port_ranges(expected_port: str,
dest_port_ranges: list):
"""
Check if a port is within a given list of port ranges
i.e. check if port 8080 is in port ranges of 22,80,8080-8090,443
"""
for dest_port_range in dest_port_ranges:
if check_port_in_port_range(expected_port, dest_port_range) is True:
return True
return False
def check_ports_in_destination_port_ranges(expected_ports: list,
dest_port_ranges: list):
"""
Check if all ports in a given port list are within a given list
of port ranges
i.e. check if port 8080,8081 are in port ranges of 22,80,8080-8090,443
"""
for expected_port in expected_ports:
if check_port_in_destination_port_ranges(
expected_port, dest_port_ranges) is False:
return False
return True
def check_source_address_prefix(source_address_prefix: str):
"""Check if source address prefix is BatchNodeManagement or default"""
required_prefix = 'BatchNodeManagement'
default_prefix = 'default'
if source_address_prefix.lower() == required_prefix.lower() or \
source_address_prefix.lower() == default_prefix.lower():
return True
return False
def check_protocol(protocol: str):
"""Check if protocol is supported - Tcp/Any"""
required_protocol = 'Tcp'
any_protocol = 'Any'
if required_protocol.lower() == protocol.lower() or \
any_protocol.lower() == protocol.lower():
return True
return False
def check_direction(direction: str):
"""Check if port direction is inbound"""
required_direction = 'Inbound'
if required_direction.lower() == direction.lower():
return True
return False
def check_provisioning_state(provisioning_state: str):
"""Check if the provisioning state is succeeded"""
required_provisioning_state = 'Succeeded'
if required_provisioning_state.lower() == provisioning_state.lower():
return True
return False
def check_rule_for_Azure_ML(rule):
"""Check if the ports required for Azure Machine Learning are open"""
required_ports = ['29876', '29877']
if check_source_address_prefix(rule.source_address_prefix) is False:
return False
if check_protocol(rule.protocol) is False:
return False
if check_direction(rule.direction) is False:
return False
if check_provisioning_state(rule.provisioning_state) is False:
return False
if rule.destination_port_range is not None:
if check_ports_in_destination_port_ranges(
required_ports,
[rule.destination_port_range]) is False:
return False
else:
if check_ports_in_destination_port_ranges(
required_ports,
rule.destination_port_ranges) is False:
return False
return True
def check_vnet_security_rules(auth_object,
vnet_subscription_id,
vnet_resource_group,
vnet_name,
save_to_file=False):
"""
Check all the rules of virtual network if required ports for Azure Machine
Learning are open
"""
network_client = NetworkManagementClient(
auth_object,
vnet_subscription_id)
# get the vnet
vnet = network_client.virtual_networks.get(
resource_group_name=vnet_resource_group,
virtual_network_name=vnet_name)
vnet_location = vnet.location
vnet_info = []
if vnet.subnets is None or len(vnet.subnets) == 0:
print('WARNING: No subnet found for VNet:', vnet_name)
# for each subnet of the vnet
for subnet in vnet.subnets:
if subnet.network_security_group is None:
print('WARNING: No network security group found for subnet.',
'Subnet',
subnet.id.split("/")[-1])
else:
# get all the rules
network_security_group_name = \
subnet.network_security_group.id.split("/")[-1]
network_security_group_resource_group_name = \
subnet.network_security_group.id.split("/")[4]
network_security_group_subscription_id = \
subnet.network_security_group.id.split("/")[2]
security_rules = list(network_client.security_rules.list(
network_security_group_resource_group_name,
network_security_group_name))
rule_matched = None
for rule in security_rules:
rule_info = []
# add vnet details
rule_info.append(vnet_name)
rule_info.append(vnet_subscription_id)
rule_info.append(vnet_resource_group)
rule_info.append(vnet_location)
# add subnet details
rule_info.append(subnet.id.split("/")[-1])
rule_info.append(network_security_group_name)
rule_info.append(network_security_group_subscription_id)
rule_info.append(network_security_group_resource_group_name)
# add rule details
rule_info.append(rule.priority)
rule_info.append(rule.name)
rule_info.append(rule.source_address_prefix)
if rule.destination_port_range is not None:
rule_info.append(rule.destination_port_range)
else:
rule_info.append(rule.destination_port_ranges)
rule_info.append(rule.direction)
rule_info.append(rule.provisioning_state)
vnet_info.append(rule_info)
if check_rule_for_Azure_ML(rule) is True:
rule_matched = rule
if rule_matched is not None:
print("INFORMATION: Rule matched with required ports. Subnet:",
subnet.id.split("/")[-1], "Rule:", rule.name)
else:
print("WARNING: No rule matched with required ports. Subnet:",
subnet.id.split("/")[-1])
if save_to_file is True:
file_name = vnet_name + ".csv"
with open(file_name, mode='w') as vnet_rule_file:
vnet_rule_file_writer = csv.writer(
vnet_rule_file,
delimiter=',',
quotechar='"',
quoting=csv.QUOTE_MINIMAL)
header = ['VNet_Name', 'VNet_Subscription_ID',
'VNet_Resource_Group', 'VNet_Location',
'Subnet_Name', 'NSG_Name',
'NSG_Subscription_ID', 'NSG_Resource_Group',
'Rule_Priority', 'Rule_Name', 'Rule_Source',
'Rule_Destination_Ports', 'Rule_Direction',
'Rule_Provisioning_State']
vnet_rule_file_writer.writerow(header)
vnet_rule_file_writer.writerows(vnet_info)
print("INFORMATION: Network security group rules for your virtual \
network are saved in file", file_name)

View File

@@ -0,0 +1,18 @@
pong-impala-vectorized:
env: ALE/Pong-v5
run: IMPALA
config:
# Make analogous to old v4 + NoFrameskip.
env_config:
frameskip: 1
full_action_space: false
repeat_action_probability: 0.0
rollout_fragment_length: 50
train_batch_size: 500
num_workers: 11
num_envs_per_worker: 10
framework: torch
log_level: INFO
stop:
episode_reward_mean: 10
time_total_s: 3600

View File

@@ -1,35 +1,35 @@
from ray_on_aml.core import Ray_On_AML
import ray.tune as tune
from ray.rllib import train
import yaml
from ray.tune.tune import run_experiments
from utils import callbacks
import argparse
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--config', help='Path to yaml configuration file')
args = parser.parse_args()
ray_on_aml = Ray_On_AML()
ray = ray_on_aml.getRay()
if ray: # in the headnode
# Parse arguments
train_parser = train.create_parser()
ray.init(address="auto")
print("Configuring run from file: ", args.config)
experiment_config = None
with open(args.config, "r") as file:
experiment_config = yaml.safe_load(file)
print(f'Config: {experiment_config}')
args = train_parser.parse_args()
print("Algorithm config:", args.config)
# Set local_dir in each experiment configuration to ensure generated logs get picked up
# by Azure ML
for experiment in experiment_config.values():
experiment["local_dir"] = "./logs"
trials = run_experiments(
experiment_config,
callbacks=[callbacks.TrialCallback()],
verbose=2
)
tune.run(
run_or_experiment=args.run,
config={
"env": args.env,
"num_gpus": args.config["num_gpus"],
"num_workers": args.config["num_workers"],
"callbacks": {"on_train_result": callbacks.on_train_result},
"sample_batch_size": 50,
"train_batch_size": 1000,
"num_sgd_iter": 2,
"num_data_loader_buffers": 2,
"model": {"dim": 42},
},
stop=args.stop,
local_dir='./logs')
else:
print("in worker node")

View File

@@ -3,15 +3,20 @@
'''
from azureml.core import Run
from ray import tune
from ray.tune import Callback
from ray.air import session
def on_train_result(info):
'''Callback on train result to record metrics returned by trainer.
'''
run = Run.get_context()
run.log(
name='episode_reward_mean',
value=info["result"]["episode_reward_mean"])
run.log(
name='episodes_total',
value=info["result"]["episodes_total"])
class TrialCallback(Callback):
def on_trial_result(self, iteration, trials, trial, result, **info):
'''Callback on train result to record metrics returned by trainer.
'''
run = Run.get_context()
run.log(
name='episode_reward_mean',
value=result["episode_reward_mean"])
run.log(
name='episodes_total',
value=result["episodes_total"])

View File

@@ -22,7 +22,8 @@
"source": [
"# Reinforcement Learning in Azure Machine Learning - Pong problem\n",
"Reinforcement Learning in Azure Machine Learning is a managed service for running distributed reinforcement learning training and simulation using the open source Ray framework.\n",
"This example uses Ray RLlib to train a Pong playing agent on a multi-node cluster.\n",
"This noteboook demonstrates how to use Ray to solve a more complex problem using a more complex setup including Ray RLLib running on multiple compute nodes and using a GPU.\n",
"For this example we will train a Pong playing agent on cluster of two NC6 nodes (6 CPU, 1 GPU).\n",
"\n",
"## Pong problem\n",
"[Pong](https://en.wikipedia.org/wiki/Pong) is a two-dimensional sports game that simulates table tennis. The player controls an in-game paddle by moving it vertically across the left or right side of the screen. They can compete against another player controlling a second paddle on the opposing side. Players use the paddles to hit a ball back and forth."
@@ -46,7 +47,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The goal here is to train an agent to win an episode of Pong game against opponent with the score of at least 18 points. An episode in Pong runs until one of the players reaches a score of 21. Episodes are a terminology that is used across all the [OpenAI gym](https://www.gymlibrary.dev/environments/atari/pong/) environments that contains a strictly defined task.\n",
"The goal here is to train an agent to win an episode of Pong game against opponent with the score of at least 10 points. An episode in Pong runs until one of the players reaches a score of 21. Episodes are a terminology that is used across all the [OpenAI gym](https://www.gymlibrary.dev/environments/atari/pong/) environments that contains a strictly defined task.\n",
"\n",
"Training a Pong agent is a compute-intensive task and this example demonstrates the use of Reinforcement Learning in Azure Machine Learning service to train an agent faster in a distributed, parallel environment. You'll learn more about using the head and the worker compute targets to train an agent in this notebook below."
]
@@ -60,19 +61,6 @@
"It is highly recommended that the user should go through the [Reinforcement Learning in Azure Machine Learning - Cartpole Problem on Single Compute](../cartpole-on-single-compute/cartpole_sc.ipynb) to understand the basics of Reinforcement Learning in Azure Machine Learning and Ray RLlib used in this notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up Development Environment\n",
"The following subsections show typical steps to setup your development environment. Setup includes:\n",
"\n",
"* Connecting to a workspace to enable communication between your local machine and remote resources\n",
"* Creating an experiment to track all your runs\n",
"* Setting up a virtual network\n",
"* Creating remote head and worker compute target on a virtual network to use for training"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -86,7 +74,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646081765827
"logged": 1683263371795
}
},
"outputs": [],
@@ -113,7 +101,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646081772340
"logged": 1683263375690
}
},
"outputs": [],
@@ -137,7 +125,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646081775643
"logged": 1683263378789
}
},
"outputs": [],
@@ -165,7 +153,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646086081229
"logged": 1683263385677
}
},
"outputs": [],
@@ -177,8 +165,8 @@
"compute_min_nodes = 0\n",
"compute_max_nodes = 2\n",
"\n",
"# This example uses GPU VM. For using CPU VM, set SKU to STANDARD_D2_V2\n",
"vm_size = 'STANDARD_NC6'\n",
"# This example uses GPU VM.\n",
"vm_size = 'Standard_NC6s_v3'\n",
"\n",
"if compute_name in ws.compute_targets:\n",
" compute_target = ws.compute_targets[compute_name]\n",
@@ -207,15 +195,52 @@
" print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
},
"source": [
"### Create Azure ML Environment\r\n",
"\r\n",
"This step creates and registers an Azure ML Environment that includes all of the dependencies needed to run this example, including CUDA drivers Pytorch, RLLib, and associated tools. This step can take a significant time (30 min) on the first run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646160884910
"logged": 1683263388781
},
"jupyter": {
"outputs_hidden": true,
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"ray_environment_name = 'pong-gpu'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1683056774047
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
@@ -229,7 +254,6 @@
"import os\n",
"from azureml.core import Environment\n",
"\n",
"ray_environment_name = 'pong-gpu'\n",
"ray_environment_dockerfile_path = os.path.join(os.getcwd(), 'docker', 'Dockerfile-gpu')\n",
"\n",
"# Build GPU image\n",
@@ -249,15 +273,7 @@
"\n",
"The code below submits the training run using a `ScriptRunConfig`. By providing the\n",
"command to run the training, and a `RunConfig` object configured with your\n",
"compute target, number of nodes, and environment image to use.\n",
"\n",
"We specify `episode_reward_mean` to 18 as we want to stop the training as soon as the trained agent reaches an average win margin of at least 18 point over opponent over all episodes in the training epoch.\n",
"Number of Ray worker processes are defined by parameter `num_workers`. We set it to 13 as we have 11 CPUs available in our compute targets. Multiple Ray worker processes parallelizes agent training and helps in achieving our goal faster. \n",
"\n",
"```\n",
"Number of CPUs in the compute cluster = 6 * 2 = 12 CPUs over 2 nodes\n",
"Number of CPUs available = (Number of CPUs in the compute cluster) - (1 CPU for head node) = 12 - 1 = 11\n",
"```"
"compute target, number of nodes, and environment image to use."
]
},
{
@@ -265,7 +281,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646162435310
"logged": 1683264835679
}
},
"outputs": [],
@@ -282,16 +298,12 @@
"aml_run_config_ml.node_count = 2\n",
"aml_run_config_ml.environment = ray_environment\n",
"\n",
"training_algorithm = \"IMPALA\"\n",
"rl_environment = \"PongNoFrameskip-v4\"\n",
"script_name='pong_rllib.py'\n",
"config_name='pong-impala-vectorized.yaml'\n",
"\n",
"command=[\n",
" 'python', script_name,\n",
" '--run', training_algorithm,\n",
" '--env', rl_environment,\n",
" '--config', '\\'{\"num_gpus\": 1, \"num_workers\": 11}\\'',\n",
" '--stop', '\\'{\"episode_reward_mean\": 18, \"time_total_s\": 3600}\\''\n",
" '--config', config_name\n",
"]\n",
"\n",
"config = ScriptRunConfig(source_directory='./files',\n",
@@ -305,25 +317,34 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training script\n",
"As recommended in [RLlib](https://ray.readthedocs.io/en/latest/rllib.html) documentations, we use Ray [Tune](https://ray.readthedocs.io/en/latest/tune.html) API to run the training algorithm. All the RLlib built-in trainers are compatible with the Tune API. Here we use tune.run() to execute a built-in training algorithm. For convenience, down below you can see part of the entry script where we make this call.\n",
"### Training configuration\n",
"All training parameters (including the Reinforcement Learning algorithm) are set through a single configuration file. For this example we'll be using the IMPALA algorithm to train an agent to play Atari Pong.\n",
"We set `num_workers` to 11 because we have 11 CPUs available for worker nodes (6 CPUs on each of 2 machines, with 1 CPU consumed as a head node).\n",
"We set `episode_reward_mean` (under `stop`) to 10 so that we terminate the run once we achieve a reward score of 10.\n",
"\n",
"Here is the configuration we are using for this example:\n",
"\n",
"```yaml\n",
"pong:\n",
" env: ALE/Pong-v5\n",
" run: IMPALA\n",
" config:\n",
" num_workers: 11\n",
" num_gpus: 1\n",
" rollout_fragment_length: 50\n",
" train_batch_size: 1000\n",
" num_sgd_iter: 2\n",
" num_multi_gpu_tower_stacks: 2\n",
" env_config:\n",
" frameskip: 1\n",
" full_action_space: false\n",
" repeat_action_probability: 0.0\n",
" stop:\n",
" episode_reward_mean: 10\n",
" total_time_s: 3600\n",
" model:\n",
" dim: 42\n",
"\n",
"```python\n",
" tune.run(\n",
" run_or_experiment=args.run,\n",
" config={\n",
" \"env\": args.env,\n",
" \"num_gpus\": args.config[\"num_gpus\"],\n",
" \"num_workers\": args.config[\"num_workers\"],\n",
" \"callbacks\": {\"on_train_result\": callbacks.on_train_result},\n",
" \"sample_batch_size\": 50,\n",
" \"train_batch_size\": 1000,\n",
" \"num_sgd_iter\": 2,\n",
" \"num_data_loader_buffers\": 2,\n",
" \"model\": {\"dim\": 42},\n",
" },\n",
" stop=args.stop,\n",
" local_dir='./logs')\n",
"```"
]
},
@@ -339,7 +360,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1683056781459
}
},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
@@ -359,7 +384,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1683056781759
}
},
"outputs": [],
"source": [
"# Uncomment line below to cancel the run\n",
@@ -379,7 +408,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1682525323059
}
},
"outputs": [],
"source": [
"training_run.wait_for_completion()"
@@ -399,7 +432,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1683064583273
}
},
"outputs": [],
"source": [
"# Get the reward metrics from training_run\n",
@@ -416,7 +453,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1682445012908
}
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
@@ -431,7 +472,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We observe that during the training over multiple episodes, the agent learns to win the Pong game against opponent with our target of 18 points in each episode of 21 points.\n",
"We observe that during the training over multiple episodes, the agent learns to win the Pong game against opponent with our target of 10 points in each episode of 21 points.\n",
"**Congratulations!! You have trained your Pong agent to win a game.**"
]
},
@@ -446,7 +487,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1682445012927
}
},
"outputs": [],
"source": [
"# To archive the created experiment:\n",
@@ -456,14 +501,6 @@
"#head_compute_target.delete()\n",
"#worker_compute_target.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next\n",
"In this example, you learned how to solve distributed reinforcement learning training problems using head and worker compute targets. This was an introductory tutorial on Reinforement Learning in Azure Machine Learning service offering. We would love to hear your feedback to build the features you need!"
]
}
],
"metadata": {
@@ -494,7 +531,17 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.8.5"
},
"microsoft": {
"host": {
"AzureML": {
"notebookHasBeenCompleted": true
}
},
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"notice": "Copyright (c) Microsoft Corporation. All rights reserved.\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00afLicensed under the MIT License.\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00af ",
"nteract": {

View File

@@ -84,7 +84,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646344676671
"logged": 1683062935076
}
},
"outputs": [],
@@ -106,7 +106,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646344680982
"logged": 1683062936280
}
},
"outputs": [],
@@ -133,7 +133,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646344684217
"logged": 1683062936485
}
},
"outputs": [],
@@ -160,7 +160,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646344690768
"logged": 1683062937126
}
},
"outputs": [],
@@ -212,14 +212,14 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646344835579
"logged": 1683062937499
}
},
"outputs": [],
"source": [
"from azureml.core.experiment import Experiment\n",
"\n",
"experiment_name = 'CartPole-v0-CI'\n",
"experiment_name = 'CartPole-v1-CI'\n",
"experiment = Experiment(workspace=ws, name=experiment_name)"
]
},
@@ -228,7 +228,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646346293902
"logged": 1683064044718
},
"jupyter": {
"outputs_hidden": false,
@@ -282,7 +282,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347120585
"logged": 1683064046594
},
"jupyter": {
"outputs_hidden": false,
@@ -300,18 +300,10 @@
"from azureml.core import RunConfiguration, ScriptRunConfig, Experiment\n",
"from azureml.core.runconfig import DockerConfiguration, RunConfiguration\n",
"\n",
"training_algorithm = 'PPO'\n",
"rl_environment = 'CartPole-v0'\n",
"\n",
"config_name = 'cartpole-ppo.yaml'\n",
"script_name = 'cartpole_training.py'\n",
"script_arguments = [\n",
" '--run', training_algorithm,\n",
" '--env', rl_environment,\n",
" '--config', '{\"num_gpus\": 0, \"num_workers\": 1}',\n",
" '--stop', '{\"episode_reward_mean\": 200, \"time_total_s\": 300}',\n",
" '--checkpoint-freq', '2',\n",
" '--checkpoint-at-end',\n",
" '--local-dir', './logs'\n",
" '--config', config_name\n",
"]\n",
"\n",
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
@@ -331,43 +323,35 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training script\n",
"### Training configuration\n",
"\n",
"As recommended in RLlib documentations, we use Ray Tune API to run the training algorithm. All the RLlib built-in trainers are compatible with the Tune API. Here we use `tune.run()` to execute a built-in training algorithm. For convenience, down below you can see part of the entry script where we make this call.\n",
"This is the training configuration (in yaml) that we use to train an agent to solve the CartPole problem using\n",
"the PPO algorithm.\n",
"\n",
"This is the list of parameters we are passing into `tune.run()` via the `script_params` parameter:\n",
"\n",
"- `run_or_experiment`: name of the [built-in algorithm](https://ray.readthedocs.io/en/latest/rllib-algorithms.html#rllib-algorithms), 'PPO' in our example,\n",
"- `config`: Algorithm-specific configuration. This includes specifying the environment, `env`, which in our example is the gym **[CartPole-v0](https://www.gymlibrary.dev/environments/classic_control/cart_pole/)** environment,\n",
"- `stop`: stopping conditions, which could be any of the metrics returned by the trainer. Here we use \"mean of episode reward\", and \"total training time in seconds\" as stop conditions, and\n",
"- `checkpoint_freq` and `checkpoint_at_end`: Frequency of taking checkpoints (number of training iterations between checkpoints), and if a checkpoint should be taken at the end.\n",
"\n",
"We also specify the `local_dir`, the directory in which the training logs, checkpoints and other training artificats will be recorded. \n",
"\n",
"See [RLlib Training APIs](https://ray.readthedocs.io/en/latest/rllib-training.html#rllib-training-apis) for more details, and also [Training (tune.run, tune.Experiment)](https://ray.readthedocs.io/en/latest/tune/api_docs/execution.html#training-tune-run-tune-experiment) for the complete list of parameters.\n",
"\n",
"```python\n",
"import os\n",
"import ray\n",
"import ray.tune as tune\n",
"\n",
"if __name__ == \"__main__\":\n",
"\n",
" # parse arguments ...\n",
" \n",
" # Start ray head (single node)\n",
" os.system('ray start --head')\n",
" ray.init(address='auto')\n",
"\n",
" # Run training task using tune.run\n",
" tune.run(\n",
" run_or_experiment=args.run,\n",
" config=dict(args.config, env=args.env),\n",
" stop=args.stop,\n",
" checkpoint_freq=args.checkpoint_freq,\n",
" checkpoint_at_end=args.checkpoint_at_end,\n",
" local_dir=args.local_dir\n",
" )\n",
"```yaml\n",
"cartpole-ppo:\n",
" env: CartPole-v1\n",
" run: PPO\n",
" stop:\n",
" episode_reward_mean: 475\n",
" time_total_s: 300\n",
" checkpoint_config:\n",
" checkpoint_frequency: 2\n",
" checkpoint_at_end: true\n",
" config:\n",
" # Works for both torch and tf.\n",
" framework: torch\n",
" gamma: 0.99\n",
" lr: 0.0003\n",
" num_workers: 1\n",
" observation_filter: MeanStdFilter\n",
" num_sgd_iter: 6\n",
" vf_loss_coeff: 0.01\n",
" model:\n",
" fcnet_hiddens: [32]\n",
" fcnet_activation: linear\n",
" vf_share_layers: true\n",
" enable_connectors: true\n",
"```"
]
},
@@ -386,7 +370,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347127671
"logged": 1683064049813
}
},
"outputs": [],
@@ -408,7 +392,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1683064050024
}
},
"outputs": [],
"source": [
"# Uncomment line below to cancel the run\n",
@@ -430,7 +418,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347318682
"logged": 1683064304728
}
},
"outputs": [],
@@ -463,7 +451,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347328505
"logged": 1683064305251
}
},
"outputs": [],
@@ -471,7 +459,7 @@
"from os import path\n",
"from distutils import dir_util\n",
"\n",
"training_artifacts_path = path.join(\"logs\", training_algorithm)\n",
"training_artifacts_path = path.join(\"logs\", \"cartpole-ppo\")\n",
"print(\"Training artifacts path:\", training_artifacts_path)\n",
"\n",
"if path.exists(training_artifacts_path):\n",
@@ -493,19 +481,20 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347334571
"logged": 1683064305283
}
},
"outputs": [],
"source": [
"# A helper function to find checkpoint files in a directory\n",
"# A helper function to find all of the checkpoint directories located within a larger directory tree\n",
"def find_checkpoints(file_path):\n",
" print(\"Looking in path:\", file_path)\n",
" checkpoints = []\n",
" for root, _, files in os.walk(file_path):\n",
" for name in files:\n",
" if os.path.basename(root).startswith('checkpoint_'):\n",
" checkpoints.append(path.join(root, name))\n",
" for root, dirs, files in os.walk(file_path):\n",
" trimmed_root = root[len(file_path)+1:]\n",
" for name in dirs:\n",
" if name.startswith('checkpoint_'):\n",
" checkpoints.append(path.join(trimmed_root, name))\n",
" return checkpoints"
]
},
@@ -514,7 +503,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347337724
"logged": 1683064305305
}
},
"outputs": [],
@@ -522,16 +511,16 @@
"# Find checkpoints and last checkpoint number\n",
"checkpoint_files = find_checkpoints(training_artifacts_path)\n",
"\n",
"checkpoint_numbers = []\n",
"for file in checkpoint_files:\n",
" file = os.path.basename(file)\n",
" if file.startswith('checkpoint-') and not file.endswith('.tune_metadata'):\n",
" checkpoint_numbers.append(int(file.split('-')[1]))\n",
"last_checkpoint_path = None\n",
"last_checkpoint_number = -1\n",
"for checkpoint_file in checkpoint_files:\n",
" checkpoint_number = int(os.path.basename(checkpoint_file).split('_')[1])\n",
" if checkpoint_number > last_checkpoint_number:\n",
" last_checkpoint_path = checkpoint_file\n",
" last_checkpoint_number = checkpoint_number\n",
"\n",
"print(\"Checkpoints:\", checkpoint_numbers)\n",
"\n",
"last_checkpoint_number = max(checkpoint_numbers)\n",
"print(\"Last checkpoint number:\", last_checkpoint_number)"
"print(\"Last checkpoint number:\", last_checkpoint_number)\n",
"print(\"Last checkpoint path:\", last_checkpoint_path)"
]
},
{
@@ -546,17 +535,16 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347346085
"logged": 1683064305331
}
},
"outputs": [],
"source": [
"# Upload the checkpoint files and create a DataSet\n",
"from azureml.core import Dataset\n",
"from azureml.data.dataset_factory import FileDatasetFactory\n",
"\n",
"datastore = ws.get_default_datastore()\n",
"checkpoint_dataref = datastore.upload_files(checkpoint_files, target_path='cartpole_checkpoints_' + training_run.id, overwrite=True)\n",
"checkpoint_ds = Dataset.File.from_files(checkpoint_dataref)"
"checkpoint_ds = FileDatasetFactory.upload_directory(training_artifacts_path, (datastore, 'cartpole_checkpoints_' + training_run.id), overwrite=False, show_progress=True)"
]
},
{
@@ -571,7 +559,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347354726
"logged": 1683064305353
}
},
"outputs": [],
@@ -598,7 +586,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347414835
"logged": 1683064305371
},
"jupyter": {
"outputs_hidden": false,
@@ -614,23 +602,18 @@
"source": [
"ray_environment_name = 'cartpole-ray-ci'\n",
"\n",
"experiment_name = 'CartPole-v0-CI'\n",
"training_algorithm = 'PPO'\n",
"rl_environment = 'CartPole-v0'\n",
"experiment_name = 'CartPole-v1-CI'\n",
"\n",
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
"ray_environment = Environment.get(workspace=ws, name=ray_environment_name)\n",
"\n",
"script_name = 'cartpole_rollout.py'\n",
"script_arguments = [\n",
" '--run', training_algorithm,\n",
" '--env', rl_environment,\n",
" '--config', '{}',\n",
" '--steps', '2000',\n",
" '--checkpoint-number', str(last_checkpoint_number),\n",
" '--no-render',\n",
" '--artifacts-dataset', checkpoint_ds.as_named_input('artifacts_dataset'),\n",
" '--artifacts-path', checkpoint_ds.as_named_input('artifacts_path').as_mount()\n",
" '--checkpoint', last_checkpoint_path,\n",
" '--algo', 'PPO',\n",
" '--render', 'false',\n",
" '--dataset_path', checkpoint_ds.as_named_input('dataset_path').as_mount()\n",
"]\n",
"\n",
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
@@ -653,7 +636,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"And then, similar to the training section, we can monitor the real-time progress of the rollout run and its chid as follows. If you browse logs of the child run you can see the evaluation results recorded in driver_log.txt file. Note that you may need to wait several minutes before these results become available."
"And then, similar to the training section, we can monitor the real-time progress of the rollout run and its chid as follows. If you browse logs of the child run you can see the evaluation results recorded in std_log_process_0.txt file. Note that you may need to wait several minutes before these results become available."
]
},
{
@@ -661,7 +644,7 @@
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646347429626
"logged": 1683064305399
}
},
"outputs": [],
@@ -679,7 +662,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1683064305419
}
},
"outputs": [],
"source": [
"# Uncomment line below to cancel the run\n",
@@ -698,7 +685,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"gather": {
"logged": 1683064305437
}
},
"outputs": [],
"source": [
"# To archive the created experiment:\n",
@@ -750,13 +741,16 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.8.5"
},
"microsoft": {
"host": {
"AzureML": {
"notebookHasBeenCompleted": true
}
},
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",

View File

@@ -0,0 +1,23 @@
cartpole-ppo:
env: CartPole-v1
run: PPO
stop:
episode_reward_mean: 475
time_total_s: 300
checkpoint_config:
checkpoint_frequency: 2
checkpoint_at_end: true
config:
# Works for both torch and tf.
framework: torch
gamma: 0.99
lr: 0.0003
num_workers: 1
observation_filter: MeanStdFilter
num_sgd_iter: 6
vf_loss_coeff: 0.01
model:
fcnet_hiddens: [32]
fcnet_activation: linear
vf_share_layers: true
enable_connectors: true

View File

@@ -1,121 +1,108 @@
import os
import sys
import argparse
import ray
from ray.rllib import rollout
from ray.tune.registry import get_trainable_cls
from ray.rllib.evaluate import RolloutSaver, rollout
from ray_on_aml.core import Ray_On_AML
import ray.cloudpickle as cloudpickle
from ray.tune.utils import merge_dicts
from ray.tune.registry import get_trainable_cls, _global_registry, ENV_CREATOR
from azureml.core import Run
from utils import callbacks
import collections
import copy
import gymnasium as gym
import json
from pathlib import Path
def run_rollout(args, parser):
config = args.config
if not args.env:
if not config.get("env"):
parser.error("the following arguments are required: --env")
args.env = config.get("env")
def run_rollout(checkpoint, algo, render, steps, episodes):
config_dir = os.path.dirname(checkpoint)
config_path = os.path.join(config_dir, "params.pkl")
config = None
# Create the Trainer from config.
cls = get_trainable_cls(args.run)
agent = cls(env=args.env, config=config)
# Try parent directory.
if not os.path.exists(config_path):
config_path = os.path.join(config_dir, "../params.pkl")
# Load state from checkpoint.
agent.restore(args.checkpoint)
num_steps = int(args.steps)
num_episodes = int(args.episodes)
# Load the config from pickled.
if os.path.exists(config_path):
with open(config_path, "rb") as f:
config = cloudpickle.load(f)
# If no pkl file found, require command line `--config`.
else:
raise ValueError("Could not find params.pkl in either the checkpoint dir or its parent directory")
# Determine the video output directory.
use_arg_monitor = False
try:
args.video_dir
except AttributeError:
print("There is no such attribute: args.video_dir")
use_arg_monitor = True
# Make sure worker 0 has an Env.
config["create_env_on_driver"] = True
video_dir = None
if not use_arg_monitor:
if args.monitor:
video_dir = os.path.join("./logs", "video")
elif args.video_dir:
video_dir = os.path.expanduser(args.video_dir)
# Merge with `evaluation_config` (first try from command line, then from
# pkl file).
evaluation_config = copy.deepcopy(config.get("evaluation_config", {}))
config = merge_dicts(config, evaluation_config)
env = config.get("env")
# Make sure we have evaluation workers.
if not config.get("evaluation_num_workers"):
config["evaluation_num_workers"] = config.get("num_workers", 0)
if not config.get("evaluation_duration"):
config["evaluation_duration"] = 1
# Hard-override this as it raises a warning by Algorithm otherwise.
# Makes no sense anyways, to have it set to None as we don't call
# `Algorithm.train()` here.
config["evaluation_interval"] = 1
# Rendering settings.
config["render_env"] = render
# Create the Algorithm from config.
cls = get_trainable_cls(algo)
algorithm = cls(env=env, config=config)
# Load state from checkpoint, if provided.
if checkpoint:
algorithm.restore(checkpoint)
# Do the actual rollout.
with rollout.RolloutSaver(
args.out,
args.use_shelve,
write_update_file=args.track_progress,
target_steps=num_steps,
target_episodes=num_episodes,
save_info=args.save_info) as saver:
if use_arg_monitor:
rollout.rollout(
agent,
args.env,
num_steps,
num_episodes,
saver,
args.no_render,
args.monitor)
else:
rollout.rollout(
agent, args.env,
num_steps,
num_episodes,
saver,
args.no_render, video_dir)
with RolloutSaver(
outfile=None,
use_shelve=False,
write_update_file=False,
target_steps=steps,
target_episodes=episodes,
save_info=False,
) as saver:
rollout(algorithm, env, steps, episodes, saver, not render)
algorithm.stop()
if __name__ == "__main__":
# Start ray head (single node)
os.system('ray start --head')
ray.init(address='auto')
ray_on_aml = Ray_On_AML()
ray = ray_on_aml.getRay()
if ray:
parser = argparse.ArgumentParser()
parser.add_argument('--dataset_path', required=True, help='Path to artifacts dataset')
parser.add_argument('--checkpoint', required=True, help='Name of checkpoint file directory')
parser.add_argument('--algo', required=True, help='Name of RL algorithm')
parser.add_argument('--render', default=False, required=False, help='True to render')
parser.add_argument('--steps', required=False, type=int, help='Number of steps to run')
parser.add_argument('--episodes', required=False, type=int, help='Number of episodes to run')
args = parser.parse_args()
# Add positional argument - serves as placeholder for checkpoint
argvc = sys.argv[1:]
argvc.insert(0, 'checkpoint-placeholder')
# Get a handle to run
run = Run.get_context()
# Parse arguments
rollout_parser = rollout.create_parser()
# Get handles to the tarining artifacts dataset and mount path
dataset_path = run.input_datasets['dataset_path']
rollout_parser.add_argument(
'--checkpoint-number', required=False, type=int, default=1,
help='Checkpoint number of the checkpoint from which to roll out')
# Find checkpoint file to be evaluated
checkpoint = os.path.join(dataset_path, args.checkpoint)
print('Checkpoint:', checkpoint)
rollout_parser.add_argument(
'--artifacts-dataset', required=True,
help='The checkpoints artifacts dataset')
rollout_parser.add_argument(
'--artifacts-path', required=True,
help='The checkpoints artifacts path')
args = rollout_parser.parse_args(argvc)
# Get a handle to run
run = Run.get_context()
# Get handles to the tarining artifacts dataset and mount path
artifacts_dataset = run.input_datasets['artifacts_dataset']
artifacts_path = run.input_datasets['artifacts_path']
# Find checkpoint file to be evaluated
checkpoint_id = '-' + str(args.checkpoint_number)
checkpoint_files = list(filter(
lambda filename: filename.endswith(checkpoint_id),
artifacts_dataset.to_path()))
checkpoint_file = checkpoint_files[0]
if checkpoint_file[0] == '/':
checkpoint_file = checkpoint_file[1:]
checkpoint = os.path.join(artifacts_path, checkpoint_file)
print('Checkpoint:', checkpoint)
# Set rollout checkpoint
args.checkpoint = checkpoint
# Start rollout
run_rollout(args, rollout_parser)
# Start rollout
ray.init(address='auto')
run_rollout(checkpoint, args.algo, args.render, args.steps, args.episodes)

View File

@@ -1,32 +1,34 @@
import ray
from ray.rllib import train
from ray import tune
import os
from ray_on_aml.core import Ray_On_AML
import yaml
from ray.tune.tune import run_experiments
from utils import callbacks
import argparse
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--config', help='Path to yaml configuration file')
args = parser.parse_args()
# Parse arguments and add callbacks to config
train_parser = train.create_parser()
ray_on_aml = Ray_On_AML()
ray = ray_on_aml.getRay()
if ray: # in the headnode
ray.init(address="auto")
print("Configuring run from file: ", args.config)
experiment_config = None
with open(args.config, "r") as file:
experiment_config = yaml.safe_load(file)
args = train_parser.parse_args()
args.config["callbacks"] = {"on_train_result": callbacks.on_train_result}
# Set local_dir in each experiment configuration to ensure generated logs get picked up
# Also set monitor to ensure videos are captured
for experiment_name, experiment in experiment_config.items():
experiment["storage_path"] = "./logs"
experiment['config']['monitor'] = True
print(f'Config: {experiment_config}')
# Trace if video capturing is on
if 'monitor' in args.config and args.config['monitor']:
print("Video capturing is ON!")
# Start ray head (single node)
os.system('ray start --head')
ray.init(address='auto')
# Run training task using tune.run
tune.run(
run_or_experiment=args.run,
config=dict(args.config, env=args.env),
stop=args.stop,
checkpoint_freq=args.checkpoint_freq,
checkpoint_at_end=args.checkpoint_at_end,
local_dir=args.local_dir
)
trials = run_experiments(
experiment_config,
callbacks=[callbacks.TrialCallback()],
verbose=2
)
else:
print("in worker node")

View File

@@ -1,19 +1,27 @@
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
USER root
RUN conda install -c anaconda python=3.7
RUN pip install ray-on-aml==0.2.4 \
ray==2.4.0 \
ray[rllib]==2.4.0 \
mlflow==2.3.1 \
azureml-defaults==1.50.0 \
azureml-dataset-runtime[fuse,pandas]==1.50.0 \
azureml-contrib-reinforcementlearning==1.50.0 \
gputil==1.4.0 \
scipy==1.9.1 \
pyglet==2.0.6 \
cloudpickle==2.2.1 \
tensorflow==2.11.0 \
tensorflow-probability==0.19.0 \
torch \
tabulate==0.9.0 \
dm_tree==0.1.8 \
lz4==4.3.2 \
psutil==5.9.4 \
setproctitle==1.3.2 \
pygame==2.1.0 \
gymnasium[classic_control]==0.26.3 \
gym[classic_control]==0.26.2
RUN pip install ray-on-aml==0.1.6
RUN pip install gym[atari]==0.19.0
RUN pip install gym[accept-rom-license]==0.19.0
RUN pip install ale-py==0.7.0
RUN pip install azureml-core
RUN pip install azureml-dataset-runtime
RUN pip install ray==0.8.7
RUN pip install ray[rllib,tune,serve]==0.8.7
RUN pip install tensorflow==1.14.0
RUN pip install 'msrest<0.7.0'
RUN apt-get update
RUN apt-get install -y jq
RUN apt-get install -y rsync
# Display the exact versions we have installed
RUN pip freeze

Some files were not shown because too many files have changed in this diff Show More