diff --git a/configuration.ipynb b/configuration.ipynb index cf0b00a1..9bc1c3b4 100644 --- a/configuration.ipynb +++ b/configuration.ipynb @@ -103,7 +103,7 @@ "source": [ "import azureml.core\n", "\n", - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/README.md b/how-to-use-azureml/automated-machine-learning/README.md index ec5aa15a..ec88126b 100644 --- a/how-to-use-azureml/automated-machine-learning/README.md +++ b/how-to-use-azureml/automated-machine-learning/README.md @@ -1,8 +1,8 @@ # Table of Contents 1. [Automated ML Introduction](#introduction) -1. [Setup using Azure Notebooks](#jupyter) -1. [Setup using Azure Databricks](#databricks) +1. [Setup using Compute Instances](#jupyter) 1. [Setup using a Local Conda environment](#localconda) +1. [Setup using Azure Databricks](#databricks) 1. [Automated ML SDK Sample Notebooks](#samples) 1. [Documentation](#documentation) 1. [Running using python command](#pythoncommand) @@ -21,13 +21,13 @@ Below are the three execution environments supported by automated ML. -## Setup using Notebook VMs - Jupyter based notebooks from a Azure VM +## Setup using Compute Instances - Jupyter based notebooks from an Azure Virtual Machine 1. Open the [ML Azure portal](https://ml.azure.com) 1. Select Compute -1. Select Notebook VMs +1. Select Compute Instances 1. Click New -1. Type a name for the Vm and select a VM type +1. Type a Compute Name, select a Virtual Machine type and select a Virtual Machine size 1. 
Click Create diff --git a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml index 5759576d..501763bb 100644 --- a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml +++ b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml @@ -5,19 +5,18 @@ dependencies: - pip<=19.3.1 - nomkl - python>=3.5.2,<3.6.8 -- wheel==0.30.0 - nb_conda - matplotlib==2.1.0 - numpy>=1.16.0,<=1.16.2 - cython - urllib3<1.24 -- scipy>=1.0.0,<=1.1.0 +- scipy==1.4.1 - scikit-learn>=0.19.0,<=0.20.3 - pandas>=0.22.0,<0.23.0 - py-xgboost<=0.80 -- fbprophet==0.5 -- pytorch=1.1.0 -- cudatoolkit=9.0 +- conda-forge::fbprophet==0.5 +- pytorch::pytorch=1.4.0 +- cudatoolkit=10.1.243 - pip: # Required packages for AzureML execution, history, and data preparation. @@ -30,8 +29,3 @@ dependencies: - pytorch-transformers==1.0.0 - spacy==2.1.8 - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz - -channels: -- anaconda -- conda-forge -- pytorch diff --git a/how-to-use-azureml/automated-machine-learning/automl_env.yml b/how-to-use-azureml/automated-machine-learning/automl_env_master.yml similarity index 51% rename from how-to-use-azureml/automated-machine-learning/automl_env.yml rename to how-to-use-azureml/automated-machine-learning/automl_env_master.yml index fdddb9d2..7608ab9b 100644 --- a/how-to-use-azureml/automated-machine-learning/automl_env.yml +++ b/how-to-use-azureml/automated-machine-learning/automl_env_master.yml @@ -1,36 +1,33 @@ -name: azure_automl +name: automl_env_master dependencies: # The python interpreter version. # Currently Azure ML only supports 3.5.2 and later. 
- pip<=19.3.1 - python>=3.5.2,<3.6.8 -- wheel==0.30.0 - nb_conda - matplotlib==2.1.0 - numpy>=1.16.0,<=1.16.2 - cython - urllib3<1.24 -- scipy>=1.0.0,<=1.1.0 +- scipy==1.4.1 - scikit-learn>=0.19.0,<=0.20.3 - pandas>=0.22.0,<=0.23.4 +- testpath=0.3.1 - py-xgboost<=0.90 -- fbprophet==0.5 -- pytorch=1.1.0 -- cudatoolkit=9.0 +- conda-forge::fbprophet==0.5 +- pytorch::pytorch=1.4.0 +- cudatoolkit=10.1.243 - pip: # Required packages for AzureML execution, history, and data preparation. - - azureml-defaults + - --extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/master/588E708E0DF342C4A80BD954289657CF + - --extra-index-url https://dataprepdownloads.azureedge.net/pypi/weekly-rc-932B96D048E011E8B56608/latest/ + - azureml-defaults<0.1.50 - azureml-dataprep[pandas] - - azureml-train-automl - - azureml-train - - azureml-widgets - - azureml-pipeline + - azureml-train-automl<0.1.50 + - azureml-train<0.1.50 + - azureml-widgets<0.1.50 + - azureml-pipeline<0.1.50 - pytorch-transformers==1.0.0 - spacy==2.1.8 - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz - -channels: -- anaconda -- conda-forge -- pytorch diff --git a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb index fbf773bc..324cbffa 100644 --- a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb +++ b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb @@ -41,7 +41,7 @@ "\n", "In this example we use the UCI Bank Marketing dataset to showcase how you can use AutoML for a classification problem and deploy it to an Azure Container Instance (ACI). 
The classification goal is to predict if the client will subscribe to a term deposit with the bank.\n", "\n", - "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", + "If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", "\n", "Please find the ONNX related documentations [here](https://github.com/onnx/onnx).\n", "\n", @@ -105,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, @@ -643,7 +643,7 @@ "\n", "### Retrieve the Best Model\n", "\n", - "Below we select the best pipeline from our iterations. The `get_output` method on `automl_classifier` returns the best run and the fitted model for the last invocation. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." + "Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." 
] }, { diff --git a/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb b/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb index d8d91819..96cb9a45 100644 --- a/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb +++ b/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb @@ -42,7 +42,7 @@ "\n", "This notebook is using remote compute to train the model.\n", "\n", - "If you are using an Azure Machine Learning [Notebook VM](https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-1st-experiment-sdk-setup), you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", + "If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", "\n", "In this notebook you will learn how to:\n", "1. Create an experiment using an existing workspace.\n", @@ -93,7 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, @@ -322,7 +322,7 @@ "\n", "### Retrieve the Best Model\n", "\n", - "Below we select the best pipeline from our iterations. The `get_output` method on `automl_classifier` returns the best run and the fitted model for the last invocation. 
Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." + "Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." ] }, { diff --git a/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb b/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb index 3bc63650..13dd8cad 100644 --- a/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb +++ b/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb @@ -97,7 +97,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/classification-text-dnn/infer.py b/how-to-use-azureml/automated-machine-learning/classification-text-dnn/infer.py index e316ca06..9f0e2977 100644 --- a/how-to-use-azureml/automated-machine-learning/classification-text-dnn/infer.py +++ b/how-to-use-azureml/automated-machine-learning/classification-text-dnn/infer.py @@ -2,8 +2,7 @@ import numpy as np import argparse from azureml.core import Run from sklearn.externals import joblib -from azureml.automl.core._vendor.automl.client.core.common import metrics -from automl.client.core.common import constants +from azureml.automl.core.shared import constants, metrics from azureml.core.model import Model diff --git 
a/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb b/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb index 7bcfc6c0..222d3e88 100644 --- a/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb +++ b/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb @@ -88,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb index 708dd4bc..7848165f 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb +++ b/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/auto-ml-forecasting-beer-remote.ipynb @@ -114,7 +114,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/infer.py b/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/infer.py index 9b3a3171..40c9b371 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/infer.py +++ b/how-to-use-azureml/automated-machine-learning/forecasting-beer-remote/infer.py @@ -4,8 +4,7 @@ 
import argparse from azureml.core import Run from sklearn.externals import joblib from sklearn.metrics import mean_absolute_error, mean_squared_error -from azureml.automl.core._vendor.automl.client.core.common import metrics -from automl.client.core.common import constants +from azureml.automl.core.shared import constants, metrics from pandas.tseries.frequencies import to_offset diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb index 57f7b0e8..e1fb4412 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb +++ b/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb @@ -87,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, @@ -510,10 +510,9 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", + "from azureml.automl.core.shared import constants, metrics\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", "from matplotlib import pyplot as plt\n", - "from automl.client.core.common import constants\n", "\n", "# use automl metrics module\n", "scores = metrics.compute_metrics_regression(\n", diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/forecasting_script.py b/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/forecasting_script.py index f3fb7b89..ca951597 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/forecasting_script.py +++ 
b/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/forecasting_script.py @@ -1,6 +1,6 @@ import argparse import azureml.train.automl -from azureml.automl.runtime._vendor.automl.client.core.runtime import forecasting_models +from azureml.automl.runtime.shared import forecasting_models from azureml.core import Run from sklearn.externals import joblib import forecasting_helper diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb index a554a904..f439d348 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb +++ b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb @@ -42,7 +42,7 @@ "\n", "In this example we use the associated New York City energy demand dataset to showcase how you can use AutoML for a simple forecasting problem and explore the results. The goal is predict the energy demand for the next 48 hours based on historic time-series data.\n", "\n", - "If you are using an Azure Machine Learning [Notebook VM](https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-1st-experiment-sdk-setup), you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n", + "If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n", "\n", "In this notebook you will learn how to:\n", "1. 
Creating an Experiment using an existing Workspace\n", @@ -97,7 +97,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, @@ -507,9 +507,8 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", + "from azureml.automl.core.shared import constants, metrics\n", "from matplotlib import pyplot as plt\n", - "from automl.client.core.common import constants\n", "\n", "# use automl metrics module\n", "scores = metrics.compute_metrics_regression(\n", @@ -668,9 +667,8 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", + "from azureml.automl.core.shared import constants, metrics\n", "from matplotlib import pyplot as plt\n", - "from automl.client.core.common import constants\n", "\n", "# use automl metrics module\n", "scores = metrics.compute_metrics_regression(\n", diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/auto-ml-forecasting-function.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/auto-ml-forecasting-function.ipynb index 5df787be..1c0e43ff 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/auto-ml-forecasting-function.ipynb +++ b/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/auto-ml-forecasting-function.ipynb @@ -95,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git 
a/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb index 4ceae6ca..f4f73dd0 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb +++ b/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb @@ -82,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, @@ -355,11 +355,18 @@ "source": [ "## Train\n", "\n", - "The AutoMLConfig object defines the settings and data for an AutoML training job. Here, we set necessary inputs like the task type, the number of AutoML iterations to try, the training data, and cross-validation parameters. \n", + "The [AutoMLConfig](https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py) object defines the settings and data for an AutoML training job. Here, we set necessary inputs like the task type, the number of AutoML iterations to try, the training data, and cross-validation parameters.\n", "\n", - "For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time, the grain column names, and the maximum forecast horizon. A time column is required for forecasting, while the grain is optional. If a grain is not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. 
The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n", + "For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time, the grain column names, and the maximum forecast horizon. A time column is required for forecasting, while the grain is optional. If grain columns are not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n", + "\n", + "The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning application that estimates the next month of sales should set the horizon according to suitable planning time-scales. Please see the [energy_demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) for more discussion of forecast horizon.\n", + "\n", + "We note here that AutoML can sweep over two types of time-series models:\n", + "* Models that are trained for each series such as ARIMA and Facebook's Prophet. 
Note that these models are only available for [Enterprise Edition Workspaces](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace#upgrade).\n", + "* Models trained across multiple time-series using a regression approach.\n", + "\n", + "In the first case, AutoML loops over all time-series in your dataset and trains one model (e.g. AutoArima or Prophet, as the case may be) for each series. This can result in long runtimes to train these models if there are a lot of series in the data. One way to mitigate this problem is to fit models for different series in parallel if you have multiple compute cores available. To enable this behavior, set the `max_cores_per_iteration` parameter in your AutoMLConfig as shown in the example in the next cell. \n", "\n", - "The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning organizaion that needs to estimate the next month of sales would set the horizon accordingly. Please see the [energy_demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) for more discussion of forecast horizon.\n", "\n", "Finally, a note about the cross-validation (CV) procedure for time-series data. AutoML uses out-of-sample error estimates to select a best pipeline/model, so it is important that the CV fold splitting is done correctly. 
Time-series can violate the basic statistical assumptions of the canonical K-Fold CV strategy, so AutoML implements a [rolling origin validation](https://robjhyndman.com/hyndsight/tscv/) procedure to create CV folds for time-series data. To use this procedure, you just need to specify the desired number of CV folds in the AutoMLConfig object. It is also possible to bypass CV and use your own validation set by setting the *validation_data* parameter of AutoMLConfig.\n", "\n", @@ -381,7 +388,8 @@ "|**time_column_name**|Name of the datetime column in the input data|\n", "|**grain_column_names**|Name(s) of the columns defining individual series in the input data|\n", "|**max_horizon**|Maximum desired forecast horizon in units of time-series frequency|\n", - "|**featurization**| 'auto' / 'off' / FeaturizationConfig Indicator for whether featurization step should be done automatically or not, or whether customized featurization should be used. Setting this enables AutoML to perform featurization on the input to handle *missing data*, and to perform some common *feature extraction*.|" + "|**featurization**| 'auto' / 'off' / FeaturizationConfig Indicator for whether featurization step should be done automatically or not, or whether customized featurization should be used. Setting this enables AutoML to perform featurization on the input to handle *missing data*, and to perform some common *feature extraction*.|\n", + "|**max_cores_per_iteration**|Maximum number of cores to utilize per iteration. A value of -1 indicates all available cores should be used.|" ] }, { @@ -407,6 +415,7 @@ " featurization=featurization_config,\n", " n_cross_validations=3,\n", " verbosity=logging.INFO,\n", + " max_cores_per_iteration=-1,\n", " **time_series_settings)" ] }, @@ -536,7 +545,7 @@ "source": [ "If you are used to scikit pipelines, perhaps you expected `predict(X_test)`. However, forecasting requires a more general interface that also supplies the past target `y` values. 
Please use `forecast(X,y)` as `predict(X)` is reserved for internal purposes on forecasting models.\n", "\n", - "The [energy demand forecasting notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) demonstrates the use of the forecast function in more detail in the context of using lags and rolling window features. " + "The [forecast function notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/auto-ml-forecasting-function.ipynb) demonstrates the use of the forecast function for a variety of use cases. Also, please see the [API documentation for the forecast function](https://docs.microsoft.com/en-us/python/api/azureml-automl-runtime/azureml.automl.runtime.shared.model_wrappers.forecastingpipelinewrapper?view=azure-ml-py#forecast-x-pred--typing-union-pandas-core-frame-dataframe--nonetype----none--y-pred--typing-union-pandas-core-frame-dataframe--numpy-ndarray--nonetype----none--forecast-destination--typing-union-pandas--libs-tslibs-timestamps-timestamp--nonetype----none--ignore-data-errors--bool---false-----typing-tuple-numpy-ndarray--pandas-core-frame-dataframe-)." 
] }, { @@ -567,9 +576,8 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", + "from azureml.automl.core.shared import constants, metrics\n", "from matplotlib import pyplot as plt\n", - "from automl.client.core.common import constants\n", "\n", "# use automl metrics module\n", "scores = metrics.compute_metrics_regression(\n", diff --git a/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb b/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb index 55f69373..9750a559 100644 --- a/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb +++ b/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb @@ -42,7 +42,7 @@ "\n", "This notebook is using the local machine compute to train the model.\n", "\n", - "If you are using an Azure Machine Learning [Notebook VM](https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-1st-experiment-sdk-setup), you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", + "If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", "\n", "In this notebook you will learn how to:\n", "1. 
Create an experiment using an existing workspace.\n", @@ -95,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb index 4951f4f8..099b7539 100644 --- a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb +++ b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb @@ -40,7 +40,7 @@ "In this example we use the Hardware Performance Dataset to showcase how you can use AutoML for a simple regression problem. The Regression goal is to predict the performance of certain combinations of hardware parts.\n", "After training AutoML models for this regression data set, we show how you can compute model explanations on your remote compute using a sample explainer script.\n", "\n", - "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", + "If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", "\n", "An Enterprise workspace is required for this notebook. 
To learn more about creating an Enterprise workspace or upgrading to an Enterprise workspace from the Azure portal, please visit our [Workspace page.](https://docs.microsoft.com/azure/machine-learning/service/concept-workspace#upgrade) \n", "\n", @@ -98,7 +98,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py index b27d76b2..86595e7d 100644 --- a/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py +++ b/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/train_explainer.py @@ -10,7 +10,7 @@ from azureml.train.automl.runtime.automl_explain_utilities import AutoMLExplaine automl_setup_model_explanations, automl_check_model_if_explainable from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel from azureml.explain.model.mimic_wrapper import MimicWrapper -from automl.client.core.common.constants import MODEL_PATH +from azureml.automl.core.shared.constants import MODEL_PATH from azureml.explain.model.scoring.scoring_explainer import TreeScoringExplainer, save diff --git a/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb b/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb index 1cbf6523..a8992a59 100644 --- a/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb +++ b/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb @@ -40,7 +40,7 @@ "## Introduction\n", "In this 
example we use the Hardware Performance Dataset to showcase how you can use AutoML for a simple regression problem. The Regression goal is to predict the performance of certain combinations of hardware parts.\n", "\n", - "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", + "If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n", "\n", "In this notebook you will learn how to:\n", "1. Create an `Experiment` in an existing `Workspace`.\n", @@ -92,7 +92,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb index 3ca93022..48aa5518 100644 --- a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb +++ b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb @@ -542,7 +542,7 @@ "metadata": {}, "outputs": [], "source": [ - "from automl.client.core.common import constants\n", + "from azureml.automl.core.shared import constants\n", "conda_env_file_name = 'conda_env.yml'\n", "best_run.download_file(name=\"outputs/conda_env_v_1_0_0.yml\", output_file_path=conda_env_file_name)\n", "with open(conda_env_file_name, \"r\") as conda_file:\n", @@ -564,7 +564,7 @@ "metadata": {}, "outputs": [], 
"source": [ - "from automl.client.core.common import constants\n", + "from azureml.automl.core.shared import constants\n", "script_file_name = 'scoring_file.py'\n", "best_run.download_file(name=\"outputs/scoring_file_v_1_0_0.py\", output_file_path=script_file_name)\n", "with open(script_file_name, \"r\") as scoring_file:\n", diff --git a/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb b/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb index 6c81ead9..f26620b6 100644 --- a/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb +++ b/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb @@ -383,6 +383,8 @@ "- an inference configuration\n", "- a single column tabular dataset, where each row contains a string representing sample request data sent to the service.\n", "\n", + "Please, note that profiling is a long running operation and can take up to 25 minutes depending on the size of the dataset.\n", + "\n", "At this point we only support profiling of services that expect their request data to be a string, for example: string serialized json, text, string serialized image, etc. The content of each row of the dataset (string) will be put into the body of the HTTP request and sent to the service encapsulating the model for scoring.\n", "\n", "Below is an example of how you can construct an input dataset to profile a service which expects its incoming requests to contain serialized json. In this case we created a dataset based one hundred instances of the same request data. In real world scenarios however, we suggest that you use larger datasets with various inputs, especially if your model resource usage/behavior is input dependent." 
@@ -483,6 +485,7 @@ " cpu=1.0,\n", " memory_in_gb=0.5)\n", "\n", + "# profiling is a long running operation and may take up to 25 min\n", "profile.wait_for_completion(True)\n", "details = profile.get_details()" ] diff --git a/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb b/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb index b0374f64..8fb8a77c 100644 --- a/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb +++ b/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb @@ -86,7 +86,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can add tags and descriptions to your models. we are using `sklearn_regression_model.pkl` file in the current directory as a model with the name `sklearn_regression_model_local_adv` in the workspace.\n", + "You can add tags and descriptions to your models. We are using the `sklearn_regression_model.pkl` file in the current directory as a model with the name `sklearn_regression_model` in the workspace.\n", "\n", "Using tags, you can track useful information such as the name and version of the machine learning library used to train the model, framework, category, target customer etc. Note that tags must be alphanumeric."
] @@ -105,7 +105,7 @@ "from azureml.core.model import Model\n", "\n", "model = Model.register(model_path=\"sklearn_regression_model.pkl\",\n", - " model_name=\"sklearn_regression_model_local_adv\",\n", + " model_name=\"sklearn_regression_model\",\n", " tags={'area': \"diabetes\", 'type': \"regression\"},\n", " description=\"Ridge regression model to predict diabetes\",\n", " workspace=ws)" @@ -126,12 +126,12 @@ "source": [ "import os\n", "\n", - "source_directory = \"C:/abc\"\n", + "source_directory = \"source_directory\"\n", "\n", "os.makedirs(source_directory, exist_ok=True)\n", - "os.makedirs(\"C:/abc/x/y\", exist_ok=True)\n", - "os.makedirs(\"C:/abc/env\", exist_ok=True)\n", - "os.makedirs(\"C:/abc/dockerstep\", exist_ok=True)" + "os.makedirs(os.path.join(source_directory, \"x/y\"), exist_ok=True)\n", + "os.makedirs(os.path.join(source_directory, \"env\"), exist_ok=True)\n", + "os.makedirs(os.path.join(source_directory, \"dockerstep\"), exist_ok=True)" ] }, { @@ -147,7 +147,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%writefile C:/abc/x/y/score.py\n", + "%%writefile source_directory/x/y/score.py\n", "import os\n", "import pickle\n", "import json\n", @@ -170,7 +170,7 @@ " global name\n", " # note here, entire source directory on inference config gets added into image\n", " # bellow is the example how you can use any extra files in image\n", - " with open('./abc/extradata.json') as json_file: \n", + " with open('./source_directory/extradata.json') as json_file:\n", " data = json.load(json_file)\n", " name = data[\"people\"][0][\"name\"]\n", "\n", @@ -191,9 +191,7 @@ }, { "cell_type": "markdown", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "Please note that you must indicate azureml-defaults with verion >= 1.0.45 as a pip dependency for your environemnt. This package contains the functionality needed to host the model as a web service." 
] @@ -204,7 +202,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%writefile C:/abc/env/myenv.yml\n", + "%%writefile source_directory/env/myenv.yml\n", "name: project_environment\n", "dependencies:\n", " - python=3.6.2\n", @@ -221,7 +219,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%writefile C:/abc/extradata.json\n", + "%%writefile source_directory/extradata.json\n", "{\n", " \"people\": [\n", " {\n", @@ -255,13 +253,14 @@ "from azureml.core.model import InferenceConfig\n", "\n", "\n", - "myenv = Environment.from_conda_specification(name='myenv', file_path='env/myenv.yml')\n", + "myenv = Environment.from_conda_specification(name='myenv', file_path='myenv.yml')\n", "\n", "# explicitly set base_image to None when setting base_dockerfile\n", "myenv.docker.base_image = None\n", - "myenv.docker.base_dockerfile = \"RUN echo \\\"this is test\\\"\"\n", + "myenv.docker.base_dockerfile = \"FROM mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04\\nRUN echo \\\"this is test\\\"\"\n", + "myenv.inferencing_stack_version = \"latest\"\n", "\n", - "inference_config = InferenceConfig(source_directory=\"C:/abc\",\n", + "inference_config = InferenceConfig(source_directory=source_directory,\n", " entry_script=\"x/y/score.py\",\n", " environment=myenv)\n" ] @@ -379,7 +378,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%writefile C:/abc/x/y/score.py\n", + "%%writefile source_directory/x/y/score.py\n", "import os\n", "import pickle\n", "import json\n", @@ -401,7 +400,7 @@ " global name, from_location\n", " # note here, entire source directory on inference config gets added into image\n", " # bellow is the example how you can use any extra files in image\n", - " with open('./abc/extradata.json') as json_file: \n", + " with open('source_directory/extradata.json') as json_file: \n", " data = json.load(json_file)\n", " name = data[\"people\"][0][\"name\"]\n", " from_location = data[\"people\"][0][\"from\"]\n", diff --git 
a/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb b/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb index 0b7660a2..fd6c888f 100644 --- a/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb +++ b/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb @@ -82,7 +82,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can add tags and descriptions to your models. we are using `sklearn_regression_model.pkl` file in the current directory as a model with the name `sklearn_regression_model_local` in the workspace.\n", + "You can add tags and descriptions to your models. We are using the `sklearn_regression_model.pkl` file in the current directory as a model with the name `sklearn_regression_model` in the workspace.\n", "\n", "Using tags, you can track useful information such as the name and version of the machine learning library used to train the model, framework, category, target customer etc. Note that tags must be alphanumeric." ] @@ -100,7 +100,7 @@ "from azureml.core.model import Model\n", "\n", "model = Model.register(model_path=\"sklearn_regression_model.pkl\",\n", - " model_name=\"sklearn_regression_model_local\",\n", + " model_name=\"sklearn_regression_model\",\n", " tags={'area': \"diabetes\", 'type': \"regression\"},\n", " description=\"Ridge regression model to predict diabetes\",\n", " workspace=ws)" @@ -159,6 +159,8 @@ "- an inference configuration\n", "- a single column tabular dataset, where each row contains a string representing sample request data sent to the service.\n", "\n", + "Please note that profiling is a long-running operation and can take up to 25 minutes depending on the size of the dataset.\n", + "\n", "At this point we only support profiling of services that expect their request data to be a string, for example: string serialized json, text, string serialized image, etc.
The content of each row of the dataset (string) will be put into the body of the HTTP request and sent to the service encapsulating the model for scoring.\n", "\n", "Below is an example of how you can construct an input dataset to profile a service which expects its incoming requests to contain serialized json. In this case we created a dataset based one hundred instances of the same request data. In real world scenarios however, we suggest that you use larger datasets with various inputs, especially if your model resource usage/behavior is input dependent." @@ -245,6 +247,7 @@ " cpu=1.0,\n", " memory_in_gb=0.5)\n", "\n", + "# profiling is a long running operation and may take up to 25 min\n", "profile.wait_for_completion(True)\n", "details = profile.get_details()" ] diff --git a/how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.yml b/how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.yml index 06c455f0..7ea284e5 100644 --- a/how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.yml +++ b/how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.yml @@ -4,4 +4,4 @@ dependencies: - azureml-sdk - numpy - git+https://github.com/apple/coremltools@v2.1 - - onnxmltools==1.3.1 + - onnxmltools diff --git a/how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.yml b/how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.yml index 7a41748d..8d9a9c4b 100644 --- a/how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.yml +++ b/how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.yml @@ -6,4 +6,4 @@ dependencies: - matplotlib - numpy - onnx - - opencv-python + - opencv-python-headless diff --git a/how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.yml b/how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.yml index 614209c5..0d73085a 100644 --- 
a/how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.yml +++ b/how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.yml @@ -6,4 +6,4 @@ dependencies: - matplotlib - numpy - onnx - - opencv-python + - opencv-python-headless diff --git a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb index aed26009..d78603b2 100644 --- a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb +++ b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb @@ -59,8 +59,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Register the model\n", - "Register an existing trained model, add descirption and tags. Prior to registering the model, you should have a TensorFlow [Saved Model](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) in the `resnet50` directory. You can download a [pretrained resnet50](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NCHW_jpg.tar.gz) and unpack it to that directory." + "# Download the model\n", + "\n", + "Prior to registering the model, you should have a TensorFlow [Saved Model](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) in the `resnet50` directory. This cell will download a [pretrained resnet50](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NCHW_jpg.tar.gz) and unpack it to that directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "import shutil\n", + "import tarfile\n", + "import tempfile\n", + "\n", + "from io import BytesIO\n", + "\n", + "model_url = \"http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NCHW_jpg.tar.gz\"\n", + "\n", + "archive_prefix = \"./resnet_v1_fp32_savedmodel_NCHW_jpg/1538686758/\"\n", + "target_folder = \"resnet50\"\n", + "\n", + "if not os.path.exists(target_folder):\n", + " response = requests.get(model_url)\n", + " archive = tarfile.open(fileobj=BytesIO(response.content))\n", + " with tempfile.TemporaryDirectory() as temp_folder:\n", + " archive.extractall(temp_folder)\n", + " shutil.copytree(os.path.join(temp_folder, archive_prefix), target_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Register the model\n", + "Register an existing trained model, add description and tags." 
] }, { @@ -69,13 +105,13 @@ "metadata": {}, "outputs": [], "source": [ - "#Register the model\n", "from azureml.core.model import Model\n", - "model = Model.register(model_path = \"resnet50\", # this points to a local file\n", - " model_name = \"resnet50\", # this is the name the model is registered as\n", - " tags = {'area': \"Image classification\", 'type': \"classification\"},\n", - " description = \"Image classification trained on Imagenet Dataset\",\n", - " workspace = ws)\n", + "\n", + "model = Model.register(model_path=\"resnet50\", # This points to the local directory to upload.\n", + " model_name=\"resnet50\", # This is the name the model is registered as.\n", + " tags={'area': \"Image classification\", 'type': \"classification\"},\n", + " description=\"Image classification trained on Imagenet Dataset\",\n", + " workspace=ws)\n", "\n", "print(model.name, model.description, model.version)" ] diff --git a/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb b/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb index 5ea43c86..b9d35d30 100644 --- a/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb +++ b/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb @@ -212,6 +212,8 @@ "- an inference configuration\n", "- a single column tabular dataset, where each row contains a string representing sample request data sent to the service.\n", "\n", + "Please note that profiling is a long-running operation and can take up to 25 minutes depending on the size of the dataset.\n", + "\n", "At this point we only support profiling of services that expect their request data to be a string, for example: string serialized json, text, string serialized image, etc.
The content of each row of the dataset (string) will be put into the body of the HTTP request and sent to the service encapsulating the model for scoring.\n", "\n", "Below is an example of how you can construct an input dataset to profile a service which expects its incoming requests to contain serialized json. In this case we created a dataset based one hundred instances of the same request data. In real world scenarios however, we suggest that you use larger datasets with various inputs, especially if your model resource usage/behavior is input dependent." @@ -312,6 +314,7 @@ " cpu=1.0,\n", " memory_in_gb=0.5)\n", "\n", + "# profiling is a long running operation and may take up to 25 min\n", "profile.wait_for_completion(True)\n", "details = profile.get_details()" ] diff --git a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb index 27506f1b..7f0702b3 100644 --- a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb @@ -243,8 +243,25 @@ " 'azureml-interpret', 'sklearn-pandas', 'azureml-dataprep'\n", "]\n", "\n", + "# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n", + "# In production scenario user would choose their dependencies\n", + "import pkg_resources\n", + "available_packages = pkg_resources.working_set\n", + "sklearn_ver = None\n", + "pandas_ver = None\n", + "for dist in available_packages:\n", + " if dist.key == 'scikit-learn':\n", + " sklearn_ver = dist.version\n", + " elif dist.key == 'pandas':\n", + " pandas_ver = dist.version\n", + "sklearn_dep = 'scikit-learn'\n", + "pandas_dep = 'pandas'\n", + "if sklearn_ver:\n", + " sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n", + "if pandas_ver:\n", + 
" pandas_dep = 'pandas=={}'.format(pandas_ver)\n", "# specify CondaDependencies obj\n", - "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'],\n", + "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[sklearn_dep, pandas_dep],\n", " pip_packages=azureml_pip_packages)\n", "\n", "# Now submit a run on AmlCompute\n", @@ -344,8 +361,25 @@ " 'azureml-interpret', 'azureml-dataprep'\n", "]\n", "\n", + "# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n", + "# In production scenario user would choose their dependencies\n", + "import pkg_resources\n", + "available_packages = pkg_resources.working_set\n", + "sklearn_ver = None\n", + "pandas_ver = None\n", + "for dist in available_packages:\n", + " if dist.key == 'scikit-learn':\n", + " sklearn_ver = dist.version\n", + " elif dist.key == 'pandas':\n", + " pandas_ver = dist.version\n", + "sklearn_dep = 'scikit-learn'\n", + "pandas_dep = 'pandas'\n", + "if sklearn_ver:\n", + " sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n", + "if pandas_ver:\n", + " pandas_dep = 'pandas=={}'.format(pandas_ver)\n", "# specify CondaDependencies obj\n", - "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'],\n", + "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[sklearn_dep, pandas_dep],\n", " pip_packages=azureml_pip_packages)\n", "\n", "from azureml.core import Run\n", @@ -457,8 +491,25 @@ "\n", "\n", "\n", + "# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n", + "# In production scenario user would choose their dependencies\n", + "import pkg_resources\n", + "available_packages = pkg_resources.working_set\n", + "sklearn_ver = None\n", + "pandas_ver = None\n", + "for dist in available_packages:\n", + " if dist.key == 'scikit-learn':\n", + " sklearn_ver = dist.version\n", + 
" elif dist.key == 'pandas':\n", + " pandas_ver = dist.version\n", + "sklearn_dep = 'scikit-learn'\n", + "pandas_dep = 'pandas'\n", + "if sklearn_ver:\n", + " sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n", + "if pandas_ver:\n", + " pandas_dep = 'pandas=={}'.format(pandas_ver)\n", "# specify CondaDependencies obj\n", - "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'],\n", + "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[sklearn_dep, pandas_dep],\n", " pip_packages=azureml_pip_packages)\n", "\n", "from azureml.core import Run\n", diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-keras-locally-and-deploy.ipynb b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-keras-locally-and-deploy.ipynb index c62d55bf..651b0256 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-keras-locally-and-deploy.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-keras-locally-and-deploy.ipynb @@ -431,8 +431,25 @@ " 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n", " 'azureml-interpret'\n", "]\n", + "# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n", + "# In production scenario user would choose their dependencies\n", + "import pkg_resources\n", + "available_packages = pkg_resources.working_set\n", + "sklearn_ver = None\n", + "pandas_ver = None\n", + "for dist in available_packages:\n", + " if dist.key == 'scikit-learn':\n", + " sklearn_ver = dist.version\n", + " elif dist.key == 'pandas':\n", + " pandas_ver = dist.version\n", + "sklearn_dep = 'scikit-learn'\n", + "pandas_dep = 'pandas'\n", + "if sklearn_ver:\n", + " sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n", + "if pandas_ver:\n", + " pandas_dep = 
'pandas=={}'.format(pandas_ver)\n", "# specify CondaDependencies obj\n", - "myenv = CondaDependencies.create(conda_packages=['scikit-learn', 'pandas'],\n", + "myenv = CondaDependencies.create(conda_packages=[sklearn_dep, pandas_dep],\n", " pip_packages=['sklearn-pandas', 'pyyaml', 'tensorflow<2.0', 'keras==2.3.1'] + azureml_pip_packages)\n", "\n", "with open(\"myenv.yml\",\"w\") as f:\n", @@ -476,7 +493,7 @@ "inference_config = InferenceConfig(entry_script=\"score_local_explain_keras.py\", environment=myenv)\n", "\n", "# Use configs and models generated above\n", - "service = Model.deploy(ws, 'model-scoring-deploy-local', [scoring_explainer_model, featurize_model, keras_model], inference_config, aciconfig)\n", + "service = Model.deploy(ws, 'model-scoring-keras-deploy-local', [scoring_explainer_model, featurize_model, keras_model], inference_config, aciconfig)\n", "service.wait_for_deployment(show_output=True)" ] }, diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb index ed070d5d..fb3635fc 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb @@ -328,8 +328,25 @@ "]\n", " \n", "\n", + "# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n", + "# In production scenario user would choose their dependencies\n", + "import pkg_resources\n", + "available_packages = pkg_resources.working_set\n", + "sklearn_ver = None\n", + "pandas_ver = None\n", + "for dist in available_packages:\n", + " if dist.key == 'scikit-learn':\n", + " sklearn_ver = dist.version\n", + " elif dist.key == 'pandas':\n", + " pandas_ver = dist.version\n", + "sklearn_dep = 'scikit-learn'\n", + "pandas_dep = 
'pandas'\n", + "if sklearn_ver:\n", + " sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n", + "if pandas_ver:\n", + " pandas_dep = 'pandas=={}'.format(pandas_ver)\n", "# specify CondaDependencies obj\n", - "myenv = CondaDependencies.create(conda_packages=['scikit-learn', 'pandas'],\n", + "myenv = CondaDependencies.create(conda_packages=[sklearn_dep, pandas_dep],\n", " pip_packages=['sklearn-pandas', 'pyyaml'] + azureml_pip_packages,\n", " pin_sdk_version=False)\n", "\n", diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb index acce1822..64b4e187 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb @@ -246,8 +246,25 @@ " \n", "\n", "\n", + "# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n", + "# In production scenario user would choose their dependencies\n", + "import pkg_resources\n", + "available_packages = pkg_resources.working_set\n", + "sklearn_ver = None\n", + "pandas_ver = None\n", + "for dist in available_packages:\n", + " if dist.key == 'scikit-learn':\n", + " sklearn_ver = dist.version\n", + " elif dist.key == 'pandas':\n", + " pandas_ver = dist.version\n", + "sklearn_dep = 'scikit-learn'\n", + "pandas_dep = 'pandas'\n", + "if sklearn_ver:\n", + " sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n", + "if pandas_ver:\n", + " pandas_dep = 'pandas=={}'.format(pandas_ver)\n", "# specify CondaDependencies obj\n", - "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'],\n", + "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[sklearn_dep,
pandas_dep],\n", " pip_packages=['sklearn_pandas', 'pyyaml'] + azureml_pip_packages,\n", " pin_sdk_version=False)\n", "# Now submit a run on AmlCompute\n", @@ -397,8 +414,25 @@ "]\n", " \n", "\n", + "# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n", + "# In production scenario user would choose their dependencies\n", + "import pkg_resources\n", + "available_packages = pkg_resources.working_set\n", + "sklearn_ver = None\n", + "pandas_ver = None\n", + "for dist in available_packages:\n", + " if dist.key == 'scikit-learn':\n", + " sklearn_ver = dist.version\n", + " elif dist.key == 'pandas':\n", + " pandas_ver = dist.version\n", + "sklearn_dep = 'scikit-learn'\n", + "pandas_dep = 'pandas'\n", + "if sklearn_ver:\n", + " sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n", + "if pandas_ver:\n", + " pandas_dep = 'pandas=={}'.format(pandas_ver)\n", "# specify CondaDependencies obj\n", - "myenv = CondaDependencies.create(conda_packages=['scikit-learn', 'pandas'],\n", + "myenv = CondaDependencies.create(conda_packages=[sklearn_dep, pandas_dep],\n", " pip_packages=['sklearn-pandas', 'pyyaml'] + azureml_pip_packages,\n", " pin_sdk_version=False)\n", "\n", diff --git a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb index 1193163c..83f1bba7 100644 --- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb @@ -537,259 +537,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Deploy the model in ACI\n", - "Now we are ready to deploy the model as a web service running in Azure Container Instance 
[ACI](https://azure.microsoft.com/en-us/services/container-instances/). \n", - "### Create score.py\n", - "First, we will create a scoring script that will be invoked by the web service call. \n", - "\n", - "* Note that the scoring script must have two required functions, `init()` and `run(input_data)`. \n", - " * In `init()` function, you typically load the model into a global object. This function is executed only once when the Docker container is started. \n", - " * In `run(input_data)` function, the model is used to predict a value based on the input data. The input and output to `run` typically use JSON as serialization and de-serialization format but you are not limited to that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile score.py\n", - "import json\n", - "import numpy as np\n", - "import os\n", - "import tensorflow as tf\n", - "\n", - "def init():\n", - " global X, output, sess\n", - " tf.reset_default_graph()\n", - " # AZUREML_MODEL_DIR is an environment variable created during deployment.\n", - " # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)\n", - " # For multiple models, it points to the folder containing all deployed models (./azureml-models)\n", - " model_root = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'model')\n", - " saver = tf.train.import_meta_graph(os.path.join(model_root, 'mnist-tf.model.meta'))\n", - " X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n", - " output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n", - " \n", - " sess = tf.Session()\n", - " saver.restore(sess, os.path.join(model_root, 'mnist-tf.model'))\n", - "\n", - "def run(raw_data):\n", - " data = np.array(json.loads(raw_data)['data'])\n", - " # make prediction\n", - " out = output.eval(session=sess, feed_dict={X: data})\n", - " y_hat = np.argmax(out, axis=1)\n", - " return y_hat.tolist()" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "### Create myenv.yml\n", - "We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify packages `numpy`, `tensorflow`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import CondaDependencies\n", - "\n", - "cd = CondaDependencies.create()\n", - "cd.add_conda_package('numpy')\n", - "cd.add_tensorflow_conda_package()\n", - "cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n", - "\n", - "print(cd.serialize_to_string())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Deploy to ACI\n", - "Now we can deploy. **This cell will run for about 7-8 minutes**. Behind the scene, AzureML will build a Docker container image with the given configuration, if already not available. This image will be deployed to the ACI infrastructure and the scoring script and model will be mounted on the container. The model will then be available as a web service with an HTTP endpoint to accept REST client calls." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "from azureml.core.environment import Environment\n", - "from azureml.core.model import Model, InferenceConfig\n", - "from azureml.core.webservice import AciWebservice\n", - "\n", - "\n", - "myenv = Environment.from_conda_specification(name=\"env\", file_path=\"myenv.yml\")\n", - "inference_config = InferenceConfig(entry_script=\"score.py\", environment=myenv)\n", - "\n", - "aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n", - " memory_gb=1, \n", - " tags={'name':'mnist', 'framework': 'TensorFlow DNN'},\n", - " description='Tensorflow DNN on MNIST')\n", - "\n", - "service = Model.deploy(ws, 'tf-mnist-svc', [model], inference_config, aciconfig)\n", - "service.wait_for_deployment(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Tip: If something goes wrong with the deployment, the first thing to look at is the logs from the service by running the following command:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(service.get_logs())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is the scoring web service endpoint:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(service.scoring_uri)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed model\n", - "Let's test the deployed model. Pick 30 random samples from the test set, and send it to the web service hosted in ACI. Note here we are using the `run` API in the SDK to invoke the service. You can also make raw HTTP calls using any HTTP tool such as curl.\n", - "\n", - "After the invocation, we print the returned predictions and plot them along with the input images. 
Use red font color and inversed image (white on black) to highlight the misclassified samples. Note since the model accuracy is pretty high, you might have to run the below cell a few times before you can see a misclassified sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "# find 30 random samples from test set\n", - "n = 30\n", - "sample_indices = np.random.permutation(X_test.shape[0])[0:n]\n", - "\n", - "test_samples = json.dumps({\"data\": X_test[sample_indices].tolist()})\n", - "test_samples = bytes(test_samples, encoding='utf8')\n", - "\n", - "# predict using the deployed model\n", - "result = service.run(input_data=test_samples)\n", - "\n", - "# compare actual value vs. the predicted values:\n", - "i = 0\n", - "plt.figure(figsize = (20, 1))\n", - "\n", - "for s in sample_indices:\n", - " plt.subplot(1, n, i + 1)\n", - " plt.axhline('')\n", - " plt.axvline('')\n", - " \n", - " # use different color for misclassified sample\n", - " font_color = 'red' if y_test[s] != result[i] else 'black'\n", - " clr_map = plt.cm.gray if y_test[s] != result[i] else plt.cm.Greys\n", - " \n", - " plt.text(x=10, y=-10, s=y_hat[s], fontsize=18, color=font_color)\n", - " plt.imshow(X_test[s].reshape(28, 28), cmap=clr_map)\n", - " \n", - " i = i + 1\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also send raw HTTP request to the service." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "\n", - "# send a random row from the test set to score\n", - "random_index = np.random.randint(0, len(X_test)-1)\n", - "input_data = \"{\\\"data\\\": [\" + str(list(X_test[random_index])) + \"]}\"\n", - "\n", - "headers = {'Content-Type':'application/json'}\n", - "\n", - "resp = requests.post(service.scoring_uri, input_data, headers=headers)\n", - "\n", - "print(\"POST to url\", service.scoring_uri)\n", - "print(\"input data:\", input_data)\n", - "print(\"label:\", y_test[random_index])\n", - "print(\"prediction:\", resp.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's look at the workspace after the web service was deployed. You should see \n", - "* a registered model named 'model' and with the id 'model:1'\n", - "* an image called 'tf-mnist' and with a docker image location pointing to your workspace's Azure Container Registry (ACR) \n", - "* a webservice called 'tf-mnist' with some scoring URL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "models = ws.models\n", - "for name, model in models.items():\n", - " print(\"Model: {}, ID: {}\".format(name, model.id))\n", - " \n", - "images = ws.images\n", - "for name, image in images.items():\n", - " print(\"Image: {}, location: {}\".format(name, image.image_location))\n", - " \n", - "webservices = ws.webservices\n", - "for name, webservice in webservices.items():\n", - " print(\"Webservice: {}, scoring URI: {}\".format(name, webservice.scoring_uri))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Clean up\n", - "You can delete the ACI deployment with a simple delete API call." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "service.delete()" + "For model deployment, please refer to [Training, hyperparameter tune, and deploy with TensorFlow](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb)." ] } ], diff --git a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb index be000d26..33c5dfc7 100644 --- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb @@ -136,6 +136,7 @@ "source": [ "from azureml.core.compute import AmlCompute\n", "from azureml.core.compute import ComputeTarget\n", + "from azureml.core.compute_target import ComputeTargetException\n", "\n", "# Choose a name for your CPU cluster\n", "amlcompute_cluster_name = \"cpu-cluster\"\n", diff --git a/how-to-use-azureml/machine-learning-pipelines/parallel-run/file-dataset-image-inference-mnist.ipynb b/how-to-use-azureml/machine-learning-pipelines/parallel-run/file-dataset-image-inference-mnist.ipynb index 398a7f6f..ef0008d1 100644 --- a/how-to-use-azureml/machine-learning-pipelines/parallel-run/file-dataset-image-inference-mnist.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/parallel-run/file-dataset-image-inference-mnist.ipynb @@ -341,7 +341,7 @@ "from azureml.core import Environment\n", "from azureml.core.runconfig import CondaDependencies, DEFAULT_CPU_IMAGE\n", "\n", - "batch_conda_deps = CondaDependencies.create(pip_packages=[\"tensorflow==1.13.1\", 
\"pillow\"])\n", + "batch_conda_deps = CondaDependencies.create(pip_packages=[\"tensorflow==1.15.2\", \"pillow\"])\n", "\n", "batch_env = Environment(name=\"batch_environment\")\n", "batch_env.python.conda_dependencies = batch_conda_deps\n", diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.yml b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.yml index 4302c349..7a3343e6 100644 --- a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.yml +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.yml @@ -1,7 +1,7 @@ name: pytorch-mask-rcnn dependencies: - cython -- pytorch -c pytorch +- pytorch==1.4.0 -c pytorch - torchvision -c pytorch - pip: - azureml-sdk diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py index 32e0b8f2..87be1ab3 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py +++ b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py @@ -4,33 +4,100 @@ import numpy as np import argparse import os +import re import tensorflow as tf +import time import glob from azureml.core import Run from utils import load_data +from tensorflow.keras import Model, layers + + +# Create TF Model. +class NeuralNet(Model): + # Set layers. + def __init__(self): + super(NeuralNet, self).__init__() + # First hidden layer. + self.h1 = layers.Dense(n_h1, activation=tf.nn.relu) + # Second hidden layer. + self.h2 = layers.Dense(n_h2, activation=tf.nn.relu) + self.out = layers.Dense(n_outputs) + + # Set forward pass. 
+ def call(self, x, is_training=False): + x = self.h1(x) + x = self.h2(x) + x = self.out(x) + if not is_training: + # Apply softmax when not training. + x = tf.nn.softmax(x) + return x + + +def cross_entropy_loss(y, logits): + # Convert labels to int 64 for tf cross-entropy function. + y = tf.cast(y, tf.int64) + # Apply softmax to logits and compute cross-entropy. + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits) + # Average loss across the batch. + return tf.reduce_mean(loss) + + +# Accuracy metric. +def accuracy(y_pred, y_true): + # Predicted class is the index of highest score in prediction vector (i.e. argmax). + correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64)) + return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1) + + +# Optimization process. +def run_optimization(x, y): + # Wrap computation inside a GradientTape for automatic differentiation. + with tf.GradientTape() as g: + # Forward pass. + logits = neural_net(x, is_training=True) + # Compute loss. + loss = cross_entropy_loss(y, logits) + + # Variables to update, i.e. trainable variables. + trainable_variables = neural_net.trainable_variables + + # Compute gradients. + gradients = g.gradient(loss, trainable_variables) + + # Update W and b following gradients. 
+ optimizer.apply_gradients(zip(gradients, trainable_variables)) + print("TensorFlow version:", tf.__version__) parser = argparse.ArgumentParser() -parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') -parser.add_argument('--batch-size', type=int, dest='batch_size', default=50, help='mini batch size for training') -parser.add_argument('--first-layer-neurons', type=int, dest='n_hidden_1', default=100, +parser.add_argument('--data-folder', type=str, dest='data_folder', default='data', help='data folder mounting point') +parser.add_argument('--batch-size', type=int, dest='batch_size', default=128, help='mini batch size for training') +parser.add_argument('--first-layer-neurons', type=int, dest='n_hidden_1', default=128, help='# of neurons in the first layer') -parser.add_argument('--second-layer-neurons', type=int, dest='n_hidden_2', default=100, +parser.add_argument('--second-layer-neurons', type=int, dest='n_hidden_2', default=128, help='# of neurons in the second layer') parser.add_argument('--learning-rate', type=float, dest='learning_rate', default=0.01, help='learning rate') +parser.add_argument('--resume-from', type=str, default=None, + help='location of the model or checkpoint files from where to resume the training') args = parser.parse_args() +previous_model_location = args.resume_from +# You can also use environment variable to get the model/checkpoint files location +# previous_model_location = os.path.expandvars(os.getenv("AZUREML_DATAREFERENCE_MODEL_LOCATION", None)) + data_folder = args.data_folder print('Data folder:', data_folder) # load train and test set into numpy arrays # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster. 
X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), - recursive=True)[0], False) / 255.0 + recursive=True)[0], False) / np.float32(255.0) X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), - recursive=True)[0], False) / 255.0 + recursive=True)[0], False) / np.float32(255.0) y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), @@ -48,65 +115,76 @@ learning_rate = args.learning_rate n_epochs = 20 batch_size = args.batch_size -with tf.name_scope('network'): - # construct the DNN - X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X') - y = tf.placeholder(tf.int64, shape=(None), name='y') - h1 = tf.layers.dense(X, n_h1, activation=tf.nn.relu, name='h1') - h2 = tf.layers.dense(h1, n_h2, activation=tf.nn.relu, name='h2') - output = tf.layers.dense(h2, n_outputs, name='output') +# Build neural network model. +neural_net = NeuralNet() -with tf.name_scope('train'): - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=output) - loss = tf.reduce_mean(cross_entropy, name='loss') - optimizer = tf.train.GradientDescentOptimizer(learning_rate) - train_op = optimizer.minimize(loss) - -with tf.name_scope('eval'): - correct = tf.nn.in_top_k(output, y, 1) - acc_op = tf.reduce_mean(tf.cast(correct, tf.float32)) - -init = tf.global_variables_initializer() -saver = tf.train.Saver() +# Stochastic gradient descent optimizer. +optimizer = tf.optimizers.SGD(learning_rate) # start an Azure ML run run = Run.get_context() -with tf.Session() as sess: - init.run() - for epoch in range(n_epochs): +if previous_model_location: + # Restore variables from latest checkpoint. 
+ checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer) + checkpoint_file_path = tf.train.latest_checkpoint(previous_model_location) + checkpoint.restore(checkpoint_file_path) + checkpoint_filename = os.path.basename(checkpoint_file_path) + num_found = re.search(r'\d+', checkpoint_filename) + if num_found: + start_epoch = int(num_found.group(0)) + print("Resuming from epoch {}".format(str(start_epoch))) - # randomly shuffle training set - indices = np.random.permutation(training_set_size) - X_train = X_train[indices] - y_train = y_train[indices] +start_time = time.perf_counter() +for epoch in range(0, n_epochs): - # batch index - b_start = 0 - b_end = b_start + batch_size - for _ in range(training_set_size // batch_size): - # get a batch - X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end] + # randomly shuffle training set + indices = np.random.permutation(training_set_size) + X_train = X_train[indices] + y_train = y_train[indices] - # update batch index for the next batch - b_start = b_start + batch_size - b_end = min(b_start + batch_size, training_set_size) + # batch index + b_start = 0 + b_end = b_start + batch_size + for _ in range(training_set_size // batch_size): + # get a batch + X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end] - # train - sess.run(train_op, feed_dict={X: X_batch, y: y_batch}) - # evaluate training set - acc_train = acc_op.eval(feed_dict={X: X_batch, y: y_batch}) - # evaluate validation set - acc_val = acc_op.eval(feed_dict={X: X_test, y: y_test}) + # update batch index for the next batch + b_start = b_start + batch_size + b_end = min(b_start + batch_size, training_set_size) - # log accuracies - run.log('training_acc', np.float(acc_train)) - run.log('validation_acc', np.float(acc_val)) - print(epoch, '-- Training accuracy:', acc_train, '\b Validation accuracy:', acc_val) - y_hat = np.argmax(output.eval(feed_dict={X: X_test}), axis=1) + # train + run_optimization(X_batch, y_batch) - 
run.log('final_acc', np.float(acc_val)) + # evaluate training set + pred = neural_net(X_batch, is_training=False) + acc_train = accuracy(pred, y_batch) - os.makedirs('./outputs/model', exist_ok=True) - # files saved in the "./outputs" folder are automatically uploaded into run history - saver.save(sess, './outputs/model/mnist-tf.model') + # evaluate validation set + pred = neural_net(X_test, is_training=False) + acc_val = accuracy(pred, y_test) + + # log accuracies + run.log('training_acc', np.float(acc_train)) + run.log('validation_acc', np.float(acc_val)) + print(epoch, '-- Training accuracy:', acc_train, '\b Validation accuracy:', acc_val) + + # Save checkpoints in the "./outputs" folder so that they are automatically uploaded into run history. + checkpoint_dir = './outputs/' + checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer) + + if epoch % 2 == 0: + checkpoint.save(checkpoint_dir) + +run.log('final_acc', np.float(acc_val)) +os.makedirs('./outputs/model', exist_ok=True) + +# files saved in the "./outputs" folder are automatically uploaded into run history +# this is workaround for https://github.com/tensorflow/tensorflow/issues/33913 and will be fixed once we move to >tf2.1 +neural_net._set_inputs(X_train) +tf.saved_model.save(neural_net, './outputs/model/') + +stop_time = time.perf_counter() +training_time = (stop_time - start_time) * 1000 +print("Total time in milliseconds for training: {}".format(str(training_time))) diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb index 314161c1..5aadb46b 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb 
+++ b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb @@ -170,18 +170,19 @@ "metadata": {}, "outputs": [], "source": [ - "import urllib\n", - "data_folder = 'data'\n", + "import urllib.request\n", + "\n", + "data_folder = os.path.join(os.getcwd(), 'data')\n", "os.makedirs(data_folder, exist_ok=True)\n", "\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-images-idx3-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'train-images.gz'))\n", + " filename=os.path.join(data_folder, 'train-images-idx3-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-labels-idx1-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'train-labels.gz'))\n", + " filename=os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'test-images.gz'))\n", + " filename=os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'test-labels.gz'))" + " filename=os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'))" ] }, { @@ -209,11 +210,10 @@ "from utils import load_data\n", "\n", "# note we also shrink the intensity values (X) from 0-255 to 0-1. 
This helps the neural network converge faster.\n", - "X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / 255.0\n", - "y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)\n", - "\n", - "X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0\n", - "y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)\n", + "X_train = load_data(os.path.join(data_folder, 'train-images-idx3-ubyte.gz'), False) / np.float32(255.0)\n", + "X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / np.float32(255.0)\n", + "y_train = load_data(os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'), True).reshape(-1)\n", + "y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)\n", "\n", "count = 0\n", "sample_size = 30\n", @@ -447,9 +447,9 @@ "\n", "script_params = {\n", " '--data-folder': dataset.as_named_input('mnist').as_mount(),\n", - " '--batch-size': 50,\n", - " '--first-layer-neurons': 300,\n", - " '--second-layer-neurons': 100,\n", + " '--batch-size': 64,\n", + " '--first-layer-neurons': 256,\n", + " '--second-layer-neurons': 128,\n", " '--learning-rate': 0.01\n", "}\n", "\n", @@ -458,6 +458,7 @@ " compute_target=compute_target,\n", " entry_script='tf_mnist.py',\n", " use_gpu=True,\n", + " framework_version='2.0',\n", " pip_packages=['azureml-dataprep[pandas,fuse]'])" ] }, @@ -622,14 +623,7 @@ "metadata": {}, "outputs": [], "source": [ - "# create a model folder in the current directory\n", - "os.makedirs('./model', exist_ok=True)\n", - "\n", - "for f in run.get_file_names():\n", - " if f.startswith('outputs/model'):\n", - " output_file_path = os.path.join('./model', f.split('/')[-1])\n", - " print('Downloading from {} to {} ...'.format(f, output_file_path))\n", - " run.download_file(name=f, output_file_path=output_file_path)" + "run.download_files(prefix='outputs/model', output_directory='./model', 
append_prefix=False)" ] }, { @@ -649,22 +643,7 @@ "outputs": [], "source": [ "import tensorflow as tf\n", - "\n", - "tf.reset_default_graph()\n", - "\n", - "saver = tf.train.import_meta_graph(\"./model/mnist-tf.model.meta\")\n", - "graph = tf.get_default_graph()\n", - "\n", - "for op in graph.get_operations():\n", - " if op.name.startswith('network'):\n", - " print(op.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Feed test dataset to the persisted model to get predictions." + "imported_model = tf.saved_model.load('./model')" ] }, { @@ -673,16 +652,8 @@ "metadata": {}, "outputs": [], "source": [ - "# input tensor. this is an array of 784 elements, each representing the intensity of a pixel in the digit image.\n", - "X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n", - "# output tensor. this is an array of 10 elements, each representing the probability of predicted value of the digit.\n", - "output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n", - "\n", - "with tf.Session() as sess:\n", - " saver.restore(sess, './model/mnist-tf.model')\n", - " k = output.eval(feed_dict={X : X_test})\n", - "# get the prediction, which is the index of the element that has the largest probability value.\n", - "y_hat = np.argmax(k, axis=1)\n", + "pred =imported_model(X_test)\n", + "y_hat = np.argmax(pred, axis=1)\n", "\n", "# print the first 30 labels and predictions\n", "print('labels: \\t', y_test[:30])\n", @@ -690,10 +661,12 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Calculate the overall accuracy by comparing the predicted value against the test set." 
+ "print(\"Accuracy on the test set:\", np.average(y_hat == y_test))" ] }, { @@ -724,9 +697,9 @@ "\n", "ps = RandomParameterSampling(\n", " {\n", - " '--batch-size': choice(25, 50, 100),\n", - " '--first-layer-neurons': choice(10, 50, 200, 300, 500),\n", - " '--second-layer-neurons': choice(10, 50, 200, 500),\n", + " '--batch-size': choice(32, 64, 128),\n", + " '--first-layer-neurons': choice(16, 64, 128, 256, 512),\n", + " '--second-layer-neurons': choice(16, 64, 256, 512),\n", " '--learning-rate': loguniform(-6, -1)\n", " }\n", ")" @@ -748,7 +721,8 @@ "est = TensorFlow(source_directory=script_folder,\n", " script_params={'--data-folder': dataset.as_named_input('mnist').as_mount()},\n", " compute_target=compute_target,\n", - " entry_script='tf_mnist.py', \n", + " entry_script='tf_mnist.py',\n", + " framework_version='2.0',\n", " use_gpu=True,\n", " pip_packages=['azureml-dataprep[pandas,fuse]'])" ] @@ -928,24 +902,20 @@ "from azureml.core.model import Model\n", "\n", "def init():\n", - " global X, output, sess\n", - " tf.reset_default_graph()\n", + " global tf_model\n", " model_root = os.getenv('AZUREML_MODEL_DIR')\n", " # the name of the folder in which to look for tensorflow model files\n", " tf_model_folder = 'model'\n", - " saver = tf.train.import_meta_graph(\n", - " os.path.join(model_root, tf_model_folder, 'mnist-tf.model.meta'))\n", - " X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n", - " output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n", - "\n", - " sess = tf.Session()\n", - " saver.restore(sess, os.path.join(model_root, tf_model_folder, 'mnist-tf.model'))\n", + " \n", + " tf_model = tf.saved_model.load(os.path.join(model_root, tf_model_folder))\n", "\n", "def run(raw_data):\n", - " data = np.array(json.loads(raw_data)['data'])\n", + " data = np.array(json.loads(raw_data)['data'], dtype=np.float32)\n", + " \n", " # make prediction\n", - " out = output.eval(session=sess, feed_dict={X: data})\n", + " out = 
tf_model(data)\n", " y_hat = np.argmax(out, axis=1)\n", + "\n", " return y_hat.tolist()" ] }, @@ -967,7 +937,7 @@ "\n", "cd = CondaDependencies.create()\n", "cd.add_conda_package('numpy')\n", - "cd.add_pip_package('tensorflow==1.13.1')\n", + "cd.add_pip_package('tensorflow==2.0.0')\n", "cd.add_pip_package(\"azureml-defaults\")\n", "cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n", "\n", diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml index 3f25441b..4629e907 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml +++ b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml @@ -1,13 +1,13 @@ name: train-hyperparameter-tune-deploy-with-tensorflow dependencies: - numpy -- tensorflow==1.10.0 - matplotlib - pip: - azureml-sdk - azureml-widgets - pandas - keras + - tensorflow==2.0.0 - matplotlib - azureml-dataprep - fuse diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb index aabcacc5..30fe9e25 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb +++ 
b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb @@ -175,13 +175,13 @@ "os.makedirs(data_folder, exist_ok=True)\n", "\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-images-idx3-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'train-images.gz'))\n", + " filename=os.path.join(data_folder, 'train-images-idx3-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-labels-idx1-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'train-labels.gz'))\n", + " filename=os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'test-images.gz'))\n", + " filename=os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'test-labels.gz'))" + " filename=os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'))" ] }, { @@ -209,10 +209,10 @@ "from utils import load_data\n", "\n", "# note we also shrink the intensity values (X) from 0-255 to 0-1. 
This helps the model converge faster.\n", - "X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / 255.0\n", - "X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0\n", - "y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)\n", - "y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)\n", + "X_train = load_data(os.path.join(data_folder, 'train-images-idx3-ubyte.gz'), False) / 255.0\n", + "X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / 255.0\n", + "y_train = load_data(os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'), True).reshape(-1)\n", + "y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)\n", "\n", "# now let's show some randomly chosen images from the training set.\n", "count = 0\n", @@ -243,10 +243,10 @@ "outputs": [], "source": [ "from azureml.core.dataset import Dataset\n", - "web_paths = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',\n", - " 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',\n", - " 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',\n", - " 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'\n", + "web_paths = ['https://azureopendatastorage.blob.core.windows.net/mnist/train-images-idx3-ubyte.gz',\n", + " 'https://azureopendatastorage.blob.core.windows.net/mnist/train-labels-idx1-ubyte.gz',\n", + " 'https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',\n", + " 'https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz'\n", " ]\n", "dataset = Dataset.File.from_files(path = web_paths)" ] @@ -445,9 +445,9 @@ "# ensure latest azureml-dataprep and other required packages installed in the environment\n", "cd = CondaDependencies.create(pip_packages=['keras',\n", " 'azureml-sdk',\n", - " 'tensorflow==1.14.0',\n", + " 'tensorflow==2.0.0',\n", " 
'matplotlib',\n", - " 'azureml-dataprep[pandas,fuse]>=1.1.14'])\n", + " 'azureml-dataprep[pandas,fuse]'])\n", "\n", "env.python.conda_dependencies = cd" ] @@ -466,9 +466,9 @@ "\n", "script_params = {\n", " '--data-folder': dataset.as_named_input('mnist').as_mount(),\n", - " '--batch-size': 50,\n", - " '--first-layer-neurons': 300,\n", - " '--second-layer-neurons': 100,\n", + " '--batch-size': 64,\n", + " '--first-layer-neurons': 256,\n", + " '--second-layer-neurons': 128,\n", " '--learning-rate': 0.01\n", "}\n", "\n", @@ -476,7 +476,7 @@ " script_params=script_params,\n", " compute_target=compute_target,\n", " entry_script='tf_mnist.py', \n", - " framework_version='1.13',\n", + " framework_version='2.0',\n", " environment_definition= env)" ] }, @@ -534,9 +534,9 @@ "\n", "ps = RandomParameterSampling(\n", " {\n", - " '--batch-size': choice(25, 50, 100),\n", - " '--first-layer-neurons': choice(10, 50, 200, 300, 500),\n", - " '--second-layer-neurons': choice(10, 50, 200, 500),\n", + " '--batch-size': choice(32, 64, 128),\n", + " '--first-layer-neurons': choice(16, 64, 128, 256, 512),\n", + " '--second-layer-neurons': choice(16, 64, 256, 512),\n", " '--learning-rate': loguniform(-6, -1)\n", " }\n", ")" @@ -558,7 +558,8 @@ "est = TensorFlow(source_directory=script_folder,\n", " script_params={'--data-folder': dataset.as_named_input('mnist').as_mount()},\n", " compute_target=compute_target,\n", - " entry_script='tf_mnist.py', \n", + " entry_script='tf_mnist.py',\n", + " framework_version='2.0',\n", " environment_definition = env)" ] }, @@ -566,7 +567,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next we will define an early termnination policy. This will terminate poorly performing runs automatically, reducing wastage of resources and instead efficiently using these resources for exploring other parameter configurations. In this example, we will use the `TruncationSelectionPolicy`, truncating the bottom performing 10% runs. 
It states to check the job every 2 iterations. If the primary metric (defined later) falls in the bottom 25% range, Azure ML terminate the job. This saves us from continuing to explore hyperparameters that don't show promise of helping reach our target metric." + "Next we will define an early termination policy. This will terminate poorly performing runs automatically, reducing wastage of resources and instead efficiently using these resources for exploring other parameter configurations. In this example, we will use the `TruncationSelectionPolicy`, truncating the bottom performing 25% of runs. The policy checks the job every 2 iterations. If the primary metric (defined later) falls in the bottom 25% range, Azure ML terminates the job. This saves us from continuing to explore hyperparameters that don't show promise of helping reach our target metric." ] }, { diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.yml b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.yml index 0c1fa94c..fa635e3b 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.yml +++ b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.yml @@ -7,7 +7,7 @@ dependencies: - azureml-widgets - pandas - keras - - tensorflow==1.14.0 + - tensorflow - matplotlib - azureml-dataprep - fuse diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/tf_mnist.py b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/tf_mnist.py index 3a08708f..d4ae3425 100644 --- 
a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/tf_mnist.py +++ b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/tf_mnist.py @@ -11,15 +11,74 @@ import glob from azureml.core import Run from utils import load_data +from tensorflow.keras import Model, layers + + +# Create TF Model. +class NeuralNet(Model): + # Set layers. + def __init__(self): + super(NeuralNet, self).__init__() + # First hidden layer. + self.h1 = layers.Dense(n_h1, activation=tf.nn.relu) + # Second hidden layer. + self.h2 = layers.Dense(n_h2, activation=tf.nn.relu) + self.out = layers.Dense(n_outputs) + + # Set forward pass. + def call(self, x, is_training=False): + x = self.h1(x) + x = self.h2(x) + x = self.out(x) + if not is_training: + # Apply softmax when not training. + x = tf.nn.softmax(x) + return x + + +def cross_entropy_loss(y, logits): + # Convert labels to int 64 for tf cross-entropy function. + y = tf.cast(y, tf.int64) + # Apply softmax to logits and compute cross-entropy. + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits) + # Average loss across the batch. + return tf.reduce_mean(loss) + + +# Accuracy metric. +def accuracy(y_pred, y_true): + # Predicted class is the index of highest score in prediction vector (i.e. argmax). + correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64)) + return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1) + + +# Optimization process. +def run_optimization(x, y): + # Wrap computation inside a GradientTape for automatic differentiation. + with tf.GradientTape() as g: + # Forward pass. + logits = neural_net(x, is_training=True) + # Compute loss. + loss = cross_entropy_loss(y, logits) + + # Variables to update, i.e. trainable variables. + trainable_variables = neural_net.trainable_variables + + # Compute gradients. 
+ gradients = g.gradient(loss, trainable_variables) + + # Update W and b following gradients. + optimizer.apply_gradients(zip(gradients, trainable_variables)) + print("TensorFlow version:", tf.__version__) parser = argparse.ArgumentParser() -parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') -parser.add_argument('--batch-size', type=int, dest='batch_size', default=50, help='mini batch size for training') -parser.add_argument('--first-layer-neurons', type=int, dest='n_hidden_1', default=100, +parser.add_argument('--data-folder', type=str, dest='data_folder', default='data', help='data folder mounting point') +parser.add_argument('--batch-size', type=int, dest='batch_size', default=128, help='mini batch size for training') +parser.add_argument('--first-layer-neurons', type=int, dest='n_hidden_1', default=128, help='# of neurons in the first layer') -parser.add_argument('--second-layer-neurons', type=int, dest='n_hidden_2', default=100, +parser.add_argument('--second-layer-neurons', type=int, dest='n_hidden_2', default=128, help='# of neurons in the second layer') parser.add_argument('--learning-rate', type=float, dest='learning_rate', default=0.01, help='learning rate') parser.add_argument('--resume-from', type=str, default=None, @@ -36,9 +95,9 @@ print('Data folder:', data_folder) # load train and test set into numpy arrays # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster. 
X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), - recursive=True)[0], False) / 255.0 + recursive=True)[0], False) / np.float32(255.0) X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), - recursive=True)[0], False) / 255.0 + recursive=True)[0], False) / np.float32(255.0) y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), @@ -56,88 +115,77 @@ learning_rate = args.learning_rate n_epochs = 20 batch_size = args.batch_size -with tf.name_scope('network'): - # construct the DNN - X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X') - y = tf.placeholder(tf.int64, shape=(None), name='y') - h1 = tf.layers.dense(X, n_h1, activation=tf.nn.relu, name='h1') - h2 = tf.layers.dense(h1, n_h2, activation=tf.nn.relu, name='h2') - output = tf.layers.dense(h2, n_outputs, name='output') +# Build neural network model. +neural_net = NeuralNet() -with tf.name_scope('train'): - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=output) - loss = tf.reduce_mean(cross_entropy, name='loss') - optimizer = tf.train.GradientDescentOptimizer(learning_rate) - train_op = optimizer.minimize(loss) - -with tf.name_scope('eval'): - correct = tf.nn.in_top_k(output, y, 1) - acc_op = tf.reduce_mean(tf.cast(correct, tf.float32)) - -init = tf.global_variables_initializer() -saver = tf.train.Saver() +# Stochastic gradient descent optimizer. +optimizer = tf.optimizers.SGD(learning_rate) # start an Azure ML run run = Run.get_context() -with tf.Session() as sess: - start_time = time.perf_counter() +if previous_model_location: + # Restore variables from latest checkpoint. 
+ checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer) + checkpoint_file_path = tf.train.latest_checkpoint(previous_model_location) + checkpoint.restore(checkpoint_file_path) + checkpoint_filename = os.path.basename(checkpoint_file_path) + num_found = re.search(r'\d+', checkpoint_filename) + if num_found: + start_epoch = int(num_found.group(0)) + print("Resuming from epoch {}".format(str(start_epoch))) - start_epoch = 0 - if previous_model_location: - checkpoint_file_path = tf.train.latest_checkpoint(previous_model_location) - saver.restore(sess, checkpoint_file_path) - checkpoint_filename = os.path.basename(checkpoint_file_path) - num_found = re.search(r'\d+', checkpoint_filename) - if num_found: - start_epoch = int(num_found.group(0)) - print("Resuming from epoch {}".format(str(start_epoch))) - else: - init.run() +start_time = time.perf_counter() +for epoch in range(0, n_epochs): - for epoch in range(start_epoch, n_epochs): + # randomly shuffle training set + indices = np.random.permutation(training_set_size) + X_train = X_train[indices] + y_train = y_train[indices] - # randomly shuffle training set - indices = np.random.permutation(training_set_size) - X_train = X_train[indices] - y_train = y_train[indices] + # batch index + b_start = 0 + b_end = b_start + batch_size + for _ in range(training_set_size // batch_size): + # get a batch + X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end] - # batch index - b_start = 0 - b_end = b_start + batch_size - for _ in range(training_set_size // batch_size): - # get a batch - X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end] + # update batch index for the next batch + b_start = b_start + batch_size + b_end = min(b_start + batch_size, training_set_size) - # update batch index for the next batch - b_start = b_start + batch_size - b_end = min(b_start + batch_size, training_set_size) + # train + run_optimization(X_batch, y_batch) - # train - sess.run(train_op, feed_dict={X: 
X_batch, y: y_batch}) - # evaluate training set - acc_train = acc_op.eval(feed_dict={X: X_batch, y: y_batch}) - # evaluate validation set - acc_val = acc_op.eval(feed_dict={X: X_test, y: y_test}) + # evaluate training set + pred = neural_net(X_batch, is_training=False) + acc_train = accuracy(pred, y_batch) - time.sleep(10) + # evaluate validation set + pred = neural_net(X_test, is_training=False) + acc_val = accuracy(pred, y_test) - # log accuracies - run.log('training_acc', np.float(acc_train)) - run.log('validation_acc', np.float(acc_val)) - print(epoch, '-- Training accuracy:', acc_train, '\b Validation accuracy:', acc_val) - y_hat = np.argmax(output.eval(feed_dict={X: X_test}), axis=1) + # log accuracies + run.log('training_acc', np.float(acc_train)) + run.log('validation_acc', np.float(acc_val)) + print(epoch, '-- Training accuracy:', acc_train, '\b Validation accuracy:', acc_val) - # Save checkpoints in the "./outputs" folder so that they are automatically uploaded into run history. - if epoch % 2 == 0: - saver.save(sess, './outputs/', global_step=epoch) + # Save checkpoints in the "./outputs" folder so that they are automatically uploaded into run history. 
+ checkpoint_dir = './outputs/' + checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer) - run.log('final_acc', np.float(acc_val)) + if epoch % 2 == 0: + checkpoint.save(checkpoint_dir) + time.sleep(3) - os.makedirs('./outputs/model', exist_ok=True) - # files saved in the "./outputs" folder are automatically uploaded into run history - saver.save(sess, './outputs/model/mnist-tf.model') +run.log('final_acc', np.float(acc_val)) +os.makedirs('./outputs/model', exist_ok=True) - stop_time = time.perf_counter() - training_time = (stop_time - start_time) * 1000 - print("Total time in milliseconds for training: {}".format(str(training_time))) +# files saved in the "./outputs" folder are automatically uploaded into run history +# this is workaround for https://github.com/tensorflow/tensorflow/issues/33913 and will be fixed once we move to >tf2.1 +neural_net._set_inputs(X_train) +tf.saved_model.save(neural_net, './outputs/model/') + +stop_time = time.perf_counter() +training_time = (stop_time - start_time) * 1000 +print("Total time in milliseconds for training: {}".format(str(training_time))) diff --git a/how-to-use-azureml/monitor-models/data-drift/drift-on-aks.ipynb b/how-to-use-azureml/monitor-models/data-drift/drift-on-aks.ipynb index 9131ed12..3eed691c 100644 --- a/how-to-use-azureml/monitor-models/data-drift/drift-on-aks.ipynb +++ b/how-to-use-azureml/monitor-models/data-drift/drift-on-aks.ipynb @@ -184,11 +184,10 @@ "prov_config = AksCompute.provisioning_configuration()\n", "\n", "aks_name = 'drift-aks'\n", + "aks_target = ws.compute_targets.get(aks_name)\n", "\n", "# Create the cluster\n", - "try:\n", - " aks_target = ws.compute_targets[aks_name]\n", - "except KeyError:\n", + "if not aks_target:\n", " aks_target = ComputeTarget.create(workspace = ws,\n", " name = aks_name,\n", " provisioning_configuration = prov_config)\n", diff --git a/how-to-use-azureml/reinforcement-learning/README.md b/how-to-use-azureml/reinforcement-learning/README.md new file 
mode 100644 index 00000000..2a322a03 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/README.md @@ -0,0 +1,118 @@ + +# Azure Machine Learning - Reinforcement Learning (Public Preview) + + + +This is an introduction to the [Azure Machine Learning](https://docs.microsoft.com/en-us/azure/machine-learning/service/) Reinforcement Learning (Public Preview) using the [Ray](https://github.com/ray-project/ray/) framework. + +Using these samples, you will be able to do the following. + +1. Use an Azure Machine Learning workspace, set up virtual network and create compute clusters for running Ray. +2. Run some experiments to train a reinforcement learning agent using Ray and RLlib. + +## Contents + +| File/folder | Description | +|-------------------|--------------------------------------------| +| [README.md](README.md) | This README file. | +| [devenv_setup.ipynb](setup/devenv_setup.ipynb) | Notebook to setup development environment for Azure ML RL | +| [cartpole_ci.ipynb](cartpole-on-compute-instance/cartpole_ci.ipynb) | Notebook to train a Cartpole playing agent on an Azure ML Compute Instance | +| [cartpole_cc.ipynb](cartpole-on-single-compute/cartpole_cc.ipynb) | Notebook to train a Cartpole playing agent on an Azure ML Compute Cluster (single node) | +| [pong_rllib.ipynb](atari-on-distributed-compute/pong_rllib.ipynb) | Notebook to train Pong agent using RLlib on multiple compute targets | + +## Prerequisites + +To make use of these samples, you need the following. + +* A Microsoft Azure subscription. +* A Microsoft Azure resource group. +* An Azure Machine Learning Workspace in the resource group. Please make sure that the VM sizes `STANDARD_NC6` and `STANDARD_D2_V2` are supported in the workspace's region. +* A virtual network set up in the resource group. + * A virtual network is needed for the examples training on multiple compute targets. + * The [devenv_setup.ipynb](setup/devenv_setup.ipynb) notebook shows you how to create a virtual network. 
Alternatively, you can use an existing virtual network; make sure it is in the same region as the workspace. + * Any network security group defined on the virtual network must allow network traffic on ports used by Azure infrastructure services. This is described in more detail in the [devenv_setup.ipynb](setup/devenv_setup.ipynb) notebook. + + +## Setup + +You can run these samples in the following ways. + +* On an Azure ML Compute Instance or Notebook VM. +* On a workstation with Python and the Azure ML Python SDK installed. + +### Azure ML Compute Instance or Notebook VM +#### Update packages + + +We recommend that you update the required Python packages before you proceed. The following commands are meant to be run in a notebook cell (the leading `!` runs them as shell commands). + +```shell +# We recommend updating pip to the latest version. +!pip install --upgrade pip +# Update matplotlib for plotting charts +!pip install --upgrade matplotlib +# Update Azure Machine Learning SDK to the latest version +!pip install --upgrade azureml-sdk +# For Jupyter notebook widget used in samples +!pip install --upgrade azureml-widgets +# For Tensorboard used in samples +!pip install --upgrade azureml-tensorboard +# Install Azure Machine Learning Reinforcement Learning SDK +!pip install --upgrade azureml-contrib-reinforcementlearning +``` + +### Your own workstation +#### Install/update packages + +For a local workstation, create a Python environment and install [Azure Machine Learning SDK](https://docs.microsoft.com/en-us/python/api/overview/azure/ml/install?view=azure-ml-py) and the RL SDK. We recommend Python 3.6 or higher. + +```shell +# Activate your environment first. +# e.g., +# conda activate amlrl +# We recommend updating pip to the latest version. 
+pip install --upgrade pip +# Install/upgrade matplotlib for plotting charts +pip install --upgrade matplotlib +# Install/upgrade tensorboard used in samples +pip install --upgrade tensorboard +# Install/upgrade Azure ML SDK to the latest version +pip install --upgrade azureml-sdk +# For Jupyter notebook widget used in samples +pip install --upgrade azureml-widgets +# For Tensorboard used in samples +pip install --upgrade azureml-tensorboard +# Install Azure Machine Learning Reinforcement Learning SDK +pip install --upgrade azureml-contrib-reinforcementlearning +# To use the notebook widget, you may need to register and enable the Azure ML extensions first. +jupyter nbextension install --py --user azureml.widgets +jupyter nbextension enable --py --user azureml.widgets +``` + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +For more on SDK concepts, please refer to [notebooks](https://github.com/Azure/MachineLearningNotebooks). 
+ +**Please let us know your feedback.** + + + +![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/reinforcement-learning/README.png) \ No newline at end of file diff --git a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/pong_rllib.py b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/pong_rllib.py new file mode 100644 index 00000000..c78a19c6 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/pong_rllib.py @@ -0,0 +1,39 @@ +import ray +import ray.tune as tune +from ray.rllib import train + +import os +import sys + +from azureml.core import Run +from utils import callbacks + +DEFAULT_RAY_ADDRESS = 'localhost:6379' + +if __name__ == "__main__": + + # Parse arguments + train_parser = train.create_parser() + + args = train_parser.parse_args() + print("Algorithm config:", args.config) + + if args.ray_address is None: + args.ray_address = DEFAULT_RAY_ADDRESS + + ray.init(address=args.ray_address) + + tune.run(run_or_experiment=args.run, + config={ + "env": args.env, + "num_gpus": args.config["num_gpus"], + "num_workers": args.config["num_workers"], + "callbacks": {"on_train_result": callbacks.on_train_result}, + "sample_batch_size": 50, + "train_batch_size": 1000, + "num_sgd_iter": 2, + "num_data_loader_buffers": 2, + "model": {"dim": 42}, + }, + stop=args.stop, + local_dir='./logs') diff --git a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/utils/callbacks.py b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/utils/callbacks.py new file mode 100644 index 00000000..f34a4e8c --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/files/utils/callbacks.py @@ -0,0 +1,17 @@ +'''RLlib callbacks module: + Common callback methods to be passed to RLlib trainer. 
+''' + +from azureml.core import Run + + +def on_train_result(info): + '''Callback on train result to record metrics returned by trainer. + ''' + run = Run.get_context() + run.log( + name='episode_reward_mean', + value=info["result"]["episode_reward_mean"]) + run.log( + name='episodes_total', + value=info["result"]["episodes_total"]) diff --git a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/images/pong.gif b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/images/pong.gif new file mode 100644 index 00000000..c29cc4a3 Binary files /dev/null and b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/images/pong.gif differ diff --git a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb new file mode 100644 index 00000000..0979479b --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb @@ -0,0 +1,604 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/tutorials/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Azure ML Reinforcement Learning Sample - Pong problem\n", + "Azure ML Reinforcement Learning (Azure ML RL) is a managed service for running distributed RL (reinforcement learning) simulation and training using the Ray framework.\n", + "This example uses Ray RLlib to train a Pong playing agent on a multi-node cluster.\n", + "\n", + "## Pong problem\n", + "[Pong](https://en.wikipedia.org/wiki/Pong) is a two-dimensional sports game that simulates table tennis. The player controls an in-game paddle by moving it vertically across the left or right side of the screen. They can compete against another player controlling a second paddle on the opposing side. Players use the paddles to hit a ball back and forth." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\"Pong
Fig 1. Pong game animation (from towardsdatascience.com).
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The goal here is to train an agent to win an episode of Pong game against opponent with the score of at least 18 points. An episode in Pong runs until one of the players reaches a score of 21. Episodes are a terminology that is used across all the [OpenAI gym](https://gym.openai.com/envs/Pong-v0/) environments that contains a strictly defined task.\n", + "\n", + "Training a Pong agent is a CPU intensive task and this example demonstrates the use of Azure ML RL service to train an agent faster in a distributed, parallel environment. You'll learn more about using the head and the worker compute targets to train an agent in this notebook below." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "\n", + "The user should have completed the [Azure ML Reinforcement Learning Sample - Setting Up Development Environment](../setup/devenv_setup.ipynb) to setup a virtual network. This virtual network will be used here for head and worker compute targets. It is highly recommended that the user should go through the [Azure ML Reinforcement Learning Sample - Cartpole Problem](../cartpole-on-single-compute/cartpole_cc.ipynb) to understand the basics of Azure ML RL and Ray RLlib used in this notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up Development Environment\n", + "The following subsections show typical steps to setup your development environment. Setup includes:\n", + "\n", + "* Connecting to a workspace to enable communication between your local machine and remote resources\n", + "* Creating an experiment to track all your runs\n", + "* Creating a remote head and worker compute target on a vnet to use for training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Azure Machine Learning SDK\n", + "Display the Azure Machine Learning SDK version." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "# Azure ML core imports\n", + "import azureml.core\n", + "\n", + "# Check core SDK version number\n", + "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Azure ML workspace\n", + "Get a reference to an existing Azure ML workspace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.location, ws.resource_group, sep = ' | ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Azure ML experiment\n", + "Create an experiment to track the runs in your workspace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.experiment import Experiment\n", + "\n", + "# Experiment name\n", + "experiment_name = 'rllib-pong-multi-node'\n", + "exp = Experiment(workspace=ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specify the name of your vnet\n", + "\n", + "The resource group you use must contain a vnet. Specify the name of the vnet here created in the [Azure ML Reinforcement Learning Sample - Setting Up Development Environment](../setup/devenv_setup.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Virtual network name\n", + "vnet_name = 'your_vnet'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create head computing cluster\n", + "\n", + "In this example, we show how to set up separate compute clusters for the Ray head and Ray worker nodes. 
First we define the head cluster with GPU for the Ray head node. One CPU of the head node will be used for the Ray head process and the rest of the CPUs will be used by the Ray worker processes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import AmlCompute, ComputeTarget\n", + "\n", + "# Choose a name for the Ray head cluster\n", + "head_compute_name = 'head-gpu'\n", + "head_compute_min_nodes = 0\n", + "head_compute_max_nodes = 2\n", + "\n", + "# This example uses GPU VM. For using CPU VM, set SKU to STANDARD_D2_V2\n", + "head_vm_size = 'STANDARD_NC6'\n", + "\n", + "if head_compute_name in ws.compute_targets:\n", + " head_compute_target = ws.compute_targets[head_compute_name]\n", + " if head_compute_target and type(head_compute_target) is AmlCompute:\n", + " if head_compute_target.provisioning_state == 'Succeeded':\n", + " print('found head compute target. just use it', head_compute_name)\n", + " else: \n", + " raise Exception('found head compute target but it is in state', head_compute_target.provisioning_state)\n", + "else:\n", + " print('creating a new head compute target...')\n", + " provisioning_config = AmlCompute.provisioning_configuration(vm_size=head_vm_size,\n", + " min_nodes=head_compute_min_nodes, \n", + " max_nodes=head_compute_max_nodes,\n", + " vnet_resourcegroup_name=ws.resource_group,\n", + " vnet_name=vnet_name,\n", + " subnet_name='default')\n", + "\n", + " # Create the cluster\n", + " head_compute_target = ComputeTarget.create(ws, head_compute_name, provisioning_config)\n", + " \n", + " # Can poll for a minimum number of nodes and for a specific timeout. 
\n", + " # If no min node count is provided it will use the scale settings for the cluster\n", + " head_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", + " \n", + " # For a more detailed view of current AmlCompute status, use get_status()\n", + " print(head_compute_target.get_status().serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create worker computing cluster\n", + "\n", + "Now we create a compute cluster with CPUs for the additional Ray worker nodes. CPUs in these worker nodes are used by Ray worker processes. Each Ray worker node may have multiple Ray worker processes depending on CPUs on the worker node. Ray can distribute multiple worker tasks on each worker node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Choose a name for your Ray worker cluster\n", + "worker_compute_name = 'worker-cpu'\n", + "worker_compute_min_nodes = 0 \n", + "worker_compute_max_nodes = 4\n", + "\n", + "# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6\n", + "worker_vm_size = 'STANDARD_D2_V2'\n", + "\n", + "# Create the compute target if it hasn't been created already\n", + "if worker_compute_name in ws.compute_targets:\n", + " worker_compute_target = ws.compute_targets[worker_compute_name]\n", + " if worker_compute_target and type(worker_compute_target) is AmlCompute:\n", + " if worker_compute_target.provisioning_state == 'Succeeded':\n", + " print('found worker compute target. 
just use it', worker_compute_name)\n", + " else: \n", + " raise Exception('found worker compute target but it is in state', worker_compute_target.provisioning_state)\n", + "else:\n", + " print('creating a new worker compute target...')\n", + " provisioning_config = AmlCompute.provisioning_configuration(vm_size=worker_vm_size,\n", + " min_nodes=worker_compute_min_nodes, \n", + " max_nodes=worker_compute_max_nodes,\n", + " vnet_resourcegroup_name=ws.resource_group,\n", + " vnet_name=vnet_name,\n", + " subnet_name='default')\n", + "\n", + " # Create the cluster\n", + " worker_compute_target = ComputeTarget.create(ws, worker_compute_name, provisioning_config)\n", + " \n", + " # Can poll for a minimum number of nodes and for a specific timeout. \n", + " # If no min node count is provided it will use the scale settings for the cluster\n", + " worker_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", + " \n", + " # For a more detailed view of current AmlCompute status, use get_status()\n", + " print(worker_compute_target.get_status().serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Pong Agent Using Azure ML RL\n", + "To facilitate reinforcement learning, Azure Machine Learning Python SDK provides a high level abstraction, the _ReinforcementLearningEstimator_ class, which allows users to easily construct RL run configurations for the underlying RL framework. Azure ML RL initially supports the [Ray framework](https://ray.io/) and its highly customizable [RLLib](https://ray.readthedocs.io/en/latest/rllib.html#rllib-scalable-reinforcement-learning). In this section we show how to use _ReinforcementLearningEstimator_ and Ray/RLLib framework to train a Pong playing agent.\n", + "\n", + "\n", + "### Define worker configuration\n", + "Define a `WorkerConfiguration` using your worker compute target. 
We also specify the number of nodes in the worker compute target to be used for training and additional PIP packages to install on those nodes as a part of setup.\n", + "In this case, we define the PIP packages as dependencies for both head and worker nodes. With this setup, the game simulations will run directly on the worker compute nodes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.contrib.train.rl import WorkerConfiguration\n", + "\n", + "# Pip packages we will use for both head and worker\n", + "pip_packages=[\"ray[rllib]==0.8.3\"] # Latest version of Ray has fixes for issues related to object transfers\n", + "\n", + "# Specify the Ray worker configuration\n", + "worker_conf = WorkerConfiguration(\n", + " \n", + " # Azure ML compute cluster to run Ray workers\n", + " compute_target=worker_compute_target, \n", + " \n", + " # Number of worker nodes\n", + " node_count=4,\n", + " \n", + " # GPU\n", + " use_gpu=False, \n", + " \n", + " # PIP packages to use\n", + " pip_packages=pip_packages\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create reinforcement learning estimator\n", + "\n", + "The `ReinforcementLearningEstimator` is used to submit a job to Azure Machine Learning to start the Ray experiment run. Here we define the training script parameters that will be passed to the estimator. \n", + "\n", + "We set `episode_reward_mean` to 18 because we want to stop the training as soon as the trained agent reaches an average win margin of at least 18 points over its opponent across all episodes in the training epoch.\n", + "The number of Ray worker processes is defined by the `num_workers` parameter. We set it to 13 as we have 13 CPUs available in our compute targets. Running multiple Ray worker processes parallelizes agent training and helps achieve our goal faster. 
\n", + "\n", + "```\n", + "Number of CPUs in head_compute_target = 6 CPUs in 1 node = 6\n", + "Number of CPUs in worker_compute_target = 2 CPUs in each of 4 nodes = 8\n", + "Number of CPUs available = (Number of CPUs in head_compute_target) + (Number of CPUs in worker_compute_target) - (1 CPU for head node) = 6 + 8 - 1 = 13\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.contrib.train.rl import ReinforcementLearningEstimator, Ray\n", + "\n", + "training_algorithm = \"IMPALA\"\n", + "rl_environment = \"PongNoFrameskip-v4\"\n", + "\n", + "# Training script parameters\n", + "script_params = {\n", + " \n", + " # Training algorithm, IMPALA in this case\n", + " \"--run\": training_algorithm,\n", + " \n", + " # Environment, Pong in this case\n", + " \"--env\": rl_environment,\n", + " \n", + " # Add additional single quotes at the both ends of string values as we have spaces in the \n", + " # string parameters, outermost quotes are not passed to scripts as they are not actually part of string\n", + " # Number of GPUs\n", + " # Number of ray workers\n", + " \"--config\": '\\'{\"num_gpus\": 1, \"num_workers\": 13}\\'',\n", + " \n", + " # Target episode reward mean to stop the training\n", + " # Total training time in seconds\n", + " \"--stop\": '\\'{\"episode_reward_mean\": 18, \"time_total_s\": 3600}\\'',\n", + "}\n", + "\n", + "# RL estimator\n", + "rl_estimator = ReinforcementLearningEstimator(\n", + " \n", + " # Location of source files\n", + " source_directory='files',\n", + " \n", + " # Python script file\n", + " entry_script=\"pong_rllib.py\",\n", + " \n", + " # Parameters to pass to the script file\n", + " # Defined above.\n", + " script_params=script_params,\n", + " \n", + " # The Azure ML compute target set up for Ray head nodes\n", + " compute_target=head_compute_target,\n", + " \n", + " # Pip packages\n", + " pip_packages=pip_packages,\n", + " \n", + " # GPU usage\n", + " 
use_gpu=True,\n", + " \n", + " # RL framework. Currently must be Ray.\n", + " rl_framework=Ray(),\n", + " \n", + " # Ray worker configuration defined above.\n", + " worker_configuration=worker_conf,\n", + " \n", + " # How long to wait for whole cluster to start\n", + " cluster_coordination_timeout_seconds=3600,\n", + " \n", + " # Maximum time for the whole Ray job to run\n", + " # This will cut off the run after an hour\n", + " max_run_duration_seconds=3600,\n", + " \n", + " # Allow the docker container Ray runs in to make full use\n", + " # of the shared memory available from the host OS.\n", + " shm_size=24*1024*1024*1024\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training script\n", + "As recommended in [RLLib](https://ray.readthedocs.io/en/latest/rllib.html) documentations, we use Ray [Tune](https://ray.readthedocs.io/en/latest/tune.html) API to run training algorithm. All the RLLib built-in trainers are compatible with the Tune API. Here we use tune.run() to execute a built-in training algorithm. For convenience, down below you can see part of the entry script where we make this call.\n", + "\n", + "```python\n", + " tune.run(run_or_experiment=args.run,\n", + " config={\n", + " \"env\": args.env,\n", + " \"num_gpus\": args.config[\"num_gpus\"],\n", + " \"num_workers\": args.config[\"num_workers\"],\n", + " \"callbacks\": {\"on_train_result\": callbacks.on_train_result},\n", + " \"sample_batch_size\": 50,\n", + " \"train_batch_size\": 1000,\n", + " \"num_sgd_iter\": 2,\n", + " \"num_data_loader_buffers\": 2,\n", + " \"model\": {\"dim\": 42},\n", + " },\n", + " stop=args.stop,\n", + " local_dir='./logs')\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit the estimator to start a run\n", + "Now we use the rl_estimator configured above to submit a run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = exp.submit(config=rl_estimator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor the run\n", + "\n", + "Azure ML provides a Jupyter widget to show the real-time status of an experiment run. You could use this widget to monitor the status of runs. The widget shows the list of two child runs, one for head compute target run and one for worker compute target run, as well. You can click on the link under Status to see the details of the child run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.widgets import RunDetails\n", + "\n", + "RunDetails(run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wait for the run to complete before proceeding. If you want to stop the run, you may skip this and move to next section below. \n", + "\n", + "**Note: the run may take anywhere from 30 minutes to 45 minutes to complete.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop the run\n", + "\n", + "To cancel the run, call run.cancel()." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run.cancel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance of the agent during training\n", + "\n", + "Let's get the reward metrics for the training run agent and observe how the agent's rewards improved over the training iterations and how the agent learns to win the Pong game. \n", + "\n", + "Collect the episode reward metrics from the worker run's metrics. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get all child runs\n", + "child_runs = list(run.get_children(_rehydrate_runs=False))\n", + "\n", + "# Get the reward metrics from worker run\n", + "if child_runs[0].id.endswith(\"_worker\"):\n", + " episode_reward_mean = child_runs[0].get_metrics(name='episode_reward_mean')\n", + "else:\n", + " episode_reward_mean = child_runs[1].get_metrics(name='episode_reward_mean')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the reward metrics. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.plot(episode_reward_mean['episode_reward_mean'])\n", + "plt.xlabel('training_iteration')\n", + "plt.ylabel('episode_reward_mean')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We observe that during the training over multiple episodes, the agent learn to win the Pong game against opponent with our target of 18 points in each episode of 21 points.\n", + "**Congratulations!! You have trained your Pong agent to win a game marvelously.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up\n", + "For your convenience, below you can find code snippets to clean up any resources created as part of this tutorial that you don't wish to retain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To archive the created experiment:\n", + "#experiment.archive()\n", + "\n", + "# To delete the compute targets:\n", + "#head_compute_target.delete()\n", + "#worker_compute_target.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next\n", + "In this example, you learnt how to solve distributed RL training problems using head and worker compute targets. 
This is currently the last introductory tutorial for Azure Machine Learning service's Reinforcement Learning offering. We would love to hear your feedback to build the features you need!" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "vineetg" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.yml b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.yml new file mode 100644 index 00000000..29c57633 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.yml @@ -0,0 +1,7 @@ +name: pong_rllib +dependencies: +- pip: + - azureml-sdk + - azureml-contrib-reinforcementlearning + - azureml-widgets + - matplotlib diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb new file mode 100644 index 00000000..19bc54f2 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb @@ -0,0 +1,700 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/tutorials/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Azure ML Reinforcement Learning Sample - Cartpole Problem on Compute Instance\n", + "\n", + "Azure ML Reinforcement Learning (Azure ML RL) is a managed service for running reinforcement learning training and simulation. With Azure ML RL, data scientists can start developing RL systems on one machine, and scale to compute clusters with 100's of nodes if needed.\n", + "\n", + "This example shows how to use Azure ML RL to train a Cartpole playing agent on a compute instance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cartpole problem\n", + "\n", + "Cartpole, also known as [Inverted Pendulum](https://en.wikipedia.org/wiki/Inverted_pendulum), is a pendulum with a center of mass above its pivot point. This formation is essentially unstable and will easily fall over but can be kept balanced by applying appropriate horizontal forces to the pivot point.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Cartpole \n", + "

Fig 1. Cartpole problem schematic description (from towardsdatascience.com).

\n", + "\n", + "The goal here is to train an agent to keep the cartpole balanced by applying appropriate forces to the pivot point.\n", + "\n", + "See [this video](https://www.youtube.com/watch?v=XiigTGKZfks) for a real-world demonstration of cartpole problem." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisite\n", + "The user should have completed the Azure Machine Learning Tutorial: [Get started creating your first ML experiment with the Python SDK](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup). You will need to make sure that you have a valid subscription id, a resource group and a workspace. All datastores and datasets you use should be associated with your workspace." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up Development Environment\n", + "The following subsections show typical steps to setup your development environment. Setup includes:\n", + "\n", + "* Connecting to a workspace to enable communication between your local machine and remote resources\n", + "* Creating an experiment to track all your runs\n", + "* Using a Compute Instance as compute target" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Azure ML SDK \n", + "Display the Azure ML SDK version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Azure ML workspace\n", + "Get a reference to an existing Azure ML workspace." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.location, ws.resource_group, sep = ' | ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Compute Instance as compute target\n", + "\n", + "A compute target is a designated compute resource where you run your training and simulation scripts. This location may be your local machine or a cloud-based compute resource. For more information see [What are compute targets in Azure Machine Learning?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target)\n", + "\n", + "The code below shows how to use current compute instance as a compute target. First some helper functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os.path\n", + "\n", + "\n", + "# Get information about the currently running compute instance (notebook VM), like its name and prefix.\n", + "def load_nbvm():\n", + " if not os.path.isfile(\"/mnt/azmnt/.nbvm\"):\n", + " return None\n", + " with open(\"/mnt/azmnt/.nbvm\", 'r') as file:\n", + " return {key:value for (key, value) in [line.strip().split('=') for line in file]}\n", + "\n", + "\n", + "# Get information about the capabilities of an azureml.core.compute.AmlCompute target\n", + "# In particular how much RAM + GPU + HDD it has.\n", + "def get_compute_size(self, workspace):\n", + " for size in self.supported_vmsizes(workspace):\n", + " if(size['name'].upper() == self.vm_size):\n", + " return size\n", + "\n", + "azureml.core.compute.ComputeTarget.size = get_compute_size\n", + "del(get_compute_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we use these helper functions to get a handle to current compute instance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load current compute instance info\n", + "current_compute_instance = load_nbvm()\n", + "print(\"Current compute instance:\", current_compute_instance)\n", + "\n", + "# For this demo, let's use the current compute instance as the compute target, if available\n", + "if current_compute_instance:\n", + " instance_name = current_compute_instance['instance']\n", + "else:\n", + " instance_name = next(iter(ws.compute_targets))\n", + "\n", + "compute_target = ws.compute_targets[instance_name]\n", + "\n", + "print(\"Compute target status:\")\n", + "print(compute_target.get_status().serialize())\n", + "\n", + "print(\"Compute target size:\")\n", + "print(compute_target.size(ws))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Azure ML experiment\n", + "Create an experiment to track the runs in your workspace. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.experiment import Experiment\n", + "\n", + "experiment_name = 'CartPole-v0-CI'\n", + "exp = Experiment(workspace=ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Cartpole Agent Using Azure ML RL\n", + "To facilitate reinforcement learning, Azure Machine Learning Python SDK provides a high level abstraction, the _ReinforcementLearningEstimator_ class, which allows users to easily construct RL run configurations for the underlying RL framework. Azure ML RL initially supports the [Ray framework](https://ray.io/) and its highly customizable [RLlib](https://ray.readthedocs.io/en/latest/rllib.html#rllib-scalable-reinforcement-learning). In this section we show how to use _ReinforcementLearningEstimator_ and Ray/RLlib framework to train a cartpole playing agent. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create reinforcement learning estimator\n", + "\n", + "The code below creates an instance of *ReinforcementLearningEstimator*, `training_estimator`, which then will be used to submit a job to Azure Machine Learning to start the Ray experiment run.\n", + "\n", + "Note that this example is purposely simplified to the minimum. Here is a short description of the parameters we are passing into the constructor:\n", + "\n", + "- `source_directory`, local directory containing your training script(s) and helper modules,\n", + "- `entry_script`, path to your entry script relative to the source directory,\n", + "- `script_params`, constant parameters to be passed to each run of training script,\n", + "- `compute_target`, reference to the compute target in which the trainer and worker(s) jobs will be executed,\n", + "- `rl_framework`, the RL framework to be used (currently must be Ray).\n", + "\n", + "We use the `script_params` parameter to pass in general and algorithm-specific parameters to the training script.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.contrib.train.rl import ReinforcementLearningEstimator, Ray\n", + "\n", + "training_algorithm = \"PPO\"\n", + "rl_environment = \"CartPole-v0\"\n", + "\n", + "script_params = {\n", + "\n", + " # Training algorithm\n", + " \"--run\": training_algorithm,\n", + " \n", + " # Training environment\n", + " \"--env\": rl_environment,\n", + " \n", + " # Algorithm-specific parameters\n", + " \"--config\": '\\'{\"num_gpus\": 0, \"num_workers\": 1}\\'',\n", + " \n", + " # Stop conditions\n", + " \"--stop\": '\\'{\"episode_reward_mean\": 200, \"time_total_s\": 300}\\'',\n", + " \n", + " # Frequency of taking checkpoints\n", + " \"--checkpoint-freq\": 2,\n", + " \n", + " # If a checkpoint should be taken at the end - optional argument with no value\n", + " 
\"--checkpoint-at-end\": \"\",\n", + " \n", + " # Log directory\n", + " \"--local-dir\": './logs'\n", + "}\n", + "\n", + "training_estimator = ReinforcementLearningEstimator(\n", + "\n", + " # Location of source files\n", + " source_directory='files',\n", + " \n", + " # Python script file\n", + " entry_script='cartpole_training.py',\n", + " \n", + " # A dictionary of arguments to pass to the training script specified in ``entry_script``\n", + " script_params=script_params,\n", + " \n", + " # The Azure ML compute target set up for Ray head nodes\n", + " compute_target=compute_target,\n", + " \n", + " # RL framework. Currently must be Ray.\n", + " rl_framework=Ray()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training script\n", + "\n", + "As recommended in RLlib documentations, we use Ray Tune API to run the training algorithm. All the RLlib built-in trainers are compatible with the Tune API. Here we use `tune.run()` to execute a built-in training algorithm. For convenience, down below you can see part of the entry script where we make this call.\n", + "\n", + "This is the list of parameters we are passing into `tune.run()` via the `script_params` parameter:\n", + "\n", + "- `run_or_experiment`: name of the [built-in algorithm](https://ray.readthedocs.io/en/latest/rllib-algorithms.html#rllib-algorithms), 'PPO' in our example,\n", + "- `config`: Algorithm-specific configuration. This includes specifying the environment, `env`, which in our example is the gym **[CartPole-v0](https://gym.openai.com/envs/CartPole-v0/)** environment,\n", + "- `stop`: stopping conditions, which could be any of the metrics returned by the trainer. 
Here we use \"mean of episode reward\", and \"total training time in seconds\" as stop conditions, and\n", + "- `checkpoint_freq` and `checkpoint_at_end`: Frequency of taking checkpoints (number of training iterations between checkpoints), and if a checkpoint should be taken at the end.\n", + "\n", + "We also specify the `local_dir`, the directory in which the training logs, checkpoints and other training artifacts will be recorded. \n", + "\n", + "See [RLlib Training APIs](https://ray.readthedocs.io/en/latest/rllib-training.html#rllib-training-apis) for more details, and also [Training (tune.run, tune.Experiment)](https://ray.readthedocs.io/en/latest/tune/api_docs/execution.html#training-tune-run-tune-experiment) for the complete list of parameters.\n", + "\n", + "```python\n", + "import ray\n", + "import ray.tune as tune\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + " # parse arguments ...\n", + " \n", + " # Initialize ray\n", + " ray.init(address=args.ray_address)\n", + "\n", + " # Run training task using tune.run\n", + " tune.run(\n", + " run_or_experiment=args.run,\n", + " config=dict(args.config, env=args.env),\n", + " stop=args.stop,\n", + " checkpoint_freq=args.checkpoint_freq,\n", + " checkpoint_at_end=args.checkpoint_at_end,\n", + " local_dir=args.local_dir\n", + " )\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit the estimator to start experiment\n", + "Now we use the *training_estimator* to submit a run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_run = exp.submit(training_estimator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor experiment\n", + "Azure ML provides a Jupyter widget to show the real-time status of an experiment run. 
You could use this widget to monitor status of the runs.\n", + "\n", + "Note that _ReinforcementLearningEstimator_ creates at least two runs: (a) A parent run, i.e. the run returned above, and (b) a collection of child runs. The number of the child runs depends on the configuration of the reinforcement learning estimator. In our simple scenario, configured above, only one child run will be created.\n", + "\n", + "The widget will show a list of the child runs as well. You can click on the link under **Status** to see the details of a child run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.widgets import RunDetails\n", + "\n", + "RunDetails(training_run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop the run\n", + "\n", + "To cancel the run, call `training_run.cancel()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment line below to cancel the run\n", + "# training_run.cancel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wait for completion\n", + "Wait for the run to complete before proceeding.\n", + "\n", + "**Note: The run may take a few minutes to complete.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_run.wait_for_completion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get a handle to the child run\n", + "You can obtain a handle to the child run as follows. In our scenario, there is only one child run, we have it called `child_run_0`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "child_run_0 = None\n", + "timeout = 30\n", + "while timeout > 0 and not child_run_0:\n", + " child_runs = list(training_run.get_children())\n", + " print('Number of child runs:', len(child_runs))\n", + " if len(child_runs) > 0:\n", + " child_run_0 = child_runs[0]\n", + " break\n", + " time.sleep(2) # Wait for 2 seconds\n", + " timeout -= 2\n", + "\n", + "print('Child run info:')\n", + "print(child_run_0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate Trained Agent and See Results\n", + "\n", + "We can evaluate a previously trained policy using the `rollout.py` helper script provided by RLlib (see [Evaluating Trained Policies](https://ray.readthedocs.io/en/latest/rllib-training.html#evaluating-trained-policies) for more details). Here we use an adaptation of this script to reconstruct a policy from a checkpoint taken and saved during training. We took these checkpoints by setting `checkpoint-freq` and `checkpoint-at-end` parameters above.\n", + "\n", + "In this section we show how to get access to these checkpoints data, and then how to use them to evaluate the trained policy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a dataset of training artifacts\n", + "To evaluate a trained policy (a checkpoint) we need to make the checkpoint accessible to the rollout script. All the training artifacts are stored in workspace default datastore under **azureml/<run_id>** directory.\n", + "\n", + "Here we create a file dataset from the stored artifacts, and then use this dataset to feed these data to rollout estimator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Dataset\n", + "\n", + "run_id = child_run_0.id # Or set to run id of a completed run (e.g. 
'rl-cartpole-v0_1587572312_06e04ace_head')\n", + "run_artifacts_path = os.path.join('azureml', run_id)\n", + "print(\"Run artifacts path:\", run_artifacts_path)\n", + "\n", + "# Create a file dataset object from the files stored on default datastore\n", + "datastore = ws.get_default_datastore()\n", + "training_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To verify, we can print out the number (and paths) of all the files in the dataset, as follows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "artifacts_paths = training_artifacts_ds.to_path()\n", + "print(\"Number of files in dataset:\", len(artifacts_paths))\n", + "\n", + "# Uncomment line below to print all file paths\n", + "#print(\"Artifacts dataset file paths: \", artifacts_paths)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate a trained policy\n", + "We need to configure another reinforcement learning estimator, `rollout_estimator`, and then use it to submit another run. Note that the entry script for this estimator now points to the `cartpole_rollout.py` script.\n", + "Also note how we pass the checkpoints dataset to this script using the `inputs` parameter of the _ReinforcementLearningEstimator_.\n", + "\n", + "We are using script parameters to pass in the same algorithm and the same environment used during training. We also specify the checkpoint number of the checkpoint we wish to evaluate, `checkpoint-number`, and the number of steps for which to run the rollout, `steps`.\n", + "\n", + "The checkpoints dataset will be accessible to the rollout script as a mounted folder. The mounted folder and the checkpoint number, passed in via `checkpoint-number`, will be used to create a path to the checkpoint we are going to evaluate. 
The created checkpoint path then will be passed into RLlib rollout script for evaluation.\n", + "\n", + "Let's find the checkpoints and the last checkpoint number first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find checkpoints and last checkpoint number\n", + "from os import path\n", + "checkpoint_files = [\n", + " os.path.basename(file) for file in training_artifacts_ds.to_path() \\\n", + " if os.path.basename(file).startswith('checkpoint-') and \\\n", + " not os.path.basename(file).endswith('tune_metadata')\n", + "]\n", + "\n", + "checkpoint_numbers = []\n", + "for file in checkpoint_files:\n", + " checkpoint_numbers.append(int(file.split('-')[1]))\n", + "\n", + "print(\"Checkpoints:\", checkpoint_numbers)\n", + "\n", + "last_checkpoint_number = max(checkpoint_numbers)\n", + "print(\"Last checkpoint number:\", last_checkpoint_number)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's configure rollout estimator. Note that we use the last checkpoint for evaluation. The assumption is that the last checkpoint points to our best trained agent. You may change this to any of the checkpoint numbers printed above and observe the effect." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "script_params = { \n", + " # Checkpoint number of the checkpoint from which to roll out\n", + " \"--checkpoint-number\": last_checkpoint_number,\n", + "\n", + " # Training algorithm\n", + " \"--run\": training_algorithm,\n", + " \n", + " # Training environment\n", + " \"--env\": rl_environment,\n", + " \n", + " # Algorithm-specific parameters\n", + " \"--config\": '{}',\n", + " \n", + " # Number of rollout steps \n", + " \"--steps\": 2000,\n", + " \n", + " # Whether to suppress rendering of the environment\n", + " \"--no-render\": \"\"\n", + "}\n", + "\n", + "rollout_estimator = ReinforcementLearningEstimator(\n", + " # Location of source files\n", + " source_directory='files',\n", + " \n", + " # Python script file\n", + " entry_script='cartpole_rollout.py',\n", + " \n", + " # A dictionary of arguments to pass to the rollout script specified in ``entry_script``\n", + " script_params = script_params,\n", + " \n", + " # Data inputs\n", + " inputs=[\n", + " training_artifacts_ds.as_named_input('artifacts_dataset'),\n", + " training_artifacts_ds.as_named_input('artifacts_path').as_mount()],\n", + " \n", + " # The Azure ML compute target\n", + " compute_target=compute_target,\n", + " \n", + " # RL framework. Currently must be Ray.\n", + " rl_framework=Ray(),\n", + " \n", + " # Additional pip packages to install\n", + " pip_packages = ['azureml-dataprep[fuse,pandas]'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Same as before, we use the *rollout_estimator* to submit a run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rollout_run = exp.submit(rollout_estimator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then, similar to the training section, we can monitor the real-time progress of the rollout run and its child as follows. 
If you browse logs of the child run you can see the evaluation results recorded in driver_log.txt file. Note that you may need to wait several minutes before these results become available." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.widgets import RunDetails\n", + "\n", + "RunDetails(rollout_run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wait for completion of the rollout run, or you may cancel the run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment line below to cancel the run\n", + "#rollout_run.cancel()\n", + "rollout_run.wait_for_completion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up\n", + "For your convenience, below you can find code snippets to clean up any resources created as part of this tutorial that you don't wish to retain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To archive the created experiment:\n", + "#exp.archive()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next\n", + "This example was about running Azure ML RL (Ray/RLlib Framework) on compute instance. 
Please see [Cartpole problem](../cartpole-on-single-compute/cartpole_cc.ipynb)\n", + "example which uses Ray RLlib to train a Cartpole playing agent on a single node remote compute.\n" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "adrosa" + }, + { + "name": "hoazari" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + }, + "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.yml b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.yml new file mode 100644 index 00000000..c5a2ed39 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.yml @@ -0,0 +1,6 @@ +name: cartpole_ci +dependencies: +- pip: + - azureml-sdk + - azureml-contrib-reinforcementlearning + - azureml-widgets diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/cartpole_rollout.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/cartpole_rollout.py new file mode 100644 index 00000000..4a951c72 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/cartpole_rollout.py @@ -0,0 +1,119 @@ +import argparse +import os +import sys + +import ray +from ray.rllib import rollout +from ray.tune.registry import get_trainable_cls + +from azureml.core import Run + +from utils import callbacks + + +DEFAULT_RAY_ADDRESS = 'localhost:6379' + + +def run_rollout(args, parser, ray_address): + + config = 
args.config + if not args.env: + if not config.get("env"): + parser.error("the following arguments are required: --env") + args.env = config.get("env") + + ray.init(address=ray_address) + + # Create the Trainer from config. + cls = get_trainable_cls(args.run) + agent = cls(env=args.env, config=config) + + # Load state from checkpoint. + agent.restore(args.checkpoint) + num_steps = int(args.steps) + num_episodes = int(args.episodes) + + # Determine the video output directory. + use_arg_monitor = False + try: + args.video_dir + except AttributeError: + print("There is no such attribute: args.video_dir") + use_arg_monitor = True + + video_dir = None + if not use_arg_monitor: + if args.monitor: + video_dir = os.path.join("./logs", "video") + elif args.video_dir: + video_dir = os.path.expanduser(args.video_dir) + + # Do the actual rollout. + with rollout.RolloutSaver( + args.out, + args.use_shelve, + write_update_file=args.track_progress, + target_steps=num_steps, + target_episodes=num_episodes, + save_info=args.save_info) as saver: + if use_arg_monitor: + rollout.rollout( + agent, + args.env, + num_steps, + num_episodes, + saver, + args.no_render, + args.monitor) + else: + rollout.rollout( + agent, args.env, + num_steps, + num_episodes, + saver, + args.no_render, video_dir) + + +if __name__ == "__main__": + + # Add positional argument - serves as placeholder for checkpoint + argvc = sys.argv[1:] + argvc.insert(0, 'checkpoint-placeholder') + + # Parse arguments + rollout_parser = rollout.create_parser() + + rollout_parser.add_argument( + '--checkpoint-number', required=False, type=int, default=1, + help='Checkpoint number of the checkpoint from which to roll out') + + rollout_parser.add_argument( + '--ray-address', required=False, default=DEFAULT_RAY_ADDRESS, + help='The address of the Ray cluster to connect to') + + args = rollout_parser.parse_args(argvc) + + # Get a handle to run + run = Run.get_context() + + # Get handles to the tarining artifacts dataset and mount 
path + artifacts_dataset = run.input_datasets['artifacts_dataset'] + artifacts_path = run.input_datasets['artifacts_path'] + + # Find checkpoint file to be evaluated + checkpoint_id = '-' + str(args.checkpoint_number) + checkpoint_files = list(filter( + lambda filename: filename.endswith(checkpoint_id), + artifacts_dataset.to_path())) + + checkpoint_file = checkpoint_files[0] + if checkpoint_file[0] == '/': + checkpoint_file = checkpoint_file[1:] + checkpoint = os.path.join(artifacts_path, checkpoint_file) + print('Checkpoint:', checkpoint) + + # Set rollout checkpoint + args.checkpoint = checkpoint + + # Start rollout + run_rollout(args, rollout_parser, args.ray_address) diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/cartpole_training.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/cartpole_training.py new file mode 100644 index 00000000..ae926273 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/cartpole_training.py @@ -0,0 +1,41 @@ +import argparse +import os +import sys + +import ray +from ray.rllib import train +from ray import tune + +from utils import callbacks + + +DEFAULT_RAY_ADDRESS = 'localhost:6379' + + +if __name__ == "__main__": + + # Parse arguments and add callbacks to config + train_parser = train.create_parser() + + args = train_parser.parse_args() + args.config["callbacks"] = {"on_train_result": callbacks.on_train_result} + + # Trace if video capturing is on + if 'monitor' in args.config and args.config['monitor']: + print("Video capturing is ON!") + + # Start (connect to) Ray cluster + if args.ray_address is None: + args.ray_address = DEFAULT_RAY_ADDRESS + + ray.init(address=args.ray_address) + + # Run training task using tune.run + tune.run( + run_or_experiment=args.run, + config=dict(args.config, env=args.env), + stop=args.stop, + checkpoint_freq=args.checkpoint_freq, + checkpoint_at_end=args.checkpoint_at_end, + 
local_dir=args.local_dir + ) diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/callbacks.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/callbacks.py new file mode 100644 index 00000000..f34a4e8c --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/callbacks.py @@ -0,0 +1,17 @@ +'''RLlib callbacks module: + Common callback methods to be passed to RLlib trainer. +''' + +from azureml.core import Run + + +def on_train_result(info): + '''Callback on train result to record metrics returned by trainer. + ''' + run = Run.get_context() + run.log( + name='episode_reward_mean', + value=info["result"]["episode_reward_mean"]) + run.log( + name='episodes_total', + value=info["result"]["episodes_total"]) diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/misc.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/misc.py new file mode 100644 index 00000000..f123324e --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/files/utils/misc.py @@ -0,0 +1,13 @@ +'''Misc module: + Miscellaneous helper functions and utilities. 
+''' + +import os +import glob + + +# Helper function to find a file or folder path +def find_path(name, path_prefix): + for root, _, _ in os.walk(path_prefix): + if glob.glob(os.path.join(root, name)): + return root diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/images/cartpole.png b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/images/cartpole.png new file mode 100644 index 00000000..f37c084e Binary files /dev/null and b/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/images/cartpole.png differ diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.ipynb b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.ipynb new file mode 100644 index 00000000..15fb9aa2 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.ipynb @@ -0,0 +1,1031 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/tutorials/how-to-use-azureml/reinforcement-learning/cartpole_on_single_compute/cartpole_cc.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Azure ML Reinforcement Learning Sample - Cartpole Problem\n", + "\n", + "Azure ML Reinforcement Learning (Azure ML RL) is a managed service for running reinforcement learning training and simulation. With Azure ML RL, data scientists can start developing RL systems on one machine, and scale to compute clusters with 100's of nodes if needed.\n", + "\n", + "This example shows how to use Azure ML RL to train a Cartpole playing agent on a single machine. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cartpole problem\n", + "\n", + "Cartpole, also known as [Inverted Pendulum](https://en.wikipedia.org/wiki/Inverted_pendulum), is a pendulum with a center of mass above its pivot point. This formation is essentially unstable and will easily fall over but can be kept balanced by applying appropriate horizontal forces to the pivot point.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Cartpole \n", + "

Fig 1. Cartpole problem schematic description (from towardsdatascience.com).

\n", + "\n", + "The goal here is to train an agent to keep the cartpole balanced by applying appropriate forces to the pivot point.\n", + "\n", + "See [this video](https://www.youtube.com/watch?v=XiigTGKZfks) for a real-world demonstration of cartpole problem." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisite\n", + "The user should have completed the Azure Machine Learning Tutorial: [Get started creating your first ML experiment with the Python SDK](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup). You will need to make sure that you have a valid subscription id, a resource group and a workspace. All datastores and datasets you use should be associated with your workspace." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up Development Environment\n", + "The following subsections show typical steps to setup your development environment. Setup includes:\n", + "\n", + "* Connecting to a workspace to enable communication between your local machine and remote resources\n", + "* Creating an experiment to track all your runs\n", + "* Creating a remote compute target to use for training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Azure ML SDK \n", + "Display the Azure ML SDK version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "\n", + "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Azure ML workspace\n", + "Get a reference to an existing Azure ML workspace." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.location, ws.resource_group, sep = ' | ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a new compute resource or attach an existing one\n", + "\n", + "A compute target is a designated compute resource where you run your training and simulation scripts. This location may be your local machine or a cloud-based compute resource. The code below shows how to create a cloud-based compute target. For more information see [What are compute targets in Azure Machine Learning?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target)\n", + "\n", + "**Note: Creation of a compute resource can take several minutes**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import AmlCompute, ComputeTarget\n", + "import os\n", + "\n", + "# Choose a name and maximum size for your cluster\n", + "compute_name = \"cpu-cluster-d2\"\n", + "compute_min_nodes = 0\n", + "compute_max_nodes = 4\n", + "vm_size = \"STANDARD_D2_V2\"\n", + "\n", + "if compute_name in ws.compute_targets:\n", + " print(\"Found an existing compute target of name: \" + compute_name)\n", + " compute_target = ws.compute_targets[compute_name]\n", + " # Note: you may want to make sure compute_target is of type AmlCompute \n", + "else:\n", + " print(\"Creating new compute target...\")\n", + " provisioning_config = AmlCompute.provisioning_configuration(\n", + " vm_size=vm_size,\n", + " min_nodes=compute_min_nodes, \n", + " max_nodes=compute_max_nodes)\n", + " \n", + " # Create the cluster\n", + " compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)\n", + " compute_target.wait_for_completion(show_output=True, min_node_count=None, 
timeout_in_minutes=20)\n", + "\n", + "print(compute_target.get_status().serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Azure ML experiment\n", + "Create an experiment to track the runs in your workspace. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.experiment import Experiment\n", + "\n", + "experiment_name = 'CartPole-v0-CC'\n", + "exp = Experiment(workspace=ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Cartpole Agent Using Azure ML RL\n", + "To facilitate reinforcement learning, Azure Machine Learning Python SDK provides a high level abstraction, the _ReinforcementLearningEstimator_ class, which allows users to easily construct RL run configurations for the underlying RL framework. Azure ML RL initially supports the [Ray framework](https://ray.io/) and its highly customizable [RLlib](https://ray.readthedocs.io/en/latest/rllib.html#rllib-scalable-reinforcement-learning). In this section we show how to use _ReinforcementLearningEstimator_ and Ray/RLlib framework to train a cartpole playing agent. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create reinforcement learning estimator\n", + "\n", + "The code below creates an instance of *ReinforcementLearningEstimator*, `training_estimator`, which then will be used to submit a job to Azure Machine Learning to start the Ray experiment run.\n", + "\n", + "Note that this example is purposely simplified to the minimum. 
Here is a short description of the parameters we are passing into the constructor:\n", + "\n", + "- `source_directory`, local directory containing your training script(s) and helper modules,\n", + "- `entry_script`, path to your entry script relative to the source directory,\n", + "- `script_params`, constant parameters to be passed to each run of training script,\n", + "- `compute_target`, reference to the compute target in which the trainer and worker(s) jobs will be executed,\n", + "- `rl_framework`, the RL framework to be used (currently must be Ray).\n", + "\n", + "We use the `script_params` parameter to pass in general and algorithm-specific parameters to the training script.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.contrib.train.rl import ReinforcementLearningEstimator, Ray\n", + "from azureml.core.environment import Environment\n", + "\n", + "training_algorithm = \"PPO\"\n", + "rl_environment = \"CartPole-v0\"\n", + "video_capture = True\n", + "\n", + "if video_capture:\n", + " algorithm_config = '\\'{\"num_gpus\": 0, \"num_workers\": 1, \"monitor\": true}\\''\n", + "else:\n", + " algorithm_config = '\\'{\"num_gpus\": 0, \"num_workers\": 1, \"monitor\": false}\\''\n", + "\n", + "script_params = {\n", + "\n", + " # Training algorithm\n", + " \"--run\": training_algorithm,\n", + " \n", + " # Training environment\n", + " \"--env\": rl_environment,\n", + " \n", + " # Algorithm-specific parameters\n", + " \"--config\": algorithm_config,\n", + " \n", + " # Stop conditions\n", + " \"--stop\": '\\'{\"episode_reward_mean\": 200, \"time_total_s\": 300}\\'',\n", + " \n", + " # Frequency of taking checkpoints\n", + " \"--checkpoint-freq\": 2,\n", + " \n", + " # If a checkpoint should be taken at the end - optional argument with no value\n", + " \"--checkpoint-at-end\": \"\",\n", + " \n", + " # Log directory\n", + " \"--local-dir\": './logs'\n", + "}\n", + "\n", + "xvfb_env = 
None\n", + "if video_capture:\n", + " # Ray's video capture support requires to run everything under a headless display driver called (xvfb).\n", + " # There are two parts to this:\n", + " # 1. Use a custom docker file with proper instructions to install xvfb, ffmpeg, python-opengl\n", + " # and other dependencies. \n", + " # TODO: Add these instructions to default rl base image and drop this docker file.\n", + " \n", + " with open(\"files/docker/Dockerfile\", \"r\") as f:\n", + " dockerfile=f.read()\n", + "\n", + " xvfb_env = Environment(name='xvfb-vdisplay')\n", + " xvfb_env.docker.base_image = None\n", + " xvfb_env.docker.base_dockerfile = dockerfile\n", + " \n", + " # 2. Execute the Python process via the xvfb-run command to set up the headless display driver.\n", + " xvfb_env.python.user_managed_dependencies = True\n", + " xvfb_env.python.interpreter_path = \"xvfb-run -s '-screen 0 640x480x16 -ac +extension GLX +render' python\"\n", + "\n", + "\n", + "training_estimator = ReinforcementLearningEstimator(\n", + "\n", + " # Location of source files\n", + " source_directory='files',\n", + " \n", + " # Python script file\n", + " entry_script='cartpole_training.py',\n", + " \n", + " # A dictionary of arguments to pass to the training script specified in ``entry_script``\n", + " script_params=script_params,\n", + " \n", + " # The Azure ML compute target set up for Ray head nodes\n", + " compute_target=compute_target,\n", + " \n", + " # RL framework. Currently must be Ray.\n", + " rl_framework=Ray(),\n", + " \n", + " # Custom environment for Xvfb\n", + " environment=xvfb_env\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training script\n", + "\n", + "As recommended in the RLlib documentation, we use Ray Tune API to run the training algorithm. All the RLlib built-in trainers are compatible with the Tune API. Here we use `tune.run()` to execute a built-in training algorithm. 
For convenience, down below you can see part of the entry script where we make this call.\n", + "\n", + "This is the list of parameters we are passing into `tune.run()` via the `script_params` parameter:\n", + "\n", + "- `run_or_experiment`: name of the [built-in algorithm](https://ray.readthedocs.io/en/latest/rllib-algorithms.html#rllib-algorithms), 'PPO' in our example,\n", + "- `config`: Algorithm-specific configuration. This includes specifying the environment, `env`, which in our example is the gym **[CartPole-v0](https://gym.openai.com/envs/CartPole-v0/)** environment,\n", + "- `stop`: stopping conditions, which could be any of the metrics returned by the trainer. Here we use \"mean of episode reward\", and \"total training time in seconds\" as stop conditions, and\n", + "- `checkpoint_freq` and `checkpoint_at_end`: Frequency of taking checkpoints (number of training iterations between checkpoints), and if a checkpoint should be taken at the end.\n", + "\n", + "We also specify the `local_dir`, the directory in which the training logs, checkpoints and other training artifacts will be recorded. 
\n", + "\n", + "See [RLlib Training APIs](https://ray.readthedocs.io/en/latest/rllib-training.html#rllib-training-apis) for more details, and also [Training (tune.run, tune.Experiment)](https://ray.readthedocs.io/en/latest/tune/api_docs/execution.html#training-tune-run-tune-experiment) for the complete list of parameters.\n", + "\n", + "```python\n", + "import ray\n", + "import ray.tune as tune\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + " # parse arguments ...\n", + " \n", + " # Initialize ray\n", + " ray.init(address=args.ray_address)\n", + "\n", + " # Run training task using tune.run\n", + " tune.run(\n", + " run_or_experiment=args.run,\n", + " config=dict(args.config, env=args.env),\n", + " stop=args.stop,\n", + " checkpoint_freq=args.checkpoint_freq,\n", + " checkpoint_at_end=args.checkpoint_at_end,\n", + " local_dir=args.local_dir\n", + " )\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit the estimator to start experiment\n", + "Now we use the *training_estimator* to submit a run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_run = exp.submit(training_estimator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor experiment\n", + "\n", + "Azure ML provides a Jupyter widget to show the real-time status of an experiment run. You could use this widget to monitor status of the runs.\n", + "\n", + "Note that _ReinforcementLearningEstimator_ creates at least two runs: (a) A parent run, i.e. the run returned above, and (b) a collection of child runs. The number of the child runs depends on the configuration of the reinforcement learning estimator. In our simple scenario, configured above, only one child run will be created.\n", + "\n", + "The widget will show a list of the child runs as well. You can click on the link under **Status** to see the details of a child run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.widgets import RunDetails\n", + "\n", + "RunDetails(training_run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop the run\n", + "To cancel the run, call `training_run.cancel()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment line below to cancel the run\n", + "#training_run.cancel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wait for completion\n", + "Wait for the run to complete before proceeding.\n", + "\n", + "**Note: The length of the run depends on the provisioning time of the compute target and may take several minutes to complete.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_run.wait_for_completion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get a handle to the child run\n", + "You can obtain a handle to the child run as follows. In our scenario, there is only one child run, we have it called `child_run_0`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "child_run_0 = None\n", + "timeout = 30\n", + "while timeout > 0 and not child_run_0:\n", + " child_runs = list(training_run.get_children())\n", + " print('Number of child runs:', len(child_runs))\n", + " if len(child_runs) > 0:\n", + " child_run_0 = child_runs[0]\n", + " break\n", + " time.sleep(2) # Wait for 2 seconds\n", + " timeout -= 2\n", + "\n", + "print('Child run info:')\n", + "print(child_run_0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get access to training artifacts\n", + "We can simply use run id to get a handle to an in-progress or a previously concluded run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Run\n", + "\n", + "run_id = child_run_0.id # Or set to run id of a completed run (e.g. 'rl-cartpole-v0_1587572312_06e04ace_head')\n", + "child_run_0 = Run(exp, run_id=run_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use the Run API to download policy training artifacts (saved model and checkpoints) to local compute." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from os import path\n", + "from distutils import dir_util\n", + "\n", + "path_prefix = path.join(\"logs\", training_algorithm)\n", + "print(\"Path prefix:\", path_prefix)\n", + "\n", + "if path.exists(path_prefix):\n", + " dir_util.remove_tree(path_prefix)\n", + "\n", + "# Uncomment line below to download run artifacts to local compute\n", + "#child_run_0.download_files(path_prefix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a dataset of training artifacts\n", + "To evaluate a trained policy (a checkpoint) we need to make the checkpoint accessible to the rollout script. All the training artifacts are stored in workspace default datastore under **azureml/<run_id>** directory.\n", + "\n", + "Here we create a file dataset from the stored artifacts, and then use this dataset to feed these data to rollout estimator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Dataset\n", + "\n", + "run_id = child_run_0.id # Or set to run id of a completed run (e.g. 
'rl-cartpole-v0_1587572312_06e04ace_head')\n", + "run_artifacts_path = os.path.join('azureml', run_id)\n", + "print(\"Run artifacts path:\", run_artifacts_path)\n", + "\n", + "# Create a file dataset object from the files stored on default datastore\n", + "datastore = ws.get_default_datastore()\n", + "training_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To verify, we can print out the number (and paths) of all the files in the dataset, as follows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "artifacts_paths = training_artifacts_ds.to_path()\n", + "print(\"Number of files in dataset:\", len(artifacts_paths))\n", + "\n", + "# Uncomment line below to print all file paths\n", + "#print(\"Artifacts dataset file paths: \", artifacts_paths)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Display movies of selected training episodes\n", + "\n", + "Ray creates video output of selected training episodes in mp4 format. Here we will display two of these, i.e. the first and the last recorded videos, so you could see the improvement of the agent after training.\n", + "\n", + "First we introduce a few helper functions: a function to download the movies from our dataset, another one to find mp4 movies in a local directory, and one more to display a downloaded movie." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from os import path\n", + "from distutils import dir_util\n", + "import shutil\n", + "from files.utils import misc\n", + "\n", + "# A helper function to download (copy) movies from a dataset to local directory\n", + "def download_movies(artifacts_ds, movies, destination):\n", + " # Create the local destination directory \n", + " if path.exists(destination):\n", + " dir_util.remove_tree(destination)\n", + " dir_util.mkpath(destination)\n", + " \n", + " try:\n", + " # Mount dataset and copy movies\n", + " # Note: We assume movie paths start with '\\'\n", + " mount_context = artifacts_ds.mount()\n", + " mount_context.start()\n", + " print('Download started.')\n", + " for movie in movies:\n", + " print('Copying {} ...'.format(movie))\n", + " shutil.copy2(path.join(mount_context.mount_point, movie[1:]), destination)\n", + " mount_context.stop()\n", + " except:\n", + " print(\"Mounting error! 
Downloading all artifacts ...\")\n", + " artifacts_ds.download(target_path=destination, overwrite=True)\n", + " \n", + " print('Downloading movies completed!')\n", + "\n", + "\n", + "# A helper function to find movies in a directory\n", + "def find_movies(movie_path):\n", + " print(\"Looking in path:\", movie_path)\n", + " mp4_files = []\n", + " for root, _, files in os.walk(movie_path):\n", + " for file in files:\n", + " if file.endswith('.mp4'):\n", + " mp4_files.append(path.join(root, file))\n", + " print('Found {} movies'.format(len(mp4_files)))\n", + "\n", + " return mp4_files\n", + "\n", + "\n", + "# A helper function to display a movie\n", + "from IPython.core.display import display, HTML\n", + "def display_movie(movie_file):\n", + " display(\n", + " HTML('\\\n", + " '.format(movie_file)\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's find the first and the last recorded videos in training artifacts dataset and download them to a local directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find first and last movie\n", + "mp4_files = [file for file in training_artifacts_ds.to_path() if file.endswith('.mp4')]\n", + "mp4_files.sort()\n", + "\n", + "first_movie = mp4_files[0] if len(mp4_files) > 0 else None\n", + "last_movie = mp4_files[-1] if len(mp4_files) > 1 else None\n", + "\n", + "print(\"First movie:\", first_movie)\n", + "print(\"Last movie:\", last_movie)\n", + "\n", + "# Download movies\n", + "training_movies_path = \"training\"\n", + "download_movies(training_artifacts_ds, [first_movie, last_movie], training_movies_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look for the downloaded movies in the local directory and sort them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mp4_files = find_movies(training_movies_path)\n", + "mp4_files.sort()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Display a movie of the first training episode. This is how the agent performs with no training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "first_movie = mp4_files[0] if len(mp4_files) > 0 else None\n", + "print(\"First movie:\", first_movie)\n", + "\n", + "display_movie(first_movie)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Display a movie of the last training episode. This is how a fully-trained agent performs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "last_movie = mp4_files[-1] if len(mp4_files) > 0 else None\n", + "print(\"Last movie:\", last_movie)\n", + "\n", + "display_movie(last_movie)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate Trained Agent and See Results\n", + "\n", + "We can evaluate a previously trained policy using the `rollout.py` helper script provided by RLlib (see [Evaluating Trained Policies](https://ray.readthedocs.io/en/latest/rllib-training.html#evaluating-trained-policies) for more details). Here we use an adaptation of this script to reconstruct a policy from a checkpoint taken and saved during training. We took these checkpoints by setting `checkpoint-freq` and `checkpoint-at-end` parameters above.\n", + "In this section we show how to use these checkpoints to evaluate the trained policy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate a trained policy\n", + "We need to configure another reinforcement learning estimator, `rollout_estimator`, and then use it to submit another run. 
Note that the entry script for this estimator now points to the `cartpole_rollout.py` script.\n", + "Also note how we pass the checkpoints dataset to this script using the `inputs` parameter of the _ReinforcementLearningEstimator_.\n", + "\n", + "We are using script parameters to pass in the same algorithm and the same environment used during training. We also specify the checkpoint number of the checkpoint we wish to evaluate, `checkpoint-number`, and the number of steps for which we shall run the rollout, `steps`.\n", + "\n", + "The training artifacts dataset will be accessible to the rollout script as a mounted folder. The mounted folder and the checkpoint number, passed in via `checkpoint-number`, will be used to create a path to the checkpoint we are going to evaluate. The created checkpoint path then will be passed into RLlib rollout script for evaluation.\n", + "\n", + "Let's find the checkpoints and the last checkpoint number first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find checkpoints and last checkpoint number\n", + "from os import path\n", + "checkpoint_files = [\n", + " os.path.basename(file) for file in training_artifacts_ds.to_path() \\\n", + " if os.path.basename(file).startswith('checkpoint-') and \\\n", + " not os.path.basename(file).endswith('tune_metadata')\n", + "]\n", + "\n", + "checkpoint_numbers = []\n", + "for file in checkpoint_files:\n", + " checkpoint_numbers.append(int(file.split('-')[1]))\n", + "\n", + "print(\"Checkpoints:\", checkpoint_numbers)\n", + "\n", + "last_checkpoint_number = max(checkpoint_numbers)\n", + "print(\"Last checkpoint number:\", last_checkpoint_number)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's configure rollout estimator. Note that we use the last checkpoint for evaluation. The assumption is that the last checkpoint points to our best trained agent. 
You may change this to any of the checkpoint numbers printed above and observe the effect." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "script_params = { \n", + " # Checkpoint number of the checkpoint from which to roll out\n", + " \"--checkpoint-number\": last_checkpoint_number,\n", + "\n", + " # Training algorithm\n", + " \"--run\": training_algorithm,\n", + " \n", + " # Training environment\n", + " \"--env\": rl_environment,\n", + " \n", + " # Algorithm-specific parameters\n", + " \"--config\": '{}',\n", + " \n", + " # Number of rollout steps \n", + " \"--steps\": 2000,\n", + " \n", + " # Whether to suppress rendering of the environment\n", + " \"--no-render\": \"\",\n", + " \n", + " # The place where recorded videos will be stored\n", + " \"--video-dir\": \"./logs/video\"\n", + "}\n", + "\n", + "if video_capture:\n", + " script_params.pop(\"--no-render\")\n", + "else:\n", + " script_params.pop(\"--video-dir\")\n", + "\n", + "\n", + "# Ray's video capture support requires running everything under a headless display driver (xvfb).\n", + "# There are two parts to this:\n", + "\n", + "# 1. Use a custom docker file with proper instructions to install xvfb, ffmpeg, python-opengl\n", + "# and other dependencies.\n", + "# Note: Even when the rendering is off python-opengl is needed.\n", + "# TODO: Add these instructions to default rl base image and drop this docker file.\n", + "\n", + "with open(\"files/docker/Dockerfile\", \"r\") as f:\n", + " dockerfile=f.read()\n", + "\n", + "xvfb_env = Environment(name='xvfb-vdisplay')\n", + "xvfb_env.docker.base_image = None\n", + "xvfb_env.docker.base_dockerfile = dockerfile\n", + " \n", + "# 2. 
Execute the Python process via the xvfb-run command to set up the headless display driver.\n", + "xvfb_env.python.user_managed_dependencies = True\n", + "if video_capture:\n", + " xvfb_env.python.interpreter_path = \"xvfb-run -s '-screen 0 640x480x16 -ac +extension GLX +render' python\"\n", + "\n", + "\n", + "rollout_estimator = ReinforcementLearningEstimator(\n", + " # Location of source files\n", + " source_directory='files',\n", + " \n", + " # Python script file\n", + " entry_script='cartpole_rollout.py',\n", + " \n", + " # A dictionary of arguments to pass to the rollout script specified in ``entry_script``\n", + " script_params = script_params,\n", + " \n", + " # Data inputs\n", + " inputs=[\n", + " training_artifacts_ds.as_named_input('artifacts_dataset'),\n", + " training_artifacts_ds.as_named_input('artifacts_path').as_mount()],\n", + " \n", + " # The Azure ML compute target set up for Ray head nodes\n", + " compute_target=compute_target,\n", + " \n", + " # RL framework. Currently must be Ray.\n", + " rl_framework=Ray(),\n", + " \n", + " # Custom environment for Xvfb\n", + " environment=xvfb_env)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Same as before, we use the *rollout_estimator* to submit a run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rollout_run = exp.submit(rollout_estimator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then, similar to the training section, we can monitor the real-time progress of the rollout run and its child run as follows. If you browse logs of the child run you can see the evaluation results recorded in the driver_log.txt file. Note that you may need to wait several minutes before these results become available."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.widgets import RunDetails\n", + "\n", + "RunDetails(rollout_run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wait for completion of the rollout run before moving to the next section, or you may cancel the run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment line below to cancel the run\n", + "#rollout_run.cancel()\n", + "rollout_run.wait_for_completion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Display movies of selected rollout episodes\n", + "\n", + "To display the recorded movies, we first download the recorded videos to the local machine. Here again we create a dataset of rollout artifacts and use the helper functions introduced above to download and display the rollout videos." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Dataset\n", + "\n", + "# Get a handle to child run\n", + "child_runs = list(rollout_run.get_children())\n", + "print('Number of child runs:', len(child_runs))\n", + "child_run_0 = child_runs[0]\n", + "\n", + "run_id = child_run_0.id # Or set to run id of a completed run (e.g. 
'rl-cartpole-v0_1587572312_06e04ace_head')\n", + "run_artifacts_path = os.path.join('azureml', run_id)\n", + "print(\"Run artifacts path:\", run_artifacts_path)\n", + "\n", + "# Create a file dataset object from the files stored on default datastore\n", + "datastore = ws.get_default_datastore()\n", + "rollout_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))\n", + "\n", + "artifacts_paths = rollout_artifacts_ds.to_path()\n", + "print(\"Number of files in dataset:\", len(artifacts_paths))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, similar to the training section, we look for the last video." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find last movie\n", + "mp4_files = [file for file in rollout_artifacts_ds.to_path() if file.endswith('.mp4')]\n", + "mp4_files.sort()\n", + "\n", + "last_movie = mp4_files[-1] if len(mp4_files) > 1 else None\n", + "print(\"Last movie:\", last_movie)\n", + "\n", + "# Download last movie\n", + "rollout_movies_path = \"rollout\"\n", + "download_movies(rollout_artifacts_ds, [last_movie], rollout_movies_path)\n", + "\n", + "# Look for the downloaded movie in local directory\n", + "mp4_files = find_movies(rollout_movies_path)\n", + "mp4_files.sort()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Display last video recorded during the rollout." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "last_movie = mp4_files[-1] if len(mp4_files) > 0 else None\n", + "print(\"Last movie:\", last_movie)\n", + "\n", + "display_movie(last_movie)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up\n", + "For your convenience, below you can find code snippets to clean up any resources created as part of this tutorial that you don't wish to retain." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from os import path\n", + "from distutils import dir_util\n", + "\n", + "# To archive the created experiment:\n", + "#exp.archive()\n", + "\n", + "# To delete the compute target:\n", + "#compute_target.delete()\n", + "\n", + "# To delete downloaded training artifacts\n", + "#if os.path.exists(path_prefix):\n", + "# dir_util.remove_tree(path_prefix)\n", + "\n", + "# To delete downloaded training videos\n", + "#if path.exists(training_movies_path):\n", + "# dir_util.remove_tree(training_movies_path)\n", + "\n", + "# To delete downloaded rollout videos\n", + "#if path.exists(rollout_movies_path):\n", + "# dir_util.remove_tree(rollout_movies_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next\n", + "This example was about running Azure ML RL (Ray/RLlib Framework) on a single node. Please see [Pong problem](../atari-on-distributed-compute/pong_rllib.ipynb)\n", + "example which uses Ray RLlib to train a Pong playing agent on a multi-node cluster." + ] + } + ], + "metadata": { + "authors": [ + { + "name": "hoazari" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + }, + "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." 
+ }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.yml b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.yml new file mode 100644 index 00000000..0ac02b81 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.yml @@ -0,0 +1,6 @@ +name: cartpole_cc +dependencies: +- pip: + - azureml-sdk + - azureml-contrib-reinforcementlearning + - azureml-widgets diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/cartpole_rollout.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/cartpole_rollout.py new file mode 100644 index 00000000..4a951c72 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/cartpole_rollout.py @@ -0,0 +1,119 @@ +import argparse +import os +import sys + +import ray +from ray.rllib import rollout +from ray.tune.registry import get_trainable_cls + +from azureml.core import Run + +from utils import callbacks + + +DEFAULT_RAY_ADDRESS = 'localhost:6379' + + +def run_rollout(args, parser, ray_address): + + config = args.config + if not args.env: + if not config.get("env"): + parser.error("the following arguments are required: --env") + args.env = config.get("env") + + ray.init(address=ray_address) + + # Create the Trainer from config. + cls = get_trainable_cls(args.run) + agent = cls(env=args.env, config=config) + + # Load state from checkpoint. + agent.restore(args.checkpoint) + num_steps = int(args.steps) + num_episodes = int(args.episodes) + + # Determine the video output directory. 
+ use_arg_monitor = False + try: + args.video_dir + except AttributeError: + print("There is no such attribute: args.video_dir") + use_arg_monitor = True + + video_dir = None + if not use_arg_monitor: + if args.monitor: + video_dir = os.path.join("./logs", "video") + elif args.video_dir: + video_dir = os.path.expanduser(args.video_dir) + + # Do the actual rollout. + with rollout.RolloutSaver( + args.out, + args.use_shelve, + write_update_file=args.track_progress, + target_steps=num_steps, + target_episodes=num_episodes, + save_info=args.save_info) as saver: + if use_arg_monitor: + rollout.rollout( + agent, + args.env, + num_steps, + num_episodes, + saver, + args.no_render, + args.monitor) + else: + rollout.rollout( + agent, args.env, + num_steps, + num_episodes, + saver, + args.no_render, video_dir) + + +if __name__ == "__main__": + + # Add positional argument - serves as placeholder for checkpoint + argvc = sys.argv[1:] + argvc.insert(0, 'checkpoint-placeholder') + + # Parse arguments + rollout_parser = rollout.create_parser() + + rollout_parser.add_argument( + '--checkpoint-number', required=False, type=int, default=1, + help='Checkpoint number of the checkpoint from which to roll out') + + rollout_parser.add_argument( + '--ray-address', required=False, default=DEFAULT_RAY_ADDRESS, + help='The address of the Ray cluster to connect to') + + args = rollout_parser.parse_args(argvc) + + # Get a handle to run + run = Run.get_context() + + # Get handles to the training artifacts dataset and mount path + artifacts_dataset = run.input_datasets['artifacts_dataset'] + artifacts_path = run.input_datasets['artifacts_path'] + + # Find checkpoint file to be evaluated + checkpoint_id = '-' + str(args.checkpoint_number) + checkpoint_files = list(filter( + lambda filename: filename.endswith(checkpoint_id), + artifacts_dataset.to_path())) + + checkpoint_file = checkpoint_files[0] + if checkpoint_file[0] == '/': + checkpoint_file = checkpoint_file[1:] + checkpoint = 
os.path.join(artifacts_path, checkpoint_file) + print('Checkpoint:', checkpoint) + + # Set rollout checkpoint + args.checkpoint = checkpoint + + # Start rollout + run_rollout(args, rollout_parser, args.ray_address) diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/cartpole_training.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/cartpole_training.py new file mode 100644 index 00000000..ae926273 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/cartpole_training.py @@ -0,0 +1,41 @@ +import argparse +import os +import sys + +import ray +from ray.rllib import train +from ray import tune + +from utils import callbacks + + +DEFAULT_RAY_ADDRESS = 'localhost:6379' + + +if __name__ == "__main__": + + # Parse arguments and add callbacks to config + train_parser = train.create_parser() + + args = train_parser.parse_args() + args.config["callbacks"] = {"on_train_result": callbacks.on_train_result} + + # Trace if video capturing is on + if 'monitor' in args.config and args.config['monitor']: + print("Video capturing is ON!") + + # Start (connect to) Ray cluster + if args.ray_address is None: + args.ray_address = DEFAULT_RAY_ADDRESS + + ray.init(address=args.ray_address) + + # Run training task using tune.run + tune.run( + run_or_experiment=args.run, + config=dict(args.config, env=args.env), + stop=args.stop, + checkpoint_freq=args.checkpoint_freq, + checkpoint_at_end=args.checkpoint_at_end, + local_dir=args.local_dir + ) diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/docker/Dockerfile b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/docker/Dockerfile new file mode 100644 index 00000000..fca3dc4a --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/docker/Dockerfile @@ -0,0 +1,29 @@ +FROM mcr.microsoft.com/azureml/base:openmpi3.1.2-ubuntu18.04 + +RUN 
apt-get update && apt-get install -y --no-install-recommends \ + python-opengl \ + rsync \ + xvfb && \ + apt-get clean -y && \ + rm -rf /var/lib/apt/lists/* && \ + rm -rf /usr/share/man/* + +RUN conda install -y conda=4.7.12 python=3.6.2 && conda clean -ay && \ + pip install --no-cache-dir \ + azureml-defaults \ + azureml-dataprep[fuse,pandas] \ + azureml-contrib-reinforcementlearning \ + gputil \ + cloudpickle==1.3.0 \ + tensorboardX \ + tensorflow==1.14.0 \ + tabulate \ + dm_tree \ + lz4 \ + ray==0.8.3 \ + ray[rllib,dashboard,tune]==0.8.3 \ + psutil \ + setproctitle \ + gym[atari] && \ + conda install -y -c conda-forge x264='1!152.20180717' ffmpeg=4.0.2 && \ + conda install opencv diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/callbacks.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/callbacks.py new file mode 100644 index 00000000..f34a4e8c --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/callbacks.py @@ -0,0 +1,17 @@ +'''RLlib callbacks module: + Common callback methods to be passed to RLlib trainer. +''' + +from azureml.core import Run + + +def on_train_result(info): + '''Callback on train result to record metrics returned by trainer. + ''' + run = Run.get_context() + run.log( + name='episode_reward_mean', + value=info["result"]["episode_reward_mean"]) + run.log( + name='episodes_total', + value=info["result"]["episodes_total"]) diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/misc.py b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/misc.py new file mode 100644 index 00000000..f123324e --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/files/utils/misc.py @@ -0,0 +1,13 @@ +'''Misc module: + Miscellaneous helper functions and utilities. 
+''' + +import os +import glob + + +# Helper function to find a file or folder path +def find_path(name, path_prefix): + for root, _, _ in os.walk(path_prefix): + if glob.glob(os.path.join(root, name)): + return root diff --git a/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/images/cartpole.png b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/images/cartpole.png new file mode 100644 index 00000000..f37c084e Binary files /dev/null and b/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/images/cartpole.png differ diff --git a/how-to-use-azureml/reinforcement-learning/setup/devenv_setup.ipynb b/how-to-use-azureml/reinforcement-learning/setup/devenv_setup.ipynb new file mode 100644 index 00000000..00467b6b --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/setup/devenv_setup.ipynb @@ -0,0 +1,245 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/tutorials/how-to-use-azureml/reinforcement-learning/setup/devenv_setup.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Azure ML Reinforcement Learning Sample - Setting Up Development Environment\n", + "\n", + "Ray multi-node cluster setup requires all worker nodes to be able to communicate with the head node. This notebook explains how to set up a virtual network to be used by the Ray head and worker compute targets that are created and used in other notebook examples."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisite\n", + "\n", + "The user should have completed the Azure Machine Learning Tutorial: [Get started creating your first ML experiment with the Python SDK](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup). You will need to make sure that you have a valid subscription id, a resource group and a workspace." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Azure Machine Learning SDK \n", + "Display the Azure Machine Learning SDK version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Azure ML Core imports\n", + "import azureml.core\n", + "\n", + "# Check core SDK version number\n", + "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Azure ML workspace\n", + "Get a reference to an existing Azure ML workspace. Please make sure that the VM sizes `STANDARD_NC6` and `STANDARD_D2_V2` are supported in the workspace's region.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.location, ws.resource_group, sep = ' | ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Virtual Network\n", + "\n", + "If you are using separate compute targets for the Ray head and worker, a virtual network must be created in the resource group. 
If you have already created a virtual network in the resource group, you can skip this step.\n", + "\n", + "To do this, you first must install the Azure Networking API.\n", + "\n", + "`pip install --upgrade azure-mgmt-network`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you need to install the Azure Networking SDK, uncomment the following line.\n", + "#!pip install --upgrade azure-mgmt-network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.mgmt.network import NetworkManagementClient\n", + "\n", + "# Virtual network name\n", + "vnet_name =\"your_vnet\"\n", + "\n", + "# Default subnet\n", + "subnet_name =\"default\"\n", + "\n", + "# The Azure subscription you are using\n", + "subscription_id=ws.subscription_id\n", + "\n", + "# The resource group for the RL cluster\n", + "resource_group=ws.resource_group\n", + "\n", + "# Azure region of the resource group\n", + "location=ws.location\n", + "\n", + "network_client = NetworkManagementClient(ws._auth_object, subscription_id)\n", + "\n", + "async_vnet_creation = network_client.virtual_networks.create_or_update(\n", + " resource_group,\n", + " vnet_name,\n", + " {\n", + " 'location': location,\n", + " 'address_space': {\n", + " 'address_prefixes': ['10.0.0.0/16']\n", + " }\n", + " }\n", + ")\n", + "\n", + "async_vnet_creation.wait()\n", + "print(\"VNet created successfully: \", async_vnet_creation.result())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up Network Security Group on Virtual Network\n", + "\n", + "Depending on your Azure setup, you may need to open certain ports to make it possible for Azure to manage the compute targets that you create. 
The ports that need to be opened are described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-enable-virtual-network).\n", + "\n", + "A common situation is that ports `29876-29877` are closed. The following code will add a security rule to open these ports. Or you can do this manually in the [Azure portal](https://portal.azure.com).\n", + "\n", + "You may need to modify the code below to match your scenario." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azure.mgmt.network.models\n", + "\n", + "security_group_name = vnet_name + '-' + \"nsg\"\n", + "security_rule_name = \"AllowAML\"\n", + "\n", + "# Create a network security group\n", + "nsg_params = azure.mgmt.network.models.NetworkSecurityGroup(\n", + " location=location,\n", + " security_rules=[\n", + " azure.mgmt.network.models.SecurityRule(\n", + " name=security_rule_name,\n", + " access=azure.mgmt.network.models.SecurityRuleAccess.allow,\n", + " description='Azure ML RL rule',\n", + " destination_address_prefix='*',\n", + " destination_port_range='29876-29877',\n", + " direction=azure.mgmt.network.models.SecurityRuleDirection.inbound,\n", + " priority=400,\n", + " protocol=azure.mgmt.network.models.SecurityRuleProtocol.tcp,\n", + " source_address_prefix='BatchNodeManagement',\n", + " source_port_range='*'\n", + " ),\n", + " ],\n", + ")\n", + "\n", + "async_nsg_creation = network_client.network_security_groups.create_or_update(\n", + " resource_group,\n", + " security_group_name,\n", + " nsg_params,\n", + ")\n", + "\n", + "async_nsg_creation.wait() \n", + "print(\"Network security group created successfully: \", async_nsg_creation.result())\n", + "\n", + "network_security_group = network_client.network_security_groups.get(\n", + " resource_group,\n", + " security_group_name,\n", + ")\n", + "\n", + "# Define a subnet to be created with network security group\n", + "subnet = azure.mgmt.network.models.Subnet(\n", + " 
id='default',\n", + " address_prefix='10.0.0.0/24',\n", + " network_security_group=network_security_group\n", + " )\n", + " \n", + "# Create subnet on vnet\n", + "async_subnet_creation = network_client.subnets.create_or_update(\n", + " resource_group_name=resource_group,\n", + " virtual_network_name=vnet_name,\n", + " subnet_name=subnet_name,\n", + " subnet_parameters=subnet\n", + ")\n", + "\n", + "async_subnet_creation.wait()\n", + "print(\"Subnet created successfully:\", async_subnet_creation.result())" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "vineetg" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + }, + "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/how-to-use-azureml/reinforcement-learning/setup/devenv_setup.yml b/how-to-use-azureml/reinforcement-learning/setup/devenv_setup.yml new file mode 100644 index 00000000..e1752561 --- /dev/null +++ b/how-to-use-azureml/reinforcement-learning/setup/devenv_setup.yml @@ -0,0 +1,5 @@ +name: devenv_setup +dependencies: +- pip: + - azureml-sdk + - azure-mgmt-network diff --git a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb index 4dfbc32d..b2f8ef69 100644 --- a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb +++ b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb @@ -100,7 +100,7 @@ "\n", "# Check core SDK version number\n", "\n", - "print(\"This 
notebook was created using SDK version 1.4.0, you are currently running version\", azureml.core.VERSION)" + "print(\"This notebook was created using SDK version 1.5.0, you are currently running version\", azureml.core.VERSION)" ] }, { diff --git a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/keras_mnist.py b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/keras_mnist.py index e4d17706..6b67b9bf 100644 --- a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/keras_mnist.py +++ b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/keras_mnist.py @@ -23,7 +23,7 @@ print("Keras version:", keras.__version__) print("Tensorflow version:", tf.__version__) parser = argparse.ArgumentParser() -parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') +parser.add_argument('--data-folder', type=str, dest='data_folder', default='data', help='data folder mounting point') parser.add_argument('--batch-size', type=int, dest='batch_size', default=50, help='mini batch size for training') parser.add_argument('--first-layer-neurons', type=int, dest='n_hidden_1', default=100, help='# of neurons in the first layer') @@ -84,8 +84,8 @@ class LogRunMetrics(Callback): # callback at the end of every epoch def on_epoch_end(self, epoch, log): # log a value repeated which creates a list - run.log('Loss', log['loss']) - run.log('Accuracy', log['acc']) + run.log('Loss', log['val_loss']) + run.log('Accuracy', log['val_accuracy']) history = model.fit(X_train, y_train, @@ -106,8 +106,8 @@ print('Test accuracy:', score[1]) plt.figure(figsize=(6, 3)) plt.title('MNIST with Keras MLP ({} epochs)'.format(n_epochs), fontsize=14) -plt.plot(history.history['acc'], 'b-', label='Accuracy', lw=4, alpha=0.5) -plt.plot(history.history['loss'], 'r--', label='Loss', lw=4, alpha=0.5) 
+plt.plot(history.history['val_accuracy'], 'b-', label='Accuracy', lw=4, alpha=0.5) +plt.plot(history.history['val_loss'], 'r--', label='Loss', lw=4, alpha=0.5) plt.legend(fontsize=12) plt.grid(True) diff --git a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb index ed1519f2..5aa8dda0 100644 --- a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb +++ b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb @@ -158,13 +158,13 @@ "os.makedirs(data_folder, exist_ok=True)\n", "\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-images-idx3-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'train-images.gz'))\n", + " filename=os.path.join(data_folder, 'train-images-idx3-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-labels-idx1-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'train-labels.gz'))\n", + " filename=os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'test-images.gz'))\n", + " filename=os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'))\n", "urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz',\n", - " filename=os.path.join(data_folder, 'test-labels.gz'))" + " filename=os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'))" ] }, { @@ -186,10 +186,10 @@ "from utils import load_data, one_hot_encode\n", "\n", "# note 
we also shrink the intensity values (X) from 0-255 to 0-1. This helps the model converge faster.\n", - "X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / 255.0\n", - "X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0\n", - "y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)\n", - "y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)\n", + "X_train = load_data(os.path.join(data_folder, 'train-images-idx3-ubyte.gz'), False) / 255.0\n", + "X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / 255.0\n", + "y_train = load_data(os.path.join(data_folder, 'train-labels-idx1-ubyte.gz'), True).reshape(-1)\n", + "y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)\n", "\n", "# now let's show some randomly chosen images from the training set.\n", "count = 0\n", @@ -384,8 +384,8 @@ "```\n", "2. The script is accessing the Azure ML `Run` object by executing `run = Run.get_context()`. Further down the script is using the `run` to report the loss and accuracy at the end of each epoch via callback.\n", "```\n", - " run.log('Loss', log['loss'])\n", - " run.log('Accuracy', log['acc'])\n", + " run.log('Loss', log['val_loss'])\n", + " run.log('Accuracy', log['val_accuracy'])\n", "```\n", "3. When running the script on Azure ML, you can write files out to a folder `./outputs` that is relative to the root directory. This folder is specially tracked by Azure ML in the sense that any files written to that folder during script execution on the remote target will be picked up by Run History; these files (known as artifacts) will be available as part of the run history record." 
] @@ -447,8 +447,9 @@ "est = TensorFlow(source_directory=script_folder,\n", " script_params=script_params,\n", " compute_target=compute_target, \n", - " entry_script='keras_mnist.py', \n", - " pip_packages=['keras==2.2.5','azureml-dataprep[pandas,fuse]','matplotlib'])" + " entry_script='keras_mnist.py',\n", + " framework_version='2.0', \n", + " pip_packages=['keras<=2.3.1','azureml-dataprep[pandas,fuse]','matplotlib'])" ] }, { @@ -727,8 +728,9 @@ "est = TensorFlow(source_directory=script_folder,\n", " script_params={'--data-folder': dataset.as_named_input('mnist').as_mount()},\n", " compute_target=compute_target,\n", - " entry_script='keras_mnist.py', \n", - " pip_packages=['keras==2.2.5','azureml-dataprep[pandas,fuse]','matplotlib'])" + " entry_script='keras_mnist.py',\n", + " framework_version='2.0',\n", + " pip_packages=['keras<=2.3.1','azureml-dataprep[pandas,fuse]','matplotlib'])" ] }, { @@ -944,7 +946,7 @@ "\n", "cd = CondaDependencies.create()\n", "cd.add_tensorflow_conda_package()\n", - "cd.add_conda_package('keras==2.2.5')\n", + "cd.add_conda_package('keras<=2.3.1')\n", "cd.add_pip_package(\"azureml-defaults\")\n", "cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n", "\n", diff --git a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.yml b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.yml index 49b37af7..8fa4d352 100644 --- a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.yml +++ b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.yml @@ -3,8 +3,6 @@ dependencies: - pip: - azureml-sdk - azureml-widgets - - tensorflow==1.13.1 - - keras==2.2.5 - - matplotlib==3.0.3 - - numpy==1.16.2 - - pandas + - 
tensorflow + - keras<=2.3.1 + - matplotlib diff --git a/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb b/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb index 38d1cbe6..abe2830f 100644 --- a/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb +++ b/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb @@ -26,7 +26,6 @@ "* Create an Experiment\n", "* Introduction to AmlCompute\n", "* Submit an AmlCompute run in a few different ways\n", - " - Provision as a run based compute target \n", " - Provision as a persistent compute target (Basic)\n", " - Provision as a persistent compute target (Advanced)\n", "* Additional operations to perform on AmlCompute\n", @@ -301,7 +300,9 @@ "* `admin_username`: Name of Admin user account which will be created on all the nodes of the cluster\n", "* `admin_user_password`: Password that you want to set for the user account above\n", "* `admin_user_ssh_key`: SSH Key for the user account above. You can specify either a password or an SSH key or both\n", - "* `remote_login_port_public_access`: Flag to enable or disable the public SSH port. If you dont specify, AmlCompute will smartly close the port when deploying inside a VNet" + "* `remote_login_port_public_access`: Flag to enable or disable the public SSH port. 
If you dont specify, AmlCompute will smartly close the port when deploying inside a VNet\n", + "* `identity_type`: Compute Identity type that you want to set on the cluster, which can either be SystemAssigned or UserAssigned\n", + "* `identity_id`: Resource ID of identity in case it is a UserAssigned identity, optional otherwise\n" ] }, { @@ -332,7 +333,9 @@ " admin_username='',\n", " admin_user_password='',\n", " admin_user_ssh_key='',\n", - " remote_login_port_public_access='enabled')\n", + " remote_login_port_public_access='enabled',\n", + " identity_type='UserAssigned',\n", + " identity_id='')\n", " cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n", "\n", "cpu_cluster.wait_for_completion(show_output=True)" diff --git a/how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb b/how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb index 76a08906..6216378d 100644 --- a/how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb +++ b/how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb @@ -282,7 +282,7 @@ "# username='username',\n", "# private_key_file='./.ssh/id_rsa')\n", "\n", - " attached_dsvm_compute = ComputeTarget.attach(workspace=ws,\n", + " attached_dsvm_compute = ComputeTarget.attach(workspace=ws,\n", " name=compute_target_name,\n", " attach_configuration=attach_config)\n", " attached_dsvm_compute.wait_for_completion(show_output=True)" diff --git a/index.md b/index.md index 490d6899..0106aa11 100644 --- a/index.md +++ b/index.md @@ -128,6 +128,10 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an | [training_notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/notebook_runner/training_notebook.ipynb) | | | | | | | | 
[nyc-taxi-data-regression-model-building](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb) | | | | | | | | [authentication-in-azureml](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azureml.ipynb) | | | | | | | +| [pong_rllib](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb) | | | | | | | +| [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | | +| [cartpole_cc](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.ipynb) | | | | | | | +| [devenv_setup](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/setup/devenv_setup.ipynb) | | | | | | | | [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None | | [distributed-cntk-with-custom-docker](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/distributed-cntk-with-custom-docker/distributed-cntk-with-custom-docker.ipynb) | | | | | | | | [notebook_example](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/how-to-use-estimator/notebook_example.ipynb) | | | | | | | diff --git a/setup-environment/configuration.ipynb b/setup-environment/configuration.ipynb index cfe1c4ba..3f2eb123 100644 --- 
a/setup-environment/configuration.ipynb +++ b/setup-environment/configuration.ipynb @@ -102,7 +102,7 @@ "source": [ "import azureml.core\n", "\n", - "print(\"This notebook was created using version 1.4.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.5.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb b/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb index 7d067225..f5b06fac 100644 --- a/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb +++ b/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb @@ -296,7 +296,7 @@ "from azureml.core.conda_dependencies import CondaDependencies\n", "from azureml.core.runconfig import DEFAULT_GPU_IMAGE\n", "\n", - "cd = CondaDependencies.create(pip_packages=[\"tensorflow-gpu==1.13.1\", \"azureml-defaults\"])\n", + "cd = CondaDependencies.create(pip_packages=[\"tensorflow-gpu==1.15.2\", \"azureml-defaults\"])\n", "\n", "env = Environment(name=\"parallelenv\")\n", "env.python.conda_dependencies=cd\n",