Compare commits

...

8 Commits

Author SHA1 Message Date
mx-iao
37f37a46c1 Delete pytorch_mnist.py 2021-02-23 11:19:39 -08:00
mx-iao
0cd1412421 Delete distributed-pytorch-with-nccl-gloo.ipynb 2021-02-23 11:19:33 -08:00
mx-iao
c3ae9f00f6 Add files via upload 2021-02-23 11:19:02 -08:00
mx-iao
11b02c650c Rename how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-distributeddataparallel.ipynb to how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-distributeddataparallel/distributed-pytorch-with-distributeddataparallel.ipynb 2021-02-23 11:18:43 -08:00
mx-iao
606048c71f Add files via upload 2021-02-23 11:18:10 -08:00
Harneet Virk
cb1c354d44 Merge pull request #1353 from Azure/release_update/Release-88
update samples from Release-88 as a part of SDK release 1.23.0
2021-02-22 11:49:02 -08:00
amlrelsa-ms
c868fff5a2 update samples from Release-88 as a part of SDK release 2021-02-22 19:23:04 +00:00
Harneet Virk
bc4e6611c4 Merge pull request #1342 from Azure/release_update/Release-87
update samples from Release-87 as a part of SDK release
2021-02-16 18:43:49 -08:00
40 changed files with 809 additions and 811 deletions

View File

@@ -103,7 +103,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
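
Most of the notebook hunks in this compare are the same one-line change: the "created using" SDK version string moves from 1.22.0 to 1.23.0. As an aside, here is a minimal sketch (not part of the diff, and assuming the `packaging` library is installed) of how such a check can be made actionable rather than purely informational:

```python
# Hypothetical helper, not from the repo: warn when the installed SDK is
# older than the version the notebook was authored against.
import azureml.core
from packaging.version import Version  # assumption: 'packaging' is available

EXPECTED_SDK = "1.23.0"

if Version(azureml.core.VERSION) < Version(EXPECTED_SDK):
    print(f"This notebook was created using version {EXPECTED_SDK} of the "
          f"Azure ML SDK, but you are running {azureml.core.VERSION}; "
          "consider upgrading.")
```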

View File

@@ -21,9 +21,9 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.22.0
- azureml-widgets~=1.23.0
- pytorch-transformers==1.0.0
- spacy==2.1.8
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.22.0/validated_win32_requirements.txt [--no-deps]
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.23.0/validated_win32_requirements.txt [--no-deps]
- PyJWT < 2.0.0

View File

@@ -21,10 +21,10 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.22.0
- azureml-widgets~=1.23.0
- pytorch-transformers==1.0.0
- spacy==2.1.8
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.22.0/validated_linux_requirements.txt [--no-deps]
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.23.0/validated_linux_requirements.txt [--no-deps]
- PyJWT < 2.0.0

View File

@@ -22,9 +22,9 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.22.0
- azureml-widgets~=1.23.0
- pytorch-transformers==1.0.0
- spacy==2.1.8
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.22.0/validated_darwin_requirements.txt [--no-deps]
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.23.0/validated_darwin_requirements.txt [--no-deps]
- PyJWT < 2.0.0
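
The three conda specification hunks above apply the same pin bump (1.22.0 to 1.23.0) to the azureml-widgets requirement and to the validated-requirements files for Windows, Linux, and macOS. For context, a short sketch of how such a spec is typically consumed from Python (the file path is an assumption, not taken from the diff):

```python
# Sketch under assumptions: the spec path is illustrative.
from azureml.core import Environment, Workspace

ws = Workspace.from_config()
env = Environment.from_conda_specification(
    name="automl-notebook-env",
    file_path="./auto_ml_dependencies.yml",  # assumed local copy of the spec
)
env.register(workspace=ws)  # optional: reuse the environment across runs
```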

View File

@@ -105,7 +105,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -93,7 +93,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -81,7 +81,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -5,17 +5,13 @@ dependencies:
- pip<=19.3.1
- python>=3.5.2,<3.8
- nb_conda
- matplotlib==2.1.0
- numpy~=1.18.0
- cython
- urllib3<1.24
- scikit-learn==0.22.1
- pandas==0.25.1
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- azureml-sdk
- azureml-widgets
- azureml-explain-model
- pandas
- PyJWT < 2.0.0

View File

@@ -6,17 +6,13 @@ dependencies:
- nomkl
- python>=3.5.2,<3.8
- nb_conda
- matplotlib==2.1.0
- numpy~=1.18.0
- cython
- urllib3<1.24
- scikit-learn==0.22.1
- pandas==0.25.1
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- azureml-sdk
- azureml-widgets
- azureml-explain-model
- pandas
- PyJWT < 2.0.0

View File

@@ -67,11 +67,8 @@
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
" \n",
"\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
@@ -93,7 +90,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -116,9 +113,7 @@
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Run History Name'] = experiment_name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
"output"
]
},
{
@@ -276,34 +271,13 @@
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.wait_for_completion()"
"remote_run.wait_for_completion(show_output=True)"
]
},
{
@@ -368,18 +342,12 @@
"metadata": {},
"outputs": [],
"source": [
"# preview the first 3 rows of the dataset\n",
"\n",
"test_data = test_data.to_pandas_dataframe()\n",
"y_test = test_data['ERP'].fillna(0)\n",
"test_data = test_data.drop('ERP', 1)\n",
"test_data = test_data.fillna(0)\n",
"y_test = test_data.keep_columns('ERP')\n",
"test_data = test_data.drop_columns('ERP')\n",
"\n",
"\n",
"train_data = train_data.to_pandas_dataframe()\n",
"y_train = train_data['ERP'].fillna(0)\n",
"train_data = train_data.drop('ERP', 1)\n",
"train_data = train_data.fillna(0)\n"
"y_train = train_data.keep_columns('ERP')\n",
"train_data = train_data.drop_columns('ERP')\n"
]
},
{
@@ -397,7 +365,16 @@
"outputs": [],
"source": [
"from azureml.train.automl.model_proxy import ModelProxy\n",
"best_model_proxy = ModelProxy(best_run)"
"best_model_proxy = ModelProxy(best_run)\n",
"y_pred_train = best_model_proxy.predict(train_data)\n",
"y_pred_test = best_model_proxy.predict(test_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Exploring results"
]
},
{
@@ -406,60 +383,15 @@
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = best_model_proxy.predict(train_data).to_pandas_dataframe().values.flatten()\n",
"y_pred_train = y_pred_train.to_pandas_dataframe().values.flatten()\n",
"y_train = y_train.to_pandas_dataframe().values.flatten()\n",
"y_residual_train = y_train - y_pred_train\n",
"\n",
"y_pred_test = best_model_proxy.predict(test_data).to_pandas_dataframe().values.flatten()\n",
"y_residual_test = y_test - y_pred_test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Set up a multi-plot chart.\n",
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n",
"f.suptitle('Regression Residual Values', fontsize = 18)\n",
"f.set_figheight(6)\n",
"f.set_figwidth(16)\n",
"\n",
"# Plot residual values of training set.\n",
"a0.axis([0, 360, -100, 100])\n",
"a0.plot(y_residual_train, 'bo', alpha = 0.5)\n",
"a0.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n",
"a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)),fontsize = 12)\n",
"a0.set_xlabel('Training samples', fontsize = 12)\n",
"a0.set_ylabel('Residual Values', fontsize = 12)\n",
"\n",
"# Plot residual values of test set.\n",
"a1.axis([0, 90, -100, 100])\n",
"a1.plot(y_residual_test, 'bo', alpha = 0.5)\n",
"a1.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n",
"a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)),fontsize = 12)\n",
"a1.set_xlabel('Test samples', fontsize = 12)\n",
"a1.set_yticklabels([])\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"test_pred = plt.scatter(y_test, y_pred_test, color='')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
"y_pred_test = y_pred_test.to_pandas_dataframe().values.flatten()\n",
"y_test = y_test.to_pandas_dataframe().values.flatten()\n",
"y_residual_test = y_test - y_pred_test\n",
"print(y_residual_train)\n",
"print(y_residual_test)"
]
},
{
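
The hunks above rework the AutoML regression sample: pandas slicing (`drop`, `fillna`) gives way to `TabularDataset.keep_columns`/`drop_columns`, so `ModelProxy` can score the datasets remotely and the pandas conversion is deferred until the residuals are computed. Distilled into a sketch (the dataset name is a hypothetical stand-in):

```python
# Distilled from the diff above; 'machineData' is an assumed dataset name.
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
data = Dataset.get_by_name(ws, name="machineData")

y = data.keep_columns("ERP")   # label column, still a TabularDataset
X = data.drop_columns("ERP")   # feature columns, still a TabularDataset

# ModelProxy.predict consumes the dataset directly and returns a dataset,
# e.g. y_pred = model_proxy.predict(X).to_pandas_dataframe().values.flatten()
y_true = y.to_pandas_dataframe().values.flatten()
```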

View File

@@ -113,7 +113,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -87,7 +87,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -94,7 +94,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -82,7 +82,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -92,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -375,18 +375,12 @@
"metadata": {},
"outputs": [],
"source": [
"# preview the first 3 rows of the dataset\n",
"\n",
"test_data = test_data.to_pandas_dataframe()\n",
"y_test = test_data['ERP'].fillna(0)\n",
"test_data = test_data.drop('ERP', 1)\n",
"test_data = test_data.fillna(0)\n",
"y_test = test_data.keep_columns('ERP').to_pandas_dataframe()\n",
"test_data = test_data.drop_columns('ERP').to_pandas_dataframe()\n",
"\n",
"\n",
"train_data = train_data.to_pandas_dataframe()\n",
"y_train = train_data['ERP'].fillna(0)\n",
"train_data = train_data.drop('ERP', 1)\n",
"train_data = train_data.fillna(0)\n"
"y_train = train_data.keep_columns('ERP').to_pandas_dataframe()\n",
"train_data = train_data.drop_columns('ERP').to_pandas_dataframe()\n"
]
},
{
@@ -396,10 +390,10 @@
"outputs": [],
"source": [
"y_pred_train = fitted_model.predict(train_data)\n",
"y_residual_train = y_train - y_pred_train\n",
"y_residual_train = y_train.values - y_pred_train\n",
"\n",
"y_pred_test = fitted_model.predict(test_data)\n",
"y_residual_test = y_test - y_pred_test"
"y_residual_test = y_test.values - y_pred_test"
]
},
{

View File

@@ -259,7 +259,7 @@
"run_config.environment.docker.enabled = True\n",
"\n",
"azureml_pip_packages = [\n",
" 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-telemetry', 'azureml-interpret'\n",
" 'azureml-defaults', 'azureml-telemetry', 'azureml-interpret'\n",
"]\n",
"\n",
"# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n",

View File

@@ -57,7 +57,7 @@
"Problem: IBM employee attrition classification with scikit-learn (run model explainer locally and upload explanation to the Azure Machine Learning Run History)\n",
"\n",
"1. Train a SVM classification model using Scikit-learn\n",
"2. Run 'explain_model' with AML Run History, which leverages run history service to store and manage the explanation data\n",
"2. Run 'explain-model-sample' with AML Run History, which leverages run history service to store and manage the explanation data\n",
"---\n",
"\n",
"Setup: If you are using Jupyter notebooks, the extensions should be installed automatically with the package.\n",
@@ -475,7 +475,7 @@
"metadata": {},
"outputs": [],
"source": [
"experiment_name = 'explain_model'\n",
"experiment_name = 'explain-model-sample'\n",
"experiment = Experiment(ws, experiment_name)\n",
"run = experiment.start_logging()\n",
"client = ExplanationClient.from_run(run)"

View File

@@ -323,7 +323,7 @@
"\n",
"# azureml-defaults is required to host the model as a web service.\n",
"azureml_pip_packages = [\n",
" 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n",
" 'azureml-defaults', 'azureml-core', 'azureml-telemetry',\n",
" 'azureml-interpret'\n",
"]\n",
" \n",

View File

@@ -267,7 +267,7 @@
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"azureml_pip_packages = [\n",
" 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-telemetry', 'azureml-interpret'\n",
" 'azureml-defaults', 'azureml-telemetry', 'azureml-interpret'\n",
"]\n",
" \n",
"\n",
@@ -431,7 +431,7 @@
"\n",
"# WARNING: to install this, g++ needs to be available on the Docker image and is not by default (look at the next cell)\n",
"azureml_pip_packages = [\n",
" 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n",
" 'azureml-defaults', 'azureml-core', 'azureml-telemetry',\n",
" 'azureml-interpret'\n",
"]\n",
" \n",

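Several interpretability hunks above drop `azureml-contrib-interpret` from the pip package lists, leaving `azureml-interpret` as the package in use. A sketch of how such a list is typically attached to a run configuration, assuming `CondaDependencies` (the wiring is not shown in the visible hunks):

```python
# Sketch under assumptions; mirrors the trimmed package lists above.
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

azureml_pip_packages = [
    'azureml-defaults', 'azureml-telemetry', 'azureml-interpret'
]

run_config = RunConfiguration()
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    pip_packages=azureml_pip_packages
)
```
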
View File

@@ -341,7 +341,7 @@
"outputs": [],
"source": [
"pipeline = Pipeline(workspace=ws, steps=[step])\n",
"pipeline_run = Experiment(ws, 'azurebatch_experiment').submit(pipeline)"
"pipeline_run = Experiment(ws, 'azurebatch_sample').submit(pipeline)"
]
},
{

View File

@@ -130,7 +130,7 @@
"\n",
"pipeline_draft = PipelineDraft.create(ws, name=\"TestPipelineDraft\",\n",
" description=\"draft description\",\n",
" experiment_name=\"helloworld\",\n",
" experiment_name=\"pipeline_draft_sample\",\n",
" pipeline=pipeline,\n",
" continue_on_step_failure=True,\n",
" tags={'dev': 'true'},\n",

View File

@@ -325,7 +325,7 @@
"outputs": [],
"source": [
"# submit a pipeline run\n",
"pipeline_run1 = Experiment(ws, 'Pipeline_experiment').submit(pipeline1)\n",
"pipeline_run1 = Experiment(ws, 'Pipeline_experiment_sample').submit(pipeline1)\n",
"# publish a pipeline from the submitted pipeline run\n",
"published_pipeline2 = pipeline_run1.publish_pipeline(name=\"My_New_Pipeline2\", description=\"My Published Pipeline Description\", version=\"0.1\", continue_on_step_failure=True)\n",
"published_pipeline2"

View File

@@ -259,7 +259,7 @@
"\n",
"schedule = Schedule.create(workspace=ws, name=\"My_Schedule\",\n",
" pipeline_id=pub_pipeline_id, \n",
" experiment_name='Schedule_Run',\n",
" experiment_name='Schedule-run-sample',\n",
" recurrence=recurrence,\n",
" wait_for_provisioning=True,\n",
" description=\"Schedule Run\")\n",
@@ -445,7 +445,7 @@
"\n",
"schedule = Schedule.create(workspace=ws, name=\"My_Schedule\",\n",
" pipeline_id=pub_pipeline_id, \n",
" experiment_name='Schedule_Run',\n",
" experiment_name='Schedule-run-sample',\n",
" datastore=datastore,\n",
" wait_for_provisioning=True,\n",
" description=\"Schedule Run\")\n",
@@ -516,7 +516,7 @@
"\n",
"schedule = Schedule.create_for_pipeline_endpoint(workspace=ws, name=\"My_Endpoint_Schedule\",\n",
" pipeline_endpoint_id=published_pipeline_endpoint_id,\n",
" experiment_name='Schedule_Run',\n",
" experiment_name='Schedule-run-sample',\n",
" recurrence=recurrence, description=\"Schedule_Run\",\n",
" wait_for_provisioning=True)\n",
"\n",

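The schedule hunks above only rename the target experiment, but they reference a `recurrence` object defined earlier in the notebook. For context, a minimal sketch of what that object typically looks like (the frequency and interval values are assumptions):

```python
# Minimal sketch; 'Day'/1 are illustrative values, not from the diff.
from azureml.pipeline.core.schedule import ScheduleRecurrence

recurrence = ScheduleRecurrence(frequency="Day", interval=1)  # once per day
```
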
View File

@@ -553,7 +553,7 @@
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"pipeline_run = Experiment(ws, name=\"submit_from_endpoint\").submit(pipeline_endpoint_by_name, tags={'endpoint_tag': \"1\"}, pipeline_version=\"0\")"
"pipeline_run = Experiment(ws, name=\"submit_endpoint_sample\").submit(pipeline_endpoint_by_name, tags={'endpoint_tag': \"1\"}, pipeline_version=\"0\")"
]
}
],

View File

@@ -101,7 +101,7 @@
"metadata": {},
"source": [
"## Create an Azure ML experiment\n",
"Let's create an experiment named \"automlstep-classification\" and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.\n",
"Let's create an experiment named \"automlstep-sample\" and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.\n",
"\n",
"The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the `source_directory` for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the `source_directory` would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the `source_directory` of the step."
]
@@ -113,7 +113,7 @@
"outputs": [],
"source": [
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automlstep-classification'\n",
"experiment_name = 'automlstep-sample'\n",
"project_folder = './project'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",

View File

@@ -428,7 +428,7 @@
"metadata": {},
"outputs": [],
"source": [
"pipeline_run1 = Experiment(ws, 'Data_dependency').submit(pipeline1)\n",
"pipeline_run1 = Experiment(ws, 'Data_dependency_sample').submit(pipeline1)\n",
"print(\"Pipeline is submitted for execution\")"
]
},

View File

@@ -0,0 +1,495 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Distributed PyTorch with DistributedDataParallel\n",
"\n",
"In this tutorial, you will train a PyTorch model on the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset using distributed training with PyTorch's `DistributedDataParallel` module across a GPU cluster."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [Configuration](../../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"Opt-in diagnostics for better experience, quality, and security of future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"Diagnostics"
]
},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"\n",
"set_diagnostics_collection(send_diagnostics=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize workspace\n",
"\n",
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `STANDARD_NC6` GPU cluster that autoscales from `0` to `4` nodes.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = 'gpu-cluster'\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current AmlCompute. \n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code creates GPU compute. If you instead want to create CPU compute, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare dataset\n",
"\n",
"Prepare the dataset used for training. We will first download and extract the publicly available CIFAR-10 dataset from the cs.toronto.edu website and then create an Azure ML FileDataset to use the data for training."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download and extract CIFAR-10 data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import urllib\n",
"import tarfile\n",
"import os\n",
"\n",
"url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'\n",
"filename = 'cifar-10-python.tar.gz'\n",
"data_root = 'cifar-10'\n",
"filepath = os.path.join(data_root, filename)\n",
"\n",
"if not os.path.isdir(data_root):\n",
" os.makedirs(data_root, exist_ok=True)\n",
" urllib.request.urlretrieve(url, filepath)\n",
" with tarfile.open(filepath, \"r:gz\") as tar:\n",
" tar.extractall(path=data_root)\n",
" os.remove(filepath) # delete tar.gz file after extraction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Azure ML dataset\n",
"\n",
"The `upload_directory` method will upload the data to a datastore and create a FileDataset from it. In this tutorial we will use the workspace's default datastore."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Dataset\n",
"\n",
"datastore = ws.get_default_datastore()\n",
"dataset = Dataset.File.upload_directory(\n",
" src_dir=data_root, target=(datastore, data_root)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model on the remote compute\n",
"Now that we have the AmlCompute ready to go, let's run our distributed training job."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a project directory\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"project_folder = './pytorch-distr'\n",
"os.makedirs(project_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare training script\n",
"Now you will need to create your training script. In this tutorial, the script for distributed training on CIFAR-10 is already provided for you at `train.py`. In practice, you should be able to take any custom PyTorch training script as is and run it with Azure ML without having to modify your code."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once your script is ready, copy the training script `train.py` into the project directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"shutil.copy('train.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = 'pytorch-distr'\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"In this tutorial, we will use one of Azure ML's curated PyTorch environments for training. [Curated environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments#use-a-curated-environment) are available in your workspace by default. Specifically, we will use the PyTorch 1.6 GPU curated environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"pytorch_env = Environment.get(ws, name='AzureML-PyTorch-1.6-GPU')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure the training job\n",
"\n",
"To launch a distributed PyTorch job on Azure ML, you have two options:\n",
"\n",
"1. Per-process launch - specify the total # of worker processes (typically one per GPU) you want to run, and\n",
"Azure ML will handle launching each process.\n",
"2. Per-node launch with [torch.distributed.launch](https://pytorch.org/docs/stable/distributed.html#launch-utility) - provide the `torch.distributed.launch` command you want to\n",
"run on each node.\n",
"\n",
"For more information, see the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-pytorch#distributeddataparallel).\n",
"\n",
"Both options are shown below."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Per-process launch\n",
"\n",
"To use the per-process launch option in which Azure ML will handle launching each of the processes to run your training script,\n",
"\n",
"1. Specify the training script and arguments\n",
"2. Create a `PyTorchConfiguration` and specify `node_count` and `process_count`. The `process_count` is the total number of processes you want to run for the job; this should typically equal the # of GPUs available on each node multiplied by the # of nodes. Since this tutorial uses the `STANDARD_NC6` SKU, which has one GPU, the total process count for a 2-node job is `2`. If you are using a SKU with >1 GPUs, adjust the `process_count` accordingly.\n",
"\n",
"Azure ML will set the `MASTER_ADDR`, `MASTER_PORT`, `NODE_RANK`, `WORLD_SIZE` environment variables on each node, in addition to the process-level `RANK` and `LOCAL_RANK` environment variables, that are needed for distributed PyTorch training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import PyTorchConfiguration\n",
"\n",
"# create distributed config\n",
"distr_config = PyTorchConfiguration(process_count=2, node_count=2)\n",
"\n",
"# create args\n",
"args = [\"--data-dir\", dataset.as_download(), \"--epochs\", 25]\n",
"\n",
"# create job config\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='train.py',\n",
" arguments=args,\n",
" compute_target=compute_target,\n",
" environment=pytorch_env,\n",
" distributed_job_config=distr_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Per-node launch with `torch.distributed.launch`\n",
"\n",
"If you would instead like to use the PyTorch-provided launch utility `torch.distributed.launch` to handle launching the worker processes on each node, you can do so as well. \n",
"\n",
"1. Provide the launch command to the `command` parameter of ScriptRunConfig. For PyTorch jobs Azure ML will set the `MASTER_ADDR`, `MASTER_PORT`, and `NODE_RANK` environment variables on each node, so you can simply just reference those environment variables in your command. If you are using a SKU with >1 GPUs, adjust the `--nproc_per_node` argument accordingly.\n",
"\n",
"2. Create a `PyTorchConfiguration` and specify the `node_count`. You do not need to specify the `process_count`; by default Azure ML will launch one process per node to run the `command` you provided.\n",
"\n",
"Uncomment the code below to configure a job with this method."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import PyTorchConfiguration\n",
"\n",
"# create distributed config\n",
"distr_config = PyTorchConfiguration(node_count=2)\n",
"\n",
"# define command\n",
"launch_cmd = [\"python -m torch.distributed.launch --nproc_per_node 1 --nnodes 2 \" \\\n",
" \"--node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT --use_env \" \\\n",
" \"train.py --data-dir\", dataset.as_download(), \"--epochs 25\"]\n",
"\n",
"# create job config\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" command=launch_cmd,\n",
" compute_target=compute_target,\n",
" environment=pytorch_env,\n",
" distributed_job_config=distr_config)\n",
"'''"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit job\n",
"Run your experiment by submitting your `ScriptRunConfig` object. Note that this call is asynchronous."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(src)\n",
"print(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor your run\n",
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes. You can see that the widget automatically plots and visualizes the loss metric that we logged to the Azure ML run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, you can block until the script has completed training before running more code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True) # this provides a verbose log"
]
}
],
"metadata": {
"authors": [
{
"name": "minxia"
}
],
"category": "training",
"compute": [
"AML Compute"
],
"datasets": [
"CIFAR-10"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"PyTorch"
],
"friendly_name": "Distributed training with PyTorch",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"tags": [
"None"
],
"task": "Train a model using distributed training via PyTorch DistributedDataParallel"
},
"nbformat": 4,
"nbformat_minor": 2
}
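
The notebook above relies on the environment variables Azure ML injects for PyTorch jobs (`MASTER_ADDR`, `MASTER_PORT`, `NODE_RANK`, `WORLD_SIZE`, `RANK`, `LOCAL_RANK`). The initialization pattern on each worker, distilled from the `train.py` added in the next file:

```python
# Distilled from the train.py in this compare: initialize DDP from the
# environment variables Azure ML sets for PyTorch jobs.
import os
import torch

world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ["LOCAL_RANK"])

if world_size > 1:
    # the default env:// method reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE
    torch.distributed.init_process_group(backend="nccl")
device = torch.device("cuda", local_rank)
```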

View File

@@ -0,0 +1,238 @@
# Copyright (c) 2017 Facebook, Inc. All rights reserved.
# BSD 3-Clause License
#
# Script adapted from:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# ==============================================================================
# imports
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import argparse
# define network architecture
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 32, 3)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(32, 64, 3)
self.conv3 = nn.Conv2d(64, 128, 3)
self.fc1 = nn.Linear(128 * 6 * 6, 120)
self.dropout = nn.Dropout(p=0.2)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(F.relu(self.conv2(x)))
x = self.pool(F.relu(self.conv3(x)))
x = x.view(-1, 128 * 6 * 6)
x = self.dropout(F.relu(self.fc1(x)))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def train(train_loader, model, criterion, optimizer, epoch, device, print_freq, rank):
running_loss = 0.0
for i, data in enumerate(train_loader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % print_freq == 0: # print every print_freq mini-batches
print(
"Rank %d: [%d, %5d] loss: %.3f"
% (rank, epoch + 1, i + 1, running_loss / print_freq)
)
running_loss = 0.0
def evaluate(test_loader, model, device):
classes = (
"plane",
"car",
"bird",
"cat",
"deer",
"dog",
"frog",
"horse",
"ship",
"truck",
)
model.eval()
correct = 0
total = 0
class_correct = list(0.0 for i in range(10))
class_total = list(0.0 for i in range(10))
with torch.no_grad():
for data in test_loader:
images, labels = data[0].to(device), data[1].to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
c = (predicted == labels).squeeze()
for i in range(10):
label = labels[i]
class_correct[label] += c[i].item()
class_total[label] += 1
# print total test set accuracy
print(
"Accuracy of the network on the 10000 test images: %d %%"
% (100 * correct / total)
)
# print test accuracy for each of the classes
for i in range(10):
print(
"Accuracy of %5s : %2d %%"
% (classes[i], 100 * class_correct[i] / class_total[i])
)
def main(args):
# get PyTorch environment variables
world_size = int(os.environ["WORLD_SIZE"])
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
distributed = world_size > 1
# set device
if distributed:
device = torch.device("cuda", local_rank)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize distributed process group using default env:// method
if distributed:
torch.distributed.init_process_group(backend="nccl")
# define train and test dataset DataLoaders
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
train_set = torchvision.datasets.CIFAR10(
root=args.data_dir, train=True, download=False, transform=transform
)
if distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=args.batch_size,
shuffle=(train_sampler is None),
num_workers=args.workers,
sampler=train_sampler,
)
test_set = torchvision.datasets.CIFAR10(
root=args.data_dir, train=False, download=False, transform=transform
)
test_loader = torch.utils.data.DataLoader(
test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers
)
model = Net().to(device)
# wrap model with DDP
if distributed:
model = nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank
)
# define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
model.parameters(), lr=args.learning_rate, momentum=args.momentum
)
# train the model
for epoch in range(args.epochs):
print("Rank %d: Starting epoch %d" % (rank, epoch))
if distributed:
train_sampler.set_epoch(epoch)
model.train()
train(
train_loader,
model,
criterion,
optimizer,
epoch,
device,
args.print_freq,
rank,
)
print("Rank %d: Finished Training" % (rank))
if not distributed or rank == 0:
os.makedirs(args.output_dir, exist_ok=True)
model_path = os.path.join(args.output_dir, "cifar_net.pt")
torch.save(model.state_dict(), model_path)
# evaluate on full test dataset
evaluate(test_loader, model, device)
if __name__ == "__main__":
# setup argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--data-dir", type=str, help="directory containing CIFAR-10 dataset"
)
parser.add_argument("--epochs", default=10, type=int, help="number of epochs")
parser.add_argument(
"--batch-size",
default=16,
type=int,
help="mini batch size for each gpu/process",
)
parser.add_argument(
"--workers",
default=2,
type=int,
help="number of data loading workers for each gpu/process",
)
parser.add_argument(
"--learning-rate", default=0.001, type=float, help="learning rate"
)
parser.add_argument("--momentum", default=0.9, type=float, help="momentum")
parser.add_argument(
"--output-dir", default="outputs", type=str, help="directory to save model to"
)
parser.add_argument(
"--print-freq",
default=200,
type=int,
help="frequency of printing training statistics",
)
args = parser.parse_args()
main(args)

View File

@@ -1,444 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Distributed PyTorch with DistributedDataParallel\n",
"In this tutorial, you will train a PyTorch model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using distributed training with PyTorch's `DistributedDataParallel` module across a GPU cluster. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [Configuration](../../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"Opt-in diagnostics for better experience, quality, and security of future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"Diagnostics"
]
},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"\n",
"set_diagnostics_collection(send_diagnostics=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize workspace\n",
"\n",
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `STANDARD_NC6` GPU cluster that autoscales from `0` to `4` nodes.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = \"gpu-cluster\"\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current AmlCompute. \n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code creates GPU compute. If you instead want to create CPU compute, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model on the remote compute\n",
"Now that we have the AmlCompute ready to go, let's run our distributed training job."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a project directory\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"project_folder = './pytorch-distr'\n",
"os.makedirs(project_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare training script\n",
"Now you will need to create your training script. In this tutorial, the script for distributed training of MNIST is already provided for you at `pytorch_mnist.py`. In practice, you should be able to take any custom PyTorch training script as is and run it with Azure ML without having to modify your code.\n",
"\n",
"However, if you would like to use Azure ML's [metric logging](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#logging) capabilities, you will have to add a small amount of Azure ML logic inside your training script. In this example, at each logging interval, we will log the loss for that minibatch to our Azure ML run.\n",
"\n",
"To do so, in `pytorch_mnist.py`, we will first access the Azure ML `Run` object within the script:\n",
"```Python\n",
"from azureml.core.run import Run\n",
"run = Run.get_context()\n",
"```\n",
"Later within the script, we log the loss metric to our run:\n",
"```Python\n",
"run.log('loss', losses.avg)\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once your script is ready, copy the training script `pytorch_mnist.py` into the project directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"shutil.copy('pytorch_mnist.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = 'pytorch-distr'\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"Define a conda environment YAML file with your training script dependencies and create an Azure ML environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile conda_dependencies.yml\n",
"\n",
"channels:\n",
"- conda-forge\n",
"dependencies:\n",
"- python=3.6.2\n",
"- pip:\n",
" - azureml-defaults\n",
" - torch==1.6.0\n",
" - torchvision==0.7.0\n",
" - future==0.17.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"pytorch_env = Environment.from_conda_specification(name = 'pytorch-1.6-gpu', file_path = './conda_dependencies.yml')\n",
"\n",
"# Specify a GPU base image\n",
"pytorch_env.docker.enabled = True\n",
"pytorch_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure the training job: torch.distributed with NCCL backend\n",
"\n",
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
"\n",
"In order to run a distributed PyTorch job with **torch.distributed** using the NCCL backend, create a `PyTorchConfiguration` and pass it to the `distributed_job_config` parameter of the ScriptRunConfig constructor. Specify `communication_backend='Nccl'` in the PyTorchConfiguration. The below code will configure a 2-node distributed job. The NCCL backend is the recommended backend for PyTorch distributed GPU training.\n",
"\n",
"The script arguments refers to the Azure ML-set environment variables `AZ_BATCHAI_PYTORCH_INIT_METHOD` for shared file-system initialization and `AZ_BATCHAI_TASK_INDEX` for the global rank of each worker process."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import PyTorchConfiguration\n",
"\n",
"args = ['--dist-backend', 'nccl',\n",
" '--dist-url', '$AZ_BATCHAI_PYTORCH_INIT_METHOD',\n",
" '--rank', '$AZ_BATCHAI_TASK_INDEX',\n",
" '--world-size', 2]\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='pytorch_mnist.py',\n",
" arguments=args,\n",
" compute_target=compute_target,\n",
" environment=pytorch_env,\n",
" distributed_job_config=PyTorchConfiguration(communication_backend='Nccl', node_count=2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit job\n",
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(src)\n",
"print(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor your run\n",
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes. You can see that the widget automatically plots and visualizes the loss metric that we logged to the Azure ML run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, you can block until the script has completed training before running more code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True) # this provides a verbose log"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure training job: torch.distributed with Gloo backend\n",
"\n",
"If you would instead like to use the Gloo backend for distributed training, you can do so via the following code. The Gloo backend is recommended for distributed CPU training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import PyTorchConfiguration\n",
"\n",
"args = ['--dist-backend', 'gloo',\n",
" '--dist-url', '$AZ_BATCHAI_PYTORCH_INIT_METHOD',\n",
" '--rank', '$AZ_BATCHAI_TASK_INDEX',\n",
" '--world-size', 2]\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='pytorch_mnist.py',\n",
" arguments=args,\n",
" compute_target=compute_target,\n",
" environment=pytorch_env,\n",
" distributed_job_config=PyTorchConfiguration(communication_backend='Gloo', node_count=2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once you create the ScriptRunConfig, you can follow the submit steps as shown in the previous steps to submit a PyTorch distributed run using the Gloo backend."
]
}
],
"metadata": {
"authors": [
{
"name": "ninhu"
}
],
"category": "training",
"compute": [
"AML Compute"
],
"datasets": [
"MNIST"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"PyTorch"
],
"friendly_name": "Distributed training with PyTorch",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"tags": [
"None"
],
"task": "Train a model using distributed training via Nccl/Gloo"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,209 +0,0 @@
# Copyright (c) 2017, PyTorch contributors
# Modifications copyright (C) Microsoft Corporation
# Licensed under the BSD license
# Adapted from https://github.com/Azure/BatchAI/tree/master/recipes/PyTorch/PyTorch-GPU-Distributed-Gloo
from __future__ import print_function
import argparse
import os
import shutil
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
import torchvision.models as models
from azureml.core.run import Run
# get the Azure ML run object
run = Run.get_context()
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--world-size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--rank', default=-1, type=int,
help='rank of the worker')
best_prec1 = 0
args = parser.parse_args()
args.distributed = args.world_size >= 2
if args.distributed:
dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
train_dataset = datasets.MNIST('data-%d' % args.rank, train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=args.batch_size, shuffle=(train_sampler is None),
num_workers=args.workers, pin_memory=True, sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=args.batch_size, shuffle=False,
num_workers=args.workers, pin_memory=True)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
model = Net()
if not args.distributed:
model = torch.nn.DataParallel(model).cuda()
else:
model.cuda()
model = torch.nn.parallel.DistributedDataParallel(model)
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
def train(epoch):
batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
top5 = AverageMeter()
# switch to train mode
model.train()
end = time.time()
for i, (input, target) in enumerate(train_loader):
# measure data loading time
data_time.update(time.time() - end)
input, target = input.cuda(), target.cuda()
# compute output
try:
output = model(input)
loss = criterion(output, target)
# measure accuracy and record loss
prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
losses.update(loss.item(), input.size(0))
top1.update(prec1[0], input.size(0))
top5.update(prec5[0], input.size(0))
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % 10 == 0:
run.log("loss", losses.avg)
run.log("prec@1", "{0:.3f}".format(top1.avg))
run.log("prec@5", "{0:.3f}".format(top5.avg))
print('Epoch: [{0}][{1}/{2}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
batch_time=batch_time, data_time=data_time,
loss=losses, top1=top1, top5=top5))
except:
import sys
print("Unexpected error:", sys.exc_info()[0])
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
for epoch in range(1, args.epochs + 1):
train(epoch)

View File

@@ -147,7 +147,7 @@
"\n",
"To do this, you first must install the Azure Networking API.\n",
"\n",
"`pip install --upgrade azure-mgmt-network`"
"`pip install --upgrade azure-mgmt-network==12.0.0`"
]
},
{
@@ -157,7 +157,7 @@
"outputs": [],
"source": [
"# If you need to install the Azure Networking SDK, uncomment the following line.\n",
"#!pip install --upgrade azure-mgmt-network"
"#!pip install --upgrade azure-mgmt-network==12.0.0"
]
},
{

View File

@@ -167,7 +167,7 @@
"\n",
"To do this, you first must install the Azure Networking API.\n",
"\n",
"`pip install --upgrade azure-mgmt-network`"
"`pip install --upgrade azure-mgmt-network==12.0.0`"
]
},
{
@@ -177,7 +177,7 @@
"outputs": [],
"source": [
"# If you need to install the Azure Networking SDK, uncomment the following line.\n",
"#!pip install --upgrade azure-mgmt-network"
"#!pip install --upgrade azure-mgmt-network==12.0.0"
]
},
{

View File

@@ -100,7 +100,7 @@
"\n",
"# Check core SDK version number\n",
"\n",
"print(\"This notebook was created using SDK version 1.22.0, you are currently running version\", azureml.core.VERSION)"
"print(\"This notebook was created using SDK version 1.23.0, you are currently running version\", azureml.core.VERSION)"
]
},
{

View File

@@ -98,7 +98,7 @@
"metadata": {},
"outputs": [],
"source": [
"experiment_name = \"experiment-with-mlflow\"\n",
"experiment_name = \"LocalTrain-with-mlflow-sample\"\n",
"mlflow.set_experiment(experiment_name)"
]
},

View File

@@ -123,7 +123,7 @@
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = \"experiment-with-mlflow\"\n",
"experiment_name = \"RemoteTrain-with-mlflow-sample\"\n",
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
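
Both MLflow hunks above only rename the experiments. These samples presuppose that MLflow has been pointed at the workspace tracking URI; a minimal sketch of that setup, assuming the `azureml-mlflow` package is installed:

```python
# Sketch under assumptions: requires the azureml-mlflow package.
import mlflow
from azureml.core import Workspace

ws = Workspace.from_config()
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment("LocalTrain-with-mlflow-sample")
```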

View File

@@ -102,7 +102,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.22.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.23.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},