Compare commits

...

416 Commits

Author SHA1 Message Date
Shané Winner
784827cdd2 Update README.md 2019-08-27 09:23:40 -07:00
vizhur
0957af04ca Merge pull request #545 from Azure/imatiach-msft-patch-1
add dataprep dependency to notebook
2019-08-23 13:14:30 -04:00
Ilya Matiach
a3bdd193d1 add dataprep dependency to notebook
add dataprep dependency to train-explain-model-on-amlcompute-and-deploy.ipynb notebook for azureml-explain-model package
2019-08-23 13:11:36 -04:00
Shané Winner
dff09970ac Update README.md 2019-08-23 08:38:01 -07:00
Shané Winner
abc7d21711 Update README.md 2019-08-23 05:28:45 +00:00
Shané Winner
ec12ef635f Delete azure-ml-datadrift.ipynb 2019-08-21 10:32:40 -07:00
Shané Winner
81b3e6f09f Delete azure-ml-datadrift.yml 2019-08-21 10:32:32 -07:00
Shané Winner
cc167dceda Delete score.py 2019-08-21 10:32:23 -07:00
Shané Winner
bc52a6d8ee Delete datasets-diff.ipynb 2019-08-21 10:31:50 -07:00
Shané Winner
5bbbdbe73c Delete Titanic.csv 2019-08-21 10:31:38 -07:00
Shané Winner
fd4de05ddd Delete train.py 2019-08-21 10:31:26 -07:00
Shané Winner
9eaab2189d Delete datasets-tutorial.ipynb 2019-08-21 10:31:15 -07:00
Shané Winner
12147754b2 Delete datasets-diff.ipynb 2019-08-21 10:31:05 -07:00
Shané Winner
90ef263823 Delete README.md 2019-08-21 10:30:54 -07:00
Shané Winner
143590cfb4 Delete new-york-taxi_scale-out.ipynb 2019-08-21 10:30:39 -07:00
Shané Winner
40379014ad Delete new-york-taxi.ipynb 2019-08-21 10:30:29 -07:00
Shané Winner
f7b0e99fa1 Delete part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet 2019-08-21 10:30:18 -07:00
Shané Winner
7a7ac48411 Delete part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet 2019-08-21 10:30:04 -07:00
Shané Winner
50107c5b1e Delete part-00007-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:29:51 -07:00
Shané Winner
e41d7e6819 Delete part-00006-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:29:36 -07:00
Shané Winner
691e038e84 Delete part-00005-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:29:18 -07:00
Shané Winner
426e79d635 Delete part-00004-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:29:02 -07:00
Shané Winner
326677e87f Delete part-00003-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:28:45 -07:00
Shané Winner
44988e30ae Delete part-00002-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:28:31 -07:00
Shané Winner
646ae37384 Delete part-00001-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:28:18 -07:00
Shané Winner
457e29a663 Delete part-00000-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-08-21 10:28:03 -07:00
Shané Winner
2771edfb2c Delete _SUCCESS 2019-08-21 10:27:45 -07:00
Shané Winner
f0001ec322 Delete adls-dpreptestfiles.crt 2019-08-21 10:27:31 -07:00
Shané Winner
d3e02a017d Delete chicago-aldermen-2015.csv 2019-08-21 10:27:05 -07:00
Shané Winner
a0ebed6876 Delete crime-dirty.csv 2019-08-21 10:26:55 -07:00
Shané Winner
dc0ab6db47 Delete crime-spring.csv 2019-08-21 10:26:45 -07:00
Shané Winner
ea7900f82c Delete crime-winter.csv 2019-08-21 10:26:35 -07:00
Shané Winner
0cb3fd180d Delete crime.parquet 2019-08-21 10:26:26 -07:00
Shané Winner
b05c3e46bb Delete crime.txt 2019-08-21 10:26:17 -07:00
Shané Winner
a1b7d298d3 Delete crime.xlsx 2019-08-21 10:25:41 -07:00
Shané Winner
cc5516c3b3 Delete crime_duplicate_headers.csv 2019-08-21 10:25:32 -07:00
Shané Winner
4fb6070b89 Delete crime.zip 2019-08-21 10:25:23 -07:00
Shané Winner
1b926cdf53 Delete crime-full.csv 2019-08-21 10:25:13 -07:00
Shané Winner
72fc00fb65 Delete crime.dprep 2019-08-21 10:24:56 -07:00
Shané Winner
ddc6b57253 Delete ADLSgen2-datapreptest.crt 2019-08-21 10:24:47 -07:00
Shané Winner
e8b3b98338 Delete crime_fixed_width_file.txt 2019-08-21 10:24:38 -07:00
Shané Winner
66325a1405 Delete crime_multiple_separators.csv 2019-08-21 10:24:29 -07:00
Shané Winner
0efbeaf4b8 Delete json.json 2019-08-21 10:24:12 -07:00
Shané Winner
11d487fb28 Merge pull request #542 from Azure/sgilley/update-deploy
change deployment to model-centric approach
2019-08-21 10:22:13 -07:00
Shané Winner
073e319ef9 Delete large_dflow.json 2019-08-21 10:21:41 -07:00
Shané Winner
3ed75f28d1 Delete map_func.py 2019-08-21 10:21:23 -07:00
Shané Winner
bfc0367f54 Delete median_income.csv 2019-08-21 10:21:14 -07:00
Shané Winner
075eeb583f Delete median_income_transformed.csv 2019-08-21 10:21:05 -07:00
Shané Winner
b7531d3b9e Delete parquet.parquet 2019-08-21 10:20:55 -07:00
Shané Winner
41dc3bd1cf Delete secrets.dprep 2019-08-21 10:20:45 -07:00
Shané Winner
b790b385a4 Delete stream-path.csv 2019-08-21 10:20:36 -07:00
Shané Winner
8700328fe9 Delete summarize.ipynb 2019-08-21 10:17:21 -07:00
Shané Winner
adbd2c8200 Delete subsetting-sampling.ipynb 2019-08-21 10:17:12 -07:00
Shané Winner
7d552effb0 Delete split-column-by-example.ipynb 2019-08-21 10:17:01 -07:00
Shané Winner
bc81d2a5a7 Delete semantic-types.ipynb 2019-08-21 10:16:52 -07:00
Shané Winner
7620de2d91 Delete secrets.ipynb 2019-08-21 10:16:42 -07:00
Shané Winner
07a43a0444 Delete replace-fill-error.ipynb 2019-08-21 10:16:33 -07:00
Shané Winner
f4d5874e09 Delete replace-datasource-replace-reference.ipynb 2019-08-21 10:16:23 -07:00
Shané Winner
8a0b4d24bd Delete random-split.ipynb 2019-08-21 10:16:14 -07:00
Shané Winner
636f19be1f Delete quantile-transformation.ipynb 2019-08-21 10:16:04 -07:00
Shané Winner
0fd7f7d9b2 Delete open-save-dataflows.ipynb 2019-08-21 10:15:54 -07:00
Shané Winner
ab6c66534f Delete one-hot-encoder.ipynb 2019-08-21 10:15:45 -07:00
Shané Winner
faccf13759 Delete min-max-scaler.ipynb 2019-08-21 10:15:36 -07:00
Shané Winner
4c6a28e4ed Delete label-encoder.ipynb 2019-08-21 10:15:25 -07:00
Shané Winner
64ad88e2cb Delete join.ipynb 2019-08-21 10:15:17 -07:00
Shané Winner
969ac90d39 Delete impute-missing-values.ipynb 2019-08-21 10:12:12 -07:00
Shané Winner
fb977c1e95 Delete fuzzy-group.ipynb 2019-08-21 10:12:03 -07:00
Shané Winner
d5ba3916f7 Delete filtering.ipynb 2019-08-21 10:11:53 -07:00
Shané Winner
f7f1087337 Delete external-references.ipynb 2019-08-21 10:11:43 -07:00
Shané Winner
47ea2dbc03 Delete derive-column-by-example.ipynb 2019-08-21 10:11:33 -07:00
Shané Winner
bd2cf534e5 Delete datastore.ipynb 2019-08-21 10:11:24 -07:00
Shané Winner
65f1668d69 Delete data-profile.ipynb 2019-08-21 10:11:16 -07:00
Shané Winner
e0fb7df0aa Delete data-ingestion.ipynb 2019-08-21 10:11:06 -07:00
Shané Winner
7047f76299 Delete custom-python-transforms.ipynb 2019-08-21 10:10:56 -07:00
Shané Winner
c39f2d5eb6 Delete column-type-transforms.ipynb 2019-08-21 10:10:45 -07:00
Shané Winner
5fda69a388 Delete column-manipulations.ipynb 2019-08-21 10:10:36 -07:00
Shané Winner
87ce954eef Delete cache.ipynb 2019-08-21 10:10:26 -07:00
Shané Winner
ebbeac413a Delete auto-read-file.ipynb 2019-08-21 10:10:15 -07:00
Shané Winner
a68bbaaab4 Delete assertions.ipynb 2019-08-21 10:10:05 -07:00
Shané Winner
8784dc979f Delete append-columns-and-rows.ipynb 2019-08-21 10:09:55 -07:00
Shané Winner
f8047544fc Delete add-column-using-expression.ipynb 2019-08-21 10:09:44 -07:00
Shané Winner
eeb2a05e4f Delete working-with-file-streams.ipynb 2019-08-21 10:09:33 -07:00
Shané Winner
6db9d7bd8b Delete writing-data.ipynb 2019-08-21 10:09:19 -07:00
Shané Winner
80e2fde734 Delete getting-started.ipynb 2019-08-21 10:09:04 -07:00
Shané Winner
ae4f5d40ee Delete README.md 2019-08-21 10:08:53 -07:00
Shané Winner
5516edadfd Delete README.md 2019-08-21 10:08:13 -07:00
Sheri Gilley
475afbf44b change deployment to model-centric approach 2019-08-21 09:50:49 -05:00
Shané Winner
197eaf1aab Merge pull request #541 from Azure/sdgilley/update-tutorial
Update img-classification-part1-training.ipynb
2019-08-20 15:59:24 -07:00
Sheri Gilley
184680f1d2 Update img-classification-part1-training.ipynb
updated explanation of datastore
2019-08-20 17:52:45 -05:00
Shané Winner
474f58bd0b Merge pull request #540 from trevorbye/master
removing tutorials for single combined tutorial
2019-08-20 15:22:47 -07:00
Trevor Bye
22c8433897 removing tutorials for single combined tutorial 2019-08-20 12:09:21 -07:00
Josée Martens
822cdd0f01 Update issue templates 2019-08-20 08:35:00 -05:00
Josée Martens
6e65d42986 Update issue templates 2019-08-20 08:26:45 -05:00
Harneet Virk
4c0cbac834 Merge pull request #537 from Azure/release_update/Release-141
update samples from Release-141 as a part of 1.0.57 SDK release
2019-08-19 18:32:44 -07:00
vizhur
44a7481ed1 update samples from Release-141 as a part of 1.0.57 SDK release 2019-08-19 23:33:44 +00:00
Ilya Matiach
8f418b216d Merge pull request #526 from imatiach-msft/ilmat/remove-old-explain-dirs
removing old explain model directories
2019-08-13 12:37:00 -04:00
Ilya Matiach
2d549ecad3 removing old directories 2019-08-13 12:31:51 -04:00
Josée Martens
4dbb024529 Update issue templates 2019-08-11 18:02:17 -05:00
Josée Martens
142a1a510e Update issue templates 2019-08-11 18:00:12 -05:00
vizhur
2522486c26 Merge pull request #519 from wamartin-aml/master
Add dataprep dependency
2019-08-08 09:34:36 -04:00
Walter Martin
6d5226e47c Add dataprep dependency 2019-08-08 09:31:18 -04:00
Shané Winner
e7676d7cdc Delete README.md 2019-08-07 13:14:39 -07:00
Shané Winner
a84f6636f1 Delete README.md 2019-08-07 13:14:24 -07:00
Roope Astala
41be10d1c1 Delete authentication-in-azure-ml.ipynb 2019-08-07 10:12:48 -04:00
vizhur
429eb43914 Merge pull request #513 from Azure/release_update/Release-139
update samples from Release-139 as a part of 1.0.55 SDK release
2019-08-05 16:22:25 -04:00
vizhur
c0dae0c645 update samples from Release-139 as a part of 1.0.55 SDK release 2019-08-05 18:39:19 +00:00
Shané Winner
e4d9a2b4c5 Delete score.py 2019-07-29 09:33:11 -07:00
Shané Winner
7648e8f516 Delete readme.md 2019-07-29 09:32:55 -07:00
Shané Winner
b5ed94b4eb Delete azure-ml-datadrift.ipynb 2019-07-29 09:32:47 -07:00
Shané Winner
85e487f74f Delete new-york-taxi_scale-out.ipynb 2019-07-28 00:38:05 -07:00
Shané Winner
c0a5b2de79 Delete new-york-taxi.ipynb 2019-07-28 00:37:56 -07:00
Shané Winner
0a9e076e5f Delete stream-path.csv 2019-07-28 00:37:44 -07:00
Shané Winner
e3b974811d Delete secrets.dprep 2019-07-28 00:37:33 -07:00
Shané Winner
381d1a6f35 Delete parquet.parquet 2019-07-28 00:37:20 -07:00
Shané Winner
adaa55675e Delete median_income_transformed.csv 2019-07-28 00:37:12 -07:00
Shané Winner
5e3c592d4b Delete median_income.csv 2019-07-28 00:37:02 -07:00
Shané Winner
9c6f1e2571 Delete map_func.py 2019-07-28 00:36:52 -07:00
Shané Winner
bd1bedd563 Delete large_dflow.json 2019-07-28 00:36:43 -07:00
Shané Winner
9716f3614e Delete json.json 2019-07-28 00:36:30 -07:00
Shané Winner
d2c72ca149 Delete crime_multiple_separators.csv 2019-07-28 00:36:19 -07:00
Shané Winner
4f62f64207 Delete crime_fixed_width_file.txt 2019-07-28 00:36:10 -07:00
Shané Winner
16473eb33e Delete crime_duplicate_headers.csv 2019-07-28 00:36:01 -07:00
Shané Winner
d10474c249 Delete crime.zip 2019-07-28 00:35:51 -07:00
Shané Winner
6389cc16f9 Delete crime.xlsx 2019-07-28 00:35:41 -07:00
Shané Winner
bc0a8e0152 Delete crime.txt 2019-07-28 00:35:30 -07:00
Shané Winner
39384aea52 Delete crime.parquet 2019-07-28 00:35:20 -07:00
Shané Winner
5bf4b0bafe Delete crime.dprep 2019-07-28 00:35:11 -07:00
Shané Winner
f22adb7949 Delete crime-winter.csv 2019-07-28 00:35:00 -07:00
Shané Winner
8409ab7133 Delete crime-spring.csv 2019-07-28 00:34:50 -07:00
Shané Winner
32acd55774 Delete crime-full.csv 2019-07-28 00:34:39 -07:00
Shané Winner
7f65c1a255 Delete crime-dirty.csv 2019-07-28 00:34:27 -07:00
Shané Winner
bc7ccc7ef3 Delete chicago-aldermen-2015.csv 2019-07-28 00:34:17 -07:00
Shané Winner
1cc79a71e9 Delete adls-dpreptestfiles.crt 2019-07-28 00:34:05 -07:00
Shané Winner
c0bec5f110 Delete part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet 2019-07-28 00:33:51 -07:00
Shané Winner
77e5664482 Delete part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet 2019-07-28 00:33:38 -07:00
Shané Winner
e2eb64372a Delete part-00007-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:33:23 -07:00
Shané Winner
03cbb6a3a2 Delete part-00006-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:33:12 -07:00
Shané Winner
44d3d998a8 Delete part-00005-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:33:00 -07:00
Shané Winner
c626f37057 Delete part-00004-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:32:48 -07:00
Shané Winner
0175574864 Delete part-00003-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:32:37 -07:00
Shané Winner
f6e8d57da3 Delete part-00002-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:32:25 -07:00
Shané Winner
01cd31ce44 Delete part-00001-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:32:13 -07:00
Shané Winner
eb2024b3e0 Delete part-00000-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv 2019-07-28 00:32:01 -07:00
Shané Winner
6bce41b3d7 Delete _SUCCESS 2019-07-28 00:31:49 -07:00
Shané Winner
bbdabbb552 Delete writing-data.ipynb 2019-07-28 00:31:32 -07:00
Shané Winner
65343fc263 Delete working-with-file-streams.ipynb 2019-07-28 00:31:22 -07:00
Shané Winner
b6b27fded6 Delete summarize.ipynb 2019-07-28 00:26:56 -07:00
Shané Winner
7e492cbeb6 Delete subsetting-sampling.ipynb 2019-07-28 00:26:41 -07:00
Shané Winner
4cc8f4c6af Delete split-column-by-example.ipynb 2019-07-28 00:26:25 -07:00
Shané Winner
9fba46821b Delete semantic-types.ipynb 2019-07-28 00:26:11 -07:00
Shané Winner
a45954a58f Delete secrets.ipynb 2019-07-28 00:25:58 -07:00
Shané Winner
f16dfb0e5b Delete replace-fill-error.ipynb 2019-07-28 00:25:45 -07:00
Shané Winner
edabbf9031 Delete replace-datasource-replace-reference.ipynb 2019-07-28 00:25:32 -07:00
Shané Winner
63d1d57dfb Delete random-split.ipynb 2019-07-28 00:25:21 -07:00
Shané Winner
10f7004161 Delete quantile-transformation.ipynb 2019-07-28 00:25:10 -07:00
Shané Winner
86ba4e7406 Delete open-save-dataflows.ipynb 2019-07-28 00:24:54 -07:00
Shané Winner
33bda032b8 Delete one-hot-encoder.ipynb 2019-07-28 00:24:43 -07:00
Shané Winner
0fd4bfbc56 Delete min-max-scaler.ipynb 2019-07-28 00:24:32 -07:00
Shané Winner
3fe08c944e Delete label-encoder.ipynb 2019-07-28 00:24:21 -07:00
Shané Winner
d587ea5676 Delete join.ipynb 2019-07-28 00:24:08 -07:00
Shané Winner
edd8562102 Delete impute-missing-values.ipynb 2019-07-28 00:23:55 -07:00
Shané Winner
5ac2c63336 Delete fuzzy-group.ipynb 2019-07-28 00:23:41 -07:00
Shané Winner
1f4e4cdda2 Delete filtering.ipynb 2019-07-28 00:23:28 -07:00
Shané Winner
2e245c1691 Delete external-references.ipynb 2019-07-28 00:23:11 -07:00
Shané Winner
e1b09f71fa Delete derive-column-by-example.ipynb 2019-07-28 00:22:54 -07:00
Shané Winner
8e2220d397 Delete datastore.ipynb 2019-07-28 00:22:43 -07:00
Shané Winner
f74ccf5048 Delete data-profile.ipynb 2019-07-28 00:22:32 -07:00
Shané Winner
97a6d9ca43 Delete data-ingestion.ipynb 2019-07-28 00:22:21 -07:00
Shané Winner
a0ff1c6b64 Delete custom-python-transforms.ipynb 2019-07-28 00:22:11 -07:00
Shané Winner
08f15ef4cf Delete column-type-transforms.ipynb 2019-07-28 00:21:58 -07:00
Shané Winner
7160416c0b Delete column-manipulations.ipynb 2019-07-28 00:21:47 -07:00
Shané Winner
218fed3d65 Delete cache.ipynb 2019-07-28 00:21:35 -07:00
Shané Winner
b8499dfb98 Delete auto-read-file.ipynb 2019-07-28 00:21:22 -07:00
Shané Winner
6bfd472cc2 Delete assertions.ipynb 2019-07-28 00:20:55 -07:00
Shané Winner
ecefb229e9 Delete append-columns-and-rows.ipynb 2019-07-28 00:20:40 -07:00
Shané Winner
883ad806ba Delete add-column-using-expression.ipynb 2019-07-28 00:20:22 -07:00
Shané Winner
848b5bc302 Delete getting-started.ipynb 2019-07-28 00:19:59 -07:00
Shané Winner
58087b53a0 Delete README.md 2019-07-28 00:19:45 -07:00
Shané Winner
ff4d5450a7 Delete README.md 2019-07-28 00:19:29 -07:00
Shané Winner
e2b2b89842 Delete datasets-tutorial.ipynb 2019-07-28 00:19:13 -07:00
Shané Winner
390be2ba24 Delete train.py 2019-07-28 00:19:00 -07:00
Shané Winner
cd1258f81d Delete Titanic.csv 2019-07-28 00:18:41 -07:00
Shané Winner
8a0b48ea48 Delete README.md 2019-07-28 00:18:14 -07:00
Roope Astala
b0dc904189 Merge pull request #502 from msdavx/patch-1
Add demo notebook for datasets diff attribute.
2019-07-26 19:16:13 -04:00
msdavx
82bede239a Add demo notebook for datasets diff attribute. 2019-07-26 11:10:37 -07:00
vizhur
774517e173 Merge pull request #500 from Azure/release_update/Release-137
update samples from Release-137 as a part of 1.0.53 SDK release
2019-07-25 16:36:25 -04:00
Shané Winner
c3ce2bc7fe Delete README.md 2019-07-25 13:28:15 -07:00
Shané Winner
5dd09a1f7c Delete README.md 2019-07-25 13:28:01 -07:00
vizhur
ee1da0ee19 update samples from Release-137 as a part of 1.0.53 SDK release 2019-07-24 22:37:36 +00:00
Paula Ledgerwood
ddfce6b24c Merge pull request #498 from Azure/revert-461-master
Revert "Finetune SSD VGG"
2019-07-24 14:25:43 -07:00
Paula Ledgerwood
31dfc3dc55 Revert "Finetune SSD VGG" 2019-07-24 14:08:00 -07:00
Paula Ledgerwood
168c45b188 Merge pull request #461 from borisneal/master
Finetune SSD VGG
2019-07-24 14:07:15 -07:00
fierval
159948db67 moving notice.txt 2019-07-24 08:50:41 -07:00
fierval
d842731a3b remove tf prereq item 2019-07-23 14:58:51 -07:00
fierval
7822fd4c13 notice + attribution for anchors 2019-07-23 14:49:20 -07:00
fierval
d9fbe4cd87 new folder structure 2019-07-22 10:31:22 -07:00
Shané Winner
a64f4d331a Merge pull request #488 from trevorbye/master
adding new notebook
2019-07-18 10:40:36 -07:00
Trevor Bye
c41f449208 adding new notebook 2019-07-18 10:27:21 -07:00
vizhur
4fe8c1702d Merge pull request #486 from Azure/release_update/Release-22
Fix for automl remote env
2019-07-12 19:18:13 -04:00
vizhur
18cd152591 update samples - test 2019-07-12 22:51:17 +00:00
vizhur
4170a394ed Merge pull request #474 from Azure/release_update/Release-132
update samples from Release-132 as a part of 1.0.48 SDK release
2019-07-09 19:14:29 -04:00
vizhur
475ea36106 update samples from Release-132 as a part of 1.0.48 SDK release 2019-07-09 22:02:57 +00:00
Roope Astala
9e0fc4f0e7 Merge pull request #459 from datashinobi/yassine/datadrift2
fix link to config nb & settingwithcopywarning
2019-07-03 12:41:31 -04:00
fierval
b025816c92 remove config.json 2019-07-02 17:32:56 -07:00
fierval
c75e820107 ssd vgg 2019-07-02 17:23:56 -07:00
Yassine Khelifi
e97e4742ba fix link to config nb & settingwithcopywarning 2019-07-02 16:56:21 +00:00
Roope Astala
14ecfb0bf3 Merge pull request #448 from jeff-shepherd/master
Update new notebooks to use dataprep and add sql files
2019-06-27 09:07:47 -04:00
Jeff Shepherd
61b396be4f Added sql files 2019-06-26 14:26:01 -07:00
Jeff Shepherd
3d2552174d Updated notebooks to use dataprep 2019-06-26 14:23:20 -07:00
Roope Astala
cd3c980a6e Merge pull request #447 from Azure/release-1.0.45
Merged notebook changes from release 1.0.45
2019-06-26 16:32:09 -04:00
Heather Shapiro
249bcac3c7 Merged notebook changes from release 1.0.45 2019-06-26 14:39:09 -04:00
Roope Astala
4a6bcebccc Update configuration.ipynb 2019-06-21 09:35:13 -04:00
Roope Astala
56e0ebc5ac Merge pull request #438 from rastala/master
add pipeline scripts
2019-06-19 18:56:42 -04:00
rastala
2aa39f2f4a add pipeline scripts 2019-06-19 18:55:32 -04:00
Roope Astala
4d247c1877 Merge pull request #437 from rastala/master
pytorch with mlflow
2019-06-19 17:23:06 -04:00
rastala
f6682f6f6d pytorch with mlflow 2019-06-19 17:21:52 -04:00
Roope Astala
26ecf25233 Merge pull request #436 from rastala/master
Update readme
2019-06-19 11:52:23 -04:00
Roope Astala
44c3a486c0 update readme 2019-06-19 11:49:49 -04:00
Roope Astala
c574f429b8 update readme 2019-06-19 11:48:52 -04:00
Roope Astala
77d557a5dc Merge pull request #435 from ganzhi/jamgan/drift
Add demo notebook for AML Data Drift
2019-06-17 16:39:46 -04:00
James Gan
13dedec4a4 Make it in same folder as internal repo 2019-06-17 13:38:27 -07:00
James Gan
6f5c52676f Add notebook to demo data drift 2019-06-17 13:33:30 -07:00
James Gan
90c105537c Add demo notebook for AML Data Drift 2019-06-17 13:31:08 -07:00
Roope Astala
ef264b1073 Merge pull request #434 from rastala/master
update pytorch
2019-06-17 11:57:29 -04:00
Roope Astala
824ac5e021 update pytorch 2019-06-17 11:56:42 -04:00
Roope Astala
e9a7b95716 Merge pull request #421 from csteegz/csteegz-add-warning
Add warning for using prediction client on azure notebooks
2019-06-13 20:27:34 -04:00
Roope Astala
789ee26357 Merge pull request #431 from jeff-shepherd/master
Fixed path for auto-ml-remote-amlcompute notebook
2019-06-13 16:56:25 -04:00
Jeff Shepherd
fc541706e7 Fixed path for auto-ml-remote-amlcompute 2019-06-13 13:12:32 -07:00
Roope Astala
64b8aa2a55 Merge pull request #429 from jeff-shepherd/master
Removed deprecated notebooks from readme
2019-06-13 14:40:57 -04:00
Jeff Shepherd
d3dc35dbb6 Removed deprecated notebooks from readme 2019-06-13 11:03:25 -07:00
Roope Astala
b55ac368e7 Merge pull request #428 from rastala/master
update cluster creation
2019-06-13 12:16:30 -04:00
Roope Astala
de162316d7 update cluster creation 2019-06-13 12:14:58 -04:00
Roope Astala
4ecc58dfe2 Merge pull request #427 from rastala/master
dockerfile
2019-06-12 10:24:34 -04:00
Roope Astala
daf27a76e4 dockerfile 2019-06-12 10:23:34 -04:00
Roope Astala
a05444845b Merge pull request #426 from rastala/master
version 1.0.43
2019-06-12 10:09:08 -04:00
Roope Astala
79c9f50c15 version 1.0.43 2019-06-12 10:08:35 -04:00
Roope Astala
67e10e0f6b Merge pull request #417 from lan-tang/patch-1
Create readme.md in data-drift
2019-06-11 13:47:55 -04:00
Roope Astala
1ef0331a0f Merge pull request #423 from rastala/master
add sklearn estimator
2019-06-11 11:30:37 -04:00
Roope Astala
5e91c836b9 add sklearn estimator 2019-06-11 11:29:56 -04:00
Colin Versteeg
661762854a add warning to training 2019-06-10 16:51:33 -07:00
Colin Versteeg
fbc90ba74f add to quickstart 2019-06-10 16:50:59 -07:00
Colin Versteeg
0d9c83d0a8 Update accelerated-models-object-detection.ipynb 2019-06-10 16:48:17 -07:00
Colin Versteeg
ca4cab1de9 Merge pull request #1 from Azure/master
pull from master
2019-06-10 16:45:12 -07:00
Roope Astala
ddbb3c45f6 Merge pull request #420 from rastala/master
mlflow integration preview
2019-06-10 15:12:36 -04:00
rastala
8eed4e39d0 mlflow integration preview 2019-06-10 15:10:57 -04:00
Lan Tang
b37c0297db Create readme.md 2019-06-07 12:32:32 -07:00
Roope Astala
968cc798d0 Update README.md 2019-06-05 12:15:33 -04:00
Roope Astala
5c9ca452fb Create README.md 2019-06-05 12:15:19 -04:00
Shané Winner
5e82680272 Update README.md 2019-05-31 10:58:39 -07:00
Roope Astala
41841fc8c0 Update README.md 2019-05-31 13:00:41 -04:00
Roope Astala
896bf63736 Merge pull request #397 from rastala/master
dockerfile
2019-05-29 11:05:18 -04:00
Roope Astala
d4751bf6ec dockerfile 2019-05-29 11:04:19 -04:00
Roope Astala
3531fe8a21 Merge pull request #396 from rastala/master
version 1.0.41
2019-05-29 11:01:15 -04:00
Roope Astala
db6ae67940 version 1.0.41 2019-05-29 10:59:59 -04:00
Shané Winner
2a479bb01e Merge pull request #395 from imatiach-msft/ilmat/fix-typo
fix typo
2019-05-28 14:02:33 -07:00
Ilya Matiach
d05eec92af fix typo 2019-05-28 16:59:59 -04:00
Josée Martens
70fdab0a28 Update auto-ml-classification-with-deployment.ipynb 2019-05-24 13:45:04 -05:00
Josée Martens
7ce5a43b58 Update auto-ml-classification-with-deployment.ipynb 2019-05-24 13:44:35 -05:00
Josée Martens
d2a9dbb582 Update auto-ml-classification-with-deployment.ipynb 2019-05-24 13:43:38 -05:00
Roope Astala
a5d774683d Merge pull request #390 from rastala/master
fix default cluster creation in config notebook
2019-05-23 12:30:09 -04:00
Roope Astala
0e850f0917 fix default cluster creation in config notebook 2019-05-23 12:27:53 -04:00
Shané Winner
59f34b7179 Delete configtest.ipynb 2019-05-22 10:47:50 -07:00
Shané Winner
2a3cb69004 Create configtest.ipynb 2019-05-22 10:41:16 -07:00
Shané Winner
42894ff81a Delete LICENSE.txt 2019-05-22 10:22:05 -07:00
Shané Winner
2163cab50b Delete LICENSE.txt 2019-05-22 10:21:42 -07:00
Shané Winner
255edb04c0 Rename LICENSE.txt to LICENSE 2019-05-22 10:13:08 -07:00
Shané Winner
cfce079278 Rename LICENSES to LICENSE.txt 2019-05-22 10:06:31 -07:00
Shané Winner
ae6f067c81 Deleted index.html
cleaning up root directory
2019-05-22 10:04:23 -07:00
Shané Winner
1b7ff724f3 Deleted pr.md
Contents of this file moved to the README in the root directory.
2019-05-22 10:03:40 -07:00
Shané Winner
8bba850db1 moved the content in the pr.md file
moved the content in the pr.md file to under 'Projects using Azure Machine Learning'
2019-05-21 07:51:28 -07:00
Shané Winner
b9e35ea0cb Create LICENSE 2019-05-21 07:44:10 -07:00
Shané Winner
ffa28aa89c Delete sdk 2019-05-21 07:43:06 -07:00
Shané Winner
6ab85a20e3 Create LICENSES 2019-05-21 07:42:07 -07:00
Shané Winner
486c44d157 Create sdk 2019-05-21 07:39:43 -07:00
Shané Winner
cd80040dd8 Delete Licenses 2019-05-21 07:39:03 -07:00
Shané Winner
465a5b13b1 Create Licenses 2019-05-21 07:38:52 -07:00
Shané Winner
dcd2d58880 Added notice on the data/telemetry 2019-05-20 14:44:43 -07:00
Roope Astala
93bf4393f2 Merge pull request #381 from jeff-shepherd/master
Revert change to default amlcompute cluster
2019-05-16 15:35:43 -04:00
Jeff Shepherd
d6ebb484a6 Revert change to default amlcomputecluster to support existing resource
groups
2019-05-16 12:27:23 -07:00
Roope Astala
35afd43193 Merge pull request #372 from rogerhe/master
adding macOS specific yml. Install nomkl to workaround openmp issue
2019-05-14 19:07:42 -04:00
Roope Astala
2d68535de2 Merge pull request #376 from rastala/master
version 1.0.39
2019-05-14 16:04:09 -04:00
Roope Astala
0d448892a3 version check 2019-05-14 16:03:39 -04:00
Roope Astala
2d41c00488 version 1.0.39 2019-05-14 16:01:14 -04:00
Roger He
22597ac684 adding macOS specific yml. Install nomkl to workaround openmp issue 2019-05-09 16:51:51 -07:00
Josée Martens
8b1bffc200 Update README.md 2019-05-08 12:36:49 -05:00
Josée Martens
a240ac319f Update README.md 2019-05-08 12:27:57 -05:00
Josée Martens
83cfe3b9b3 Update README.md 2019-05-08 12:25:41 -05:00
Paula Ledgerwood
dcce6f227f Merge pull request #360 from Azure/paledger/update-readme
Update readme/cluster location from PM's instructions
2019-05-06 10:08:22 -07:00
Paula Ledgerwood
5328186d68 Update python kernel version 2019-05-06 09:45:20 -07:00
Paula Ledgerwood
7ccaa2cf57 Update readme from PM's instructions 2019-05-06 09:41:54 -07:00
Shané Winner
56b0664b6b Update img-classification-part1-training.ipynb 2019-05-05 17:47:31 -07:00
Shané Winner
4c1167edc4 Update img-classification-part1-training.ipynb 2019-05-05 17:45:48 -07:00
Shané Winner
eb643fe213 Update README.md 2019-05-05 17:26:29 -07:00
Shané Winner
5faa9d293c Update README.md 2019-05-05 15:34:27 -07:00
Shané Winner
32e2b5f647 Update train-hyperparameter-tune-deploy-with-tensorflow.ipynb 2019-05-05 15:32:19 -07:00
Shané Winner
ae25654882 Update train-hyperparameter-tune-deploy-with-pytorch.ipynb 2019-05-05 15:29:42 -07:00
Shané Winner
0ca05093bd Update train-hyperparameter-tune-deploy-with-keras.ipynb 2019-05-05 15:28:16 -07:00
Shané Winner
5e39582de3 Update train-hyperparameter-tune-deploy-with-chainer.ipynb 2019-05-05 15:24:14 -07:00
Shané Winner
6b6a6da9dc Update tensorboard.ipynb 2019-05-05 15:22:28 -07:00
Shané Winner
cba2c6b9e2 Update how-to-use-estimator.ipynb 2019-05-05 15:20:50 -07:00
Shané Winner
58557abd20 Update export-run-history-to-tensorboard.ipynb 2019-05-05 15:18:48 -07:00
Shané Winner
59452a3141 Update distributed-tensorflow-with-parameter-server.ipynb 2019-05-05 15:17:15 -07:00
Shané Winner
463718e26b Update distributed-tensorflow-with-horovod.ipynb 2019-05-05 15:15:13 -07:00
Shané Winner
9ea0ba5131 Update distributed-pytorch-with-horovod.ipynb 2019-05-05 15:13:28 -07:00
Shané Winner
2804a8d859 Update distributed-cntk-with-custom-docker.ipynb 2019-05-05 15:11:51 -07:00
Shané Winner
4761b668ff Update distributed-chainer.ipynb 2019-05-05 15:09:28 -07:00
Shané Winner
c4163017c2 Update using-environments.ipynb 2019-05-05 00:11:40 -07:00
Shané Winner
71e8e9bd23 Update train-within-notebook.ipynb 2019-05-05 00:09:26 -07:00
Shané Winner
6ff06dd137 Update train-on-remote-vm.ipynb 2019-05-05 00:06:23 -07:00
Shané Winner
73db8ae04d Update train-on-local.ipynb 2019-05-04 23:52:01 -07:00
Shané Winner
3637dce58a Update train-on-amlcompute.ipynb 2019-05-04 23:48:16 -07:00
Shané Winner
23771fc599 added tracking pixel and edited config text 2019-05-04 21:08:10 -07:00
Shané Winner
5f04a467b7 added tracking pixel 2019-05-04 21:03:08 -07:00
Shané Winner
532f65c998 added tracking pixel and edited config text 2019-05-04 20:59:50 -07:00
Shané Winner
f36dda0c2d added tracking pixel and edited the config text 2019-05-04 20:54:32 -07:00
Shané Winner
c7b56929bc added tracking pixel and edited config text 2019-05-04 20:50:57 -07:00
Shané Winner
5f19d75a42 added tracking pixel and edited the config text 2019-05-04 20:48:04 -07:00
Shané Winner
a1968aafa2 updated config text and added tracking pixel 2019-05-04 20:43:54 -07:00
Shané Winner
6b82991017 edited config text and added tracking pixel 2019-05-04 20:40:23 -07:00
Shané Winner
725013511e added tracking pixel 2019-05-04 20:34:58 -07:00
Shané Winner
6a20160173 added tracking pixel 2019-05-04 20:02:01 -07:00
Shané Winner
137db8aec0 added tracking pixel 2019-05-04 19:49:50 -07:00
Shané Winner
b7b10c394b added tracking pixel 2019-05-04 19:47:28 -07:00
Shané Winner
46206716a4 added tracking pixel 2019-05-04 19:44:23 -07:00
Shané Winner
92bb98ac62 added tracking pixel 2019-05-04 19:41:33 -07:00
Shané Winner
b398c24262 added tracking pixel 2019-05-04 19:38:28 -07:00
Shané Winner
e0618302e3 added tracking pixel 2019-05-04 19:35:57 -07:00
Shané Winner
b6cddafa3e edited config text and added the pixel tracker 2019-05-04 19:31:59 -07:00
Shané Winner
4188bd2474 updated the config text and added the tracking pixel 2019-05-04 19:25:26 -07:00
Shané Winner
69126edfcb update config text and added tracking pixel 2019-05-04 19:20:46 -07:00
Shané Winner
4e14c35b9b added pixel tracker 2019-05-04 16:31:07 -07:00
Shané Winner
1608c19aa6 updated tracking pixel and and config text 2019-05-04 15:12:53 -07:00
Shané Winner
46b8611b74 tracking pixel and edited config text 2019-05-04 15:08:57 -07:00
Shané Winner
fbb01bde70 update the config text and added pixel tracker server 2019-05-04 15:01:35 -07:00
Shané Winner
cefe2f0811 updated the config text and added the tracking pixel 2019-05-04 14:58:45 -07:00
Shané Winner
42e0a31f88 updated the config text and the tracking pixel 2019-05-04 14:54:37 -07:00
Shané Winner
8b0998ac9f updated the config text and the tracking pixel 2019-05-04 14:49:29 -07:00
Shané Winner
046c6051fb updated config text and added tracking pixel 2019-05-04 14:38:39 -07:00
Shané Winner
bdb7db15ef updated tracking pixel and the config text 2019-05-04 14:35:28 -07:00
Shané Winner
b13139f103 update the config text and the tracking pixel 2019-05-04 14:31:25 -07:00
Shané Winner
8adb206ae3 updated config text and pixel tracker 2019-05-04 13:56:09 -07:00
Shané Winner
484b6bbb7a updated the config text and pixel server 2019-05-04 13:51:12 -07:00
Shané Winner
55ef0bda6a updated config text 2019-05-04 13:46:43 -07:00
Shané Winner
1401cdef33 updated config text 2019-05-04 13:41:34 -07:00
Shané Winner
5d02206cbd updated with tracking pixel 2019-05-04 13:34:11 -07:00
Shané Winner
c24b65d4ae updated with tracking pixel 2019-05-04 13:32:14 -07:00
Shané Winner
57c5ef318f updated with pixel tracker 2019-05-04 13:25:11 -07:00
Shané Winner
ba033d72f8 Update train-in-spark.ipynb 2019-05-04 09:33:07 -07:00
Shané Winner
aa657ac528 Update manage-runs.ipynb 2019-05-04 09:29:00 -07:00
Shané Winner
7d8289679d added the tracking pixel and the edited the config text 2019-05-04 08:40:18 -07:00
Shané Winner
a7c3db0560 Update model-register-and-deploy.ipynb 2019-05-03 23:21:58 -07:00
Shané Winner
e548847881 pixel text and config text update 2019-05-03 23:20:57 -07:00
Shané Winner
08c6b1f4ed tracking pixel test 2019-05-03 23:15:28 -07:00
Shané Winner
78abb65f5e updated configuration text 2019-05-03 23:08:55 -07:00
Shané Winner
3c6c090732 Update README.md 2019-05-03 22:54:31 -07:00
Shané Winner
513e36d9b2 updated the config verbiage and tracking pixel 2019-05-03 22:54:02 -07:00
Ilya Matiach
9db91a7fb8 Merge pull request #351 from imatiach-msft/ilmat/update-raw-features-notebook
Update raw features explanation notebook
2019-05-03 12:47:28 -04:00
Roope Astala
d9b26b655b Merge pull request #356 from rastala/master
how to use environments
2019-05-03 10:27:33 -04:00
Roope Astala
cb8dc41766 how to use environments 2019-05-03 10:25:39 -04:00
Ilya Matiach
9c9b4bb122 Update raw features explanation notebook 2019-05-02 14:29:53 -04:00
Roope Astala
f5c896c70f Merge pull request #345 from csteegz/add-gpu-deploy
Create production-deploy-to-aks-gpu.ipynb
2019-05-02 14:13:50 -04:00
Colleen Forbes
3b572eddb2 Merge pull request #350 from MayMSFT/master
add dataset tutorial
2019-05-02 09:33:25 -07:00
May Hu
51523db294 add dataset tutorial 2019-05-02 09:07:11 -07:00
Ilya Matiach
3b4998941c Merge pull request #348 from imatiach-msft/ilmat/update-explain-model-nb
updating model explanation notebooks
2019-04-30 17:27:44 -04:00
Ilya Matiach
6cdbfb8722 updating model explanation notebooks 2019-04-30 17:12:54 -04:00
Colin Versteeg
c086bd69c7 Create production-deploy-to-aks-gpu.ipynb
Add deploy to aks GPU notebook
2019-04-29 16:26:42 -07:00
Shané Winner
279c9b8dc4 Pixel Tracker 2019-04-29 11:27:03 -07:00
Shané Winner
98589fe335 Testing Pixel Tracker 2019-04-29 11:16:08 -07:00
Shané Winner
77f21058a2 Testing Pixel Tracker 2019-04-29 11:04:05 -07:00
Roope Astala
baa65d0886 Merge pull request #343 from Azure/paledger/add-accel-models
Initial commit to add AccelModels notebooks from AzureMlCli repo
2019-04-29 13:56:06 -04:00
Paula Ledgerwood
0fffa11b2a Update links and code formatting 2019-04-29 10:20:55 -07:00
Paula Ledgerwood
20ec225343 Initial commit to add notebooks from AzureMlCli repo 2019-04-26 11:16:33 -07:00
Roope Astala
845e9d653e Merge pull request #342 from rastala/master
dockerfile 1.0.33
2019-04-26 14:01:55 -04:00
Roope Astala
639ef81636 dockerfile 1.0.33 2019-04-26 13:57:46 -04:00
Roope Astala
60158bf41a Merge pull request #341 from rastala/master
version 1.0.33
2019-04-26 13:45:47 -04:00
Roope Astala
8dbbb01b8a version 1.0.33 2019-04-26 13:44:15 -04:00
Roope Astala
6e6b2b0c48 Merge pull request #340 from rastala/master
add readme
2019-04-26 09:41:49 -04:00
Roope Astala
85f5721bf8 add readme 2019-04-26 09:40:24 -04:00
Shané Winner
6a7dd741e7 Pixel server added 2019-04-23 13:48:23 -07:00
Shané Winner
314218fc89 Added pixel server 2019-04-23 13:47:06 -07:00
Shané Winner
b50d2725c7 Added pixel server 2019-04-23 13:46:06 -07:00
Shané Winner
9a2f448792 Added pixel server 2019-04-23 13:45:05 -07:00
Shané Winner
dd620f19fd Pixel server added 2019-04-23 13:43:41 -07:00
Shané Winner
8116d31da4 Pixel Server added 2019-04-23 13:40:26 -07:00
Shané Winner
ef29dc1fa5 Added Pixel Server 2019-04-23 13:39:18 -07:00
Shané Winner
97b345cb33 Implemented Pixel Server 2019-04-23 13:37:41 -07:00
Shané Winner
282250e670 Implementing Pixel Server 2019-04-23 13:36:24 -07:00
Shané Winner
acef60c5b3 Testing pixel web app 2019-04-23 13:15:04 -07:00
Shané Winner
bfb444eb15 Testing Pixel Tracker 2019-04-23 13:07:48 -07:00
Shané Winner
6277659bf2 Testing Pixel Server 2019-04-23 11:48:55 -07:00
Shané Winner
1645e12712 Testing Tracking Pixel 2019-04-23 11:15:53 -07:00
Roope Astala
cc4a32e70b Merge pull request #337 from jeff-shepherd/master
Updated automl_setup scripts
2019-04-23 13:50:09 -04:00
Jeff Shepherd
997a35aed5 Updated automl_setup scripts 2019-04-23 10:40:33 -07:00
Roope Astala
dd6317a4a0 Merge pull request #336 from rastala/master
adding work-with-data
2019-04-23 10:05:08 -04:00
Roope Astala
82d8353d54 adding work-with-data 2019-04-23 10:04:32 -04:00
Shané Winner
59a01c17a0 Testing the pixel tracker 2019-04-22 14:45:09 -07:00
Shané Winner
e31e1d9af3 Implemented a test pixel tracker 2019-04-22 14:41:32 -07:00
Roope Astala
d38b9db255 Merge pull request #334 from rastala/master
docker update
2019-04-22 15:43:28 -04:00
Roope Astala
761ad88c93 docker update 2019-04-22 15:43:02 -04:00
Roope Astala
644729e5db Merge pull request #333 from rastala/master
version 1.0.30
2019-04-22 15:40:11 -04:00
Roope Astala
e2b1b3fcaa version 1.0.30 2019-04-22 15:39:18 -04:00
Roope Astala
dc692589a9 Merge pull request #326 from rastala/master
update aks notebook
2019-04-18 16:19:51 -04:00
Roope Astala
624b4595b5 update aks notebook 2019-04-18 16:18:33 -04:00
Roope Astala
0ed85c33c2 Delete release.json 2019-04-18 10:01:50 -04:00
Roope Astala
5b01de605f Merge pull request #318 from savitamittal1/hdinotebook
Sample HDI notebook
2019-04-18 10:01:26 -04:00
Savitam
c351ac988a Sample HDI notebook
sample HDI notebook
2019-04-15 12:35:34 -07:00
Josée Martens
759ec3934c Delete yt_cover.png 2019-04-15 12:06:25 -05:00
Josée Martens
b499b88a85 Delete python36.png 2019-04-15 12:06:16 -05:00
Josée Martens
5f4edac3c1 Update NBSETUP.md 2019-04-15 12:00:31 -05:00
Josée Martens
edfce0d936 Update README.md 2019-04-12 17:28:16 -05:00
Josée Martens
1516c7fc24 Update README.md
testing for search
2019-04-12 17:19:55 -05:00
Roope Astala
389fb668ce Add files via upload 2019-04-10 11:12:55 -04:00
Josée Martens
647d5e72a5 Merge pull request #307 from Azure/vizhur-patch-2
Create googled8147fb6c0788258.html
2019-04-09 15:21:51 -05:00
vizhur
43ac4c84bb Create googled8147fb6c0788258.html 2019-04-09 16:19:47 -04:00
Roope Astala
8a1a82b50a Merge pull request #303 from rastala/master
dockerfile and missing config update
2019-04-08 15:38:13 -04:00
Roope Astala
72f386298c dockerfile and missing config update 2019-04-08 15:37:48 -04:00
412 changed files with 60873 additions and 7586 deletions

.github/ISSUE_TEMPLATE/bug_report.md

@@ -0,0 +1,30 @@
---
name: Bug report
about: Create a report to help us improve
title: "[Notebook issue]"
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
Provide the following if applicable:
+ Your Python & SDK version
+ Python Scripts or the full notebook name
+ Pipeline definition
+ Environment definition
+ Example data
+ Any log files.
+ Run and Workspace Id
**To Reproduce**
Steps to reproduce the behavior:
1.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Additional context**
Add any other context about the problem here.


@@ -0,0 +1,43 @@
---
name: Notebook issue
about: Describe your notebook issue
title: "[Notebook] DESCRIPTIVE TITLE"
labels: notebook
assignees: ''
---
### DESCRIPTION: Describe clearly + concisely
.
### REPRODUCIBLE: Steps
.
### EXPECTATION: Clear description
.
### CONFIG/ENVIRONMENT:
```Provide where applicable
## Your Python & SDK version:
## Environment definition:
## Notebook name or Python scripts:
## Run and Workspace Id:
## Pipeline definition:
## Example data:
## Any log files:
```


@@ -0,0 +1,29 @@
FROM continuumio/miniconda:4.5.11
# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y git
# create a new conda environment named azureml
RUN conda create -n azureml -y -q Python=3.6
# install additional packages used by sample notebooks. this is optional
RUN ["/bin/bash", "-c", "source activate azureml && conda install -y tqdm cython matplotlib scikit-learn"]
# install azureml-sdk components
RUN ["/bin/bash", "-c", "source activate azureml && pip install azureml-sdk[notebooks]==1.0.23"]
# clone Azure ML GitHub sample notebooks
RUN cd /home && git clone -b "azureml-sdk-1.0.23" --single-branch https://github.com/Azure/MachineLearningNotebooks.git
# generate jupyter configuration file
RUN ["/bin/bash", "-c", "source activate azureml && mkdir ~/.jupyter && cd ~/.jupyter && jupyter notebook --generate-config"]
# set an empty token for Jupyter to remove authentication.
# this is NOT recommended for a production environment
RUN echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
# open up port 8887 on the container
EXPOSE 8887
# start Jupyter notebook server on port 8887 when the container starts
CMD /bin/bash -c "cd /home/MachineLearningNotebooks && source activate azureml && jupyter notebook --port 8887 --no-browser --ip 0.0.0.0 --allow-root"


@@ -0,0 +1,29 @@
FROM continuumio/miniconda:4.5.11
# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y git
# create a new conda environment named azureml
RUN conda create -n azureml -y -q Python=3.6
# install additional packages used by sample notebooks. this is optional
RUN ["/bin/bash", "-c", "source activate azureml && conda install -y tqdm cython matplotlib scikit-learn"]
# install azureml-sdk components
RUN ["/bin/bash", "-c", "source activate azureml && pip install azureml-sdk[notebooks]==1.0.30"]
# clone Azure ML GitHub sample notebooks
RUN cd /home && git clone -b "azureml-sdk-1.0.30" --single-branch https://github.com/Azure/MachineLearningNotebooks.git
# generate jupyter configuration file
RUN ["/bin/bash", "-c", "source activate azureml && mkdir ~/.jupyter && cd ~/.jupyter && jupyter notebook --generate-config"]
# set an empty token for Jupyter to remove authentication.
# this is NOT recommended for a production environment
RUN echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
# open up port 8887 on the container
EXPOSE 8887
# start Jupyter notebook server on port 8887 when the container starts
CMD /bin/bash -c "cd /home/MachineLearningNotebooks && source activate azureml && jupyter notebook --port 8887 --no-browser --ip 0.0.0.0 --allow-root"


@@ -0,0 +1,29 @@
FROM continuumio/miniconda:4.5.11
# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y git
# create a new conda environment named azureml
RUN conda create -n azureml -y -q Python=3.6
# install additional packages used by sample notebooks. this is optional
RUN ["/bin/bash", "-c", "source activate azureml && conda install -y tqdm cython matplotlib scikit-learn"]
# install azureml-sdk components
RUN ["/bin/bash", "-c", "source activate azureml && pip install azureml-sdk[notebooks]==1.0.33"]
# clone Azure ML GitHub sample notebooks
RUN cd /home && git clone -b "azureml-sdk-1.0.33" --single-branch https://github.com/Azure/MachineLearningNotebooks.git
# generate jupyter configuration file
RUN ["/bin/bash", "-c", "source activate azureml && mkdir ~/.jupyter && cd ~/.jupyter && jupyter notebook --generate-config"]
# set an empty token for Jupyter to remove authentication.
# this is NOT recommended for a production environment
RUN echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
# open up port 8887 on the container
EXPOSE 8887
# start Jupyter notebook server on port 8887 when the container starts
CMD /bin/bash -c "cd /home/MachineLearningNotebooks && source activate azureml && jupyter notebook --port 8887 --no-browser --ip 0.0.0.0 --allow-root"


@@ -0,0 +1,29 @@
FROM continuumio/miniconda:4.5.11
# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y git
# create a new conda environment named azureml
RUN conda create -n azureml -y -q Python=3.6
# install additional packages used by sample notebooks. this is optional
RUN ["/bin/bash", "-c", "source activate azureml && conda install -y tqdm cython matplotlib scikit-learn"]
# install azureml-sdk components
RUN ["/bin/bash", "-c", "source activate azureml && pip install azureml-sdk[notebooks]==1.0.41"]
# clone Azure ML GitHub sample notebooks
RUN cd /home && git clone -b "azureml-sdk-1.0.41" --single-branch https://github.com/Azure/MachineLearningNotebooks.git
# generate jupyter configuration file
RUN ["/bin/bash", "-c", "source activate azureml && mkdir ~/.jupyter && cd ~/.jupyter && jupyter notebook --generate-config"]
# set an empty token for Jupyter to remove authentication.
# this is NOT recommended for a production environment
RUN echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
# open up port 8887 on the container
EXPOSE 8887
# start Jupyter notebook server on port 8887 when the container starts
CMD /bin/bash -c "cd /home/MachineLearningNotebooks && source activate azureml && jupyter notebook --port 8887 --no-browser --ip 0.0.0.0 --allow-root"


@@ -0,0 +1,29 @@
FROM continuumio/miniconda:4.5.11
# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y git
# create a new conda environment named azureml
RUN conda create -n azureml -y -q Python=3.6
# install additional packages used by sample notebooks. this is optional
RUN ["/bin/bash", "-c", "source activate azureml && conda install -y tqdm cython matplotlib scikit-learn"]
# install azureml-sdk components
RUN ["/bin/bash", "-c", "source activate azureml && pip install azureml-sdk[notebooks]==1.0.43"]
# clone Azure ML GitHub sample notebooks
RUN cd /home && git clone -b "azureml-sdk-1.0.43" --single-branch https://github.com/Azure/MachineLearningNotebooks.git
# generate jupyter configuration file
RUN ["/bin/bash", "-c", "source activate azureml && mkdir ~/.jupyter && cd ~/.jupyter && jupyter notebook --generate-config"]
# set an empty token for Jupyter to remove authentication.
# this is NOT recommended for a production environment
RUN echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
# open up port 8887 on the container
EXPOSE 8887
# start Jupyter notebook server on port 8887 when the container starts
CMD /bin/bash -c "cd /home/MachineLearningNotebooks && source activate azureml && jupyter notebook --port 8887 --no-browser --ip 0.0.0.0 --allow-root"


@@ -1,3 +1,4 @@
This software is made available to you on the condition that you agree to
[your agreement][1] governing your use of Azure.
If you do not have an existing agreement governing your use of Azure, you agree that


@@ -1,6 +1,4 @@
-# Setting up environment
----
+# Set up your notebook environment for Azure Machine Learning
To run the notebooks in this repository use one of following options.
@@ -12,9 +10,7 @@ Azure Notebooks is a hosted Jupyter-based notebook service in the Azure cloud. A
1. Follow the instructions in the [Configuration](configuration.ipynb) notebook to create and connect to a workspace
1. Open one of the sample notebooks
-**Make sure the Azure Notebook kernel is set to `Python 3.6`** when you open a notebook
-![set kernel to Python 3.6](images/python36.png)
+**Make sure the Azure Notebook kernel is set to `Python 3.6`** when you open a notebook by choosing Kernel > Change Kernel > Python 3.6 from the menus.
## **Option 2: Use your own notebook server**
@@ -28,11 +24,8 @@ pip install azureml-sdk
git clone https://github.com/Azure/MachineLearningNotebooks.git
# below steps are optional
-# install the base SDK and a Jupyter notebook server
-pip install azureml-sdk[notebooks]
+# install the base SDK, Jupyter notebook server and tensorboard
+pip install azureml-sdk[notebooks,tensorboard]
-# install the data prep component
-pip install azureml-dataprep
# install model explainability component
pip install azureml-sdk[explain]
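An aside on the hunk above, which swaps the standalone azureml-dataprep install for SDK extras: a minimal sketch of checking which optional components ended up importable in an environment. The module names are assumptions mapped from the extras named above (notebooks -> azureml.widgets, automl -> azureml.train.automl), not taken from this diff.

```python
# Hedged sketch: probe which optional azureml-sdk components can be imported.
# The module names below are assumptions, not part of this repository.
import importlib

for name in ["azureml.core", "azureml.widgets", "azureml.train.automl"]:
    try:
        importlib.import_module(name)
        print(name, "-> available")
    except ImportError as err:
        print(name, "-> not installed:", err)
```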
@@ -58,8 +51,7 @@ Please make sure you start with the [Configuration](configuration.ipynb) noteboo
### Video walkthrough:
-[![Get Started video](images/yt_cover.png)](https://youtu.be/VIsXeTuW3FU)
+[!VIDEO https://youtu.be/VIsXeTuW3FU]
## **Option 3: Use Docker**
@@ -90,9 +82,6 @@ Now you can point your browser to http://localhost:8887. We recommend that you s
If you need additional Azure ML SDK components, you can either modify the Docker files before you build the Docker images to add additional steps, or install them through command line in the live container after you build the Docker image. For example:
```sh
-# install dataprep components
-pip install azureml-dataprep
# install the core SDK and automated ml components
pip install azureml-sdk[automl]


@@ -1,8 +1,17 @@
---
page_type: sample
languages:
- python
products:
- azure
- azure-machine-learning-service
description: "With Azure Machine Learning service, learn to prep data, train, test, deploy, manage, and track machine learning models in a cloud-based environment."
---
# Azure Machine Learning service example notebooks # Azure Machine Learning service example notebooks
This repository contains example notebooks demonstrating the [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/) Python SDK which allows you to build, train, deploy and manage machine learning solutions using Azure. The AML SDK allows you the choice of using local or cloud compute resources, while managing and maintaining the complete data science workflow from the cloud. This repository contains example notebooks demonstrating the [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/) Python SDK which allows you to build, train, deploy and manage machine learning solutions using Azure. The AML SDK allows you the choice of using local or cloud compute resources, while managing and maintaining the complete data science workflow from the cloud.
![Azure ML workflow](https://raw.githubusercontent.com/MicrosoftDocs/azure-docs/master/articles/machine-learning/service/media/overview-what-is-azure-ml/aml.png)
## Quick installation ## Quick installation
```sh ```sh
@@ -11,7 +20,7 @@ pip install azureml-sdk
Read more detailed instructions on [how to set up your environment](./NBSETUP.md) using Azure Notebook service, your own Jupyter notebook server, or Docker. Read more detailed instructions on [how to set up your environment](./NBSETUP.md) using Azure Notebook service, your own Jupyter notebook server, or Docker.
## How to navigate and use the example notebooks? ## How to navigate and use the example notebooks?
You should always run the [Configuration](./configuration.ipynb) notebook first when setting up a notebook library on a new machine or in a new environment. It configures your notebook library to connect to an Azure Machine Learning workspace, and sets up your workspace and compute to be used by many of the other examples. If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, you should always run the [Configuration](./configuration.ipynb) notebook first when setting up a notebook library on a new machine or in a new environment. It configures your notebook library to connect to an Azure Machine Learning workspace, and sets up your workspace and compute to be used by many of the other examples.
If you want to... If you want to...
@@ -20,7 +29,7 @@ If you want to...
* ...learn about experimentation and tracking run history, first [train within Notebook](./how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb), then try [training on remote VM](./how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb) and [using logging APIs](./how-to-use-azureml/training/logging-api/logging-api.ipynb). * ...learn about experimentation and tracking run history, first [train within Notebook](./how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb), then try [training on remote VM](./how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb) and [using logging APIs](./how-to-use-azureml/training/logging-api/logging-api.ipynb).
* ...train deep learning models at scale, first learn about [Machine Learning Compute](./how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb), and then try [distributed hyperparameter tuning](./how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) and [distributed training](./how-to-use-azureml/training-with-deep-learning/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.ipynb). * ...train deep learning models at scale, first learn about [Machine Learning Compute](./how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb), and then try [distributed hyperparameter tuning](./how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) and [distributed training](./how-to-use-azureml/training-with-deep-learning/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.ipynb).
* ...deploy models as a realtime scoring service, first learn the basics by [training within Notebook and deploying to Azure Container Instance](./how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb), then learn how to [register and manage models, and create Docker images](./how-to-use-azureml/deployment/register-model-create-image-deploy-service/register-model-create-image-deploy-service.ipynb), and [production deploy models on Azure Kubernetes Cluster](./how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb). * ...deploy models as a realtime scoring service, first learn the basics by [training within Notebook and deploying to Azure Container Instance](./how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb), then learn how to [register and manage models, and create Docker images](./how-to-use-azureml/deployment/register-model-create-image-deploy-service/register-model-create-image-deploy-service.ipynb), and [production deploy models on Azure Kubernetes Cluster](./how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb).
* ...deploy models as a batch scoring service, first [train a model within Notebook](./how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb), learn how to [register and manage models](./how-to-use-azureml/deployment/register-model-create-image-deploy-service/register-model-create-image-deploy-service.ipynb), then [create Machine Learning Compute for scoring compute](./how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb), and [use Machine Learning Pipelines to deploy your model](./how-to-use-azureml/machine-learning-pipelines/pipeline-mpi-batch-prediction.ipynb). * ...deploy models as a batch scoring service, first [train a model within Notebook](./how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb), learn how to [register and manage models](./how-to-use-azureml/deployment/register-model-create-image-deploy-service/register-model-create-image-deploy-service.ipynb), then [create Machine Learning Compute for scoring compute](./how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb), and [use Machine Learning Pipelines to deploy your model](https://aka.ms/pl-batch-scoring).
* ...monitor your deployed models, learn about using [App Insights](./how-to-use-azureml/deployment/enable-app-insights-in-production-service/enable-app-insights-in-production-service.ipynb) and [model data collection](./how-to-use-azureml/deployment/enable-data-collection-for-models-in-aks/enable-data-collection-for-models-in-aks.ipynb). * ...monitor your deployed models, learn about using [App Insights](./how-to-use-azureml/deployment/enable-app-insights-in-production-service/enable-app-insights-in-production-service.ipynb) and [model data collection](./how-to-use-azureml/deployment/enable-data-collection-for-models-in-aks/enable-data-collection-for-models-in-aks.ipynb).
## Tutorials ## Tutorials
@@ -38,6 +47,7 @@ The [How to use Azure ML](./how-to-use-azureml) folder contains specific example
- [Machine Learning Pipelines](./how-to-use-azureml/machine-learning-pipelines) - Examples showing how to create and use reusable pipelines for training and batch scoring - [Machine Learning Pipelines](./how-to-use-azureml/machine-learning-pipelines) - Examples showing how to create and use reusable pipelines for training and batch scoring
- [Deployment](./how-to-use-azureml/deployment) - Examples showing how to deploy and manage machine learning models and solutions - [Deployment](./how-to-use-azureml/deployment) - Examples showing how to deploy and manage machine learning models and solutions
- [Azure Databricks](./how-to-use-azureml/azure-databricks) - Examples showing how to use Azure ML with Azure Databricks - [Azure Databricks](./how-to-use-azureml/azure-databricks) - Examples showing how to use Azure ML with Azure Databricks
- [Monitor Models](./how-to-use-azureml/monitor-models) - Examples showing how to enable model monitoring services such as DataDrift
--- ---
## Documentation ## Documentation
@@ -52,5 +62,18 @@ The [How to use Azure ML](./how-to-use-azureml) folder contains specific example
Visit the following repos to see projects contributed by Azure ML users: Visit the following repos to see projects contributed by Azure ML users:
- [AMLSamples](https://github.com/Azure/AMLSamples) A number of end-to-end examples, including face recognition, predictive maintenance, customer churn and sentiment analysis.
- [Fine tune natural language processing models using Azure Machine Learning service](https://github.com/Microsoft/AzureML-BERT) - [Fine tune natural language processing models using Azure Machine Learning service](https://github.com/Microsoft/AzureML-BERT)
- [Fashion MNIST with Azure ML SDK](https://github.com/amynic/azureml-sdk-fashion) - [Fashion MNIST with Azure ML SDK](https://github.com/amynic/azureml-sdk-fashion)
## Data/Telemetry
This repository collects usage data and sends it to Microsoft to help improve our products and services. Read Microsoft's [privacy statement](https://privacy.microsoft.com/en-US/privacystatement) to learn more.
To opt out of tracking, please go to the raw markdown or .ipynb files and remove the following line of code:
```sh
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/README.png)"
```
This URL will be slightly different depending on the file.
![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/README.png)
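For example, a small script along these lines (an editorial sketch, not part of the repo) would strip every line containing the tracking pixel from the markdown and notebook files in a checkout. Note that this is a naive line filter; for .ipynb files, editing the JSON cells directly is more robust:
```python
# Sketch: remove lines containing the tracking pixel from .md and .ipynb files.
# Naive line filter; for .ipynb files, editing the JSON cells is safer.
from pathlib import Path

MARKER = "PixelServer20190423114238.azurewebsites.net"

for path in list(Path(".").rglob("*.md")) + list(Path(".").rglob("*.ipynb")):
    text = path.read_text(encoding="utf-8")
    kept = [line for line in text.splitlines() if MARKER not in line]
    cleaned = "\n".join(kept) + "\n"
    if cleaned != text:
        path.write_text(cleaned, encoding="utf-8")
```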
@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/configuration.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -51,7 +58,7 @@
"\n", "\n",
"### What is an Azure Machine Learning workspace\n", "### What is an Azure Machine Learning workspace\n",
"\n", "\n",
"An Azure ML Workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inferencing, and the monitoring of deployed models." "An Azure ML Workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models."
] ]
}, },
{ {
@@ -96,7 +103,7 @@
"source": [ "source": [
"import azureml.core\n", "import azureml.core\n",
"\n", "\n",
"print(\"This notebook was created using version 1.0.21 of the Azure ML SDK\")\n", "print(\"This notebook was created using version 1.0.57 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
] ]
}, },
@@ -251,7 +258,7 @@
"```shell\n", "```shell\n",
"az vm list-skus -o tsv\n", "az vm list-skus -o tsv\n",
"```\n", "```\n",
"* min_nodes - this sets the minimum size of the cluster. If you set the minimum to 0 the cluster will shut down all nodes while note in use. Setting this number to a value higher than 0 will allow for faster start-up times, but you will also be billed when the cluster is not in use.\n", "* min_nodes - this sets the minimum size of the cluster. If you set the minimum to 0 the cluster will shut down all nodes while not in use. Setting this number to a value higher than 0 will allow for faster start-up times, but you will also be billed when the cluster is not in use.\n",
"* max_nodes - this sets the maximum size of the cluster. Setting this to a larger number allows for more concurrency and a greater distributed processing of scale-out jobs.\n", "* max_nodes - this sets the maximum size of the cluster. Setting this to a larger number allows for more concurrency and a greater distributed processing of scale-out jobs.\n",
"\n", "\n",
"\n", "\n",
@@ -268,14 +275,14 @@
"from azureml.core.compute_target import ComputeTargetException\n", "from azureml.core.compute_target import ComputeTargetException\n",
"\n", "\n",
"# Choose a name for your CPU cluster\n", "# Choose a name for your CPU cluster\n",
"cpu_cluster_name = \"cpucluster\"\n", "cpu_cluster_name = \"cpu-cluster\"\n",
"\n", "\n",
"# Verify that cluster does not exist already\n", "# Verify that cluster does not exist already\n",
"try:\n", "try:\n",
" cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n", " cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
" print(\"Found existing cpucluster\")\n", " print(\"Found existing cpu-cluster\")\n",
"except ComputeTargetException:\n", "except ComputeTargetException:\n",
" print(\"Creating new cpucluster\")\n", " print(\"Creating new cpu-cluster\")\n",
" \n", " \n",
" # Specify the configuration for the new cluster\n", " # Specify the configuration for the new cluster\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_D2_V2\",\n", " compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_D2_V2\",\n",
@@ -306,14 +313,14 @@
"from azureml.core.compute_target import ComputeTargetException\n", "from azureml.core.compute_target import ComputeTargetException\n",
"\n", "\n",
"# Choose a name for your GPU cluster\n", "# Choose a name for your GPU cluster\n",
"gpu_cluster_name = \"gpucluster\"\n", "gpu_cluster_name = \"gpu-cluster\"\n",
"\n", "\n",
"# Verify that cluster does not exist already\n", "# Verify that cluster does not exist already\n",
"try:\n", "try:\n",
" gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n", " gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
" print(\"Found existing gpu cluster\")\n", " print(\"Found existing gpu cluster\")\n",
"except ComputeTargetException:\n", "except ComputeTargetException:\n",
" print(\"Creating new gpucluster\")\n", " print(\"Creating new gpu-cluster\")\n",
" \n", " \n",
" # Specify the configuration for the new cluster\n", " # Specify the configuration for the new cluster\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",\n", " compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",\n",
configuration.yml (new file)
@@ -0,0 +1,4 @@
name: configuration
dependencies:
- pip:
- azureml-sdk
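To materialize this environment, something like the following works (an editorial sketch; it assumes the `conda` executable is on PATH and the file is saved as `configuration.yml`):
```python
# Sketch: create the conda environment from configuration.yml via subprocess.
# Assumes conda is installed and on PATH.
import subprocess

subprocess.run(["conda", "env", "create", "-f", "configuration.yml"], check=True)
```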
@@ -287,6 +287,8 @@ Notice how the parameters are modified when using the CPU-only mode.
The outputs of the script can be observed in the master notebook as the script is executed The outputs of the script can be observed in the master notebook as the script is executed
![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/contrib/RAPIDS/README.png)
@@ -20,7 +20,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"The [RAPIDS](https://www.developer.nvidia.com/rapids) suite of software libraries from NVIDIA enables the execution of end-to-end data science and analytics pipelines entirely on GPUs. In many machine learning projects, a significant portion of the model training time is spent in setting up the data; this stage of the process is known as Extraction, Transformation and Loading, or ETL. By using the DataFrame API for ETL and GPU-capable ML algorithms in RAPIDS, data preparation and training models can be done in GPU-accelerated end-to-end pipelines without incurring serialization costs between the pipeline stages. This notebook demonstrates how to use NVIDIA RAPIDS to prepare data and train model in Azure.\n", "The [RAPIDS](https://www.developer.nvidia.com/rapids) suite of software libraries from NVIDIA enables the execution of end-to-end data science and analytics pipelines entirely on GPUs. In many machine learning projects, a significant portion of the model training time is spent in setting up the data; this stage of the process is known as Extraction, Transformation and Loading, or ETL. By using the DataFrame API for ETL\u00c3\u201a\u00c2\u00a0and GPU-capable ML algorithms in RAPIDS, data preparation and training models can be done in GPU-accelerated end-to-end pipelines without incurring serialization costs between the pipeline stages. This notebook demonstrates how to use NVIDIA RAPIDS to prepare data and train model\u00c2\u00a0in Azure.\n",
" \n", " \n",
"In this notebook, we will do the following:\n", "In this notebook, we will do the following:\n",
" \n", " \n",
@@ -8,7 +8,7 @@ As a pre-requisite, run the [configuration Notebook](../configuration.ipynb) not
* [train-on-local](./training/train-on-local): Learn how to submit a run to local computer and use Azure ML managed run configuration. * [train-on-local](./training/train-on-local): Learn how to submit a run to local computer and use Azure ML managed run configuration.
* [train-on-amlcompute](./training/train-on-amlcompute): Use a 1-n node Azure ML managed compute cluster for remote runs on Azure CPU or GPU infrastructure. * [train-on-amlcompute](./training/train-on-amlcompute): Use a 1-n node Azure ML managed compute cluster for remote runs on Azure CPU or GPU infrastructure.
* [train-on-remote-vm](./training/train-on-remote-vm): Use Data Science Virtual Machine as a target for remote runs. * [train-on-remote-vm](./training/train-on-remote-vm): Use Data Science Virtual Machine as a target for remote runs.
* [logging-api](./training/logging-api): Learn about the details of logging metrics to run history. * [logging-api](./track-and-monitor-experiments/logging-api): Learn about the details of logging metrics to run history.
* [register-model-create-image-deploy-service](./deployment/register-model-create-image-deploy-service): Learn about the details of model management. * [register-model-create-image-deploy-service](./deployment/register-model-create-image-deploy-service): Learn about the details of model management.
* [production-deploy-to-aks](./deployment/production-deploy-to-aks) Deploy a model to production at scale on Azure Kubernetes Service. * [production-deploy-to-aks](./deployment/production-deploy-to-aks) Deploy a model to production at scale on Azure Kubernetes Service.
* [enable-data-collection-for-models-in-aks](./deployment/enable-data-collection-for-models-in-aks) Learn about data collection APIs for deployed model. * [enable-data-collection-for-models-in-aks](./deployment/enable-data-collection-for-models-in-aks) Learn about data collection APIs for deployed model.
@@ -1,8 +1,8 @@
# Table of Contents # Table of Contents
1. [Automated ML Introduction](#introduction) 1. [Automated ML Introduction](#introduction)
1. [Running samples in Azure Notebooks](#jupyter) 1. [Setup using Azure Notebooks](#jupyter)
1. [Running samples in Azure Databricks](#databricks) 1. [Setup using Azure Databricks](#databricks)
1. [Running samples in a Local Conda environment](#localconda) 1. [Setup using a Local Conda environment](#localconda)
1. [Automated ML SDK Sample Notebooks](#samples) 1. [Automated ML SDK Sample Notebooks](#samples)
1. [Documentation](#documentation) 1. [Documentation](#documentation)
1. [Running using python command](#pythoncommand) 1. [Running using python command](#pythoncommand)
@@ -13,15 +13,15 @@
Automated machine learning (automated ML) builds high quality machine learning models for you by automating model and hyperparameter selection. Bring a labelled dataset that you want to build a model for, and automated ML will give you a high quality machine learning model that you can use for predictions. Automated machine learning (automated ML) builds high quality machine learning models for you by automating model and hyperparameter selection. Bring a labelled dataset that you want to build a model for, and automated ML will give you a high quality machine learning model that you can use for predictions.
If you are new to Data Science, AutoML will help you get jumpstarted by simplifying machine learning model building. It abstracts you from needing to perform model selection, hyperparameter selection and in one step creates a high quality trained model for you to use. If you are new to Data Science, automated ML will help you get jumpstarted by simplifying machine learning model building. It abstracts away the need to perform model selection and hyperparameter selection, and in one step creates a high quality trained model for you to use.
If you are an experienced data scientist, AutoML will help increase your productivity by intelligently performing the model and hyperparameter selection for your training and generates high quality models much quicker than manually specifying several combinations of the parameters and running training jobs. AutoML provides visibility and access to all the training jobs and the performance characteristics of the models to help you further tune the pipeline if you desire. If you are an experienced data scientist, automated ML will help increase your productivity by intelligently performing model and hyperparameter selection for your training, generating high quality models much more quickly than manually specifying several combinations of parameters and running training jobs. Automated ML provides visibility and access to all the training jobs and the performance characteristics of the models to help you further tune the pipeline if you desire.
Below are the three execution environments supported by AutoML. Below are the three execution environments supported by automated ML.
<a name="jupyter"></a> <a name="jupyter"></a>
## Running samples in Azure Notebooks - Jupyter based notebooks in the Azure cloud ## Setup using Azure Notebooks - Jupyter based notebooks in the Azure cloud
1. [![Azure Notebooks](https://notebooks.azure.com/launch.png)](https://aka.ms/aml-clone-azure-notebooks) 1. [![Azure Notebooks](https://notebooks.azure.com/launch.png)](https://aka.ms/aml-clone-azure-notebooks)
[Import sample notebooks ](https://aka.ms/aml-clone-azure-notebooks) into Azure Notebooks. [Import sample notebooks ](https://aka.ms/aml-clone-azure-notebooks) into Azure Notebooks.
@@ -29,7 +29,7 @@ Below are the three execution environments supported by AutoML.
1. Open one of the sample notebooks. 1. Open one of the sample notebooks.
<a name="databricks"></a> <a name="databricks"></a>
## Running samples in Azure Databricks ## Setup using Azure Databricks
**NOTE**: Please create your Azure Databricks cluster as v4.x (high concurrency preferred) with **Python 3** (dropdown). **NOTE**: Please create your Azure Databricks cluster as v4.x (high concurrency preferred) with **Python 3** (dropdown).
**NOTE**: You should at least have contributor access to your Azure subscription to run the notebook. **NOTE**: You should at least have contributor access to your Azure subscription to run the notebook.
@@ -39,7 +39,7 @@ Below are the three execution environments supported by AutoML.
- Attach the notebook to the cluster. - Attach the notebook to the cluster.
<a name="localconda"></a> <a name="localconda"></a>
## Running samples in a Local Conda environment ## Setup using a Local Conda environment
To run these notebooks on your own notebook server, use these installation instructions. To run these notebooks on your own notebook server, use these installation instructions.
The instructions below will install everything you need and then start a Jupyter notebook. The instructions below will install everything you need and then start a Jupyter notebook.
@@ -49,11 +49,15 @@ The instructions below will install everything you need and then start a Jupyter
There's no need to install mini-conda specifically. There's no need to install mini-conda specifically.
### 2. Downloading the sample notebooks ### 2. Downloading the sample notebooks
- Download the sample notebooks from [GitHub](https://github.com/Azure/MachineLearningNotebooks) as zip and extract the contents to a local directory. The AutoML sample notebooks are in the "automl" folder. - Download the sample notebooks from [GitHub](https://github.com/Azure/MachineLearningNotebooks) as zip and extract the contents to a local directory. The automated ML sample notebooks are in the "automated-machine-learning" folder.
### 3. Setup a new conda environment ### 3. Setup a new conda environment
The **automl/automl_setup** script creates a new conda environment, installs the necessary packages, configures the widget and starts a jupyter notebook. The **automl_setup** script creates a new conda environment, installs the necessary packages, configures the widget and starts a jupyter notebook. It takes the conda environment name as an optional parameter. The default conda environment name is azure_automl. The exact command depends on the operating system. See the specific sections below for Windows, Mac and Linux. It can take about 10 minutes to execute.
It takes the conda environment name as an optional parameter. The default conda environment name is azure_automl. The exact command depends on the operating system. See the specific sections below for Windows, Mac and Linux. It can take about 10 minutes to execute.
Packages installed by the **automl_setup** script:
<ul><li>python</li><li>nb_conda</li><li>matplotlib</li><li>numpy</li><li>cython</li><li>urllib3</li><li>scipy</li><li>scikit-learn</li><li>pandas</li><li>tensorflow</li><li>py-xgboost</li><li>azureml-sdk</li><li>azureml-widgets</li><li>pandas-ml</li></ul>
For more details refer to the [automl_env.yml](./automl_env.yml)
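After setup completes, a quick sanity check along these lines (an editorial sketch, not part of the setup script) confirms that the key packages from the list above import in the new environment:
```python
# Sanity-check imports for the azure_automl conda environment (a sketch).
import azureml.core
import numpy
import pandas
import sklearn

print("Azure ML SDK version:", azureml.core.VERSION)
```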
## Windows ## Windows
Start an **Anaconda Prompt** window, cd to the **how-to-use-azureml/automated-machine-learning** folder where the sample notebooks were extracted and then run: Start an **Anaconda Prompt** window, cd to the **how-to-use-azureml/automated-machine-learning** folder where the sample notebooks were extracted and then run:
``` ```
@@ -81,7 +85,7 @@ bash automl_setup_linux.sh
### 5. Running Samples ### 5. Running Samples
- Please make sure you use the Python [conda env:azure_automl] kernel when trying the sample Notebooks. - Please make sure you use the Python [conda env:azure_automl] kernel when trying the sample Notebooks.
- Follow the instructions in the individual notebooks to explore various features in AutoML - Follow the instructions in the individual notebooks to explore various features in automated ML.
### 6. Starting jupyter notebook manually ### 6. Starting jupyter notebook manually
To start your Jupyter notebook manually, use: To start your Jupyter notebook manually, use:
@@ -103,37 +107,22 @@ jupyter notebook
- [auto-ml-classification.ipynb](classification/auto-ml-classification.ipynb) - [auto-ml-classification.ipynb](classification/auto-ml-classification.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Simple example of using Auto ML for classification - Simple example of using automated ML for classification
- Uses local compute for training - Uses local compute for training
- [auto-ml-regression.ipynb](regression/auto-ml-regression.ipynb) - [auto-ml-regression.ipynb](regression/auto-ml-regression.ipynb)
- Dataset: scikit learn's [diabetes dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) - Dataset: scikit learn's [diabetes dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html)
- Simple example of using Auto ML for regression - Simple example of using automated ML for regression
- Uses local compute for training - Uses local compute for training
- [auto-ml-remote-execution.ipynb](remote-execution/auto-ml-remote-execution.ipynb) - [auto-ml-remote-amlcompute.ipynb](remote-amlcompute/auto-ml-remote-amlcompute.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Example of using Auto ML for classification using a remote linux DSVM for training
- Parallel execution of iterations
- Async tracking of progress
- Cancelling individual iterations or entire run
- Retrieving models for any iteration or logged metric
- Specify automl settings as kwargs
- [auto-ml-remote-amlcompute.ipynb](remote-batchai/auto-ml-remote-amlcompute.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Example of using automated ML for classification using remote AmlCompute for training - Example of using automated ML for classification using remote AmlCompute for training
- Parallel execution of iterations - Parallel execution of iterations
- Async tracking of progress - Async tracking of progress
- Cancelling individual iterations or entire run - Cancelling individual iterations or entire run
- Retrieving models for any iteration or logged metric - Retrieving models for any iteration or logged metric
- Specify automl settings as kwargs - Specify automated ML settings as kwargs
- [auto-ml-remote-attach.ipynb](remote-attach/auto-ml-remote-attach.ipynb)
- Dataset: Scikit learn's [20newsgroup](http://scikit-learn.org/stable/datasets/twenty_newsgroups.html)
- handling text data with preprocess flag
- Reading data from a blob store for remote executions
- using pandas dataframes for reading data
- [auto-ml-missing-data-blacklist-early-termination.ipynb](missing-data-blacklist-early-termination/auto-ml-missing-data-blacklist-early-termination.ipynb) - [auto-ml-missing-data-blacklist-early-termination.ipynb](missing-data-blacklist-early-termination/auto-ml-missing-data-blacklist-early-termination.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
@@ -148,17 +137,13 @@ jupyter notebook
- [auto-ml-exploring-previous-runs.ipynb](exploring-previous-runs/auto-ml-exploring-previous-runs.ipynb) - [auto-ml-exploring-previous-runs.ipynb](exploring-previous-runs/auto-ml-exploring-previous-runs.ipynb)
- List all projects for the workspace - List all projects for the workspace
- List all AutoML Runs for a given project - List all automated ML Runs for a given project
- Get details for a AutoML Run. (Automl settings, run widget & all metrics) - Get details for an automated ML Run. (automated ML settings, run widget & all metrics)
- Download fitted pipeline for any iteration - Download fitted pipeline for any iteration
- [auto-ml-remote-execution-with-datastore.ipynb](remote-execution-with-datastore/auto-ml-remote-execution-with-datastore.ipynb)
- Dataset: Scikit learn's [20newsgroup](http://scikit-learn.org/stable/datasets/twenty_newsgroups.html)
- Download the data and store it in DataStore.
- [auto-ml-classification-with-deployment.ipynb](classification-with-deployment/auto-ml-classification-with-deployment.ipynb) - [auto-ml-classification-with-deployment.ipynb](classification-with-deployment/auto-ml-classification-with-deployment.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Simple example of using Auto ML for classification - Simple example of using automated ML for classification
- Registering the model - Registering the model
- Creating Image and creating aci service - Creating Image and creating aci service
- Testing the aci service - Testing the aci service
@@ -170,30 +155,59 @@ jupyter notebook
- [auto-ml-subsampling-local.ipynb](subsampling/auto-ml-subsampling-local.ipynb) - [auto-ml-subsampling-local.ipynb](subsampling/auto-ml-subsampling-local.ipynb)
- How to enable subsampling - How to enable subsampling
- [auto-ml-dataprep.ipynb](dataprep/auto-ml-dataprep.ipynb) - [auto-ml-dataset.ipynb](dataprep/auto-ml-dataset.ipynb)
- Using DataPrep for reading data - Using Dataset for reading data
- [auto-ml-dataprep-remote-execution.ipynb](dataprep-remote-execution/auto-ml-dataprep-remote-execution.ipynb) - [auto-ml-dataset-remote-execution.ipynb](dataprep-remote-execution/auto-ml-dataset-remote-execution.ipynb)
- Using DataPrep for reading data with remote execution - Using Dataset for reading data with remote execution
- [auto-ml-classification-with-whitelisting.ipynb](classification-with-whitelisting/auto-ml-classification-with-whitelisting.ipynb) - [auto-ml-classification-with-whitelisting.ipynb](classification-with-whitelisting/auto-ml-classification-with-whitelisting.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Simple example of using Auto ML for classification with whitelisting tensorflow models. - Simple example of using automated ML for classification with whitelisting tensorflow models.
- Uses local compute for training - Uses local compute for training
- [auto-ml-forecasting-energy-demand.ipynb](forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb) - [auto-ml-forecasting-energy-demand.ipynb](forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb)
- Dataset: [NYC energy demand data](forecasting-a/nyc_energy.csv) - Dataset: [NYC energy demand data](forecasting-a/nyc_energy.csv)
- Example of using AutoML for training a forecasting model - Example of using automated ML for training a forecasting model
- [auto-ml-forecasting-orange-juice-sales.ipynb](forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb) - [auto-ml-forecasting-orange-juice-sales.ipynb](forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb)
- Dataset: [Dominick's grocery sales of orange juice](forecasting-b/dominicks_OJ.csv) - Dataset: [Dominick's grocery sales of orange juice](forecasting-b/dominicks_OJ.csv)
- Example of training an AutoML forecasting model on multiple time-series - Example of training an automated ML forecasting model on multiple time-series
- [auto-ml-classification-with-onnx.ipynb](classification-with-onnx/auto-ml-classification-with-onnx.ipynb) - [auto-ml-classification-with-onnx.ipynb](classification-with-onnx/auto-ml-classification-with-onnx.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [iris dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html)
- Simple example of using Auto ML for classification with ONNX models - Simple example of using automated ML for classification with ONNX models
- Uses local compute for training - Uses local compute for training
- [auto-ml-remote-amlcompute-with-onnx.ipynb](remote-amlcompute-with-onnx/auto-ml-remote-amlcompute-with-onnx.ipynb)
- Dataset: scikit learn's [iris dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html)
- Example of using automated ML for classification using remote AmlCompute for training
- Train the models with ONNX compatible config on
- Parallel execution of iterations
- Async tracking of progress
- Cancelling individual iterations or entire run
- Retrieving the ONNX models and do the inference with them
- [auto-ml-bank-marketing-subscribers-with-deployment.ipynb](bank-marketing-subscribers-with-deployment/auto-ml-bank-marketing-with-deployment.ipynb)
- Dataset: UCI's [bank marketing dataset](https://www.kaggle.com/janiobachmann/bank-marketing-dataset)
- Simple example of using automated ML for classification to predict term deposit subscriptions for a bank
- Uses azure compute for training
- [auto-ml-creditcard-with-deployment.ipynb](credit-card-fraud-detection-with-deployment/auto-ml-creditcard-with-deployment.ipynb)
- Dataset: Kaggle's [credit card fraud detection dataset](https://www.kaggle.com/mlg-ulb/creditcardfraud)
- Simple example of using automated ML for classification to detect fraudulent credit card transactions
- Uses azure compute for training
- [auto-ml-hardware-performance-with-deployment.ipynb](hardware-performance-prediction-with-deployment/auto-ml-hardware-performance-with-deployment.ipynb)
- Dataset: UCI's [computer hardware dataset](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware)
- Simple example of using automated ML for regression to predict the performance of certain combinations of hardware components
- Uses azure compute for training
- [auto-ml-concrete-strength-with-deployment.ipynb](predicting-concrete-strength-with-deployment/auto-ml-concrete-strength-with-deployment.ipynb)
- Dataset: UCI's [concrete compressive strength dataset](https://www.kaggle.com/pavanraj159/concrete-compressive-strength-data-set)
- Simple example of using automated ML for regression to predict the compressive strength of concrete based on different ingredient combinations and quantities of those ingredients
- Uses azure compute for training
<a name="documentation"></a> <a name="documentation"></a>
See [Configure automated machine learning experiments](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train) to learn more about the settings and features available for automated machine learning experiments. See [Configure automated machine learning experiments](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train) to learn more about the settings and features available for automated machine learning experiments.
@@ -211,10 +225,18 @@ The main code of the file must be indented so that it is under this condition.
<a name="troubleshooting"></a> <a name="troubleshooting"></a>
# Troubleshooting # Troubleshooting
## automl_setup fails ## automl_setup fails
1. On windows, make sure that you are running automl_setup from an Anaconda Prompt window rather than a regular cmd window. You can launch the "Anaconda Prompt" window by hitting the Start button and typing "Anaconda Prompt". If you don't see the application "Anaconda Prompt", you might not have conda or Miniconda installed. In that case, you can install it [here](https://conda.io/miniconda.html) 1. On Windows, make sure that you are running automl_setup from an Anaconda Prompt window rather than a regular cmd window. You can launch the "Anaconda Prompt" window by hitting the Start button and typing "Anaconda Prompt". If you don't see the application "Anaconda Prompt", you might not have conda or Miniconda installed. In that case, you can install it [here](https://conda.io/miniconda.html)
2. Check that you have conda 64-bit installed rather than 32-bit. You can check this with the command `conda info`. The `platform` should be `win-64` for Windows or `osx-64` for Mac. 2. Check that you have conda 64-bit installed rather than 32-bit. You can check this with the command `conda info`. The `platform` should be `win-64` for Windows or `osx-64` for Mac.
3. Check that you have conda 4.4.10 or later. You can check the version with the command `conda -V`. If you have a previous version installed, you can update it using the command: `conda update conda`. 3. Check that you have conda 4.4.10 or later. You can check the version with the command `conda -V`. If you have a previous version installed, you can update it using the command: `conda update conda`.
4. Pass a new name as the first parameter to automl_setup so that it creates a new conda environment. You can view existing conda environments using `conda env list` and remove them with `conda env remove -n <environmentname>`. 4. On Linux, if the error is `gcc: error trying to exec 'cc1plus': execvp: No such file or directory`, install build essentials using the command `sudo apt-get install build-essential`.
5. Pass a new name as the first parameter to automl_setup so that it creates a new conda environment. You can view existing conda environments using `conda env list` and remove them with `conda env remove -n <environmentname>`.
## automl_setup_linux.sh fails
If automl_setup_linux.sh fails on Ubuntu Linux with the error: `unable to execute 'gcc': No such file or directory`
1. Make sure that outbound ports 53 and 80 are enabled. On an Azure VM, you can do this from the Azure Portal by selecting the VM and clicking on Networking.
2. Run the command: `sudo apt-get update`
3. Run the command: `sudo apt-get install build-essential --fix-missing`
4. Run `automl_setup_linux.sh` again.
## configuration.ipynb fails ## configuration.ipynb fails
1) For local conda, make sure that you have successfully run automl_setup first. 1) For local conda, make sure that you have successfully run automl_setup first.
@@ -251,7 +273,7 @@ There are several reasons why the DsvmCompute.create can fail. The reason is us
2) `The requested VM size xxxxx is not available in the current region.` You can select a different region or vm_size. 2) `The requested VM size xxxxx is not available in the current region.` You can select a different region or vm_size.
## Remote run: Unable to establish SSH connection ## Remote run: Unable to establish SSH connection
AutoML uses the SSH protocol to communicate with remote DSVMs. This defaults to port 22. Possible causes for this error are: Automated ML uses the SSH protocol to communicate with remote DSVMs. This defaults to port 22. Possible causes for this error are:
1) The DSVM is not ready for SSH connections. When DSVM creation completes, the DSVM might still not be ready to accept SSH connections. The sample notebooks have a one minute delay to allow for this. 1) The DSVM is not ready for SSH connections. When DSVM creation completes, the DSVM might still not be ready to accept SSH connections. The sample notebooks have a one minute delay to allow for this.
2) Your Azure Subscription may restrict the IP address ranges that can access the DSVM on port 22. You can check this in the Azure Portal by selecting the Virtual Machine and then clicking Networking. The Virtual Machine name is the name that you provided in the notebook plus 10 alpha numeric characters to make the name unique. The Inbound Port Rules define what can access the VM on specific ports. Note that there is a priority order. So, a Deny entry with a low priority number will override an Allow entry with a higher priority number. 2) Your Azure Subscription may restrict the IP address ranges that can access the DSVM on port 22. You can check this in the Azure Portal by selecting the Virtual Machine and then clicking Networking. The Virtual Machine name is the name that you provided in the notebook plus 10 alpha numeric characters to make the name unique. The Inbound Port Rules define what can access the VM on specific ports. Note that there is a priority order. So, a Deny entry with a low priority number will override an Allow entry with a higher priority number.
@@ -262,13 +284,13 @@ This is often an issue with the `get_data` method.
3) You can get to the error log for the setup iteration by clicking the `Click here to see the run in Azure portal` link, click `Back to Experiment`, click on the highest run number and then click on Logs. 3) You can get to the error log for the setup iteration by clicking the `Click here to see the run in Azure portal` link, click `Back to Experiment`, click on the highest run number and then click on Logs.
## Remote run: disk full ## Remote run: disk full
AutoML creates files under /tmp/azureml_runs for each iteration that it runs. It creates a folder with the iteration id. For example: AutoML_9a038a18-77cc-48f1-80fb-65abdbc33abe_93. Under this, there is a azureml-logs folder, which contains logs. If you run too many iterations on the same DSVM, these files can fill the disk. Automated ML creates files under /tmp/azureml_runs for each iteration that it runs. It creates a folder with the iteration id. For example: AutoML_9a038a18-77cc-48f1-80fb-65abdbc33abe_93. Under this, there is a azureml-logs folder, which contains logs. If you run too many iterations on the same DSVM, these files can fill the disk.
You can delete the files under /tmp/azureml_runs or just delete the VM and create a new one. You can delete the files under /tmp/azureml_runs or just delete the VM and create a new one.
If your get_data downloads files, make sure to delete them, or they can use disk space as well. If your get_data downloads files, make sure to delete them, or they can use disk space as well.
When using DataStore, it is good to specify an absolute path for the files so that they are downloaded just once. If you specify a relative path, it will download a file for each iteration. When using DataStore, it is good to specify an absolute path for the files so that they are downloaded just once. If you specify a relative path, it will download a file for each iteration.
## Remote run: Iterations fail and the log contains "MemoryError" ## Remote run: Iterations fail and the log contains "MemoryError"
This can be caused by insufficient memory on the DSVM. AutoML loads all training data into memory. So, the available memory should be more than the training data size. This can be caused by insufficient memory on the DSVM. Automated ML loads all training data into memory. So, the available memory should be more than the training data size.
If you are using a remote DSVM, memory is needed for each concurrent iteration. The max_concurrent_iterations setting specifies the maximum concurrent iterations. For example, if the training data size is 8Gb and max_concurrent_iterations is set to 10, the minimum memory required is at least 80Gb. If you are using a remote DSVM, memory is needed for each concurrent iteration. The max_concurrent_iterations setting specifies the maximum concurrent iterations. For example, if the training data size is 8Gb and max_concurrent_iterations is set to 10, the minimum memory required is at least 80Gb.
To resolve this issue, allocate a DSVM with more memory or reduce the value specified for max_concurrent_iterations. To resolve this issue, allocate a DSVM with more memory or reduce the value specified for max_concurrent_iterations.
@@ -2,21 +2,24 @@ name: azure_automl
dependencies: dependencies:
# The python interpreter version. # The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later. # Currently Azure ML only supports 3.5.2 and later.
- pip
- python>=3.5.2,<3.6.8 - python>=3.5.2,<3.6.8
- nb_conda - nb_conda
- matplotlib==2.1.0 - matplotlib==2.1.0
- numpy>=1.11.0,<1.15.0 - numpy>=1.11.0,<=1.16.2
- cython - cython
- urllib3<1.24 - urllib3<1.24
- scipy>=1.0.0,<=1.1.0 - scipy>=1.0.0,<=1.1.0
- scikit-learn>=0.18.0,<=0.19.1 - scikit-learn>=0.19.0,<=0.20.3
- pandas>=0.22.0,<0.23.0 - pandas>=0.22.0,<=0.23.4
- tensorflow>=1.12.0
- py-xgboost<=0.80 - py-xgboost<=0.80
- pyarrow>=0.11.0
- pip: - pip:
# Required packages for AzureML execution, history, and data preparation. # Required packages for AzureML execution, history, and data preparation.
- azureml-sdk[automl,explain] - azureml-defaults
- azureml-train-automl
- azureml-widgets - azureml-widgets
- azureml-explain-model
- pandas_ml - pandas_ml
@@ -2,22 +2,25 @@ name: azure_automl
dependencies: dependencies:
# The python interpreter version. # The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later. # Currently Azure ML only supports 3.5.2 and later.
- pip
- nomkl
- python>=3.5.2,<3.6.8 - python>=3.5.2,<3.6.8
- nb_conda - nb_conda
- matplotlib==2.1.0 - matplotlib==2.1.0
- numpy>=1.15.3 - numpy>=1.11.0,<=1.16.2
- cython - cython
- urllib3<1.24 - urllib3<1.24
- scipy>=1.0.0,<=1.1.0 - scipy>=1.0.0,<=1.1.0
- scikit-learn>=0.18.0,<=0.19.1 - scikit-learn>=0.19.0,<=0.20.3
- pandas>=0.22.0,<0.23.0 - pandas>=0.22.0,<0.23.0
- tensorflow>=1.12.0
- py-xgboost<=0.80 - py-xgboost<=0.80
- pyarrow>=0.11.0
- pip: - pip:
# Required packages for AzureML execution, history, and data preparation. # Required packages for AzureML execution, history, and data preparation.
- azureml-sdk[automl,explain] - azureml-defaults
- azureml-train-automl
- azureml-widgets - azureml-widgets
- azureml-explain-model
- pandas_ml - pandas_ml
@@ -9,6 +9,8 @@ IF "%automl_env_file%"=="" SET automl_env_file="automl_env.yml"
IF NOT EXIST %automl_env_file% GOTO YmlMissing IF NOT EXIST %automl_env_file% GOTO YmlMissing
IF "%CONDA_EXE%"=="" GOTO CondaMissing
call conda activate %conda_env_name% 2>nul: call conda activate %conda_env_name% 2>nul:
if not errorlevel 1 ( if not errorlevel 1 (
@@ -42,6 +44,15 @@ IF NOT "%options%"=="nolaunch" (
goto End goto End
:CondaMissing
echo Please run this script from an Anaconda Prompt window.
echo You can start an Anaconda Prompt window by
echo typing Anaconda Prompt on the Start menu.
echo If you don't see the Anaconda Prompt app, install Miniconda.
echo If you are running an older version of Miniconda or Anaconda,
echo you can upgrade using the command: conda update conda
goto End
:YmlMissing :YmlMissing
echo File %automl_env_file% not found. echo File %automl_env_file% not found.
@@ -31,7 +31,6 @@ else
conda install lightgbm -c conda-forge -y && conda install lightgbm -c conda-forge -y &&
python -m ipykernel install --user --name $CONDA_ENV_NAME --display-name "Python ($CONDA_ENV_NAME)" && python -m ipykernel install --user --name $CONDA_ENV_NAME --display-name "Python ($CONDA_ENV_NAME)" &&
jupyter nbextension uninstall --user --py azureml.widgets && jupyter nbextension uninstall --user --py azureml.widgets &&
pip install numpy==1.15.3 &&
echo "" && echo "" &&
echo "" && echo "" &&
echo "***************************************" && echo "***************************************" &&
@@ -0,0 +1,718 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Classification with Deployment using a Bank Marketing Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Deploy](#Deploy)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"\n",
"In this example we use the UCI Bank Marketing dataset to showcase how you can use AutoML for a classification problem and deploy it to an Azure Container Instance (ACI). The classification goal is to predict if the client will subscribe to a term deposit with the bank.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an experiment using an existing workspace.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Register the model.\n",
"6. Create a container image.\n",
"7. Create an Azure Container Instance (ACI) service.\n",
"8. Test the ACI service."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import pandas as pd\n",
"import os\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for experiment\n",
"experiment_name = 'automl-classification-bmarketing'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-classification-bankmarketing'\n",
"\n",
"experiment=Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
"print('Checking cluster status...')\n",
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
"compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
"# For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data() script to be utilized in azure compute. To do this first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_Run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"\n",
"cd = CondaDependencies.create(conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here we create the script to be run in azure comput for loading the data, we load the bank marketing dataset into X_train and y_train. Next X_train and y_train is returned for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\"\n",
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
"X_train = dataset.drop_columns(columns=['y'])\n",
"y_train = dataset.keep_columns(columns=['y'], validate=True)\n",
"dataset.take(5).to_pandas_dataframe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], Multi-class targets.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 2,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy\n",
"\n",
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method on `automl_classifier` returns the best run and the fitted model for the last invocation. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()"
]
},
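{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is an editorial sketch of those overloads (argument names assumed from the SDK, so treat it as illustrative rather than canonical):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative get_output overloads (assumed SDK argument names); uncomment to try.\n",
"# best_run_by_metric, model_by_metric = remote_run.get_output(metric = 'AUC_weighted')\n",
"# run_three, model_three = remote_run.get_output(iteration = 3)"
]
},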
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register the Fitted Model for Deployment\n",
"If neither `metric` nor `iteration` are specified in the `register_model` call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model trained on bank marketing data to predict if a client will subscribe to a term deposit'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = np.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
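{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick local sanity check of the `run()` contract, something along these lines can be used once `init()` has loaded a model; the payload shape below is hypothetical and must match the training feature columns:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical smoke test for score.py's run(); requires init() to have loaded the model.\n",
"# import json\n",
"# sample = json.dumps({'data': [[0] * n_features]})  # n_features must match the training data\n",
"# print(run(sample))"
]
},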
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost<=0.80'],\n",
" pip_packages=['azureml-train-automl'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-train-automl']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
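{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check, re-read the two files and confirm the substitutions took effect. This is just a convenience check, not a required deployment step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The environment file should now pin the training SDK version,\n",
"# and score.py should reference the registered model id instead of '<<modelid>>'.\n",
"with open(conda_env_file_name, 'r') as f:\n",
"    print(f.read())\n",
"\n",
"with open(script_file_name, 'r') as f:\n",
"    assert '<<modelid>>' not in f.read()"
]
},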
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"bmData\", 'type': \"automl_classification\"},\n",
" description = \"Image for automl classification sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"bmData\", 'type': \"automl_classification\"}, \n",
" description = 'sample service for Automl Classification')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-bankmarketing'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"Now that the model is trained split our data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the bank marketing datasets.\n",
"from numpy import array"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv\"\n",
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
"X_test = dataset.drop_columns(columns=['y'])\n",
"y_test = dataset.keep_columns(columns=['y'], validate=True)\n",
"dataset.take(5).to_pandas_dataframe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()"
]
},
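{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, exercise the deployed ACI web service with a couple of test rows. This is a hedged sketch: it assumes `aci_service` is still deployed and healthy, and that the service parses a JSON body of the form `{\"data\": [...]}`, matching the `run` function in `score.py` above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Send two test rows to the deployed service; the payload shape matches\n",
"# what the run() function in score.py expects.\n",
"sample = json.dumps({'data': X_test[:2].values.tolist()})\n",
"response = aci_service.run(input_data = sample)\n",
"print(response)"
]
},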
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = fitted_model.predict(X_test)\n",
"actual = array(y_test)\n",
"actual = actual[:,0]\n",
"print(y_pred.shape, \" \", actual.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib notebook\n",
"test_pred = plt.scatter(actual, y_pred, color='b')\n",
"test_test = plt.scatter(actual, actual, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
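{
"cell_type": "markdown",
"metadata": {},
"source": [
"Beyond the scatter plot, a couple of standard scikit-learn metrics give a more quantitative summary. This is a minimal sketch that assumes `y_pred` and `actual` from the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, confusion_matrix\n",
"\n",
"# Quantitative summary of the predictions computed above.\n",
"print('Accuracy:', accuracy_score(actual, y_pred))\n",
"print('Confusion matrix:')\n",
"print(confusion_matrix(actual, y_pred))"
]
},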
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This Bank Marketing dataset is made available under the Creative Commons (CCO: Public Domain) License: https://creativecommons.org/publicdomain/zero/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: https://creativecommons.org/publicdomain/zero/1.0/ and is available at: https://www.kaggle.com/janiobachmann/bank-marketing-dataset .\n",
"\n",
"_**Acknowledgements**_\n",
"This data set is originally available within the UCI Machine Learning Database: https://archive.ics.uci.edu/ml/datasets/bank+marketing\n",
"\n",
"[Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014"
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,10 @@
name: auto-ml-classification-bank-marketing
dependencies:
- pip:
- azureml-sdk
- azureml-defaults
- azureml-explain-model
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,709 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Classification with Deployment using Credit Card Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Deploy](#Deploy)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"\n",
"In this example we use the associated credit card dataset to showcase how you can use AutoML for a simple classification problem and deploy it to an Azure Container Instance (ACI). The classification goal is to predict if a creditcard transaction is or is not considered a fraudulent charge.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an experiment using an existing workspace.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Register the model.\n",
"6. Create a container image.\n",
"7. Create an Azure Container Instance (ACI) service.\n",
"8. Test the ACI service."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import pandas as pd\n",
"import os\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for experiment\n",
"experiment_name = 'automl-classification-ccard'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-classification-creditcard'\n",
"\n",
"experiment=Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
"print('Checking cluster status...')\n",
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
"compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
"\n",
"# For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data script to be utilized in azure compute. To do this, first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"\n",
"cd = CondaDependencies.create(conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here create the script to be run in azure compute for loading the data, load the credit card dataset into cards and store the Class column (y) in the y variable and store the remaining data in the x variable. Next split the data using random_split and return X_train and y_train for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
"X = dataset.drop_columns(columns=['Class'])\n",
"y = dataset.keep_columns(columns=['Class'], validate=True)\n",
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
"y_train, y_test = y.random_split(percentage=0.8, seed=223)"
]
},
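{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally preview a few rows of the loaded tabular dataset as a quick sanity check (mirroring the preview step used in the bank marketing notebook above)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at the first few rows of the dataset.\n",
"dataset.take(5).to_pandas_dataframe()"
]
},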
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], Multi-class targets.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### If you would like to see even better results increase \"iteration_time_out minutes\" to 10+ mins and increase \"iterations\" to a minimum of 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 2,\n",
" \"primary_metric\": 'average_precision_score_weighted',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors_20190417.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy\n",
"\n",
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method on `automl_classifier` returns the best run and the fitted model for the last invocation. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register the Fitted Model for Deployment\n",
"If neither `metric` nor `iteration` are specified in the `register_model` call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost<=0.80'],\n",
" pip_packages=['azureml-train-automl'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-train-automl']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"cards\", 'type': \"automl_classification\"},\n",
" description = \"Image for automl classification sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"cards\", 'type': \"automl_classification\"}, \n",
" description = 'sample service for Automl Classification')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-creditcard'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Randomly select and test\n",
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = fitted_model.predict(X_test)\n",
"y_pred"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Randomly select and test\n",
"# Plot outputs\n",
"%matplotlib notebook\n",
"test_pred = plt.scatter(y_test, y_pred, color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()\n",
"\n"
]
},
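{
"cell_type": "markdown",
"metadata": {},
"source": [
"For an imbalanced problem like fraud detection, a scatter plot and raw accuracy are not very informative; per-class precision and recall say more about the minority (fraud) class. Below is a minimal sketch using scikit-learn, assuming `y_test` and `y_pred` from the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# Per-class precision/recall; the fraud class is the minority class.\n",
"y_true = y_test.values[:, 0]\n",
"print(confusion_matrix(y_true, y_pred))\n",
"print(classification_report(y_true, y_pred))"
]
},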
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
"\n",
"\n",
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
"Please cite the following works: \n",
"\u00e2\u20ac\u00a2\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
"o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,10 @@
name: auto-ml-classification-credit-card-fraud
dependencies:
- pip:
- azureml-sdk
- azureml-defaults
- azureml-explain-model
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-with-deployment/auto-ml-classification-with-deployment.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -290,7 +297,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"for p in ['azureml-train-automl', 'azureml-sdk', 'azureml-core']:\n", "for p in ['azureml-train-automl', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))" " print('{}\\t{}'.format(p, dependencies[p]))"
] ]
}, },
@@ -302,7 +309,8 @@
"source": [ "source": [
"from azureml.core.conda_dependencies import CondaDependencies\n", "from azureml.core.conda_dependencies import CondaDependencies\n",
"\n", "\n",
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'], pip_packages=['azureml-sdk[automl]'])\n", "myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost<=0.80'],\n",
" pip_packages=['azureml-train-automl'])\n",
"\n", "\n",
"conda_env_file_name = 'myenv.yml'\n", "conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)" "myenv.save_to_file('.', conda_env_file_name)"
@@ -322,7 +330,7 @@
" content = cefr.read()\n", " content = cefr.read()\n",
"\n", "\n",
"with open(conda_env_file_name, 'w') as cefw:\n", "with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-sdk']))\n", " cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-train-automl']))\n",
"\n", "\n",
"# Substitute the actual model id in the script file.\n", "# Substitute the actual model id in the script file.\n",
"\n", "\n",

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification-with-deployment
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-with-onnx/auto-ml-classification-with-onnx.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -22,7 +29,6 @@
"1. [Data](#Data)\n", "1. [Data](#Data)\n",
"1. [Train](#Train)\n", "1. [Train](#Train)\n",
"1. [Results](#Results)\n", "1. [Results](#Results)\n",
"1. [Test](#Test)\n",
"\n" "\n"
] ]
}, },
@@ -32,7 +38,7 @@
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"\n", "\n",
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n", "In this example we use the scikit-learn's [iris dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html) to showcase how you can use AutoML for a simple classification problem.\n",
"\n", "\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
@@ -42,7 +48,8 @@
"1. Create an `Experiment` in an existing `Workspace`.\n", "1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Configure AutoML using `AutoMLConfig`.\n", "2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute with ONNX compatible config on.\n", "3. Train the model using local compute with ONNX compatible config on.\n",
"4. Explore the results and save the ONNX model." "4. Explore the results and save the ONNX model.\n",
"5. Inference with the ONNX model."
] ]
}, },
{ {
@@ -66,11 +73,12 @@
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"from sklearn import datasets\n", "from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
"\n", "\n",
"import azureml.core\n", "import azureml.core\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig" "from azureml.train.automl import AutoMLConfig, constants"
] ]
}, },
{ {
@@ -106,7 +114,7 @@
"source": [ "source": [
"## Data\n", "## Data\n",
"\n", "\n",
"This uses scikit-learn's [load_digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) method." "This uses scikit-learn's [load_iris](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html) method."
] ]
}, },
{ {
@@ -115,22 +123,44 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"digits = datasets.load_digits()\n", "iris = datasets.load_iris()\n",
"X_train, X_test, y_train, y_test = train_test_split(iris.data, \n",
" iris.target, \n",
" test_size=0.2, \n",
" random_state=0)\n",
"\n", "\n",
"# Exclude the first 100 rows from training so that they can be used for test.\n", "\n"
"X_train = digits.data[100:,:]\n",
"y_train = digits.target[100:]"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Train with enable ONNX compatible models config on\n", "### Ensure the x_train and x_test are pandas DataFrame."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the X_train and X_test to pandas DataFrame and set column names,\n",
"# This is needed for initializing the input variable names of ONNX model, \n",
"# and the prediction with the ONNX model using the inference helper.\n",
"X_train = pd.DataFrame(X_train, columns=['c1', 'c2', 'c3', 'c4'])\n",
"X_test = pd.DataFrame(X_test, columns=['c1', 'c2', 'c3', 'c4'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n", "\n",
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n", "Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
"\n", "\n",
"Set the parameter enable_onnx_compatible_models=True, if you also want to generate the ONNX compatible models. Please note, the forecasting task and TensorFlow models are not ONNX compatible yet.\n", "**Note:** Set the parameter enable_onnx_compatible_models=True, if you also want to generate the ONNX compatible models. Please note, the forecasting task and TensorFlow models are not ONNX compatible yet.\n",
"\n", "\n",
"|Property|Description|\n", "|Property|Description|\n",
"|-|-|\n", "|-|-|\n",
@@ -144,6 +174,13 @@
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|" "|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set the preprocess=True, currently the InferenceHelper only supports this mode."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -158,6 +195,7 @@
" verbosity = logging.INFO, \n", " verbosity = logging.INFO, \n",
" X = X_train, \n", " X = X_train, \n",
" y = y_train,\n", " y = y_train,\n",
" preprocess=True,\n",
" enable_onnx_compatible_models=True,\n", " enable_onnx_compatible_models=True,\n",
" path = project_folder)" " path = project_folder)"
] ]
@@ -249,10 +287,69 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.train.automl._vendor.automl.client.core.common.onnx_convert import OnnxConverter\n", "from azureml.automl.core.onnx_convert import OnnxConverter\n",
"onnx_fl_path = \"./best_model.onnx\"\n", "onnx_fl_path = \"./best_model.onnx\"\n",
"OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)" "OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict with the ONNX model, using onnxruntime package"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import json\n",
"from azureml.automl.core.onnx_convert import OnnxConvertConstants\n",
"\n",
"if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:\n",
" python_version_compatible = True\n",
"else:\n",
" python_version_compatible = False\n",
"\n",
"try:\n",
" import onnxruntime\n",
" from azureml.automl.core.onnx_convert import OnnxInferenceHelper \n",
" onnxrt_present = True\n",
"except ImportError:\n",
" onnxrt_present = False\n",
"\n",
"def get_onnx_res(run):\n",
" res_path = 'onnx_resource.json'\n",
" run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)\n",
" with open(res_path) as f:\n",
" onnx_res = json.load(f)\n",
" return onnx_res\n",
"\n",
"if onnxrt_present and python_version_compatible: \n",
" mdl_bytes = onnx_mdl.SerializeToString()\n",
" onnx_res = get_onnx_res(best_run)\n",
"\n",
" onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_res)\n",
" pred_onnx, pred_prob_onnx = onnxrt_helper.predict(X_test)\n",
"\n",
" print(pred_onnx)\n",
" print(pred_prob_onnx)\n",
"else:\n",
" if not python_version_compatible:\n",
" print('Please use Python version 3.6 or 3.7 to run the inference helper.') \n",
" if not onnxrt_present:\n",
" print('Please install the onnxruntime package to do the prediction with ONNX model.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

View File

@@ -0,0 +1,9 @@
name: auto-ml-classification-with-onnx
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- onnxruntime

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-with-whitelisting/auto-ml-classification-with-whitelisting.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -34,7 +41,7 @@
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n", "In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n",
"\n", "\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"This notebooks shows how can automl can be trained on a a selected list of models,see the readme.md for the models.\n", "This notebooks shows how can automl can be trained on a selected list of models, see the readme.md for the models.\n",
"This trains the model exclusively on tensorflow based models.\n", "This trains the model exclusively on tensorflow based models.\n",
"\n", "\n",
"In this notebook you will learn how to:\n", "In this notebook you will learn how to:\n",

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification-with-whitelisting
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification/auto-ml-classification.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -72,6 +79,32 @@
"from azureml.train.automl import AutoMLConfig" "from azureml.train.automl import AutoMLConfig"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Accessing the Azure ML workspace requires authentication with Azure.\n",
"\n",
"The default authentication is interactive authentication using the default tenant. Executing the `ws = Workspace.from_config()` line in the cell below will prompt for authentication the first time that it is run.\n",
"\n",
"If you have multiple Azure tenants, you can specify the tenant by replacing the `ws = Workspace.from_config()` line in the cell below with the following:\n",
"\n",
"```\n",
"from azureml.core.authentication import InteractiveLoginAuthentication\n",
"auth = InteractiveLoginAuthentication(tenant_id = 'mytenantid')\n",
"ws = Workspace.from_config(auth = auth)\n",
"```\n",
"\n",
"If you need to run in an environment where interactive login is not possible, you can use Service Principal authentication by replacing the `ws = Workspace.from_config()` line in the cell below with the following:\n",
"\n",
"```\n",
"from azureml.core.authentication import ServicePrincipalAuthentication\n",
"auth = auth = ServicePrincipalAuthentication('mytenantid', 'myappid', 'mypassword')\n",
"ws = Workspace.from_config(auth = auth)\n",
"```\n",
"For more details, see [aka.ms/aml-notebook-auth](http://aka.ms/aml-notebook-auth)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -133,11 +166,17 @@
"|-|-|\n", "|-|-|\n",
"|**task**|classification or regression|\n", "|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n", "|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], Multi-class targets.|\n", "|**y**|(sparse) array-like, shape = [n_samples, ], Multi-class targets.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|" "|**n_cross_validations**|Number of cross validation splits.|\n",
"|\n",
"\n",
"Automated machine learning trains multiple machine learning pipelines. Each pipelines training is known as an iteration.\n",
"* You can specify a maximum number of iterations using the `iterations` parameter.\n",
"* You can specify a maximum time for the run using the `experiment_timeout_minutes` parameter.\n",
"* If you specify neither the `iterations` nor the `experiment_timeout_minutes`, automated ML keeps running iterations while it continues to see improvements in the scores.\n",
"\n",
"The following example doesn't specify `iterations` or `experiment_timeout_minutes` and so runs until the scores stop improving.\n"
] ]
}, },
{ {
@@ -147,14 +186,10 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"automl_config = AutoMLConfig(task = 'classification',\n", "automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" primary_metric = 'AUC_weighted',\n", " primary_metric = 'AUC_weighted',\n",
" iteration_timeout_minutes = 60,\n",
" iterations = 25,\n",
" verbosity = logging.INFO,\n",
" X = X_train, \n", " X = X_train, \n",
" y = y_train,\n", " y = y_train,\n",
" path = project_folder)" " n_cross_validations = 3)"
] ]
}, },
{ {
@@ -223,7 +258,11 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {
"tags": [
"widget-rundetails-sample"
]
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.widgets import RunDetails\n", "from azureml.widgets import RunDetails\n",
@@ -300,6 +339,12 @@
" print()\n", " print()\n",
" for estimator in step[1].estimators:\n", " for estimator in step[1].estimators:\n",
" print_model(estimator[1], estimator[0]+ ' - ')\n", " print_model(estimator[1], estimator[0]+ ' - ')\n",
" elif hasattr(step[1], '_base_learners') and hasattr(step[1], '_meta_learner'):\n",
" print(\"\\nMeta Learner\")\n",
" pprint(step[1]._meta_learner)\n",
" print()\n",
" for estimator in step[1]._base_learners:\n",
" print_model(estimator[1], estimator[0]+ ' - ')\n",
" else:\n", " else:\n",
" pprint(step[1].get_params())\n", " pprint(step[1].get_params())\n",
" print()\n", " print()\n",

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,12 +9,19 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/dataprep-remote-execution/auto-ml-dataprep-remote-execution.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Automated Machine Learning\n", "# Automated Machine Learning\n",
"_**Prepare Data using `azureml.dataprep` for Remote Execution (DSVM)**_\n", "_**Prepare Data using `azureml.dataprep` for Remote Execution (AmlCompute)**_\n",
"\n", "\n",
"## Contents\n", "## Contents\n",
"1. [Introduction](#Introduction)\n", "1. [Introduction](#Introduction)\n",
@@ -117,21 +124,12 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n", "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
"# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n", "# The data referenced here was a 1MB simple random sample of the Chicago Crime data into a local temporary directory.\n",
"simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
"X = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
"\n",
"# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n", "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
"# and convert column types manually.\n", "# and convert column types manually.\n",
"# Here we read a comma delimited file and convert all columns to integers.\n", "example_data = 'https://dprepdata.blob.core.windows.net/demo/crime0-random.csv'\n",
"y = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))" "dflow = dprep.read_csv(example_data, infer_column_types=True)\n",
] "dflow.get_profile()"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can peek the result of a Dataflow at any range using `skip(i)` and `head(j)`. Doing so evaluates only `j` records for all the steps in the Dataflow, which makes it fast even against large datasets."
] ]
}, },
{ {
@@ -140,7 +138,30 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"X.skip(1).head(5)" "# As `Primary Type` is our y data, we need to drop the values those are null in this column.\n",
"dflow = dflow.drop_nulls('Primary Type')\n",
"dflow.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Review the Data Preparation Result\n",
"\n",
"You can peek the result of a Dataflow at any range using `skip(i)` and `head(j)`. Doing so evaluates only `j` records for all the steps in the Dataflow, which makes it fast even against large datasets.\n",
"\n",
"`Dataflow` objects are immutable and are composed of a list of data preparation steps. A `Dataflow` object can be branched at any point for further usage."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = dflow.drop_columns(columns=['Primary Type', 'FBI Code'])\n",
"y = dflow.keep_columns(columns=['Primary Type'], validate_column_exists=True)"
] ]
}, },
{ {
@@ -162,9 +183,8 @@
" \"iteration_timeout_minutes\" : 10,\n", " \"iteration_timeout_minutes\" : 10,\n",
" \"iterations\" : 2,\n", " \"iterations\" : 2,\n",
" \"primary_metric\" : 'AUC_weighted',\n", " \"primary_metric\" : 'AUC_weighted',\n",
" \"preprocess\" : False,\n", " \"preprocess\" : True,\n",
" \"verbosity\" : logging.INFO,\n", " \"verbosity\" : logging.INFO\n",
" \"n_cross_validations\": 3\n",
"}" "}"
] ]
}, },
@@ -172,7 +192,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Create or Attach a Remote Linux DSVM" "### Create or Attach an AmlCompute cluster"
] ]
}, },
{ {
@@ -181,21 +201,36 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"dsvm_name = 'mydsvmc'\n", "from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n", "\n",
"try:\n", "# Choose a name for your cluster.\n",
" while ws.compute_targets[dsvm_name].provisioning_state == 'Creating':\n", "amlcompute_cluster_name = \"cpu-cluster\"\n",
" time.sleep(1)\n",
"\n", "\n",
" dsvm_compute = DsvmCompute(ws, dsvm_name)\n", "found = False\n",
" print('Found existing DVSM.')\n", "\n",
"except:\n", "# Check if this compute target already exists in the workspace.\n",
" print('Creating a new DSVM.')\n", "\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2_v2\")\n", "cts = ws.compute_targets\n",
" dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)\n", "if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" dsvm_compute.wait_for_completion(show_output = True)\n", " found = True\n",
" print(\"Waiting one minute for ssh to be accessible\")\n", " print('Found existing compute target.')\n",
" time.sleep(90) # Wait for ssh to be accessible" " compute_target = cts[amlcompute_cluster_name]\n",
"\n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\\n\",\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
"\n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
"\n",
" # For a more detailed view of current AmlCompute status, use get_status()."
] ]
}, },
{ {
@@ -206,12 +241,19 @@
"source": [ "source": [
"from azureml.core.runconfig import RunConfiguration\n", "from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n", "from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n", "\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n", "conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n", "\n",
"conda_run_config.target = dsvm_compute\n", "# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n", "\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n", "dprep_dependency = 'azureml-dataprep==' + pkg_resources.get_distribution(\"azureml-dataprep\").version\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]', dprep_dependency], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd" "conda_run_config.environment.python.conda_dependencies = cd"
] ]
}, },
@@ -257,6 +299,44 @@
"remote_run" "remote_run"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pre-process cache cleanup\n",
"The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -376,7 +456,8 @@
"source": [ "source": [
"## Test\n", "## Test\n",
"\n", "\n",
"#### Load Test Data" "#### Load Test Data\n",
"For the test data, it should have the same preparation step as the train data. Otherwise it might get failed at the preprocessing step."
] ]
}, },
{ {
@@ -385,12 +466,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn import datasets\n", "dflow_test = dprep.auto_read_file(path='https://dprepdata.blob.core.windows.net/demo/crime0-test.csv').skip(1)\n",
"\n", "dflow_test = dflow_test.drop_nulls('Primary Type')"
"digits = datasets.load_digits()\n",
"X_test = digits.data[:10, :]\n",
"y_test = digits.target[:10]\n",
"images = digits.images[:10]"
] ]
}, },
{ {
@@ -398,7 +475,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Testing Our Best Fitted Model\n", "#### Testing Our Best Fitted Model\n",
"We will try to predict 2 digits and see how our model works." "We will use confusion matrix to see how our model works."
] ]
}, },
{ {
@@ -407,65 +484,19 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"#Randomly select digits and test\n", "from pandas_ml import ConfusionMatrix\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"\n", "\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n", "y_test = dflow_test.keep_columns(columns=['Primary Type']).to_pandas_dataframe()\n",
" print(index)\n", "X_test = dflow_test.drop_columns(columns=['Primary Type', 'FBI Code']).to_pandas_dataframe()\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(1, figsize=(3,3))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Appendix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Capture the `Dataflow` Objects for Later Use in AutoML\n",
"\n", "\n",
"`Dataflow` objects are immutable and are composed of a list of data preparation steps. A `Dataflow` object can be branched at any point for further usage." "\n",
] "ypred = fitted_model.predict(X_test)\n",
}, "\n",
{ "cm = ConfusionMatrix(y_test['Primary Type'], ypred)\n",
"cell_type": "code", "\n",
"execution_count": null, "print(cm)\n",
"metadata": {}, "\n",
"outputs": [], "cm.plot()"
"source": [
"# sklearn.digits.data + target\n",
"digits_complete = dprep.auto_read_file('https://dprepdata.blob.core.windows.net/automl-notebook-data/digits-complete.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`digits_complete` (sourced from `sklearn.datasets.load_digits()`) is forked into `dflow_X` to capture all the feature columns and `dflow_y` to capture the label column."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(digits_complete.to_pandas_dataframe().shape)\n",
"labels_column = 'Column64'\n",
"dflow_X = digits_complete.drop_columns(columns = [labels_column])\n",
"dflow_y = digits_complete.keep_columns(columns = [labels_column])"
] ]
} }
], ],

View File

@@ -0,0 +1,8 @@
name: auto-ml-dataprep-remote-execution
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -1,5 +1,12 @@
{ {
"cells": [ "cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/dataprep/auto-ml-dataprep.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -115,23 +122,12 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n", "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
"# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n", "# The data referenced here was a 1MB simple random sample of the Chicago Crime data into a local temporary directory.\n",
"simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
"X = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
"\n",
"# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n", "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
"# and convert column types manually.\n", "# and convert column types manually.\n",
"# Here we read a comma delimited file and convert all columns to integers.\n", "example_data = 'https://dprepdata.blob.core.windows.net/demo/crime0-random.csv'\n",
"y = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))" "dflow = dprep.auto_read_file(example_data).skip(1) # Remove the header row.\n",
] "dflow.get_profile()"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Review the Data Preparation Result\n",
"\n",
"You can peek the result of a Dataflow at any range using `skip(i)` and `head(j)`. Doing so evaluates only `j` records for all the steps in the Dataflow, which makes it fast even against large datasets."
] ]
}, },
{ {
@@ -140,7 +136,30 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"X.skip(1).head(5)" "# As `Primary Type` is our y data, we need to drop the values those are null in this column.\n",
"dflow = dflow.drop_nulls('Primary Type')\n",
"dflow.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Review the Data Preparation Result\n",
"\n",
"You can peek the result of a Dataflow at any range using `skip(i)` and `head(j)`. Doing so evaluates only `j` records for all the steps in the Dataflow, which makes it fast even against large datasets.\n",
"\n",
"`Dataflow` objects are immutable and are composed of a list of data preparation steps. A `Dataflow` object can be branched at any point for further usage."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = dflow.drop_columns(columns=['Primary Type', 'FBI Code'])\n",
"y = dflow.keep_columns(columns=['Primary Type'], validate_column_exists=True)"
] ]
}, },
{ {
@@ -162,7 +181,7 @@
" \"iteration_timeout_minutes\" : 10,\n", " \"iteration_timeout_minutes\" : 10,\n",
" \"iterations\" : 2,\n", " \"iterations\" : 2,\n",
" \"primary_metric\" : 'AUC_weighted',\n", " \"primary_metric\" : 'AUC_weighted',\n",
" \"preprocess\" : False,\n", " \"preprocess\" : True,\n",
" \"verbosity\" : logging.INFO\n", " \"verbosity\" : logging.INFO\n",
"}" "}"
] ]
@@ -326,7 +345,8 @@
"source": [ "source": [
"## Test\n", "## Test\n",
"\n", "\n",
"#### Load Test Data" "#### Load Test Data\n",
"For the test data, it should have the same preparation step as the train data. Otherwise it might get failed at the preprocessing step."
] ]
}, },
{ {
@@ -335,12 +355,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn import datasets\n", "dflow_test = dprep.auto_read_file(path='https://dprepdata.blob.core.windows.net/demo/crime0-test.csv').skip(1)\n",
"\n", "dflow_test = dflow_test.drop_nulls('Primary Type')"
"digits = datasets.load_digits()\n",
"X_test = digits.data[:10, :]\n",
"y_test = digits.target[:10]\n",
"images = digits.images[:10]"
] ]
}, },
{ {
@@ -348,7 +364,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Testing Our Best Fitted Model\n", "#### Testing Our Best Fitted Model\n",
"We will try to predict 2 digits and see how our model works." "We will use confusion matrix to see how our model works."
] ]
}, },
{ {
@@ -357,65 +373,18 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"#Randomly select digits and test\n", "from pandas_ml import ConfusionMatrix\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"\n", "\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n", "y_test = dflow_test.keep_columns(columns=['Primary Type']).to_pandas_dataframe()\n",
" print(index)\n", "X_test = dflow_test.drop_columns(columns=['Primary Type', 'FBI Code']).to_pandas_dataframe()\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(1, figsize=(3,3))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Appendix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Capture the `Dataflow` Objects for Later Use in AutoML\n",
"\n", "\n",
"`Dataflow` objects are immutable and are composed of a list of data preparation steps. A `Dataflow` object can be branched at any point for further usage." "ypred = fitted_model.predict(X_test)\n",
] "\n",
}, "cm = ConfusionMatrix(y_test['Primary Type'], ypred)\n",
{ "\n",
"cell_type": "code", "print(cm)\n",
"execution_count": null, "\n",
"metadata": {}, "cm.plot()"
"outputs": [],
"source": [
"# sklearn.digits.data + target\n",
"digits_complete = dprep.auto_read_file('https://dprepdata.blob.core.windows.net/automl-notebook-data/digits-complete.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`digits_complete` (sourced from `sklearn.datasets.load_digits()`) is forked into `dflow_X` to capture all the feature columns and `dflow_y` to capture the label column."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(digits_complete.to_pandas_dataframe().shape)\n",
"labels_column = 'Column64'\n",
"dflow_X = digits_complete.drop_columns(columns = [labels_column])\n",
"dflow_y = digits_complete.keep_columns(columns = [labels_column])"
] ]
} }
], ],

View File

@@ -0,0 +1,8 @@
name: auto-ml-dataprep
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,12 +9,19 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/dataprep-remote-execution/auto-ml-dataprep-remote-execution.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Automated Machine Learning\n", "# Automated Machine Learning\n",
"_**Remote Execution using DSVM (Ubuntu)**_\n", "_**Load Data using `TabularDataset` for Remote Execution (AmlCompute)**_\n",
"\n", "\n",
"## Contents\n", "## Contents\n",
"1. [Introduction](#Introduction)\n", "1. [Introduction](#Introduction)\n",
@@ -30,32 +37,26 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n", "In this example we showcase how you can use AzureML Dataset to load data for AutoML.\n",
"\n", "\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
"In this notebook you wiil learn how to:\n", "In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n", "1. Create a `TabularDataset` pointing to the training data.\n",
"2. Attach an existing DSVM to a workspace.\n", "2. Pass the `TabularDataset` to AutoML for a remote run."
"3. Configure AutoML using `AutoMLConfig`.\n", ]
"4. Train the model using the DSVM.\n", },
"5. Explore the results.\n", {
"6. Test the best fitted model.\n", "cell_type": "markdown",
"\n", "metadata": {},
"In addition, this notebook showcases the following features:\n", "source": [
"- **Parallel** executions for iterations\n", "## Setup"
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments." "As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
] ]
}, },
@@ -66,18 +67,13 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import logging\n", "import logging\n",
"import os\n",
"import time\n",
"import csv\n",
"\n", "\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"from sklearn import datasets\n",
"\n", "\n",
"import azureml.core\n", "import azureml.core\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig" "from azureml.train.automl import AutoMLConfig"
] ]
}, },
@@ -89,9 +85,10 @@
"source": [ "source": [
"ws = Workspace.from_config()\n", "ws = Workspace.from_config()\n",
"\n", "\n",
"# Choose a name for the run history container in the workspace.\n", "# choose a name for experiment\n",
"experiment_name = 'automl-remote-dsvm'\n", "experiment_name = 'automl-dataset-remote-bai'\n",
"project_folder = './project'\n", "# project folder\n",
"project_folder = './sample_projects/automl-dataprep-remote-bai'\n",
" \n", " \n",
"experiment = Experiment(ws, experiment_name)\n", "experiment = Experiment(ws, experiment_name)\n",
" \n", " \n",
@@ -112,8 +109,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Create a Remote Linux DSVM\n", "## Data"
"**Note:** If creation fails with a message about Marketplace purchase eligibilty, start creation of a DSVM through the [Azure portal](https://portal.azure.com), and select \"Want to create programmatically\" to enable programmatic creation. Once you've enabled this setting, you can exit the portal without actually creating the DSVM, and creation of the DSVM through the notebook should work.\n"
] ]
}, },
{ {
@@ -122,29 +118,21 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core.compute import DsvmCompute\n", "# The data referenced here was a 1MB simple random sample of the Chicago Crime data into a local temporary directory.\n",
"\n", "example_data = 'https://dprepdata.blob.core.windows.net/demo/crime0-random.csv'\n",
"dsvm_name = 'mydsvma'\n", "dataset = Dataset.Tabular.from_delimited_files(example_data)\n",
"try:\n", "dataset.take(5).to_pandas_dataframe()"
" dsvm_compute = DsvmCompute(ws, dsvm_name)\n",
" print('Found an existing DSVM.')\n",
"except:\n",
" print('Creating a new DSVM.')\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2s_v3\")\n",
" dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output = True)\n",
" print(\"Waiting one minute for ssh to be accessible\")\n",
" time.sleep(90) # Wait for ssh to be accessible"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Data\n", "### Review the data\n",
"For remote executions, you need to make the data accessible from the remote compute.\n", "\n",
"This can be done by uploading the data to DataStore.\n", "You can peek the result of a `TabularDataset` at any range using `skip(i)` and `take(j).to_pandas_dataframe()`. Doing so evaluates only `j` records, which makes it fast even against large datasets.\n",
"In this example, we upload scikit-learn's [load_digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) data." "\n",
"`TabularDataset` objects are immutable and are composed of a list of subsetting transformations (optional)."
] ]
}, },
{ {
@@ -153,65 +141,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"data_train = datasets.load_digits()\n", "X = dataset.drop_columns(columns=['Primary Type', 'FBI Code'])\n",
"\n", "y = dataset.keep_columns(columns=['Primary Type'], validate=True)"
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")\n",
"\n",
"ds = ws.get_default_datastore()\n",
"ds.upload(src_dir='./data', target_path='re_data', overwrite=True, show_progress=True)\n",
"\n",
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='re_data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/re_data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/re_data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }\n"
] ]
}, },
{ {
@@ -220,17 +151,7 @@
"source": [ "source": [
"## Train\n", "## Train\n",
"\n", "\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n", "This creates a general AutoML settings object applicable for both local and remote runs."
"\n",
"**Note:** When using Remote DSVM, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be less than the number of cores on the DSVM.|"
] ]
}, },
{ {
@@ -241,37 +162,18 @@
"source": [ "source": [
"automl_settings = {\n", "automl_settings = {\n",
" \"iteration_timeout_minutes\" : 10,\n", " \"iteration_timeout_minutes\" : 10,\n",
" \"iterations\": 20,\n", " \"iterations\" : 2,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\" : 'AUC_weighted',\n", " \"primary_metric\" : 'AUC_weighted',\n",
" \"preprocess\": False,\n", " \"preprocess\" : True,\n",
" \"max_concurrent_iterations\": 2,\n",
" \"verbosity\" : logging.INFO\n", " \"verbosity\" : logging.INFO\n",
"}\n", "}"
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder, \n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"**Note:** The first run on a new DSVM may take several minutes to prepare the environment." "### Create or Attach an AmlCompute cluster"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run.\n",
"\n",
"In this example, we specify `show_output = False` to suppress console output while the run is in progress."
] ]
}, },
{ {
@@ -280,7 +182,91 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"remote_run = experiment.submit(automl_config, show_output = False)" "from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlc2\"\n",
"\n",
"found = False\n",
"\n",
"# Check if this compute target already exists in the workspace.\n",
"\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
"\n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\\n\",\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
"\n",
"print('Checking cluster status...')\n",
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
"compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
"\n",
"# For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"\n",
"cd = CondaDependencies.create(conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pass Data with `TabularDataset` Objects\n",
"\n",
"The `TabularDataset` objects captured above can also be passed to the `submit` method for a remote run. AutoML will serialize the `TabularDataset` object and send it to the remote compute target. The `TabularDataset` will not be evaluated locally."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X,\n",
" y = y,\n",
" **automl_settings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = True)"
] ]
}, },
{ {
@@ -296,17 +282,45 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Results\n", "### Pre-process cache cleanup\n",
"\n", "The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
"#### Loading Executed Runs\n",
"In case you need to load a previously executed run, enable the cell below and replace the `run_id` value."
] ]
}, },
{ {
"cell_type": "raw", "cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"remote_run = AutoMLRun(experiment=experiment, run_id = 'AutoML_480d3ed6-fc94-44aa-8f4e-0b945db9d3ef')" "### Cancelling Runs\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
] ]
}, },
{ {
@@ -317,8 +331,6 @@
"\n", "\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n", "The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n", "\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details." "**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
] ]
}, },
@@ -332,21 +344,10 @@
"RunDetails(remote_run).show()" "RunDetails(remote_run).show()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"\n",
"#### Retrieve All Child Runs\n", "#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log." "You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
] ]
@@ -368,35 +369,13 @@
"rundata" "rundata"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Retrieve the Best Model\n", "### Retrieve the Best Model\n",
"\n", "\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." "Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
] ]
}, },
{ {
@@ -415,7 +394,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Best Model Based on Any Other Metric\n", "#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `log_loss` value:" "Show the run and the model that has the smallest `log_loss` value:"
] ]
}, },
{ {
@@ -435,7 +414,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Model from a Specific Iteration\n", "#### Model from a Specific Iteration\n",
"Show the run and the model from the third iteration:" "Show the run and the model from the first iteration:"
] ]
}, },
{ {
@@ -444,10 +423,10 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"iteration = 3\n", "iteration = 0\n",
"third_run, third_model = remote_run.get_output(iteration = iteration)\n", "best_run, fitted_model = remote_run.get_output(iteration = iteration)\n",
"print(third_run)\n", "print(best_run)\n",
"print(third_model)" "print(fitted_model)"
] ]
}, },
{ {
@@ -456,7 +435,8 @@
"source": [ "source": [
"## Test\n", "## Test\n",
"\n", "\n",
"#### Load Test Data" "#### Load Test Data\n",
"For the test data, it should have the same preparation step as the train data. Otherwise it might get failed at the preprocessing step."
] ]
}, },
{ {
@@ -465,17 +445,21 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"digits = datasets.load_digits()\n", "dataset_test = Dataset.Tabular.from_delimited_files(path='https://dprepdata.blob.core.windows.net/demo/crime0-test.csv')\n",
"X_test = digits.data[:10, :]\n", "\n",
"y_test = digits.target[:10]\n", "df_test = dataset_test.to_pandas_dataframe()\n",
"images = digits.images[:10]" "df_test = df_test[pd.notnull(df_test['Primary Type'])]\n",
"\n",
"y_test = df_test[['Primary Type']]\n",
"X_test = df_test.drop(['Primary Type', 'FBI Code'], axis=1)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Test Our Best Fitted Model" "#### Testing Our Best Fitted Model\n",
"We will use confusion matrix to see how our model works."
] ]
}, },
{ {
@@ -484,17 +468,15 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Randomly select digits and test.\n", "from pandas_ml import ConfusionMatrix\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n", "\n",
" print(index)\n", "ypred = fitted_model.predict(X_test)\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n", "\n",
" label = y_test[index]\n", "cm = ConfusionMatrix(y_test['Primary Type'], ypred)\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n", "\n",
" fig = plt.figure(1, figsize=(3,3))\n", "print(cm)\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n", "\n",
" ax1.set_title(title)\n", "cm.plot()"
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
] ]
} }
], ],
@@ -519,7 +501,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.6" "version": "3.6.5"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -0,0 +1,10 @@
name: auto-ml-dataset-remote-execution
dependencies:
- pip:
- azureml-sdk
- azureml-defaults
- azureml-explain-model
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,402 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/dataprep/auto-ml-dataprep.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Load Data using `TabularDataset` for Local Execution**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we showcase how you can use AzureML Dataset to load data for AutoML.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create a `TabularDataset` pointing to the training data.\n",
"2. Pass the `TabularDataset` to AutoML for a local run."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
" \n",
"# choose a name for experiment\n",
"experiment_name = 'automl-dataset-local'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-dataset-local'\n",
" \n",
"experiment = Experiment(ws, experiment_name)\n",
" \n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The data referenced here was a 1MB simple random sample of the Chicago Crime data into a local temporary directory.\n",
"example_data = 'https://dprepdata.blob.core.windows.net/demo/crime0-random.csv'\n",
"dataset = Dataset.Tabular.from_delimited_files(example_data)\n",
"dataset.take(5).to_pandas_dataframe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Review the data\n",
"\n",
"You can peek the result of a `TabularDataset` at any range using `skip(i)` and `take(j).to_pandas_dataframe()`. Doing so evaluates only `j` records, which makes it fast even against large datasets.\n",
"\n",
"`TabularDataset` objects are immutable and are composed of a list of subsetting transformations (optional)."
]
},
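{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example (an optional illustration), `skip` and `take` can be chained to peek at an arbitrary slice of the dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: peek at records 10-14 without materializing the full dataset.\n",
"dataset.skip(10).take(5).to_pandas_dataframe()"
]
},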
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = dataset.drop_columns(columns=['Primary Type', 'FBI Code'])\n",
"y = dataset.keep_columns(columns=['Primary Type'], validate=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"This creates a general AutoML settings object applicable for both local and remote runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\" : 10,\n",
" \"iterations\" : 2,\n",
" \"primary_metric\" : 'AUC_weighted',\n",
" \"preprocess\" : True,\n",
" \"verbosity\" : logging.INFO\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pass Data with `TabularDataset` Objects\n",
"\n",
"The `TabularDataset` objects captured above can be passed to the `submit` method for a local run. AutoML will retrieve the results from the `TabularDataset` for model training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" X = X,\n",
" y = y,\n",
" **automl_settings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"local_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"local_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(local_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(local_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
" \n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = local_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model that has the smallest `log_loss` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"log_loss\"\n",
"best_run, fitted_model = local_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration\n",
"Show the run and the model from the first iteration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 0\n",
"best_run, fitted_model = local_run.get_output(iteration = iteration)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"#### Load Test Data\n",
"For the test data, it should have the same preparation step as the train data. Otherwise it might get failed at the preprocessing step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_test = Dataset.Tabular.from_delimited_files(path='https://dprepdata.blob.core.windows.net/demo/crime0-test.csv')\n",
"\n",
"df_test = dataset_test.to_pandas_dataframe()\n",
"df_test = df_test[pd.notnull(df_test['Primary Type'])]\n",
"\n",
"y_test = df_test[['Primary Type']]\n",
"X_test = df_test.drop(['Primary Type', 'FBI Code'], axis=1)"
]
},
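{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick, optional sanity check (a sketch, assuming the training dataset `X` defined above is still in scope), you can confirm that the test features expose the same columns used for training:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: compare test feature columns against the training TabularDataset schema.\n",
"train_columns = set(X.take(1).to_pandas_dataframe().columns)\n",
"assert set(X_test.columns) == train_columns, 'Test columns differ from training columns'"
]
},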
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Testing Our Best Fitted Model\n",
"We will use confusion matrix to see how our model works."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pandas_ml import ConfusionMatrix\n",
"\n",
"ypred = fitted_model.predict(X_test)\n",
"\n",
"cm = ConfusionMatrix(y_test['Primary Type'], ypred)\n",
"\n",
"print(cm)\n",
"\n",
"cm.plot()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: auto-ml-dataset
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/exploring-previous-runs/auto-ml-exploring-previous-runs.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -190,12 +197,12 @@
"display(HTML('<h3>Iterations</h3>'))\n", "display(HTML('<h3>Iterations</h3>'))\n",
"RunDetails(ml_run).show() \n", "RunDetails(ml_run).show() \n",
"\n", "\n",
"children = list(ml_run.get_children())\n", "all_metrics = ml_run.get_metrics(recursive=True)\n",
"metricslist = {}\n", "metricslist = {}\n",
"for run in children:\n", "for run_id, metrics in all_metrics.items():\n",
" properties = run.get_properties()\n", " iteration = int(run_id.split('_')[-1])\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n", " float_metrics = {k: v for k, v in metrics.items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n", " metricslist[iteration] = float_metrics\n",
"\n", "\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n", "rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"display(HTML('<h3>Metrics</h3>'))\n", "display(HTML('<h3>Metrics</h3>'))\n",

View File

@@ -0,0 +1,8 @@
name: auto-ml-exploring-previous-runs
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,607 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"**BikeShare Demand Forecasting**\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Evaluate](#Evaluate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"This notebook demonstrates demand forecasting for a bike-sharing service using AutoML.\n",
"\n",
"AutoML highlights here include built-in holiday featurization, accessing engineered feature names, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"Notebook synopsis:\n",
"1. Creating an Experiment in an existing Workspace\n",
"2. Configuration and local run of AutoML for a time-series model with lag and holiday features \n",
"3. Viewing the engineered names for featurized data and featurization summary for all raw features\n",
"4. Evaluating the fitted model using a rolling test "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"import pandas as pd\n",
"import numpy as np\n",
"import logging\n",
"import warnings\n",
"\n",
"from pandas.tseries.frequencies import to_offset\n",
"\n",
"# Squash warning messages for cleaner output in the notebook\n",
"warnings.showwarning = lambda *args, **kwargs: None\n",
"\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n",
"from matplotlib import pyplot as plt\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for the run history container in the workspace\n",
"experiment_name = 'automl-bikeshareforecasting'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-local-bikeshareforecasting'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Run History Name'] = experiment_name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"Read bike share demand data from file, and preview data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('bike-no.csv', parse_dates=['date'])\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's set up what we know about the dataset. \n",
"\n",
"**Target column** is what we want to forecast.\n",
"\n",
"**Time column** is the time axis along which to predict.\n",
"\n",
"**Grain** is another word for an individual time series in your dataset. Grains are identified by values of the columns listed `grain_column_names`, for example \"store\" and \"item\" if your data has multiple time series of sales, one series for each combination of store and item sold.\n",
"\n",
"This dataset has only one time series. Please see the [orange juice notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales) for an example of a multi-time series dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target_column_name = 'cnt'\n",
"time_column_name = 'date'\n",
"grain_column_names = []"
]
},
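{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, a multi-series dataset would declare its grain columns instead; a hypothetical sketch (the column names below are assumed, not present in this dataset):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical multi-series setup: one series per (store, item) combination.\n",
"# grain_column_names = ['store', 'item']"
]
},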
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Split the data\n",
"\n",
"The first split we make is into train and test sets. Note we are splitting on time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train = data[data[time_column_name] < '2012-09-01']\n",
"test = data[data[time_column_name] >= '2012-09-01']\n",
"\n",
"X_train = train.copy()\n",
"y_train = X_train.pop(target_column_name).values\n",
"\n",
"X_test = test.copy()\n",
"y_test = X_test.pop(target_column_name).values\n",
"\n",
"print(X_train.shape)\n",
"print(y_train.shape)\n",
"print(X_test.shape)\n",
"print(y_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setting forecaster maximum horizon \n",
"\n",
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 14 periods (i.e. 14 days). Notice that this is much shorter than the number of days in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_horizon = 14"
]
},
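{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of why a rolling evaluation is needed, the cell below (optional) compares the span of the test set with the forecast horizon:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The test period is far longer than the 14-day horizon, hence the rolling test in the Evaluate section.\n",
"test_span_days = (X_test[time_column_name].max() - X_test[time_column_name].min()).days\n",
"print('Test set spans %d days; forecast horizon is %d days.' % (test_span_days, max_horizon))"
]
},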
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|forecasting|\n",
"|**primary_metric**|This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>\n",
"|**iterations**|Number of iterations. In each iteration, Auto ML trains a specific pipeline on the given data|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**country_or_region**|The country/region used to generate holiday features. These should be ISO 3166 two-letter country/region codes (i.e. 'US', 'GB').|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" 'time_column_name': time_column_name,\n",
" 'max_horizon': max_horizon,\n",
" # knowing the country/region allows Automated ML to bring in holidays\n",
" 'country_or_region': 'US',\n",
" 'target_lags': 1,\n",
" # these columns are a breakdown of the total and therefore a leak\n",
" 'drop_column_names': ['casual', 'registered']\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task='forecasting', \n",
" primary_metric='normalized_root_mean_squared_error',\n",
" iterations=10,\n",
" iteration_timeout_minutes=5,\n",
" X=X_train,\n",
" y=y_train,\n",
" n_cross_validations=3, \n",
" path=project_folder,\n",
" verbosity=logging.INFO,\n",
" **automl_settings)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will now run the experiment, starting with 10 iterations of model search. The experiment can be continued for more iterations if more accurate results are required. You will see the currently running iterations printing to the console."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"local_run = experiment.submit(automl_config, show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Displaying the run objects gives you links to the visual tools in the Azure Portal. Go try them!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"local_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"Below we select the best pipeline from our iterations. The get_output method on automl_classifier returns the best run and the fitted model for the last fit invocation. There are overloads on get_output that allow you to retrieve the best run and fitted model for any logged metric or a particular iteration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = local_run.get_output()\n",
"fitted_model.steps"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View the engineered names for featurized data\n",
"\n",
"You can accees the engineered feature names generated in time-series featurization. Note that a number of named holiday periods are represented. We recommend that you have at least one year of data when using this feature to ensure that all yearly holidays are captured in the training featurization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['timeseriestransformer'].get_engineered_feature_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View the featurization summary\n",
"\n",
"You can also see what featurization steps were performed on different raw features in the user data. For each raw feature in the user data, the following information is displayed:\n",
"\n",
"- Raw feature name\n",
"- Number of engineered features formed out of this raw feature\n",
"- Type detected\n",
"- If feature was dropped\n",
"- List of feature transformations for the raw feature"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the featurization summary as a list of JSON\n",
"featurization_summary = fitted_model.named_steps['timeseriestransformer'].get_featurization_summary()\n",
"# View the featurization summary as a pandas dataframe\n",
"pd.DataFrame.from_records(featurization_summary)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now use the best fitted model from the AutoML Run to make forecasts for the test set. \n",
"\n",
"We always score on the original dataset whose schema matches the training set schema."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now define some functions for aligning output to input and for producing rolling forecasts over the full test set. As previously stated, the forecast horizon of 14 days is shorter than the length of the test set - which is about 120 days. To get predictions over the full test set, we iterate over the test set, making forecasts 14 days at a time and combining the results. We also make sure that each 14-day forecast uses up-to-date actuals - the current context - to construct lag features. \n",
"\n",
"It is a good practice to always align the output explicitly to the input, as the count and order of the rows may have changed during transformations that span multiple rows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def align_outputs(y_predicted, X_trans, X_test, y_test, predicted_column_name='predicted',\n",
" horizon_colname='horizon_origin'):\n",
" \"\"\"\n",
" Demonstrates how to get the output aligned to the inputs\n",
" using pandas indexes. Helps understand what happened if\n",
" the output's shape differs from the input shape, or if\n",
" the data got re-sorted by time and grain during forecasting.\n",
" \n",
" Typical causes of misalignment are:\n",
" * we predicted some periods that were missing in actuals -> drop from eval\n",
" * model was asked to predict past max_horizon -> increase max horizon\n",
" * data at start of X_test was needed for lags -> provide previous periods\n",
" \"\"\"\n",
" df_fcst = pd.DataFrame({predicted_column_name : y_predicted,\n",
" horizon_colname: X_trans[horizon_colname]})\n",
" # y and X outputs are aligned by forecast() function contract\n",
" df_fcst.index = X_trans.index\n",
" \n",
" # align original X_test to y_test \n",
" X_test_full = X_test.copy()\n",
" X_test_full[target_column_name] = y_test\n",
"\n",
" # X_test_full's index does not include origin, so reset for merge\n",
" df_fcst.reset_index(inplace=True)\n",
" X_test_full = X_test_full.reset_index().drop(columns='index')\n",
" together = df_fcst.merge(X_test_full, how='right')\n",
" \n",
" # drop rows where prediction or actuals are nan \n",
" # happens because of missing actuals \n",
" # or at edges of time due to lags/rolling windows\n",
" clean = together[together[[target_column_name, predicted_column_name]].notnull().all(axis=1)]\n",
" return(clean)\n",
"\n",
"def do_rolling_forecast(fitted_model, X_test, y_test, max_horizon, freq='D'):\n",
" \"\"\"\n",
" Produce forecasts on a rolling origin over the given test set.\n",
" \n",
" Each iteration makes a forecast for the next 'max_horizon' periods \n",
" with respect to the current origin, then advances the origin by the horizon time duration. \n",
" The prediction context for each forecast is set so that the forecaster uses \n",
" the actual target values prior to the current origin time for constructing lag features.\n",
" \n",
" This function returns a concatenated DataFrame of rolling forecasts.\n",
" \"\"\"\n",
" df_list = []\n",
" origin_time = X_test[time_column_name].min()\n",
" while origin_time <= X_test[time_column_name].max():\n",
" # Set the horizon time - end date of the forecast\n",
" horizon_time = origin_time + max_horizon * to_offset(freq)\n",
" \n",
" # Extract test data from an expanding window up-to the horizon \n",
" expand_wind = (X_test[time_column_name] < horizon_time)\n",
" X_test_expand = X_test[expand_wind]\n",
" y_query_expand = np.zeros(len(X_test_expand)).astype(np.float)\n",
" y_query_expand.fill(np.NaN)\n",
" \n",
" if origin_time != X_test[time_column_name].min():\n",
" # Set the context by including actuals up-to the origin time\n",
" test_context_expand_wind = (X_test[time_column_name] < origin_time)\n",
" context_expand_wind = (X_test_expand[time_column_name] < origin_time)\n",
" y_query_expand[context_expand_wind] = y_test[test_context_expand_wind]\n",
" \n",
" # Make a forecast out to the maximum horizon\n",
" y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)\n",
" \n",
" # Align forecast with test set for dates within the current rolling window \n",
" trans_tindex = X_trans.index.get_level_values(time_column_name)\n",
" trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)\n",
" test_roll_wind = expand_wind & (X_test[time_column_name] >= origin_time)\n",
" df_list.append(align_outputs(y_fcst[trans_roll_wind], X_trans[trans_roll_wind],\n",
" X_test[test_roll_wind], y_test[test_roll_wind]))\n",
" \n",
" # Advance the origin time\n",
" origin_time = horizon_time\n",
" \n",
" return pd.concat(df_list, ignore_index=True)"
]
},
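{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick aside, the toy cell below (values are made up) illustrates the `y_query` convention used in `do_rolling_forecast`: `NaN` marks the periods the model should predict, while actuals placed before the origin provide the context for lag features."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy illustration of the forecast query convention (made-up values).\n",
"import numpy as np\n",
"\n",
"y_query = np.full(5, np.nan)   # request predictions for all 5 periods...\n",
"y_query[:2] = [112.0, 118.0]   # ...but supply the first 2 actuals as context\n",
"y_query"
]
},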
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all = do_rolling_forecast(fitted_model, X_test, y_test, max_horizon)\n",
"df_all"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now calculate some error metrics for the forecasts and vizualize the predictions vs. the actuals."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def APE(actual, pred):\n",
" \"\"\"\n",
" Calculate absolute percentage error.\n",
" Returns a vector of APE values with same length as actual/pred.\n",
" \"\"\"\n",
" return 100*np.abs((actual - pred)/actual)\n",
"\n",
"def MAPE(actual, pred):\n",
" \"\"\"\n",
" Calculate mean absolute percentage error.\n",
" Remove NA and values where actual is close to zero\n",
" \"\"\"\n",
" not_na = ~(np.isnan(actual) | np.isnan(pred))\n",
" not_zero = ~np.isclose(actual, 0.0)\n",
" actual_safe = actual[not_na & not_zero]\n",
" pred_safe = pred[not_na & not_zero]\n",
" return np.mean(APE(actual_safe, pred_safe))"
]
},
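{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on made-up numbers shows how `MAPE` excludes `NaN` entries and actuals near zero before averaging."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check with made-up numbers: the near-zero and NaN actuals are dropped,\n",
"# leaving APE values of 10% and 10%, so this should print 10.0.\n",
"toy_actual = np.array([100.0, 200.0, 1e-9, np.nan])\n",
"toy_pred = np.array([110.0, 180.0, 5.0, 50.0])\n",
"print(MAPE(toy_actual, toy_pred))"
]
},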
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Simple forecasting model\")\n",
"rmse = np.sqrt(mean_squared_error(df_all[target_column_name], df_all['predicted']))\n",
"print(\"[Test Data] \\nRoot Mean squared error: %.2f\" % rmse)\n",
"mae = mean_absolute_error(df_all[target_column_name], df_all['predicted'])\n",
"print('mean_absolute_error score: %.2f' % mae)\n",
"print('MAPE: %.2f' % MAPE(df_all[target_column_name], df_all['predicted']))\n",
"\n",
"# Plot outputs\n",
"%matplotlib inline\n",
"test_pred = plt.scatter(df_all[target_column_name], df_all['predicted'], color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The MAPE seems high; it is being skewed by an actual with a small absolute value. For a more informative evaluation, we can calculate the metrics by forecast horizon:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all.groupby('horizon_origin').apply(\n",
" lambda df: pd.Series({'MAPE': MAPE(df[target_column_name], df['predicted']),\n",
" 'RMSE': np.sqrt(mean_squared_error(df[target_column_name], df['predicted'])),\n",
" 'MAE': mean_absolute_error(df[target_column_name], df['predicted'])}))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's also interesting to see the distributions of APE (absolute percentage error) by horizon. On a log scale, the outlying APE in the horizon-3 group is clear."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all_APE = df_all.assign(APE=APE(df_all[target_column_name], df_all['predicted']))\n",
"APEs = [df_all_APE[df_all['horizon_origin'] == h].APE.values for h in range(1, max_horizon + 1)]\n",
"\n",
"%matplotlib inline\n",
"plt.boxplot(APEs)\n",
"plt.yscale('log')\n",
"plt.xlabel('horizon')\n",
"plt.ylabel('APE (%)')\n",
"plt.title('Absolute Percentage Errors by Forecast Horizon')\n",
"\n",
"plt.show()"
]
}
],
"metadata": {
"authors": [
{
"name": "erwright"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,9 @@
name: auto-ml-forecasting-bike-share
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- statsmodels


@@ -0,0 +1,732 @@
instant,date,season,yr,mnth,weekday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
1,1/1/2011,1,0,1,6,2,0.344167,0.363625,0.805833,0.160446,331,654,985
2,1/2/2011,1,0,1,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
3,1/3/2011,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
4,1/4/2011,1,0,1,2,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
5,1/5/2011,1,0,1,3,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600
6,1/6/2011,1,0,1,4,1,0.204348,0.233209,0.518261,0.0895652,88,1518,1606
7,1/7/2011,1,0,1,5,2,0.196522,0.208839,0.498696,0.168726,148,1362,1510
8,1/8/2011,1,0,1,6,2,0.165,0.162254,0.535833,0.266804,68,891,959
9,1/9/2011,1,0,1,0,1,0.138333,0.116175,0.434167,0.36195,54,768,822
10,1/10/2011,1,0,1,1,1,0.150833,0.150888,0.482917,0.223267,41,1280,1321
11,1/11/2011,1,0,1,2,2,0.169091,0.191464,0.686364,0.122132,43,1220,1263
12,1/12/2011,1,0,1,3,1,0.172727,0.160473,0.599545,0.304627,25,1137,1162
13,1/13/2011,1,0,1,4,1,0.165,0.150883,0.470417,0.301,38,1368,1406
14,1/14/2011,1,0,1,5,1,0.16087,0.188413,0.537826,0.126548,54,1367,1421
15,1/15/2011,1,0,1,6,2,0.233333,0.248112,0.49875,0.157963,222,1026,1248
16,1/16/2011,1,0,1,0,1,0.231667,0.234217,0.48375,0.188433,251,953,1204
17,1/17/2011,1,0,1,1,2,0.175833,0.176771,0.5375,0.194017,117,883,1000
18,1/18/2011,1,0,1,2,2,0.216667,0.232333,0.861667,0.146775,9,674,683
19,1/19/2011,1,0,1,3,2,0.292174,0.298422,0.741739,0.208317,78,1572,1650
20,1/20/2011,1,0,1,4,2,0.261667,0.25505,0.538333,0.195904,83,1844,1927
21,1/21/2011,1,0,1,5,1,0.1775,0.157833,0.457083,0.353242,75,1468,1543
22,1/22/2011,1,0,1,6,1,0.0591304,0.0790696,0.4,0.17197,93,888,981
23,1/23/2011,1,0,1,0,1,0.0965217,0.0988391,0.436522,0.2466,150,836,986
24,1/24/2011,1,0,1,1,1,0.0973913,0.11793,0.491739,0.15833,86,1330,1416
25,1/25/2011,1,0,1,2,2,0.223478,0.234526,0.616957,0.129796,186,1799,1985
26,1/26/2011,1,0,1,3,3,0.2175,0.2036,0.8625,0.29385,34,472,506
27,1/27/2011,1,0,1,4,1,0.195,0.2197,0.6875,0.113837,15,416,431
28,1/28/2011,1,0,1,5,2,0.203478,0.223317,0.793043,0.1233,38,1129,1167
29,1/29/2011,1,0,1,6,1,0.196522,0.212126,0.651739,0.145365,123,975,1098
30,1/30/2011,1,0,1,0,1,0.216522,0.250322,0.722174,0.0739826,140,956,1096
31,1/31/2011,1,0,1,1,2,0.180833,0.18625,0.60375,0.187192,42,1459,1501
32,2/1/2011,1,0,2,2,2,0.192174,0.23453,0.829565,0.053213,47,1313,1360
33,2/2/2011,1,0,2,3,2,0.26,0.254417,0.775417,0.264308,72,1454,1526
34,2/3/2011,1,0,2,4,1,0.186957,0.177878,0.437826,0.277752,61,1489,1550
35,2/4/2011,1,0,2,5,2,0.211304,0.228587,0.585217,0.127839,88,1620,1708
36,2/5/2011,1,0,2,6,2,0.233333,0.243058,0.929167,0.161079,100,905,1005
37,2/6/2011,1,0,2,0,1,0.285833,0.291671,0.568333,0.1418,354,1269,1623
38,2/7/2011,1,0,2,1,1,0.271667,0.303658,0.738333,0.0454083,120,1592,1712
39,2/8/2011,1,0,2,2,1,0.220833,0.198246,0.537917,0.36195,64,1466,1530
40,2/9/2011,1,0,2,3,2,0.134783,0.144283,0.494783,0.188839,53,1552,1605
41,2/10/2011,1,0,2,4,1,0.144348,0.149548,0.437391,0.221935,47,1491,1538
42,2/11/2011,1,0,2,5,1,0.189091,0.213509,0.506364,0.10855,149,1597,1746
43,2/12/2011,1,0,2,6,1,0.2225,0.232954,0.544167,0.203367,288,1184,1472
44,2/13/2011,1,0,2,0,1,0.316522,0.324113,0.457391,0.260883,397,1192,1589
45,2/14/2011,1,0,2,1,1,0.415,0.39835,0.375833,0.417908,208,1705,1913
46,2/15/2011,1,0,2,2,1,0.266087,0.254274,0.314348,0.291374,140,1675,1815
47,2/16/2011,1,0,2,3,1,0.318261,0.3162,0.423478,0.251791,218,1897,2115
48,2/17/2011,1,0,2,4,1,0.435833,0.428658,0.505,0.230104,259,2216,2475
49,2/18/2011,1,0,2,5,1,0.521667,0.511983,0.516667,0.264925,579,2348,2927
50,2/19/2011,1,0,2,6,1,0.399167,0.391404,0.187917,0.507463,532,1103,1635
51,2/20/2011,1,0,2,0,1,0.285217,0.27733,0.407826,0.223235,639,1173,1812
52,2/21/2011,1,0,2,1,2,0.303333,0.284075,0.605,0.307846,195,912,1107
53,2/22/2011,1,0,2,2,1,0.182222,0.186033,0.577778,0.195683,74,1376,1450
54,2/23/2011,1,0,2,3,1,0.221739,0.245717,0.423043,0.094113,139,1778,1917
55,2/24/2011,1,0,2,4,2,0.295652,0.289191,0.697391,0.250496,100,1707,1807
56,2/25/2011,1,0,2,5,2,0.364348,0.350461,0.712174,0.346539,120,1341,1461
57,2/26/2011,1,0,2,6,1,0.2825,0.282192,0.537917,0.186571,424,1545,1969
58,2/27/2011,1,0,2,0,1,0.343478,0.351109,0.68,0.125248,694,1708,2402
59,2/28/2011,1,0,2,1,2,0.407273,0.400118,0.876364,0.289686,81,1365,1446
60,3/1/2011,1,0,3,2,1,0.266667,0.263879,0.535,0.216425,137,1714,1851
61,3/2/2011,1,0,3,3,1,0.335,0.320071,0.449583,0.307833,231,1903,2134
62,3/3/2011,1,0,3,4,1,0.198333,0.200133,0.318333,0.225754,123,1562,1685
63,3/4/2011,1,0,3,5,2,0.261667,0.255679,0.610417,0.203346,214,1730,1944
64,3/5/2011,1,0,3,6,2,0.384167,0.378779,0.789167,0.251871,640,1437,2077
65,3/6/2011,1,0,3,0,2,0.376522,0.366252,0.948261,0.343287,114,491,605
66,3/7/2011,1,0,3,1,1,0.261739,0.238461,0.551304,0.341352,244,1628,1872
67,3/8/2011,1,0,3,2,1,0.2925,0.3024,0.420833,0.12065,316,1817,2133
68,3/9/2011,1,0,3,3,2,0.295833,0.286608,0.775417,0.22015,191,1700,1891
69,3/10/2011,1,0,3,4,3,0.389091,0.385668,0,0.261877,46,577,623
70,3/11/2011,1,0,3,5,2,0.316522,0.305,0.649565,0.23297,247,1730,1977
71,3/12/2011,1,0,3,6,1,0.329167,0.32575,0.594583,0.220775,724,1408,2132
72,3/13/2011,1,0,3,0,1,0.384348,0.380091,0.527391,0.270604,982,1435,2417
73,3/14/2011,1,0,3,1,1,0.325217,0.332,0.496957,0.136926,359,1687,2046
74,3/15/2011,1,0,3,2,2,0.317391,0.318178,0.655652,0.184309,289,1767,2056
75,3/16/2011,1,0,3,3,2,0.365217,0.36693,0.776522,0.203117,321,1871,2192
76,3/17/2011,1,0,3,4,1,0.415,0.410333,0.602917,0.209579,424,2320,2744
77,3/18/2011,1,0,3,5,1,0.54,0.527009,0.525217,0.231017,884,2355,3239
78,3/19/2011,1,0,3,6,1,0.4725,0.466525,0.379167,0.368167,1424,1693,3117
79,3/20/2011,1,0,3,0,1,0.3325,0.32575,0.47375,0.207721,1047,1424,2471
80,3/21/2011,2,0,3,1,2,0.430435,0.409735,0.737391,0.288783,401,1676,2077
81,3/22/2011,2,0,3,2,1,0.441667,0.440642,0.624583,0.22575,460,2243,2703
82,3/23/2011,2,0,3,3,2,0.346957,0.337939,0.839565,0.234261,203,1918,2121
83,3/24/2011,2,0,3,4,2,0.285,0.270833,0.805833,0.243787,166,1699,1865
84,3/25/2011,2,0,3,5,1,0.264167,0.256312,0.495,0.230725,300,1910,2210
85,3/26/2011,2,0,3,6,1,0.265833,0.257571,0.394167,0.209571,981,1515,2496
86,3/27/2011,2,0,3,0,2,0.253043,0.250339,0.493913,0.1843,472,1221,1693
87,3/28/2011,2,0,3,1,1,0.264348,0.257574,0.302174,0.212204,222,1806,2028
88,3/29/2011,2,0,3,2,1,0.3025,0.292908,0.314167,0.226996,317,2108,2425
89,3/30/2011,2,0,3,3,2,0.3,0.29735,0.646667,0.172888,168,1368,1536
90,3/31/2011,2,0,3,4,3,0.268333,0.257575,0.918333,0.217646,179,1506,1685
91,4/1/2011,2,0,4,5,2,0.3,0.283454,0.68625,0.258708,307,1920,2227
92,4/2/2011,2,0,4,6,2,0.315,0.315637,0.65375,0.197146,898,1354,2252
93,4/3/2011,2,0,4,0,1,0.378333,0.378767,0.48,0.182213,1651,1598,3249
94,4/4/2011,2,0,4,1,1,0.573333,0.542929,0.42625,0.385571,734,2381,3115
95,4/5/2011,2,0,4,2,2,0.414167,0.39835,0.642083,0.388067,167,1628,1795
96,4/6/2011,2,0,4,3,1,0.390833,0.387608,0.470833,0.263063,413,2395,2808
97,4/7/2011,2,0,4,4,1,0.4375,0.433696,0.602917,0.162312,571,2570,3141
98,4/8/2011,2,0,4,5,2,0.335833,0.324479,0.83625,0.226992,172,1299,1471
99,4/9/2011,2,0,4,6,2,0.3425,0.341529,0.8775,0.133083,879,1576,2455
100,4/10/2011,2,0,4,0,2,0.426667,0.426737,0.8575,0.146767,1188,1707,2895
101,4/11/2011,2,0,4,1,2,0.595652,0.565217,0.716956,0.324474,855,2493,3348
102,4/12/2011,2,0,4,2,2,0.5025,0.493054,0.739167,0.274879,257,1777,2034
103,4/13/2011,2,0,4,3,2,0.4125,0.417283,0.819167,0.250617,209,1953,2162
104,4/14/2011,2,0,4,4,1,0.4675,0.462742,0.540417,0.1107,529,2738,3267
105,4/15/2011,2,0,4,5,1,0.446667,0.441913,0.67125,0.226375,642,2484,3126
106,4/16/2011,2,0,4,6,3,0.430833,0.425492,0.888333,0.340808,121,674,795
107,4/17/2011,2,0,4,0,1,0.456667,0.445696,0.479583,0.303496,1558,2186,3744
108,4/18/2011,2,0,4,1,1,0.5125,0.503146,0.5425,0.163567,669,2760,3429
109,4/19/2011,2,0,4,2,2,0.505833,0.489258,0.665833,0.157971,409,2795,3204
110,4/20/2011,2,0,4,3,1,0.595,0.564392,0.614167,0.241925,613,3331,3944
111,4/21/2011,2,0,4,4,1,0.459167,0.453892,0.407083,0.325258,745,3444,4189
112,4/22/2011,2,0,4,5,2,0.336667,0.321954,0.729583,0.219521,177,1506,1683
113,4/23/2011,2,0,4,6,2,0.46,0.450121,0.887917,0.230725,1462,2574,4036
114,4/24/2011,2,0,4,0,2,0.581667,0.551763,0.810833,0.192175,1710,2481,4191
115,4/25/2011,2,0,4,1,1,0.606667,0.5745,0.776667,0.185333,773,3300,4073
116,4/26/2011,2,0,4,2,1,0.631667,0.594083,0.729167,0.3265,678,3722,4400
117,4/27/2011,2,0,4,3,2,0.62,0.575142,0.835417,0.3122,547,3325,3872
118,4/28/2011,2,0,4,4,2,0.6175,0.578929,0.700833,0.320908,569,3489,4058
119,4/29/2011,2,0,4,5,1,0.51,0.497463,0.457083,0.240063,878,3717,4595
120,4/30/2011,2,0,4,6,1,0.4725,0.464021,0.503333,0.235075,1965,3347,5312
121,5/1/2011,2,0,5,0,2,0.451667,0.448204,0.762083,0.106354,1138,2213,3351
122,5/2/2011,2,0,5,1,2,0.549167,0.532833,0.73,0.183454,847,3554,4401
123,5/3/2011,2,0,5,2,2,0.616667,0.582079,0.697083,0.342667,603,3848,4451
124,5/4/2011,2,0,5,3,2,0.414167,0.40465,0.737083,0.328996,255,2378,2633
125,5/5/2011,2,0,5,4,1,0.459167,0.441917,0.444167,0.295392,614,3819,4433
126,5/6/2011,2,0,5,5,1,0.479167,0.474117,0.59,0.228246,894,3714,4608
127,5/7/2011,2,0,5,6,1,0.52,0.512621,0.54125,0.16045,1612,3102,4714
128,5/8/2011,2,0,5,0,1,0.528333,0.518933,0.631667,0.0746375,1401,2932,4333
129,5/9/2011,2,0,5,1,1,0.5325,0.525246,0.58875,0.176,664,3698,4362
130,5/10/2011,2,0,5,2,1,0.5325,0.522721,0.489167,0.115671,694,4109,4803
131,5/11/2011,2,0,5,3,1,0.5425,0.5284,0.632917,0.120642,550,3632,4182
132,5/12/2011,2,0,5,4,1,0.535,0.523363,0.7475,0.189667,695,4169,4864
133,5/13/2011,2,0,5,5,2,0.5125,0.4943,0.863333,0.179725,692,3413,4105
134,5/14/2011,2,0,5,6,2,0.520833,0.500629,0.9225,0.13495,902,2507,3409
135,5/15/2011,2,0,5,0,2,0.5625,0.536,0.867083,0.152979,1582,2971,4553
136,5/16/2011,2,0,5,1,1,0.5775,0.550512,0.787917,0.126871,773,3185,3958
137,5/17/2011,2,0,5,2,2,0.561667,0.538529,0.837917,0.277354,678,3445,4123
138,5/18/2011,2,0,5,3,2,0.55,0.527158,0.87,0.201492,536,3319,3855
139,5/19/2011,2,0,5,4,2,0.530833,0.510742,0.829583,0.108213,735,3840,4575
140,5/20/2011,2,0,5,5,1,0.536667,0.529042,0.719583,0.125013,909,4008,4917
141,5/21/2011,2,0,5,6,1,0.6025,0.571975,0.626667,0.12065,2258,3547,5805
142,5/22/2011,2,0,5,0,1,0.604167,0.5745,0.749583,0.148008,1576,3084,4660
143,5/23/2011,2,0,5,1,2,0.631667,0.590296,0.81,0.233842,836,3438,4274
144,5/24/2011,2,0,5,2,2,0.66,0.604813,0.740833,0.207092,659,3833,4492
145,5/25/2011,2,0,5,3,1,0.660833,0.615542,0.69625,0.154233,740,4238,4978
146,5/26/2011,2,0,5,4,1,0.708333,0.654688,0.6775,0.199642,758,3919,4677
147,5/27/2011,2,0,5,5,1,0.681667,0.637008,0.65375,0.240679,871,3808,4679
148,5/28/2011,2,0,5,6,1,0.655833,0.612379,0.729583,0.230092,2001,2757,4758
149,5/29/2011,2,0,5,0,1,0.6675,0.61555,0.81875,0.213938,2355,2433,4788
150,5/30/2011,2,0,5,1,1,0.733333,0.671092,0.685,0.131225,1549,2549,4098
151,5/31/2011,2,0,5,2,1,0.775,0.725383,0.636667,0.111329,673,3309,3982
152,6/1/2011,2,0,6,3,2,0.764167,0.720967,0.677083,0.207092,513,3461,3974
153,6/2/2011,2,0,6,4,1,0.715,0.643942,0.305,0.292287,736,4232,4968
154,6/3/2011,2,0,6,5,1,0.62,0.587133,0.354167,0.253121,898,4414,5312
155,6/4/2011,2,0,6,6,1,0.635,0.594696,0.45625,0.123142,1869,3473,5342
156,6/5/2011,2,0,6,0,2,0.648333,0.616804,0.6525,0.138692,1685,3221,4906
157,6/6/2011,2,0,6,1,1,0.678333,0.621858,0.6,0.121896,673,3875,4548
158,6/7/2011,2,0,6,2,1,0.7075,0.65595,0.597917,0.187808,763,4070,4833
159,6/8/2011,2,0,6,3,1,0.775833,0.727279,0.622083,0.136817,676,3725,4401
160,6/9/2011,2,0,6,4,2,0.808333,0.757579,0.568333,0.149883,563,3352,3915
161,6/10/2011,2,0,6,5,1,0.755,0.703292,0.605,0.140554,815,3771,4586
162,6/11/2011,2,0,6,6,1,0.725,0.678038,0.654583,0.15485,1729,3237,4966
163,6/12/2011,2,0,6,0,1,0.6925,0.643325,0.747917,0.163567,1467,2993,4460
164,6/13/2011,2,0,6,1,1,0.635,0.601654,0.494583,0.30535,863,4157,5020
165,6/14/2011,2,0,6,2,1,0.604167,0.591546,0.507083,0.269283,727,4164,4891
166,6/15/2011,2,0,6,3,1,0.626667,0.587754,0.471667,0.167912,769,4411,5180
167,6/16/2011,2,0,6,4,2,0.628333,0.595346,0.688333,0.206471,545,3222,3767
168,6/17/2011,2,0,6,5,1,0.649167,0.600383,0.735833,0.143029,863,3981,4844
169,6/18/2011,2,0,6,6,1,0.696667,0.643954,0.670417,0.119408,1807,3312,5119
170,6/19/2011,2,0,6,0,2,0.699167,0.645846,0.666667,0.102,1639,3105,4744
171,6/20/2011,2,0,6,1,2,0.635,0.595346,0.74625,0.155475,699,3311,4010
172,6/21/2011,3,0,6,2,2,0.680833,0.637646,0.770417,0.171025,774,4061,4835
173,6/22/2011,3,0,6,3,1,0.733333,0.693829,0.7075,0.172262,661,3846,4507
174,6/23/2011,3,0,6,4,2,0.728333,0.693833,0.703333,0.238804,746,4044,4790
175,6/24/2011,3,0,6,5,1,0.724167,0.656583,0.573333,0.222025,969,4022,4991
176,6/25/2011,3,0,6,6,1,0.695,0.643313,0.483333,0.209571,1782,3420,5202
177,6/26/2011,3,0,6,0,1,0.68,0.637629,0.513333,0.0945333,1920,3385,5305
178,6/27/2011,3,0,6,1,2,0.6825,0.637004,0.658333,0.107588,854,3854,4708
179,6/28/2011,3,0,6,2,1,0.744167,0.692558,0.634167,0.144283,732,3916,4648
180,6/29/2011,3,0,6,3,1,0.728333,0.654688,0.497917,0.261821,848,4377,5225
181,6/30/2011,3,0,6,4,1,0.696667,0.637008,0.434167,0.185312,1027,4488,5515
182,7/1/2011,3,0,7,5,1,0.7225,0.652162,0.39625,0.102608,1246,4116,5362
183,7/2/2011,3,0,7,6,1,0.738333,0.667308,0.444583,0.115062,2204,2915,5119
184,7/3/2011,3,0,7,0,2,0.716667,0.668575,0.6825,0.228858,2282,2367,4649
185,7/4/2011,3,0,7,1,2,0.726667,0.665417,0.637917,0.0814792,3065,2978,6043
186,7/5/2011,3,0,7,2,1,0.746667,0.696338,0.590417,0.126258,1031,3634,4665
187,7/6/2011,3,0,7,3,1,0.72,0.685633,0.743333,0.149883,784,3845,4629
188,7/7/2011,3,0,7,4,1,0.75,0.686871,0.65125,0.1592,754,3838,4592
189,7/8/2011,3,0,7,5,2,0.709167,0.670483,0.757917,0.225129,692,3348,4040
190,7/9/2011,3,0,7,6,1,0.733333,0.664158,0.609167,0.167912,1988,3348,5336
191,7/10/2011,3,0,7,0,1,0.7475,0.690025,0.578333,0.183471,1743,3138,4881
192,7/11/2011,3,0,7,1,1,0.7625,0.729804,0.635833,0.282337,723,3363,4086
193,7/12/2011,3,0,7,2,1,0.794167,0.739275,0.559167,0.200254,662,3596,4258
194,7/13/2011,3,0,7,3,1,0.746667,0.689404,0.631667,0.146133,748,3594,4342
195,7/14/2011,3,0,7,4,1,0.680833,0.635104,0.47625,0.240667,888,4196,5084
196,7/15/2011,3,0,7,5,1,0.663333,0.624371,0.59125,0.182833,1318,4220,5538
197,7/16/2011,3,0,7,6,1,0.686667,0.638263,0.585,0.208342,2418,3505,5923
198,7/17/2011,3,0,7,0,1,0.719167,0.669833,0.604167,0.245033,2006,3296,5302
199,7/18/2011,3,0,7,1,1,0.746667,0.703925,0.65125,0.215804,841,3617,4458
200,7/19/2011,3,0,7,2,1,0.776667,0.747479,0.650417,0.1306,752,3789,4541
201,7/20/2011,3,0,7,3,1,0.768333,0.74685,0.707083,0.113817,644,3688,4332
202,7/21/2011,3,0,7,4,2,0.815,0.826371,0.69125,0.222021,632,3152,3784
203,7/22/2011,3,0,7,5,1,0.848333,0.840896,0.580417,0.1331,562,2825,3387
204,7/23/2011,3,0,7,6,1,0.849167,0.804287,0.5,0.131221,987,2298,3285
205,7/24/2011,3,0,7,0,1,0.83,0.794829,0.550833,0.169171,1050,2556,3606
206,7/25/2011,3,0,7,1,1,0.743333,0.720958,0.757083,0.0908083,568,3272,3840
207,7/26/2011,3,0,7,2,1,0.771667,0.696979,0.540833,0.200258,750,3840,4590
208,7/27/2011,3,0,7,3,1,0.775,0.690667,0.402917,0.183463,755,3901,4656
209,7/28/2011,3,0,7,4,1,0.779167,0.7399,0.583333,0.178479,606,3784,4390
210,7/29/2011,3,0,7,5,1,0.838333,0.785967,0.5425,0.174138,670,3176,3846
211,7/30/2011,3,0,7,6,1,0.804167,0.728537,0.465833,0.168537,1559,2916,4475
212,7/31/2011,3,0,7,0,1,0.805833,0.729796,0.480833,0.164813,1524,2778,4302
213,8/1/2011,3,0,8,1,1,0.771667,0.703292,0.550833,0.156717,729,3537,4266
214,8/2/2011,3,0,8,2,1,0.783333,0.707071,0.49125,0.20585,801,4044,4845
215,8/3/2011,3,0,8,3,2,0.731667,0.679937,0.6575,0.135583,467,3107,3574
216,8/4/2011,3,0,8,4,2,0.71,0.664788,0.7575,0.19715,799,3777,4576
217,8/5/2011,3,0,8,5,1,0.710833,0.656567,0.630833,0.184696,1023,3843,4866
218,8/6/2011,3,0,8,6,2,0.716667,0.676154,0.755,0.22825,1521,2773,4294
219,8/7/2011,3,0,8,0,1,0.7425,0.715292,0.752917,0.201487,1298,2487,3785
220,8/8/2011,3,0,8,1,1,0.765,0.703283,0.592083,0.192175,846,3480,4326
221,8/9/2011,3,0,8,2,1,0.775,0.724121,0.570417,0.151121,907,3695,4602
222,8/10/2011,3,0,8,3,1,0.766667,0.684983,0.424167,0.200258,884,3896,4780
223,8/11/2011,3,0,8,4,1,0.7175,0.651521,0.42375,0.164796,812,3980,4792
224,8/12/2011,3,0,8,5,1,0.708333,0.654042,0.415,0.125621,1051,3854,4905
225,8/13/2011,3,0,8,6,2,0.685833,0.645858,0.729583,0.211454,1504,2646,4150
226,8/14/2011,3,0,8,0,2,0.676667,0.624388,0.8175,0.222633,1338,2482,3820
227,8/15/2011,3,0,8,1,1,0.665833,0.616167,0.712083,0.208954,775,3563,4338
228,8/16/2011,3,0,8,2,1,0.700833,0.645837,0.578333,0.236329,721,4004,4725
229,8/17/2011,3,0,8,3,1,0.723333,0.666671,0.575417,0.143667,668,4026,4694
230,8/18/2011,3,0,8,4,1,0.711667,0.662258,0.654583,0.233208,639,3166,3805
231,8/19/2011,3,0,8,5,2,0.685,0.633221,0.722917,0.139308,797,3356,4153
232,8/20/2011,3,0,8,6,1,0.6975,0.648996,0.674167,0.104467,1914,3277,5191
233,8/21/2011,3,0,8,0,1,0.710833,0.675525,0.77,0.248754,1249,2624,3873
234,8/22/2011,3,0,8,1,1,0.691667,0.638254,0.47,0.27675,833,3925,4758
235,8/23/2011,3,0,8,2,1,0.640833,0.606067,0.455417,0.146763,1281,4614,5895
236,8/24/2011,3,0,8,3,1,0.673333,0.630692,0.605,0.253108,949,4181,5130
237,8/25/2011,3,0,8,4,2,0.684167,0.645854,0.771667,0.210833,435,3107,3542
238,8/26/2011,3,0,8,5,1,0.7,0.659733,0.76125,0.0839625,768,3893,4661
239,8/27/2011,3,0,8,6,2,0.68,0.635556,0.85,0.375617,226,889,1115
240,8/28/2011,3,0,8,0,1,0.707059,0.647959,0.561765,0.304659,1415,2919,4334
241,8/29/2011,3,0,8,1,1,0.636667,0.607958,0.554583,0.159825,729,3905,4634
242,8/30/2011,3,0,8,2,1,0.639167,0.594704,0.548333,0.125008,775,4429,5204
243,8/31/2011,3,0,8,3,1,0.656667,0.611121,0.597917,0.0833333,688,4370,5058
244,9/1/2011,3,0,9,4,1,0.655,0.614921,0.639167,0.141796,783,4332,5115
245,9/2/2011,3,0,9,5,2,0.643333,0.604808,0.727083,0.139929,875,3852,4727
246,9/3/2011,3,0,9,6,1,0.669167,0.633213,0.716667,0.185325,1935,2549,4484
247,9/4/2011,3,0,9,0,1,0.709167,0.665429,0.742083,0.206467,2521,2419,4940
248,9/5/2011,3,0,9,1,2,0.673333,0.625646,0.790417,0.212696,1236,2115,3351
249,9/6/2011,3,0,9,2,3,0.54,0.5152,0.886957,0.343943,204,2506,2710
250,9/7/2011,3,0,9,3,3,0.599167,0.544229,0.917083,0.0970208,118,1878,1996
251,9/8/2011,3,0,9,4,3,0.633913,0.555361,0.939565,0.192748,153,1689,1842
252,9/9/2011,3,0,9,5,2,0.65,0.578946,0.897917,0.124379,417,3127,3544
253,9/10/2011,3,0,9,6,1,0.66,0.607962,0.75375,0.153608,1750,3595,5345
254,9/11/2011,3,0,9,0,1,0.653333,0.609229,0.71375,0.115054,1633,3413,5046
255,9/12/2011,3,0,9,1,1,0.644348,0.60213,0.692174,0.088913,690,4023,4713
256,9/13/2011,3,0,9,2,1,0.650833,0.603554,0.7125,0.141804,701,4062,4763
257,9/14/2011,3,0,9,3,1,0.673333,0.6269,0.697083,0.1673,647,4138,4785
258,9/15/2011,3,0,9,4,2,0.5775,0.553671,0.709167,0.271146,428,3231,3659
259,9/16/2011,3,0,9,5,2,0.469167,0.461475,0.590417,0.164183,742,4018,4760
260,9/17/2011,3,0,9,6,2,0.491667,0.478512,0.718333,0.189675,1434,3077,4511
261,9/18/2011,3,0,9,0,1,0.5075,0.490537,0.695,0.178483,1353,2921,4274
262,9/19/2011,3,0,9,1,2,0.549167,0.529675,0.69,0.151742,691,3848,4539
263,9/20/2011,3,0,9,2,2,0.561667,0.532217,0.88125,0.134954,438,3203,3641
264,9/21/2011,3,0,9,3,2,0.595,0.550533,0.9,0.0964042,539,3813,4352
265,9/22/2011,3,0,9,4,2,0.628333,0.554963,0.902083,0.128125,555,4240,4795
266,9/23/2011,4,0,9,5,2,0.609167,0.522125,0.9725,0.0783667,258,2137,2395
267,9/24/2011,4,0,9,6,2,0.606667,0.564412,0.8625,0.0783833,1776,3647,5423
268,9/25/2011,4,0,9,0,2,0.634167,0.572637,0.845,0.0503792,1544,3466,5010
269,9/26/2011,4,0,9,1,2,0.649167,0.589042,0.848333,0.1107,684,3946,4630
270,9/27/2011,4,0,9,2,2,0.636667,0.574525,0.885417,0.118171,477,3643,4120
271,9/28/2011,4,0,9,3,2,0.635,0.575158,0.84875,0.148629,480,3427,3907
272,9/29/2011,4,0,9,4,1,0.616667,0.574512,0.699167,0.172883,653,4186,4839
273,9/30/2011,4,0,9,5,1,0.564167,0.544829,0.6475,0.206475,830,4372,5202
274,10/1/2011,4,0,10,6,2,0.41,0.412863,0.75375,0.292296,480,1949,2429
275,10/2/2011,4,0,10,0,2,0.356667,0.345317,0.791667,0.222013,616,2302,2918
276,10/3/2011,4,0,10,1,2,0.384167,0.392046,0.760833,0.0833458,330,3240,3570
277,10/4/2011,4,0,10,2,1,0.484167,0.472858,0.71,0.205854,486,3970,4456
278,10/5/2011,4,0,10,3,1,0.538333,0.527138,0.647917,0.17725,559,4267,4826
279,10/6/2011,4,0,10,4,1,0.494167,0.480425,0.620833,0.134954,639,4126,4765
280,10/7/2011,4,0,10,5,1,0.510833,0.504404,0.684167,0.0223917,949,4036,4985
281,10/8/2011,4,0,10,6,1,0.521667,0.513242,0.70125,0.0454042,2235,3174,5409
282,10/9/2011,4,0,10,0,1,0.540833,0.523983,0.7275,0.06345,2397,3114,5511
283,10/10/2011,4,0,10,1,1,0.570833,0.542925,0.73375,0.0423042,1514,3603,5117
284,10/11/2011,4,0,10,2,2,0.566667,0.546096,0.80875,0.143042,667,3896,4563
285,10/12/2011,4,0,10,3,3,0.543333,0.517717,0.90625,0.24815,217,2199,2416
286,10/13/2011,4,0,10,4,2,0.589167,0.551804,0.896667,0.141787,290,2623,2913
287,10/14/2011,4,0,10,5,2,0.550833,0.529675,0.71625,0.223883,529,3115,3644
288,10/15/2011,4,0,10,6,1,0.506667,0.498725,0.483333,0.258083,1899,3318,5217
289,10/16/2011,4,0,10,0,1,0.511667,0.503154,0.486667,0.281717,1748,3293,5041
290,10/17/2011,4,0,10,1,1,0.534167,0.510725,0.579583,0.175379,713,3857,4570
291,10/18/2011,4,0,10,2,2,0.5325,0.522721,0.701667,0.110087,637,4111,4748
292,10/19/2011,4,0,10,3,3,0.541739,0.513848,0.895217,0.243339,254,2170,2424
293,10/20/2011,4,0,10,4,1,0.475833,0.466525,0.63625,0.422275,471,3724,4195
294,10/21/2011,4,0,10,5,1,0.4275,0.423596,0.574167,0.221396,676,3628,4304
295,10/22/2011,4,0,10,6,1,0.4225,0.425492,0.629167,0.0926667,1499,2809,4308
296,10/23/2011,4,0,10,0,1,0.421667,0.422333,0.74125,0.0995125,1619,2762,4381
297,10/24/2011,4,0,10,1,1,0.463333,0.457067,0.772083,0.118792,699,3488,4187
298,10/25/2011,4,0,10,2,1,0.471667,0.463375,0.622917,0.166658,695,3992,4687
299,10/26/2011,4,0,10,3,2,0.484167,0.472846,0.720417,0.148642,404,3490,3894
300,10/27/2011,4,0,10,4,2,0.47,0.457046,0.812917,0.197763,240,2419,2659
301,10/28/2011,4,0,10,5,2,0.330833,0.318812,0.585833,0.229479,456,3291,3747
302,10/29/2011,4,0,10,6,3,0.254167,0.227913,0.8825,0.351371,57,570,627
303,10/30/2011,4,0,10,0,1,0.319167,0.321329,0.62375,0.176617,885,2446,3331
304,10/31/2011,4,0,10,1,1,0.34,0.356063,0.703333,0.10635,362,3307,3669
305,11/1/2011,4,0,11,2,1,0.400833,0.397088,0.68375,0.135571,410,3658,4068
306,11/2/2011,4,0,11,3,1,0.3775,0.390133,0.71875,0.0820917,370,3816,4186
307,11/3/2011,4,0,11,4,1,0.408333,0.405921,0.702083,0.136817,318,3656,3974
308,11/4/2011,4,0,11,5,2,0.403333,0.403392,0.6225,0.271779,470,3576,4046
309,11/5/2011,4,0,11,6,1,0.326667,0.323854,0.519167,0.189062,1156,2770,3926
310,11/6/2011,4,0,11,0,1,0.348333,0.362358,0.734583,0.0920542,952,2697,3649
311,11/7/2011,4,0,11,1,1,0.395,0.400871,0.75875,0.057225,373,3662,4035
312,11/8/2011,4,0,11,2,1,0.408333,0.412246,0.721667,0.0690375,376,3829,4205
313,11/9/2011,4,0,11,3,1,0.4,0.409079,0.758333,0.0621958,305,3804,4109
314,11/10/2011,4,0,11,4,2,0.38,0.373721,0.813333,0.189067,190,2743,2933
315,11/11/2011,4,0,11,5,1,0.324167,0.306817,0.44625,0.314675,440,2928,3368
316,11/12/2011,4,0,11,6,1,0.356667,0.357942,0.552917,0.212062,1275,2792,4067
317,11/13/2011,4,0,11,0,1,0.440833,0.43055,0.458333,0.281721,1004,2713,3717
318,11/14/2011,4,0,11,1,1,0.53,0.524612,0.587083,0.306596,595,3891,4486
319,11/15/2011,4,0,11,2,2,0.53,0.507579,0.68875,0.199633,449,3746,4195
320,11/16/2011,4,0,11,3,3,0.456667,0.451988,0.93,0.136829,145,1672,1817
321,11/17/2011,4,0,11,4,2,0.341667,0.323221,0.575833,0.305362,139,2914,3053
322,11/18/2011,4,0,11,5,1,0.274167,0.272721,0.41,0.168533,245,3147,3392
323,11/19/2011,4,0,11,6,1,0.329167,0.324483,0.502083,0.224496,943,2720,3663
324,11/20/2011,4,0,11,0,2,0.463333,0.457058,0.684583,0.18595,787,2733,3520
325,11/21/2011,4,0,11,1,3,0.4475,0.445062,0.91,0.138054,220,2545,2765
326,11/22/2011,4,0,11,2,3,0.416667,0.421696,0.9625,0.118792,69,1538,1607
327,11/23/2011,4,0,11,3,2,0.440833,0.430537,0.757917,0.335825,112,2454,2566
328,11/24/2011,4,0,11,4,1,0.373333,0.372471,0.549167,0.167304,560,935,1495
329,11/25/2011,4,0,11,5,1,0.375,0.380671,0.64375,0.0988958,1095,1697,2792
330,11/26/2011,4,0,11,6,1,0.375833,0.385087,0.681667,0.0684208,1249,1819,3068
331,11/27/2011,4,0,11,0,1,0.459167,0.4558,0.698333,0.208954,810,2261,3071
332,11/28/2011,4,0,11,1,1,0.503478,0.490122,0.743043,0.142122,253,3614,3867
333,11/29/2011,4,0,11,2,2,0.458333,0.451375,0.830833,0.258092,96,2818,2914
334,11/30/2011,4,0,11,3,1,0.325,0.311221,0.613333,0.271158,188,3425,3613
335,12/1/2011,4,0,12,4,1,0.3125,0.305554,0.524583,0.220158,182,3545,3727
336,12/2/2011,4,0,12,5,1,0.314167,0.331433,0.625833,0.100754,268,3672,3940
337,12/3/2011,4,0,12,6,1,0.299167,0.310604,0.612917,0.0957833,706,2908,3614
338,12/4/2011,4,0,12,0,1,0.330833,0.3491,0.775833,0.0839583,634,2851,3485
339,12/5/2011,4,0,12,1,2,0.385833,0.393925,0.827083,0.0622083,233,3578,3811
340,12/6/2011,4,0,12,2,3,0.4625,0.4564,0.949583,0.232583,126,2468,2594
341,12/7/2011,4,0,12,3,3,0.41,0.400246,0.970417,0.266175,50,655,705
342,12/8/2011,4,0,12,4,1,0.265833,0.256938,0.58,0.240058,150,3172,3322
343,12/9/2011,4,0,12,5,1,0.290833,0.317542,0.695833,0.0827167,261,3359,3620
344,12/10/2011,4,0,12,6,1,0.275,0.266412,0.5075,0.233221,502,2688,3190
345,12/11/2011,4,0,12,0,1,0.220833,0.253154,0.49,0.0665417,377,2366,2743
346,12/12/2011,4,0,12,1,1,0.238333,0.270196,0.670833,0.06345,143,3167,3310
347,12/13/2011,4,0,12,2,1,0.2825,0.301138,0.59,0.14055,155,3368,3523
348,12/14/2011,4,0,12,3,2,0.3175,0.338362,0.66375,0.0609583,178,3562,3740
349,12/15/2011,4,0,12,4,2,0.4225,0.412237,0.634167,0.268042,181,3528,3709
350,12/16/2011,4,0,12,5,2,0.375,0.359825,0.500417,0.260575,178,3399,3577
351,12/17/2011,4,0,12,6,2,0.258333,0.249371,0.560833,0.243167,275,2464,2739
352,12/18/2011,4,0,12,0,1,0.238333,0.245579,0.58625,0.169779,220,2211,2431
353,12/19/2011,4,0,12,1,1,0.276667,0.280933,0.6375,0.172896,260,3143,3403
354,12/20/2011,4,0,12,2,2,0.385833,0.396454,0.595417,0.0615708,216,3534,3750
355,12/21/2011,1,0,12,3,2,0.428333,0.428017,0.858333,0.2214,107,2553,2660
356,12/22/2011,1,0,12,4,2,0.423333,0.426121,0.7575,0.047275,227,2841,3068
357,12/23/2011,1,0,12,5,1,0.373333,0.377513,0.68625,0.274246,163,2046,2209
358,12/24/2011,1,0,12,6,1,0.3025,0.299242,0.5425,0.190304,155,856,1011
359,12/25/2011,1,0,12,0,1,0.274783,0.279961,0.681304,0.155091,303,451,754
360,12/26/2011,1,0,12,1,1,0.321739,0.315535,0.506957,0.239465,430,887,1317
361,12/27/2011,1,0,12,2,2,0.325,0.327633,0.7625,0.18845,103,1059,1162
362,12/28/2011,1,0,12,3,1,0.29913,0.279974,0.503913,0.293961,255,2047,2302
363,12/29/2011,1,0,12,4,1,0.248333,0.263892,0.574167,0.119412,254,2169,2423
364,12/30/2011,1,0,12,5,1,0.311667,0.318812,0.636667,0.134337,491,2508,2999
365,12/31/2011,1,0,12,6,1,0.41,0.414121,0.615833,0.220154,665,1820,2485
366,1/1/2012,1,1,1,0,1,0.37,0.375621,0.6925,0.192167,686,1608,2294
367,1/2/2012,1,1,1,1,1,0.273043,0.252304,0.381304,0.329665,244,1707,1951
368,1/3/2012,1,1,1,2,1,0.15,0.126275,0.44125,0.365671,89,2147,2236
369,1/4/2012,1,1,1,3,2,0.1075,0.119337,0.414583,0.1847,95,2273,2368
370,1/5/2012,1,1,1,4,1,0.265833,0.278412,0.524167,0.129987,140,3132,3272
371,1/6/2012,1,1,1,5,1,0.334167,0.340267,0.542083,0.167908,307,3791,4098
372,1/7/2012,1,1,1,6,1,0.393333,0.390779,0.531667,0.174758,1070,3451,4521
373,1/8/2012,1,1,1,0,1,0.3375,0.340258,0.465,0.191542,599,2826,3425
374,1/9/2012,1,1,1,1,2,0.224167,0.247479,0.701667,0.0989,106,2270,2376
375,1/10/2012,1,1,1,2,1,0.308696,0.318826,0.646522,0.187552,173,3425,3598
376,1/11/2012,1,1,1,3,2,0.274167,0.282821,0.8475,0.131221,92,2085,2177
377,1/12/2012,1,1,1,4,2,0.3825,0.381938,0.802917,0.180967,269,3828,4097
378,1/13/2012,1,1,1,5,1,0.274167,0.249362,0.5075,0.378108,174,3040,3214
379,1/14/2012,1,1,1,6,1,0.18,0.183087,0.4575,0.187183,333,2160,2493
380,1/15/2012,1,1,1,0,1,0.166667,0.161625,0.419167,0.251258,284,2027,2311
381,1/16/2012,1,1,1,1,1,0.19,0.190663,0.5225,0.231358,217,2081,2298
382,1/17/2012,1,1,1,2,2,0.373043,0.364278,0.716087,0.34913,127,2808,2935
383,1/18/2012,1,1,1,3,1,0.303333,0.275254,0.443333,0.415429,109,3267,3376
384,1/19/2012,1,1,1,4,1,0.19,0.190038,0.4975,0.220158,130,3162,3292
385,1/20/2012,1,1,1,5,2,0.2175,0.220958,0.45,0.20275,115,3048,3163
386,1/21/2012,1,1,1,6,2,0.173333,0.174875,0.83125,0.222642,67,1234,1301
387,1/22/2012,1,1,1,0,2,0.1625,0.16225,0.79625,0.199638,196,1781,1977
388,1/23/2012,1,1,1,1,2,0.218333,0.243058,0.91125,0.110708,145,2287,2432
389,1/24/2012,1,1,1,2,1,0.3425,0.349108,0.835833,0.123767,439,3900,4339
390,1/25/2012,1,1,1,3,1,0.294167,0.294821,0.64375,0.161071,467,3803,4270
391,1/26/2012,1,1,1,4,2,0.341667,0.35605,0.769583,0.0733958,244,3831,4075
392,1/27/2012,1,1,1,5,2,0.425,0.415383,0.74125,0.342667,269,3187,3456
393,1/28/2012,1,1,1,6,1,0.315833,0.326379,0.543333,0.210829,775,3248,4023
394,1/29/2012,1,1,1,0,1,0.2825,0.272721,0.31125,0.24005,558,2685,3243
395,1/30/2012,1,1,1,1,1,0.269167,0.262625,0.400833,0.215792,126,3498,3624
396,1/31/2012,1,1,1,2,1,0.39,0.381317,0.416667,0.261817,324,4185,4509
397,2/1/2012,1,1,2,3,1,0.469167,0.466538,0.507917,0.189067,304,4275,4579
398,2/2/2012,1,1,2,4,2,0.399167,0.398971,0.672917,0.187187,190,3571,3761
399,2/3/2012,1,1,2,5,1,0.313333,0.309346,0.526667,0.178496,310,3841,4151
400,2/4/2012,1,1,2,6,2,0.264167,0.272725,0.779583,0.121896,384,2448,2832
401,2/5/2012,1,1,2,0,2,0.265833,0.264521,0.687917,0.175996,318,2629,2947
402,2/6/2012,1,1,2,1,1,0.282609,0.296426,0.622174,0.1538,206,3578,3784
403,2/7/2012,1,1,2,2,1,0.354167,0.361104,0.49625,0.147379,199,4176,4375
404,2/8/2012,1,1,2,3,2,0.256667,0.266421,0.722917,0.133721,109,2693,2802
405,2/9/2012,1,1,2,4,1,0.265,0.261988,0.562083,0.194037,163,3667,3830
406,2/10/2012,1,1,2,5,2,0.280833,0.293558,0.54,0.116929,227,3604,3831
407,2/11/2012,1,1,2,6,3,0.224167,0.210867,0.73125,0.289796,192,1977,2169
408,2/12/2012,1,1,2,0,1,0.1275,0.101658,0.464583,0.409212,73,1456,1529
409,2/13/2012,1,1,2,1,1,0.2225,0.227913,0.41125,0.167283,94,3328,3422
410,2/14/2012,1,1,2,2,2,0.319167,0.333946,0.50875,0.141179,135,3787,3922
411,2/15/2012,1,1,2,3,1,0.348333,0.351629,0.53125,0.1816,141,4028,4169
412,2/16/2012,1,1,2,4,2,0.316667,0.330162,0.752917,0.091425,74,2931,3005
413,2/17/2012,1,1,2,5,1,0.343333,0.351629,0.634583,0.205846,349,3805,4154
414,2/18/2012,1,1,2,6,1,0.346667,0.355425,0.534583,0.190929,1435,2883,4318
415,2/19/2012,1,1,2,0,2,0.28,0.265788,0.515833,0.253112,618,2071,2689
416,2/20/2012,1,1,2,1,1,0.28,0.273391,0.507826,0.229083,502,2627,3129
417,2/21/2012,1,1,2,2,1,0.287826,0.295113,0.594348,0.205717,163,3614,3777
418,2/22/2012,1,1,2,3,1,0.395833,0.392667,0.567917,0.234471,394,4379,4773
419,2/23/2012,1,1,2,4,1,0.454167,0.444446,0.554583,0.190913,516,4546,5062
420,2/24/2012,1,1,2,5,2,0.4075,0.410971,0.7375,0.237567,246,3241,3487
421,2/25/2012,1,1,2,6,1,0.290833,0.255675,0.395833,0.421642,317,2415,2732
422,2/26/2012,1,1,2,0,1,0.279167,0.268308,0.41,0.205229,515,2874,3389
423,2/27/2012,1,1,2,1,1,0.366667,0.357954,0.490833,0.268033,253,4069,4322
424,2/28/2012,1,1,2,2,1,0.359167,0.353525,0.395833,0.193417,229,4134,4363
425,2/29/2012,1,1,2,3,2,0.344348,0.34847,0.804783,0.179117,65,1769,1834
426,3/1/2012,1,1,3,4,1,0.485833,0.475371,0.615417,0.226987,325,4665,4990
427,3/2/2012,1,1,3,5,2,0.353333,0.359842,0.657083,0.144904,246,2948,3194
428,3/3/2012,1,1,3,6,2,0.414167,0.413492,0.62125,0.161079,956,3110,4066
429,3/4/2012,1,1,3,0,1,0.325833,0.303021,0.403333,0.334571,710,2713,3423
430,3/5/2012,1,1,3,1,1,0.243333,0.241171,0.50625,0.228858,203,3130,3333
431,3/6/2012,1,1,3,2,1,0.258333,0.255042,0.456667,0.200875,221,3735,3956
432,3/7/2012,1,1,3,3,1,0.404167,0.3851,0.513333,0.345779,432,4484,4916
433,3/8/2012,1,1,3,4,1,0.5275,0.524604,0.5675,0.441563,486,4896,5382
434,3/9/2012,1,1,3,5,2,0.410833,0.397083,0.407083,0.4148,447,4122,4569
435,3/10/2012,1,1,3,6,1,0.2875,0.277767,0.350417,0.22575,968,3150,4118
436,3/11/2012,1,1,3,0,1,0.361739,0.35967,0.476957,0.222587,1658,3253,4911
437,3/12/2012,1,1,3,1,1,0.466667,0.459592,0.489167,0.207713,838,4460,5298
438,3/13/2012,1,1,3,2,1,0.565,0.542929,0.6175,0.23695,762,5085,5847
439,3/14/2012,1,1,3,3,1,0.5725,0.548617,0.507083,0.115062,997,5315,6312
440,3/15/2012,1,1,3,4,1,0.5575,0.532825,0.579583,0.149883,1005,5187,6192
441,3/16/2012,1,1,3,5,2,0.435833,0.436229,0.842083,0.113192,548,3830,4378
442,3/17/2012,1,1,3,6,2,0.514167,0.505046,0.755833,0.110704,3155,4681,7836
443,3/18/2012,1,1,3,0,2,0.4725,0.464,0.81,0.126883,2207,3685,5892
444,3/19/2012,1,1,3,1,1,0.545,0.532821,0.72875,0.162317,982,5171,6153
445,3/20/2012,1,1,3,2,1,0.560833,0.538533,0.807917,0.121271,1051,5042,6093
446,3/21/2012,2,1,3,3,2,0.531667,0.513258,0.82125,0.0895583,1122,5108,6230
447,3/22/2012,2,1,3,4,1,0.554167,0.531567,0.83125,0.117562,1334,5537,6871
448,3/23/2012,2,1,3,5,2,0.601667,0.570067,0.694167,0.1163,2469,5893,8362
449,3/24/2012,2,1,3,6,2,0.5025,0.486733,0.885417,0.192783,1033,2339,3372
450,3/25/2012,2,1,3,0,2,0.4375,0.437488,0.880833,0.220775,1532,3464,4996
451,3/26/2012,2,1,3,1,1,0.445833,0.43875,0.477917,0.386821,795,4763,5558
452,3/27/2012,2,1,3,2,1,0.323333,0.315654,0.29,0.187192,531,4571,5102
453,3/28/2012,2,1,3,3,1,0.484167,0.47095,0.48125,0.291671,674,5024,5698
454,3/29/2012,2,1,3,4,1,0.494167,0.482304,0.439167,0.31965,834,5299,6133
455,3/30/2012,2,1,3,5,2,0.37,0.375621,0.580833,0.138067,796,4663,5459
456,3/31/2012,2,1,3,6,2,0.424167,0.421708,0.738333,0.250617,2301,3934,6235
457,4/1/2012,2,1,4,0,2,0.425833,0.417287,0.67625,0.172267,2347,3694,6041
458,4/2/2012,2,1,4,1,1,0.433913,0.427513,0.504348,0.312139,1208,4728,5936
459,4/3/2012,2,1,4,2,1,0.466667,0.461483,0.396667,0.100133,1348,5424,6772
460,4/4/2012,2,1,4,3,1,0.541667,0.53345,0.469583,0.180975,1058,5378,6436
461,4/5/2012,2,1,4,4,1,0.435,0.431163,0.374167,0.219529,1192,5265,6457
462,4/6/2012,2,1,4,5,1,0.403333,0.390767,0.377083,0.300388,1807,4653,6460
463,4/7/2012,2,1,4,6,1,0.4375,0.426129,0.254167,0.274871,3252,3605,6857
464,4/8/2012,2,1,4,0,1,0.5,0.492425,0.275833,0.232596,2230,2939,5169
465,4/9/2012,2,1,4,1,1,0.489167,0.476638,0.3175,0.358196,905,4680,5585
466,4/10/2012,2,1,4,2,1,0.446667,0.436233,0.435,0.249375,819,5099,5918
467,4/11/2012,2,1,4,3,1,0.348696,0.337274,0.469565,0.295274,482,4380,4862
468,4/12/2012,2,1,4,4,1,0.3975,0.387604,0.46625,0.290429,663,4746,5409
469,4/13/2012,2,1,4,5,1,0.4425,0.431808,0.408333,0.155471,1252,5146,6398
470,4/14/2012,2,1,4,6,1,0.495,0.487996,0.502917,0.190917,2795,4665,7460
471,4/15/2012,2,1,4,0,1,0.606667,0.573875,0.507917,0.225129,2846,4286,7132
472,4/16/2012,2,1,4,1,1,0.664167,0.614925,0.561667,0.284829,1198,5172,6370
473,4/17/2012,2,1,4,2,1,0.608333,0.598487,0.390417,0.273629,989,5702,6691
474,4/18/2012,2,1,4,3,2,0.463333,0.457038,0.569167,0.167912,347,4020,4367
475,4/19/2012,2,1,4,4,1,0.498333,0.493046,0.6125,0.0659292,846,5719,6565
476,4/20/2012,2,1,4,5,1,0.526667,0.515775,0.694583,0.149871,1340,5950,7290
477,4/21/2012,2,1,4,6,1,0.57,0.542921,0.682917,0.283587,2541,4083,6624
478,4/22/2012,2,1,4,0,3,0.396667,0.389504,0.835417,0.344546,120,907,1027
479,4/23/2012,2,1,4,1,2,0.321667,0.301125,0.766667,0.303496,195,3019,3214
480,4/24/2012,2,1,4,2,1,0.413333,0.405283,0.454167,0.249383,518,5115,5633
481,4/25/2012,2,1,4,3,1,0.476667,0.470317,0.427917,0.118792,655,5541,6196
482,4/26/2012,2,1,4,4,2,0.498333,0.483583,0.756667,0.176625,475,4551,5026
483,4/27/2012,2,1,4,5,1,0.4575,0.452637,0.400833,0.347633,1014,5219,6233
484,4/28/2012,2,1,4,6,2,0.376667,0.377504,0.489583,0.129975,1120,3100,4220
485,4/29/2012,2,1,4,0,1,0.458333,0.450121,0.587083,0.116908,2229,4075,6304
486,4/30/2012,2,1,4,1,2,0.464167,0.457696,0.57,0.171638,665,4907,5572
487,5/1/2012,2,1,5,2,2,0.613333,0.577021,0.659583,0.156096,653,5087,5740
488,5/2/2012,2,1,5,3,1,0.564167,0.537896,0.797083,0.138058,667,5502,6169
489,5/3/2012,2,1,5,4,2,0.56,0.537242,0.768333,0.133696,764,5657,6421
490,5/4/2012,2,1,5,5,1,0.6275,0.590917,0.735417,0.162938,1069,5227,6296
491,5/5/2012,2,1,5,6,2,0.621667,0.584608,0.756667,0.152992,2496,4387,6883
492,5/6/2012,2,1,5,0,2,0.5625,0.546737,0.74,0.149879,2135,4224,6359
493,5/7/2012,2,1,5,1,2,0.5375,0.527142,0.664167,0.230721,1008,5265,6273
494,5/8/2012,2,1,5,2,2,0.581667,0.557471,0.685833,0.296029,738,4990,5728
495,5/9/2012,2,1,5,3,2,0.575,0.553025,0.744167,0.216412,620,4097,4717
496,5/10/2012,2,1,5,4,1,0.505833,0.491783,0.552083,0.314063,1026,5546,6572
497,5/11/2012,2,1,5,5,1,0.533333,0.520833,0.360417,0.236937,1319,5711,7030
498,5/12/2012,2,1,5,6,1,0.564167,0.544817,0.480417,0.123133,2622,4807,7429
499,5/13/2012,2,1,5,0,1,0.6125,0.585238,0.57625,0.225117,2172,3946,6118
500,5/14/2012,2,1,5,1,2,0.573333,0.5499,0.789583,0.212692,342,2501,2843
501,5/15/2012,2,1,5,2,2,0.611667,0.576404,0.794583,0.147392,625,4490,5115
502,5/16/2012,2,1,5,3,1,0.636667,0.595975,0.697917,0.122512,991,6433,7424
503,5/17/2012,2,1,5,4,1,0.593333,0.572613,0.52,0.229475,1242,6142,7384
504,5/18/2012,2,1,5,5,1,0.564167,0.551121,0.523333,0.136817,1521,6118,7639
505,5/19/2012,2,1,5,6,1,0.6,0.566908,0.45625,0.083975,3410,4884,8294
506,5/20/2012,2,1,5,0,1,0.620833,0.583967,0.530417,0.254367,2704,4425,7129
507,5/21/2012,2,1,5,1,2,0.598333,0.565667,0.81125,0.233204,630,3729,4359
508,5/22/2012,2,1,5,2,2,0.615,0.580825,0.765833,0.118167,819,5254,6073
509,5/23/2012,2,1,5,3,2,0.621667,0.584612,0.774583,0.102,766,4494,5260
510,5/24/2012,2,1,5,4,1,0.655,0.6067,0.716667,0.172896,1059,5711,6770
511,5/25/2012,2,1,5,5,1,0.68,0.627529,0.747083,0.14055,1417,5317,6734
512,5/26/2012,2,1,5,6,1,0.6925,0.642696,0.7325,0.198992,2855,3681,6536
513,5/27/2012,2,1,5,0,1,0.69,0.641425,0.697083,0.215171,3283,3308,6591
514,5/28/2012,2,1,5,1,1,0.7125,0.6793,0.67625,0.196521,2557,3486,6043
515,5/29/2012,2,1,5,2,1,0.7225,0.672992,0.684583,0.2954,880,4863,5743
516,5/30/2012,2,1,5,3,2,0.656667,0.611129,0.67,0.134329,745,6110,6855
517,5/31/2012,2,1,5,4,1,0.68,0.631329,0.492917,0.195279,1100,6238,7338
518,6/1/2012,2,1,6,5,2,0.654167,0.607962,0.755417,0.237563,533,3594,4127
519,6/2/2012,2,1,6,6,1,0.583333,0.566288,0.549167,0.186562,2795,5325,8120
520,6/3/2012,2,1,6,0,1,0.6025,0.575133,0.493333,0.184087,2494,5147,7641
521,6/4/2012,2,1,6,1,1,0.5975,0.578283,0.487083,0.284833,1071,5927,6998
522,6/5/2012,2,1,6,2,2,0.540833,0.525892,0.613333,0.209575,968,6033,7001
523,6/6/2012,2,1,6,3,1,0.554167,0.542292,0.61125,0.077125,1027,6028,7055
524,6/7/2012,2,1,6,4,1,0.6025,0.569442,0.567083,0.15735,1038,6456,7494
525,6/8/2012,2,1,6,5,1,0.649167,0.597862,0.467917,0.175383,1488,6248,7736
526,6/9/2012,2,1,6,6,1,0.710833,0.648367,0.437083,0.144287,2708,4790,7498
527,6/10/2012,2,1,6,0,1,0.726667,0.663517,0.538333,0.133721,2224,4374,6598
528,6/11/2012,2,1,6,1,2,0.720833,0.659721,0.587917,0.207713,1017,5647,6664
529,6/12/2012,2,1,6,2,2,0.653333,0.597875,0.833333,0.214546,477,4495,4972
530,6/13/2012,2,1,6,3,1,0.655833,0.611117,0.582083,0.343279,1173,6248,7421
531,6/14/2012,2,1,6,4,1,0.648333,0.624383,0.569583,0.253733,1180,6183,7363
532,6/15/2012,2,1,6,5,1,0.639167,0.599754,0.589583,0.176617,1563,6102,7665
533,6/16/2012,2,1,6,6,1,0.631667,0.594708,0.504167,0.166667,2963,4739,7702
534,6/17/2012,2,1,6,0,1,0.5925,0.571975,0.59875,0.144904,2634,4344,6978
535,6/18/2012,2,1,6,1,2,0.568333,0.544842,0.777917,0.174746,653,4446,5099
536,6/19/2012,2,1,6,2,1,0.688333,0.654692,0.69,0.148017,968,5857,6825
537,6/20/2012,2,1,6,3,1,0.7825,0.720975,0.592083,0.113812,872,5339,6211
538,6/21/2012,3,1,6,4,1,0.805833,0.752542,0.567917,0.118787,778,5127,5905
539,6/22/2012,3,1,6,5,1,0.7775,0.724121,0.57375,0.182842,964,4859,5823
540,6/23/2012,3,1,6,6,1,0.731667,0.652792,0.534583,0.179721,2657,4801,7458
541,6/24/2012,3,1,6,0,1,0.743333,0.674254,0.479167,0.145525,2551,4340,6891
542,6/25/2012,3,1,6,1,1,0.715833,0.654042,0.504167,0.300383,1139,5640,6779
543,6/26/2012,3,1,6,2,1,0.630833,0.594704,0.373333,0.347642,1077,6365,7442
544,6/27/2012,3,1,6,3,1,0.6975,0.640792,0.36,0.271775,1077,6258,7335
545,6/28/2012,3,1,6,4,1,0.749167,0.675512,0.4225,0.17165,921,5958,6879
546,6/29/2012,3,1,6,5,1,0.834167,0.786613,0.48875,0.165417,829,4634,5463
547,6/30/2012,3,1,6,6,1,0.765,0.687508,0.60125,0.161071,1455,4232,5687
548,7/1/2012,3,1,7,0,1,0.815833,0.750629,0.51875,0.168529,1421,4110,5531
549,7/2/2012,3,1,7,1,1,0.781667,0.702038,0.447083,0.195267,904,5323,6227
550,7/3/2012,3,1,7,2,1,0.780833,0.70265,0.492083,0.126237,1052,5608,6660
551,7/4/2012,3,1,7,3,1,0.789167,0.732337,0.53875,0.13495,2562,4841,7403
552,7/5/2012,3,1,7,4,1,0.8275,0.761367,0.457917,0.194029,1405,4836,6241
553,7/6/2012,3,1,7,5,1,0.828333,0.752533,0.450833,0.146142,1366,4841,6207
554,7/7/2012,3,1,7,6,1,0.861667,0.804913,0.492083,0.163554,1448,3392,4840
555,7/8/2012,3,1,7,0,1,0.8225,0.790396,0.57375,0.125629,1203,3469,4672
556,7/9/2012,3,1,7,1,2,0.710833,0.654054,0.683333,0.180975,998,5571,6569
557,7/10/2012,3,1,7,2,2,0.720833,0.664796,0.6675,0.151737,954,5336,6290
558,7/11/2012,3,1,7,3,1,0.716667,0.650271,0.633333,0.151733,975,6289,7264
559,7/12/2012,3,1,7,4,1,0.715833,0.654683,0.529583,0.146775,1032,6414,7446
560,7/13/2012,3,1,7,5,2,0.731667,0.667933,0.485833,0.08085,1511,5988,7499
561,7/14/2012,3,1,7,6,2,0.703333,0.666042,0.699167,0.143679,2355,4614,6969
562,7/15/2012,3,1,7,0,1,0.745833,0.705196,0.717917,0.166667,1920,4111,6031
563,7/16/2012,3,1,7,1,1,0.763333,0.724125,0.645,0.164187,1088,5742,6830
564,7/17/2012,3,1,7,2,1,0.818333,0.755683,0.505833,0.114429,921,5865,6786
565,7/18/2012,3,1,7,3,1,0.793333,0.745583,0.577083,0.137442,799,4914,5713
566,7/19/2012,3,1,7,4,1,0.77,0.714642,0.600417,0.165429,888,5703,6591
567,7/20/2012,3,1,7,5,2,0.665833,0.613025,0.844167,0.208967,747,5123,5870
568,7/21/2012,3,1,7,6,3,0.595833,0.549912,0.865417,0.2133,1264,3195,4459
569,7/22/2012,3,1,7,0,2,0.6675,0.623125,0.7625,0.0939208,2544,4866,7410
570,7/23/2012,3,1,7,1,1,0.741667,0.690017,0.694167,0.138683,1135,5831,6966
571,7/24/2012,3,1,7,2,1,0.750833,0.70645,0.655,0.211454,1140,6452,7592
572,7/25/2012,3,1,7,3,1,0.724167,0.654054,0.45,0.1648,1383,6790,8173
573,7/26/2012,3,1,7,4,1,0.776667,0.739263,0.596667,0.284813,1036,5825,6861
574,7/27/2012,3,1,7,5,1,0.781667,0.734217,0.594583,0.152992,1259,5645,6904
575,7/28/2012,3,1,7,6,1,0.755833,0.697604,0.613333,0.15735,2234,4451,6685
576,7/29/2012,3,1,7,0,1,0.721667,0.667933,0.62375,0.170396,2153,4444,6597
577,7/30/2012,3,1,7,1,1,0.730833,0.684987,0.66875,0.153617,1040,6065,7105
578,7/31/2012,3,1,7,2,1,0.713333,0.662896,0.704167,0.165425,968,6248,7216
579,8/1/2012,3,1,8,3,1,0.7175,0.667308,0.6775,0.141179,1074,6506,7580
580,8/2/2012,3,1,8,4,1,0.7525,0.707088,0.659583,0.129354,983,6278,7261
581,8/3/2012,3,1,8,5,2,0.765833,0.722867,0.6425,0.215792,1328,5847,7175
582,8/4/2012,3,1,8,6,1,0.793333,0.751267,0.613333,0.257458,2345,4479,6824
583,8/5/2012,3,1,8,0,1,0.769167,0.731079,0.6525,0.290421,1707,3757,5464
584,8/6/2012,3,1,8,1,2,0.7525,0.710246,0.654167,0.129354,1233,5780,7013
585,8/7/2012,3,1,8,2,2,0.735833,0.697621,0.70375,0.116908,1278,5995,7273
586,8/8/2012,3,1,8,3,2,0.75,0.707717,0.672917,0.1107,1263,6271,7534
587,8/9/2012,3,1,8,4,1,0.755833,0.699508,0.620417,0.1561,1196,6090,7286
588,8/10/2012,3,1,8,5,2,0.715833,0.667942,0.715833,0.238813,1065,4721,5786
589,8/11/2012,3,1,8,6,2,0.6925,0.638267,0.732917,0.206479,2247,4052,6299
590,8/12/2012,3,1,8,0,1,0.700833,0.644579,0.530417,0.122512,2182,4362,6544
591,8/13/2012,3,1,8,1,1,0.720833,0.662254,0.545417,0.136212,1207,5676,6883
592,8/14/2012,3,1,8,2,1,0.726667,0.676779,0.686667,0.169158,1128,5656,6784
593,8/15/2012,3,1,8,3,1,0.706667,0.654037,0.619583,0.169771,1198,6149,7347
594,8/16/2012,3,1,8,4,1,0.719167,0.654688,0.519167,0.141796,1338,6267,7605
595,8/17/2012,3,1,8,5,1,0.723333,0.2424,0.570833,0.231354,1483,5665,7148
596,8/18/2012,3,1,8,6,1,0.678333,0.618071,0.603333,0.177867,2827,5038,7865
597,8/19/2012,3,1,8,0,2,0.635833,0.603554,0.711667,0.08645,1208,3341,4549
598,8/20/2012,3,1,8,1,2,0.635833,0.595967,0.734167,0.129979,1026,5504,6530
599,8/21/2012,3,1,8,2,1,0.649167,0.601025,0.67375,0.0727708,1081,5925,7006
600,8/22/2012,3,1,8,3,1,0.6675,0.621854,0.677083,0.0702833,1094,6281,7375
601,8/23/2012,3,1,8,4,1,0.695833,0.637008,0.635833,0.0845958,1363,6402,7765
602,8/24/2012,3,1,8,5,2,0.7025,0.6471,0.615,0.0721458,1325,6257,7582
603,8/25/2012,3,1,8,6,2,0.661667,0.618696,0.712917,0.244408,1829,4224,6053
604,8/26/2012,3,1,8,0,2,0.653333,0.595996,0.845833,0.228858,1483,3772,5255
605,8/27/2012,3,1,8,1,1,0.703333,0.654688,0.730417,0.128733,989,5928,6917
606,8/28/2012,3,1,8,2,1,0.728333,0.66605,0.62,0.190925,935,6105,7040
607,8/29/2012,3,1,8,3,1,0.685,0.635733,0.552083,0.112562,1177,6520,7697
608,8/30/2012,3,1,8,4,1,0.706667,0.652779,0.590417,0.0771167,1172,6541,7713
609,8/31/2012,3,1,8,5,1,0.764167,0.6894,0.5875,0.168533,1433,5917,7350
610,9/1/2012,3,1,9,6,2,0.753333,0.702654,0.638333,0.113187,2352,3788,6140
611,9/2/2012,3,1,9,0,2,0.696667,0.649,0.815,0.0640708,2613,3197,5810
612,9/3/2012,3,1,9,1,1,0.7075,0.661629,0.790833,0.151121,1965,4069,6034
613,9/4/2012,3,1,9,2,1,0.725833,0.686888,0.755,0.236321,867,5997,6864
614,9/5/2012,3,1,9,3,1,0.736667,0.708983,0.74125,0.187808,832,6280,7112
615,9/6/2012,3,1,9,4,2,0.696667,0.655329,0.810417,0.142421,611,5592,6203
616,9/7/2012,3,1,9,5,1,0.703333,0.657204,0.73625,0.171646,1045,6459,7504
617,9/8/2012,3,1,9,6,2,0.659167,0.611121,0.799167,0.281104,1557,4419,5976
618,9/9/2012,3,1,9,0,1,0.61,0.578925,0.5475,0.224496,2570,5657,8227
619,9/10/2012,3,1,9,1,1,0.583333,0.565654,0.50375,0.258713,1118,6407,7525
620,9/11/2012,3,1,9,2,1,0.5775,0.554292,0.52,0.0920542,1070,6697,7767
621,9/12/2012,3,1,9,3,1,0.599167,0.570075,0.577083,0.131846,1050,6820,7870
622,9/13/2012,3,1,9,4,1,0.6125,0.579558,0.637083,0.0827208,1054,6750,7804
623,9/14/2012,3,1,9,5,1,0.633333,0.594083,0.6725,0.103863,1379,6630,8009
624,9/15/2012,3,1,9,6,1,0.608333,0.585867,0.501667,0.247521,3160,5554,8714
625,9/16/2012,3,1,9,0,1,0.58,0.563125,0.57,0.0901833,2166,5167,7333
626,9/17/2012,3,1,9,1,2,0.580833,0.55305,0.734583,0.151742,1022,5847,6869
627,9/18/2012,3,1,9,2,2,0.623333,0.565067,0.8725,0.357587,371,3702,4073
628,9/19/2012,3,1,9,3,1,0.5525,0.540404,0.536667,0.215175,788,6803,7591
629,9/20/2012,3,1,9,4,1,0.546667,0.532192,0.618333,0.118167,939,6781,7720
630,9/21/2012,3,1,9,5,1,0.599167,0.571971,0.66875,0.154229,1250,6917,8167
631,9/22/2012,3,1,9,6,1,0.65,0.610488,0.646667,0.283583,2512,5883,8395
632,9/23/2012,4,1,9,0,1,0.529167,0.518933,0.467083,0.223258,2454,5453,7907
633,9/24/2012,4,1,9,1,1,0.514167,0.502513,0.492917,0.142404,1001,6435,7436
634,9/25/2012,4,1,9,2,1,0.55,0.544179,0.57,0.236321,845,6693,7538
635,9/26/2012,4,1,9,3,1,0.635,0.596613,0.630833,0.2444,787,6946,7733
636,9/27/2012,4,1,9,4,2,0.65,0.607975,0.690833,0.134342,751,6642,7393
637,9/28/2012,4,1,9,5,2,0.619167,0.585863,0.69,0.164179,1045,6370,7415
638,9/29/2012,4,1,9,6,1,0.5425,0.530296,0.542917,0.227604,2589,5966,8555
639,9/30/2012,4,1,9,0,1,0.526667,0.517663,0.583333,0.134958,2015,4874,6889
640,10/1/2012,4,1,10,1,2,0.520833,0.512,0.649167,0.0908042,763,6015,6778
641,10/2/2012,4,1,10,2,3,0.590833,0.542333,0.871667,0.104475,315,4324,4639
642,10/3/2012,4,1,10,3,2,0.6575,0.599133,0.79375,0.0665458,728,6844,7572
643,10/4/2012,4,1,10,4,2,0.6575,0.607975,0.722917,0.117546,891,6437,7328
644,10/5/2012,4,1,10,5,1,0.615,0.580187,0.6275,0.10635,1516,6640,8156
645,10/6/2012,4,1,10,6,1,0.554167,0.538521,0.664167,0.268025,3031,4934,7965
646,10/7/2012,4,1,10,0,2,0.415833,0.419813,0.708333,0.141162,781,2729,3510
647,10/8/2012,4,1,10,1,2,0.383333,0.387608,0.709583,0.189679,874,4604,5478
648,10/9/2012,4,1,10,2,2,0.446667,0.438112,0.761667,0.1903,601,5791,6392
649,10/10/2012,4,1,10,3,1,0.514167,0.503142,0.630833,0.187821,780,6911,7691
650,10/11/2012,4,1,10,4,1,0.435,0.431167,0.463333,0.181596,834,6736,7570
651,10/12/2012,4,1,10,5,1,0.4375,0.433071,0.539167,0.235092,1060,6222,7282
652,10/13/2012,4,1,10,6,1,0.393333,0.391396,0.494583,0.146142,2252,4857,7109
653,10/14/2012,4,1,10,0,1,0.521667,0.508204,0.640417,0.278612,2080,4559,6639
654,10/15/2012,4,1,10,1,2,0.561667,0.53915,0.7075,0.296037,760,5115,5875
655,10/16/2012,4,1,10,2,1,0.468333,0.460846,0.558333,0.182221,922,6612,7534
656,10/17/2012,4,1,10,3,1,0.455833,0.450108,0.692917,0.101371,979,6482,7461
657,10/18/2012,4,1,10,4,2,0.5225,0.512625,0.728333,0.236937,1008,6501,7509
658,10/19/2012,4,1,10,5,2,0.563333,0.537896,0.815,0.134954,753,4671,5424
659,10/20/2012,4,1,10,6,1,0.484167,0.472842,0.572917,0.117537,2806,5284,8090
660,10/21/2012,4,1,10,0,1,0.464167,0.456429,0.51,0.166054,2132,4692,6824
661,10/22/2012,4,1,10,1,1,0.4875,0.482942,0.568333,0.0814833,830,6228,7058
662,10/23/2012,4,1,10,2,1,0.544167,0.530304,0.641667,0.0945458,841,6625,7466
663,10/24/2012,4,1,10,3,1,0.5875,0.558721,0.63625,0.0727792,795,6898,7693
664,10/25/2012,4,1,10,4,2,0.55,0.529688,0.800417,0.124375,875,6484,7359
665,10/26/2012,4,1,10,5,2,0.545833,0.52275,0.807083,0.132467,1182,6262,7444
666,10/27/2012,4,1,10,6,2,0.53,0.515133,0.72,0.235692,2643,5209,7852
667,10/28/2012,4,1,10,0,2,0.4775,0.467771,0.694583,0.398008,998,3461,4459
668,10/29/2012,4,1,10,1,3,0.44,0.4394,0.88,0.3582,2,20,22
669,10/30/2012,4,1,10,2,2,0.318182,0.309909,0.825455,0.213009,87,1009,1096
670,10/31/2012,4,1,10,3,2,0.3575,0.3611,0.666667,0.166667,419,5147,5566
671,11/1/2012,4,1,11,4,2,0.365833,0.369942,0.581667,0.157346,466,5520,5986
672,11/2/2012,4,1,11,5,1,0.355,0.356042,0.522083,0.266175,618,5229,5847
673,11/3/2012,4,1,11,6,2,0.343333,0.323846,0.49125,0.270529,1029,4109,5138
674,11/4/2012,4,1,11,0,1,0.325833,0.329538,0.532917,0.179108,1201,3906,5107
675,11/5/2012,4,1,11,1,1,0.319167,0.308075,0.494167,0.236325,378,4881,5259
676,11/6/2012,4,1,11,2,1,0.280833,0.281567,0.567083,0.173513,466,5220,5686
677,11/7/2012,4,1,11,3,2,0.295833,0.274621,0.5475,0.304108,326,4709,5035
678,11/8/2012,4,1,11,4,1,0.352174,0.341891,0.333478,0.347835,340,4975,5315
679,11/9/2012,4,1,11,5,1,0.361667,0.355413,0.540833,0.214558,709,5283,5992
680,11/10/2012,4,1,11,6,1,0.389167,0.393937,0.645417,0.0578458,2090,4446,6536
681,11/11/2012,4,1,11,0,1,0.420833,0.421713,0.659167,0.1275,2290,4562,6852
682,11/12/2012,4,1,11,1,1,0.485,0.475383,0.741667,0.173517,1097,5172,6269
683,11/13/2012,4,1,11,2,2,0.343333,0.323225,0.662917,0.342046,327,3767,4094
684,11/14/2012,4,1,11,3,1,0.289167,0.281563,0.552083,0.199625,373,5122,5495
685,11/15/2012,4,1,11,4,2,0.321667,0.324492,0.620417,0.152987,320,5125,5445
686,11/16/2012,4,1,11,5,1,0.345,0.347204,0.524583,0.171025,484,5214,5698
687,11/17/2012,4,1,11,6,1,0.325,0.326383,0.545417,0.179729,1313,4316,5629
688,11/18/2012,4,1,11,0,1,0.3425,0.337746,0.692917,0.227612,922,3747,4669
689,11/19/2012,4,1,11,1,2,0.380833,0.375621,0.623333,0.235067,449,5050,5499
690,11/20/2012,4,1,11,2,2,0.374167,0.380667,0.685,0.082725,534,5100,5634
691,11/21/2012,4,1,11,3,1,0.353333,0.364892,0.61375,0.103246,615,4531,5146
692,11/22/2012,4,1,11,4,1,0.34,0.350371,0.580417,0.0528708,955,1470,2425
693,11/23/2012,4,1,11,5,1,0.368333,0.378779,0.56875,0.148021,1603,2307,3910
694,11/24/2012,4,1,11,6,1,0.278333,0.248742,0.404583,0.376871,532,1745,2277
695,11/25/2012,4,1,11,0,1,0.245833,0.257583,0.468333,0.1505,309,2115,2424
696,11/26/2012,4,1,11,1,1,0.313333,0.339004,0.535417,0.04665,337,4750,5087
697,11/27/2012,4,1,11,2,2,0.291667,0.281558,0.786667,0.237562,123,3836,3959
698,11/28/2012,4,1,11,3,1,0.296667,0.289762,0.50625,0.210821,198,5062,5260
699,11/29/2012,4,1,11,4,1,0.28087,0.298422,0.555652,0.115522,243,5080,5323
700,11/30/2012,4,1,11,5,1,0.298333,0.323867,0.649583,0.0584708,362,5306,5668
701,12/1/2012,4,1,12,6,2,0.298333,0.316904,0.806667,0.0597042,951,4240,5191
702,12/2/2012,4,1,12,0,2,0.3475,0.359208,0.823333,0.124379,892,3757,4649
703,12/3/2012,4,1,12,1,1,0.4525,0.455796,0.7675,0.0827208,555,5679,6234
704,12/4/2012,4,1,12,2,1,0.475833,0.469054,0.73375,0.174129,551,6055,6606
705,12/5/2012,4,1,12,3,1,0.438333,0.428012,0.485,0.324021,331,5398,5729
706,12/6/2012,4,1,12,4,1,0.255833,0.258204,0.50875,0.174754,340,5035,5375
707,12/7/2012,4,1,12,5,2,0.320833,0.321958,0.764167,0.1306,349,4659,5008
708,12/8/2012,4,1,12,6,2,0.381667,0.389508,0.91125,0.101379,1153,4429,5582
709,12/9/2012,4,1,12,0,2,0.384167,0.390146,0.905417,0.157975,441,2787,3228
710,12/10/2012,4,1,12,1,2,0.435833,0.435575,0.925,0.190308,329,4841,5170
711,12/11/2012,4,1,12,2,2,0.353333,0.338363,0.596667,0.296037,282,5219,5501
712,12/12/2012,4,1,12,3,2,0.2975,0.297338,0.538333,0.162937,310,5009,5319
713,12/13/2012,4,1,12,4,1,0.295833,0.294188,0.485833,0.174129,425,5107,5532
714,12/14/2012,4,1,12,5,1,0.281667,0.294192,0.642917,0.131229,429,5182,5611
715,12/15/2012,4,1,12,6,1,0.324167,0.338383,0.650417,0.10635,767,4280,5047
716,12/16/2012,4,1,12,0,2,0.3625,0.369938,0.83875,0.100742,538,3248,3786
717,12/17/2012,4,1,12,1,2,0.393333,0.4015,0.907083,0.0982583,212,4373,4585
718,12/18/2012,4,1,12,2,1,0.410833,0.409708,0.66625,0.221404,433,5124,5557
719,12/19/2012,4,1,12,3,1,0.3325,0.342162,0.625417,0.184092,333,4934,5267
720,12/20/2012,4,1,12,4,2,0.33,0.335217,0.667917,0.132463,314,3814,4128
721,12/21/2012,1,1,12,5,2,0.326667,0.301767,0.556667,0.374383,221,3402,3623
722,12/22/2012,1,1,12,6,1,0.265833,0.236113,0.44125,0.407346,205,1544,1749
723,12/23/2012,1,1,12,0,1,0.245833,0.259471,0.515417,0.133083,408,1379,1787
724,12/24/2012,1,1,12,1,2,0.231304,0.2589,0.791304,0.0772304,174,746,920
725,12/25/2012,1,1,12,2,2,0.291304,0.294465,0.734783,0.168726,440,573,1013
726,12/26/2012,1,1,12,3,3,0.243333,0.220333,0.823333,0.316546,9,432,441
727,12/27/2012,1,1,12,4,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
728,12/28/2012,1,1,12,5,2,0.253333,0.255046,0.59,0.155471,644,2451,3095
729,12/29/2012,1,1,12,6,2,0.253333,0.2424,0.752917,0.124383,159,1182,1341
730,12/30/2012,1,1,12,0,1,0.255833,0.2317,0.483333,0.350754,364,1432,1796
731,12/31/2012,1,1,12,1,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729
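
The rows above complete the daily bike-share file in this commit range: each record carries a date plus normalized weather fields (temp, atemp, hum, windspeed) and a casual/registered rider split that sums to cnt. A minimal sketch, not part of the repository, for loading and sanity-checking the file with pandas; the local file name day.csv is an assumption:

```python
# Minimal sketch: load the daily bike-share file shown above, assuming it was
# saved locally as "day.csv" (hypothetical path).
import pandas as pd

df = pd.read_csv("day.csv", parse_dates=["date"])

# Total riders should equal the casual + registered split on every row.
assert (df["cnt"] == df["casual"] + df["registered"]).all()

# Weather fields are stored normalized to roughly [0, 1].
print(df[["temp", "atemp", "hum", "windspeed"]].describe())
print(df.shape)  # expect (731, 14): two years of daily records, 2011-2012
```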


@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -28,16 +35,16 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"In this example, we show how AutoML can be used for energy demand forecasting.\n", "In this example, we show how AutoML can be used to forecast a single time-series in the energy demand application area. \n",
"\n", "\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
"In this notebook you would see\n", "Notebook synopsis:\n",
"1. Creating an Experiment in an existing Workspace\n", "1. Creating an Experiment in an existing Workspace\n",
"2. Instantiating AutoMLConfig with new task type \"forecasting\" for timeseries data training, and other timeseries related settings: for this dataset we use the basic one: \"time_column_name\" \n", "2. Configuration and local run of AutoML for a simple time-series model\n",
"3. Training the Model using local compute\n", "3. View engineered features and prediction results\n",
"4. Exploring the results\n", "4. Configuration and local run of AutoML for a time-series model with lag and rolling window features\n",
"5. Testing the fitted model" "5. Estimate feature importance"
] ]
}, },
{ {
@@ -58,10 +65,10 @@
"import numpy as np\n", "import numpy as np\n",
"import logging\n", "import logging\n",
"import warnings\n", "import warnings\n",
"\n",
"# Squash warning messages for cleaner output in the notebook\n", "# Squash warning messages for cleaner output in the notebook\n",
"warnings.showwarning = lambda *args, **kwargs: None\n", "warnings.showwarning = lambda *args, **kwargs: None\n",
"\n", "\n",
"\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n", "from azureml.train.automl import AutoMLConfig\n",
@@ -73,7 +80,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments." "As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
] ]
}, },
{ {
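The cell that actually creates the Experiment falls outside this hunk. For orientation, a minimal sketch consistent with the SDK calls used elsewhere in this diff (the experiment name here is a hypothetical placeholder, not the notebook's):

    from azureml.core.workspace import Workspace
    from azureml.core.experiment import Experiment

    # Load the workspace saved by the configuration notebook
    ws = Workspace.from_config()
    # 'automl-energy-demand' is a hypothetical name chosen for illustration
    experiment = Experiment(ws, 'automl-energy-demand')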
@@ -109,7 +116,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Data\n", "## Data\n",
"Read energy demanding data from file, and preview data." "We will use energy consumption data from New York City for model training. The data is stored in a tabular format and includes energy demand and basic weather data at an hourly frequency. Pandas CSV reader is used to read the file into memory. Special attention is given to the \"timeStamp\" column in the data since it contains text which should be parsed as datetime-type objects. "
] ]
}, },
{ {
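The data-loading cell itself is outside this hunk; a hedged sketch of the read step described above (the file name is an assumption for illustration):

    import pandas as pd

    # Parse the 'timeStamp' column into datetime objects while reading;
    # 'nyc_energy.csv' is an assumed file name.
    data = pd.read_csv('nyc_energy.csv', parse_dates=['timeStamp'])
    data.head()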
@@ -126,8 +133,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Split the data to train and test\n", "We must now define the schema of this dataset. Every time-series must have a time column and a target. The target quantity is what will be eventually forecasted by a trained model. In this case, the target is the \"demand\" column. The other columns, \"temp\" and \"precip,\" are implicitly designated as features."
"\n"
] ]
}, },
{ {
@@ -136,34 +142,23 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"train = data[data['timeStamp'] < '2017-02-01']\n", "# Dataset schema\n",
"test = data[data['timeStamp'] >= '2017-02-01']\n" "time_column_name = 'timeStamp'\n",
"target_column_name = 'demand'"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Prepare the test data, we will feed X_test to the fitted model and get prediction" "### Forecast Horizon\n",
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_test = test.pop('demand').values\n",
"X_test = test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split the train data to train and valid\n",
"\n", "\n",
"Use one month's data as valid data\n" "In addition to the data schema, we must also specify the forecast horizon. A forecast horizon is a time span into the future (or just beyond the latest date in the training data) where forecasts of the target quantity are needed. Choosing a forecast horizon is application specific, but a rule-of-thumb is that **the horizon should be the time-frame where you need actionable decisions based on the forecast.** The horizon usually has a strong relationship with the frequency of the time-series data, that is, the sampling interval of the target quantity and the features. For instance, the NYC energy demand data has an hourly frequency. A decision that requires a demand forecast to the hour is unlikely to be made weeks or months in advance, particularly if we expect weather to be a strong determinant of demand. We may have fairly accurate meteorological forecasts of the hourly temperature and precipitation on a the time-scale of a day or two, however.\n",
"\n",
"Given the above discussion, we generally recommend that users set forecast horizons to less than 100 time periods (i.e. less than 100 hours in the NYC energy example). Furthermore, **AutoML's memory use and computation time increase in proportion to the length of the horizon**, so the user should consider carefully how they set this value. If a long horizon forecast really is necessary, it may be good practice to aggregate the series to a coarser time scale. \n",
"\n",
"\n",
"Forecast horizons in AutoML are given as integer multiples of the time-series frequency. In this example, we set the horizon to 48 hours."
] ]
}, },
{ {
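To make the advice about aggregating to a coarser time scale concrete, a small pandas sketch (illustration only; the notebook keeps the hourly frequency):

    # Aggregate the hourly series to daily means, so a two-day-ahead forecast
    # needs max_horizon=2 instead of 48. Assumes 'data', time_column_name and
    # target_column_name as defined above.
    daily = (data.set_index(time_column_name)[target_column_name]
                 .resample('D')
                 .mean()
                 .reset_index())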
@@ -172,14 +167,34 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"X_train = train[train['timeStamp'] < '2017-01-01']\n", "max_horizon = 48"
"X_valid = train[train['timeStamp'] >= '2017-01-01']\n", ]
"y_train = X_train.pop('demand').values\n", },
"y_valid = X_valid.pop('demand').values\n", {
"print(X_train.shape)\n", "cell_type": "markdown",
"print(y_train.shape)\n", "metadata": {},
"print(X_valid.shape)\n", "source": [
"print(y_valid.shape)" "### Split the data into train and test sets\n",
"We now split the data into a train and a test set so that we may evaluate model performance. We note that the tail of the dataset contains a large number of NA values in the target column, so we designate the test set as the 48 hour window ending on the latest date of known energy demand. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Find time point to split on\n",
"latest_known_time = data[~pd.isnull(data[target_column_name])][time_column_name].max()\n",
"split_time = latest_known_time - pd.Timedelta(hours=max_horizon)\n",
"\n",
"# Split into train/test sets\n",
"X_train = data[data[time_column_name] <= split_time]\n",
"X_test = data[(data[time_column_name] > split_time) & (data[time_column_name] <= latest_known_time)]\n",
"\n",
"# Move the target values into their own arrays \n",
"y_train = X_train.pop(target_column_name).values\n",
"y_test = X_test.pop(target_column_name).values"
] ]
}, },
{ {
@@ -188,7 +203,7 @@
"source": [ "source": [
"## Train\n", "## Train\n",
"\n", "\n",
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n", "We now instantiate an AutoMLConfig object. This config defines the settings and data used to run the experiment. For forecasting tasks, we must provide extra configuration related to the time-series data schema and forecasting context. Here, only the name of the time column and the maximum forecast horizon are needed. Other settings are described below:\n",
"\n", "\n",
"|Property|Description|\n", "|Property|Description|\n",
"|-|-|\n", "|-|-|\n",
@@ -198,8 +213,7 @@
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n", "|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n", "|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**X_valid**|Data used to evaluate a model in a iteration. (sparse) array-like, shape = [n_samples, n_features]|\n", "|**n_cross_validations**|Number of cross validation splits. Rolling Origin Validation is used to split time-series in a temporally consistent way.|\n",
"|**y_valid**|Data used to evaluate a model in a iteration. (sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder. " "|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder. "
] ]
}, },
@@ -209,31 +223,31 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"time_column_name = 'timeStamp'\n", "time_series_settings = {\n",
"automl_settings = {\n", " 'time_column_name': time_column_name,\n",
" \"time_column_name\": time_column_name,\n", " 'max_horizon': max_horizon\n",
"}\n", "}\n",
"\n", "\n",
"\n",
"automl_config = AutoMLConfig(task='forecasting',\n", "automl_config = AutoMLConfig(task='forecasting',\n",
" debug_log='automl_nyc_energy_errors.log',\n", " debug_log='automl_nyc_energy_errors.log',\n",
" primary_metric='normalized_root_mean_squared_error',\n", " primary_metric='normalized_root_mean_squared_error',\n",
" blacklist_models = ['ExtremeRandomTrees'],\n",
" iterations=10,\n", " iterations=10,\n",
" iteration_timeout_minutes=5,\n", " iteration_timeout_minutes=5,\n",
" X=X_train,\n", " X=X_train,\n",
" y=y_train,\n", " y=y_train,\n",
" X_valid = X_valid,\n", " n_cross_validations=3,\n",
" y_valid = y_valid,\n",
" path=project_folder,\n", " path=project_folder,\n",
" verbosity = logging.INFO,\n", " verbosity = logging.INFO,\n",
" **automl_settings)" " **time_series_settings)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"You can call the submit method on the experiment object and pass the run configuration. For Local runs the execution is synchronous. Depending on the data and number of iterations this can run for while.\n", "Submitting the configuration will start a new run in this experiment. For local runs, the execution is synchronous. Depending on the data and number of iterations, this can run for a while. Parameters controlling concurrency may speed up the process, depending on your hardware.\n",
"\n",
"You will see the currently running iterations printing to the console." "You will see the currently running iterations printing to the console."
] ]
}, },
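The submission cell itself is outside this hunk; a sketch consistent with the calls shown later in this diff:

    # Start the local AutoML run; for local compute, execution is synchronous.
    local_run = experiment.submit(automl_config, show_output=True)
    # Once the iterations finish, retrieve the best run and fitted pipeline.
    best_run, fitted_model = local_run.get_output()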
@@ -273,13 +287,34 @@
"fitted_model.steps" "fitted_model.steps"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View the engineered names for featurized data\n",
"Below we display the engineered feature names generated for the featurized data using the time-series featurization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['timeseriestransformer'].get_engineered_feature_names()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Test the Best Fitted Model\n", "### Test the Best Fitted Model\n",
"\n", "\n",
"Predict on training and test set, and calculate residual values." "For forecasting, we will use the `forecast` function instead of the `predict` function. There are two reasons for this.\n",
"\n",
"We need to pass the recent values of the target variable `y`, whereas the scikit-compatible `predict` function only takes the non-target variables `X`. In our case, the test data immediately follows the training data, and we fill the `y` variable with `NaN`. The `NaN` serves as a question mark for the forecaster to fill with the actuals. Using the forecast function will produce forecasts using the shortest possible forecast horizon. The last time at which a definite (non-NaN) value is seen is the _forecast origin_ - the last time when the value of the target is known. \n",
"\n",
"Using the `predict` method would result in getting predictions for EVERY horizon the forecaster can predict at. This is useful when training and evaluating the performance of the forecaster at various horizons, but the level of detail is excessive for normal use."
] ]
}, },
{ {
@@ -288,15 +323,64 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"y_pred = fitted_model.predict(X_test)\n", "# Replace ALL values in y_pred by NaN. \n",
"y_pred" "# The forecast origin will be at the beginning of the first forecast period\n",
"# (which is the same time as the end of the last training period).\n",
"y_query = y_test.copy().astype(np.float)\n",
"y_query.fill(np.nan)\n",
"# The featurized data, aligned to y, will also be returned.\n",
"# This contains the assumptions that were made in the forecast\n",
"# and helps align the forecast to the original data\n",
"y_fcst, X_trans = fitted_model.forecast(X_test, y_query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# limit the evaluation to data where y_test has actuals\n",
"def align_outputs(y_predicted, X_trans, X_test, y_test, predicted_column_name = 'predicted'):\n",
" \"\"\"\n",
" Demonstrates how to get the output aligned to the inputs\n",
" using pandas indexes. Helps understand what happened if\n",
" the output's shape differs from the input shape, or if\n",
" the data got re-sorted by time and grain during forecasting.\n",
" \n",
" Typical causes of misalignment are:\n",
" * we predicted some periods that were missing in actuals -> drop from eval\n",
" * model was asked to predict past max_horizon -> increase max horizon\n",
" * data at start of X_test was needed for lags -> provide previous periods\n",
" \"\"\"\n",
" df_fcst = pd.DataFrame({predicted_column_name : y_predicted})\n",
" # y and X outputs are aligned by forecast() function contract\n",
" df_fcst.index = X_trans.index\n",
" \n",
" # align original X_test to y_test \n",
" X_test_full = X_test.copy()\n",
" X_test_full[target_column_name] = y_test\n",
"\n",
" # X_test_full's does not include origin, so reset for merge\n",
" df_fcst.reset_index(inplace=True)\n",
" X_test_full = X_test_full.reset_index().drop(columns='index')\n",
" together = df_fcst.merge(X_test_full, how='right')\n",
" \n",
" # drop rows where prediction or actuals are nan \n",
" # happens because of missing actuals \n",
" # or at edges of time due to lags/rolling windows\n",
" clean = together[together[[target_column_name, predicted_column_name]].notnull().all(axis=1)]\n",
" return(clean)\n",
"\n",
"df_all = align_outputs(y_fcst, X_trans, X_test, y_test)\n",
"df_all.head()"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Use the Check Data Function to remove the nan values from y_test to avoid error when calculate metrics " "Looking at `X_trans` is also useful to see what featurization happened to the data."
] ]
}, },
{ {
@@ -305,29 +389,15 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"if len(y_test) != len(y_pred):\n", "X_trans"
" raise ValueError(\n",
" 'the true values and prediction values do not have equal length.')\n",
"elif len(y_test) == 0:\n",
" raise ValueError(\n",
" 'y_true and y_pred are empty.')\n",
"\n",
"# if there is any non-numeric element in the y_true or y_pred,\n",
"# the ValueError exception will be thrown.\n",
"y_test_f = np.array(y_test).astype(float)\n",
"y_pred_f = np.array(y_pred).astype(float)\n",
"\n",
"# remove entries both in y_true and y_pred where at least\n",
"# one element in y_true or y_pred is missing\n",
"y_test = y_test_f[~(np.isnan(y_test_f) | np.isnan(y_pred_f))]\n",
"y_pred = y_pred_f[~(np.isnan(y_test_f) | np.isnan(y_pred_f))]"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Calculate metrics for the prediction\n" "### Calculate accuracy metrics\n",
"Finally, we calculate some accuracy metrics for the forecast and plot the predictions vs. the actuals over the time range in the test set."
] ]
}, },
{ {
@@ -336,26 +406,189 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"print(\"[Test Data] \\nRoot Mean squared error: %.2f\" % np.sqrt(mean_squared_error(y_test, y_pred)))\n", "def MAPE(actual, pred):\n",
"# Explained variance score: 1 is perfect prediction\n", " \"\"\"\n",
"print('mean_absolute_error score: %.2f' % mean_absolute_error(y_test, y_pred))\n", " Calculate mean absolute percentage error.\n",
"print('R2 score: %.2f' % r2_score(y_test, y_pred))\n", " Remove NA and values where actual is close to zero\n",
"\n", " \"\"\"\n",
"\n", " not_na = ~(np.isnan(actual) | np.isnan(pred))\n",
" not_zero = ~np.isclose(actual, 0.0)\n",
" actual_safe = actual[not_na & not_zero]\n",
" pred_safe = pred[not_na & not_zero]\n",
" APE = 100*np.abs((actual_safe - pred_safe)/actual_safe)\n",
" return np.mean(APE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Simple forecasting model\")\n",
"rmse = np.sqrt(mean_squared_error(df_all[target_column_name], df_all['predicted']))\n",
"print(\"[Test Data] \\nRoot Mean squared error: %.2f\" % rmse)\n",
"mae = mean_absolute_error(df_all[target_column_name], df_all['predicted'])\n",
"print('mean_absolute_error score: %.2f' % mae)\n",
"print('MAPE: %.2f' % MAPE(df_all[target_column_name], df_all['predicted']))\n",
"\n", "\n",
"# Plot outputs\n", "# Plot outputs\n",
"%matplotlib notebook\n", "%matplotlib inline\n",
"test_pred = plt.scatter(y_test, y_pred, color='b')\n", "pred, = plt.plot(df_all[time_column_name], df_all['predicted'], color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n", "actual, = plt.plot(df_all[time_column_name], df_all[target_column_name], color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n", "plt.xticks(fontsize=8)\n",
"plt.legend((pred, actual), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.title('Prediction vs. Actual Time-Series')\n",
"\n",
"plt.show()" "plt.show()"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The distribution looks a little heavy tailed: we underestimate the excursions of the extremes. A normal-quantile transform of the target might help, but let's first try using some past data with the lags and rolling window transforms.\n"
]
},
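A hedged sketch of what the suggested normal-quantile transform of the target could look like, using scikit-learn's QuantileTransformer (the notebook does not actually apply this step):

    from sklearn.preprocessing import QuantileTransformer

    # Map the target to an approximately normal distribution before training...
    qt = QuantileTransformer(output_distribution='normal')
    y_train_nq = qt.fit_transform(y_train.reshape(-1, 1)).ravel()
    # ...then invert the transform on the resulting forecasts, e.g.
    # y_fcst = qt.inverse_transform(y_fcst_nq.reshape(-1, 1)).ravel()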
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using lags and rolling window features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We did not use lags in the previous model specification. In effect, the prediction was the result of a simple regression on date, grain and any additional features. This is often a very good prediction as common time series patterns like seasonality and trends can be captured in this manner. Such simple regression is horizon-less: it doesn't matter how far into the future we are predicting, because we are not using past data. In the previous example, the horizon was only used to split the data for cross-validation.\n",
"\n",
"Now that we configured target lags, that is the previous values of the target variables, and the prediction is no longer horizon-less. We therefore must still specify the `max_horizon` that the model will learn to forecast. The `target_lags` keyword specifies how far back we will construct the lags of the target variable, and the `target_rolling_window_size` specifies the size of the rolling window over which we will generate the `max`, `min` and `sum` features."
]
},
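For intuition, a standalone sketch of what a lag-12 feature and a 4-period rolling window look like when built by hand with pandas; AutoML generates the equivalents internally, so this is illustration only:

    import pandas as pd

    rng = pd.date_range('2017-01-01', periods=24, freq='H')
    demand = pd.Series(range(24), index=rng)

    lag_12 = demand.shift(12)                    # value 12 hours earlier
    window = demand.shift(1).rolling(window=4)   # window over the prior 4 hours
    roll_max, roll_min, roll_sum = window.max(), window.min(), window.sum()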
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"time_series_settings_with_lags = {\n",
" 'time_column_name': time_column_name,\n",
" 'max_horizon': max_horizon,\n",
" 'target_lags': 12,\n",
" 'target_rolling_window_size': 4\n",
"}\n",
"\n",
"automl_config_lags = AutoMLConfig(task='forecasting',\n",
" debug_log='automl_nyc_energy_errors.log',\n",
" primary_metric='normalized_root_mean_squared_error',\n",
" blacklist_models=['ElasticNet','ExtremeRandomTrees','GradientBoosting'],\n",
" iterations=10,\n",
" iteration_timeout_minutes=10,\n",
" X=X_train,\n",
" y=y_train,\n",
" n_cross_validations=3,\n",
" path=project_folder,\n",
" verbosity=logging.INFO,\n",
" **time_series_settings_with_lags)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now start a new local run, this time with lag and rolling window featurization. AutoML applies featurizations in the setup stage, prior to iterating over ML models. The full training set is featurized first, followed by featurization of each of the CV splits. Lag and rolling window features introduce additional complexity, so the run will take longer than in the previous example that lacked these featurizations."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"local_run_lags = experiment.submit(automl_config_lags, show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run_lags, fitted_model_lags = local_run_lags.get_output()\n",
"y_fcst_lags, X_trans_lags = fitted_model_lags.forecast(X_test, y_query)\n",
"df_lags = align_outputs(y_fcst_lags, X_trans_lags, X_test, y_test)\n",
"df_lags.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_trans_lags"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Forecasting model with lags\")\n",
"rmse = np.sqrt(mean_squared_error(df_lags[target_column_name], df_lags['predicted']))\n",
"print(\"[Test Data] \\nRoot Mean squared error: %.2f\" % rmse)\n",
"mae = mean_absolute_error(df_lags[target_column_name], df_lags['predicted'])\n",
"print('mean_absolute_error score: %.2f' % mae)\n",
"print('MAPE: %.2f' % MAPE(df_lags[target_column_name], df_lags['predicted']))\n",
"\n",
"# Plot outputs\n",
"%matplotlib inline\n",
"pred, = plt.plot(df_lags[time_column_name], df_lags['predicted'], color='b')\n",
"actual, = plt.plot(df_lags[time_column_name], df_lags[target_column_name], color='g')\n",
"plt.xticks(fontsize=8)\n",
"plt.legend((pred, actual), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What features matter for the forecast?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.automl.automlexplainer import explain_model\n",
"\n",
"# feature names are everything in the transformed data except the target\n",
"features = X_trans_lags.columns[:-1]\n",
"expl = explain_model(fitted_model_lags, X_train.copy(), X_test.copy(), features=features, best_run=best_run_lags, y_train=y_train)\n",
"# unpack the tuple\n",
"shap_values, expected_values, feat_overall_imp, feat_names, per_class_summary, per_class_imp = expl\n",
"best_run_lags"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Please go to the Azure Portal's best run to see the top features chart.\n",
"\n",
"The informative features make all sorts of intuitive sense. Temperature is a strong driver of heating and cooling demand in NYC. Apart from that, the daily life cycle, expressed by `hour`, and the weekly cycle, expressed by `wday` drives people's energy use habits."
]
} }
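Besides the Portal chart, the tuple unpacked in the cell above can be inspected directly; a small sketch, assuming feat_overall_imp holds importance values sorted in descending order and feat_names the matching feature names:

    # Print the ten most important engineered features
    for name, importance in zip(feat_names[:10], feat_overall_imp[:10]):
        print('{}: {}'.format(name, importance))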
], ],
"metadata": { "metadata": {
"authors": [ "authors": [
{ {
"name": "xiaga" "name": "erwright"
} }
], ],
"kernelspec": { "kernelspec": {


@@ -0,0 +1,10 @@
name: auto-ml-forecasting-energy-demand
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- statsmodels
- azureml-explain-model


@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -20,7 +27,9 @@
"1. [Introduction](#Introduction)\n", "1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n", "1. [Setup](#Setup)\n",
"1. [Data](#Data)\n", "1. [Data](#Data)\n",
"1. [Train](#Train)" "1. [Train](#Train)\n",
"1. [Predict](#Predict)\n",
"1. [Operationalize](#Operationalize)"
] ]
}, },
{ {
@@ -28,16 +37,10 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"In this example, we use AutoML to find and tune a time-series forecasting model.\n", "In this example, we use AutoML to train, select, and operationalize a time-series forecasting model for multiple time-series.\n",
"\n", "\n",
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
"In this notebook, you will:\n",
"1. Create an Experiment in an existing Workspace\n",
"2. Instantiate an AutoMLConfig \n",
"3. Find and train a forecasting model using local compute\n",
"4. Evaluate the performance of the model\n",
"\n",
"The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area." "The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area."
] ]
}, },
@@ -59,10 +62,10 @@
"import numpy as np\n", "import numpy as np\n",
"import logging\n", "import logging\n",
"import warnings\n", "import warnings\n",
"\n",
"# Squash warning messages for cleaner output in the notebook\n", "# Squash warning messages for cleaner output in the notebook\n",
"warnings.showwarning = lambda *args, **kwargs: None\n", "warnings.showwarning = lambda *args, **kwargs: None\n",
"\n", "\n",
"\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n", "from azureml.train.automl import AutoMLConfig\n",
@@ -73,7 +76,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment is a named object in a Workspace which represents a predictive task, the output of which is a trained model and a set of evaluation metrics for the model. " "As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem. "
] ]
}, },
{ {
@@ -85,9 +88,9 @@
"ws = Workspace.from_config()\n", "ws = Workspace.from_config()\n",
"\n", "\n",
"# choose a name for the run history container in the workspace\n", "# choose a name for the run history container in the workspace\n",
"experiment_name = 'automl-ojsalesforecasting'\n", "experiment_name = 'automl-ojforecasting'\n",
"# project folder\n", "# project folder\n",
"project_folder = './sample_projects/automl-local-ojsalesforecasting'\n", "project_folder = './sample_projects/automl-local-ojforecasting'\n",
"\n", "\n",
"experiment = Experiment(ws, experiment_name)\n", "experiment = Experiment(ws, experiment_name)\n",
"\n", "\n",
@@ -227,7 +230,7 @@
"\n", "\n",
"For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time, the grain column names, and the maximum forecast horizon. A time column is required for forecasting, while the grain is optional. If a grain is not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n", "For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time, the grain column names, and the maximum forecast horizon. A time column is required for forecasting, while the grain is optional. If a grain is not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n",
"\n", "\n",
"The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up-to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning organizaion that needs to estimate the next month of sales would set the horizon accordingly. \n", "The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up-to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning organizaion that needs to estimate the next month of sales would set the horizon accordingly. Please see the [energy_demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) for more discussion of forecast horizon.\n",
"\n", "\n",
"Finally, a note about the cross-validation (CV) procedure for time-series data. AutoML uses out-of-sample error estimates to select a best pipeline/model, so it is important that the CV fold splitting is done correctly. Time-series can violate the basic statistical assumptions of the canonical K-Fold CV strategy, so AutoML implements a [rolling origin validation](https://robjhyndman.com/hyndsight/tscv/) procedure to create CV folds for time-series data. To use this procedure, you just need to specify the desired number of CV folds in the AutoMLConfig object. It is also possible to bypass CV and use your own validation set by setting the *X_valid* and *y_valid* parameters of AutoMLConfig.\n", "Finally, a note about the cross-validation (CV) procedure for time-series data. AutoML uses out-of-sample error estimates to select a best pipeline/model, so it is important that the CV fold splitting is done correctly. Time-series can violate the basic statistical assumptions of the canonical K-Fold CV strategy, so AutoML implements a [rolling origin validation](https://robjhyndman.com/hyndsight/tscv/) procedure to create CV folds for time-series data. To use this procedure, you just need to specify the desired number of CV folds in the AutoMLConfig object. It is also possible to bypass CV and use your own validation set by setting the *X_valid* and *y_valid* parameters of AutoMLConfig.\n",
"\n", "\n",
@@ -241,7 +244,8 @@
"|**X**|Training matrix of features as a pandas DataFrame, shape = [n_training_samples, n_features]|\n", "|**X**|Training matrix of features as a pandas DataFrame, shape = [n_training_samples, n_features]|\n",
"|**y**|Target values as a numpy.ndarray, shape = [n_training_samples, ]|\n", "|**y**|Target values as a numpy.ndarray, shape = [n_training_samples, ]|\n",
"|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection|\n", "|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection|\n",
"|**enable_ensembling**|Allow AutoML to create ensembles of the best performing models\n", "|**enable_voting_ensemble**|Allow AutoML to create a Voting ensemble of the best performing models\n",
"|**enable_stack_ensemble**|Allow AutoML to create a Stack ensemble of the best performing models\n",
"|**debug_log**|Log file path for writing debugging information\n", "|**debug_log**|Log file path for writing debugging information\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n", "|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"|**time_column_name**|Name of the datetime column in the input data|\n", "|**time_column_name**|Name of the datetime column in the input data|\n",
@@ -265,12 +269,13 @@
"\n", "\n",
"automl_config = AutoMLConfig(task='forecasting',\n", "automl_config = AutoMLConfig(task='forecasting',\n",
" debug_log='automl_oj_sales_errors.log',\n", " debug_log='automl_oj_sales_errors.log',\n",
" primary_metric='normalized_root_mean_squared_error',\n", " primary_metric='normalized_mean_absolute_error',\n",
" iterations=10,\n", " iterations=10,\n",
" X=X_train,\n", " X=X_train,\n",
" y=y_train,\n", " y=y_train,\n",
" n_cross_validations=5,\n", " n_cross_validations=3,\n",
" enable_ensembling=False,\n", " enable_voting_ensemble=False,\n",
" enable_stack_ensemble=False,\n",
" path=project_folder,\n", " path=project_folder,\n",
" verbosity=logging.INFO,\n", " verbosity=logging.INFO,\n",
" **time_series_settings)" " **time_series_settings)"
@@ -293,15 +298,6 @@
"local_run = experiment.submit(automl_config, show_output=True)" "local_run = experiment.submit(automl_config, show_output=True)"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"local_run"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -324,7 +320,8 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Make Predictions from the Best Fitted Model\n", "# Forecasting\n",
"\n",
"Now that we have retrieved the best pipeline/model, it can be used to make predictions on test data. First, we remove the target values from the test set:" "Now that we have retrieved the best pipeline/model, it can be used to make predictions on test data. First, we remove the target values from the test set:"
] ]
}, },
@@ -352,7 +349,7 @@
"source": [ "source": [
"To produce predictions on the test set, we need to know the feature values at all dates in the test set. This requirement is somewhat reasonable for the OJ sales data since the features mainly consist of price, which is usually set in advance, and customer demographics which are approximately constant for each store over the 20 week forecast horizon in the testing data. \n", "To produce predictions on the test set, we need to know the feature values at all dates in the test set. This requirement is somewhat reasonable for the OJ sales data since the features mainly consist of price, which is usually set in advance, and customer demographics which are approximately constant for each store over the 20 week forecast horizon in the testing data. \n",
"\n", "\n",
"The target predictions can be retrieved by calling the `predict` method on the best model:" "We will first create a query `y_query`, which is aligned index-for-index to `X_test`. This is a vector of target values where each `NaN` serves the function of the question mark to be replaced by forecast. Passing definite values in the `y` argument allows the `forecast` function to make predictions on data that does not immediately follow the train data which contains `y`. In each grain, the last time point where the model sees a definite value of `y` is that grain's _forecast origin_."
] ]
}, },
{ {
@@ -361,15 +358,76 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"y_pred = fitted_pipeline.predict(X_test)" "# Replace ALL values in y_pred by NaN.\n",
"# The forecast origin will be at the beginning of the first forecast period.\n",
"# (Which is the same time as the end of the last training period.)\n",
"y_query = y_test.copy().astype(np.float)\n",
"y_query.fill(np.nan)\n",
"# The featurized data, aligned to y, will also be returned.\n",
"# This contains the assumptions that were made in the forecast\n",
"# and helps align the forecast to the original data\n",
"y_pred, X_trans = fitted_pipeline.forecast(X_test, y_query)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Calculate evaluation metrics for the prediction\n", "If you are used to scikit pipelines, perhaps you expected `predict(X_test)`. However, forecasting requires a more general interface that also supplies the past target `y` values. Please use `forecast(X,y)` as `predict(X)` is reserved for internal purposes on forecasting models.\n",
"To evaluate the accuracy of the forecast, we'll compare against the actual sales quantities for some select metrics, included the mean absolute percentage error (MAPE)." "\n",
"The [energy demand forecasting notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) demonstrates the use of the forecast function in more detail in the context of using lags and rolling window features. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Evaluate\n",
"\n",
"To evaluate the accuracy of the forecast, we'll compare against the actual sales quantities for some select metrics, included the mean absolute percentage error (MAPE). \n",
"\n",
"It is a good practice to always align the output explicitly to the input, as the count and order of the rows may have changed during transformations that span multiple rows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def align_outputs(y_predicted, X_trans, X_test, y_test, predicted_column_name = 'predicted'):\n",
" \"\"\"\n",
" Demonstrates how to get the output aligned to the inputs\n",
" using pandas indexes. Helps understand what happened if\n",
" the output's shape differs from the input shape, or if\n",
" the data got re-sorted by time and grain during forecasting.\n",
" \n",
" Typical causes of misalignment are:\n",
" * we predicted some periods that were missing in actuals -> drop from eval\n",
" * model was asked to predict past max_horizon -> increase max horizon\n",
" * data at start of X_test was needed for lags -> provide previous periods in y\n",
" \"\"\"\n",
" \n",
" df_fcst = pd.DataFrame({predicted_column_name : y_predicted})\n",
" # y and X outputs are aligned by forecast() function contract\n",
" df_fcst.index = X_trans.index\n",
" \n",
" # align original X_test to y_test \n",
" X_test_full = X_test.copy()\n",
" X_test_full[target_column_name] = y_test\n",
"\n",
" # X_test_full's index does not include origin, so reset for merge\n",
" df_fcst.reset_index(inplace=True)\n",
" X_test_full = X_test_full.reset_index().drop(columns='index')\n",
" together = df_fcst.merge(X_test_full, how='right')\n",
" \n",
" # drop rows where prediction or actuals are nan \n",
" # happens because of missing actuals \n",
" # or at edges of time due to lags/rolling windows\n",
" clean = together[together[[target_column_name, predicted_column_name]].notnull().all(axis=1)]\n",
" return(clean)\n",
"\n",
"df_all = align_outputs(y_pred, X_trans, X_test, y_test)"
] ]
}, },
{ {
@@ -388,11 +446,385 @@
" actual_safe = actual[not_na & not_zero]\n", " actual_safe = actual[not_na & not_zero]\n",
" pred_safe = pred[not_na & not_zero]\n", " pred_safe = pred[not_na & not_zero]\n",
" APE = 100*np.abs((actual_safe - pred_safe)/actual_safe)\n", " APE = 100*np.abs((actual_safe - pred_safe)/actual_safe)\n",
" return np.mean(APE)\n", " return np.mean(APE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Simple forecasting model\")\n",
"rmse = np.sqrt(mean_squared_error(df_all[target_column_name], df_all['predicted']))\n",
"print(\"[Test Data] \\nRoot Mean squared error: %.2f\" % rmse)\n",
"mae = mean_absolute_error(df_all[target_column_name], df_all['predicted'])\n",
"print('mean_absolute_error score: %.2f' % mae)\n",
"print('MAPE: %.2f' % MAPE(df_all[target_column_name], df_all['predicted']))\n",
"\n", "\n",
"print(\"[Test Data] \\nRoot Mean squared error: %.2f\" % np.sqrt(mean_squared_error(y_test, y_pred)))\n", "# Plot outputs\n",
"print('mean_absolute_error score: %.2f' % mean_absolute_error(y_test, y_pred))\n", "import matplotlib.pyplot as plt\n",
"print('MAPE: %.2f' % MAPE(y_test, y_pred))" "\n",
"%matplotlib inline\n",
"test_pred = plt.scatter(df_all[target_column_name], df_all['predicted'], color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Operationalize"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"_Operationalization_ means getting the model into the cloud so that other can run it after you close the notebook. We will create a docker running on Azure Container Instances with the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML OJ forecaster'\n",
"tags = None\n",
"model = local_run.register_model(description = description, tags = tags)\n",
"\n",
"print(local_run.model_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Develop the scoring script\n",
"\n",
"Serializing and deserializing complex data frames may be tricky. We first develop the `run()` function of the scoring script locally, then write it into a scoring script. It is much easier to debug any quirks of the scoring function without crossing two compute environments. For this exercise, we handle a common quirk of how pandas dataframes serialize time stamp values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# this is where we test the run function of the scoring script interactively\n",
"# before putting it in the scoring script\n",
"\n",
"timestamp_columns = ['WeekStarting']\n",
"\n",
"def run(rawdata, test_model = None):\n",
" \"\"\"\n",
" Intended to process 'rawdata' string produced by\n",
" \n",
" {'X': X_test.to_json(), y' : y_test.to_json()}\n",
" \n",
" Don't convert the X payload to numpy.array, use it as pandas.DataFrame\n",
" \"\"\"\n",
" try:\n",
" # unpack the data frame with timestamp \n",
" rawobj = json.loads(rawdata) # rawobj is now a dict of strings \n",
" X_pred = pd.read_json(rawobj['X'], convert_dates=False) # load the pandas DF from a json string\n",
" for col in timestamp_columns: # fix timestamps\n",
" X_pred[col] = pd.to_datetime(X_pred[col], unit='ms') \n",
" \n",
" y_pred = np.array(rawobj['y']) # reconstitute numpy array from serialized list\n",
" \n",
" if test_model is None:\n",
" result = model.forecast(X_pred, y_pred) # use the global model from init function\n",
" else:\n",
" result = test_model.forecast(X_pred, y_pred) # use the model on which we are testing\n",
" \n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" \n",
" forecast_as_list = result[0].tolist()\n",
" index_as_df = result[1].index.to_frame().reset_index(drop=True)\n",
" \n",
" return json.dumps({\"forecast\": forecast_as_list, # return the minimum over the wire: \n",
" \"index\": index_as_df.to_json() # no forecast and its featurized values\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# test the run function here before putting in the scoring script\n",
"import json\n",
"\n",
"test_sample = json.dumps({'X': X_test.to_json(), 'y' : y_query.tolist()})\n",
"response = run(test_sample, fitted_pipeline)\n",
"\n",
"# unpack the response, dealing with the timestamp serialization again\n",
"res_dict = json.loads(response)\n",
"y_fcst_all = pd.read_json(res_dict['index'])\n",
"y_fcst_all[time_column_name] = pd.to_datetime(y_fcst_all[time_column_name], unit = 'ms')\n",
"y_fcst_all['forecast'] = res_dict['forecast']\n",
"y_fcst_all.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that the function works locally in the notebook, let's write it down into the scoring script. The scoring script is authored by the data scientist. Adjust it to taste, adding inputs, outputs and processing as needed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score_fcast.py\n",
"import pickle\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"timestamp_columns = ['WeekStarting']\n",
"\n",
"def run(rawdata, test_model = None):\n",
" \"\"\"\n",
" Intended to process 'rawdata' string produced by\n",
" \n",
" {'X': X_test.to_json(), y' : y_test.to_json()}\n",
" \n",
" Don't convert the X payload to numpy.array, use it as pandas.DataFrame\n",
" \"\"\"\n",
" try:\n",
" # unpack the data frame with timestamp \n",
" rawobj = json.loads(rawdata) # rawobj is now a dict of strings \n",
" X_pred = pd.read_json(rawobj['X'], convert_dates=False) # load the pandas DF from a json string\n",
" for col in timestamp_columns: # fix timestamps\n",
" X_pred[col] = pd.to_datetime(X_pred[col], unit='ms') \n",
" \n",
" y_pred = np.array(rawobj['y']) # reconstitute numpy array from serialized list\n",
" \n",
" if test_model is None:\n",
" result = model.forecast(X_pred, y_pred) # use the global model from init function\n",
" else:\n",
" result = test_model.forecast(X_pred, y_pred) # use the model on which we are testing\n",
" \n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" \n",
" # prepare to send over wire as json\n",
" forecast_as_list = result[0].tolist()\n",
" index_as_df = result[1].index.to_frame().reset_index(drop=True)\n",
" \n",
" return json.dumps({\"forecast\": forecast_as_list, # return the minimum over the wire: \n",
" \"index\": index_as_df.to_json() # no forecast and its featurized values\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the model\n",
"from azureml.train.automl.run import AutoMLRun\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"ml_run = AutoMLRun(experiment = experiment, run_id = local_run.id)\n",
"best_iteration = int(str.split(best_run.id,'_')[-1]) # the iteration number is a postfix of the run ID."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the best model's dependencies and write them into this file\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"conda_env_file_name = 'fcast_env.yml'\n",
"\n",
"dependencies = ml_run.get_run_sdk_dependencies(iteration = best_iteration)\n",
"for p in ['azureml-train-automl', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))\n",
"\n",
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'], pip_packages=['azureml-train-automl'])\n",
"\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# this is the script file name we wrote a few cells above\n",
"script_file_name = 'score_fcast.py'\n",
"\n",
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-train-automl']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', local_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'type': \"automl-forecasting\"},\n",
" description = \"Image for automl forecasting sample\")\n",
"\n",
"image = Image.create(name = \"automl-fcast-image\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 2, \n",
" tags = {'type': \"automl-forecasting\"},\n",
" description = \"Automl forecasting sample service\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-forecast-01'\n",
"print(aci_service_name)\n",
"\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Call the service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# we send the data to the service serialized into a json string\n",
"test_sample = json.dumps({'X':X_test.to_json(), 'y' : y_query.tolist()})\n",
"response = aci_service.run(input_data = test_sample)\n",
"\n",
"# translate from networkese to datascientese\n",
"try: \n",
" res_dict = json.loads(response)\n",
" y_fcst_all = pd.read_json(res_dict['index'])\n",
" y_fcst_all[time_column_name] = pd.to_datetime(y_fcst_all[time_column_name], unit = 'ms')\n",
" y_fcst_all['forecast'] = res_dict['forecast'] \n",
"except:\n",
" print(res_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_fcst_all.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete the web service if desired"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"serv = Webservice(ws, 'automl-forecast-01')\n",
"# serv.delete() # don't do it accidentally"
] ]
} }
], ],


@@ -0,0 +1,9 @@
name: auto-ml-forecasting-orange-juice-sales
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- statsmodels


@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/missing-data-blacklist-early-termination/auto-ml-missing-data-blacklist-early-termination.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -37,8 +44,9 @@
"In this notebook you will learn how to:\n", "In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n", "1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Configure AutoML using `AutoMLConfig`.\n", "2. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model.\n", "3. Train the model.\n",
"5. Explore the results.\n", "4. Explore the results.\n",
"5. Viewing the engineered names for featurized data and featurization summary for all raw features.\n",
"6. Test the best fitted model.\n", "6. Test the best fitted model.\n",
"\n", "\n",
"In addition this notebook showcases the following features\n", "In addition this notebook showcases the following features\n",
@@ -316,6 +324,48 @@
"# best_run, fitted_model = local_run.get_output(iteration = iteration)" "# best_run, fitted_model = local_run.get_output(iteration = iteration)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the engineered names for featurized data\n",
"Below we display the engineered feature names generated for the featurized data using the preprocessing featurization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['datatransformer'].get_engineered_feature_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the featurization summary\n",
"Below we display the featurization that was performed on different raw features in the user data. For each raw feature in the user data, the following information is displayed:-\n",
"- Raw feature name\n",
"- Number of engineered features formed out of this raw feature\n",
"- Type detected\n",
"- If feature was dropped\n",
"- List of feature transformations for the raw feature"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the featurization summary as a list of JSON\n",
"featurization_summary = fitted_model.named_steps['datatransformer'].get_featurization_summary()\n",
"# View the featurization summary as a pandas dataframe\n",
"pd.DataFrame.from_records(featurization_summary)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

View File

@@ -0,0 +1,8 @@
name: auto-ml-missing-data-blacklist-early-termination
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -254,7 +261,9 @@
"3.\toverall_summary: The model level feature importance values sorted in descending order\n", "3.\toverall_summary: The model level feature importance values sorted in descending order\n",
"4.\toverall_imp: The feature names sorted in the same order as in overall_summary\n", "4.\toverall_imp: The feature names sorted in the same order as in overall_summary\n",
"5.\tper_class_summary: The class level feature importance values sorted in descending order. Only available for the classification case\n", "5.\tper_class_summary: The class level feature importance values sorted in descending order. Only available for the classification case\n",
"6.\tper_class_imp: The feature names sorted in the same order as in per_class_summary. Only available for the classification case" "6.\tper_class_imp: The feature names sorted in the same order as in per_class_summary. Only available for the classification case\n",
"\n",
"Note:- The **retrieve_model_explanation()** API only works in case AutoML has been configured with **'model_explainability'** flag set to **True**. "
] ]
}, },
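{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of setting that flag, assuming only the **'model_explainability'** parameter named in the note above; the remaining arguments are illustrative placeholders."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical configuration sketch: model_explainability must be True\n",
"# for retrieve_model_explanation() to work; other arguments are placeholders.\n",
"# automl_config = AutoMLConfig(task = 'classification',\n",
"#                              model_explainability = True,\n",
"#                              X = X_train,\n",
"#                              y = y_train,\n",
"#                              **automl_settings)"
]
},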
{ {
@@ -305,7 +314,7 @@
"from azureml.train.automl.automlexplainer import explain_model\n", "from azureml.train.automl.automlexplainer import explain_model\n",
"\n", "\n",
"shap_values, expected_values, overall_summary, overall_imp, per_class_summary, per_class_imp = \\\n", "shap_values, expected_values, overall_summary, overall_imp, per_class_summary, per_class_imp = \\\n",
" explain_model(fitted_model, X_train, X_test)" " explain_model(fitted_model, X_train, X_test, features=features)"
] ]
}, },
{ {

View File

@@ -0,0 +1,9 @@
name: auto-ml-model-explanation
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- azureml-explain-model

View File

@@ -0,0 +1,796 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Regression with Deployment using Hardware Performance Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the Predicting Compressive Strength of Concrete Dataset to showcase how you can use AutoML for a regression problem. The regression goal is to predict the compressive strength of concrete based off of different ingredient combinations and the quantities of those ingredients.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Test the best fitted model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"As part of the setup you have already created an Azure ML Workspace object. For AutoML you will need to create an Experiment object, which is a named object in a Workspace used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
" \n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the experiment and specify the project folder.\n",
"experiment_name = 'automl-regression-concrete'\n",
"project_folder = './sample_projects/automl-regression-concrete'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
"print('Checking cluster status...')\n",
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
"compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
"# For a more detailed view of current AmlCompute status, use get_status()."
]
},
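{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, the more detailed status view mentioned in the comment above (get_status() is named there; serialize() is assumed here for readable output):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the detailed AmlCompute status as a dictionary.\n",
"print(compute_target.get_status().serialize())"
]
},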
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data script to be utilized in azure compute. To do this, first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"\n",
"cd = CondaDependencies.create(conda_packages=['numpy', 'py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here create the script to be run in azure compute for loading the data, load the concrete strength dataset into the X and y variables. Next, split the data using random_split and return X_train and y_train for training the model. Finally, return X_train and y_train for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\"\n",
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
"X = dataset.drop_columns(columns=['CONCRETE'])\n",
"y = dataset.keep_columns(columns=['CONCRETE'], validate=True)\n",
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
"y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
"dataset.take(5).to_pandas_dataframe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics: <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### If you would like to see even better results increase \"iteration_time_out minutes\" to 10+ mins and increase \"iterations\" to a minimum of 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'spearman_correlation',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'regression',\n",
" debug_log = 'automl.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"Widget for Monitoring Runs\n",
"The widget will first report a \u00e2\u20ac\u0153loading status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve the Best Model\n",
"Below we select the best pipeline from our iterations. The get_output method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on get_output allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Best Model Based on Any Other Metric\n",
"Show the run and the model that has the smallest root_mean_squared_error value (which turned out to be the same as the one with largest spearman_correlation value):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"root_mean_squared_error\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration = iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register the Fitted Model for Deployment\n",
"If neither metric nor iteration are specified in the register_model call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
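{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, a sketch of the overloads mentioned above: registering the model from a specific iteration, or by a different logged metric (the values shown are illustrative)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Register from a specific iteration, or by a different logged metric.\n",
"# Uncomment to use; the iteration number and metric name are illustrative.\n",
"# model = remote_run.register_model(iteration = 3, description = description, tags = tags)\n",
"# model = remote_run.register_model(metric = 'root_mean_squared_error', description = description, tags = tags)"
]
},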
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost==0.80'], pip_packages=['azureml-train-automl'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
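{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, inspect the environment file that was just written, before the version substitution below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the generated conda environment file.\n",
"with open(conda_env_file_name, 'r') as f:\n",
"    print(f.read())"
]
},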
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-train-automl']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"},\n",
" description = \"Image for automl regression sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"}, \n",
" description = 'sample service for Automl Regression')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-concrete'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
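{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Call the Web Service\n",
"\n",
"A minimal sketch of invoking the deployed service, assuming the payload shape expected by the run() function in score.py above (a JSON object with a 'data' key holding a list of feature rows)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Serialize two sample rows from the test set into the JSON payload\n",
"# format that run() in score.py expects.\n",
"sample_rows = X_test.take(2).to_pandas_dataframe().values.tolist()\n",
"sample_payload = json.dumps({'data': sample_rows})\n",
"\n",
"response = aci_service.run(input_data = sample_payload)\n",
"print(response)"
]
},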
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test\n",
"\n",
"Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()\n",
"y_test = np.array(y_test)\n",
"y_test = y_test[:,0]\n",
"X_train = X_train.to_pandas_dataframe()\n",
"y_train = y_train.to_pandas_dataframe()\n",
"y_train = np.array(y_train)\n",
"y_train = y_train[:,0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Predict on training and test set, and calculate residual values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = fitted_model.predict(X_train)\n",
"y_residual_train = y_train - y_pred_train\n",
"\n",
"y_pred_test = fitted_model.predict(X_test)\n",
"y_residual_test = y_test - y_pred_test\n",
"\n",
"y_residual_train.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Set up a multi-plot chart.\n",
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n",
"f.suptitle('Regression Residual Values', fontsize = 18)\n",
"f.set_figheight(6)\n",
"f.set_figwidth(16)\n",
"\n",
"# Plot residual values of training set.\n",
"a0.axis([0, 360, -200, 200])\n",
"a0.plot(y_residual_train, 'bo', alpha = 0.5)\n",
"a0.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n",
"a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)), fontsize = 12)\n",
"a0.set_xlabel('Training samples', fontsize = 12)\n",
"a0.set_ylabel('Residual Values', fontsize = 12)\n",
"\n",
"# Plot a histogram.\n",
"#a0.hist(y_residual_train, orientation = 'horizontal', color = ['b']*len(y_residual_train), bins = 10, histtype = 'step')\n",
"#a0.hist(y_residual_train, orientation = 'horizontal', color = ['b']*len(y_residual_train), alpha = 0.2, bins = 10)\n",
"\n",
"# Plot residual values of test set.\n",
"a1.axis([0, 90, -200, 200])\n",
"a1.plot(y_residual_test, 'bo', alpha = 0.5)\n",
"a1.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n",
"a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)), fontsize = 12)\n",
"a1.set_xlabel('Test samples', fontsize = 12)\n",
"a1.set_yticklabels([])\n",
"\n",
"# Plot a histogram.\n",
"#a1.hist(y_residual_test, orientation = 'horizontal', color = ['b']*len(y_residual_test), bins = 10, histtype = 'step')\n",
"#a1.hist(y_residual_test, orientation = 'horizontal', color = ['b']*len(y_residual_test), alpha = 0.2, bins = 10)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot outputs\n",
"%matplotlib notebook\n",
"test_pred = plt.scatter(y_test, y_pred_test, color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements\n",
"\n",
"This Predicting Compressive Strength of Concrete Dataset is made available under the CC0 1.0 Universal (CC0 1.0)\n",
"Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/. Any rights in individual contents of the database are licensed under the CC0 1.0 Universal (CC0 1.0)\n",
"Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/ . The dataset itself can be found here: https://www.kaggle.com/pavanraj159/concrete-compressive-strength-data-set and http://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength\n",
"\n",
"I-Cheng Yeh, \"Modeling of strength of high performance concrete using artificial neural networks,\" Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998). \n",
"\n",
"Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,10 @@
name: auto-ml-regression-concrete-strength
dependencies:
- pip:
- azureml-sdk
- azureml-defaults
- azureml-explain-model
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,798 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Regression with Deployment using Hardware Performance Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the Hardware Performance Dataset to showcase how you can use AutoML for a simple regression problem. The Regression goal is to predict the performance of certain combinations of hardware parts.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Test the best fitted model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"As part of the setup you have already created an Azure ML Workspace object. For AutoML you will need to create an Experiment object, which is a named object in a Workspace used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
" \n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the experiment and specify the project folder.\n",
"experiment_name = 'automl-regression-hardware'\n",
"project_folder = './sample_projects/automl-remote-regression'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
"print('Checking cluster status...')\n",
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
"compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
"# For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data script to be utilized in azure compute. To do this, first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"\n",
"cd = CondaDependencies.create(conda_packages=['numpy', 'py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here create the script to be run in azure compute for loading the data, load the hardware dataset into the X and y variables. Next split the data using random_split and return X_train and y_train for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\"\n",
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
"X = dataset.drop_columns(columns=['ERP'])\n",
"y = dataset.keep_columns(columns=['ERP'], validate=True)\n",
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
"y_train, y_test = y.random_split(percentage=0.8, seed=223)\n",
"dataset.take(5).to_pandas_dataframe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Train\n",
"\n",
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics: <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### If you would like to see even better results increase \"iteration_time_out minutes\" to 10+ mins and increase \"iterations\" to a minimum of 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'spearman_correlation',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'regression',\n",
" debug_log = 'automl_errors_20190417.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve the Best Model\n",
"Below we select the best pipeline from our iterations. The get_output method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on get_output allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model that has the smallest `root_mean_squared_error` value (which turned out to be the same as the one with largest `spearman_correlation` value):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"root_mean_squared_error\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration = iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register the Fitted Model for Deployment\n",
"If neither metric nor iteration are specified in the register_model call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost==0.80'], pip_packages=['azureml-train-automl'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-train-automl']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"},\n",
" description = \"Image for automl regression sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"}, \n",
" description = 'sample service for Automl Regression')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-hardware'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()\n",
"y_test = np.array(y_test)\n",
"y_test = y_test[:,0]\n",
"X_train = X_train.to_pandas_dataframe()\n",
"y_train = y_train.to_pandas_dataframe()\n",
"y_train = np.array(y_train)\n",
"y_train = y_train[:,0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Predict on training and test set, and calculate residual values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = fitted_model.predict(X_train)\n",
"y_residual_train = y_train - y_pred_train\n",
"\n",
"y_pred_test = fitted_model.predict(X_test)\n",
"y_residual_test = y_test - y_pred_test"
]
},
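{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick numeric check before plotting, compute the same RMSE and R2 values that annotate the chart below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Summarize fit quality on both splits.\n",
"print('Train RMSE: {0:.2f}, R2: {1:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train)), r2_score(y_train, y_pred_train)))\n",
"print('Test RMSE: {0:.2f}, R2: {1:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test)), r2_score(y_test, y_pred_test)))"
]
},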
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Set up a multi-plot chart.\n",
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n",
"f.suptitle('Regression Residual Values', fontsize = 18)\n",
"f.set_figheight(6)\n",
"f.set_figwidth(16)\n",
"\n",
"# Plot residual values of training set.\n",
"a0.axis([0, 360, -200, 200])\n",
"a0.plot(y_residual_train, 'bo', alpha = 0.5)\n",
"a0.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n",
"a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)),fontsize = 12)\n",
"a0.set_xlabel('Training samples', fontsize = 12)\n",
"a0.set_ylabel('Residual Values', fontsize = 12)\n",
"\n",
"# Plot residual values of test set.\n",
"a1.axis([0, 90, -200, 200])\n",
"a1.plot(y_residual_test, 'bo', alpha = 0.5)\n",
"a1.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n",
"a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)),fontsize = 12)\n",
"a1.set_xlabel('Test samples', fontsize = 12)\n",
"a1.set_yticklabels([])\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib notebook\n",
"test_pred = plt.scatter(y_test, y_pred_test, color='')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements\n",
"This Predicting Hardware Performance Dataset is made available under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/. Any rights in individual contents of the database are licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/ . The dataset itself can be found here: https://www.kaggle.com/faizunnabi/comp-hardware-performance and https://archive.ics.uci.edu/ml/datasets/Computer+Hardware\n",
"\n",
"_**Citation Found Here**_\n"
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,10 @@
name: auto-ml-regression-hardware-performance
dependencies:
- pip:
- azureml-sdk
- azureml-defaults
- azureml-explain-model
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

View File

@@ -0,0 +1,9 @@
name: auto-ml-regression
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- paramiko<2.5.0

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-amlcompute/auto-ml-remote-amlcompute.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -21,8 +28,7 @@
"1. [Setup](#Setup)\n", "1. [Setup](#Setup)\n",
"1. [Data](#Data)\n", "1. [Data](#Data)\n",
"1. [Train](#Train)\n", "1. [Train](#Train)\n",
"1. [Results](#Results)\n", "1. [Results](#Results)\n"
"1. [Test](#Test)"
] ]
}, },
{ {
@@ -30,7 +36,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n", "In this example we use the scikit-learn's [iris dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html) to showcase how you can use AutoML for a simple classification problem.\n",
"\n", "\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
@@ -38,9 +44,9 @@
"1. Create an `Experiment` in an existing `Workspace`.\n", "1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Create or Attach existing AmlCompute to a workspace.\n", "2. Create or Attach existing AmlCompute to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n", "3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using AmlCompute\n", "4. Train the model using AmlCompute with ONNX compatible config on.\n",
"5. Explore the results.\n", "5. Explore the results and save the ONNX model.\n",
"6. Test the best fitted model.\n", "6. Inference with the ONNX model.\n",
"\n", "\n",
"In addition this notebook showcases the following features\n", "In addition this notebook showcases the following features\n",
"- **Parallel** executions for iterations\n", "- **Parallel** executions for iterations\n",
@@ -67,16 +73,15 @@
"source": [ "source": [
"import logging\n", "import logging\n",
"import os\n", "import os\n",
"import csv\n",
"\n", "\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"from sklearn import datasets\n", "from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
"\n", "\n",
"import azureml.core\n", "import azureml.core\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig" "from azureml.train.automl import AutoMLConfig"
] ]
}, },
@@ -89,7 +94,7 @@
"ws = Workspace.from_config()\n", "ws = Workspace.from_config()\n",
"\n", "\n",
"# Choose a name for the run history container in the workspace.\n", "# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-amlcompute'\n", "experiment_name = 'automl-remote-amlcompute-with-onnx'\n",
"project_folder = './project'\n", "project_folder = './project'\n",
"\n", "\n",
"experiment = Experiment(ws, experiment_name)\n", "experiment = Experiment(ws, experiment_name)\n",
@@ -129,7 +134,7 @@
"from azureml.core.compute import ComputeTarget\n", "from azureml.core.compute import ComputeTarget\n",
"\n", "\n",
"# Choose a name for your cluster.\n", "# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n", "amlcompute_cluster_name = \"automlc2\"\n",
"\n", "\n",
"found = False\n", "found = False\n",
"# Check if this compute target already exists in the workspace.\n", "# Check if this compute target already exists in the workspace.\n",
@@ -145,9 +150,10 @@
" #vm_priority = 'lowpriority', # optional\n", " #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n", " max_nodes = 6)\n",
"\n", "\n",
" # Create the cluster.\n", " # Create the cluster.\\n\",\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n", " compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
"\n", "\n",
"print('Checking cluster status...')\n",
"# Can poll for a minimum number of nodes and for a specific timeout.\n", "# Can poll for a minimum number of nodes and for a specific timeout.\n",
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n", "# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
"compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n", "compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
@@ -155,13 +161,6 @@
"# For a more detailed view of current AmlCompute status, use get_status()." "# For a more detailed view of current AmlCompute status, use get_status()."
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -169,7 +168,7 @@
"## Data\n", "## Data\n",
"For remote executions, you need to make the data accessible from the remote compute.\n", "For remote executions, you need to make the data accessible from the remote compute.\n",
"This can be done by uploading the data to DataStore.\n", "This can be done by uploading the data to DataStore.\n",
"In this example, we upload scikit-learn's [load_digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) data." "In this example, we upload scikit-learn's [load_iris](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html) data."
] ]
}, },
{ {
@@ -178,7 +177,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"data_train = datasets.load_digits()\n", "iris = datasets.load_iris()\n",
"\n", "\n",
"if not os.path.isdir('data'):\n", "if not os.path.isdir('data'):\n",
" os.mkdir('data')\n", " os.mkdir('data')\n",
@@ -186,18 +185,37 @@
"if not os.path.exists(project_folder):\n", "if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)\n", " os.makedirs(project_folder)\n",
"\n", "\n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n", "X_train, X_test, y_train, y_test = train_test_split(iris.data, \n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")\n", " iris.target, \n",
" test_size=0.2, \n",
" random_state=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ensure the x_train and x_test are pandas DataFrame."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the X_train and X_test to pandas DataFrame and set column names,\n",
"# This is needed for initializing the input variable names of ONNX model, \n",
"# and the prediction with the ONNX model using the inference helper.\n",
"X_train = pd.DataFrame(X_train, columns=['c1', 'c2', 'c3', 'c4'])\n",
"X_test = pd.DataFrame(X_test, columns=['c1', 'c2', 'c3', 'c4'])\n",
"y_train = pd.DataFrame(y_train, columns=['label'])\n",
"\n",
"X_train.to_csv(\"data/X_train.csv\", index=False)\n",
"y_train.to_csv(\"data/y_train.csv\", index=False)\n",
"\n", "\n",
"ds = ws.get_default_datastore()\n", "ds = ws.get_default_datastore()\n",
"ds.upload(src_dir='./data', target_path='bai_data', overwrite=True, show_progress=True)\n", "ds.upload(src_dir='./data', target_path='irisdata', overwrite=True, show_progress=True)"
"\n",
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='bai_data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
] ]
}, },
{ {
@@ -208,6 +226,7 @@
"source": [ "source": [
"from azureml.core.runconfig import RunConfiguration\n", "from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n", "from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n", "\n",
"# create a new RunConfig object\n", "# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n", "conda_run_config = RunConfiguration(framework=\"python\")\n",
@@ -215,30 +234,28 @@
"# Set compute target to AmlCompute\n", "# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n", "conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n", "conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n", "\n",
"# set the data reference of the run coonfiguration\n", "cd = CondaDependencies.create(conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy'])\n",
"conda_run_config.environment.python.conda_dependencies = cd" "conda_run_config.environment.python.conda_dependencies = cd"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating a TabularDataset\n",
"\n",
"Defined X and y as `TabularDataset`s, which are passed to automated machine learning in the AutoMLConfig."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%writefile $project_folder/get_data.py\n", "X = Dataset.Tabular.from_delimited_files(path=ds.path('irisdata/X_train.csv'))\n",
"\n", "y = Dataset.Tabular.from_delimited_files(path=ds.path('irisdata/y_train.csv'))"
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }\n"
] ]
}, },
{ {
@@ -249,6 +266,8 @@
"\n", "\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n", "You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n", "\n",
"**Note:** Set the parameter enable_onnx_compatible_models=True, if you also want to generate the ONNX compatible models. Please note, the forecasting task and TensorFlow models are not ONNX compatible yet.\n",
"\n",
"**Note:** When using AmlCompute, you can't pass Numpy arrays directly to the fit method.\n", "**Note:** When using AmlCompute, you can't pass Numpy arrays directly to the fit method.\n",
"\n", "\n",
"|Property|Description|\n", "|Property|Description|\n",
@@ -257,7 +276,15 @@
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n", "|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n", "|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n", "|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM.|" "|**max_concurrent_iterations**|Maximum number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM.|\n",
"|**enable_onnx_compatible_models**|Enable the ONNX compatible models in the experiment.|"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set the preprocess=True, currently the InferenceHelper only supports this mode."
] ]
}, },
{ {
@@ -267,11 +294,11 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"automl_settings = {\n", "automl_settings = {\n",
" \"iteration_timeout_minutes\": 2,\n", " \"iteration_timeout_minutes\": 10,\n",
" \"iterations\": 20,\n", " \"iterations\": 10,\n",
" \"n_cross_validations\": 5,\n", " \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n", " \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": False,\n", " \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n", " \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO\n", " \"verbosity\": logging.INFO\n",
"}\n", "}\n",
@@ -280,9 +307,11 @@
" debug_log = 'automl_errors.log',\n", " debug_log = 'automl_errors.log',\n",
" path = project_folder,\n", " path = project_folder,\n",
" run_configuration=conda_run_config,\n", " run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n", " X = X,\n",
" y = y,\n",
" enable_onnx_compatible_models=True, # This will generate ONNX compatible models.\n",
" **automl_settings\n", " **automl_settings\n",
" )\n" " )"
] ]
}, },
{ {
@@ -370,32 +399,6 @@
"remote_run.wait_for_completion(show_output = True)" "remote_run.wait_for_completion(show_output = True)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -422,69 +425,11 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Retrieve the Best Model\n", "### Retrieve the Best ONNX Model\n",
"\n", "\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." "Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*.\n",
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `log_loss` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"log_loss\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration\n",
"Show the run and the model from the third iteration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration=iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n", "\n",
"#### Load Test Data" "Set the parameter return_onnx_model=True to retrieve the best ONNX model, instead of the Python model."
] ]
}, },
{ {
@@ -493,17 +438,14 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"digits = datasets.load_digits()\n", "best_run, onnx_mdl = remote_run.get_output(return_onnx_model=True)"
"X_test = digits.data[:10, :]\n",
"y_test = digits.target[:10]\n",
"images = digits.images[:10]"
] ]
}, },
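{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, a minimal sketch of the `get_output` overloads described above, combined with `return_onnx_model`. Passing these parameters together is an assumption based on the documented overloads for the Python model; verify against your SDK version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical combination of the metric overload with return_onnx_model.\n",
"lookup_metric = \"log_loss\"\n",
"metric_run, metric_onnx_mdl = remote_run.get_output(metric=lookup_metric, return_onnx_model=True)\n",
"\n",
"# ONNX model from a specific iteration.\n",
"iteration = 3\n",
"third_run, third_onnx_mdl = remote_run.get_output(iteration=iteration, return_onnx_model=True)"
]
},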
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Testing Our Best Fitted Model" "### Save the best ONNX model"
] ]
}, },
{ {
@@ -512,18 +454,69 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Randomly select digits and test.\n", "from azureml.automl.core.onnx_convert import OnnxConverter\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n", "onnx_fl_path = \"./best_model.onnx\"\n",
" print(index)\n", "OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)"
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(1, figsize=(3,3))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
] ]
},
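{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, the saved file can be reloaded and validated. A minimal sketch, assuming the `onnx` package is installed (it is not a dependency of this notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reload the saved model and check that its graph is well formed.\n",
"import onnx\n",
"\n",
"reloaded_mdl = onnx.load(onnx_fl_path)\n",
"onnx.checker.check_model(reloaded_mdl)  # raises if the model is malformed\n",
"print(len(reloaded_mdl.graph.node), \"nodes\")"
]
},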
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict with the ONNX model, using onnxruntime package"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import json\n",
"from azureml.automl.core.onnx_convert import OnnxConvertConstants\n",
"from azureml.train.automl import constants\n",
"\n",
"if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:\n",
" python_version_compatible = True\n",
"else:\n",
" python_version_compatible = False\n",
"\n",
"try:\n",
" import onnxruntime\n",
" from azureml.automl.core.onnx_convert import OnnxInferenceHelper \n",
" onnxrt_present = True\n",
"except ImportError:\n",
" onnxrt_present = False\n",
"\n",
"def get_onnx_res(run):\n",
" res_path = 'onnx_resource.json'\n",
" run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)\n",
" with open(res_path) as f:\n",
" return json.load(f)\n",
"\n",
"if onnxrt_present and python_version_compatible: \n",
" mdl_bytes = onnx_mdl.SerializeToString()\n",
" onnx_res = get_onnx_res(best_run)\n",
"\n",
" onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_res)\n",
" pred_onnx, pred_prob_onnx = onnxrt_helper.predict(X_test)\n",
"\n",
" print(pred_onnx)\n",
" print(pred_prob_onnx)\n",
"else:\n",
" if not python_version_compatible:\n",
" print('Please use Python version 3.6 or 3.7 to run the inference helper.') \n",
" if not onnxrt_present:\n",
" print('Please install the onnxruntime package to do the prediction with ONNX model.')"
]
},
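{
"cell_type": "markdown",
"metadata": {},
"source": [
"For comparison, a minimal sketch of scoring with `onnxruntime` directly instead of `OnnxInferenceHelper`. The per-column input names and float32 dtype are assumptions about the generated model; verify them via `sess.get_inputs()` before relying on this."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical direct onnxruntime scoring, bypassing the inference helper.\n",
"import numpy as np\n",
"\n",
"if onnxrt_present and python_version_compatible:\n",
"    sess = onnxruntime.InferenceSession(onnx_mdl.SerializeToString())\n",
"    # Assume one model input per DataFrame column, in column order.\n",
"    feed = {inp.name: X_test[[col]].values.astype(np.float32)\n",
"            for inp, col in zip(sess.get_inputs(), X_test.columns)}\n",
"    outputs = sess.run(None, feed)\n",
"    print(outputs[0])  # first output: predicted labels"
]
},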
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

View File

@@ -0,0 +1,11 @@
name: auto-ml-remote-amlcompute-with-onnx
dependencies:
- pip:
- azureml-sdk
- azureml-defaults
- azureml-explain-model
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- onnxruntime

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-amlcompute/auto-ml-remote-amlcompute.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -67,7 +74,6 @@
"source": [ "source": [
"import logging\n", "import logging\n",
"import os\n", "import os\n",
"import csv\n",
"\n", "\n",
"from matplotlib import pyplot as plt\n", "from matplotlib import pyplot as plt\n",
"import numpy as np\n", "import numpy as np\n",
@@ -77,6 +83,7 @@
"import azureml.core\n", "import azureml.core\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.dataset import Dataset\n",
"from azureml.train.automl import AutoMLConfig" "from azureml.train.automl import AutoMLConfig"
] ]
}, },
@@ -129,7 +136,7 @@
"from azureml.core.compute import ComputeTarget\n", "from azureml.core.compute import ComputeTarget\n",
"\n", "\n",
"# Choose a name for your cluster.\n", "# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n", "amlcompute_cluster_name = \"automlc2\"\n",
"\n", "\n",
"found = False\n", "found = False\n",
"# Check if this compute target already exists in the workspace.\n", "# Check if this compute target already exists in the workspace.\n",
@@ -145,9 +152,10 @@
" #vm_priority = 'lowpriority', # optional\n", " #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n", " max_nodes = 6)\n",
"\n", "\n",
" # Create the cluster.\n", " # Create the cluster.\\n\",\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n", " compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
"\n", "\n",
"print('Checking cluster status...')\n",
"# Can poll for a minimum number of nodes and for a specific timeout.\n", "# Can poll for a minimum number of nodes and for a specific timeout.\n",
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n", "# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
"compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n", "compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
@@ -155,13 +163,6 @@
"# For a more detailed view of current AmlCompute status, use get_status()." "# For a more detailed view of current AmlCompute status, use get_status()."
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -186,18 +187,11 @@
"if not os.path.exists(project_folder):\n", "if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)\n", " os.makedirs(project_folder)\n",
" \n", " \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n", "pd.DataFrame(data_train.data[100:,:]).to_csv(\"data/X_train.csv\", index=False)\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")\n", "pd.DataFrame(data_train.target[100:]).to_csv(\"data/y_train.csv\", index=False)\n",
"\n", "\n",
"ds = ws.get_default_datastore()\n", "ds = ws.get_default_datastore()\n",
"ds.upload(src_dir='./data', target_path='bai_data', overwrite=True, show_progress=True)\n", "ds.upload(src_dir='./data', target_path='digitsdata', overwrite=True, show_progress=True)"
"\n",
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='bai_data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
] ]
}, },
{ {
@@ -208,6 +202,7 @@
"source": [ "source": [
"from azureml.core.runconfig import RunConfiguration\n", "from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n", "from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n", "\n",
"# create a new RunConfig object\n", "# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n", "conda_run_config = RunConfiguration(framework=\"python\")\n",
@@ -215,30 +210,28 @@
"# Set compute target to AmlCompute\n", "# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n", "conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n", "conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n", "\n",
"# set the data reference of the run coonfiguration\n", "cd = CondaDependencies.create(conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd" "conda_run_config.environment.python.conda_dependencies = cd"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating TabularDataset\n",
"\n",
"Defined X and y as `TabularDataset`s, which are passed to Automated ML in the AutoMLConfig. `from_delimited_files` by default sets the `infer_column_types` to true, which will infer the columns type automatically. If you do wish to manually set the column types, you can set the `set_column_types` argument to manually set the type of each columns."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%writefile $project_folder/get_data.py\n", "X = Dataset.Tabular.from_delimited_files(path=ds.path('digitsdata/X_train.csv'))\n",
"\n", "y = Dataset.Tabular.from_delimited_files(path=ds.path('digitsdata/y_train.csv'))"
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }\n"
] ]
}, },
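{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of overriding type inference with `set_column_types`; the column name `'0'` (pandas' default header for the unnamed label column) and the `DataType` import path are assumptions, not part of this sample."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical example: force the label column to be read as integers.\n",
"from azureml.data.dataset_factory import DataType\n",
"\n",
"y = Dataset.Tabular.from_delimited_files(\n",
"    path=ds.path('digitsdata/y_train.csv'),\n",
"    set_column_types={'0': DataType.to_long()})"
]
},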
{ {
@@ -280,7 +273,8 @@
" debug_log = 'automl_errors.log',\n", " debug_log = 'automl_errors.log',\n",
" path = project_folder,\n", " path = project_folder,\n",
" run_configuration=conda_run_config,\n", " run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n", " X = X,\n",
" y = y,\n",
" **automl_settings\n", " **automl_settings\n",
" )\n" " )\n"
] ]

View File

@@ -0,0 +1,10 @@
name: auto-ml-remote-amlcompute
dependencies:
- pip:
- azureml-sdk
- azureml-defaults
- azureml-explain-model
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -1,515 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution using attach**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the scikit-learn's [20newsgroup](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html) to showcase how you can use AutoML to handle text data with remote attach.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Attach an existing DSVM to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using the DSVM.\n",
"5. Explore the results.\n",
"6. Test the best fitted model.\n",
"\n",
"In addition this notebook showcases the following features\n",
"- **Parallel** executions for iterations\n",
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`\n",
"- Handling **text** data using the `preprocess` flag"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-attach'\n",
"project_folder = './sample_projects/automl-remote-attach'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach a Remote Linux DSVM\n",
"To use a remote Docker compute target:\n",
"1. Create a Linux DSVM in Azure, following these [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor (not CentOS). Make sure that disk space is available under `/tmp` because AutoML creates files under `/tmp/azureml_run`s. The DSVM should have more cores than the number of parallel runs that you plan to enable. It should also have at least 4GB per core.\n",
"2. Enter the IP address, user name and password below.\n",
"\n",
"**Note:** By default, SSH runs on port 22 and you don't need to change the port number below. If you've configured SSH to use a different port, change `dsvm_ssh_port` accordinglyaddress. [Read more](https://docs.microsoft.com/en-us/azure/virtual-machines/troubleshooting/detailed-troubleshoot-ssh-connection) on changing SSH ports for security reasons."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, RemoteCompute\n",
"import time\n",
"\n",
"# Add your VM information below\n",
"# If a compute with the specified compute_name already exists, it will be used and the dsvm_ip_addr, dsvm_ssh_port, \n",
"# dsvm_username and dsvm_password will be ignored.\n",
"compute_name = 'mydsvmb'\n",
"dsvm_ip_addr = '<<ip_addr>>'\n",
"dsvm_ssh_port = 22\n",
"dsvm_username = '<<username>>'\n",
"dsvm_password = '<<password>>'\n",
"\n",
"if compute_name in ws.compute_targets:\n",
" print('Using existing compute.')\n",
" dsvm_compute = ws.compute_targets[compute_name]\n",
"else:\n",
" attach_config = RemoteCompute.attach_configuration(address=dsvm_ip_addr, username=dsvm_username, password=dsvm_password, ssh_port=dsvm_ssh_port)\n",
" ComputeTarget.attach(workspace=ws, name=compute_name, attach_configuration=attach_config)\n",
"\n",
" while ws.compute_targets[compute_name].provisioning_state == 'Creating':\n",
" time.sleep(1)\n",
"\n",
" dsvm_compute = ws.compute_targets[compute_name]\n",
" \n",
" if dsvm_compute.provisioning_state == 'Failed':\n",
" print('Attached failed.')\n",
" print(dsvm_compute.provisioning_errors)\n",
" dsvm_compute.detach()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"For remote executions you should author a `get_data.py` file containing a `get_data()` function. This file should be in the root directory of the project. You can encapsulate code to read data either from a blob storage or local disk in this file.\n",
"In this example, the `get_data()` function returns a [dictionary](README.md#getdata)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import numpy as np\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"def get_data():\n",
" remove = ('headers', 'footers', 'quotes')\n",
" categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
" data_train = fetch_20newsgroups(subset = 'train', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
" \n",
" X_train = np.array(data_train.data).reshape((len(data_train.data),1))\n",
" y_train = np.array(data_train.target)\n",
" \n",
" return { \"X\" : X_train, \"y\" : y_train }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n",
"**Note:** When using Remote DSVM, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM.|\n",
"|**preprocess**|Setting this to *True* enables AutoML to perform preprocessing on the input to handle *missing data*, and to perform some common *feature extraction*.|\n",
"|**enable_cache**|Setting this to *True* enables preprocess done once and reuse the same preprocessed data for all the iterations. Default value is True.\n",
"|**max_cores_per_iteration**|Indicates how many cores on the compute target would be used to train a single pipeline.<br>Default is *1*; you can set it to *-1* to use all cores.|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 60,\n",
" \"iterations\": 4,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_cores_per_iteration\": 2\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pre-process cache cleanup\n",
"The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `accuracy` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lookup_metric = \"accuracy\"\n",
"# best_run, fitted_model = remote_run.get_output(metric = lookup_metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 0\n",
"zero_run, zero_model = remote_run.get_output(iteration = iteration)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test data.\n",
"from pandas_ml import ConfusionMatrix\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"remove = ('headers', 'footers', 'quotes')\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"\n",
"data_test = fetch_20newsgroups(subset = 'test', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
"\n",
"X_test = np.array(data_test.data).reshape((len(data_test.data),1))\n",
"y_test = data_test.target\n",
"\n",
"# Test our best pipeline.\n",
"\n",
"y_pred = fitted_model.predict(X_test)\n",
"y_pred_strings = [data_test.target_names[i] for i in y_pred]\n",
"y_test_strings = [data_test.target_names[i] for i in y_test]\n",
"\n",
"cm = ConfusionMatrix(y_test_strings, y_pred_strings)\n",
"print(cm)\n",
"cm.plot()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,583 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution with DataStore**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"This sample accesses a data file on a remote DSVM through DataStore. Advantages of using data store are:\n",
"1. DataStore secures the access details.\n",
"2. DataStore supports read, write to blob and file store\n",
"3. AutoML natively supports copying data from DataStore to DSVM\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you would see\n",
"1. Storing data in DataStore.\n",
"2. get_data returning data from DataStore."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import time\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.compute import DsvmCompute\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for experiment\n",
"experiment_name = 'automl-remote-datastore-file'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-remote-datastore-file'\n",
"\n",
"experiment=Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Remote Linux DSVM\n",
"Note: If creation fails with a message about Marketplace purchase eligibilty, go to portal.azure.com, start creating DSVM there, and select \"Want to create programmatically\" to enable programmatic creation. Once you've enabled it, you can exit without actually creating VM.\n",
"\n",
"**Note**: By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you can switch to a different port (such as 5022), you can append the port number to the address. [Read more](https://docs.microsoft.com/en-us/azure/virtual-machines/troubleshooting/detailed-troubleshoot-ssh-connection) on this."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compute_target_name = 'mydsvmc'\n",
"\n",
"try:\n",
" while ws.compute_targets[compute_target_name].provisioning_state == 'Creating':\n",
" time.sleep(1)\n",
" \n",
" dsvm_compute = DsvmCompute(workspace=ws, name=compute_target_name)\n",
" print('found existing:', dsvm_compute.name)\n",
"except:\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size=\"Standard_D2_v2\")\n",
" dsvm_compute = DsvmCompute.create(ws, name=compute_target_name, provisioning_configuration=dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output=True)\n",
" print(\"Waiting one minute for ssh to be accessible\")\n",
" time.sleep(90) # Wait for ssh to be accessible"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"\n",
"### Copy data file to local\n",
"\n",
"Download the data file.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data') "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"import csv\n",
"\n",
"remove = ('headers', 'footers', 'quotes')\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"data_train = fetch_20newsgroups(subset = 'train', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload data to the cloud"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now make the data accessible remotely by uploading that data from your local machine into Azure so it can be accessed for remote training. The datastore is a convenient construct associated with your workspace for you to upload/download data, and interact with it from your remote compute targets. It is backed by Azure blob storage account.\n",
"\n",
"The data.tsv files are uploaded into a directory named data at the root of the datastore."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#blob_datastore = Datastore(ws, blob_datastore_name)\n",
"ds = ws.get_default_datastore()\n",
"print(ds.datastore_type, ds.account_name, ds.container_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ds.upload_files(\"data.tsv\")\n",
"ds.upload(src_dir='./data', target_path='data', overwrite=True, show_progress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure & Run\n",
"\n",
"First let's create a DataReferenceConfigruation object to inform the system what data folder to download to the compute target.\n",
"The path_on_compute should be an absolute path to ensure that the data files are downloaded only once. The get_data method should use this same path to access the data files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Get Data File\n",
"For remote executions you should author a get_data.py file containing a get_data() function. This file should be in the root directory of the project. You can encapsulate code to read data either from a blob storage or local disk in this file.\n",
"\n",
"The *get_data()* function returns a [dictionary](README.md#getdata).\n",
"\n",
"The read_csv uses the path_on_compute value specified in the DataReferenceConfiguration call plus the path_on_datastore folder and then the actual file name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify automl_settings as **kwargs** as well. Also note that you can use the get_data() symantic for local excutions too. \n",
"\n",
"<i>Note: For Remote DSVM and Batch AI you cannot pass Numpy arrays directly to AutoMLConfig.</i>\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration|\n",
"|**iterations**|Number of iterations. In each iteration Auto ML trains a specific pipeline with the data|\n",
"|**n_cross_validations**|Number of cross validation splits|\n",
"|**max_concurrent_iterations**|Max number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM\n",
"|**preprocess**| *True/False* <br>Setting this to *True* enables Auto ML to perform preprocessing <br>on the input to handle *missing data*, and perform some common *feature extraction*|\n",
"|**enable_cache**|Setting this to *True* enables preprocess done once and reuse the same preprocessed data for all the iterations. Default value is True.|\n",
"|**max_cores_per_iteration**| Indicates how many cores on the compute target would be used to train a single pipeline.<br> Default is *1*, you can set it to *-1* to use all cores|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 60,\n",
" \"iterations\": 4,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_cores_per_iteration\": 1,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path=project_folder,\n",
" run_configuration=conda_run_config,\n",
" #compute_target = dsvm_compute,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets/models even when the experiment is running to retreive the best model up to that point. Once you are satisfied with the model you can cancel a particular iteration or the whole run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"#### Widget for monitoring runs\n",
"\n",
"The widget will sit on \"loading\" until the first iteration completed, then you will see an auto-updating graph and table show up. It refreshed once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under /tmp/azureml_run/{iterationid}/azureml-logs\n",
"\n",
"NOTE: The widget displays a link at the bottom. This links to a web-ui to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use sdk methods to fetch all the child runs and see individual metrics that we log. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Canceling Runs\n",
"You can cancel ongoing remote runs using the *cancel()* and *cancel_iteration()* functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pre-process cache cleanup\n",
"The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The *get_output* method returns the best run and the fitted model. There are overloads on *get_output* that allow you to retrieve the best run and fitted model for *any* logged metric or a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model based on any other metric"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lookup_metric = \"accuracy\"\n",
"# best_run, fitted_model = remote_run.get_output(metric=lookup_metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a specific iteration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# iteration = 1\n",
"# best_run, fitted_model = remote_run.get_output(iteration=iteration)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test data.\n",
"from pandas_ml import ConfusionMatrix\n",
"\n",
"data_test = fetch_20newsgroups(subset = 'test', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
"\n",
"X_test = np.array(data_test.data).reshape((len(data_test.data),1))\n",
"y_test = data_test.target\n",
"\n",
"# Test our best pipeline.\n",
"\n",
"y_pred = fitted_model.predict(X_test)\n",
"y_pred_strings = [data_test.target_names[i] for i in y_pred]\n",
"y_test_strings = [data_test.target_names[i] for i in y_test]\n",
"\n",
"cm = ConfusionMatrix(y_test_strings, y_pred_strings)\n",
"print(cm)\n",
"cm.plot()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/sample-weight/auto-ml-sample-weight.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

View File

@@ -0,0 +1,8 @@
name: auto-ml-sample-weight
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/sparse-data-train-test-split/auto-ml-sparse-data-train-test-split.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

View File

@@ -0,0 +1,8 @@
name: auto-ml-sparse-data-train-test-split
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,113 @@
# Table of Contents
1. [Introduction](#introduction)
1. [Setup using Azure Data Studio](#azuredatastudiosetup)
1. [Energy demand example using Azure Data Studio](#azuredatastudioenergydemand)
1. [Setup using SQL Server Management Studio for SQL Server 2017 on Windows](#ssms2017)
1. [Setup using SQL Server Management Studio for SQL Server 2019 on Linux](#ssms2019)
1. [Energy demand example using SQL Server Management Studio](#ssmsenergydemand)
<a name="introduction"></a>
# Introduction
SQL Server 2017 or 2019 can call Azure ML automated machine learning to create models trained on data from SQL Server.
This uses the sp_execute_external_script stored procedure, which can call Python scripts.
SQL Server 2017 and SQL Server 2019 can both run on Windows or Linux.
However, this integration is not available for SQL Server 2017 on Linux.
This folder shows how to set up the integration and includes a sample that uses it to train and predict on an energy demand dataset.
This integration is part of SQL Server and so can be used from any SQL client.
These instructions show how to use it from Azure Data Studio or SQL Server Management Studio.
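As a quick sanity check, you can run a trivial Python script through that same stored procedure (a minimal sketch; the AutoML procedures in this folder are built on exactly this call):
```sql
-- Confirm that SQL Server can invoke Python via sp_execute_external_script.
EXEC sp_execute_external_script
    @language = N'Python',
    @script = N'print("Python in SQL Server is working")';
```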
<a name="azuredatastudiosetup"></a>
## Setup using Azure Data Studio
These steps show how to set up the integration using Azure Data Studio.
1. If you don't already have SQL Server, you can install it from [https://www.microsoft.com/en-us/sql-server/sql-server-downloads](https://www.microsoft.com/en-us/sql-server/sql-server-downloads)
1. Install Azure Data Studio from [https://docs.microsoft.com/en-us/sql/azure-data-studio/download?view=sql-server-2017](https://docs.microsoft.com/en-us/sql/azure-data-studio/download?view=sql-server-2017)
1. Start Azure Data Studio and connect to SQL Server. [https://docs.microsoft.com/en-us/sql/azure-data-studio/sql-notebooks?view=sql-server-2017](https://docs.microsoft.com/en-us/sql/azure-data-studio/sql-notebooks?view=sql-server-2017)
1. Create a database named "automl".
1. Open the notebook how-to-use-azureml\automated-machine-learning\sql-server\setup\auto-ml-sql-setup.ipynb and follow the instructions in it.
<a name="azuredatastudioenergydemand"></a>
## Energy demand example using Azure Data Studio
Once you have completed the setup, you can try the energy demand sample in the notebook energy-demand\auto-ml-sql-energy-demand.ipynb.
It has cells to train a model, predict with the model, and show metrics for each pipeline run during training; the training call is sketched below.
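At its core, the training cell wraps the AutoMLTrain stored procedure. A condensed sketch (the full query text is in energy-demand/TrainEnergyDemand.sql, shown later in this folder; the SELECT is elided here):
```sql
-- Train forecasting models on nyc_energy and store the best one in aml_model.
DECLARE @TrainDataQuery NVARCHAR(MAX) = N'SELECT ... FROM nyc_energy ...'
INSERT INTO dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
EXEC dbo.AutoMLTrain @input_query=@TrainDataQuery,
     @label_column='demand',
     @task='forecasting',
     @iterations=10,
     @time_column_name='timeStamp',
     @experiment_name='automl-sql-forecast'
```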
<a name="ssms2017"></a>
## Setup using SQL Server Management Studio for SQL Server 2017 on Windows
These instructions set up the integration for SQL Server 2017 on Windows.
1. If you don't already have SQL Server, you can install it from [https://www.microsoft.com/en-us/sql-server/sql-server-downloads](https://www.microsoft.com/en-us/sql-server/sql-server-downloads)
2. Enable external scripts with the following commands:
```sql
sp_configure 'external scripts enabled',1
reconfigure with override
```
3. Stop SQL Server.
4. Install the automated machine learning libraries using the following commands from an Administrator command prompt (if you are using a non-default SQL Server instance name, replace MSSQLSERVER in the second command with the instance name):
```sh
cd "C:\Program Files\Microsoft SQL Server"
cd "MSSQL14.MSSQLSERVER\PYTHON_SERVICES"
python.exe -m pip install azureml-sdk[automl]
python.exe -m pip install --upgrade numpy
python.exe -m pip install --upgrade sklearn
```
5. Start SQL Server and the "SQL Server Launchpad" service.
6. In Windows Firewall, open Advanced Settings and, under Outbound Rules, disable "Block network access for R local user accounts in SQL Server instance xxxx".
7. Execute the files in the setup folder in SQL Server Management Studio: aml_model.sql, aml_connection.sql, AutoMLGetMetrics.sql, AutoMLPredict.sql and AutoMLTrain.sql
8. Create an Azure Machine Learning Workspace. You can use the instructions at: [https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace)
9. Create a config.json file using the subscription id, resource group name, and workspace name that you used to create the workspace. The file format is described at: [https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-environment#workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-environment#workspace)
10. Create an Azure service principal. You can do this with the commands:
```sh
az login
az account set --subscription subscriptionid
az ad sp create-for-rbac --name principlename --password password
```
11. Insert the values \<tenant\>, \<AppId\> and \<password\> returned by create-for-rbac above into the aml_connection table. Set \<path\> as the absolute path to your config.json file. Set the name to 'Default'.
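For example, given the table created by setup/aml_connection.sql, the insert looks like this (all values are placeholders):
```sql
-- Placeholder values; substitute the output of az ad sp create-for-rbac
-- and the real absolute path to your config.json.
INSERT INTO dbo.aml_connection (ConnectionName, TenantId, AppId, Password, ConfigFile)
VALUES (N'Default', N'<tenant>', N'<AppId>', N'<password>', N'C:\aml\config.json');
```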
<a name="ssms2019"></a>
## Setup using SQL Server Management Studio for SQL Server 2019 on Linux
1. Install SQL Server 2019 from: [https://www.microsoft.com/en-us/sql-server/sql-server-downloads](https://www.microsoft.com/en-us/sql-server/sql-server-downloads)
2. Install machine learning support from: [https://docs.microsoft.com/en-us/sql/linux/sql-server-linux-setup-machine-learning?view=sqlallproducts-allversions#ubuntu](https://docs.microsoft.com/en-us/sql/linux/sql-server-linux-setup-machine-learning?view=sqlallproducts-allversions#ubuntu)
3. Then install SQL Server Management Studio from [https://docs.microsoft.com/en-us/sql/ssms/download-sql-server-management-studio-ssms?view=sql-server-2017](https://docs.microsoft.com/en-us/sql/ssms/download-sql-server-management-studio-ssms?view=sql-server-2017)
4. Enable external scripts with the following commands:
```sql
sp_configure 'external scripts enabled',1
reconfigure with override
```
5. Stop SQL Server.
6. Install the automated machine learning libraries using the following commands from a terminal (if you are using a non-default SQL Server instance name, replace MSSQLSERVER with the instance name):
```sh
sudo /opt/mssql/mlservices/bin/python/python -m pip install azureml-sdk[automl]
sudo /opt/mssql/mlservices/bin/python/python -m pip install --upgrade numpy
sudo /opt/mssql/mlservices/bin/python/python -m pip install --upgrade sklearn
```
7. Start SQL Server.
8. Execute the files aml_model.sql, aml_connection.sql, AutoMLGetMetrics.sql, AutoMLPredict.sql, AutoMLForecast.sql and AutoMLTrain.sql in SQL Server Management Studio.
9. Create an Azure Machine Learning Workspace. You can use the instructions at: [https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace)
10. Create a config.json file using the subscription id, resource group name, and workspace name that you used to create the workspace. The file format is described at: [https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-environment#workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-environment#workspace)
11. Create an Azure service principal. You can do this with the commands:
```sh
az login
az account set --subscription subscriptionid
az ad sp create-for-rbac --name principlename --password password
```
12. Insert the values \<tenant\>, \<AppId\> and \<password\> returned by create-for-rbac above into the aml_connection table. Set \<path\> as the absolute path to your config.json file. Set the name to 'Default'.
<a name="ssmsenergydemand"></a>
## Energy demand example using SQL Server Management Studio
Once you have completed the setup, you can try the energy demand sample queries.
First you need to load the sample data into the database.
1. In SQL Server Management Studio, right-click the database, select Tasks, then Import Flat File.
1. Select the file MachineLearningNotebooks\notebooks\how-to-use-azureml\automated-machine-learning\forecasting-energy-demand\nyc_energy.csv.
1. When you get to the column definition page, allow nulls for all columns.
You can then run the queries in the energy-demand folder:
* TrainEnergyDemand.sql runs AutoML to train multiple models on the data and select the best one.
* ForecastEnergyDemand.sql forecasts based on the most recent training run.
* GetMetrics.sql returns all the metrics for each model in the most recent training run.

View File

@@ -0,0 +1,23 @@
-- This shows how to use the AutoMLForecast stored procedure to predict with a forecasting model for the nyc_energy dataset.
DECLARE @Model NVARCHAR(MAX) = (SELECT TOP 1 Model FROM dbo.aml_model
WHERE ExperimentName = 'automl-sql-forecast'
ORDER BY CreatedDate DESC)
DECLARE @max_horizon INT = 48
DECLARE @split_time NVARCHAR(22) = (SELECT DATEADD(hour, -@max_horizon, MAX(timeStamp)) FROM nyc_energy WHERE demand IS NOT NULL)
DECLARE @TestDataQuery NVARCHAR(MAX) = '
SELECT CAST(timeStamp AS NVARCHAR(30)) AS timeStamp,
demand,
precip,
temp
FROM nyc_energy
WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL
AND timeStamp > ''' + @split_time + ''''
EXEC dbo.AutoMLForecast @input_query=@TestDataQuery,
@label_column='demand',
@time_column_name='timeStamp',
@model=@model
WITH RESULT SETS ((timeStamp DATETIME, grain NVARCHAR(255), predicted_demand FLOAT, precip FLOAT, temp FLOAT, actual_demand FLOAT))

View File

@@ -0,0 +1,10 @@
-- This lists all the metrics for all iterations for the most recent run.
DECLARE @RunId NVARCHAR(43)
DECLARE @ExperimentName NVARCHAR(255)
SELECT TOP 1 @ExperimentName=ExperimentName, @RunId=SUBSTRING(RunId, 1, 43)
FROM aml_model
ORDER BY CreatedDate DESC
EXEC dbo.AutoMLGetMetrics @RunId, @ExperimentName

View File

@@ -0,0 +1,17 @@
-- This shows how to use the AutoMLPredict stored procedure to predict with a forecasting model for the nyc_energy dataset.
DECLARE @Model NVARCHAR(MAX) = (SELECT TOP 1 Model FROM dbo.aml_model
WHERE ExperimentName = 'automl-sql-forecast'
ORDER BY CreatedDate DESC)
EXEC dbo.AutoMLPredict @input_query='
SELECT CAST(timeStamp AS NVARCHAR(30)) AS timeStamp,
demand,
precip,
temp
FROM nyc_energy
WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL
AND timeStamp >= ''2017-02-01''',
@label_column='demand',
@model=@model
WITH RESULT SETS ((timeStamp NVARCHAR(30), actual_demand FLOAT, precip FLOAT, temp FLOAT, predicted_demand FLOAT))

View File

@@ -0,0 +1,25 @@
-- This shows how to use the AutoMLTrain stored procedure to create a forecasting model for the nyc_energy dataset.
DECLARE @max_horizon INT = 48
DECLARE @split_time NVARCHAR(22) = (SELECT DATEADD(hour, -@max_horizon, MAX(timeStamp)) FROM nyc_energy WHERE demand IS NOT NULL)
DECLARE @TrainDataQuery NVARCHAR(MAX) = '
SELECT CAST(timeStamp as NVARCHAR(30)) as timeStamp,
demand,
precip,
temp
FROM nyc_energy
WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL
and timeStamp < ''' + @split_time + ''''
INSERT INTO dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
EXEC dbo.AutoMLTrain @input_query= @TrainDataQuery,
@label_column='demand',
@task='forecasting',
@iterations=10,
@iteration_timeout_minutes=5,
@time_column_name='timeStamp',
@max_horizon=@max_horizon,
@experiment_name='automl-sql-forecast',
@primary_metric='normalized_root_mean_squared_error'

View File

@@ -0,0 +1,141 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train a model and use it for prediction\r\n",
"\r\n",
"Before running this notebook, run the auto-ml-sql-setup.ipynb notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/sql-server/energy-demand/auto-ml-sql-energy-demand.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set the default database"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"USE [automl]\r\n",
"GO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use the AutoMLTrain stored procedure to create a forecasting model for the nyc_energy dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"INSERT INTO dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
"EXEC dbo.AutoMLTrain @input_query='\r\n",
"SELECT CAST(timeStamp as NVARCHAR(30)) as timeStamp,\r\n",
" demand,\r\n",
"\t precip,\r\n",
"\t temp,\r\n",
"\t CASE WHEN timeStamp < ''2017-01-01'' THEN 0 ELSE 1 END AS is_validate_column\r\n",
"FROM nyc_energy\r\n",
"WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL\r\n",
"and timeStamp < ''2017-02-01''',\r\n",
"@label_column='demand',\r\n",
"@task='forecasting',\r\n",
"@iterations=10,\r\n",
"@iteration_timeout_minutes=5,\r\n",
"@time_column_name='timeStamp',\r\n",
"@is_validate_column='is_validate_column',\r\n",
"@experiment_name='automl-sql-forecast',\r\n",
"@primary_metric='normalized_root_mean_squared_error'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use the AutoMLPredict stored procedure to predict using the forecasting model for the nyc_energy dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DECLARE @Model NVARCHAR(MAX) = (SELECT TOP 1 Model FROM dbo.aml_model\r\n",
" WHERE ExperimentName = 'automl-sql-forecast'\r\n",
"\t\t\t\t\t\t\t\tORDER BY CreatedDate DESC)\r\n",
"\r\n",
"EXEC dbo.AutoMLPredict @input_query='\r\n",
"SELECT CAST(timeStamp AS NVARCHAR(30)) AS timeStamp,\r\n",
" demand,\r\n",
"\t precip,\r\n",
"\t temp\r\n",
"FROM nyc_energy\r\n",
"WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL\r\n",
"AND timeStamp >= ''2017-02-01''',\r\n",
"@label_column='demand',\r\n",
"@model=@model\r\n",
"WITH RESULT SETS ((timeStamp NVARCHAR(30), actual_demand FLOAT, precip FLOAT, temp FLOAT, predicted_demand FLOAT))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## List all the metrics for all iterations for the most recent training run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DECLARE @RunId NVARCHAR(43)\r\n",
"DECLARE @ExperimentName NVARCHAR(255)\r\n",
"\r\n",
"SELECT TOP 1 @ExperimentName=ExperimentName, @RunId=SUBSTRING(RunId, 1, 43)\r\n",
"FROM aml_model\r\n",
"ORDER BY CreatedDate DESC\r\n",
"\r\n",
"EXEC dbo.AutoMLGetMetrics @RunId, @ExperimentName"
]
}
],
"metadata": {
"authors": [
{
"name": "jeffshep"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "sql",
"name": "python36"
},
"language_info": {
"name": "sql",
"version": ""
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,92 @@
-- This procedure forecasts values based on a forecasting model returned by AutoMLTrain.
-- It returns a dataset with the forecasted values.
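-- An example call, condensed from energy-demand/ForecastEnergyDemand.sql
-- (@TestDataQuery and @Model are declared by the caller):
-- EXEC dbo.AutoMLForecast @input_query=@TestDataQuery,
--                         @label_column='demand',
--                         @time_column_name='timeStamp',
--                         @model=@Model
-- WITH RESULT SETS ((timeStamp DATETIME, grain NVARCHAR(255), predicted_demand FLOAT, precip FLOAT, temp FLOAT, actual_demand FLOAT))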
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLForecast]
(
@input_query NVARCHAR(MAX), -- A SQL query returning data to predict on.
@model NVARCHAR(MAX), -- A model returned from AutoMLTrain.
@time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.
@label_column NVARCHAR(255)='', -- Optional name of the column from input_query, which should be ignored when predicting
@y_query_column NVARCHAR(255)='', -- Optional value column that can be used for predicting.
-- If specified, this can contain values for past times (after the model was trained)
-- and NaN for future times.
@forecast_column_name NVARCHAR(255) = 'predicted'
-- The name of the output column containing the forecast value.
) AS
BEGIN
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import azureml.core
import numpy as np
from azureml.train.automl import AutoMLConfig
import pickle
import codecs
model_obj = pickle.loads(codecs.decode(model.encode(), "base64"))
test_data = input_data.copy()
if label_column != "" and label_column is not None:
y_test = test_data.pop(label_column).values
else:
y_test = None
if y_query_column != "" and y_query_column is not None:
y_query = test_data.pop(y_query_column).values
else:
y_query = np.repeat(np.nan, len(test_data))
X_test = test_data
if time_column_name != "" and time_column_name is not None:
X_test[time_column_name] = pd.to_datetime(X_test[time_column_name])
y_fcst, X_trans = model_obj.forecast(X_test, y_query)
def align_outputs(y_forecast, X_trans, X_test, y_test, forecast_column_name):
# Demonstrates how to get the output aligned to the inputs
# using pandas indexes. Helps understand what happened if
# the output shape differs from the input shape, or if
# the data got re-sorted by time and grain during forecasting.
# Typical causes of misalignment are:
# * we predicted some periods that were missing in actuals -> drop from eval
# * model was asked to predict past max_horizon -> increase max horizon
# * data at start of X_test was needed for lags -> provide previous periods
df_fcst = pd.DataFrame({forecast_column_name : y_forecast})
# y and X outputs are aligned by forecast() function contract
df_fcst.index = X_trans.index
# align original X_test to y_test
X_test_full = X_test.copy()
if y_test is not None:
X_test_full[label_column] = y_test
# X_test_full does not include origin, so reset for merge
df_fcst.reset_index(inplace=True)
X_test_full = X_test_full.reset_index().drop(columns=''index'')
together = df_fcst.merge(X_test_full, how=''right'')
# drop rows where prediction or actuals are nan
# happens because of missing actuals
# or at edges of time due to lags/rolling windows
clean = together[together[[label_column, forecast_column_name]].notnull().all(axis=1)]
return(clean)
combined_output = align_outputs(y_fcst, X_trans, X_test, y_test, forecast_column_name)
'
, @input_data_1 = @input_query
, @input_data_1_name = N'input_data'
, @output_data_1_name = N'combined_output'
, @params = N'@model NVARCHAR(MAX), @time_column_name NVARCHAR(255), @label_column NVARCHAR(255), @y_query_column NVARCHAR(255), @forecast_column_name NVARCHAR(255)'
, @model = @model
, @time_column_name = @time_column_name
, @label_column = @label_column
, @y_query_column = @y_query_column
, @forecast_column_name = @forecast_column_name
END

View File

@@ -0,0 +1,70 @@
-- This procedure returns a list of metrics for each iteration of a run.
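-- An example call, from energy-demand/GetMetrics.sql, where @RunId and
-- @ExperimentName are read from the most recent row of aml_model:
-- EXEC dbo.AutoMLGetMetrics @RunId, @ExperimentName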
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLGetMetrics]
(
@run_id NVARCHAR(250), -- The RunId
@experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.
@connection_name NVARCHAR(255)='default' -- The AML connection to use.
) AS
BEGIN
DECLARE @tenantid NVARCHAR(255)
DECLARE @appid NVARCHAR(255)
DECLARE @password NVARCHAR(255)
DECLARE @config_file NVARCHAR(255)
SELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile
FROM aml_connection
WHERE ConnectionName = @connection_name;
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import logging
import azureml.core
import numpy as np
from azureml.core.experiment import Experiment
from azureml.train.automl.run import AutoMLRun
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.workspace import Workspace
auth = ServicePrincipalAuthentication(tenantid, appid, password)
ws = Workspace.from_config(path=config_file, auth=auth)
experiment = Experiment(ws, experiment_name)
ml_run = AutoMLRun(experiment = experiment, run_id = run_id)
children = list(ml_run.get_children())
iterationlist = []
metricnamelist = []
metricvaluelist = []
for run in children:
properties = run.get_properties()
if "iteration" in properties:
iteration = int(properties["iteration"])
for metric_name, metric_value in run.get_metrics().items():
if isinstance(metric_value, float):
iterationlist.append(iteration)
metricnamelist.append(metric_name)
metricvaluelist.append(metric_value)
metrics = pd.DataFrame({"iteration": iterationlist, "metric_name": metricnamelist, "metric_value": metricvaluelist})
'
, @output_data_1_name = N'metrics'
, @params = N'@run_id NVARCHAR(250),
@experiment_name NVARCHAR(32),
@tenantid NVARCHAR(255),
@appid NVARCHAR(255),
@password NVARCHAR(255),
@config_file NVARCHAR(255)'
, @run_id = @run_id
, @experiment_name = @experiment_name
, @tenantid = @tenantid
, @appid = @appid
, @password = @password
, @config_file = @config_file
WITH RESULT SETS ((iteration INT, metric_name NVARCHAR(100), metric_value FLOAT))
END

View File

@@ -0,0 +1,41 @@
-- This procedure predicts values based on a model returned by AutoMLTrain and a dataset.
-- It returns the dataset with a new column added, which is the predicted value.
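-- An example call, condensed from energy-demand/PredictEnergyDemand.sql
-- (@Model is declared by the caller; the SELECT is elided here):
-- EXEC dbo.AutoMLPredict @input_query='SELECT ... FROM nyc_energy ...',
--                        @label_column='demand',
--                        @model=@Model
-- WITH RESULT SETS ((timeStamp NVARCHAR(30), actual_demand FLOAT, precip FLOAT, temp FLOAT, predicted_demand FLOAT))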
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLPredict]
(
@input_query NVARCHAR(MAX), -- A SQL query returning data to predict on.
@model NVARCHAR(MAX), -- A model returned from AutoMLTrain.
@label_column NVARCHAR(255)='' -- Optional name of the column from input_query, which should be ignored when predicting
) AS
BEGIN
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import azureml.core
import numpy as np
from azureml.train.automl import AutoMLConfig
import pickle
import codecs
model_obj = pickle.loads(codecs.decode(model.encode(), "base64"))
test_data = input_data.copy()
if label_column != "" and label_column is not None:
y_test = test_data.pop(label_column).values
X_test = test_data
predicted = model_obj.predict(X_test)
combined_output = input_data.assign(predicted=predicted)
'
, @input_data_1 = @input_query
, @input_data_1_name = N'input_data'
, @output_data_1_name = N'combined_output'
, @params = N'@model NVARCHAR(MAX), @label_column NVARCHAR(255)'
, @model = @model
, @label_column = @label_column
END

View File

@@ -0,0 +1,240 @@
-- This stored procedure uses automated machine learning to train several models
-- and returns the best model.
--
-- The result set has several columns:
-- best_run - iteration ID for the best model
-- experiment_name - experiment name passed in with the @experiment_name parameter
-- fitted_model - best model found
-- log_file_text - AutoML debug_log contents
-- workspace - name of the Azure ML workspace where run history is stored
--
-- An example call for a classification problem is:
-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
-- exec dbo.AutoMLTrain @input_query='
-- SELECT top 100000
-- CAST([pickup_datetime] AS NVARCHAR(30)) AS pickup_datetime
-- ,CAST([dropoff_datetime] AS NVARCHAR(30)) AS dropoff_datetime
-- ,[passenger_count]
-- ,[trip_time_in_secs]
-- ,[trip_distance]
-- ,[payment_type]
-- ,[tip_class]
-- FROM [dbo].[nyctaxi_sample] order by [hack_license] ',
-- @label_column = 'tip_class',
-- @iterations=10
--
-- An example call for forecasting is:
-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
-- exec dbo.AutoMLTrain @input_query='
-- select cast(timeStamp as nvarchar(30)) as timeStamp,
-- demand,
-- precip,
-- temp,
-- case when timeStamp < ''2017-01-01'' then 0 else 1 end as is_validate_column
-- from nyc_energy
-- where demand is not null and precip is not null and temp is not null
-- and timeStamp < ''2017-02-01''',
-- @label_column='demand',
-- @task='forecasting',
-- @iterations=10,
-- @iteration_timeout_minutes=5,
-- @time_column_name='timeStamp',
-- @is_validate_column='is_validate_column',
-- @experiment_name='automl-sql-forecast',
-- @primary_metric='normalized_root_mean_squared_error'
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLTrain]
(
@input_query NVARCHAR(MAX), -- The SQL Query that will return the data to train and validate the model.
@label_column NVARCHAR(255)='Label', -- The name of the column in the result of @input_query that is the label.
@primary_metric NVARCHAR(40)='AUC_weighted', -- The metric to optimize.
@iterations INT=100, -- The maximum number of pipelines to train.
@task NVARCHAR(40)='classification', -- The type of task. Can be classification, regression or forecasting.
@experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.
@iteration_timeout_minutes INT = 15, -- The maximum time in minutes for training a single pipeline.
@experiment_timeout_minutes INT = 60, -- The maximum time in minutes for training all pipelines.
@n_cross_validations INT = 3, -- The number of cross validations.
@blacklist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that will not be used.
-- The list of possible models can be found at:
-- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
@whitelist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that can be used.
-- The list of possible models can be found at:
-- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
@experiment_exit_score FLOAT = 0, -- Stop the experiment if this score is achieved.
@sample_weight_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that gives a sample weight.
@is_validate_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that indicates if the row is for training or validation.
-- In the values of the column, 0 means for training and 1 means for validation.
@time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.
@connection_name NVARCHAR(255)='default', -- The AML connection to use.
@max_horizon INT = 0 -- A forecast horizon is a time span into the future (or just beyond the latest date in the training data)
-- where forecasts of the target quantity are needed.
-- For example, if data is recorded daily and max_horizon is 5, we will predict 5 days ahead.
) AS
BEGIN
DECLARE @tenantid NVARCHAR(255)
DECLARE @appid NVARCHAR(255)
DECLARE @password NVARCHAR(255)
DECLARE @config_file NVARCHAR(255)
SELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile
FROM aml_connection
WHERE ConnectionName = @connection_name;
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import logging
import azureml.core
import pandas as pd
import numpy as np
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
from sklearn import datasets
import pickle
import codecs
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.workspace import Workspace
if __name__.startswith("sqlindb"):
auth = ServicePrincipalAuthentication(tenantid, appid, password)
ws = Workspace.from_config(path=config_file, auth=auth)
project_folder = "./sample_projects/" + experiment_name
experiment = Experiment(ws, experiment_name)
data_train = input_data
X_valid = None
y_valid = None
sample_weight_valid = None
if is_validate_column != "" and is_validate_column is not None:
data_train = input_data[input_data[is_validate_column] <= 0]
data_valid = input_data[input_data[is_validate_column] > 0]
data_train.pop(is_validate_column)
data_valid.pop(is_validate_column)
y_valid = data_valid.pop(label_column).values
if sample_weight_column != "" and sample_weight_column is not None:
sample_weight_valid = data_valid.pop(sample_weight_column).values
X_valid = data_valid
n_cross_validations = None
y_train = data_train.pop(label_column).values
sample_weight = None
if sample_weight_column != "" and sample_weight_column is not None:
sample_weight = data_train.pop(sample_weight_column).values
X_train = data_train
if experiment_timeout_minutes == 0:
experiment_timeout_minutes = None
if experiment_exit_score == 0:
experiment_exit_score = None
if blacklist_models == "":
blacklist_models = None
if blacklist_models is not None:
blacklist_models = blacklist_models.replace(" ", "").split(",")
if whitelist_models == "":
whitelist_models = None
if whitelist_models is not None:
whitelist_models = whitelist_models.replace(" ", "").split(",")
automl_settings = {}
preprocess = True
if time_column_name != "" and time_column_name is not None:
automl_settings = { "time_column_name": time_column_name }
preprocess = False
if max_horizon > 0:
automl_settings["max_horizon"] = max_horizon
log_file_name = "automl_sqlindb_errors.log"
automl_config = AutoMLConfig(task = task,
debug_log = log_file_name,
primary_metric = primary_metric,
iteration_timeout_minutes = iteration_timeout_minutes,
experiment_timeout_minutes = experiment_timeout_minutes,
iterations = iterations,
n_cross_validations = n_cross_validations,
preprocess = preprocess,
verbosity = logging.INFO,
X = X_train,
y = y_train,
path = project_folder,
blacklist_models = blacklist_models,
whitelist_models = whitelist_models,
experiment_exit_score = experiment_exit_score,
sample_weight = sample_weight,
X_valid = X_valid,
y_valid = y_valid,
sample_weight_valid = sample_weight_valid,
**automl_settings)
local_run = experiment.submit(automl_config, show_output = True)
best_run, fitted_model = local_run.get_output()
pickled_model = codecs.encode(pickle.dumps(fitted_model), "base64").decode()
log_file_text = ""
try:
with open(log_file_name, "r") as log_file:
log_file_text = log_file.read()
except:
log_file_text = "Log file not found"
returned_model = pd.DataFrame({"best_run": [best_run.id], "experiment_name": [experiment_name], "fitted_model": [pickled_model], "log_file_text": [log_file_text], "workspace": [ws.name]}, dtype=np.dtype(np.str))
'
, @input_data_1 = @input_query
, @input_data_1_name = N'input_data'
, @output_data_1_name = N'returned_model'
, @params = N'@label_column NVARCHAR(255),
@primary_metric NVARCHAR(40),
@iterations INT, @task NVARCHAR(40),
@experiment_name NVARCHAR(32),
@iteration_timeout_minutes INT,
@experiment_timeout_minutes INT,
@n_cross_validations INT,
@blacklist_models NVARCHAR(MAX),
@whitelist_models NVARCHAR(MAX),
@experiment_exit_score FLOAT,
@sample_weight_column NVARCHAR(255),
@is_validate_column NVARCHAR(255),
@time_column_name NVARCHAR(255),
@tenantid NVARCHAR(255),
@appid NVARCHAR(255),
@password NVARCHAR(255),
@config_file NVARCHAR(255),
@max_horizon INT'
, @label_column = @label_column
, @primary_metric = @primary_metric
, @iterations = @iterations
, @task = @task
, @experiment_name = @experiment_name
, @iteration_timeout_minutes = @iteration_timeout_minutes
, @experiment_timeout_minutes = @experiment_timeout_minutes
, @n_cross_validations = @n_cross_validations
, @blacklist_models = @blacklist_models
, @whitelist_models = @whitelist_models
, @experiment_exit_score = @experiment_exit_score
, @sample_weight_column = @sample_weight_column
, @is_validate_column = @is_validate_column
, @time_column_name = @time_column_name
, @tenantid = @tenantid
, @appid = @appid
, @password = @password
, @config_file = @config_file
, @max_horizon = @max_horizon
WITH RESULT SETS ((best_run NVARCHAR(250), experiment_name NVARCHAR(100), fitted_model VARCHAR(MAX), log_file_text NVARCHAR(MAX), workspace NVARCHAR(100)))
END

View File

@@ -0,0 +1,18 @@
-- This is a table to store the Azure ML connection information.
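-- Populate it with the service principal values returned by az ad sp create-for-rbac,
-- for example (placeholder values; ConfigFile is the path to your config.json):
-- INSERT INTO dbo.aml_connection (ConnectionName, TenantId, AppId, Password, ConfigFile)
-- VALUES (N'Default', N'<tenant>', N'<AppId>', N'<password>', N'/tmp/aml/config.json')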
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[aml_connection](
[Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,
[ConnectionName] [nvarchar](255) NULL,
[TenantId] [nvarchar](255) NULL,
[AppId] [nvarchar](255) NULL,
[Password] [nvarchar](255) NULL,
[ConfigFile] [nvarchar](255) NULL
) ON [PRIMARY]
GO

View File

@@ -0,0 +1,22 @@
-- This is a table to hold the results from the AutoMLTrain procedure.
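-- The most recent model for an experiment can be retrieved with, for example:
-- SELECT TOP 1 Model FROM dbo.aml_model
-- WHERE ExperimentName = 'automl-sql-forecast'
-- ORDER BY CreatedDate DESC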
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[aml_model](
[Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,
[Model] [varchar](max) NOT NULL, -- The model, which can be passed to AutoMLPredict for testing or prediction.
[RunId] [nvarchar](250) NULL, -- The RunId, which can be used to view the model in the Azure Portal.
[CreatedDate] [datetime] NULL,
[ExperimentName] [nvarchar](100) NULL, -- Azure ML Experiment Name
[WorkspaceName] [nvarchar](100) NULL, -- Azure ML Workspace Name
[LogFileText] [nvarchar](max) NULL
)
GO
ALTER TABLE [dbo].[aml_model] ADD DEFAULT (getutcdate()) FOR [CreatedDate]
GO

View File

@@ -0,0 +1,561 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Set up Azure ML Automated Machine Learning on SQL Server 2019 CTP 2.4 big data cluster\r\n",
"\r\n",
"\\# Prerequisites: \r\n",
"\\# - An Azure subscription and resource group \r\n",
"\\# - An Azure Machine Learning workspace \r\n",
"\\# - A SQL Server 2019 CTP 2.4 big data cluster with Internet access and a database named 'automl' \r\n",
"\\# - Azure CLI \r\n",
"\\# - kubectl command \r\n",
"\\# - The https://github.com/Azure/MachineLearningNotebooks repository downloaded (cloned) to your local machine\r\n",
"\r\n",
"\\# In the 'automl' database, create a table named 'dbo.nyc_energy' as follows: \r\n",
"\\# - In SQL Server Management Studio, right-click the 'automl' database, select Tasks, then Import Flat File. \r\n",
"\\# - Select the file AzureMlCli\\notebooks\\how-to-use-azureml\\automated-machine-learning\\forecasting-energy-demand\\nyc_energy.csv. \r\n",
"\\# - Using the \"Modify Columns\" page, allow nulls for all columns. \r\n",
"\r\n",
"\\# Create an Azure Machine Learning Workspace using the instructions at https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace \r\n",
"\r\n",
"\\# Create an Azure service principal. You can do this with the following commands: \r\n",
"\r\n",
"az login \r\n",
"az account set --subscription *subscriptionid* \r\n",
"\r\n",
"\\# The following command prints out the **appId** and **tenant**, \r\n",
"\\# which you insert into the indicated cell later in this notebook \r\n",
"\\# to allow AutoML to authenticate with Azure: \r\n",
"\r\n",
"az ad sp create-for-rbac --name *principlename* --password *password*\r\n",
"\r\n",
"\\# Log into the master instance of SQL Server 2019 CTP 2.4: \r\n",
"kubectl exec -it mssql-master-pool-0 -n *clustername* -c mssql-server -- /bin/bash\r\n",
"\r\n",
"mkdir /tmp/aml\r\n",
"\r\n",
"cd /tmp/aml\r\n",
"\r\n",
"\\# **Modify** the following with your subscription_id, resource_group, and workspace_name: \r\n",
"cat > config.json << EOF \r\n",
"{ \r\n",
" \"subscription_id\": \"123456ab-78cd-0123-45ef-abcd12345678\", \r\n",
" \"resource_group\": \"myrg1\", \r\n",
" \"workspace_name\": \"myws1\" \r\n",
"} \r\n",
"EOF\r\n",
"\r\n",
"\\# The directory referenced below is appropriate for the master instance of SQL Server 2019 CTP 2.4.\r\n",
"\r\n",
"cd /opt/mssql/mlservices/runtime/python/bin\r\n",
"\r\n",
"./python -m pip install azureml-sdk[automl]\r\n",
"\r\n",
"./python -m pip install --upgrade numpy \r\n",
"\r\n",
"./python -m pip install --upgrade sklearn\r\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/sql-server/setup/auto-ml-sql-setup.png)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- Enable external scripts to allow invoking Python\r\n",
"sp_configure 'external scripts enabled',1 \r\n",
"reconfigure with override \r\n",
"GO\r\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- Use database 'automl'\r\n",
"USE [automl]\r\n",
"GO"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This is a table to hold the Azure ML connection information.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"\r\n",
"CREATE TABLE [dbo].[aml_connection](\r\n",
" [Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,\r\n",
"\t[ConnectionName] [nvarchar](255) NULL,\r\n",
"\t[TenantId] [nvarchar](255) NULL,\r\n",
"\t[AppId] [nvarchar](255) NULL,\r\n",
"\t[Password] [nvarchar](255) NULL,\r\n",
"\t[ConfigFile] [nvarchar](255) NULL\r\n",
") ON [PRIMARY]\r\n",
"GO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Copy the values from create-for-rbac above into the cell below"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- Use the following values:\r\n",
"-- Leave the name as 'Default'\r\n",
"-- Insert <tenant> returned by create-for-rbac above\r\n",
"-- Insert <AppId> returned by create-for-rbac above\r\n",
"-- Insert <password> used in create-for-rbac above\r\n",
"-- Leave <path> as '/tmp/aml/config.json'\r\n",
"INSERT INTO [dbo].[aml_connection] \r\n",
"VALUES (\r\n",
" N'Default', -- Name\r\n",
" N'11111111-2222-3333-4444-555555555555', -- Tenant\r\n",
" N'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee', -- AppId\r\n",
" N'insertpasswordhere', -- Password\r\n",
" N'/tmp/aml/config.json' -- Path\r\n",
" );\r\n",
"GO"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This is a table to hold the results from the AutoMLTrain procedure.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"\r\n",
"CREATE TABLE [dbo].[aml_model](\r\n",
" [Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,\r\n",
" [Model] [varchar](max) NOT NULL, -- The model, which can be passed to AutoMLPredict for testing or prediction.\r\n",
" [RunId] [nvarchar](250) NULL, -- The RunId, which can be used to view the model in the Azure Portal.\r\n",
" [CreatedDate] [datetime] NULL,\r\n",
" [ExperimentName] [nvarchar](100) NULL, -- Azure ML Experiment Name\r\n",
" [WorkspaceName] [nvarchar](100) NULL, -- Azure ML Workspace Name\r\n",
"\t[LogFileText] [nvarchar](max) NULL\r\n",
") \r\n",
"GO\r\n",
"\r\n",
"ALTER TABLE [dbo].[aml_model] ADD DEFAULT (getutcdate()) FOR [CreatedDate]\r\n",
"GO\r\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This stored procedure uses automated machine learning to train several models\r\n",
"-- and return the best model.\r\n",
"--\r\n",
"-- The result set has several columns:\r\n",
"-- best_run - ID of the best model found\r\n",
"-- experiment_name - training run name\r\n",
"-- fitted_model - best model found\r\n",
"-- log_file_text - console output\r\n",
"-- workspace - name of the Azure ML workspace where run history is stored\r\n",
"--\r\n",
"-- An example call for a classification problem is:\r\n",
"-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
"-- exec dbo.AutoMLTrain @input_query='\r\n",
"-- SELECT top 100000 \r\n",
"-- CAST([pickup_datetime] AS NVARCHAR(30)) AS pickup_datetime\r\n",
"-- ,CAST([dropoff_datetime] AS NVARCHAR(30)) AS dropoff_datetime\r\n",
"-- ,[passenger_count]\r\n",
"-- ,[trip_time_in_secs]\r\n",
"-- ,[trip_distance]\r\n",
"-- ,[payment_type]\r\n",
"-- ,[tip_class]\r\n",
"-- FROM [dbo].[nyctaxi_sample] order by [hack_license] ',\r\n",
"-- @label_column = 'tip_class',\r\n",
"-- @iterations=10\r\n",
"-- \r\n",
"-- An example call for forecasting is:\r\n",
"-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
"-- exec dbo.AutoMLTrain @input_query='\r\n",
"-- select cast(timeStamp as nvarchar(30)) as timeStamp,\r\n",
"-- demand,\r\n",
"-- \t precip,\r\n",
"-- \t temp,\r\n",
"-- case when timeStamp < ''2017-01-01'' then 0 else 1 end as is_validate_column\r\n",
"-- from nyc_energy\r\n",
"-- where demand is not null and precip is not null and temp is not null\r\n",
"-- and timeStamp < ''2017-02-01''',\r\n",
"-- @label_column='demand',\r\n",
"-- @task='forecasting',\r\n",
"-- @iterations=10,\r\n",
"-- @iteration_timeout_minutes=5,\r\n",
"-- @time_column_name='timeStamp',\r\n",
"-- @is_validate_column='is_validate_column',\r\n",
"-- @experiment_name='automl-sql-forecast',\r\n",
"-- @primary_metric='normalized_root_mean_squared_error'\r\n",
"\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLTrain]\r\n",
" (\r\n",
" @input_query NVARCHAR(MAX), -- The SQL Query that will return the data to train and validate the model.\r\n",
" @label_column NVARCHAR(255)='Label', -- The name of the column in the result of @input_query that is the label.\r\n",
" @primary_metric NVARCHAR(40)='AUC_weighted', -- The metric to optimize.\r\n",
" @iterations INT=100, -- The maximum number of pipelines to train.\r\n",
" @task NVARCHAR(40)='classification', -- The type of task. Can be classification, regression or forecasting.\r\n",
" @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.\r\n",
" @iteration_timeout_minutes INT = 15, -- The maximum time in minutes for training a single pipeline. \r\n",
" @experiment_timeout_minutes INT = 60, -- The maximum time in minutes for training all pipelines.\r\n",
" @n_cross_validations INT = 3, -- The number of cross validations.\r\n",
" @blacklist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that will not be used.\r\n",
" -- The list of possible models can be found at:\r\n",
" -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings\r\n",
" @whitelist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that can be used.\r\n",
" -- The list of possible models can be found at:\r\n",
" -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings\r\n",
" @experiment_exit_score FLOAT = 0, -- Stop the experiment if this score is acheived.\r\n",
" @sample_weight_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that gives a sample weight.\r\n",
" @is_validate_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that indicates if the row is for training or validation.\r\n",
"\t -- In the values of the column, 0 means for training and 1 means for validation.\r\n",
" @time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.\r\n",
"\t@connection_name NVARCHAR(255)='default' -- The AML connection to use.\r\n",
" ) AS\r\n",
"BEGIN\r\n",
"\r\n",
" DECLARE @tenantid NVARCHAR(255)\r\n",
" DECLARE @appid NVARCHAR(255)\r\n",
" DECLARE @password NVARCHAR(255)\r\n",
" DECLARE @config_file NVARCHAR(255)\r\n",
"\r\n",
"\tSELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile\r\n",
"\tFROM aml_connection\r\n",
"\tWHERE ConnectionName = @connection_name;\r\n",
"\r\n",
"\tEXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd\r\n",
"import logging \r\n",
"import azureml.core \r\n",
"import pandas as pd\r\n",
"import numpy as np\r\n",
"from azureml.core.experiment import Experiment \r\n",
"from azureml.train.automl import AutoMLConfig \r\n",
"from sklearn import datasets \r\n",
"import pickle\r\n",
"import codecs\r\n",
"from azureml.core.authentication import ServicePrincipalAuthentication \r\n",
"from azureml.core.workspace import Workspace \r\n",
"\r\n",
"if __name__.startswith(\"sqlindb\"):\r\n",
" auth = ServicePrincipalAuthentication(tenantid, appid, password) \r\n",
" \r\n",
" ws = Workspace.from_config(path=config_file, auth=auth) \r\n",
" \r\n",
" project_folder = \"./sample_projects/\" + experiment_name\r\n",
" \r\n",
" experiment = Experiment(ws, experiment_name) \r\n",
"\r\n",
" data_train = input_data\r\n",
" X_valid = None\r\n",
" y_valid = None\r\n",
" sample_weight_valid = None\r\n",
"\r\n",
" if is_validate_column != \"\" and is_validate_column is not None:\r\n",
" data_train = input_data[input_data[is_validate_column] <= 0]\r\n",
" data_valid = input_data[input_data[is_validate_column] > 0]\r\n",
" data_train.pop(is_validate_column)\r\n",
" data_valid.pop(is_validate_column)\r\n",
" y_valid = data_valid.pop(label_column).values\r\n",
" if sample_weight_column != \"\" and sample_weight_column is not None:\r\n",
" sample_weight_valid = data_valid.pop(sample_weight_column).values\r\n",
" X_valid = data_valid\r\n",
" n_cross_validations = None\r\n",
"\r\n",
" y_train = data_train.pop(label_column).values\r\n",
"\r\n",
" sample_weight = None\r\n",
" if sample_weight_column != \"\" and sample_weight_column is not None:\r\n",
" sample_weight = data_train.pop(sample_weight_column).values\r\n",
"\r\n",
" X_train = data_train\r\n",
"\r\n",
" if experiment_timeout_minutes == 0:\r\n",
" experiment_timeout_minutes = None\r\n",
"\r\n",
" if experiment_exit_score == 0:\r\n",
" experiment_exit_score = None\r\n",
"\r\n",
" if blacklist_models == \"\":\r\n",
" blacklist_models = None\r\n",
"\r\n",
" if blacklist_models is not None:\r\n",
" blacklist_models = blacklist_models.replace(\" \", \"\").split(\",\")\r\n",
"\r\n",
" if whitelist_models == \"\":\r\n",
" whitelist_models = None\r\n",
"\r\n",
" if whitelist_models is not None:\r\n",
" whitelist_models = whitelist_models.replace(\" \", \"\").split(\",\")\r\n",
"\r\n",
" automl_settings = {}\r\n",
" preprocess = True\r\n",
" if time_column_name != \"\" and time_column_name is not None:\r\n",
" automl_settings = { \"time_column_name\": time_column_name }\r\n",
" preprocess = False\r\n",
"\r\n",
" log_file_name = \"automl_errors.log\"\r\n",
"\t \r\n",
" automl_config = AutoMLConfig(task = task, \r\n",
" debug_log = log_file_name, \r\n",
" primary_metric = primary_metric, \r\n",
" iteration_timeout_minutes = iteration_timeout_minutes, \r\n",
" experiment_timeout_minutes = experiment_timeout_minutes,\r\n",
" iterations = iterations, \r\n",
" n_cross_validations = n_cross_validations, \r\n",
" preprocess = preprocess,\r\n",
" verbosity = logging.INFO, \r\n",
" X = X_train, \r\n",
" y = y_train, \r\n",
" path = project_folder,\r\n",
" blacklist_models = blacklist_models,\r\n",
" whitelist_models = whitelist_models,\r\n",
" experiment_exit_score = experiment_exit_score,\r\n",
" sample_weight = sample_weight,\r\n",
" X_valid = X_valid,\r\n",
" y_valid = y_valid,\r\n",
" sample_weight_valid = sample_weight_valid,\r\n",
" **automl_settings) \r\n",
" \r\n",
" local_run = experiment.submit(automl_config, show_output = True) \r\n",
"\r\n",
" best_run, fitted_model = local_run.get_output()\r\n",
"\r\n",
" pickled_model = codecs.encode(pickle.dumps(fitted_model), \"base64\").decode()\r\n",
"\r\n",
" log_file_text = \"\"\r\n",
"\r\n",
" try:\r\n",
" with open(log_file_name, \"r\") as log_file:\r\n",
" log_file_text = log_file.read()\r\n",
" except:\r\n",
" log_file_text = \"Log file not found\"\r\n",
"\r\n",
" returned_model = pd.DataFrame({\"best_run\": [best_run.id], \"experiment_name\": [experiment_name], \"fitted_model\": [pickled_model], \"log_file_text\": [log_file_text], \"workspace\": [ws.name]}, dtype=np.dtype(np.str))\r\n",
"'\r\n",
"\t, @input_data_1 = @input_query\r\n",
"\t, @input_data_1_name = N'input_data'\r\n",
"\t, @output_data_1_name = N'returned_model'\r\n",
"\t, @params = N'@label_column NVARCHAR(255), \r\n",
"\t @primary_metric NVARCHAR(40),\r\n",
"\t\t\t\t @iterations INT, @task NVARCHAR(40),\r\n",
"\t\t\t\t @experiment_name NVARCHAR(32),\r\n",
"\t\t\t\t @iteration_timeout_minutes INT,\r\n",
"\t\t\t\t @experiment_timeout_minutes INT,\r\n",
"\t\t\t\t @n_cross_validations INT,\r\n",
"\t\t\t\t @blacklist_models NVARCHAR(MAX),\r\n",
"\t\t\t\t @whitelist_models NVARCHAR(MAX),\r\n",
"\t\t\t\t @experiment_exit_score FLOAT,\r\n",
"\t\t\t\t @sample_weight_column NVARCHAR(255),\r\n",
"\t\t\t\t @is_validate_column NVARCHAR(255),\r\n",
"\t\t\t\t @time_column_name NVARCHAR(255),\r\n",
"\t\t\t\t @tenantid NVARCHAR(255),\r\n",
"\t\t\t\t @appid NVARCHAR(255),\r\n",
"\t\t\t\t @password NVARCHAR(255),\r\n",
"\t\t\t\t @config_file NVARCHAR(255)'\r\n",
"\t, @label_column = @label_column\r\n",
"\t, @primary_metric = @primary_metric\r\n",
"\t, @iterations = @iterations\r\n",
"\t, @task = @task\r\n",
"\t, @experiment_name = @experiment_name\r\n",
"\t, @iteration_timeout_minutes = @iteration_timeout_minutes\r\n",
"\t, @experiment_timeout_minutes = @experiment_timeout_minutes\r\n",
"\t, @n_cross_validations = @n_cross_validations\r\n",
"\t, @blacklist_models = @blacklist_models\r\n",
"\t, @whitelist_models = @whitelist_models\r\n",
"\t, @experiment_exit_score = @experiment_exit_score\r\n",
"\t, @sample_weight_column = @sample_weight_column\r\n",
"\t, @is_validate_column = @is_validate_column\r\n",
"\t, @time_column_name = @time_column_name\r\n",
"\t, @tenantid = @tenantid\r\n",
"\t, @appid = @appid\r\n",
"\t, @password = @password\r\n",
"\t, @config_file = @config_file\r\n",
"WITH RESULT SETS ((best_run NVARCHAR(250), experiment_name NVARCHAR(100), fitted_model VARCHAR(MAX), log_file_text NVARCHAR(MAX), workspace NVARCHAR(100)))\r\n",
"END"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This procedure returns a list of metrics for each iteration of a training run.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLGetMetrics]\r\n",
" (\r\n",
"\t@run_id NVARCHAR(250), -- The RunId\r\n",
" @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.\r\n",
" @connection_name NVARCHAR(255)='default' -- The AML connection to use.\r\n",
" ) AS\r\n",
"BEGIN\r\n",
" DECLARE @tenantid NVARCHAR(255)\r\n",
" DECLARE @appid NVARCHAR(255)\r\n",
" DECLARE @password NVARCHAR(255)\r\n",
" DECLARE @config_file NVARCHAR(255)\r\n",
"\r\n",
"\tSELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile\r\n",
"\tFROM aml_connection\r\n",
"\tWHERE ConnectionName = @connection_name;\r\n",
"\r\n",
" EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd\r\n",
"import logging \r\n",
"import azureml.core \r\n",
"import numpy as np\r\n",
"from azureml.core.experiment import Experiment \r\n",
"from azureml.train.automl.run import AutoMLRun\r\n",
"from azureml.core.authentication import ServicePrincipalAuthentication \r\n",
"from azureml.core.workspace import Workspace \r\n",
"\r\n",
"auth = ServicePrincipalAuthentication(tenantid, appid, password) \r\n",
" \r\n",
"ws = Workspace.from_config(path=config_file, auth=auth) \r\n",
" \r\n",
"experiment = Experiment(ws, experiment_name) \r\n",
"\r\n",
"ml_run = AutoMLRun(experiment = experiment, run_id = run_id)\r\n",
"\r\n",
"children = list(ml_run.get_children())\r\n",
"iterationlist = []\r\n",
"metricnamelist = []\r\n",
"metricvaluelist = []\r\n",
"\r\n",
"for run in children:\r\n",
" properties = run.get_properties()\r\n",
" if \"iteration\" in properties:\r\n",
" iteration = int(properties[\"iteration\"])\r\n",
" for metric_name, metric_value in run.get_metrics().items():\r\n",
" if isinstance(metric_value, float):\r\n",
" iterationlist.append(iteration)\r\n",
" metricnamelist.append(metric_name)\r\n",
" metricvaluelist.append(metric_value)\r\n",
" \r\n",
"metrics = pd.DataFrame({\"iteration\": iterationlist, \"metric_name\": metricnamelist, \"metric_value\": metricvaluelist})\r\n",
"'\r\n",
" , @output_data_1_name = N'metrics'\r\n",
"\t, @params = N'@run_id NVARCHAR(250), \r\n",
"\t\t\t\t @experiment_name NVARCHAR(32),\r\n",
" \t\t\t\t @tenantid NVARCHAR(255),\r\n",
"\t\t\t\t @appid NVARCHAR(255),\r\n",
"\t\t\t\t @password NVARCHAR(255),\r\n",
"\t\t\t\t @config_file NVARCHAR(255)'\r\n",
" , @run_id = @run_id\r\n",
"\t, @experiment_name = @experiment_name\r\n",
"\t, @tenantid = @tenantid\r\n",
"\t, @appid = @appid\r\n",
"\t, @password = @password\r\n",
"\t, @config_file = @config_file\r\n",
"WITH RESULT SETS ((iteration INT, metric_name NVARCHAR(100), metric_value FLOAT))\r\n",
"END"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This procedure predicts values based on a model returned by AutoMLTrain and a dataset.\r\n",
"-- It returns the dataset with a new column added, which is the predicted value.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLPredict]\r\n",
" (\r\n",
" @input_query NVARCHAR(MAX), -- A SQL query returning data to predict on.\r\n",
" @model NVARCHAR(MAX), -- A model returned from AutoMLTrain.\r\n",
" @label_column NVARCHAR(255)='' -- Optional name of the column from input_query, which should be ignored when predicting\r\n",
" ) AS \r\n",
"BEGIN \r\n",
" \r\n",
" EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd \r\n",
"import azureml.core \r\n",
"import numpy as np \r\n",
"from azureml.train.automl import AutoMLConfig \r\n",
"import pickle \r\n",
"import codecs \r\n",
" \r\n",
"model_obj = pickle.loads(codecs.decode(model.encode(), \"base64\")) \r\n",
" \r\n",
"test_data = input_data.copy() \r\n",
"\r\n",
"if label_column != \"\" and label_column is not None:\r\n",
" y_test = test_data.pop(label_column).values \r\n",
"X_test = test_data \r\n",
" \r\n",
"predicted = model_obj.predict(X_test) \r\n",
" \r\n",
"combined_output = input_data.assign(predicted=predicted)\r\n",
" \r\n",
"' \r\n",
" , @input_data_1 = @input_query \r\n",
" , @input_data_1_name = N'input_data' \r\n",
" , @output_data_1_name = N'combined_output' \r\n",
" , @params = N'@model NVARCHAR(MAX), @label_column NVARCHAR(255)' \r\n",
" , @model = @model \r\n",
"\t, @label_column = @label_column\r\n",
"END"
]
}
],
"metadata": {
"authors": [
{
"name": "jeffshep"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "sql",
"name": "python36"
},
"language_info": {
"name": "sql",
"version": ""
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -9,6 +9,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/subsampling/auto-ml-subsampling-local.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -0,0 +1,8 @@
name: auto-ml-subsampling-local
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -27,3 +27,7 @@ You can use Azure Databricks as a compute target from [Azure Machine Learning Pi
For more on SDK concepts, please refer to [notebooks](https://github.com/Azure/MachineLearningNotebooks).
**Please let us know your feedback.**
![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/README.png)

View File

@@ -11,6 +11,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/azure-databricks/amlsdk/build-model-run-history-03.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -333,6 +340,13 @@
"source": [ "source": [
"dbutils.notebook.exit(\"success\")" "dbutils.notebook.exit(\"success\")"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/build-model-run-history-03.png)"
]
}
],
"metadata": {

View File

@@ -11,6 +11,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/azure-databricks/amlsdk/deploy-to-aci-04.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -277,6 +284,13 @@
"#comment to not delete the web service\n", "#comment to not delete the web service\n",
"myservice.delete()" "myservice.delete()"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aci-04.png)"
]
}
],
"metadata": {

View File

@@ -11,6 +11,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/azure-databricks/amlsdk/deploy-to-aks-existingimage-05.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -203,6 +210,13 @@
"#model.delete()\n", "#model.delete()\n",
"aks_target.delete() " "aks_target.delete() "
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aks-existingimage-05.png)"
]
} }
], ],
"metadata": { "metadata": {

View File

@@ -11,6 +11,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/azure-databricks/amlsdk/ingest-data-02.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -139,6 +146,13 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.png)"
]
} }
], ],
"metadata": { "metadata": {

View File

@@ -11,6 +11,13 @@
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/azure-databricks/amlsdk/installation-and-configuration-01.png)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -143,6 +150,13 @@
" 'Subscription id: ' + ws.subscription_id, \n", " 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')" " 'Resource group: ' + ws.resource_group, sep = '\\n')"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/installation-and-configuration-01.png)"
]
} }
], ],
"metadata": { "metadata": {

View File

@@ -23,7 +23,8 @@
"3. Configure Automated ML using `AutoMLConfig`.\n", "3. Configure Automated ML using `AutoMLConfig`.\n",
"4. Train the model using Azure Databricks.\n", "4. Train the model using Azure Databricks.\n",
"5. Explore the results.\n", "5. Explore the results.\n",
"6. Test the best fitted model.\n", "6. Viewing the engineered names for featurized data and featurization summary for all raw features.\n",
"7. Test the best fitted model.\n",
"\n", "\n",
"Before running this notebook, please follow the <a href=\"https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/azure-databricks\" target=\"_blank\">readme for using Automated ML on Azure Databricks</a> for installing necessary libraries to your cluster." "Before running this notebook, please follow the <a href=\"https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/azure-databricks\" target=\"_blank\">readme for using Automated ML on Azure Databricks</a> for installing necessary libraries to your cluster."
] ]
@@ -313,25 +314,18 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Load Training Data Using DataPrep" "## Load Training Data Using Dataset"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Automated ML takes a Dataflow as input.\n", "Automated ML takes a `TabularDataset` as input.\n",
"\n", "\n",
"If you are familiar with Pandas and have done your data preparation work in Pandas already, you can use the `read_pandas_dataframe` method in dprep to convert the DataFrame to a Dataflow.\n", "You are free to use the data preparation libraries/tools of your choice to do the require preparation and once you are done, you can write it to a datastore and create a TabularDataset from it.\n",
"```python\n",
"df = pd.read_csv(...)\n",
"# apply some transforms\n",
"dprep.read_pandas_dataframe(df, temp_folder='/path/accessible/by/both/driver/and/worker')\n",
"```\n",
"\n", "\n",
"If you just need to ingest data without doing any preparation, you can directly use AzureML Data Prep (Data Prep) to do so. The code below demonstrates this scenario. Data Prep also has data preparation capabilities, we have many [sample notebooks](https://github.com/Microsoft/AMLDataPrepDocs) demonstrating the capabilities.\n", "You will get the datastore you registered previously and pass it to Dataset for reading. The data comes from the digits dataset: `sklearn.datasets.load_digits()`. `DataPath` points to a specific location within a datastore. "
"\n",
"You will get the datastore you registered previously and pass it to Data Prep for reading. The data comes from the digits dataset: `sklearn.datasets.load_digits()`. `DataPath` points to a specific location within a datastore. "
] ]
}, },
{ {
@@ -340,21 +334,21 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import azureml.dataprep as dprep\n", "from azureml.core.dataset import Dataset\n",
"from azureml.data.datapath import DataPath\n", "from azureml.data.datapath import DataPath\n",
"\n", "\n",
"datastore = Datastore.get(workspace = ws, datastore_name = datastore_name)\n", "datastore = Datastore.get(workspace = ws, datastore_name = datastore_name)\n",
"\n", "\n",
"X_train = dprep.read_csv(datastore.path('X.csv'))\n", "X_train = Dataset.Tabular.from_delimited_files(datastore.path('X.csv'))\n",
"y_train = dprep.read_csv(datastore.path('y.csv')).to_long(dprep.ColumnSelector(term='.*', use_regex = True))" "y_train = Dataset.Tabular.from_delimited_files(datastore.path('y.csv'))"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Review the Data Preparation Result\n", "## Review the TabularDataset\n",
"You can peek the result of a Dataflow at any range using `skip(i)` and `head(j)`. Doing so evaluates only j records for all the steps in the Dataflow, which makes it fast even against large datasets." "You can peek the result of a TabularDataset at any range using `skip(i)` and `take(j).to_pandas_dataframe()`. Doing so evaluates only j records for all the steps in the TabularDataset, which makes it fast even against large datasets."
] ]
}, },
{ {
@@ -363,7 +357,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"X_train.get_profile()" "X_train.take(5).to_pandas_dataframe()"
] ]
}, },
{ {
@@ -372,7 +366,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"y_train.get_profile()" "y_train.take(5).to_pandas_dataframe()"
] ]
}, },
{ {
@@ -556,6 +550,48 @@
"print(fitted_model)" "print(fitted_model)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the engineered names for featurized data\n",
"Below we display the engineered feature names generated for the featurized data using the preprocessing featurization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['datatransformer'].get_engineered_feature_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the featurization summary\n",
"Below we display the featurization that was performed on different raw features in the user data. For each raw feature in the user data, the following information is displayed:-\n",
"- Raw feature name\n",
"- Number of engineered features formed out of this raw feature\n",
"- Type detected\n",
"- If feature was dropped\n",
"- List of feature transformations for the raw feature"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the featurization summary as a list of JSON\n",
"featurization_summary = fitted_model.named_steps['datatransformer'].get_featurization_summary()\n",
"# View the featurization summary as a pandas dataframe\n",
"pd.DataFrame.from_records(featurization_summary)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -620,6 +656,13 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.png)"
]
} }
], ],
"metadata": { "metadata": {

View File

@@ -331,25 +331,18 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Load Training Data Using DataPrep" "## Load Training Data Using Dataset"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Automated ML takes a Dataflow as input.\n", "Automated ML takes a `TabularDataset` as input.\n",
"\n", "\n",
"If you are familiar with Pandas and have done your data preparation work in Pandas already, you can use the `read_pandas_dataframe` method in dprep to convert the DataFrame to a Dataflow.\n", "You are free to use the data preparation libraries/tools of your choice to do the require preparation and once you are done, you can write it to a datastore and create a TabularDataset from it.\n",
"```python\n",
"df = pd.read_csv(...)\n",
"# apply some transforms\n",
"dprep.read_pandas_dataframe(df, temp_folder='/path/accessible/by/both/driver/and/worker')\n",
"```\n",
"\n", "\n",
"If you just need to ingest data without doing any preparation, you can directly use AzureML Data Prep (Data Prep) to do so. The code below demonstrates this scenario. Data Prep also has data preparation capabilities, we have many [sample notebooks](https://github.com/Microsoft/AMLDataPrepDocs) demonstrating the capabilities.\n", "You will get the datastore you registered previously and pass it to Dataset for reading. The data comes from the digits dataset: `sklearn.datasets.load_digits()`. `DataPath` points to a specific location within a datastore. "
"\n",
"You will get the datastore you registered previously and pass it to Data Prep for reading. The data comes from the digits dataset: `sklearn.datasets.load_digits()`. `DataPath` points to a specific location within a datastore. "
] ]
}, },
{ {
@@ -358,21 +351,21 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import azureml.dataprep as dprep\n", "from azureml.core.dataset import Dataset\n",
"from azureml.data.datapath import DataPath\n", "from azureml.data.datapath import DataPath\n",
"\n", "\n",
"datastore = Datastore.get(workspace = ws, datastore_name = datastore_name)\n", "datastore = Datastore.get(workspace = ws, datastore_name = datastore_name)\n",
"\n", "\n",
"X_train = dprep.read_csv(datastore.path('X.csv'))\n", "X_train = Dataset.Tabular.from_delimited_files(datastore.path('X.csv'))\n",
"y_train = dprep.read_csv(datastore.path('y.csv')).to_long(dprep.ColumnSelector(term='.*', use_regex = True))" "y_train = Dataset.Tabular.from_delimited_files(datastore.path('y.csv'))"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Review the Data Preparation Result\n", "## Review the TabularDataset\n",
"You can peek the result of a Dataflow at any range using skip(i) and head(j). Doing so evaluates only j records for all the steps in the Dataflow, which makes it fast even against large datasets." "You can peek the result of a TabularDataset at any range using `skip(i)` and `take(j).to_pandas_dataframe()`. Doing so evaluates only j records for all the steps in the TabularDataset, which makes it fast even against large datasets."
] ]
}, },
{ {
@@ -381,7 +374,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"X_train.get_profile()" "X_train.take(5).to_pandas_dataframe()"
] ]
}, },
{ {
@@ -390,7 +383,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"y_train.get_profile()" "y_train.take(5).to_pandas_dataframe()"
] ]
}, },
{ {
@@ -796,6 +789,13 @@
"source": [ "source": [
"myservice.delete()" "myservice.delete()"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.png)"
]
} }
], ],
"metadata": { "metadata": {

View File

@@ -13,7 +13,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Using Databricks as a Compute Target from Azure Machine Learning Pipeline\n", "# Using Databricks as a Compute Target from Azure Machine Learning Pipeline\n",
"To use Databricks as a compute target from [Azure Machine Learning Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines), a [DatabricksStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.databricks_step.databricksstep?view=azure-ml-py) is used. This notebook demonstrates the use of DatabricksStep in Azure Machine Learning Pipeline.\n", "To use Databricks as a compute target from [Azure Machine Learning Pipeline](https://aka.ms/pl-concept), a [DatabricksStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.databricks_step.databricksstep?view=azure-ml-py) is used. This notebook demonstrates the use of DatabricksStep in Azure Machine Learning Pipeline.\n",
"\n", "\n",
"The notebook will show:\n", "The notebook will show:\n",
"1. Running an arbitrary Databricks notebook that the customer has in Databricks workspace\n", "1. Running an arbitrary Databricks notebook that the customer has in Databricks workspace\n",
@@ -675,7 +675,14 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Next: ADLA as a Compute Target\n", "# Next: ADLA as a Compute Target\n",
"To use ADLA as a compute target from Azure Machine Learning Pipeline, a AdlaStep is used. This [notebook](./aml-pipelines-use-adla-as-compute-target.ipynb) demonstrates the use of AdlaStep in Azure Machine Learning Pipeline." "To use ADLA as a compute target from Azure Machine Learning Pipeline, a AdlaStep is used. This [notebook](https://aka.ms/pl-adla) demonstrates the use of AdlaStep in Azure Machine Learning Pipeline."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/databricks-as-remote-compute-target/aml-pipelines-use-databricks-as-compute-target.png)"
] ]
} }
], ],

View File

@@ -0,0 +1,55 @@
**Azure HDInsight**
Azure HDInsight is a fully managed cloud Hadoop & Spark offering that gives you
optimized open-source analytic clusters for Spark, Hive, MapReduce, HBase,
Storm, and Kafka. HDInsight Spark clusters provide kernels that you can use with
the Jupyter notebook on [Apache Spark](https://spark.apache.org/) for testing
your applications.
How Azure HDInsight works with Azure Machine Learning service
- You can train a model using Spark clusters and deploy the model to ACI/AKS
from within Azure HDInsight.
- You can also use [automated machine
learning](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml) capabilities
integrated within Azure HDInsight.
You can use Azure HDInsight as a compute target from an [Azure Machine Learning
pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines).
**Set up your HDInsight cluster**
Create an [HDInsight
cluster](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-provision-linux-clusters)
**Quick create: Basic cluster setup**
This article walks you through setup in the [Azure
portal](https://portal.azure.com/), where you can create an HDInsight cluster
using *Quick create* or *Custom*.
![hdinsight create options custom quick create](media/0a235b34c0b881117e51dc31a232dbe1.png)
Follow the instructions on the screen to do a basic cluster setup. Details are
provided below for:
- [Resource group
name](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-provision-linux-clusters#resource-group-name)
- [Cluster types and
configuration](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-provision-linux-clusters#cluster-types)
(Cluster must be Spark 2.3 (HDI 3.6) or greater)
- Cluster login and SSH username
- [Location](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-provision-linux-clusters#location)
**Import the sample HDI notebook in Jupyter**
**Important links:**
Create HDI cluster:
<https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-provision-linux-clusters>
![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-hdi/README.png)

View File

@@ -0,0 +1,612 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-hdi/automl_hdi_local_classification.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated ML on Azure HDInsight\n",
"\n",
"In this example we use the scikit-learn's <a href=\"http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset\" target=\"_blank\">digit dataset</a> to showcase how you can use AutoML for a simple classification problem.\n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create Azure Machine Learning Workspace object and initialize your notebook directory to easily reload this object from a configuration file.\n",
"2. Create an `Experiment` in an existing `Workspace`.\n",
"3. Configure Automated ML using `AutoMLConfig`.\n",
"4. Train the model using Azure HDInsight.\n",
"5. Explore the results.\n",
"6. Test the best fitted model.\n",
"\n",
"Before running this notebook, please follow the readme for using Automated ML on Azure HDI for installing necessary libraries to your cluster."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check the Azure ML Core SDK Version to Validate Your Installation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"import pandas as pd\n",
"from azureml.core.authentication import ServicePrincipalAuthentication\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n",
"from azureml.train.automl.run import AutoMLRun\n",
"import logging\n",
"\n",
"print(\"SDK Version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize an Azure ML Workspace\n",
"### What is an Azure ML Workspace and Why Do I Need One?\n",
"\n",
"An Azure ML workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, operationalization, and the monitoring of operationalized models.\n",
"\n",
"\n",
"### What do I Need?\n",
"\n",
"To create or access an Azure ML workspace, you will need to import the Azure ML library and specify following information:\n",
"* A name for your workspace. You can choose one.\n",
"* Your subscription id. Use the `id` value from the `az account show` command output above.\n",
"* The resource group name. The resource group organizes Azure resources and provides a default region for the resources in the group. The resource group will be created if it doesn't exist. Resource groups can be created and viewed in the [Azure portal](https://portal.azure.com)\n",
"* Supported regions include `eastus2`, `eastus`,`westcentralus`, `southeastasia`, `westeurope`, `australiaeast`, `westus2`, `southcentralus`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"import pandas as pd\n",
"from azureml.core.authentication import ServicePrincipalAuthentication\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n",
"from azureml.train.automl.run import AutoMLRun\n",
"import logging\n",
"\n",
"subscription_id = \"<Your SubscriptionId>\" #you should be owner or contributor\n",
"resource_group = \"<Resource group - new or existing>\" #you should be owner or contributor\n",
"workspace_name = \"<workspace to be created>\" #your workspace name\n",
"workspace_region = \"<azureregion>\" #your region\n",
"\n",
"\n",
"tenant_id = \"<tenant_id>\"\n",
"app_id = \"<app_id>\"\n",
"app_key = \"<app_key>\"\n",
"\n",
"auth_sp = ServicePrincipalAuthentication(tenant_id = tenant_id,\n",
" service_principal_id = app_id,\n",
" service_principal_password = app_key)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating a Workspace\n",
"If you already have access to an Azure ML workspace you want to use, you can skip this cell. Otherwise, this cell will create an Azure ML workspace for you in the specified subscription, provided you have the correct permissions for the given `subscription_id`.\n",
"\n",
"This will fail when:\n",
"1. The workspace already exists.\n",
"2. You do not have permission to create a workspace in the resource group.\n",
"3. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription.\n",
"\n",
"If workspace creation fails for any reason other than already existing, please work with your IT administrator to provide you with the appropriate permissions or to provision the required resources.\n",
"\n",
"**Note:** Creation of a new workspace can take several minutes."
]
},
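{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the creation call described above, assuming the subscription, resource group, region, and service principal variables defined earlier (`exist_ok = True` reuses the workspace if it already exists):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: create (or, with exist_ok = True, reuse) the workspace described above.\n",
"ws = Workspace.create(name = workspace_name,\n",
"                      subscription_id = subscription_id,\n",
"                      resource_group = resource_group,\n",
"                      location = workspace_region,\n",
"                      create_resource_group = True,\n",
"                      exist_ok = True,\n",
"                      auth = auth_sp)"
]
},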
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configuring Your Local Environment\n",
"You can validate that you have access to the specified workspace and write a configuration file to the default configuration location, `./aml_config/config.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace(workspace_name = workspace_name,\n",
" subscription_id = subscription_id,\n",
" resource_group = resource_group,\n",
" auth = auth_sp)\n",
"\n",
"# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
"ws.write_config()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a Folder to Host Sample Projects\n",
"Finally, create a folder where all the sample projects will be hosted."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"sample_projects_folder = './sample_projects'\n",
"\n",
"if not os.path.isdir(sample_projects_folder):\n",
" os.mkdir(sample_projects_folder)\n",
" \n",
"print('Sample projects will be created in {}.'.format(sample_projects_folder))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an Experiment\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For Automated ML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import random\n",
"import time\n",
"\n",
"from matplotlib import pyplot as plt\n",
"from matplotlib.pyplot import imshow\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig\n",
"from azureml.train.automl.run import AutoMLRun"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Choose a name for the experiment and specify the project folder.\n",
"experiment_name = 'automl-local-classification-hdi'\n",
"project_folder = './sample_projects/automl-local-classification-hdi'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"pd.DataFrame(data = output, index = ['']).T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"\n",
"Opt-in diagnostics for better experience, quality, and security of future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"set_diagnostics_collection(send_diagnostics = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Registering Datastore"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Datastore is the way to save connection information to a storage service (e.g. Azure Blob, Azure Data Lake, Azure SQL) information to your workspace so you can access them without exposing credentials in your code. The first thing you will need to do is register a datastore, you can refer to our [python SDK documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.datastore.datastore?view=azure-ml-py) on how to register datastores. __Note: for best security practices, please do not check in code that contains registering datastores with secrets into your source control__\n",
"\n",
"The code below registers a datastore pointing to a publicly readable blob container."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Datastore\n",
"\n",
"datastore_name = 'demo_training'\n",
"container_name = 'digits' \n",
"account_name = 'automlpublicdatasets'\n",
"Datastore.register_azure_blob_container(\n",
" workspace = ws, \n",
" datastore_name = datastore_name, \n",
" container_name = container_name, \n",
" account_name = account_name,\n",
" overwrite = True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is an example on how to register a private blob container\n",
"```python\n",
"datastore = Datastore.register_azure_blob_container(\n",
" workspace = ws, \n",
" datastore_name = 'example_datastore', \n",
" container_name = 'example-container', \n",
" account_name = 'storageaccount',\n",
" account_key = 'accountkey'\n",
")\n",
"```\n",
"The example below shows how to register an Azure Data Lake store. Please make sure you have granted the necessary permissions for the service principal to access the data lake.\n",
"```python\n",
"datastore = Datastore.register_azure_data_lake(\n",
" workspace = ws,\n",
" datastore_name = 'example_datastore',\n",
" store_name = 'adlsstore',\n",
" tenant_id = 'tenant-id-of-service-principal',\n",
" client_id = 'client-id-of-service-principal',\n",
" client_secret = 'client-secret-of-service-principal'\n",
")\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Training Data Using DataPrep"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Automated ML takes a Dataflow as input.\n",
"\n",
"If you are familiar with Pandas and have done your data preparation work in Pandas already, you can use the `read_pandas_dataframe` method in dprep to convert the DataFrame to a Dataflow.\n",
"```python\n",
"df = pd.read_csv(...)\n",
"# apply some transforms\n",
"dprep.read_pandas_dataframe(df, temp_folder='/path/accessible/by/both/driver/and/worker')\n",
"```\n",
"\n",
"If you just need to ingest data without doing any preparation, you can directly use AzureML Data Prep (Data Prep) to do so. The code below demonstrates this scenario. Data Prep also has data preparation capabilities, we have many [sample notebooks](https://github.com/Microsoft/AMLDataPrepDocs) demonstrating the capabilities.\n",
"\n",
"You will get the datastore you registered previously and pass it to Data Prep for reading. The data comes from the digits dataset: `sklearn.datasets.load_digits()`. `DataPath` points to a specific location within a datastore. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.dataprep as dprep\n",
"from azureml.data.datapath import DataPath\n",
"\n",
"datastore = Datastore.get(workspace = ws, datastore_name = datastore_name)\n",
"\n",
"X_train = dprep.read_csv(datastore.path('X.csv'))\n",
"y_train = dprep.read_csv(datastore.path('y.csv')).to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Review the Data Preparation Result\n",
"You can peek the result of a Dataflow at any range using `skip(i)` and `head(j)`. Doing so evaluates only j records for all the steps in the Dataflow, which makes it fast even against large datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train.get_profile()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train.get_profile()"
]
},
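{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small sketch of the `skip(i)`/`head(j)` pattern mentioned above (the offset of 1 and count of 5 are arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Skip the first record, then materialize the next 5 rows as a pandas DataFrame.\n",
"X_train.skip(1).head(5)"
]
},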
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure AutoML\n",
"\n",
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics: <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**spark_context**|Spark Context object. for HDInsight, use spark_context=sc|\n",
"|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be <= number of worker nodes in your Azure HDInsight cluster.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]<br>Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"|**preprocess**|set this to True to enable pre-processing of data eg. string to numeric using one-hot encoding|\n",
"|**exit_score**|Target score for experiment. It is associated with the metric. eg. exit_score=0.995 will exit experiment after that|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" primary_metric = 'AUC_weighted',\n",
" iteration_timeout_minutes = 10,\n",
" iterations = 3,\n",
" preprocess = True,\n",
" n_cross_validations = 10,\n",
" max_concurrent_iterations = 2, #change it based on number of worker nodes\n",
" verbosity = logging.INFO,\n",
" spark_context=sc, #HDI /spark related\n",
" X = X_train, \n",
" y = y_train,\n",
" path = project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the Models\n",
"\n",
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"local_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explore the Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following will show the child runs and waits for the parent run to complete."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Retrieve All Child Runs after the experiment is completed (in portal)\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(local_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model after the above run is complete \n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = local_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric after the above run is complete based on the child run\n",
"Show the run and the model that has the smallest `log_loss` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"log_loss\"\n",
"best_run, fitted_model = local_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the Best Fitted Model\n",
"\n",
"#### Load Test Data - you can split the dataset beforehand & pass Train dataset to AutoML and use Test dataset to evaluate the best model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"blob_location = \"https://{}.blob.core.windows.net/{}\".format(account_name, container_name)\n",
"X_test = pd.read_csv(\"{}./X_valid.csv\".format(blob_location), header=0)\n",
"y_test = pd.read_csv(\"{}/y_valid.csv\".format(blob_location), header=0)\n",
"images = pd.read_csv(\"{}/images.csv\".format(blob_location), header=None)\n",
"images = np.reshape(images.values, (100,8,8))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Testing Our Best Fitted Model\n",
"We will try to predict digits and see how our model works. This is just an example to show you."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Randomly select digits and test.\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n",
" print(index)\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test.values[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(3, figsize = (5,5))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" display(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When deploying an automated ML trained model, please specify _pippackages=['azureml-sdk[automl]']_ in your CondaDependencies.\n",
"\n",
"Please refer to only the **Deploy** section in this notebook - <a href=\"https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/classification-with-deployment\" target=\"_blank\">Deployment of Automated ML trained model</a>"
]
}
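,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the dependency setup mentioned above, assuming the standard `CondaDependencies.create` call:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# Sketch: include the AutoML extras so a deployed image can load the fitted model.\n",
"cd = CondaDependencies.create(pip_packages = ['azureml-sdk[automl]'])\n",
"print(cd.serialize_to_string())"
]
}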
],
"metadata": {
"authors": [
{
"name": "savitam"
},
{
"name": "sasum"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "Python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "python",
"version": 3
},
"mimetype": "text/x-python",
"name": "pyspark3",
"pygments_lexer": "python3"
},
"name": "auto-ml-classification-local-adb",
"notebookId": 587284549713154
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@@ -0,0 +1,12 @@
# Model Deployment with Azure ML service
You can use Azure Machine Learning to package, debug, validate and deploy inference containers to a variety of compute targets. This process is known as "MLOps" (ML operationalization).
For more information, please check out this article: https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where
## Get Started
To begin, you will need an ML workspace.
For more information, please check out this article: https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace
## Deploy to the cloud
You can deploy to the cloud using the Azure ML CLI or the Azure ML SDK.
- CLI example: https://aka.ms/azmlcli
- Notebook example: [model-register-and-deploy](./model-register-and-deploy.ipynb).

View File

@@ -0,0 +1 @@
RUN echo "this is test"

View File

@@ -0,0 +1,339 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/deploy-to-cloud/model-register-and-deploy.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/deploy-to-cloud/model-register-and-deploy.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Register Model and deploy as Webservice\n",
"\n",
"This example shows how to deploy a Webservice in step-by-step fashion:\n",
"\n",
" 1. Register Model\n",
" 2. Deploy Model as Webservice"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the [configuration](../../../configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create workspace"
]
},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can add tags and descriptions to your Models. Note you need to have a `sklearn_regression_model.pkl` file in the current directory. This file is generated by the 01 notebook. The below call registers that file as a Model with the same name `sklearn_regression_model.pkl` in the workspace.\n",
"\n",
"Using tags, you can track useful information such as the name and version of the machine learning library used to train the model. Note that tags must be alphanumeric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"from azureml.core.model import Model\n",
"\n",
"model = Model.register(model_path=\"sklearn_regression_model.pkl\",\n",
" model_name=\"sklearn_regression_model.pkl\",\n",
" tags={'area': \"diabetes\", 'type': \"regression\"},\n",
" description=\"Ridge regression model to predict diabetes\",\n",
" workspace=ws)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can now create and/or use an Environment object when deploying a Webservice. The Environment can have been previously registered with your Workspace, or it will be registered with it as a part of the Webservice deployment. Only Environments that were created using azureml-defaults version 1.0.48 or later will work with this new handling however.\n",
"\n",
"More information can be found in our [using environments notebook](../training/using-environments/using-environments.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"env = Environment.from_conda_specification(name='deploytocloudenv', file_path='myenv.yml')\n",
"\n",
"# This is optional at this point\n",
"# env.register(workspace=ws)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Inference Configuration\n",
"\n",
"There is now support for a source directory, you can upload an entire folder from your local machine as dependencies for the Webservice.\n",
"Note: in that case, your entry_script, conda_file, and extra_docker_file_steps paths are relative paths to the source_directory path.\n",
"\n",
"Sample code for using a source directory:\n",
"\n",
"```python\n",
"inference_config = InferenceConfig(source_directory=\"C:/abc\",\n",
" runtime= \"python\", \n",
" entry_script=\"x/y/score.py\",\n",
" conda_file=\"env/myenv.yml\", \n",
" extra_docker_file_steps=\"helloworld.txt\")\n",
"```\n",
"\n",
" - source_directory = holds the source path as a string; this entire folder gets added to the image, so it is easy to access any files within this folder or its subfolders\n",
" - runtime = which runtime to use for the image. Currently supported runtimes are 'spark-py' and 'python'\n",
" - entry_script = contains the logic specific to initializing your model and running predictions\n",
" - conda_file = manages conda and python package dependencies\n",
" - extra_docker_file_steps = optional: any extra steps you want to inject into the Dockerfile"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create image"
]
},
"outputs": [],
"source": [
"from azureml.core.model import InferenceConfig\n",
"\n",
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=env)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy Model as Webservice on Azure Container Instance\n",
"\n",
"Note that the service creation can take few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice, Webservice\n",
"from azureml.exceptions import WebserviceException\n",
"\n",
"deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)\n",
"aci_service_name = 'aciservice1'\n",
"\n",
"try:\n",
" # if you want to get existing service below is the command\n",
" # since aci name needs to be unique in subscription deleting existing aci if any\n",
" # we use aci_service_name to create azure aci\n",
" service = Webservice(ws, name=aci_service_name)\n",
" if service:\n",
" service.delete()\n",
"except WebserviceException as e:\n",
" print()\n",
"\n",
"service = Model.deploy(ws, aci_service_name, [model], inference_config, deployment_config)\n",
"\n",
"service.wait_for_deployment(True)\n",
"print(service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Test web service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"test_sample = json.dumps({'data': [\n",
" [1,2,3,4,5,6,7,8,9,10], \n",
" [10,9,8,7,6,5,4,3,2,1]\n",
"]})\n",
"\n",
"test_sample_encoded = bytes(test_sample, encoding='utf8')\n",
"prediction = service.run(input_data=test_sample_encoded)\n",
"print(prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Delete ACI to clean up"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model Profiling\n",
"\n",
"You can also take advantage of the profiling feature to estimate CPU and memory requirements for models.\n",
"\n",
"```python\n",
"profile = Model.profile(ws, \"profilename\", [model], inference_config, test_sample)\n",
"profile.wait_for_profiling(True)\n",
"profiling_results = profile.get_results()\n",
"print(profiling_results)\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model Packaging\n",
"\n",
"If you want to build a Docker image that encapsulates your model and its dependencies, you can use the model packaging option. The output image will be pushed to your workspace's ACR.\n",
"\n",
"You must include an Environment object in your inference configuration to use `Model.package()`.\n",
"\n",
"```python\n",
"package = Model.package(ws, [model], inference_config)\n",
"package.wait_for_creation(show_output=True) # Or show_output=False to hide the Docker build logs.\n",
"package.pull()\n",
"```\n",
"\n",
"Instead of a fully-built image, you can also generate a Dockerfile and download all the assets needed to build an image on top of your Environment.\n",
"\n",
"```python\n",
"package = Model.package(ws, [model], inference_config, generate_dockerfile=True)\n",
"package.wait_for_creation(show_output=True)\n",
"package.save(\"./local_context_dir\")\n",
"```"
]
}
],
"metadata": {
"authors": [
{
"name": "aashishb"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Some files were not shown because too many files have changed in this diff.