Compare commits

...

42 Commits

Author SHA1 Message Date
vizhur
584ed9ae74 update samples - test 2019-06-25 01:46:55 +00:00
Roope Astala
4a6bcebccc Update configuration.ipynb 2019-06-21 09:35:13 -04:00
Roope Astala
56e0ebc5ac Merge pull request #438 from rastala/master
add pipeline scripts
2019-06-19 18:56:42 -04:00
rastala
2aa39f2f4a add pipeline scripts 2019-06-19 18:55:32 -04:00
Roope Astala
4d247c1877 Merge pull request #437 from rastala/master
pytorch with mlflow
2019-06-19 17:23:06 -04:00
rastala
f6682f6f6d pytorch with mlflow 2019-06-19 17:21:52 -04:00
Roope Astala
26ecf25233 Merge pull request #436 from rastala/master
Update readme
2019-06-19 11:52:23 -04:00
Roope Astala
44c3a486c0 update readme 2019-06-19 11:49:49 -04:00
Roope Astala
c574f429b8 update readme 2019-06-19 11:48:52 -04:00
Roope Astala
77d557a5dc Merge pull request #435 from ganzhi/jamgan/drift
Add demo notebook for AML Data Drift
2019-06-17 16:39:46 -04:00
James Gan
13dedec4a4 Make it in same folder as internal repo 2019-06-17 13:38:27 -07:00
James Gan
6f5c52676f Add notebook to demo data drift 2019-06-17 13:33:30 -07:00
James Gan
90c105537c Add demo notebook for AML Data Drift 2019-06-17 13:31:08 -07:00
Roope Astala
ef264b1073 Merge pull request #434 from rastala/master
update pytorch
2019-06-17 11:57:29 -04:00
Roope Astala
824ac5e021 update pytorch 2019-06-17 11:56:42 -04:00
Roope Astala
e9a7b95716 Merge pull request #421 from csteegz/csteegz-add-warning
Add warning for using prediction client on azure notebooks
2019-06-13 20:27:34 -04:00
Roope Astala
789ee26357 Merge pull request #431 from jeff-shepherd/master
Fixed path for auto-ml-remote-amlcompute notebook
2019-06-13 16:56:25 -04:00
Jeff Shepherd
fc541706e7 Fixed path for auto-ml-remote-amlcompute 2019-06-13 13:12:32 -07:00
Roope Astala
64b8aa2a55 Merge pull request #429 from jeff-shepherd/master
Removed deprecated notebooks from readme
2019-06-13 14:40:57 -04:00
Jeff Shepherd
d3dc35dbb6 Removed deprecated notebooks from readme 2019-06-13 11:03:25 -07:00
Roope Astala
b55ac368e7 Merge pull request #428 from rastala/master
update cluster creation
2019-06-13 12:16:30 -04:00
Roope Astala
de162316d7 update cluster creation 2019-06-13 12:14:58 -04:00
Roope Astala
4ecc58dfe2 Merge pull request #427 from rastala/master
dockerfile
2019-06-12 10:24:34 -04:00
Roope Astala
daf27a76e4 dockerfile 2019-06-12 10:23:34 -04:00
Roope Astala
a05444845b Merge pull request #426 from rastala/master
version 1.0.43
2019-06-12 10:09:08 -04:00
Roope Astala
79c9f50c15 version 1.0.43 2019-06-12 10:08:35 -04:00
Roope Astala
67e10e0f6b Merge pull request #417 from lan-tang/patch-1
Create readme.md in data-drift
2019-06-11 13:47:55 -04:00
Roope Astala
1ef0331a0f Merge pull request #423 from rastala/master
add sklearn estimator
2019-06-11 11:30:37 -04:00
Roope Astala
5e91c836b9 add sklearn estimator 2019-06-11 11:29:56 -04:00
Colin Versteeg
661762854a add warning to training 2019-06-10 16:51:33 -07:00
Colin Versteeg
fbc90ba74f add to quickstart 2019-06-10 16:50:59 -07:00
Colin Versteeg
0d9c83d0a8 Update accelerated-models-object-detection.ipynb 2019-06-10 16:48:17 -07:00
Colin Versteeg
ca4cab1de9 Merge pull request #1 from Azure/master
pull from master
2019-06-10 16:45:12 -07:00
Roope Astala
ddbb3c45f6 Merge pull request #420 from rastala/master
mlflow integration preview
2019-06-10 15:12:36 -04:00
rastala
8eed4e39d0 mlflow integration preview 2019-06-10 15:10:57 -04:00
Lan Tang
b37c0297db Create readme.md 2019-06-07 12:32:32 -07:00
Roope Astala
968cc798d0 Update README.md 2019-06-05 12:15:33 -04:00
Roope Astala
5c9ca452fb Create README.md 2019-06-05 12:15:19 -04:00
Shané Winner
5e82680272 Update README.md 2019-05-31 10:58:39 -07:00
Roope Astala
41841fc8c0 Update README.md 2019-05-31 13:00:41 -04:00
Roope Astala
896bf63736 Merge pull request #397 from rastala/master
dockerfile
2019-05-29 11:05:18 -04:00
Roope Astala
d4751bf6ec dockerfile 2019-05-29 11:04:19 -04:00
256 changed files with 45599 additions and 6954 deletions

View File

@@ -0,0 +1,29 @@
FROM continuumio/miniconda:4.5.11
# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y git
# create a new conda environment named azureml
RUN conda create -n azureml -y -q Python=3.6
# install additional packages used by sample notebooks. this is optional
RUN ["/bin/bash", "-c", "source activate azureml && conda install -y tqdm cython matplotlib scikit-learn"]
# install azurmel-sdk components
RUN ["/bin/bash", "-c", "source activate azureml && pip install azureml-sdk[notebooks]==1.0.41"]
# clone Azure ML GitHub sample notebooks
RUN cd /home && git clone -b "azureml-sdk-1.0.41" --single-branch https://github.com/Azure/MachineLearningNotebooks.git
# generate jupyter configuration file
RUN ["/bin/bash", "-c", "source activate azureml && mkdir ~/.jupyter && cd ~/.jupyter && jupyter notebook --generate-config"]
# set an emtpy token for Jupyter to remove authentication.
# this is NOT recommended for production environment
RUN echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
# open up port 8887 on the container
EXPOSE 8887
# start Jupyter notebook server on port 8887 when the container starts
CMD /bin/bash -c "cd /home/MachineLearningNotebooks && source activate azureml && jupyter notebook --port 8887 --no-browser --ip 0.0.0.0 --allow-root"

View File

@@ -0,0 +1,29 @@
FROM continuumio/miniconda:4.5.11
# install git
RUN apt-get update && apt-get upgrade -y && apt-get install -y git
# create a new conda environment named azureml
RUN conda create -n azureml -y -q Python=3.6
# install additional packages used by sample notebooks. this is optional
RUN ["/bin/bash", "-c", "source activate azureml && conda install -y tqdm cython matplotlib scikit-learn"]
# install azurmel-sdk components
RUN ["/bin/bash", "-c", "source activate azureml && pip install azureml-sdk[notebooks]==1.0.43"]
# clone Azure ML GitHub sample notebooks
RUN cd /home && git clone -b "azureml-sdk-1.0.43" --single-branch https://github.com/Azure/MachineLearningNotebooks.git
# generate jupyter configuration file
RUN ["/bin/bash", "-c", "source activate azureml && mkdir ~/.jupyter && cd ~/.jupyter && jupyter notebook --generate-config"]
# set an emtpy token for Jupyter to remove authentication.
# this is NOT recommended for production environment
RUN echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
# open up port 8887 on the container
EXPOSE 8887
# start Jupyter notebook server on port 8887 when the container starts
CMD /bin/bash -c "cd /home/MachineLearningNotebooks && source activate azureml && jupyter notebook --port 8887 --no-browser --ip 0.0.0.0 --allow-root"

View File

@@ -60,16 +60,6 @@ This repository collects usage data and sends it to Mircosoft to help improve ou
To opt out of tracking, please go to the raw markdown or .ipynb files and remove the following line of code: To opt out of tracking, please go to the raw markdown or .ipynb files and remove the following line of code:
```sh
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/README.png)"
```
This URL will be slightly different depending on the file.
## Data/Telemetry
This repository collects usage data and sends it to Mircosoft to help improve our products and services. Read Microsoft's [privacy statement to learn more](https://privacy.microsoft.com/en-US/privacystatement)
To opt out of tracking, please go to the raw markdown or .ipynb files and remove the following line of code:
```sh ```sh
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/README.png)" "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/README.png)"
``` ```

View File

@@ -1,293 +1,383 @@
{ {
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n", "Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n", "\n",
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/configuration.png)" "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/configuration.png)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Configuration\n", "# Configuration\n",
"\n", "\n",
"_**Setting up your Azure Machine Learning services workspace and configuring your notebook library**_\n", "_**Setting up your Azure Machine Learning services workspace and configuring your notebook library**_\n",
"\n", "\n",
"---\n", "---\n",
"---\n", "---\n",
"\n", "\n",
"## Table of Contents\n", "## Table of Contents\n",
"\n", "\n",
"1. [Introduction](#Introduction)\n", "1. [Introduction](#Introduction)\n",
" 1. What is an Azure Machine Learning workspace\n", " 1. What is an Azure Machine Learning workspace\n",
"1. [Setup](#Setup)\n", "1. [Setup](#Setup)\n",
" 1. Azure subscription\n", " 1. Azure subscription\n",
" 1. Azure ML SDK and other library installation\n", " 1. Azure ML SDK and other library installation\n",
" 1. Azure Container Instance registration\n", " 1. Azure Container Instance registration\n",
"1. [Configure your Azure ML Workspace](#Configure%20your%20Azure%20ML%20workspace)\n", "1. [Configure your Azure ML Workspace](#Configure%20your%20Azure%20ML%20workspace)\n",
" 1. Workspace parameters\n", " 1. Workspace parameters\n",
" 1. Access your workspace\n", " 1. Access your workspace\n",
" 1. Create a new workspace\n", " 1. Create a new workspace\n",
"1. [Next steps](#Next%20steps)\n", " 1. Create compute resources\n",
"\n", "1. [Next steps](#Next%20steps)\n",
"---\n", "\n",
"\n", "---\n",
"## Introduction\n", "\n",
"\n", "## Introduction\n",
"This notebook configures your library of notebooks to connect to an Azure Machine Learning (ML) workspace. In this case, a library contains all of the notebooks in the current folder and any nested folders. You can configure this notebook library to use an existing workspace or create a new workspace.\n", "\n",
"\n", "This notebook configures your library of notebooks to connect to an Azure Machine Learning (ML) workspace. In this case, a library contains all of the notebooks in the current folder and any nested folders. You can configure this notebook library to use an existing workspace or create a new workspace.\n",
"Typically you will need to run this notebook only once per notebook library as all other notebooks will use connection information that is written here. If you want to redirect your notebook library to work with a different workspace, then you should re-run this notebook.\n", "\n",
"\n", "Typically you will need to run this notebook only once per notebook library as all other notebooks will use connection information that is written here. If you want to redirect your notebook library to work with a different workspace, then you should re-run this notebook.\n",
"In this notebook you will\n", "\n",
"* Learn about getting an Azure subscription\n", "In this notebook you will\n",
"* Specify your workspace parameters\n", "* Learn about getting an Azure subscription\n",
"* Access or create your workspace\n", "* Specify your workspace parameters\n",
"* Add a default compute cluster for your workspace\n", "* Access or create your workspace\n",
"\n", "* Add a default compute cluster for your workspace\n",
"### What is an Azure Machine Learning workspace\n", "\n",
"\n", "### What is an Azure Machine Learning workspace\n",
"An Azure ML Workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inferencing, and the monitoring of deployed models." "\n",
] "An Azure ML Workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inferencing, and the monitoring of deployed models."
}, ]
{ },
"cell_type": "markdown", {
"metadata": {}, "cell_type": "markdown",
"source": [ "metadata": {},
"## Setup\n", "source": [
"\n", "## Setup\n",
"This section describes activities required before you can access any Azure ML services functionality." "\n",
] "This section describes activities required before you can access any Azure ML services functionality."
}, ]
{ },
"cell_type": "markdown", {
"metadata": {}, "cell_type": "markdown",
"source": [ "metadata": {},
"### 1. Azure Subscription\n", "source": [
"\n", "### 1. Azure Subscription\n",
"In order to create an Azure ML Workspace, first you need access to an Azure subscription. An Azure subscription allows you to manage storage, compute, and other assets in the Azure cloud. You can [create a new subscription](https://azure.microsoft.com/en-us/free/) or access existing subscription information from the [Azure portal](https://portal.azure.com). Later in this notebook you will need information such as your subscription ID in order to create and access AML workspaces.\n", "\n",
"\n", "In order to create an Azure ML Workspace, first you need access to an Azure subscription. An Azure subscription allows you to manage storage, compute, and other assets in the Azure cloud. You can [create a new subscription](https://azure.microsoft.com/en-us/free/) or access existing subscription information from the [Azure portal](https://portal.azure.com). Later in this notebook you will need information such as your subscription ID in order to create and access AML workspaces.\n",
"### 2. Azure ML SDK and other library installation\n", "\n",
"\n", "### 2. Azure ML SDK and other library installation\n",
"If you are running in your own environment, follow [SDK installation instructions](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-environment). If you are running in Azure Notebooks or another Microsoft managed environment, the SDK is already installed.\n", "\n",
"\n", "If you are running in your own environment, follow [SDK installation instructions](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-environment). If you are running in Azure Notebooks or another Microsoft managed environment, the SDK is already installed.\n",
"Also install following libraries to your environment. Many of the example notebooks depend on them\n", "\n",
"\n", "Also install following libraries to your environment. Many of the example notebooks depend on them\n",
"```\n", "\n",
"(myenv) $ conda install -y matplotlib tqdm scikit-learn\n", "```\n",
"```\n", "(myenv) $ conda install -y matplotlib tqdm scikit-learn\n",
"\n", "```\n",
"Once installation is complete, the following cell checks the Azure ML SDK version:" "\n",
] "Once installation is complete, the following cell checks the Azure ML SDK version:"
}, ]
{ },
"cell_type": "code", {
"execution_count": null, "cell_type": "code",
"metadata": { "execution_count": null,
"tags": [ "metadata": {
"install" "tags": [
] "install"
}, ]
"outputs": [], },
"source": [ "outputs": [],
"import azureml.core\n", "source": [
"\n", "import azureml.core\n",
"print(\"This notebook was created using version 1.0.41 of the Azure ML SDK\")\n", "\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" "print(\"This notebook was created using version of the Azure ML SDK\")\n",
] "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
}, ]
{ },
"cell_type": "markdown", {
"metadata": {}, "cell_type": "markdown",
"source": [ "metadata": {},
"If you are using an older version of the SDK then this notebook was created using, you should upgrade your SDK.\n", "source": [
"\n", "If you are using an older version of the SDK then this notebook was created using, you should upgrade your SDK.\n",
"### 3. Azure Container Instance registration\n", "\n",
"Azure Machine Learning uses of [Azure Container Instance (ACI)](https://azure.microsoft.com/services/container-instances) to deploy dev/test web services. An Azure subscription needs to be registered to use ACI. If you or the subscription owner have not yet registered ACI on your subscription, you will need to use the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) and execute the following commands. Note that if you ran through the AML [quickstart](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-get-started) you have already registered ACI. \n", "### 3. Azure Container Instance registration\n",
"\n", "Azure Machine Learning uses of [Azure Container Instance (ACI)](https://azure.microsoft.com/services/container-instances) to deploy dev/test web services. An Azure subscription needs to be registered to use ACI. If you or the subscription owner have not yet registered ACI on your subscription, you will need to use the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) and execute the following commands. Note that if you ran through the AML [quickstart](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-get-started) you have already registered ACI. \n",
"```shell\n", "\n",
"# check to see if ACI is already registered\n", "```shell\n",
"(myenv) $ az provider show -n Microsoft.ContainerInstance -o table\n", "# check to see if ACI is already registered\n",
"\n", "(myenv) $ az provider show -n Microsoft.ContainerInstance -o table\n",
"# if ACI is not registered, run this command.\n", "\n",
"# note you need to be the subscription owner in order to execute this command successfully.\n", "# if ACI is not registered, run this command.\n",
"(myenv) $ az provider register -n Microsoft.ContainerInstance\n", "# note you need to be the subscription owner in order to execute this command successfully.\n",
"```\n", "(myenv) $ az provider register -n Microsoft.ContainerInstance\n",
"\n", "```\n",
"---" "\n",
] "---"
}, ]
{ },
"cell_type": "markdown", {
"metadata": {}, "cell_type": "markdown",
"source": [ "metadata": {},
"## Configure your Azure ML workspace\n", "source": [
"\n", "## Configure your Azure ML workspace\n",
"### Workspace parameters\n", "\n",
"\n", "### Workspace parameters\n",
"To use an AML Workspace, you will need to import the Azure ML SDK and supply the following information:\n", "\n",
"* Your subscription id\n", "To use an AML Workspace, you will need to import the Azure ML SDK and supply the following information:\n",
"* A resource group name\n", "* Your subscription id\n",
"* (optional) The region that will host your workspace\n", "* A resource group name\n",
"* A name for your workspace\n", "* (optional) The region that will host your workspace\n",
"\n", "* A name for your workspace\n",
"You can get your subscription ID from the [Azure portal](https://portal.azure.com).\n", "\n",
"\n", "You can get your subscription ID from the [Azure portal](https://portal.azure.com).\n",
"You will also need access to a [_resource group_](https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-overview#resource-groups), which organizes Azure resources and provides a default region for the resources in a group. You can see what resource groups to which you have access, or create a new one in the [Azure portal](https://portal.azure.com). If you don't have a resource group, the create workspace command will create one for you using the name you provide.\n", "\n",
"\n", "You will also need access to a [_resource group_](https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-overview#resource-groups), which organizes Azure resources and provides a default region for the resources in a group. You can see what resource groups to which you have access, or create a new one in the [Azure portal](https://portal.azure.com). If you don't have a resource group, the create workspace command will create one for you using the name you provide.\n",
"The region to host your workspace will be used if you are creating a new workspace. You do not need to specify this if you are using an existing workspace. You can find the list of supported regions [here](https://azure.microsoft.com/en-us/global-infrastructure/services/?products=machine-learning-service). You should pick a region that is close to your location or that contains your data.\n", "\n",
"\n", "The region to host your workspace will be used if you are creating a new workspace. You do not need to specify this if you are using an existing workspace. You can find the list of supported regions [here](https://azure.microsoft.com/en-us/global-infrastructure/services/?products=machine-learning-service). You should pick a region that is close to your location or that contains your data.\n",
"The name for your workspace is unique within the subscription and should be descriptive enough to discern among other AML Workspaces. The subscription may be used only by you, or it may be used by your department or your entire enterprise, so choose a name that makes sense for your situation.\n", "\n",
"\n", "The name for your workspace is unique within the subscription and should be descriptive enough to discern among other AML Workspaces. The subscription may be used only by you, or it may be used by your department or your entire enterprise, so choose a name that makes sense for your situation.\n",
"The following cell allows you to specify your workspace parameters. This cell uses the python method `os.getenv` to read values from environment variables which is useful for automation. If no environment variable exists, the parameters will be set to the specified default values. \n", "\n",
"\n", "The following cell allows you to specify your workspace parameters. This cell uses the python method `os.getenv` to read values from environment variables which is useful for automation. If no environment variable exists, the parameters will be set to the specified default values. \n",
"If you ran the Azure Machine Learning [quickstart](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-get-started) in Azure Notebooks, you already have a configured workspace! You can go to your Azure Machine Learning Getting Started library, view *config.json* file, and copy-paste the values for subscription ID, resource group and workspace name below.\n", "\n",
"\n", "If you ran the Azure Machine Learning [quickstart](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-get-started) in Azure Notebooks, you already have a configured workspace! You can go to your Azure Machine Learning Getting Started library, view *config.json* file, and copy-paste the values for subscription ID, resource group and workspace name below.\n",
"Replace the default values in the cell below with your workspace parameters" "\n",
] "Replace the default values in the cell below with your workspace parameters"
}, ]
{ },
"cell_type": "code", {
"execution_count": null, "cell_type": "code",
"metadata": {}, "execution_count": null,
"outputs": [], "metadata": {},
"source": [ "outputs": [],
"import os\n", "source": [
"\n", "import os\n",
"subscription_id = os.getenv(\"SUBSCRIPTION_ID\", default=\"<my-subscription-id>\")\n", "\n",
"resource_group = os.getenv(\"RESOURCE_GROUP\", default=\"<my-resource-group>\")\n", "subscription_id = os.getenv(\"SUBSCRIPTION_ID\", default=\"<my-subscription-id>\")\n",
"workspace_name = os.getenv(\"WORKSPACE_NAME\", default=\"<my-workspace-name>\")\n", "resource_group = os.getenv(\"RESOURCE_GROUP\", default=\"<my-resource-group>\")\n",
"workspace_region = os.getenv(\"WORKSPACE_REGION\", default=\"eastus2\")" "workspace_name = os.getenv(\"WORKSPACE_NAME\", default=\"<my-workspace-name>\")\n",
] "workspace_region = os.getenv(\"WORKSPACE_REGION\", default=\"eastus2\")"
}, ]
{ },
"cell_type": "markdown", {
"metadata": {}, "cell_type": "markdown",
"source": [ "metadata": {},
"### Access your workspace\n", "source": [
"\n", "### Access your workspace\n",
"The following cell uses the Azure ML SDK to attempt to load the workspace specified by your parameters. If this cell succeeds, your notebook library will be configured to access the workspace from all notebooks using the `Workspace.from_config()` method. The cell can fail if the specified workspace doesn't exist or you don't have permissions to access it. " "\n",
] "The following cell uses the Azure ML SDK to attempt to load the workspace specified by your parameters. If this cell succeeds, your notebook library will be configured to access the workspace from all notebooks using the `Workspace.from_config()` method. The cell can fail if the specified workspace doesn't exist or you don't have permissions to access it. "
}, ]
{ },
"cell_type": "code", {
"execution_count": null, "cell_type": "code",
"metadata": {}, "execution_count": null,
"outputs": [], "metadata": {},
"source": [ "outputs": [],
"from azureml.core import Workspace\n", "source": [
"\n", "from azureml.core import Workspace\n",
"try:\n", "\n",
" ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)\n", "try:\n",
" # write the details of the workspace to a configuration file to the notebook library\n", " ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)\n",
" ws.write_config()\n", " # write the details of the workspace to a configuration file to the notebook library\n",
" print(\"Workspace configuration succeeded. Skip the workspace creation steps below\")\n", " ws.write_config()\n",
"except:\n", " print(\"Workspace configuration succeeded. Skip the workspace creation steps below\")\n",
" print(\"Workspace not accessible. Change your parameters or create a new workspace below\")" "except:\n",
] " print(\"Workspace not accessible. Change your parameters or create a new workspace below\")"
}, ]
{ },
"cell_type": "markdown", {
"metadata": {}, "cell_type": "markdown",
"source": [ "metadata": {},
"### Create a new workspace\n", "source": [
"\n", "### Create a new workspace\n",
"If you don't have an existing workspace and are the owner of the subscription or resource group, you can create a new workspace. If you don't have a resource group, the create workspace command will create one for you using the name you provide.\n", "\n",
"\n", "If you don't have an existing workspace and are the owner of the subscription or resource group, you can create a new workspace. If you don't have a resource group, the create workspace command will create one for you using the name you provide.\n",
"**Note**: The Workspace creation command will create default CPU and GPU compute clusters for you. As with other Azure services, there are limits on certain resources (for example AmlCompute quota) associated with the Azure ML service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota.\n", "\n",
"\n", "**Note**: As with other Azure services, there are limits on certain resources (for example AmlCompute quota) associated with the Azure ML service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota.\n",
"This cell will create an Azure ML workspace for you in a subscription provided you have the correct permissions.\n", "\n",
"\n", "This cell will create an Azure ML workspace for you in a subscription provided you have the correct permissions.\n",
"This will fail if:\n", "\n",
"* You do not have permission to create a workspace in the resource group\n", "This will fail if:\n",
"* You do not have permission to create a resource group if it's non-existing.\n", "* You do not have permission to create a workspace in the resource group\n",
"* You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription\n", "* You do not have permission to create a resource group if it's non-existing.\n",
"\n", "* You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription\n",
"If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to provision the required resources." "\n",
] "If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to provision the required resources."
}, ]
{ },
"cell_type": "code", {
"execution_count": null, "cell_type": "code",
"metadata": { "execution_count": null,
"tags": [ "metadata": {
"create workspace" "tags": [
] "create workspace"
}, ]
"outputs": [], },
"source": [ "outputs": [],
"from azureml.core import Workspace\n", "source": [
"\n", "from azureml.core import Workspace\n",
"# Create the workspace using the specified parameters\n", "\n",
"ws = Workspace.create(name = workspace_name,\n", "# Create the workspace using the specified parameters\n",
" subscription_id = subscription_id,\n", "ws = Workspace.create(name = workspace_name,\n",
" resource_group = resource_group, \n", " subscription_id = subscription_id,\n",
" location = workspace_region,\n", " resource_group = resource_group, \n",
" default_cpu_compute_target=Workspace.DEFAULT_CPU_CLUSTER_CONFIGURATION,\n", " location = workspace_region,\n",
" default_gpu_compute_target=Workspace.DEFAULT_GPU_CLUSTER_CONFIGURATION,\n", " create_resource_group = True,\n",
" create_resource_group = True,\n", " exist_ok = True)\n",
" exist_ok = True)\n", "ws.get_details()\n",
"ws.get_details()\n", "\n",
"\n", "# write the details of the workspace to a configuration file to the notebook library\n",
"# write the details of the workspace to a configuration file to the notebook library\n", "ws.write_config()"
"ws.write_config()" ]
] },
}, {
{ "cell_type": "markdown",
"cell_type": "markdown", "metadata": {},
"metadata": {}, "source": [
"source": [ "### Create compute resources for your training experiments\n",
"---\n", "\n",
"\n", "Many of the sample notebooks use Azure ML managed compute (AmlCompute) to train models using a dynamically scalable pool of compute. In this section you will create default compute clusters for use by the other notebooks and any other operations you choose.\n",
"## Next steps\n", "\n",
"\n", "To create a cluster, you need to specify a compute configuration that specifies the type of machine to be used and the scalability behaviors. Then you choose a name for the cluster that is unique within the workspace that can be used to address the cluster later.\n",
"In this notebook you configured this notebook library to connect easily to an Azure ML workspace. You can copy this notebook to your own libraries to connect them to you workspace, or use it to bootstrap new workspaces completely.\n", "\n",
"\n", "The cluster parameters are:\n",
"If you came here from another notebook, you can return there and complete that exercise, or you can try out the [Tutorials](./tutorials) or jump into \"how-to\" notebooks and start creating and deploying models. A good place to start is the [train within notebook](./how-to-use-azureml/training/train-within-notebook) example that walks through a simplified but complete end to end machine learning process." "* vm_size - this describes the virtual machine type and size used in the cluster. All machines in the cluster are the same type. You can get the list of vm sizes available in your region by using the CLI command\n",
] "\n",
}, "```shell\n",
{ "az vm list-skus -o tsv\n",
"cell_type": "code", "```\n",
"execution_count": null, "* min_nodes - this sets the minimum size of the cluster. If you set the minimum to 0 the cluster will shut down all nodes while not in use. Setting this number to a value higher than 0 will allow for faster start-up times, but you will also be billed when the cluster is not in use.\n",
"metadata": {}, "* max_nodes - this sets the maximum size of the cluster. Setting this to a larger number allows for more concurrency and a greater distributed processing of scale-out jobs.\n",
"outputs": [], "\n",
"source": [] "\n",
} "To create a **CPU** cluster now, run the cell below. The autoscale settings mean that the cluster will scale down to 0 nodes when inactive and up to 4 nodes when busy."
], ]
"metadata": { },
"authors": [ {
{ "cell_type": "code",
"name": "roastala" "execution_count": null,
} "metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# Choose a name for your CPU cluster\n",
"cpu_cluster_name = \"cpu-cluster\"\n",
"\n",
"# Verify that cluster does not exist already\n",
"try:\n",
" cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
" print(\"Found existing cpu-cluster\")\n",
"except ComputeTargetException:\n",
" print(\"Creating new cpu-cluster\")\n",
" \n",
" # Specify the configuration for the new cluster\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_D2_V2\",\n",
" min_nodes=0,\n",
" max_nodes=4)\n",
"\n",
" # Create the cluster with the specified name and configuration\n",
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
" \n",
" # Wait for the cluster to complete, show the output log\n",
" cpu_cluster.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To create a **GPU** cluster, run the cell below. Note that your subscription must have sufficient quota for GPU VMs or the command will fail. To increase quota, see [these instructions](https://docs.microsoft.com/en-us/azure/azure-supportability/resource-manager-core-quotas-request). "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# Choose a name for your GPU cluster\n",
"gpu_cluster_name = \"gpu-cluster\"\n",
"\n",
"# Verify that cluster does not exist already\n",
"try:\n",
" gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
" print(\"Found existing gpu cluster\")\n",
"except ComputeTargetException:\n",
" print(\"Creating new gpu-cluster\")\n",
" \n",
" # Specify the configuration for the new cluster\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",\n",
" min_nodes=0,\n",
" max_nodes=4)\n",
" # Create the cluster with the specified name and configuration\n",
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
"\n",
" # Wait for the cluster to complete, show the output log\n",
" gpu_cluster.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"\n",
"## Next steps\n",
"\n",
"In this notebook you configured this notebook library to connect easily to an Azure ML workspace. You can copy this notebook to your own libraries to connect them to you workspace, or use it to bootstrap new workspaces completely.\n",
"\n",
"If you came here from another notebook, you can return there and complete that exercise, or you can try out the [Tutorials](./tutorials) or jump into \"how-to\" notebooks and start creating and deploying models. A good place to start is the [train within notebook](./how-to-use-azureml/training/train-within-notebook) example that walks through a simplified but complete end to end machine learning process."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
], ],
"kernelspec": { "metadata": {
"display_name": "Python 3.6", "authors": [
"language": "python", {
"name": "python36" "name": "roastala"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
}, },
"language_info": { "nbformat": 4,
"codemirror_mode": { "nbformat_minor": 2
"name": "ipython", }
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

4
configuration.yml Normal file
View File

@@ -0,0 +1,4 @@
name: configuration
dependencies:
- pip:
- azureml-sdk

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,709 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Track Data Drift between Training and Inference Data in Production \n",
"\n",
"With this notebook, you will learn how to enable the DataDrift service to automatically track and determine whether your inference data is drifting from the data your model was initially trained on. The DataDrift service provides metrics and visualizations to help stakeholders identify which specific features cause the concept drift to occur.\n",
"\n",
"Please email driftfeedback@microsoft.com with any issues. A member from the DataDrift team will respond shortly. \n",
"\n",
"The DataDrift Public Preview API can be found [here](https://docs.microsoft.com/en-us/python/api/azureml-contrib-datadrift/?view=azure-ml-py). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/contrib/datadrift/azureml-datadrift.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prerequisites and Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Install the DataDrift package\n",
"\n",
"Install the azureml-contrib-datadrift, azureml-contrib-opendatasets and lightgbm packages before running this notebook.\n",
"```\n",
"pip install azureml-contrib-datadrift\n",
"pip install azureml-contrib-datasets\n",
"pip install lightgbm\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import time\n",
"from datetime import datetime, timedelta\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import requests\n",
"from azureml.contrib.datadrift import DataDriftDetector, AlertConfiguration\n",
"from azureml.contrib.opendatasets import NoaaIsdWeather\n",
"from azureml.core import Dataset, Workspace, Run\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.image import ContainerImage\n",
"from azureml.core.model import Model\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"from azureml.widgets import RunDetails\n",
"from sklearn.externals import joblib\n",
"from sklearn.model_selection import train_test_split\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up Configuraton and Create Azure ML Workspace\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Please type in your initials/alias. The prefix is prepended to the names of resources created by this notebook. \n",
"prefix = \"dd\"\n",
"\n",
"# NOTE: Please do not change the model_name, as it's required by the score.py file\n",
"model_name = \"driftmodel\"\n",
"image_name = \"{}driftimage\".format(prefix)\n",
"service_name = \"{}driftservice\".format(prefix)\n",
"\n",
"# optionally, set email address to receive an email alert for DataDrift\n",
"email_address = \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Train/Testing Data\n",
"\n",
"For this demo, we will use NOAA weather data from [Azure Open Datasets](https://azure.microsoft.com/services/open-datasets/). You may replace this step with your own dataset. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"usaf_list = ['725724', '722149', '723090', '722159', '723910', '720279',\n",
" '725513', '725254', '726430', '720381', '723074', '726682',\n",
" '725486', '727883', '723177', '722075', '723086', '724053',\n",
" '725070', '722073', '726060', '725224', '725260', '724520',\n",
" '720305', '724020', '726510', '725126', '722523', '703333',\n",
" '722249', '722728', '725483', '722972', '724975', '742079',\n",
" '727468', '722193', '725624', '722030', '726380', '720309',\n",
" '722071', '720326', '725415', '724504', '725665', '725424',\n",
" '725066']\n",
"\n",
"columns = ['usaf', 'wban', 'datetime', 'latitude', 'longitude', 'elevation', 'windAngle', 'windSpeed', 'temperature', 'stationName', 'p_k']\n",
"\n",
"\n",
"def enrich_weather_noaa_data(noaa_df):\n",
" hours_in_day = 23\n",
" week_in_year = 52\n",
" \n",
" noaa_df[\"hour\"] = noaa_df[\"datetime\"].dt.hour\n",
" noaa_df[\"weekofyear\"] = noaa_df[\"datetime\"].dt.week\n",
" \n",
" noaa_df[\"sine_weekofyear\"] = noaa_df['datetime'].transform(lambda x: np.sin((2*np.pi*x.dt.week-1)/week_in_year))\n",
" noaa_df[\"cosine_weekofyear\"] = noaa_df['datetime'].transform(lambda x: np.cos((2*np.pi*x.dt.week-1)/week_in_year))\n",
"\n",
" noaa_df[\"sine_hourofday\"] = noaa_df['datetime'].transform(lambda x: np.sin(2*np.pi*x.dt.hour/hours_in_day))\n",
" noaa_df[\"cosine_hourofday\"] = noaa_df['datetime'].transform(lambda x: np.cos(2*np.pi*x.dt.hour/hours_in_day))\n",
" \n",
" return noaa_df\n",
"\n",
"def add_window_col(input_df):\n",
" shift_interval = pd.Timedelta('-7 days') # your X days interval\n",
" df_shifted = input_df.copy()\n",
" df_shifted['datetime'] = df_shifted['datetime'] - shift_interval\n",
" df_shifted.drop(list(input_df.columns.difference(['datetime', 'usaf', 'wban', 'sine_hourofday', 'temperature'])), axis=1, inplace=True)\n",
"\n",
" # merge, keeping only observations where -1 lag is present\n",
" df2 = pd.merge(input_df,\n",
" df_shifted,\n",
" on=['datetime', 'usaf', 'wban', 'sine_hourofday'],\n",
" how='inner', # use 'left' to keep observations without lags\n",
" suffixes=['', '-7'])\n",
" return df2\n",
"\n",
"def get_noaa_data(start_time, end_time, cols, station_list):\n",
" isd = NoaaIsdWeather(start_time, end_time, cols=cols)\n",
" # Read into Pandas data frame.\n",
" noaa_df = isd.to_pandas_dataframe()\n",
" noaa_df = noaa_df.rename(columns={\"stationName\": \"station_name\"})\n",
" \n",
" df_filtered = noaa_df[noaa_df[\"usaf\"].isin(station_list)]\n",
" df_filtered.reset_index(drop=True)\n",
" \n",
" # Enrich with time features\n",
" df_enriched = enrich_weather_noaa_data(df_filtered)\n",
" \n",
" return df_enriched\n",
"\n",
"def get_featurized_noaa_df(start_time, end_time, cols, station_list):\n",
" df_1 = get_noaa_data(start_time - timedelta(days=7), start_time - timedelta(seconds=1), cols, station_list)\n",
" df_2 = get_noaa_data(start_time, end_time, cols, station_list)\n",
" noaa_df = pd.concat([df_1, df_2])\n",
" \n",
" print(\"Adding window feature\")\n",
" df_window = add_window_col(noaa_df)\n",
" \n",
" cat_columns = df_window.dtypes == object\n",
" cat_columns = cat_columns[cat_columns == True]\n",
" \n",
" print(\"Encoding categorical columns\")\n",
" df_encoded = pd.get_dummies(df_window, columns=cat_columns.keys().tolist())\n",
" \n",
" print(\"Dropping unnecessary columns\")\n",
" df_featurized = df_encoded.drop(['windAngle', 'windSpeed', 'datetime', 'elevation'], axis=1).dropna().drop_duplicates()\n",
" \n",
" return df_featurized"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Train model on Jan 1 - 14, 2009 data\n",
"df = get_featurized_noaa_df(datetime(2009, 1, 1), datetime(2009, 1, 14, 23, 59, 59), columns, usaf_list)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"label = \"temperature\"\n",
"x_df = df.drop(label, axis=1)\n",
"y_df = df[[label]]\n",
"x_train, x_test, y_train, y_test = train_test_split(df, y_df, test_size=0.2, random_state=223)\n",
"print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)\n",
"\n",
"training_dir = 'outputs/training'\n",
"training_file = \"training.csv\"\n",
"\n",
"# Generate training dataframe to register as Training Dataset\n",
"os.makedirs(training_dir, exist_ok=True)\n",
"training_df = pd.merge(x_train.drop(label, axis=1), y_train, left_index=True, right_index=True)\n",
"training_df.to_csv(training_dir + \"/\" + training_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create/Register Training Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_name = \"dataset\"\n",
"name_suffix = datetime.utcnow().strftime(\"%Y-%m-%d-%H-%M-%S\")\n",
"snapshot_name = \"snapshot-{}\".format(name_suffix)\n",
"\n",
"dstore = ws.get_default_datastore()\n",
"dstore.upload(training_dir, \"data/training\", show_progress=True)\n",
"dpath = dstore.path(\"data/training/training.csv\")\n",
"trainingDataset = Dataset.auto_read_files(dpath, include_path=True)\n",
"trainingDataset = trainingDataset.register(workspace=ws, name=dataset_name, description=\"dset\", exist_ok=True)\n",
"\n",
"trainingDataSnapshot = trainingDataset.create_snapshot(snapshot_name=snapshot_name, compute_target=None, create_data_snapshot=True)\n",
"datasets = [(Dataset.Scenario.TRAINING, trainingDataSnapshot)]\n",
"print(\"dataset registration done.\\n\")\n",
"datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train and Save Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import lightgbm as lgb\n",
"\n",
"train = lgb.Dataset(data=x_train, \n",
" label=y_train)\n",
"\n",
"test = lgb.Dataset(data=x_test, \n",
" label=y_test,\n",
" reference=train)\n",
"\n",
"params = {'learning_rate' : 0.1,\n",
" 'boosting' : 'gbdt',\n",
" 'metric' : 'rmse',\n",
" 'feature_fraction' : 1,\n",
" 'bagging_fraction' : 1,\n",
" 'max_depth': 6,\n",
" 'num_leaves' : 31,\n",
" 'objective' : 'regression',\n",
" 'bagging_freq' : 1,\n",
" \"verbose\": -1,\n",
" 'min_data_per_leaf': 100}\n",
"\n",
"model = lgb.train(params, \n",
" num_boost_round=500,\n",
" train_set=train,\n",
" valid_sets=[train, test],\n",
" verbose_eval=50,\n",
" early_stopping_rounds=25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_file = 'outputs/{}.pkl'.format(model_name)\n",
"\n",
"os.makedirs('outputs', exist_ok=True)\n",
"joblib.dump(model, model_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Model.register(model_path=model_file,\n",
" model_name=model_name,\n",
" workspace=ws,\n",
" datasets=datasets)\n",
"\n",
"print(model_name, image_name, service_name, model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploy Model To AKS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare Environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn', 'joblib', 'lightgbm', 'pandas'],\n",
" pip_packages=['azureml-monitoring', 'azureml-sdk[automl]'])\n",
"\n",
"with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Image creation may take up to 15 minutes.\n",
"\n",
"image_name = image_name + str(model.version)\n",
"\n",
"if not image_name in ws.images:\n",
" # Use the score.py defined in this directory as the execution script\n",
" # NOTE: The Model Data Collector must be enabled in the execution script for DataDrift to run correctly\n",
" image_config = ContainerImage.image_configuration(execution_script=\"score.py\",\n",
" runtime=\"python\",\n",
" conda_file=\"myenv.yml\",\n",
" description=\"Image with weather dataset model\")\n",
" image = ContainerImage.create(name=image_name,\n",
" models=[model],\n",
" image_config=image_config,\n",
" workspace=ws)\n",
"\n",
" image.wait_for_creation(show_output=True)\n",
"else:\n",
" image = ws.images[image_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Compute Target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_name = 'dd-demo-e2e'\n",
"prov_config = AksCompute.provisioning_configuration()\n",
"\n",
"if not aks_name in ws.compute_targets:\n",
" aks_target = ComputeTarget.create(workspace=ws,\n",
" name=aks_name,\n",
" provisioning_configuration=prov_config)\n",
"\n",
" aks_target.wait_for_completion(show_output=True)\n",
" print(aks_target.provisioning_state)\n",
" print(aks_target.provisioning_errors)\n",
"else:\n",
" aks_target=ws.compute_targets[aks_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy Service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = service_name\n",
"\n",
"if not aks_service_name in ws.webservices:\n",
" aks_config = AksWebservice.deploy_configuration(collect_model_data=True, enable_app_insights=True)\n",
" aks_service = Webservice.deploy_from_image(workspace=ws,\n",
" name=aks_service_name,\n",
" image=image,\n",
" deployment_config=aks_config,\n",
" deployment_target=aks_target)\n",
" aks_service.wait_for_deployment(show_output=True)\n",
" print(aks_service.state)\n",
"else:\n",
" aks_service = ws.webservices[aks_service_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run DataDrift Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Send Scoring Data to Service"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download Scoring Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Score Model on March 15, 2016 data\n",
"scoring_df = get_noaa_data(datetime(2016, 3, 15) - timedelta(days=7), datetime(2016, 3, 16), columns, usaf_list)\n",
"# Add the window feature column\n",
"scoring_df = add_window_col(scoring_df)\n",
"\n",
"# Drop features not used by the model\n",
"print(\"Dropping unnecessary columns\")\n",
"scoring_df = scoring_df.drop(['windAngle', 'windSpeed', 'datetime', 'elevation'], axis=1).dropna()\n",
"scoring_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One Hot Encode the scoring dataset to match the training dataset schema\n",
"columns_dict = model.datasets[\"training\"][0].get_profile().columns\n",
"extra_cols = ('Path', 'Column1')\n",
"for k in extra_cols:\n",
" columns_dict.pop(k, None)\n",
"training_columns = list(columns_dict.keys())\n",
"\n",
"categorical_columns = scoring_df.dtypes == object\n",
"categorical_columns = categorical_columns[categorical_columns == True]\n",
"\n",
"test_df = pd.get_dummies(scoring_df[categorical_columns.keys().tolist()])\n",
"encoded_df = scoring_df.join(test_df)\n",
"\n",
"# Populate missing OHE columns with 0 values to match traning dataset schema\n",
"difference = list(set(training_columns) - set(encoded_df.columns.tolist()))\n",
"for col in difference:\n",
" encoded_df[col] = 0\n",
"encoded_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Serialize dataframe to list of row dictionaries\n",
"encoded_dict = encoded_df.to_dict('records')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit Scoring Data to Service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"# retreive the API keys. AML generates two keys.\n",
"key1, key2 = aks_service.get_keys()\n",
"\n",
"total_count = len(scoring_df)\n",
"i = 0\n",
"load = []\n",
"for row in encoded_dict:\n",
" load.append(row)\n",
" i = i + 1\n",
" if i % 100 == 0:\n",
" payload = json.dumps({\"data\": load})\n",
" \n",
" # construct raw HTTP request and send to the service\n",
" payload_binary = bytes(payload,encoding = 'utf8')\n",
" headers = {'Content-Type':'application/json', 'Authorization': 'Bearer ' + key1}\n",
" resp = requests.post(aks_service.scoring_uri, payload_binary, headers=headers)\n",
" \n",
" print(\"prediction:\", resp.content, \"Progress: {}/{}\".format(i, total_count)) \n",
"\n",
" load = []\n",
" time.sleep(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure DataDrift"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"services = [service_name]\n",
"start = datetime.now() - timedelta(days=2)\n",
"end = datetime(year=2020, month=1, day=22, hour=15, minute=16)\n",
"feature_list = ['usaf', 'wban', 'latitude', 'longitude', 'station_name', 'p_k', 'sine_hourofday', 'cosine_hourofday', 'temperature-7']\n",
"alert_config = AlertConfiguration([email_address]) if email_address else None\n",
"\n",
"# there will be an exception indicating using get() method if DataDrift object already exist\n",
"try:\n",
" datadrift = DataDriftDetector.create(ws, model.name, model.version, services, frequency=\"Day\", alert_config=alert_config)\n",
"except KeyError:\n",
" datadrift = DataDriftDetector.get(ws, model.name, model.version)\n",
" \n",
"print(\"Details of DataDrift Object:\\n{}\".format(datadrift))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run an Adhoc DataDriftDetector Run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target_date = datetime.today()\n",
"run = datadrift.run(target_date, services, feature_list=feature_list, create_compute_target=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"exp = Experiment(ws, datadrift._id)\n",
"dd_run = Run(experiment=exp, run_id=run)\n",
"RunDetails(dd_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Drift Analysis Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(dd_run.get_children())\n",
"for child in children:\n",
" child.wait_for_completion()\n",
"\n",
"drift_metrics = datadrift.get_output(start_time=start, end_time=end)\n",
"drift_metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show all drift figures, one per serivice.\n",
"# If setting with_details is False (by default), only drift will be shown; if it's True, all details will be shown.\n",
"\n",
"drift_figures = datadrift.show(with_details=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enable DataDrift Schedule"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"datadrift.enable_schedule()"
]
}
],
"metadata": {
"authors": [
{
"name": "rafarmah"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: azure-ml-datadrift
dependencies:
- pip:
- azureml-sdk
- azureml-contrib-datadrift
- azureml-contrib-opendatasets
- lightgbm
- azureml-widgets

View File

@@ -0,0 +1,58 @@
import pickle
import json
import numpy
import azureml.train.automl
from sklearn.externals import joblib
from sklearn.linear_model import Ridge
from azureml.core.model import Model
from azureml.core.run import Run
from azureml.monitoring import ModelDataCollector
import time
import pandas as pd
def init():
global model, inputs_dc, prediction_dc, feature_names, categorical_features
print("Model is initialized" + time.strftime("%H:%M:%S"))
model_path = Model.get_model_path(model_name="driftmodel")
model = joblib.load(model_path)
feature_names = ["usaf", "wban", "latitude", "longitude", "station_name", "p_k",
"sine_weekofyear", "cosine_weekofyear", "sine_hourofday", "cosine_hourofday",
"temperature-7"]
categorical_features = ["usaf", "wban", "p_k", "station_name"]
inputs_dc = ModelDataCollector(model_name="driftmodel",
identifier="inputs",
feature_names=feature_names)
prediction_dc = ModelDataCollector("driftmodel",
identifier="predictions",
feature_names=["temperature"])
def run(raw_data):
global inputs_dc, prediction_dc
try:
data = json.loads(raw_data)["data"]
data = pd.DataFrame(data)
# Remove the categorical features as the model expects OHE values
input_data = data.drop(categorical_features, axis=1)
result = model.predict(input_data)
# Collect the non-OHE dataframe
collected_df = data[feature_names]
inputs_dc.collect(collected_df.values)
prediction_dc.collect(result)
return result.tolist()
except Exception as e:
error = str(e)
print(error + time.strftime("%H:%M:%S"))
return error

View File

@@ -115,16 +115,7 @@ jupyter notebook
- Simple example of using automated ML for regression - Simple example of using automated ML for regression
- Uses local compute for training - Uses local compute for training
- [auto-ml-remote-execution.ipynb](remote-execution/auto-ml-remote-execution.ipynb) - [auto-ml-remote-amlcompute.ipynb](remote-amlcompute/auto-ml-remote-amlcompute.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Example of using automated ML for classification using a remote linux DSVM for training
- Parallel execution of iterations
- Async tracking of progress
- Cancelling individual iterations or entire run
- Retrieving models for any iteration or logged metric
- Specify automated ML settings as kwargs
- [auto-ml-remote-amlcompute.ipynb](remote-batchai/auto-ml-remote-amlcompute.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Example of using automated ML for classification using remote AmlCompute for training - Example of using automated ML for classification using remote AmlCompute for training
- Parallel execution of iterations - Parallel execution of iterations
@@ -133,12 +124,6 @@ jupyter notebook
- Retrieving models for any iteration or logged metric - Retrieving models for any iteration or logged metric
- Specify automated ML settings as kwargs - Specify automated ML settings as kwargs
- [auto-ml-remote-attach.ipynb](remote-attach/auto-ml-remote-attach.ipynb)
- Dataset: Scikit learn's [20newsgroup](http://scikit-learn.org/stable/datasets/twenty_newsgroups.html)
- handling text data with preprocess flag
- Reading data from a blob store for remote executions
- using pandas dataframes for reading data
- [auto-ml-missing-data-blacklist-early-termination.ipynb](missing-data-blacklist-early-termination/auto-ml-missing-data-blacklist-early-termination.ipynb) - [auto-ml-missing-data-blacklist-early-termination.ipynb](missing-data-blacklist-early-termination/auto-ml-missing-data-blacklist-early-termination.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Blacklist certain pipelines - Blacklist certain pipelines
@@ -156,10 +141,6 @@ jupyter notebook
- Get details for a automated ML Run. (automated ML settings, run widget & all metrics) - Get details for a automated ML Run. (automated ML settings, run widget & all metrics)
- Download fitted pipeline for any iteration - Download fitted pipeline for any iteration
- [auto-ml-remote-execution-with-datastore.ipynb](remote-execution-with-datastore/auto-ml-remote-execution-with-datastore.ipynb)
- Dataset: Scikit learn's [20newsgroup](http://scikit-learn.org/stable/datasets/twenty_newsgroups.html)
- Download the data and store it in DataStore.
- [auto-ml-classification-with-deployment.ipynb](classification-with-deployment/auto-ml-classification-with-deployment.ipynb) - [auto-ml-classification-with-deployment.ipynb](classification-with-deployment/auto-ml-classification-with-deployment.ipynb)
- Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) - Dataset: scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)
- Simple example of using automated ML for classification - Simple example of using automated ML for classification
@@ -198,6 +179,26 @@ jupyter notebook
- Simple example of using automated ML for classification with ONNX models - Simple example of using automated ML for classification with ONNX models
- Uses local compute for training - Uses local compute for training
- [auto-ml-bank-marketing-subscribers-with-deployment.ipynb](bank-marketing-subscribers-with-deployment/auto-ml-bank-marketing-with-deployment.ipynb)
- Dataset: UCI's [bank marketing dataset](https://www.kaggle.com/janiobachmann/bank-marketing-dataset)
- Simple example of using automated ML for classification to predict term deposit subscriptions for a bank
- Uses azure compute for training
- [auto-ml-creditcard-with-deployment.ipynb](credit-card-fraud-detection-with-deployment/auto-ml-creditcard-with-deployment.ipynb)
- Dataset: Kaggle's [credit card fraud detection dataset](https://www.kaggle.com/mlg-ulb/creditcardfraud)
- Simple example of using automated ML for classification to fraudulent credit card transactions
- Uses azure compute for training
- [auto-ml-hardware-performance-with-deployment.ipynb](hardware-performance-prediction-with-deployment/auto-ml-hardware-performance-with-deployment.ipynb)
- Dataset: UCI's [computer hardware dataset](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware)
- Simple example of using automated ML for regression to predict the performance of certain combinations of hardware components
- Uses azure compute for training
- [auto-ml-concrete-strength-with-deployment.ipynb](predicting-concrete-strength-with-deployment/auto-ml-concrete-strength-with-deployment.ipynb)
- Dataset: UCI's [concrete compressive strength dataset](https://www.kaggle.com/pavanraj159/concrete-compressive-strength-data-set)
- Simple example of using automated ML for regression to predict the strength predict the compressive strength of concrete based off of different ingredient combinations and quantities of those ingredients
- Uses azure compute for training
<a name="documentation"></a> <a name="documentation"></a>
See [Configure automated machine learning experiments](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train) to learn how more about the the settings and features available for automated machine learning experiments. See [Configure automated machine learning experiments](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train) to learn how more about the the settings and features available for automated machine learning experiments.

View File

@@ -9,6 +9,8 @@ IF "%automl_env_file%"=="" SET automl_env_file="automl_env.yml"
IF NOT EXIST %automl_env_file% GOTO YmlMissing IF NOT EXIST %automl_env_file% GOTO YmlMissing
IF "%CONDA_EXE%"=="" GOTO CondaMissing
call conda activate %conda_env_name% 2>nul: call conda activate %conda_env_name% 2>nul:
if not errorlevel 1 ( if not errorlevel 1 (
@@ -42,6 +44,15 @@ IF NOT "%options%"=="nolaunch" (
goto End goto End
:CondaMissing
echo Please run this script from an Anaconda Prompt window.
echo You can start an Anaconda Prompt window by
echo typing Anaconda Prompt on the Start menu.
echo If you don't see the Anaconda Prompt app, install Miniconda.
echo If you are running an older version of Miniconda or Anaconda,
echo you can upgrade using the command: conda update conda
goto End
:YmlMissing :YmlMissing
echo File %automl_env_file% not found. echo File %automl_env_file% not found.

View File

@@ -0,0 +1,729 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Classification with Deployment using a Bank Marketing Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Deploy](#Deploy)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"\n",
"In this example we use the UCI Bank Marketing dataset to showcase how you can use AutoML for a classification problem and deploy it to an Azure Container Instance (ACI). The classification goal is to predict if the client will subscribe to a term deposit with the bank.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an experiment using an existing workspace.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Register the model.\n",
"6. Create a container image.\n",
"7. Create an Azure Container Instance (ACI) service.\n",
"8. Test the ACI service."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"from sklearn import datasets\n",
"import azureml.dataprep as dprep\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig\n",
"from azureml.train.automl.run import AutoMLRun"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for experiment\n",
"experiment_name = 'automl-classification-bmarketing'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-classification-bankmarketing'\n",
"\n",
"experiment=Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
" # For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data() script to be utilized in azure compute. To do this first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_Run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here we create the script to be run in azure comput for loading the data, we load the bank marketing dataset into X_train and y_train. Next X_train and y_train is returned for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\"\n",
"dflow = dprep.auto_read_file(data)\n",
"dflow.get_profile()\n",
"X_train = dflow.drop_columns(columns=['y'])\n",
"y_train = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
"dflow.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], Multi-class targets.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 2,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy\n",
"\n",
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method on `automl_classifier` returns the best run and the fitted model for the last invocation. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register the Fitted Model for Deployment\n",
"If neither `metric` nor `iteration` are specified in the `register_model` call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model trained on bank marketing data to predict if a client will subscribe to a term deposit'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-sdk', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost<=0.80'],\n",
" pip_packages=['azureml-sdk[automl]'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-sdk']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"bmData\", 'type': \"automl_classification\"},\n",
" description = \"Image for automl classification sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"bmData\", 'type': \"automl_classification\"}, \n",
" description = 'sample service for Automl Classification')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-bankmarketing'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"Now that the model is trained split our data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the bank marketing datasets.\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.model_selection import train_test_split\n",
"from numpy import array"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv\"\n",
"dflow = dprep.auto_read_file(data)\n",
"dflow.get_profile()\n",
"X_test = dflow.drop_columns(columns=['y'])\n",
"y_test = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
"dflow.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = fitted_model.predict(X_test)\n",
"actual = array(y_test)\n",
"actual = actual[:,0]\n",
"print(y_pred.shape, \" \", actual.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib notebook\n",
"test_pred = plt.scatter(actual, y_pred, color='b')\n",
"test_test = plt.scatter(actual, actual, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This Bank Marketing dataset is made available under the Creative Commons (CCO: Public Domain) License: https://creativecommons.org/publicdomain/zero/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: https://creativecommons.org/publicdomain/zero/1.0/ and is available at: https://www.kaggle.com/janiobachmann/bank-marketing-dataset .\n",
"\n",
"_**Acknowledgements**_\n",
"This data set is originally available within the UCI Machine Learning Database: https://archive.ics.uci.edu/ml/datasets/bank+marketing\n",
"\n",
"[Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014"
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification-bank-marketing
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,712 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Classification with Deployment using Credit Card Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Deploy](#Deploy)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"\n",
"In this example we use the associated credit card dataset to showcase how you can use AutoML for a simple classification problem and deploy it to an Azure Container Instance (ACI). The classification goal is to predict if a creditcard transaction is or is not considered a fraudulent charge.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an experiment using an existing workspace.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Register the model.\n",
"6. Create a container image.\n",
"7. Create an Azure Container Instance (ACI) service.\n",
"8. Test the ACI service."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import pandas as pd\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"import azureml.dataprep as dprep\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig\n",
"from azureml.train.automl.run import AutoMLRun"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for experiment\n",
"experiment_name = 'automl-classification-ccard'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-classification-creditcard'\n",
"\n",
"experiment=Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
" # For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data script to be utilized in azure compute. To do this, first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here create the script to be run in azure compute for loading the data, load the credit card dataset into cards and store the Class column (y) in the y variable and store the remaining data in the x variable. Next split the data using train_test_split and return X_train and y_train for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
"dflow = dprep.auto_read_file(data)\n",
"dflow.get_profile()\n",
"X = dflow.drop_columns(columns=['Class'])\n",
"y = dflow.keep_columns(columns=['Class'], validate_column_exists=True)\n",
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
"y_train, y_test = y.random_split(percentage=0.8, seed=223)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], Multi-class targets.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### If you would like to see even better results increase \"iteration_time_out minutes\" to 10+ mins and increase \"iterations\" to a minimum of 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 2,\n",
" \"primary_metric\": 'average_precision_score_weighted',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors_20190417.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy\n",
"\n",
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method on `automl_classifier` returns the best run and the fitted model for the last invocation. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register the Fitted Model for Deployment\n",
"If neither `metric` nor `iteration` are specified in the `register_model` call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-sdk', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn','py-xgboost<=0.80'],\n",
" pip_packages=['azureml-sdk[automl]'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-sdk']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"cards\", 'type': \"automl_classification\"},\n",
" description = \"Image for automl classification sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"cards\", 'type': \"automl_classification\"}, \n",
" description = 'sample service for Automl Classification')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-creditcard'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Randomly select and test\n",
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = fitted_model.predict(X_test)\n",
"y_pred"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Randomly select and test\n",
"# Plot outputs\n",
"%matplotlib notebook\n",
"test_pred = plt.scatter(y_test, y_pred, color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
"\n",
"\n",
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
"Please cite the following works: \n",
"\u00e2\u20ac\u00a2\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
"o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification-credit-card-fraud
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification-with-deployment
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -129,6 +129,22 @@
" test_size=0.2, \n", " test_size=0.2, \n",
" random_state=0)\n", " random_state=0)\n",
"\n", "\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ensure the x_train and x_test are pandas DataFrame."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the X_train and X_test to pandas DataFrame and set column names,\n", "# Convert the X_train and X_test to pandas DataFrame and set column names,\n",
"# This is needed for initializing the input variable names of ONNX model, \n", "# This is needed for initializing the input variable names of ONNX model, \n",
"# and the prediction with the ONNX model using the inference helper.\n", "# and the prediction with the ONNX model using the inference helper.\n",
@@ -158,6 +174,13 @@
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|" "|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set the preprocess=True, currently the InferenceHelper only supports this mode."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -299,7 +322,7 @@
" onnxrt_present = False\n", " onnxrt_present = False\n",
"\n", "\n",
"def get_onnx_res(run):\n", "def get_onnx_res(run):\n",
" res_path = '_debug_y_trans_converter.json'\n", " res_path = 'onnx_resource.json'\n",
" run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)\n", " run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)\n",
" with open(res_path) as f:\n", " with open(res_path) as f:\n",
" onnx_res = json.load(f)\n", " onnx_res = json.load(f)\n",
@@ -316,7 +339,7 @@
" print(pred_prob_onnx)\n", " print(pred_prob_onnx)\n",
"else:\n", "else:\n",
" if not python_version_compatible:\n", " if not python_version_compatible:\n",
" print('Please use Python version 3.6 to run the inference helper.') \n", " print('Please use Python version 3.6 or 3.7 to run the inference helper.') \n",
" if not onnxrt_present:\n", " if not onnxrt_present:\n",
" print('Please install the onnxruntime package to do the prediction with ONNX model.')" " print('Please install the onnxruntime package to do the prediction with ONNX model.')"
] ]

View File

@@ -0,0 +1,9 @@
name: auto-ml-classification-with-onnx
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- onnxruntime

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification-with-whitelisting
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,8 @@
name: auto-ml-classification
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -21,7 +21,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Automated Machine Learning\n", "# Automated Machine Learning\n",
"_**Prepare Data using `azureml.dataprep` for Remote Execution (DSVM)**_\n", "_**Prepare Data using `azureml.dataprep` for Remote Execution (AmlCompute)**_\n",
"\n", "\n",
"## Contents\n", "## Contents\n",
"1. [Introduction](#Introduction)\n", "1. [Introduction](#Introduction)\n",
@@ -192,7 +192,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Create or Attach a Remote Linux DSVM" "### Create or Attach an AmlCompute cluster"
] ]
}, },
{ {
@@ -201,21 +201,36 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"dsvm_name = 'mydsvmb'\n", "from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n", "\n",
"try:\n", "# Choose a name for your cluster.\n",
" while ws.compute_targets[dsvm_name].provisioning_state == 'Creating':\n", "amlcompute_cluster_name = \"cpu-cluster\"\n",
" time.sleep(1)\n", "\n",
" \n", "found = False\n",
" dsvm_compute = DsvmCompute(ws, dsvm_name)\n", "\n",
" print('Found existing DVSM.')\n", "# Check if this compute target already exists in the workspace.\n",
"except:\n", "\n",
" print('Creating a new DSVM.')\n", "cts = ws.compute_targets\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2_v2\")\n", "if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)\n", " found = True\n",
" dsvm_compute.wait_for_completion(show_output = True)\n", " print('Found existing compute target.')\n",
" print(\"Waiting one minute for ssh to be accessible\")\n", " compute_target = cts[amlcompute_cluster_name]\n",
" time.sleep(90) # Wait for ssh to be accessible" "\n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\\n\",\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
"\n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
"\n",
" # For a more detailed view of current AmlCompute status, use get_status()."
] ]
}, },
{ {
@@ -227,9 +242,13 @@
"from azureml.core.runconfig import RunConfiguration\n", "from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n", "from azureml.core.conda_dependencies import CondaDependencies\n",
"\n", "\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n", "conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n", "\n",
"conda_run_config.target = dsvm_compute\n", "# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n", "\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n", "cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd" "conda_run_config.environment.python.conda_dependencies = cd"
@@ -294,6 +313,27 @@
"remote_run.clean_preprocessor_cache()" "remote_run.clean_preprocessor_cache()"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

View File

@@ -0,0 +1,8 @@
name: auto-ml-dataprep-remote-execution
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,8 @@
name: auto-ml-dataprep
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,8 @@
name: auto-ml-exploring-previous-runs
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -36,19 +36,17 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"In this example, we show how AutoML can be used for bike share forecasting.\n", "This notebook demonstrates demand forecasting for a bike-sharing service using AutoML.\n",
"\n", "\n",
"The purpose is to demonstrate how to take advantage of the built-in holiday featurization, access the feature names, and further demonstrate how to work with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n", "AutoML highlights here include built-in holiday featurization, accessing engineered feature names, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
"\n", "\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
"In this notebook you would see\n", "Notebook synopsis:\n",
"1. Creating an Experiment in an existing Workspace\n", "1. Creating an Experiment in an existing Workspace\n",
"2. Instantiating AutoMLConfig with new task type \"forecasting\" for timeseries data training, and other timeseries related settings: for this dataset we use the basic one: \"time_column_name\" \n", "2. Configuration and local run of AutoML for a time-series model with lag and holiday features \n",
"3. Training the Model using local compute\n", "3. Viewing the engineered names for featurized data and featurization summary for all raw features\n",
"4. Exploring the results\n", "4. Evaluating the fitted model using a rolling test "
"5. Viewing the engineered names for featurized data and featurization summary for all raw features\n",
"6. Testing the fitted model"
] ]
}, },
{ {
@@ -69,10 +67,12 @@
"import numpy as np\n", "import numpy as np\n",
"import logging\n", "import logging\n",
"import warnings\n", "import warnings\n",
"\n",
"from pandas.tseries.frequencies import to_offset\n",
"\n",
"# Squash warning messages for cleaner output in the notebook\n", "# Squash warning messages for cleaner output in the notebook\n",
"warnings.showwarning = lambda *args, **kwargs: None\n", "warnings.showwarning = lambda *args, **kwargs: None\n",
"\n", "\n",
"\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n", "from azureml.train.automl import AutoMLConfig\n",
@@ -84,7 +84,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments." "As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
] ]
}, },
{ {
@@ -129,14 +129,15 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"data = pd.read_csv('bike-no.csv', parse_dates=['date'])" "data = pd.read_csv('bike-no.csv', parse_dates=['date'])\n",
"data.head()"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Let's set up what we know abou the dataset. \n", "Let's set up what we know about the dataset. \n",
"\n", "\n",
"**Target column** is what we want to forecast.\n", "**Target column** is what we want to forecast.\n",
"\n", "\n",
@@ -194,8 +195,7 @@
"source": [ "source": [
"### Setting forecaster maximum horizon \n", "### Setting forecaster maximum horizon \n",
"\n", "\n",
"Assuming your test data forms a full and regular time series(regular time intervals and no holes), \n", "The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 14 periods (i.e. 14 days). Notice that this is much shorter than the number of days in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
"the maximum horizon you will need to forecast is the length of the longest grain in your test set."
] ]
}, },
{ {
@@ -204,10 +204,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"if len(grain_column_names) == 0:\n", "max_horizon = 14"
" max_horizon = len(X_test)\n",
"else:\n",
" max_horizon = X_test.groupby(grain_column_names)[time_column_name].count().max()"
] ]
}, },
{ {
@@ -237,26 +234,25 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"time_column_name = 'date'\n",
"automl_settings = {\n", "automl_settings = {\n",
" \"time_column_name\": time_column_name,\n", " 'time_column_name': time_column_name,\n",
" # these columns are a breakdown of the total and therefore a leak\n", " 'max_horizon': max_horizon,\n",
" \"drop_column_names\": ['casual', 'registered'],\n",
" # knowing the country/region allows Automated ML to bring in holidays\n", " # knowing the country/region allows Automated ML to bring in holidays\n",
" \"country_or_region\" : 'US',\n", " 'country_or_region': 'US',\n",
" \"max_horizon\" : max_horizon,\n", " 'target_lags': 1,\n",
" \"target_lags\": 1 \n", " # these columns are a breakdown of the total and therefore a leak\n",
" 'drop_column_names': ['casual', 'registered']\n",
"}\n", "}\n",
"\n", "\n",
"automl_config = AutoMLConfig(task = 'forecasting', \n", "automl_config = AutoMLConfig(task='forecasting', \n",
" primary_metric='normalized_root_mean_squared_error',\n", " primary_metric='normalized_root_mean_squared_error',\n",
" iterations = 10,\n", " iterations=10,\n",
" iteration_timeout_minutes = 5,\n", " iteration_timeout_minutes=5,\n",
" X = X_train,\n", " X=X_train,\n",
" y = y_train,\n", " y=y_train,\n",
" n_cross_validations = 3, \n", " n_cross_validations=3, \n",
" path=project_folder,\n", " path=project_folder,\n",
" verbosity = logging.INFO,\n", " verbosity=logging.INFO,\n",
" **automl_settings)" " **automl_settings)"
] ]
}, },
@@ -264,7 +260,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We will now run the experiment, starting with 10 iterations of model search. Experiment can be continued for more iterations if the results are not yet good. You will see the currently running iterations printing to the console." "We will now run the experiment, starting with 10 iterations of model search. The experiment can be continued for more iterations if more accurate results are required. You will see the currently running iterations printing to the console."
] ]
}, },
{ {
@@ -356,11 +352,16 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Test the Best Fitted Model\n", "## Evaluate"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now use the best fitted model from the AutoML Run to make forecasts for the test set. \n",
"\n", "\n",
"Predict on training and test set, and calculate residual values.\n", "We always score on the original dataset whose schema matches the training set schema."
"\n",
"We always score on the original dataset whose schema matches the scheme of the training dataset."
] ]
}, },
{ {
@@ -372,21 +373,12 @@
"X_test.head()" "X_test.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_query = y_test.copy().astype(np.float)\n",
"y_query.fill(np.NaN)\n",
"y_fcst, X_trans = fitted_model.forecast(X_test, y_query)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We now define some functions for aligning output to input and for producing rolling forecasts over the full test set. As previously stated, the forecast horizon of 14 days is shorter than the length of the test set - which is about 120 days. To get predictions over the full test set, we iterate over the test set, making forecasts 14 days at a time and combining the results. We also make sure that each 14-day forecast uses up-to-date actuals - the current context - to construct lag features. \n",
"\n",
"It is a good practice to always align the output explicitly to the input, as the count and order of the rows may have changed during transformations that span multiple rows." "It is a good practice to always align the output explicitly to the input, as the count and order of the rows may have changed during transformations that span multiple rows."
] ]
}, },
@@ -396,7 +388,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def align_outputs(y_predicted, X_trans, X_test, y_test, predicted_column_name = 'predicted'):\n", "def align_outputs(y_predicted, X_trans, X_test, y_test, predicted_column_name='predicted',\n",
" horizon_colname='horizon_origin'):\n",
" \"\"\"\n", " \"\"\"\n",
" Demonstrates how to get the output aligned to the inputs\n", " Demonstrates how to get the output aligned to the inputs\n",
" using pandas indexes. Helps understand what happened if\n", " using pandas indexes. Helps understand what happened if\n",
@@ -408,7 +401,8 @@
" * model was asked to predict past max_horizon -> increase max horizon\n", " * model was asked to predict past max_horizon -> increase max horizon\n",
" * data at start of X_test was needed for lags -> provide previous periods\n", " * data at start of X_test was needed for lags -> provide previous periods\n",
" \"\"\"\n", " \"\"\"\n",
" df_fcst = pd.DataFrame({predicted_column_name : y_predicted})\n", " df_fcst = pd.DataFrame({predicted_column_name : y_predicted,\n",
" horizon_colname: X_trans[horizon_colname]})\n",
" # y and X outputs are aligned by forecast() function contract\n", " # y and X outputs are aligned by forecast() function contract\n",
" df_fcst.index = X_trans.index\n", " df_fcst.index = X_trans.index\n",
" \n", " \n",
@@ -427,7 +421,49 @@
" clean = together[together[[target_column_name, predicted_column_name]].notnull().all(axis=1)]\n", " clean = together[together[[target_column_name, predicted_column_name]].notnull().all(axis=1)]\n",
" return(clean)\n", " return(clean)\n",
"\n", "\n",
"df_all = align_outputs(y_fcst, X_trans, X_test, y_test)\n" "def do_rolling_forecast(fitted_model, X_test, y_test, max_horizon, freq='D'):\n",
" \"\"\"\n",
" Produce forecasts on a rolling origin over the given test set.\n",
" \n",
" Each iteration makes a forecast for the next 'max_horizon' periods \n",
" with respect to the current origin, then advances the origin by the horizon time duration. \n",
" The prediction context for each forecast is set so that the forecaster uses \n",
" the actual target values prior to the current origin time for constructing lag features.\n",
" \n",
" This function returns a concatenated DataFrame of rolling forecasts.\n",
" \"\"\"\n",
" df_list = []\n",
" origin_time = X_test[time_column_name].min()\n",
" while origin_time <= X_test[time_column_name].max():\n",
" # Set the horizon time - end date of the forecast\n",
" horizon_time = origin_time + max_horizon * to_offset(freq)\n",
" \n",
" # Extract test data from an expanding window up-to the horizon \n",
" expand_wind = (X_test[time_column_name] < horizon_time)\n",
" X_test_expand = X_test[expand_wind]\n",
" y_query_expand = np.zeros(len(X_test_expand)).astype(np.float)\n",
" y_query_expand.fill(np.NaN)\n",
" \n",
" if origin_time != X_test[time_column_name].min():\n",
" # Set the context by including actuals up-to the origin time\n",
" test_context_expand_wind = (X_test[time_column_name] < origin_time)\n",
" context_expand_wind = (X_test_expand[time_column_name] < origin_time)\n",
" y_query_expand[context_expand_wind] = y_test[test_context_expand_wind]\n",
" \n",
" # Make a forecast out to the maximum horizon\n",
" y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)\n",
" \n",
" # Align forecast with test set for dates within the current rolling window \n",
" trans_tindex = X_trans.index.get_level_values(time_column_name)\n",
" trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)\n",
" test_roll_wind = expand_wind & (X_test[time_column_name] >= origin_time)\n",
" df_list.append(align_outputs(y_fcst[trans_roll_wind], X_trans[trans_roll_wind],\n",
" X_test[test_roll_wind], y_test[test_roll_wind]))\n",
" \n",
" # Advance the origin time\n",
" origin_time = horizon_time\n",
" \n",
" return pd.concat(df_list, ignore_index=True)"
] ]
}, },
{ {
@@ -436,6 +472,30 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"df_all = do_rolling_forecast(fitted_model, X_test, y_test, max_horizon)\n",
"df_all"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now calculate some error metrics for the forecasts and vizualize the predictions vs. the actuals."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def APE(actual, pred):\n",
" \"\"\"\n",
" Calculate absolute percentage error.\n",
" Returns a vector of APE values with same length as actual/pred.\n",
" \"\"\"\n",
" return 100*np.abs((actual - pred)/actual)\n",
"\n",
"def MAPE(actual, pred):\n", "def MAPE(actual, pred):\n",
" \"\"\"\n", " \"\"\"\n",
" Calculate mean absolute percentage error.\n", " Calculate mean absolute percentage error.\n",
@@ -445,8 +505,7 @@
" not_zero = ~np.isclose(actual, 0.0)\n", " not_zero = ~np.isclose(actual, 0.0)\n",
" actual_safe = actual[not_na & not_zero]\n", " actual_safe = actual[not_na & not_zero]\n",
" pred_safe = pred[not_na & not_zero]\n", " pred_safe = pred[not_na & not_zero]\n",
" APE = 100*np.abs((actual_safe - pred_safe)/actual_safe)\n", " return np.mean(APE(actual_safe, pred_safe))"
" return np.mean(APE)"
] ]
}, },
{ {
@@ -469,12 +528,57 @@
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n", "plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()" "plt.show()"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The MAPE seems high; it is being skewed by an actual with a small absolute value. For a more informative evaluation, we can calculate the metrics by forecast horizon:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all.groupby('horizon_origin').apply(\n",
" lambda df: pd.Series({'MAPE': MAPE(df[target_column_name], df['predicted']),\n",
" 'RMSE': np.sqrt(mean_squared_error(df[target_column_name], df['predicted'])),\n",
" 'MAE': mean_absolute_error(df[target_column_name], df['predicted'])}))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's also interesting to see the distributions of APE (absolute percentage error) by horizon. On a log scale, the outlying APE in the horizon-3 group is clear."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all_APE = df_all.assign(APE=APE(df_all[target_column_name], df_all['predicted']))\n",
"APEs = [df_all_APE[df_all['horizon_origin'] == h].APE.values for h in range(1, max_horizon + 1)]\n",
"\n",
"%matplotlib notebook\n",
"plt.boxplot(APEs)\n",
"plt.yscale('log')\n",
"plt.xlabel('horizon')\n",
"plt.ylabel('APE (%)')\n",
"plt.title('Absolute Percentage Errors by Forecast Horizon')\n",
"\n",
"plt.show()"
]
} }
], ],
"metadata": { "metadata": {
"authors": [ "authors": [
{ {
"name": "xiaga@microsoft.com, tosingli@microsoft.com" "name": "xiaga@microsoft.com, tosingli@microsoft.com, erwright@microsoft.com"
} }
], ],
"kernelspec": { "kernelspec": {
@@ -492,7 +596,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.7" "version": "3.6.8"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -0,0 +1,9 @@
name: auto-ml-forecasting-bike-share
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- statsmodels

View File

@@ -35,17 +35,16 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"In this example, we show how AutoML can be used for energy demand forecasting.\n", "In this example, we show how AutoML can be used to forecast a single time-series in the energy demand application area. \n",
"\n", "\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
"In this notebook you would see\n", "Notebook synopsis:\n",
"1. Creating an Experiment in an existing Workspace\n", "1. Creating an Experiment in an existing Workspace\n",
"2. Instantiating AutoMLConfig with new task type \"forecasting\" for timeseries data training, and other timeseries related settings: for this dataset we use the basic one: \"time_column_name\" \n", "2. Configuration and local run of AutoML for a simple time-series model\n",
"3. Training the Model using local compute\n", "3. View engineered features and prediction results\n",
"4. Exploring the results\n", "4. Configuration and local run of AutoML for a time-series model with lag and rolling window features\n",
"5. Viewing the engineered names for featurized data and featurization summary for all raw features\n", "5. Estimate feature importance"
"6. Testing the fitted model"
] ]
}, },
{ {
@@ -66,10 +65,10 @@
"import numpy as np\n", "import numpy as np\n",
"import logging\n", "import logging\n",
"import warnings\n", "import warnings\n",
"\n",
"# Squash warning messages for cleaner output in the notebook\n", "# Squash warning messages for cleaner output in the notebook\n",
"warnings.showwarning = lambda *args, **kwargs: None\n", "warnings.showwarning = lambda *args, **kwargs: None\n",
"\n", "\n",
"\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n", "from azureml.train.automl import AutoMLConfig\n",
@@ -81,7 +80,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments." "As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
] ]
}, },
{ {
@@ -117,7 +116,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Data\n", "## Data\n",
"Read energy demanding data from file, and preview data." "We will use energy consumption data from New York City for model training. The data is stored in a tabular format and includes energy demand and basic weather data at an hourly frequency. Pandas CSV reader is used to read the file into memory. Special attention is given to the \"timeStamp\" column in the data since it contains text which should be parsed as datetime-type objects. "
] ]
}, },
{ {
@@ -130,13 +129,20 @@
"data.head()" "data.head()"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We must now define the schema of this dataset. Every time-series must have a time column and a target. The target quantity is what will be eventually forecasted by a trained model. In this case, the target is the \"demand\" column. The other columns, \"temp\" and \"precip,\" are implicitly designated as features."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# let's take note of what columns means what in the data\n", "# Dataset schema\n",
"time_column_name = 'timeStamp'\n", "time_column_name = 'timeStamp'\n",
"target_column_name = 'demand'" "target_column_name = 'demand'"
] ]
@@ -145,7 +151,14 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Split the data into train and test sets\n" "### Forecast Horizon\n",
"\n",
"In addition to the data schema, we must also specify the forecast horizon. A forecast horizon is a time span into the future (or just beyond the latest date in the training data) where forecasts of the target quantity are needed. Choosing a forecast horizon is application specific, but a rule-of-thumb is that **the horizon should be the time-frame where you need actionable decisions based on the forecast.** The horizon usually has a strong relationship with the frequency of the time-series data, that is, the sampling interval of the target quantity and the features. For instance, the NYC energy demand data has an hourly frequency. A decision that requires a demand forecast to the hour is unlikely to be made weeks or months in advance, particularly if we expect weather to be a strong determinant of demand. We may have fairly accurate meteorological forecasts of the hourly temperature and precipitation on a the time-scale of a day or two, however.\n",
"\n",
"Given the above discussion, we generally recommend that users set forecast horizons to less than 100 time periods (i.e. less than 100 hours in the NYC energy example). Furthermore, **AutoML's memory use and computation time increase in proportion to the length of the horizon**, so the user should consider carefully how they set this value. If a long horizon forecast really is necessary, it may be good practice to aggregate the series to a coarser time scale. \n",
"\n",
"\n",
"Forecast horizons in AutoML are given as integer multiples of the time-series frequency. In this example, we set the horizon to 48 hours."
] ]
}, },
{ {
@@ -154,8 +167,32 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"X_train = data[data[time_column_name] < '2017-02-01']\n", "max_horizon = 48"
"X_test = data[data[time_column_name] >= '2017-02-01']\n", ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split the data into train and test sets\n",
"We now split the data into a train and a test set so that we may evaluate model performance. We note that the tail of the dataset contains a large number of NA values in the target column, so we designate the test set as the 48 hour window ending on the latest date of known energy demand. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Find time point to split on\n",
"latest_known_time = data[~pd.isnull(data[target_column_name])][time_column_name].max()\n",
"split_time = latest_known_time - pd.Timedelta(hours=max_horizon)\n",
"\n",
"# Split into train/test sets\n",
"X_train = data[data[time_column_name] <= split_time]\n",
"X_test = data[(data[time_column_name] > split_time) & (data[time_column_name] <= latest_known_time)]\n",
"\n",
"# Move the target values into their own arrays \n",
"y_train = X_train.pop(target_column_name).values\n", "y_train = X_train.pop(target_column_name).values\n",
"y_test = X_test.pop(target_column_name).values" "y_test = X_test.pop(target_column_name).values"
] ]
@@ -166,7 +203,7 @@
"source": [ "source": [
"## Train\n", "## Train\n",
"\n", "\n",
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n", "We now instantiate an AutoMLConfig object. This config defines the settings and data used to run the experiment. For forecasting tasks, we must provide extra configuration related to the time-series data schema and forecasting context. Here, only the name of the time column and the maximum forecast horizon are needed. Other settings are described below:\n",
"\n", "\n",
"|Property|Description|\n", "|Property|Description|\n",
"|-|-|\n", "|-|-|\n",
@@ -176,7 +213,7 @@
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n", "|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n", "|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n", "|**n_cross_validations**|Number of cross validation splits. Rolling Origin Validation is used to split time-series in a temporally consistent way.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder. " "|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder. "
] ]
}, },
@@ -186,22 +223,22 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"automl_settings = {\n", "time_series_settings = {\n",
" \"time_column_name\": time_column_name \n", " 'time_column_name': time_column_name,\n",
" 'max_horizon': max_horizon\n",
"}\n", "}\n",
"\n", "\n",
"\n", "automl_config = AutoMLConfig(task='forecasting',\n",
"automl_config = AutoMLConfig(task = 'forecasting',\n", " debug_log='automl_nyc_energy_errors.log',\n",
" debug_log = 'automl_nyc_energy_errors.log',\n",
" primary_metric='normalized_root_mean_squared_error',\n", " primary_metric='normalized_root_mean_squared_error',\n",
" iterations = 10,\n", " iterations=10,\n",
" iteration_timeout_minutes = 5,\n", " iteration_timeout_minutes=5,\n",
" X = X_train,\n", " X=X_train,\n",
" y = y_train,\n", " y=y_train,\n",
" n_cross_validations = 3,\n", " n_cross_validations=3,\n",
" path=project_folder,\n", " path=project_folder,\n",
" verbosity = logging.INFO,\n", " verbosity = logging.INFO,\n",
" **automl_settings)" " **time_series_settings)"
] ]
}, },
{ {
@@ -358,7 +395,8 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Calculate accuracy metrics\n" "### Calculate accuracy metrics\n",
"Finally, we calculate some accuracy metrics for the forecast and plot the predictions vs. the actuals over the time range in the test set."
] ]
}, },
{ {
@@ -395,9 +433,12 @@
"\n", "\n",
"# Plot outputs\n", "# Plot outputs\n",
"%matplotlib notebook\n", "%matplotlib notebook\n",
"test_pred = plt.scatter(df_all[target_column_name], df_all['predicted'], color='b')\n", "pred, = plt.plot(df_all[time_column_name], df_all['predicted'], color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n", "actual, = plt.plot(df_all[time_column_name], df_all[target_column_name], color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n", "plt.xticks(fontsize=8)\n",
"plt.legend((pred, actual), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.title('Prediction vs. Actual Time-Series')\n",
"\n",
"plt.show()" "plt.show()"
] ]
}, },
@@ -412,16 +453,16 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Using lags and rolling window features to improve the forecast" "### Using lags and rolling window features"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We did not use lags in the previous model specification. In effect, the prediction was the result of a simple regression on date, grain and any additional features. This is often a very good prediction as common time series patterns like seasonality and trends can be captured in this manner. Such simple regression is horizon-less: it doesn't matter how far into the future we are predicting, because we are not using past data.\n", "We did not use lags in the previous model specification. In effect, the prediction was the result of a simple regression on date, grain and any additional features. This is often a very good prediction as common time series patterns like seasonality and trends can be captured in this manner. Such simple regression is horizon-less: it doesn't matter how far into the future we are predicting, because we are not using past data. In the previous example, the horizon was only used to split the data for cross-validation.\n",
"\n", "\n",
"Now that we configured target lags, that is the previous values of the target variables, and the prediction is no longer horizon-less. We therefore must specify the `max_horizon` that the model will learn to forecast. The `target_lags` keyword specifies how far back we will construct the lags of the target variable, and the `target_rolling_window_size` specifies the size of the rolling window over which we will generate the `max`, `min` and `sum` features." "Now that we configured target lags, that is the previous values of the target variables, and the prediction is no longer horizon-less. We therefore must still specify the `max_horizon` that the model will learn to forecast. The `target_lags` keyword specifies how far back we will construct the lags of the target variable, and the `target_rolling_window_size` specifies the size of the rolling window over which we will generate the `max`, `min` and `sum` features."
] ]
}, },
{ {
@@ -430,27 +471,32 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"automl_settings_lags = {\n", "time_series_settings_with_lags = {\n",
" 'time_column_name': time_column_name,\n", " 'time_column_name': time_column_name,\n",
" 'target_lags': 1,\n", " 'max_horizon': max_horizon,\n",
" 'target_rolling_window_size': 5,\n", " 'target_lags': 12,\n",
" # you MUST set the max_horizon when using lags and rolling windows\n", " 'target_rolling_window_size': 4\n",
" # it is optional when looking-back features are not used \n",
" 'max_horizon': len(y_test), # only one grain\n",
"}\n", "}\n",
"\n", "\n",
"\n", "automl_config_lags = AutoMLConfig(task='forecasting',\n",
"automl_config_lags = AutoMLConfig(task = 'forecasting',\n", " debug_log='automl_nyc_energy_errors.log',\n",
" debug_log = 'automl_nyc_energy_errors.log',\n", " primary_metric='normalized_root_mean_squared_error',\n",
" primary_metric='normalized_root_mean_squared_error',\n", " blacklist_models=['ElasticNet'],\n",
" iterations = 10,\n", " iterations=10,\n",
" iteration_timeout_minutes = 5,\n", " iteration_timeout_minutes=10,\n",
" X = X_train,\n", " X=X_train,\n",
" y = y_train,\n", " y=y_train,\n",
" n_cross_validations = 3,\n", " n_cross_validations=3,\n",
" path=project_folder,\n", " path=project_folder,\n",
" verbosity = logging.INFO,\n", " verbosity=logging.INFO,\n",
" **automl_settings_lags)" " **time_series_settings_with_lags)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now start a new local run, this time with lag and rolling window featurization. AutoML applies featurizations in the setup stage, prior to iterating over ML models. The full training set is featurized first, followed by featurization of each of the CV splits. Lag and rolling window features introduce additional complexity, so the run will take longer than in the previous example that lacked these featurizations."
] ]
}, },
{ {
@@ -498,9 +544,10 @@
"\n", "\n",
"# Plot outputs\n", "# Plot outputs\n",
"%matplotlib notebook\n", "%matplotlib notebook\n",
"test_pred = plt.scatter(df_lags[target_column_name], df_lags['predicted'], color='b')\n", "pred, = plt.plot(df_lags[time_column_name], df_lags['predicted'], color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n", "actual, = plt.plot(df_lags[time_column_name], df_lags[target_column_name], color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n", "plt.xticks(fontsize=8)\n",
"plt.legend((pred, actual), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()" "plt.show()"
] ]
}, },
@@ -520,8 +567,8 @@
"from azureml.train.automl.automlexplainer import explain_model\n", "from azureml.train.automl.automlexplainer import explain_model\n",
"\n", "\n",
"# feature names are everything in the transformed data except the target\n", "# feature names are everything in the transformed data except the target\n",
"features = X_trans.columns[:-1]\n", "features = X_trans_lags.columns[:-1]\n",
"expl = explain_model(fitted_model, X_train, X_test, features = features, best_run=best_run_lags, y_train = y_train)\n", "expl = explain_model(fitted_model_lags, X_train.copy(), X_test.copy(), features=features, best_run=best_run_lags, y_train=y_train)\n",
"# unpack the tuple\n", "# unpack the tuple\n",
"shap_values, expected_values, feat_overall_imp, feat_names, per_class_summary, per_class_imp = expl\n", "shap_values, expected_values, feat_overall_imp, feat_names, per_class_summary, per_class_imp = expl\n",
"best_run_lags" "best_run_lags"
@@ -540,7 +587,7 @@
"metadata": { "metadata": {
"authors": [ "authors": [
{ {
"name": "xiaga, tosingli" "name": "xiaga, tosingli, erwright"
} }
], ],
"kernelspec": { "kernelspec": {
@@ -558,7 +605,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.7" "version": "3.6.8"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -0,0 +1,10 @@
name: auto-ml-forecasting-energy-demand
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- statsmodels
- azureml-explain-model

View File

@@ -37,16 +37,10 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Introduction\n", "## Introduction\n",
"In this example, we use AutoML to find and tune a time-series forecasting model.\n", "In this example, we use AutoML to train, select, and operationalize a time-series forecasting model for multiple time-series.\n",
"\n", "\n",
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n", "Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
"\n", "\n",
"In this notebook, you will:\n",
"1. Create an Experiment in an existing Workspace\n",
"2. Instantiate an AutoMLConfig \n",
"3. Find and train a forecasting model using local compute\n",
"4. Evaluate the performance of the model\n",
"\n",
"The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area." "The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area."
] ]
}, },
@@ -68,10 +62,10 @@
"import numpy as np\n", "import numpy as np\n",
"import logging\n", "import logging\n",
"import warnings\n", "import warnings\n",
"\n",
"# Squash warning messages for cleaner output in the notebook\n", "# Squash warning messages for cleaner output in the notebook\n",
"warnings.showwarning = lambda *args, **kwargs: None\n", "warnings.showwarning = lambda *args, **kwargs: None\n",
"\n", "\n",
"\n",
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
"from azureml.train.automl import AutoMLConfig\n", "from azureml.train.automl import AutoMLConfig\n",
@@ -82,7 +76,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment is a named object in a Workspace which represents a predictive task, the output of which is a trained model and a set of evaluation metrics for the model. " "As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem. "
] ]
}, },
{ {
@@ -236,7 +230,7 @@
"\n", "\n",
"For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time, the grain column names, and the maximum forecast horizon. A time column is required for forecasting, while the grain is optional. If a grain is not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n", "For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time, the grain column names, and the maximum forecast horizon. A time column is required for forecasting, while the grain is optional. If a grain is not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n",
"\n", "\n",
"The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up-to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning organizaion that needs to estimate the next month of sales would set the horizon accordingly. \n", "The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up-to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning organizaion that needs to estimate the next month of sales would set the horizon accordingly. Please see the [energy_demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) for more discussion of forecast horizon.\n",
"\n", "\n",
"Finally, a note about the cross-validation (CV) procedure for time-series data. AutoML uses out-of-sample error estimates to select a best pipeline/model, so it is important that the CV fold splitting is done correctly. Time-series can violate the basic statistical assumptions of the canonical K-Fold CV strategy, so AutoML implements a [rolling origin validation](https://robjhyndman.com/hyndsight/tscv/) procedure to create CV folds for time-series data. To use this procedure, you just need to specify the desired number of CV folds in the AutoMLConfig object. It is also possible to bypass CV and use your own validation set by setting the *X_valid* and *y_valid* parameters of AutoMLConfig.\n", "Finally, a note about the cross-validation (CV) procedure for time-series data. AutoML uses out-of-sample error estimates to select a best pipeline/model, so it is important that the CV fold splitting is done correctly. Time-series can violate the basic statistical assumptions of the canonical K-Fold CV strategy, so AutoML implements a [rolling origin validation](https://robjhyndman.com/hyndsight/tscv/) procedure to create CV folds for time-series data. To use this procedure, you just need to specify the desired number of CV folds in the AutoMLConfig object. It is also possible to bypass CV and use your own validation set by setting the *X_valid* and *y_valid* parameters of AutoMLConfig.\n",
"\n", "\n",
@@ -269,7 +263,7 @@
" 'time_column_name': time_column_name,\n", " 'time_column_name': time_column_name,\n",
" 'grain_column_names': grain_column_names,\n", " 'grain_column_names': grain_column_names,\n",
" 'drop_column_names': ['logQuantity'],\n", " 'drop_column_names': ['logQuantity'],\n",
" 'max_horizon': n_test_periods # optional\n", " 'max_horizon': n_test_periods\n",
"}\n", "}\n",
"\n", "\n",
"automl_config = AutoMLConfig(task='forecasting',\n", "automl_config = AutoMLConfig(task='forecasting',\n",
@@ -278,7 +272,7 @@
" iterations=10,\n", " iterations=10,\n",
" X=X_train,\n", " X=X_train,\n",
" y=y_train,\n", " y=y_train,\n",
" n_cross_validations=5,\n", " n_cross_validations=3,\n",
" enable_ensembling=False,\n", " enable_ensembling=False,\n",
" path=project_folder,\n", " path=project_folder,\n",
" verbosity=logging.INFO,\n", " verbosity=logging.INFO,\n",
@@ -324,7 +318,8 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Predict\n", "# Forecasting\n",
"\n",
"Now that we have retrieved the best pipeline/model, it can be used to make predictions on test data. First, we remove the target values from the test set:" "Now that we have retrieved the best pipeline/model, it can be used to make predictions on test data. First, we remove the target values from the test set:"
] ]
}, },
@@ -852,7 +847,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.7" "version": "3.6.8"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -0,0 +1,9 @@
name: auto-ml-forecasting-orange-juice-sales
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- statsmodels

View File

@@ -0,0 +1,8 @@
name: auto-ml-missing-data-blacklist-early-termination
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,9 @@
name: auto-ml-model-explanation
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- azureml-explain-model

View File

@@ -0,0 +1,800 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Regression with Deployment using Hardware Performance Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the Predicting Compressive Strength of Concrete Dataset to showcase how you can use AutoML for a regression problem. The regression goal is to predict the compressive strength of concrete based off of different ingredient combinations and the quantities of those ingredients.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Test the best fitted model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"As part of the setup you have already created an Azure ML Workspace object. For AutoML you will need to create an Experiment object, which is a named object in a Workspace used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"import azureml.dataprep as dprep\n",
" \n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the experiment and specify the project folder.\n",
"experiment_name = 'automl-regression-concrete'\n",
"project_folder = './sample_projects/automl-regression-concrete'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
" # For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data script to be utilized in azure compute. To do this, first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here create the script to be run in azure compute for loading the data, load the concrete strength dataset into the X and y variables. Next, split the data using train_test_split and return X_train and y_train for training the model. Finally, return X_train and y_train for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\"\n",
"dflow = dprep.auto_read_file(data)\n",
"dflow.get_profile()\n",
"X = dflow.drop_columns(columns=['CONCRETE'])\n",
"y = dflow.keep_columns(columns=['CONCRETE'], validate_column_exists=True)\n",
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
"y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
"dflow.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics: <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### If you would like to see even better results increase \"iteration_time_out minutes\" to 10+ mins and increase \"iterations\" to a minimum of 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'spearman_correlation',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'regression',\n",
" debug_log = 'automl.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"Widget for Monitoring Runs\n",
"The widget will first report a \u00e2\u20ac\u0153loading status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve the Best Model\n",
"Below we select the best pipeline from our iterations. The get_output method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on get_output allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Best Model Based on Any Other Metric\n",
"Show the run and the model that has the smallest root_mean_squared_error value (which turned out to be the same as the one with largest spearman_correlation value):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"root_mean_squared_error\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration = iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register the Fitted Model for Deployment\n",
"If neither metric nor iteration are specified in the register_model call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-sdk', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'], pip_packages=['azureml-sdk[automl]'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-sdk']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"},\n",
" description = \"Image for automl regression sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"}, \n",
" description = 'sample service for Automl Regression')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-concrete'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test\n",
"\n",
"Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()\n",
"y_test = np.array(y_test)\n",
"y_test = y_test[:,0]\n",
"X_train = X_train.to_pandas_dataframe()\n",
"y_train = y_train.to_pandas_dataframe()\n",
"y_train = np.array(y_train)\n",
"y_train = y_train[:,0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Predict on training and test set, and calculate residual values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = fitted_model.predict(X_train)\n",
"y_residual_train = y_train - y_pred_train\n",
"\n",
"y_pred_test = fitted_model.predict(X_test)\n",
"y_residual_test = y_test - y_pred_test\n",
"\n",
"y_residual_train.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Set up a multi-plot chart.\n",
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n",
"f.suptitle('Regression Residual Values', fontsize = 18)\n",
"f.set_figheight(6)\n",
"f.set_figwidth(16)\n",
"\n",
"# Plot residual values of training set.\n",
"a0.axis([0, 360, -200, 200])\n",
"a0.plot(y_residual_train, 'bo', alpha = 0.5)\n",
"a0.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n",
"a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)), fontsize = 12)\n",
"a0.set_xlabel('Training samples', fontsize = 12)\n",
"a0.set_ylabel('Residual Values', fontsize = 12)\n",
"\n",
"# Plot a histogram.\n",
"#a0.hist(y_residual_train, orientation = 'horizontal', color = ['b']*len(y_residual_train), bins = 10, histtype = 'step')\n",
"#a0.hist(y_residual_train, orientation = 'horizontal', color = ['b']*len(y_residual_train), alpha = 0.2, bins = 10)\n",
"\n",
"# Plot residual values of test set.\n",
"a1.axis([0, 90, -200, 200])\n",
"a1.plot(y_residual_test, 'bo', alpha = 0.5)\n",
"a1.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n",
"a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)), fontsize = 12)\n",
"a1.set_xlabel('Test samples', fontsize = 12)\n",
"a1.set_yticklabels([])\n",
"\n",
"# Plot a histogram.\n",
"#a1.hist(y_residual_test, orientation = 'horizontal', color = ['b']*len(y_residual_test), bins = 10, histtype = 'step')\n",
"#a1.hist(y_residual_test, orientation = 'horizontal', color = ['b']*len(y_residual_test), alpha = 0.2, bins = 10)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot outputs\n",
"%matplotlib notebook\n",
"test_pred = plt.scatter(y_test, y_pred_test, color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements\n",
"\n",
"This Predicting Compressive Strength of Concrete Dataset is made available under the CC0 1.0 Universal (CC0 1.0)\n",
"Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/. Any rights in individual contents of the database are licensed under the CC0 1.0 Universal (CC0 1.0)\n",
"Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/ . The dataset itself can be found here: https://www.kaggle.com/pavanraj159/concrete-compressive-strength-data-set and http://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength\n",
"\n",
"I-Cheng Yeh, \"Modeling of strength of high performance concrete using artificial neural networks,\" Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998). \n",
"\n",
"Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: auto-ml-regression-concrete-strength
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,800 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Regression with Deployment using Hardware Performance Dataset**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)\n",
"1. [Acknowledgements](#Acknowledgements)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the Hardware Performance Dataset to showcase how you can use AutoML for a simple regression problem. The Regression goal is to predict the performance of certain combinations of hardware parts.\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration](../../../configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Configure AutoML using `AutoMLConfig`.\n",
"3. Train the model using local compute.\n",
"4. Explore the results.\n",
"5. Test the best fitted model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"As part of the setup you have already created an Azure ML Workspace object. For AutoML you will need to create an Experiment object, which is a named object in a Workspace used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"import azureml.dataprep as dprep\n",
" \n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the experiment and specify the project folder.\n",
"experiment_name = 'automl-regression-hardware'\n",
"project_folder = './sample_projects/automl-remote-regression'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a compute target for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read this article on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
" # For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data\n",
"\n",
"Here load the data in the get_data script to be utilized in azure compute. To do this, first load all the necessary libraries and dependencies to set up paths for the data and to create the conda_run_config."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data\n",
"\n",
"Here create the script to be run in azure compute for loading the data, load the hardware dataset into the X and y variables. Next split the data using train_test_split and return X_train and y_train for training the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\"\n",
"dflow = dprep.auto_read_file(data)\n",
"dflow.get_profile()\n",
"X = dflow.drop_columns(columns=['ERP'])\n",
"y = dflow.keep_columns(columns=['ERP'], validate_column_exists=True)\n",
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
"y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
"dflow.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Train\n",
"\n",
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**task**|classification or regression|\n",
"|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics: <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n",
"\n",
"**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### If you would like to see even better results increase \"iteration_time_out minutes\" to 10+ mins and increase \"iterations\" to a minimum of 30"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 5,\n",
" \"iterations\": 10,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'spearman_correlation',\n",
" \"preprocess\": True,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO,\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'regression',\n",
" debug_log = 'automl_errors_20190417.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve the Best Model\n",
"Below we select the best pipeline from our iterations. The get_output method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on get_output allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model that has the smallest `root_mean_squared_error` value (which turned out to be the same as the one with largest `spearman_correlation` value):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"root_mean_squared_error\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration = iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register the Fitted Model for Deployment\n",
"If neither metric nor iteration are specified in the register_model call, the iteration with the best primary metric is registered."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"description = 'AutoML Model'\n",
"tags = None\n",
"model = remote_run.register_model(description = description, tags = tags)\n",
"\n",
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Scoring Script\n",
"The scoring script is required to generate the image for deployment. It contains the code to do the predictions on input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"import azureml.train.automl\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" model_path = Model.get_model_path(model_name = '<<modelid>>') # this name is model.id of model that we want to deploy\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"def run(rawdata):\n",
" try:\n",
" data = json.loads(rawdata)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n",
" return json.dumps({\"result\":result.tolist()})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a YAML File for the Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. Details about retrieving the versions can be found in notebook [12.auto-ml-retrieve-the-training-sdk-versions](12.auto-ml-retrieve-the-training-sdk-versions.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dependencies = remote_run.get_run_sdk_dependencies(iteration = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in ['azureml-train-automl', 'azureml-sdk', 'azureml-core']:\n",
" print('{}\\t{}'.format(p, dependencies[p]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'], pip_packages=['azureml-sdk[automl]'])\n",
"\n",
"conda_env_file_name = 'myenv.yml'\n",
"myenv.save_to_file('.', conda_env_file_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Substitute the actual version number in the environment file.\n",
"# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.\n",
"# However, we include this in case this code is used on an experiment from a previous SDK version.\n",
"\n",
"with open(conda_env_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(conda_env_file_name, 'w') as cefw:\n",
" cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-sdk']))\n",
"\n",
"# Substitute the actual model id in the script file.\n",
"\n",
"script_file_name = 'score.py'\n",
"\n",
"with open(script_file_name, 'r') as cefr:\n",
" content = cefr.read()\n",
"\n",
"with open(script_file_name, 'w') as cefw:\n",
" cefw.write(content.replace('<<modelid>>', remote_run.model_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Container Image\n",
"\n",
"Next use Azure Container Instances for deploying models as a web service for quickly deploying and validating your model\n",
"or when testing a model that is under development."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script = script_file_name,\n",
" conda_file = conda_env_file_name,\n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"},\n",
" description = \"Image for automl regression sample\")\n",
"\n",
"image = Image.create(name = \"automlsampleimage\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)\n",
"\n",
"if image.creation_state == 'Failed':\n",
" print(\"Image build log at: \" + image.image_build_log_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy the Image as a Web Service on Azure Container Instance\n",
"\n",
"Deploy an image that contains the model and other assets needed by the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"digits\", 'type': \"automl_regression\"}, \n",
" description = 'sample service for Automl Regression')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'automl-sample-hardware'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete a Web Service\n",
"\n",
"Deletes the specified web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get Logs from a Deployed Web Service\n",
"\n",
"Gets logs from a deployed web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#aci_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"Now that the model is trained, split the data in the same way the data was split for training (The difference here is the data is being split locally) and then run the test data through the trained model to get the predicted values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()\n",
"y_test = np.array(y_test)\n",
"y_test = y_test[:,0]\n",
"X_train = X_train.to_pandas_dataframe()\n",
"y_train = y_train.to_pandas_dataframe()\n",
"y_train = np.array(y_train)\n",
"y_train = y_train[:,0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Predict on training and test set, and calculate residual values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = fitted_model.predict(X_train)\n",
"y_residual_train = y_train - y_pred_train\n",
"\n",
"y_pred_test = fitted_model.predict(X_test)\n",
"y_residual_test = y_test - y_pred_test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate metrics for the prediction\n",
"\n",
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
"from the trained model that was returned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Set up a multi-plot chart.\n",
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n",
"f.suptitle('Regression Residual Values', fontsize = 18)\n",
"f.set_figheight(6)\n",
"f.set_figwidth(16)\n",
"\n",
"# Plot residual values of training set.\n",
"a0.axis([0, 360, -200, 200])\n",
"a0.plot(y_residual_train, 'bo', alpha = 0.5)\n",
"a0.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n",
"a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)),fontsize = 12)\n",
"a0.set_xlabel('Training samples', fontsize = 12)\n",
"a0.set_ylabel('Residual Values', fontsize = 12)\n",
"\n",
"# Plot residual values of test set.\n",
"a1.axis([0, 90, -200, 200])\n",
"a1.plot(y_residual_test, 'bo', alpha = 0.5)\n",
"a1.plot([-10,360],[0,0], 'r-', lw = 3)\n",
"a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n",
"a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)),fontsize = 12)\n",
"a1.set_xlabel('Test samples', fontsize = 12)\n",
"a1.set_yticklabels([])\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib notebook\n",
"test_pred = plt.scatter(y_test, y_pred_test, color='')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements\n",
"This Predicting Hardware Performance Dataset is made available under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/. Any rights in individual contents of the database are licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication License: https://creativecommons.org/publicdomain/zero/1.0/ . The dataset itself can be found here: https://www.kaggle.com/faizunnabi/comp-hardware-performance and https://archive.ics.uci.edu/ml/datasets/Computer+Hardware\n",
"\n",
"_**Citation Found Here**_\n"
]
}
],
"metadata": {
"authors": [
{
"name": "v-rasav"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: auto-ml-regression-hardware-performance
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,9 @@
name: auto-ml-regression
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml
- paramiko<2.5.0

View File

@@ -119,7 +119,9 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Create or Attach existing AmlCompute\n", "### Create or Attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run. In this tutorial, you create an AmlCompute as your training compute resource.\n", "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"\n", "\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota." "As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
] ]
@@ -134,12 +136,10 @@
"from azureml.core.compute import ComputeTarget\n", "from azureml.core.compute import ComputeTarget\n",
"\n", "\n",
"# Choose a name for your cluster.\n", "# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"cpucluster\"\n", "amlcompute_cluster_name = \"cpu-cluster\"\n",
"\n", "\n",
"found = False\n", "found = False\n",
"\n",
"# Check if this compute target already exists in the workspace.\n", "# Check if this compute target already exists in the workspace.\n",
"\n",
"cts = ws.compute_targets\n", "cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n", "if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n", " found = True\n",

View File

@@ -0,0 +1,8 @@
name: auto-ml-remote-amlcompute
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -1,565 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-attach/auto-ml-remote-attach.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution using attach**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the scikit-learn's [20newsgroup](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html) to showcase how you can use AutoML to handle text data with remote attach.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Attach an existing DSVM to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using the DSVM.\n",
"5. Explore the results.\n",
"6. Viewing the engineered names for featurized data and featurization summary for all raw features.\n",
"7. Test the best fitted model.\n",
"\n",
"In addition this notebook showcases the following features\n",
"- **Parallel** executions for iterations\n",
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`\n",
"- Handling **text** data using the `preprocess` flag"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-attach'\n",
"project_folder = './sample_projects/automl-remote-attach'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach a Remote Linux DSVM\n",
"To use a remote Docker compute target:\n",
"1. Create a Linux DSVM in Azure, following these [instructions](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro). Make sure you use the Ubuntu flavor (not CentOS). Make sure that disk space is available under `/tmp` because AutoML creates files under `/tmp/azureml_run`s. The DSVM should have more cores than the number of parallel runs that you plan to enable. It should also have at least 4GB per core.\n",
"2. Enter the IP address, user name and password below.\n",
"\n",
"**Note:** By default, SSH runs on port 22 and you don't need to change the port number below. If you've configured SSH to use a different port, change `dsvm_ssh_port` accordinglyaddress. [Read more](https://docs.microsoft.com/en-us/azure/virtual-machines/troubleshooting/detailed-troubleshoot-ssh-connection) on changing SSH ports for security reasons."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, RemoteCompute\n",
"import time\n",
"\n",
"# Add your VM information below\n",
"# If a compute with the specified compute_name already exists, it will be used and the dsvm_ip_addr, dsvm_ssh_port, \n",
"# dsvm_username and dsvm_password will be ignored.\n",
"compute_name = 'mydsvmb'\n",
"dsvm_ip_addr = '<<ip_addr>>'\n",
"dsvm_ssh_port = 22\n",
"dsvm_username = '<<username>>'\n",
"dsvm_password = '<<password>>'\n",
"\n",
"if compute_name in ws.compute_targets:\n",
" print('Using existing compute.')\n",
" dsvm_compute = ws.compute_targets[compute_name]\n",
"else:\n",
" attach_config = RemoteCompute.attach_configuration(address=dsvm_ip_addr, username=dsvm_username, password=dsvm_password, ssh_port=dsvm_ssh_port)\n",
" ComputeTarget.attach(workspace=ws, name=compute_name, attach_configuration=attach_config)\n",
"\n",
" while ws.compute_targets[compute_name].provisioning_state == 'Creating':\n",
" time.sleep(1)\n",
"\n",
" dsvm_compute = ws.compute_targets[compute_name]\n",
" \n",
" if dsvm_compute.provisioning_state == 'Failed':\n",
" print('Attached failed.')\n",
" print(dsvm_compute.provisioning_errors)\n",
" dsvm_compute.detach()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"\n",
"pandas_dependency = 'pandas==' + pkg_resources.get_distribution(\"pandas\").version\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80',pandas_dependency])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"For remote executions you should author a `get_data.py` file containing a `get_data()` function. This file should be in the root directory of the project. You can encapsulate code to read data either from a blob storage or local disk in this file.\n",
"In this example, the `get_data()` function returns a [dictionary](README.md#getdata)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import numpy as np\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"def get_data():\n",
" remove = ('headers', 'footers', 'quotes')\n",
" categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
" data_train = fetch_20newsgroups(subset = 'train', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
" \n",
" X_train = np.array(data_train.data).reshape((len(data_train.data),1))\n",
" y_train = np.array(data_train.target)\n",
" \n",
" return { \"X\" : X_train, \"y\" : y_train }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n",
"**Note:** When using Remote DSVM, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM.|\n",
"|**preprocess**|Setting this to *True* enables AutoML to perform preprocessing on the input to handle *missing data*, and to perform some common *feature extraction*.|\n",
"|**enable_cache**|Setting this to *True* enables preprocess done once and reuse the same preprocessed data for all the iterations. Default value is True.\n",
"|**max_cores_per_iteration**|Indicates how many cores on the compute target would be used to train a single pipeline.<br>Default is *1*; you can set it to *-1* to use all cores.|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 60,\n",
" \"iterations\": 4,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_cores_per_iteration\": 2\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pre-process cache cleanup\n",
"The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the engineered names for featurized data\n",
"Below we display the engineered feature names generated for the featurized data using the preprocessing featurization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['datatransformer'].get_engineered_feature_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the featurization summary\n",
"Below we display the featurization that was performed on different raw features in the user data. For each raw feature in the user data, the following information is displayed:-\n",
"- Raw feature name\n",
"- Number of engineered features formed out of this raw feature\n",
"- Type detected\n",
"- If feature was dropped\n",
"- List of feature transformations for the raw feature"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['datatransformer'].get_featurization_summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `accuracy` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lookup_metric = \"accuracy\"\n",
"# best_run, fitted_model = remote_run.get_output(metric = lookup_metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 0\n",
"zero_run, zero_model = remote_run.get_output(iteration = iteration)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test data.\n",
"from pandas_ml import ConfusionMatrix\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"remove = ('headers', 'footers', 'quotes')\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"\n",
"data_test = fetch_20newsgroups(subset = 'test', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
"\n",
"X_test = np.array(data_test.data).reshape((len(data_test.data),1))\n",
"y_test = data_test.target\n",
"\n",
"# Test our best pipeline.\n",
"\n",
"y_pred = fitted_model.predict(X_test)\n",
"y_pred_strings = [data_test.target_names[i] for i in y_pred]\n",
"y_test_strings = [data_test.target_names[i] for i in y_test]\n",
"\n",
"cm = ConfusionMatrix(y_test_strings, y_pred_strings)\n",
"print(cm)\n",
"cm.plot()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,555 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution using AmlCompute**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you would see\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Create or Attach existing AmlCompute to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using AmlCompute\n",
"5. Explore the results.\n",
"6. Test the best fitted model.\n",
"\n",
"In addition this notebook showcases the following features\n",
"- **Parallel** executions for iterations\n",
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import csv\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-amlcompute'\n",
"project_folder = './project'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create or Attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
" # For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"For remote executions, you need to make the data accessible from the remote compute.\n",
"This can be done by uploading the data to DataStore.\n",
"In this example, we upload scikit-learn's [load_digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = datasets.load_digits()\n",
"\n",
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")\n",
"\n",
"ds = ws.get_default_datastore()\n",
"ds.upload(src_dir='./data', target_path='bai_data', overwrite=True, show_progress=True)\n",
"\n",
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='bai_data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n",
"**Note:** When using AmlCompute, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM.|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 2,\n",
" \"iterations\": 20,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": False,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run.\n",
"In this example, we specify `show_output = False` to suppress console output while the run is in progress."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"\n",
"#### Loading executed runs\n",
"In case you need to load a previously executed run, enable the cell below and replace the `run_id` value."
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"remote_run = AutoMLRun(experiment = experiment, run_id = 'AutoML_5db13491-c92a-4f1d-b622-8ab8d973a058')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `log_loss` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"log_loss\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration\n",
"Show the run and the model from the third iteration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration=iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"#### Load Test Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"digits = datasets.load_digits()\n",
"X_test = digits.data[:10, :]\n",
"y_test = digits.target[:10]\n",
"images = digits.images[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Testing Our Best Fitted Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Randomly select digits and test.\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n",
" print(index)\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(1, figsize=(3,3))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,593 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-execution-with-datastore/auto-ml-remote-execution-with-datastore.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution with DataStore**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"This sample accesses a data file on a remote DSVM through DataStore. Advantages of using data store are:\n",
"1. DataStore secures the access details.\n",
"2. DataStore supports read, write to blob and file store\n",
"3. AutoML natively supports copying data from DataStore to DSVM\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you would see\n",
"1. Storing data in DataStore.\n",
"2. get_data returning data from DataStore."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import time\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.compute import DsvmCompute\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for experiment\n",
"experiment_name = 'automl-remote-datastore-file'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-remote-datastore-file'\n",
"\n",
"experiment=Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Remote Linux DSVM\n",
"Note: If creation fails with a message about Marketplace purchase eligibilty, go to portal.azure.com, start creating DSVM there, and select \"Want to create programmatically\" to enable programmatic creation. Once you've enabled it, you can exit without actually creating VM.\n",
"\n",
"**Note**: By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you can switch to a different port (such as 5022), you can append the port number to the address. [Read more](https://docs.microsoft.com/en-us/azure/virtual-machines/troubleshooting/detailed-troubleshoot-ssh-connection) on this."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compute_target_name = 'mydsvmc'\n",
"\n",
"try:\n",
" while ws.compute_targets[compute_target_name].provisioning_state == 'Creating':\n",
" time.sleep(1)\n",
" \n",
" dsvm_compute = DsvmCompute(workspace=ws, name=compute_target_name)\n",
" print('found existing:', dsvm_compute.name)\n",
"except:\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size=\"Standard_D2_v2\")\n",
" dsvm_compute = DsvmCompute.create(ws, name=compute_target_name, provisioning_configuration=dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output=True)\n",
" print(\"Waiting one minute for ssh to be accessible\")\n",
" time.sleep(90) # Wait for ssh to be accessible"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"\n",
"### Copy data file to local\n",
"\n",
"Download the data file.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data') "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"import csv\n",
"\n",
"remove = ('headers', 'footers', 'quotes')\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"data_train = fetch_20newsgroups(subset = 'train', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload data to the cloud"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now make the data accessible remotely by uploading that data from your local machine into Azure so it can be accessed for remote training. The datastore is a convenient construct associated with your workspace for you to upload/download data, and interact with it from your remote compute targets. It is backed by Azure blob storage account.\n",
"\n",
"The data.tsv files are uploaded into a directory named data at the root of the datastore."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#blob_datastore = Datastore(ws, blob_datastore_name)\n",
"ds = ws.get_default_datastore()\n",
"print(ds.datastore_type, ds.account_name, ds.container_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ds.upload_files(\"data.tsv\")\n",
"ds.upload(src_dir='./data', target_path='data', overwrite=True, show_progress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure & Run\n",
"\n",
"First let's create a DataReferenceConfigruation object to inform the system what data folder to download to the compute target.\n",
"The path_on_compute should be an absolute path to ensure that the data files are downloaded only once. The get_data method should use this same path to access the data files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"pandas_dependency = 'pandas==' + pkg_resources.get_distribution(\"pandas\").version\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80',pandas_dependency])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Get Data File\n",
"For remote executions you should author a get_data.py file containing a get_data() function. This file should be in the root directory of the project. You can encapsulate code to read data either from a blob storage or local disk in this file.\n",
"\n",
"The *get_data()* function returns a [dictionary](README.md#getdata).\n",
"\n",
"The read_csv uses the path_on_compute value specified in the DataReferenceConfiguration call plus the path_on_datastore folder and then the actual file name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify automl_settings as **kwargs** as well. Also note that you can use the get_data() symantic for local excutions too. \n",
"\n",
"<i>Note: For Remote DSVM and Batch AI you cannot pass Numpy arrays directly to AutoMLConfig.</i>\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration|\n",
"|**iterations**|Number of iterations. In each iteration Auto ML trains a specific pipeline with the data|\n",
"|**n_cross_validations**|Number of cross validation splits|\n",
"|**max_concurrent_iterations**|Max number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM\n",
"|**preprocess**| *True/False* <br>Setting this to *True* enables Auto ML to perform preprocessing <br>on the input to handle *missing data*, and perform some common *feature extraction*|\n",
"|**enable_cache**|Setting this to *True* enables preprocess done once and reuse the same preprocessed data for all the iterations. Default value is True.|\n",
"|**max_cores_per_iteration**| Indicates how many cores on the compute target would be used to train a single pipeline.<br> Default is *1*, you can set it to *-1* to use all cores|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 60,\n",
" \"iterations\": 4,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_cores_per_iteration\": 1,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path=project_folder,\n",
" run_configuration=conda_run_config,\n",
" #compute_target = dsvm_compute,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets/models even when the experiment is running to retreive the best model up to that point. Once you are satisfied with the model you can cancel a particular iteration or the whole run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"#### Widget for monitoring runs\n",
"\n",
"The widget will sit on \"loading\" until the first iteration completed, then you will see an auto-updating graph and table show up. It refreshed once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under /tmp/azureml_run/{iterationid}/azureml-logs\n",
"\n",
"NOTE: The widget displays a link at the bottom. This links to a web-ui to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use sdk methods to fetch all the child runs and see individual metrics that we log. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Canceling Runs\n",
"You can cancel ongoing remote runs using the *cancel()* and *cancel_iteration()* functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pre-process cache cleanup\n",
"The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The *get_output* method returns the best run and the fitted model. There are overloads on *get_output* that allow you to retrieve the best run and fitted model for *any* logged metric or a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model based on any other metric"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lookup_metric = \"accuracy\"\n",
"# best_run, fitted_model = remote_run.get_output(metric=lookup_metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a specific iteration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# iteration = 1\n",
"# best_run, fitted_model = remote_run.get_output(iteration=iteration)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test data.\n",
"from pandas_ml import ConfusionMatrix\n",
"\n",
"data_test = fetch_20newsgroups(subset = 'test', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
"\n",
"X_test = np.array(data_test.data).reshape((len(data_test.data),1))\n",
"y_test = data_test.target\n",
"\n",
"# Test our best pipeline.\n",
"\n",
"y_pred = fitted_model.predict(X_test)\n",
"y_pred_strings = [data_test.target_names[i] for i in y_pred]\n",
"y_test_strings = [data_test.target_names[i] for i in y_test]\n",
"\n",
"cm = ConfusionMatrix(y_test_strings, y_pred_strings)\n",
"print(cm)\n",
"cm.plot()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,534 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-execution/auto-ml-remote-execution.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution using DSVM (Ubuntu)**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you wiil learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Attach an existing DSVM to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using the DSVM.\n",
"5. Explore the results.\n",
"6. Test the best fitted model.\n",
"\n",
"In addition, this notebook showcases the following features:\n",
"- **Parallel** executions for iterations\n",
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import time\n",
"import csv\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-dsvm'\n",
"project_folder = './project'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Remote Linux DSVM\n",
"**Note:** If creation fails with a message about Marketplace purchase eligibilty, start creation of a DSVM through the [Azure portal](https://portal.azure.com), and select \"Want to create programmatically\" to enable programmatic creation. Once you've enabled this setting, you can exit the portal without actually creating the DSVM, and creation of the DSVM through the notebook should work.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import DsvmCompute\n",
"\n",
"dsvm_name = 'mydsvma'\n",
"try:\n",
" dsvm_compute = DsvmCompute(ws, dsvm_name)\n",
" print('Found an existing DSVM.')\n",
"except:\n",
" print('Creating a new DSVM.')\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2_v2\")\n",
" dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output = True)\n",
" print(\"Waiting one minute for ssh to be accessible\")\n",
" time.sleep(90) # Wait for ssh to be accessible"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"For remote executions, you need to make the data accessible from the remote compute.\n",
"This can be done by uploading the data to DataStore.\n",
"In this example, we upload scikit-learn's [load_digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = datasets.load_digits()\n",
"\n",
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")\n",
"\n",
"ds = ws.get_default_datastore()\n",
"ds.upload(src_dir='./data', target_path='re_data', overwrite=True, show_progress=True)\n",
"\n",
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='re_data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/re_data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/re_data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n",
"**Note:** When using Remote DSVM, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be less than the number of cores on the DSVM.|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 10,\n",
" \"iterations\": 20,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": False,\n",
" \"max_concurrent_iterations\": 2,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder, \n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note:** The first run on a new DSVM may take several minutes to prepare the environment."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run.\n",
"\n",
"In this example, we specify `show_output = False` to suppress console output while the run is in progress."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"\n",
"#### Loading Executed Runs\n",
"In case you need to load a previously executed run, enable the cell below and replace the `run_id` value."
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"remote_run = AutoMLRun(experiment=experiment, run_id = 'AutoML_480d3ed6-fc94-44aa-8f4e-0b945db9d3ef')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `log_loss` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"log_loss\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration\n",
"Show the run and the model from the third iteration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration = iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"#### Load Test Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"digits = datasets.load_digits()\n",
"X_test = digits.data[:10, :]\n",
"y_test = digits.target[:10]\n",
"images = digits.images[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Test Our Best Fitted Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Randomly select digits and test.\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n",
" print(index)\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(1, figsize=(3,3))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: auto-ml-sample-weight
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,8 @@
name: auto-ml-sparse-data-train-test-split
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,8 @@
name: auto-ml-subsampling-local
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -0,0 +1,709 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Track Data Drift between Training and Inference Data in Production \n",
"\n",
"With this notebook, you will learn how to enable the DataDrift service to automatically track and determine whether your inference data is drifting from the data your model was initially trained on. The DataDrift service provides metrics and visualizations to help stakeholders identify which specific features cause the concept drift to occur.\n",
"\n",
"Please email driftfeedback@microsoft.com with any issues. A member from the DataDrift team will respond shortly. \n",
"\n",
"The DataDrift Public Preview API can be found [here](https://docs.microsoft.com/en-us/python/api/azureml-contrib-datadrift/?view=azure-ml-py). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/contrib/datadrift/azureml-datadrift.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prerequisites and Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Install the DataDrift package\n",
"\n",
"Install the azureml-contrib-datadrift, azureml-contrib-opendatasets and lightgbm packages before running this notebook.\n",
"```\n",
"pip install azureml-contrib-datadrift\n",
"pip install azureml-contrib-datasets\n",
"pip install lightgbm\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import time\n",
"from datetime import datetime, timedelta\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import requests\n",
"from azureml.contrib.datadrift import DataDriftDetector, AlertConfiguration\n",
"from azureml.contrib.opendatasets import NoaaIsdWeather\n",
"from azureml.core import Dataset, Workspace, Run\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.image import ContainerImage\n",
"from azureml.core.model import Model\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"from azureml.widgets import RunDetails\n",
"from sklearn.externals import joblib\n",
"from sklearn.model_selection import train_test_split\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up Configuraton and Create Azure ML Workspace\n",
"\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Please type in your initials/alias. The prefix is prepended to the names of resources created by this notebook. \n",
"prefix = \"dd\"\n",
"\n",
"# NOTE: Please do not change the model_name, as it's required by the score.py file\n",
"model_name = \"driftmodel\"\n",
"image_name = \"{}driftimage\".format(prefix)\n",
"service_name = \"{}driftservice\".format(prefix)\n",
"\n",
"# optionally, set email address to receive an email alert for DataDrift\n",
"email_address = \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Train/Testing Data\n",
"\n",
"For this demo, we will use NOAA weather data from [Azure Open Datasets](https://azure.microsoft.com/services/open-datasets/). You may replace this step with your own dataset. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"usaf_list = ['725724', '722149', '723090', '722159', '723910', '720279',\n",
" '725513', '725254', '726430', '720381', '723074', '726682',\n",
" '725486', '727883', '723177', '722075', '723086', '724053',\n",
" '725070', '722073', '726060', '725224', '725260', '724520',\n",
" '720305', '724020', '726510', '725126', '722523', '703333',\n",
" '722249', '722728', '725483', '722972', '724975', '742079',\n",
" '727468', '722193', '725624', '722030', '726380', '720309',\n",
" '722071', '720326', '725415', '724504', '725665', '725424',\n",
" '725066']\n",
"\n",
"columns = ['usaf', 'wban', 'datetime', 'latitude', 'longitude', 'elevation', 'windAngle', 'windSpeed', 'temperature', 'stationName', 'p_k']\n",
"\n",
"\n",
"def enrich_weather_noaa_data(noaa_df):\n",
" hours_in_day = 23\n",
" week_in_year = 52\n",
" \n",
" noaa_df[\"hour\"] = noaa_df[\"datetime\"].dt.hour\n",
" noaa_df[\"weekofyear\"] = noaa_df[\"datetime\"].dt.week\n",
" \n",
" noaa_df[\"sine_weekofyear\"] = noaa_df['datetime'].transform(lambda x: np.sin((2*np.pi*x.dt.week-1)/week_in_year))\n",
" noaa_df[\"cosine_weekofyear\"] = noaa_df['datetime'].transform(lambda x: np.cos((2*np.pi*x.dt.week-1)/week_in_year))\n",
"\n",
" noaa_df[\"sine_hourofday\"] = noaa_df['datetime'].transform(lambda x: np.sin(2*np.pi*x.dt.hour/hours_in_day))\n",
" noaa_df[\"cosine_hourofday\"] = noaa_df['datetime'].transform(lambda x: np.cos(2*np.pi*x.dt.hour/hours_in_day))\n",
" \n",
" return noaa_df\n",
"\n",
"def add_window_col(input_df):\n",
" shift_interval = pd.Timedelta('-7 days') # your X days interval\n",
" df_shifted = input_df.copy()\n",
" df_shifted['datetime'] = df_shifted['datetime'] - shift_interval\n",
" df_shifted.drop(list(input_df.columns.difference(['datetime', 'usaf', 'wban', 'sine_hourofday', 'temperature'])), axis=1, inplace=True)\n",
"\n",
" # merge, keeping only observations where -1 lag is present\n",
" df2 = pd.merge(input_df,\n",
" df_shifted,\n",
" on=['datetime', 'usaf', 'wban', 'sine_hourofday'],\n",
" how='inner', # use 'left' to keep observations without lags\n",
" suffixes=['', '-7'])\n",
" return df2\n",
"\n",
"def get_noaa_data(start_time, end_time, cols, station_list):\n",
" isd = NoaaIsdWeather(start_time, end_time, cols=cols)\n",
" # Read into Pandas data frame.\n",
" noaa_df = isd.to_pandas_dataframe()\n",
" noaa_df = noaa_df.rename(columns={\"stationName\": \"station_name\"})\n",
" \n",
" df_filtered = noaa_df[noaa_df[\"usaf\"].isin(station_list)]\n",
" df_filtered.reset_index(drop=True)\n",
" \n",
" # Enrich with time features\n",
" df_enriched = enrich_weather_noaa_data(df_filtered)\n",
" \n",
" return df_enriched\n",
"\n",
"def get_featurized_noaa_df(start_time, end_time, cols, station_list):\n",
" df_1 = get_noaa_data(start_time - timedelta(days=7), start_time - timedelta(seconds=1), cols, station_list)\n",
" df_2 = get_noaa_data(start_time, end_time, cols, station_list)\n",
" noaa_df = pd.concat([df_1, df_2])\n",
" \n",
" print(\"Adding window feature\")\n",
" df_window = add_window_col(noaa_df)\n",
" \n",
" cat_columns = df_window.dtypes == object\n",
" cat_columns = cat_columns[cat_columns == True]\n",
" \n",
" print(\"Encoding categorical columns\")\n",
" df_encoded = pd.get_dummies(df_window, columns=cat_columns.keys().tolist())\n",
" \n",
" print(\"Dropping unnecessary columns\")\n",
" df_featurized = df_encoded.drop(['windAngle', 'windSpeed', 'datetime', 'elevation'], axis=1).dropna().drop_duplicates()\n",
" \n",
" return df_featurized"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Train model on Jan 1 - 14, 2009 data\n",
"df = get_featurized_noaa_df(datetime(2009, 1, 1), datetime(2009, 1, 14, 23, 59, 59), columns, usaf_list)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"label = \"temperature\"\n",
"x_df = df.drop(label, axis=1)\n",
"y_df = df[[label]]\n",
"x_train, x_test, y_train, y_test = train_test_split(df, y_df, test_size=0.2, random_state=223)\n",
"print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)\n",
"\n",
"training_dir = 'outputs/training'\n",
"training_file = \"training.csv\"\n",
"\n",
"# Generate training dataframe to register as Training Dataset\n",
"os.makedirs(training_dir, exist_ok=True)\n",
"training_df = pd.merge(x_train.drop(label, axis=1), y_train, left_index=True, right_index=True)\n",
"training_df.to_csv(training_dir + \"/\" + training_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create/Register Training Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_name = \"dataset\"\n",
"name_suffix = datetime.utcnow().strftime(\"%Y-%m-%d-%H-%M-%S\")\n",
"snapshot_name = \"snapshot-{}\".format(name_suffix)\n",
"\n",
"dstore = ws.get_default_datastore()\n",
"dstore.upload(training_dir, \"data/training\", show_progress=True)\n",
"dpath = dstore.path(\"data/training/training.csv\")\n",
"trainingDataset = Dataset.auto_read_files(dpath, include_path=True)\n",
"trainingDataset = trainingDataset.register(workspace=ws, name=dataset_name, description=\"dset\", exist_ok=True)\n",
"\n",
"trainingDataSnapshot = trainingDataset.create_snapshot(snapshot_name=snapshot_name, compute_target=None, create_data_snapshot=True)\n",
"datasets = [(Dataset.Scenario.TRAINING, trainingDataSnapshot)]\n",
"print(\"dataset registration done.\\n\")\n",
"datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train and Save Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import lightgbm as lgb\n",
"\n",
"train = lgb.Dataset(data=x_train, \n",
" label=y_train)\n",
"\n",
"test = lgb.Dataset(data=x_test, \n",
" label=y_test,\n",
" reference=train)\n",
"\n",
"params = {'learning_rate' : 0.1,\n",
" 'boosting' : 'gbdt',\n",
" 'metric' : 'rmse',\n",
" 'feature_fraction' : 1,\n",
" 'bagging_fraction' : 1,\n",
" 'max_depth': 6,\n",
" 'num_leaves' : 31,\n",
" 'objective' : 'regression',\n",
" 'bagging_freq' : 1,\n",
" \"verbose\": -1,\n",
" 'min_data_per_leaf': 100}\n",
"\n",
"model = lgb.train(params, \n",
" num_boost_round=500,\n",
" train_set=train,\n",
" valid_sets=[train, test],\n",
" verbose_eval=50,\n",
" early_stopping_rounds=25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_file = 'outputs/{}.pkl'.format(model_name)\n",
"\n",
"os.makedirs('outputs', exist_ok=True)\n",
"joblib.dump(model, model_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Model.register(model_path=model_file,\n",
" model_name=model_name,\n",
" workspace=ws,\n",
" datasets=datasets)\n",
"\n",
"print(model_name, image_name, service_name, model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploy Model To AKS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare Environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn', 'joblib', 'lightgbm', 'pandas'],\n",
" pip_packages=['azureml-monitoring', 'azureml-sdk[automl]'])\n",
"\n",
"with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Image creation may take up to 15 minutes.\n",
"\n",
"image_name = image_name + str(model.version)\n",
"\n",
"if not image_name in ws.images:\n",
" # Use the score.py defined in this directory as the execution script\n",
" # NOTE: The Model Data Collector must be enabled in the execution script for DataDrift to run correctly\n",
" image_config = ContainerImage.image_configuration(execution_script=\"score.py\",\n",
" runtime=\"python\",\n",
" conda_file=\"myenv.yml\",\n",
" description=\"Image with weather dataset model\")\n",
" image = ContainerImage.create(name=image_name,\n",
" models=[model],\n",
" image_config=image_config,\n",
" workspace=ws)\n",
"\n",
" image.wait_for_creation(show_output=True)\n",
"else:\n",
" image = ws.images[image_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Compute Target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_name = 'dd-demo-e2e'\n",
"prov_config = AksCompute.provisioning_configuration()\n",
"\n",
"if not aks_name in ws.compute_targets:\n",
" aks_target = ComputeTarget.create(workspace=ws,\n",
" name=aks_name,\n",
" provisioning_configuration=prov_config)\n",
"\n",
" aks_target.wait_for_completion(show_output=True)\n",
" print(aks_target.provisioning_state)\n",
" print(aks_target.provisioning_errors)\n",
"else:\n",
" aks_target=ws.compute_targets[aks_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy Service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = service_name\n",
"\n",
"if not aks_service_name in ws.webservices:\n",
" aks_config = AksWebservice.deploy_configuration(collect_model_data=True, enable_app_insights=True)\n",
" aks_service = Webservice.deploy_from_image(workspace=ws,\n",
" name=aks_service_name,\n",
" image=image,\n",
" deployment_config=aks_config,\n",
" deployment_target=aks_target)\n",
" aks_service.wait_for_deployment(show_output=True)\n",
" print(aks_service.state)\n",
"else:\n",
" aks_service = ws.webservices[aks_service_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run DataDrift Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Send Scoring Data to Service"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download Scoring Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Score Model on March 15, 2016 data\n",
"scoring_df = get_noaa_data(datetime(2016, 3, 15) - timedelta(days=7), datetime(2016, 3, 16), columns, usaf_list)\n",
"# Add the window feature column\n",
"scoring_df = add_window_col(scoring_df)\n",
"\n",
"# Drop features not used by the model\n",
"print(\"Dropping unnecessary columns\")\n",
"scoring_df = scoring_df.drop(['windAngle', 'windSpeed', 'datetime', 'elevation'], axis=1).dropna()\n",
"scoring_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One Hot Encode the scoring dataset to match the training dataset schema\n",
"columns_dict = model.datasets[\"training\"][0].get_profile().columns\n",
"extra_cols = ('Path', 'Column1')\n",
"for k in extra_cols:\n",
" columns_dict.pop(k, None)\n",
"training_columns = list(columns_dict.keys())\n",
"\n",
"categorical_columns = scoring_df.dtypes == object\n",
"categorical_columns = categorical_columns[categorical_columns == True]\n",
"\n",
"test_df = pd.get_dummies(scoring_df[categorical_columns.keys().tolist()])\n",
"encoded_df = scoring_df.join(test_df)\n",
"\n",
"# Populate missing OHE columns with 0 values to match traning dataset schema\n",
"difference = list(set(training_columns) - set(encoded_df.columns.tolist()))\n",
"for col in difference:\n",
" encoded_df[col] = 0\n",
"encoded_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Serialize dataframe to list of row dictionaries\n",
"encoded_dict = encoded_df.to_dict('records')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit Scoring Data to Service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"# retreive the API keys. AML generates two keys.\n",
"key1, key2 = aks_service.get_keys()\n",
"\n",
"total_count = len(scoring_df)\n",
"i = 0\n",
"load = []\n",
"for row in encoded_dict:\n",
" load.append(row)\n",
" i = i + 1\n",
" if i % 100 == 0:\n",
" payload = json.dumps({\"data\": load})\n",
" \n",
" # construct raw HTTP request and send to the service\n",
" payload_binary = bytes(payload,encoding = 'utf8')\n",
" headers = {'Content-Type':'application/json', 'Authorization': 'Bearer ' + key1}\n",
" resp = requests.post(aks_service.scoring_uri, payload_binary, headers=headers)\n",
" \n",
" print(\"prediction:\", resp.content, \"Progress: {}/{}\".format(i, total_count)) \n",
"\n",
" load = []\n",
" time.sleep(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure DataDrift"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"services = [service_name]\n",
"start = datetime.now() - timedelta(days=2)\n",
"end = datetime(year=2020, month=1, day=22, hour=15, minute=16)\n",
"feature_list = ['usaf', 'wban', 'latitude', 'longitude', 'station_name', 'p_k', 'sine_hourofday', 'cosine_hourofday', 'temperature-7']\n",
"alert_config = AlertConfiguration([email_address]) if email_address else None\n",
"\n",
"# there will be an exception indicating using get() method if DataDrift object already exist\n",
"try:\n",
" datadrift = DataDriftDetector.create(ws, model.name, model.version, services, frequency=\"Day\", alert_config=alert_config)\n",
"except KeyError:\n",
" datadrift = DataDriftDetector.get(ws, model.name, model.version)\n",
" \n",
"print(\"Details of DataDrift Object:\\n{}\".format(datadrift))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run an Adhoc DataDriftDetector Run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target_date = datetime.today()\n",
"run = datadrift.run(target_date, services, feature_list=feature_list, create_compute_target=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"exp = Experiment(ws, datadrift._id)\n",
"dd_run = Run(experiment=exp, run_id=run)\n",
"RunDetails(dd_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Drift Analysis Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(dd_run.get_children())\n",
"for child in children:\n",
" child.wait_for_completion()\n",
"\n",
"drift_metrics = datadrift.get_output(start_time=start, end_time=end)\n",
"drift_metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show all drift figures, one per serivice.\n",
"# If setting with_details is False (by default), only drift will be shown; if it's True, all details will be shown.\n",
"\n",
"drift_figures = datadrift.show(with_details=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enable DataDrift Schedule"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"datadrift.enable_schedule()"
]
}
],
"metadata": {
"authors": [
{
"name": "rafarmah"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,3 @@
## Using data drift APIs
1. [Detect data drift for a model](azure-ml-datadrift.ipynb): Detect data drift for a deployed model.

View File

@@ -0,0 +1,58 @@
import pickle
import json
import numpy
import azureml.train.automl
from sklearn.externals import joblib
from sklearn.linear_model import Ridge
from azureml.core.model import Model
from azureml.core.run import Run
from azureml.monitoring import ModelDataCollector
import time
import pandas as pd
def init():
global model, inputs_dc, prediction_dc, feature_names, categorical_features
print("Model is initialized" + time.strftime("%H:%M:%S"))
model_path = Model.get_model_path(model_name="driftmodel")
model = joblib.load(model_path)
feature_names = ["usaf", "wban", "latitude", "longitude", "station_name", "p_k",
"sine_weekofyear", "cosine_weekofyear", "sine_hourofday", "cosine_hourofday",
"temperature-7"]
categorical_features = ["usaf", "wban", "p_k", "station_name"]
inputs_dc = ModelDataCollector(model_name="driftmodel",
identifier="inputs",
feature_names=feature_names)
prediction_dc = ModelDataCollector("driftmodel",
identifier="predictions",
feature_names=["temperature"])
def run(raw_data):
global inputs_dc, prediction_dc
try:
data = json.loads(raw_data)["data"]
data = pd.DataFrame(data)
# Remove the categorical features as the model expects OHE values
input_data = data.drop(categorical_features, axis=1)
result = model.predict(input_data)
# Collect the non-OHE dataframe
collected_df = data[feature_names]
inputs_dc.collect(collected_df.values)
prediction_dc.collect(result)
return result.tolist()
except Exception as e:
error = str(e)
print(error + time.strftime("%H:%M:%S"))
return error

View File

@@ -0,0 +1,4 @@
name: model-register-and-deploy
dependencies:
- pip:
- azureml-sdk

View File

@@ -1,492 +1,494 @@
{ {
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n", "Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n", "\n",
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Azure ML Hardware Accelerated Object Detection" "# Azure ML Hardware Accelerated Object Detection"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"This tutorial will show you how to deploy an object detection service based on the SSD-VGG model in just a few minutes using the Azure Machine Learning Accelerated AI service.\n", "This tutorial will show you how to deploy an object detection service based on the SSD-VGG model in just a few minutes using the Azure Machine Learning Accelerated AI service.\n",
"\n", "\n",
"We will use the SSD-VGG model accelerated on an FPGA. Our Accelerated Models Service handles translating deep neural networks (DNN) into an FPGA program.\n", "We will use the SSD-VGG model accelerated on an FPGA. Our Accelerated Models Service handles translating deep neural networks (DNN) into an FPGA program.\n",
"\n", "\n",
"The steps in this notebook are: \n", "The steps in this notebook are: \n",
"1. [Setup Environment](#set-up-environment)\n", "1. [Setup Environment](#set-up-environment)\n",
"* [Construct Model](#construct-model)\n", "* [Construct Model](#construct-model)\n",
" * Image Preprocessing\n", " * Image Preprocessing\n",
" * Featurizer\n", " * Featurizer\n",
" * Save Model\n", " * Save Model\n",
" * Save input and output tensor names\n", " * Save input and output tensor names\n",
"* [Create Image](#create-image)\n", "* [Create Image](#create-image)\n",
"* [Deploy Image](#deploy-image)\n", "* [Deploy Image](#deploy-image)\n",
"* [Test the Service](#test-service)\n", "* [Test the Service](#test-service)\n",
" * Create Client\n", " * Create Client\n",
" * Serve the model\n", " * Serve the model\n",
"* [Cleanup](#cleanup)" "* [Cleanup](#cleanup)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"<a id=\"set-up-environment\"></a>\n", "<a id=\"set-up-environment\"></a>\n",
"## 1. Set up Environment\n", "## 1. Set up Environment\n",
"### 1.a. Imports" "### 1.a. Imports"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n", "import os\n",
"import tensorflow as tf" "import tensorflow as tf"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### 1.b. Retrieve Workspace\n", "### 1.b. Retrieve Workspace\n",
"If you haven't created a Workspace, please follow [this notebook](\"../../../configuration.ipynb\") to do so. If you have, run the codeblock below to retrieve it. " "If you haven't created a Workspace, please follow [this notebook](\"../../../configuration.ipynb\") to do so. If you have, run the codeblock below to retrieve it. "
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core import Workspace\n", "from azureml.core import Workspace\n",
"\n", "\n",
"ws = Workspace.from_config()\n", "ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')" "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"<a id=\"construct-model\"></a>\n", "<a id=\"construct-model\"></a>\n",
"## 2. Construct model\n", "## 2. Construct model\n",
"### 2.a. Image preprocessing\n", "### 2.a. Image preprocessing\n",
"We'd like our service to accept JPEG images as input. However the input to SSD-VGG is a float tensor of shape \\[1, 300, 300, 3\\]. The first dimension is batch, then height, width, and channels (i.e. NHWC). To bridge this gap, we need code that decodes JPEG images and resizes them appropriately for input to SSD-VGG. The Accelerated AI service can execute TensorFlow graphs as part of the service and we'll use that ability to do the image preprocessing. This code defines a TensorFlow graph that preprocesses an array of JPEG images (as TensorFlow strings) and produces a tensor that is ready to be featurized by SSD-VGG.\n", "We'd like our service to accept JPEG images as input. However the input to SSD-VGG is a float tensor of shape \\[1, 300, 300, 3\\]. The first dimension is batch, then height, width, and channels (i.e. NHWC). To bridge this gap, we need code that decodes JPEG images and resizes them appropriately for input to SSD-VGG. The Accelerated AI service can execute TensorFlow graphs as part of the service and we'll use that ability to do the image preprocessing. This code defines a TensorFlow graph that preprocesses an array of JPEG images (as TensorFlow strings) and produces a tensor that is ready to be featurized by SSD-VGG.\n",
"\n", "\n",
"**Note:** Expect to see TF deprecation warnings until we port our SDK over to use Tensorflow 2.0." "**Note:** Expect to see TF deprecation warnings until we port our SDK over to use Tensorflow 2.0."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Input images as a two-dimensional tensor containing an arbitrary number of images represented a strings\n", "# Input images as a two-dimensional tensor containing an arbitrary number of images represented a strings\n",
"import azureml.accel.models.utils as utils\n", "import azureml.accel.models.utils as utils\n",
"tf.reset_default_graph()\n", "tf.reset_default_graph()\n",
"\n", "\n",
"in_images = tf.placeholder(tf.string)\n", "in_images = tf.placeholder(tf.string)\n",
"image_tensors = utils.preprocess_array(in_images, output_width=300, output_height=300, preserve_aspect_ratio=False)\n", "image_tensors = utils.preprocess_array(in_images, output_width=300, output_height=300, preserve_aspect_ratio=False)\n",
"print(image_tensors.shape)" "print(image_tensors.shape)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### 2.b. Featurizer\n", "### 2.b. Featurizer\n",
"The SSD-VGG model is different from our other models in that it generates 12 tensor outputs. These corresponds to x,y displacements of the anchor boxes and the detection confidence (for 21 classes). Because these outputs are not convenient to work with, we will later use a pre-defined post-processing utility to transform the outputs into a simplified list of bounding boxes with their respective class and confidence.\n", "The SSD-VGG model is different from our other models in that it generates 12 tensor outputs. These corresponds to x,y displacements of the anchor boxes and the detection confidence (for 21 classes). Because these outputs are not convenient to work with, we will later use a pre-defined post-processing utility to transform the outputs into a simplified list of bounding boxes with their respective class and confidence.\n",
"\n", "\n",
"For more information about the output tensors, take this example: the output tensor 'ssd_300_vgg/block4_box/Reshape_1:0' has a shape of [None, 37, 37, 4, 21]. This gives the pre-softmax confidence for 4 anchor boxes situated at each site of a 37 x 37 grid imposed on the image, one confidence score for each of the 21 classes. The first dimension is the batch dimension. Likewise, 'ssd_300_vgg/block4_box/Reshape:0' has shape [None, 37, 37, 4, 4] and encodes the (cx, cy) center shift and rescaling (sw, sh) relative to each anchor box. Refer to the [SSD-VGG paper](https://arxiv.org/abs/1512.02325) to understand how these are computed. The other 10 tensors are defined similarly." "For more information about the output tensors, take this example: the output tensor 'ssd_300_vgg/block4_box/Reshape_1:0' has a shape of [None, 37, 37, 4, 21]. This gives the pre-softmax confidence for 4 anchor boxes situated at each site of a 37 x 37 grid imposed on the image, one confidence score for each of the 21 classes. The first dimension is the batch dimension. Likewise, 'ssd_300_vgg/block4_box/Reshape:0' has shape [None, 37, 37, 4, 4] and encodes the (cx, cy) center shift and rescaling (sw, sh) relative to each anchor box. Refer to the [SSD-VGG paper](https://arxiv.org/abs/1512.02325) to understand how these are computed. The other 10 tensors are defined similarly."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.accel.models import SsdVgg\n", "from azureml.accel.models import SsdVgg\n",
"\n", "\n",
"saved_model_dir = os.path.join(os.path.expanduser('~'), 'models')\n", "saved_model_dir = os.path.join(os.path.expanduser('~'), 'models')\n",
"model_graph = SsdVgg(saved_model_dir, is_frozen = True)\n", "model_graph = SsdVgg(saved_model_dir, is_frozen = True)\n",
"\n", "\n",
"print('SSD-VGG Input Tensors:')\n", "print('SSD-VGG Input Tensors:')\n",
"for idx, input_name in enumerate(model_graph.input_tensor_list):\n", "for idx, input_name in enumerate(model_graph.input_tensor_list):\n",
" print('{}, {}'.format(input_name, model_graph.get_input_dims(idx)))\n", " print('{}, {}'.format(input_name, model_graph.get_input_dims(idx)))\n",
" \n", " \n",
"print('SSD-VGG Output Tensors:')\n", "print('SSD-VGG Output Tensors:')\n",
"for idx, output_name in enumerate(model_graph.output_tensor_list):\n", "for idx, output_name in enumerate(model_graph.output_tensor_list):\n",
" print('{}, {}'.format(output_name, model_graph.get_output_dims(idx)))\n", " print('{}, {}'.format(output_name, model_graph.get_output_dims(idx)))\n",
"\n", "\n",
"ssd_outputs = model_graph.import_graph_def(image_tensors, is_training=False)" "ssd_outputs = model_graph.import_graph_def(image_tensors, is_training=False)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### 2.c. Save Model\n", "### 2.c. Save Model\n",
"Now that we loaded both parts of the tensorflow graph (preprocessor and SSD-VGG featurizer), we can save the graph and associated variables to a directory which we can register as an Azure ML Model." "Now that we loaded both parts of the tensorflow graph (preprocessor and SSD-VGG featurizer), we can save the graph and associated variables to a directory which we can register as an Azure ML Model."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"model_name = \"ssdvgg\"\n", "model_name = \"ssdvgg\"\n",
"model_save_path = os.path.join(saved_model_dir, model_name, \"saved_model\")\n", "model_save_path = os.path.join(saved_model_dir, model_name, \"saved_model\")\n",
"print(\"Saving model in {}\".format(model_save_path))\n", "print(\"Saving model in {}\".format(model_save_path))\n",
"\n", "\n",
"output_map = {}\n", "output_map = {}\n",
"for i, output in enumerate(ssd_outputs):\n", "for i, output in enumerate(ssd_outputs):\n",
" output_map['out_{}'.format(i)] = output\n", " output_map['out_{}'.format(i)] = output\n",
"\n", "\n",
"with tf.Session() as sess:\n", "with tf.Session() as sess:\n",
" model_graph.restore_weights(sess)\n", " model_graph.restore_weights(sess)\n",
" tf.saved_model.simple_save(sess, \n", " tf.saved_model.simple_save(sess, \n",
" model_save_path, \n", " model_save_path, \n",
" inputs={'images': in_images}, \n", " inputs={'images': in_images}, \n",
" outputs=output_map)" " outputs=output_map)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### 2.d. Important! Save names of input and output tensors\n", "### 2.d. Important! Save names of input and output tensors\n",
"\n", "\n",
"These input and output tensors that were created during the preprocessing and classifier steps are also going to be used when **converting the model** to an Accelerated Model that can run on FPGA's and for **making an inferencing request**. It is very important to save this information!" "These input and output tensors that were created during the preprocessing and classifier steps are also going to be used when **converting the model** to an Accelerated Model that can run on FPGA's and for **making an inferencing request**. It is very important to save this information!"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"tags": [ "tags": [
"register model from file" "register model from file"
] ]
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"input_tensors = in_images.name\n", "input_tensors = in_images.name\n",
"# We will use the list of output tensors during inferencing\n", "# We will use the list of output tensors during inferencing\n",
"output_tensors = [output.name for output in ssd_outputs]\n", "output_tensors = [output.name for output in ssd_outputs]\n",
"# However, for multiple output tensors, our AccelOnnxConverter will \n", "# However, for multiple output tensors, our AccelOnnxConverter will \n",
"# accept comma-delimited strings (lists will cause error)\n", "# accept comma-delimited strings (lists will cause error)\n",
"output_tensors_str = \",\".join(output_tensors)\n", "output_tensors_str = \",\".join(output_tensors)\n",
"\n", "\n",
"print(input_tensors)\n", "print(input_tensors)\n",
"print(output_tensors)" "print(output_tensors)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"<a id=\"create-image\"></a>\n", "<a id=\"create-image\"></a>\n",
"## 3. Create AccelContainerImage\n", "## 3. Create AccelContainerImage\n",
"Below we will execute all the same steps as in the [Quickstart](./accelerated-models-quickstart.ipynb#create-image) to package the model we have saved locally into an accelerated Docker image saved in our workspace. To complete all the steps, it may take a few minutes. For more details on each step, check out the [Quickstart section on model registration](./accelerated-models-quickstart.ipynb#register-model)." "Below we will execute all the same steps as in the [Quickstart](./accelerated-models-quickstart.ipynb#create-image) to package the model we have saved locally into an accelerated Docker image saved in our workspace. To complete all the steps, it may take a few minutes. For more details on each step, check out the [Quickstart section on model registration](./accelerated-models-quickstart.ipynb#register-model)."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core import Workspace\n", "from azureml.core import Workspace\n",
"from azureml.core.model import Model\n", "from azureml.core.model import Model\n",
"from azureml.core.image import Image\n", "from azureml.core.image import Image\n",
"from azureml.accel import AccelOnnxConverter\n", "from azureml.accel import AccelOnnxConverter\n",
"from azureml.accel import AccelContainerImage\n", "from azureml.accel import AccelContainerImage\n",
"\n", "\n",
"# Retrieve workspace\n", "# Retrieve workspace\n",
"ws = Workspace.from_config()\n", "ws = Workspace.from_config()\n",
"print(\"Successfully retrieved workspace:\", ws.name, ws.resource_group, ws.location, ws.subscription_id, '\\n')\n", "print(\"Successfully retrieved workspace:\", ws.name, ws.resource_group, ws.location, ws.subscription_id, '\\n')\n",
"\n", "\n",
"# Register model\n", "# Register model\n",
"registered_model = Model.register(workspace = ws,\n", "registered_model = Model.register(workspace = ws,\n",
" model_path = model_save_path,\n", " model_path = model_save_path,\n",
" model_name = model_name)\n", " model_name = model_name)\n",
"print(\"Successfully registered: \", registered_model.name, registered_model.description, registered_model.version, '\\n', sep = '\\t')\n", "print(\"Successfully registered: \", registered_model.name, registered_model.description, registered_model.version, '\\n', sep = '\\t')\n",
"\n", "\n",
"# Convert model\n", "# Convert model\n",
"convert_request = AccelOnnxConverter.convert_tf_model(ws, registered_model, input_tensors, output_tensors_str)\n", "convert_request = AccelOnnxConverter.convert_tf_model(ws, registered_model, input_tensors, output_tensors_str)\n",
"# If it fails, you can run wait_for_completion again with show_output=True.\n", "# If it fails, you can run wait_for_completion again with show_output=True.\n",
"convert_request.wait_for_completion(show_output=False)\n", "convert_request.wait_for_completion(show_output=False)\n",
"converted_model = convert_request.result\n", "converted_model = convert_request.result\n",
"print(\"\\nSuccessfully converted: \", converted_model.name, converted_model.url, converted_model.version, \n", "print(\"\\nSuccessfully converted: \", converted_model.name, converted_model.url, converted_model.version, \n",
" converted_model.id, converted_model.created_time, '\\n')\n", " converted_model.id, converted_model.created_time, '\\n')\n",
"\n", "\n",
"# Package into AccelContainerImage\n", "# Package into AccelContainerImage\n",
"image_config = AccelContainerImage.image_configuration()\n", "image_config = AccelContainerImage.image_configuration()\n",
"# Image name must be lowercase\n", "# Image name must be lowercase\n",
"image_name = \"{}-image\".format(model_name)\n", "image_name = \"{}-image\".format(model_name)\n",
"image = Image.create(name = image_name,\n", "image = Image.create(name = image_name,\n",
" models = [converted_model],\n", " models = [converted_model],\n",
" image_config = image_config, \n", " image_config = image_config, \n",
" workspace = ws)\n", " workspace = ws)\n",
"image.wait_for_creation()\n", "image.wait_for_creation()\n",
"print(\"Created AccelContainerImage: {} {} {}\\n\".format(image.name, image.creation_state, image.image_location))" "print(\"Created AccelContainerImage: {} {} {}\\n\".format(image.name, image.creation_state, image.image_location))"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"<a id=\"deploy-image\"></a>\n", "<a id=\"deploy-image\"></a>\n",
"## 4. Deploy image\n", "## 4. Deploy image\n",
"Once you have an Azure ML Accelerated Image in your Workspace, you can deploy it to two destinations, to a Databox Edge machine or to an AKS cluster. \n", "Once you have an Azure ML Accelerated Image in your Workspace, you can deploy it to two destinations, to a Databox Edge machine or to an AKS cluster. \n",
"\n", "\n",
"### 4.a. Deploy to Databox Edge Machine using IoT Hub\n", "### 4.a. Deploy to Databox Edge Machine using IoT Hub\n",
"See the sample [here](https://github.com/Azure-Samples/aml-real-time-ai/) for using the Azure IoT CLI extension for deploying your Docker image to your Databox Edge Machine.\n", "See the sample [here](https://github.com/Azure-Samples/aml-real-time-ai/) for using the Azure IoT CLI extension for deploying your Docker image to your Databox Edge Machine.\n",
"\n", "\n",
"### 4.b. Deploy to AKS Cluster\n", "### 4.b. Deploy to AKS Cluster\n",
"Same as in the [Quickstart section on image deployment](./accelerated-models-quickstart.ipynb#deploy-image), we are going to create an AKS cluster with FPGA-enabled machines, then deploy our service to it.\n", "Same as in the [Quickstart section on image deployment](./accelerated-models-quickstart.ipynb#deploy-image), we are going to create an AKS cluster with FPGA-enabled machines, then deploy our service to it.\n",
"#### Create AKS ComputeTarget" "#### Create AKS ComputeTarget"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core.compute import AksCompute, ComputeTarget\n", "from azureml.core.compute import AksCompute, ComputeTarget\n",
"\n", "\n",
"# Uses the specific FPGA enabled VM (sku: Standard_PB6s)\n", "# Uses the specific FPGA enabled VM (sku: Standard_PB6s)\n",
"# Standard_PB6s are available in: eastus, westus2, westeurope, southeastasia\n", "# Standard_PB6s are available in: eastus, westus2, westeurope, southeastasia\n",
"prov_config = AksCompute.provisioning_configuration(vm_size = \"Standard_PB6s\",\n", "prov_config = AksCompute.provisioning_configuration(vm_size = \"Standard_PB6s\",\n",
" agent_count = 1, \n", " agent_count = 1, \n",
" location = \"eastus\")\n", " location = \"eastus\")\n",
"\n", "\n",
"aks_name = 'aks-pb6-obj'\n", "aks_name = 'aks-pb6-obj'\n",
"# Create the cluster\n", "# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace = ws, \n", "aks_target = ComputeTarget.create(workspace = ws, \n",
" name = aks_name, \n", " name = aks_name, \n",
" provisioning_configuration = prov_config)" " provisioning_configuration = prov_config)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Provisioning an AKS cluster might take awhile (15 or so minutes), and we want to wait until it's successfully provisioned before we can deploy a service to it. If you interrupt this cell, provisioning of the cluster will continue. You can re-run it or check the status in your Workspace under Compute." "Provisioning an AKS cluster might take awhile (15 or so minutes), and we want to wait until it's successfully provisioned before we can deploy a service to it. If you interrupt this cell, provisioning of the cluster will continue. You can re-run it or check the status in your Workspace under Compute."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"aks_target.wait_for_completion(show_output = True)\n", "aks_target.wait_for_completion(show_output = True)\n",
"print(aks_target.provisioning_state)\n", "print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)" "print(aks_target.provisioning_errors)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Deploy AccelContainerImage to AKS ComputeTarget" "#### Deploy AccelContainerImage to AKS ComputeTarget"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core.webservice import Webservice, AksWebservice\n", "from azureml.core.webservice import Webservice, AksWebservice\n",
"\n", "\n",
"# Set the web service configuration (for creating a test service, we don't want autoscale enabled)\n", "# Set the web service configuration (for creating a test service, we don't want autoscale enabled)\n",
"# Authentication is enabled by default, but for testing we specify False\n", "# Authentication is enabled by default, but for testing we specify False\n",
"aks_config = AksWebservice.deploy_configuration(autoscale_enabled=False,\n", "aks_config = AksWebservice.deploy_configuration(autoscale_enabled=False,\n",
" num_replicas=1,\n", " num_replicas=1,\n",
" auth_enabled = False)\n", " auth_enabled = False)\n",
"\n", "\n",
"aks_service_name ='my-aks-service'\n", "aks_service_name ='my-aks-service'\n",
"\n", "\n",
"aks_service = Webservice.deploy_from_image(workspace = ws,\n", "aks_service = Webservice.deploy_from_image(workspace = ws,\n",
" name = aks_service_name,\n", " name = aks_service_name,\n",
" image = image,\n", " image = image,\n",
" deployment_config = aks_config,\n", " deployment_config = aks_config,\n",
" deployment_target = aks_target)\n", " deployment_target = aks_target)\n",
"aks_service.wait_for_deployment(show_output = True)" "aks_service.wait_for_deployment(show_output = True)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"<a id=\"test-service\"></a>\n", "<a id=\"test-service\"></a>\n",
"## 5. Test the service\n", "## 5. Test the service\n",
"<a id=\"create-client\"></a>\n", "<a id=\"create-client\"></a>\n",
"### 5.a. Create Client\n", "### 5.a. Create Client\n",
"The image supports gRPC and the TensorFlow Serving \"predict\" API. We have a client that can call into the docker image to get predictions. \n", "The image supports gRPC and the TensorFlow Serving \"predict\" API. We have a client that can call into the docker image to get predictions. \n",
"\n", "\n",
"**Note:** If you chose to use auth_enabled=True when creating your AksWebservice.deploy_configuration(), see documentation [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice(class)?view=azure-ml-py#get-keys--) on how to retrieve your keys and use either key as an argument to PredictionClient(...,access_token=key)." "**Note:** If you chose to use auth_enabled=True when creating your AksWebservice.deploy_configuration(), see documentation [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice(class)?view=azure-ml-py#get-keys--) on how to retrieve your keys and use either key as an argument to PredictionClient(...,access_token=key).",
] "\n",
}, "**WARNING:** If you are running on Azure Notebooks free compute, you will not be able to make outgoing calls to your service. Try locating your client on a different machine to consume it."
{ ]
"cell_type": "code", },
"execution_count": null, {
"metadata": {}, "cell_type": "code",
"outputs": [], "execution_count": null,
"source": [ "metadata": {},
"# Using the grpc client in AzureML Accelerated Models SDK\n", "outputs": [],
"from azureml.accel.client import PredictionClient\n", "source": [
"\n", "# Using the grpc client in AzureML Accelerated Models SDK\n",
"address = aks_service.scoring_uri\n", "from azureml.accel.client import PredictionClient\n",
"ssl_enabled = address.startswith(\"https\")\n", "\n",
"address = address[address.find('/')+2:].strip('/')\n", "address = aks_service.scoring_uri\n",
"port = 443 if ssl_enabled else 80\n", "ssl_enabled = address.startswith(\"https\")\n",
"\n", "address = address[address.find('/')+2:].strip('/')\n",
"# Initialize AzureML Accelerated Models client\n", "port = 443 if ssl_enabled else 80\n",
"client = PredictionClient(address=address,\n", "\n",
" port=port,\n", "# Initialize AzureML Accelerated Models client\n",
" use_ssl=ssl_enabled,\n", "client = PredictionClient(address=address,\n",
" service_name=aks_service.name)" " port=port,\n",
] " use_ssl=ssl_enabled,\n",
}, " service_name=aks_service.name)"
{ ]
"cell_type": "markdown", },
"metadata": {}, {
"source": [ "cell_type": "markdown",
"You can adapt the client [code](https://github.com/Azure/aml-real-time-ai/blob/master/pythonlib/amlrealtimeai/client.py) to meet your needs. There is also an example C# [client](https://github.com/Azure/aml-real-time-ai/blob/master/sample-clients/csharp).\n", "metadata": {},
"\n", "source": [
"The service provides an API that is compatible with TensorFlow Serving. There are instructions to download a sample client [here](https://www.tensorflow.org/serving/setup)." "You can adapt the client [code](https://github.com/Azure/aml-real-time-ai/blob/master/pythonlib/amlrealtimeai/client.py) to meet your needs. There is also an example C# [client](https://github.com/Azure/aml-real-time-ai/blob/master/sample-clients/csharp).\n",
] "\n",
}, "The service provides an API that is compatible with TensorFlow Serving. There are instructions to download a sample client [here](https://www.tensorflow.org/serving/setup)."
{ ]
"cell_type": "markdown", },
"metadata": {}, {
"source": [ "cell_type": "markdown",
"<a id=\"serve-model\"></a>\n", "metadata": {},
"### 5.b. Serve the model\n", "source": [
"The SSD-VGG model returns the confidence and bounding boxes for all possible anchor boxes. As mentioned earlier, we will use a post-processing routine to transform this into a list of bounding boxes (y1, x1, y2, x2) where x, y are fractional coordinates measured from left and top respectively. A respective list of classes and scores is also returned to tag each bounding box. Below we make use of this information to draw the bounding boxes on top the original image. Note that in the post-processing routine we select a confidence threshold of 0.5." "<a id=\"serve-model\"></a>\n",
] "### 5.b. Serve the model\n",
}, "The SSD-VGG model returns the confidence and bounding boxes for all possible anchor boxes. As mentioned earlier, we will use a post-processing routine to transform this into a list of bounding boxes (y1, x1, y2, x2) where x, y are fractional coordinates measured from left and top respectively. A respective list of classes and scores is also returned to tag each bounding box. Below we make use of this information to draw the bounding boxes on top the original image. Note that in the post-processing routine we select a confidence threshold of 0.5."
{ ]
"cell_type": "code", },
"execution_count": null, {
"metadata": {}, "cell_type": "code",
"outputs": [], "execution_count": null,
"source": [ "metadata": {},
"import cv2\n", "outputs": [],
"from matplotlib import pyplot as plt\n", "source": [
"\n", "import cv2\n",
"colors_tableau = [(255, 255, 255), (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),\n", "from matplotlib import pyplot as plt\n",
" (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),\n", "\n",
" (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),\n", "colors_tableau = [(255, 255, 255), (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),\n",
" (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),\n", " (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),\n",
" (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]\n", " (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),\n",
"\n", " (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),\n",
"\n", " (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]\n",
"def draw_boxes_on_img(img, classes, scores, bboxes, thickness=2):\n", "\n",
" shape = img.shape\n", "\n",
" for i in range(bboxes.shape[0]):\n", "def draw_boxes_on_img(img, classes, scores, bboxes, thickness=2):\n",
" bbox = bboxes[i]\n", " shape = img.shape\n",
" color = colors_tableau[classes[i]]\n", " for i in range(bboxes.shape[0]):\n",
" # Draw bounding box...\n", " bbox = bboxes[i]\n",
" p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))\n", " color = colors_tableau[classes[i]]\n",
" p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))\n", " # Draw bounding box...\n",
" cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness)\n", " p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))\n",
" # Draw text...\n", " p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))\n",
" s = '%s/%.3f' % (classes[i], scores[i])\n", " cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness)\n",
" p1 = (p1[0]-5, p1[1])\n", " # Draw text...\n",
" cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1)" " s = '%s/%.3f' % (classes[i], scores[i])\n",
] " p1 = (p1[0]-5, p1[1])\n",
}, " cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1)"
{ ]
"cell_type": "code", },
"execution_count": null, {
"metadata": {}, "cell_type": "code",
"outputs": [], "execution_count": null,
"source": [ "metadata": {},
"import azureml.accel._external.ssdvgg_utils as ssdvgg_utils\n", "outputs": [],
"\n", "source": [
"result = client.score_file(path=\"meeting.jpg\", input_name=input_tensors, outputs=output_tensors)\n", "import azureml.accel._external.ssdvgg_utils as ssdvgg_utils\n",
"classes, scores, bboxes = ssdvgg_utils.postprocess(result, select_threshold=0.5)\n", "\n",
"\n", "result = client.score_file(path=\"meeting.jpg\", input_name=input_tensors, outputs=output_tensors)\n",
"img = cv2.imread('meeting.jpg', 1)\n", "classes, scores, bboxes = ssdvgg_utils.postprocess(result, select_threshold=0.5)\n",
"img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n", "\n",
"draw_boxes_on_img(img, classes, scores, bboxes)\n", "img = cv2.imread('meeting.jpg', 1)\n",
"plt.imshow(img)" "img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
] "draw_boxes_on_img(img, classes, scores, bboxes)\n",
}, "plt.imshow(img)"
{ ]
"cell_type": "markdown", },
"metadata": {}, {
"source": [ "cell_type": "markdown",
"<a id=\"cleanup\"></a>\n", "metadata": {},
"## 6. Cleanup\n", "source": [
"It's important to clean up your resources, so that you won't incur unnecessary costs. In the [next notebook](./accelerated-models-training.ipynb) you will learn how to train a classfier on a new dataset using transfer learning." "<a id=\"cleanup\"></a>\n",
] "## 6. Cleanup\n",
}, "It's important to clean up your resources, so that you won't incur unnecessary costs. In the [next notebook](./accelerated-models-training.ipynb) you will learn how to train a classfier on a new dataset using transfer learning."
{ ]
"cell_type": "code", },
"execution_count": null, {
"metadata": {}, "cell_type": "code",
"outputs": [], "execution_count": null,
"source": [ "metadata": {},
"aks_service.delete()\n", "outputs": [],
"aks_target.delete()\n", "source": [
"image.delete()\n", "aks_service.delete()\n",
"registered_model.delete()\n", "aks_target.delete()\n",
"converted_model.delete()" "image.delete()\n",
] "registered_model.delete()\n",
} "converted_model.delete()"
], ]
"metadata": { }
"authors": [
{
"name": "coverste"
},
{
"name": "paledger"
},
{
"name": "sukha"
}
], ],
"kernelspec": { "metadata": {
"display_name": "Python 3.6", "authors": [
"language": "python", {
"name": "python36" "name": "coverste"
},
{
"name": "paledger"
},
{
"name": "sukha"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
}, },
"language_info": { "nbformat": 4,
"codemirror_mode": { "nbformat_minor": 2
"name": "ipython", }
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,4 @@
name: enable-app-insights-in-production-service
dependencies:
- pip:
- azureml-sdk

View File

@@ -0,0 +1,4 @@
name: enable-data-collection-for-models-in-aks
dependencies:
- pip:
- azureml-sdk

View File

@@ -0,0 +1,6 @@
name: onnx-convert-aml-deploy-tinyyolo
dependencies:
- pip:
- azureml-sdk
- git+https://github.com/apple/coremltools
- onnxmltools==1.3.1

View File

@@ -0,0 +1,9 @@
name: onnx-inference-facial-expression-recognition-deploy
dependencies:
- pip:
- azureml-sdk
- azureml-widgets
- matplotlib
- numpy
- onnx
- opencv-python

View File

@@ -0,0 +1,9 @@
name: onnx-inference-mnist-deploy
dependencies:
- pip:
- azureml-sdk
- azureml-widgets
- matplotlib
- numpy
- onnx
- opencv-python

View File

@@ -0,0 +1,4 @@
name: onnx-modelzoo-aml-deploy-resnet50
dependencies:
- pip:
- azureml-sdk

View File

@@ -98,7 +98,7 @@
"from azureml.core.compute_target import ComputeTargetException\n", "from azureml.core.compute_target import ComputeTargetException\n",
"\n", "\n",
"# choose a name for your cluster\n", "# choose a name for your cluster\n",
"cluster_name = \"gpucluster\"\n", "cluster_name = \"gpu-cluster\"\n",
"\n", "\n",
"try:\n", "try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",

View File

@@ -0,0 +1,5 @@
name: onnx-train-pytorch-aml-deploy-mnist
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -1,407 +1,407 @@
{ {
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n", "Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n", "\n",
"Licensed under the MIT License." "Licensed under the MIT License."
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Deploying a web service to Azure Kubernetes Service (AKS)\n", "# Deploying a web service to Azure Kubernetes Service (AKS)\n",
"This notebook shows the steps for deploying a service: registering a model, creating an image, provisioning a cluster (one time action), and deploying a service to it. \n", "This notebook shows the steps for deploying a service: registering a model, creating an image, provisioning a cluster (one time action), and deploying a service to it. \n",
"We then test and delete the service, image and model." "We then test and delete the service, image and model."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core import Workspace\n", "from azureml.core import Workspace\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n", "from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.webservice import Webservice, AksWebservice\n", "from azureml.core.webservice import Webservice, AksWebservice\n",
"from azureml.core.image import Image\n", "from azureml.core.image import Image\n",
"from azureml.core.model import Model" "from azureml.core.model import Model"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import azureml.core\n", "import azureml.core\n",
"print(azureml.core.VERSION)" "print(azureml.core.VERSION)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Get workspace\n", "# Get workspace\n",
"Load existing workspace from the config file info." "Load existing workspace from the config file info."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core.workspace import Workspace\n", "from azureml.core.workspace import Workspace\n",
"\n", "\n",
"ws = Workspace.from_config()\n", "ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')" "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Register the model\n", "# Register the model\n",
"Register an existing trained model, add descirption and tags. Prior to registering the model, you should have a TensorFlow [Saved Model](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) in the `resnet50` directory. You can download a [pretrained resnet50](https://github.com/tensorflow/models/tree/master/official/resnet#pre-trained-model) and unpack it to that directory." "Register an existing trained model, add descirption and tags. Prior to registering the model, you should have a TensorFlow [Saved Model](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) in the `resnet50` directory. You can download a [pretrained resnet50](https://github.com/tensorflow/models/tree/master/official/resnet#pre-trained-model) and unpack it to that directory."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"#Register the model\n", "#Register the model\n",
"from azureml.core.model import Model\n", "from azureml.core.model import Model\n",
"model = Model.register(model_path = \"resnet50\", # this points to a local file\n", "model = Model.register(model_path = \"resnet50\", # this points to a local file\n",
" model_name = \"resnet50\", # this is the name the model is registered as\n", " model_name = \"resnet50\", # this is the name the model is registered as\n",
" tags = {'area': \"Image classification\", 'type': \"classification\"},\n", " tags = {'area': \"Image classification\", 'type': \"classification\"},\n",
" description = \"Image classification trained on Imagenet Dataset\",\n", " description = \"Image classification trained on Imagenet Dataset\",\n",
" workspace = ws)\n", " workspace = ws)\n",
"\n", "\n",
"print(model.name, model.description, model.version)" "print(model.name, model.description, model.version)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Create an image\n", "# Create an image\n",
"Create an image using the registered model the script that will load and run the model." "Create an image using the registered model the script that will load and run the model."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%writefile score.py\n", "%%writefile score.py\n",
"import tensorflow as tf\n", "import tensorflow as tf\n",
"import numpy as np\n", "import numpy as np\n",
"import ujson\n", "import ujson\n",
"from azureml.core.model import Model\n", "from azureml.core.model import Model\n",
"from azureml.contrib.services.aml_request import AMLRequest, rawhttp\n", "from azureml.contrib.services.aml_request import AMLRequest, rawhttp\n",
"from azureml.contrib.services.aml_response import AMLResponse\n", "from azureml.contrib.services.aml_response import AMLResponse\n",
"\n", "\n",
"def init():\n", "def init():\n",
" global session\n", " global session\n",
" global input_name\n", " global input_name\n",
" global output_name\n", " global output_name\n",
" \n", " \n",
" session = tf.Session()\n", " session = tf.Session()\n",
"\n", "\n",
" model_path = Model.get_model_path('resnet50')\n", " model_path = Model.get_model_path('resnet50')\n",
" model = tf.saved_model.loader.load(session, ['serve'], model_path)\n", " model = tf.saved_model.loader.load(session, ['serve'], model_path)\n",
" if len(model.signature_def['serving_default'].inputs) > 1:\n", " if len(model.signature_def['serving_default'].inputs) > 1:\n",
" raise ValueError(\"This score.py only supports one input\")\n", " raise ValueError(\"This score.py only supports one input\")\n",
" if len(model.signature_def['serving_default'].outputs) > 1:\n", " if len(model.signature_def['serving_default'].outputs) > 1:\n",
" raise ValueError(\"This score.py only supports one input\")\n", " raise ValueError(\"This score.py only supports one input\")\n",
" input_name = [tensor.name for tensor in model.signature_def['serving_default'].inputs.values()][0]\n", " input_name = [tensor.name for tensor in model.signature_def['serving_default'].inputs.values()][0]\n",
" output_name = [tensor.name for tensor in model.signature_def['serving_default'].outputs.values()][0]\n", " output_name = [tensor.name for tensor in model.signature_def['serving_default'].outputs.values()][0]\n",
" \n", " \n",
"\n", "\n",
"@rawhttp\n", "@rawhttp\n",
"def run(request):\n", "def run(request):\n",
" if request.method == 'POST':\n", " if request.method == 'POST':\n",
" reqBody = request.get_data(False)\n", " reqBody = request.get_data(False)\n",
" resp = score(reqBody)\n", " resp = score(reqBody)\n",
" return AMLResponse(resp, 200)\n", " return AMLResponse(resp, 200)\n",
" if request.method == 'GET':\n", " if request.method == 'GET':\n",
" respBody = str.encode(\"GET is not supported\")\n", " respBody = str.encode(\"GET is not supported\")\n",
" return AMLResponse(respBody, 405)\n", " return AMLResponse(respBody, 405)\n",
" return AMLResponse(\"bad request\", 500)\n", " return AMLResponse(\"bad request\", 500)\n",
"\n", "\n",
"def score(data):\n", "def score(data):\n",
" result = session.run(output_name, {input_name: [data]})\n", " result = session.run(output_name, {input_name: [data]})\n",
" return ujson.dumps(result[0])\n", " return ujson.dumps(result[0])\n",
"\n", "\n",
"if __name__ == \"__main__\":\n", "if __name__ == \"__main__\":\n",
" init()\n", " init()\n",
" with open(\"test_image.jpg\", 'rb') as f:\n", " with open(\"test_image.jpg\", 'rb') as f:\n",
" content = f.read()\n", " content = f.read()\n",
" print(score(content))" " print(score(content))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core.conda_dependencies import CondaDependencies \n", "from azureml.core.conda_dependencies import CondaDependencies \n",
"\n", "\n",
"myenv = CondaDependencies.create(conda_packages=['tensorflow-gpu==1.12.0','numpy','ujson','azureml-contrib-services'])\n", "myenv = CondaDependencies.create(conda_packages=['tensorflow-gpu==1.12.0','numpy','ujson','azureml-contrib-services'])\n",
"\n", "\n",
"with open(\"myenv.yml\",\"w\") as f:\n", "with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())" " f.write(myenv.serialize_to_string())"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core.image import ContainerImage\n", "from azureml.core.image import ContainerImage\n",
"\n", "\n",
"image_config = ContainerImage.image_configuration(execution_script = \"score.py\",\n", "image_config = ContainerImage.image_configuration(execution_script = \"score.py\",\n",
" runtime = \"python\",\n", " runtime = \"python\",\n",
" conda_file = \"myenv.yml\",\n", " conda_file = \"myenv.yml\",\n",
" gpu_enabled = True\n", " gpu_enabled = True\n",
" )\n", " )\n",
"\n", "\n",
"image = ContainerImage.create(name = \"GpuImage\",\n", "image = ContainerImage.create(name = \"GpuImage\",\n",
" # this is the model object\n", " # this is the model object\n",
" models = [model],\n", " models = [model],\n",
" image_config = image_config,\n", " image_config = image_config,\n",
" workspace = ws)\n", " workspace = ws)\n",
"\n", "\n",
"image.wait_for_creation(show_output = True)" "image.wait_for_creation(show_output = True)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Provision the AKS Cluster\n", "# Provision the AKS Cluster\n",
"This is a one time setup. You can reuse this cluster for multiple deployments after it has been created. If you delete the cluster or the resource group that contains it, then you would have to recreate it." "This is a one time setup. You can reuse this cluster for multiple deployments after it has been created. If you delete the cluster or the resource group that contains it, then you would have to recreate it."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Use the default configuration (can also provide parameters to customize)\n", "# Use the default configuration (can also provide parameters to customize)\n",
"prov_config = AksCompute.provisioning_configuration(vm_size=\"Standard_NC6\")\n", "prov_config = AksCompute.provisioning_configuration(vm_size=\"Standard_NC6\")\n",
"\n", "\n",
"aks_name = 'my-aks-9' \n", "aks_name = 'my-aks-9' \n",
"# Create the cluster\n", "# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace = ws, \n", "aks_target = ComputeTarget.create(workspace = ws, \n",
" name = aks_name, \n", " name = aks_name, \n",
" provisioning_configuration = prov_config)" " provisioning_configuration = prov_config)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Create AKS Cluster in an existing virtual network (optional)\n", "# Create AKS Cluster in an existing virtual network (optional)\n",
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-enable-virtual-network#use-azure-kubernetes-service) for more details." "See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-enable-virtual-network#use-azure-kubernetes-service) for more details."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"'''\n", "'''\n",
"from azureml.core.compute import ComputeTarget, AksCompute\n", "from azureml.core.compute import ComputeTarget, AksCompute\n",
"\n", "\n",
"# Create the compute configuration and set virtual network information\n", "# Create the compute configuration and set virtual network information\n",
"config = AksCompute.provisioning_configuration(vm_size=\"Standard_NC6\", location=\"eastus2\")\n", "config = AksCompute.provisioning_configuration(vm_size=\"Standard_NC6\", location=\"eastus2\")\n",
"config.vnet_resourcegroup_name = \"mygroup\"\n", "config.vnet_resourcegroup_name = \"mygroup\"\n",
"config.vnet_name = \"mynetwork\"\n", "config.vnet_name = \"mynetwork\"\n",
"config.subnet_name = \"default\"\n", "config.subnet_name = \"default\"\n",
"config.service_cidr = \"10.0.0.0/16\"\n", "config.service_cidr = \"10.0.0.0/16\"\n",
"config.dns_service_ip = \"10.0.0.10\"\n", "config.dns_service_ip = \"10.0.0.10\"\n",
"config.docker_bridge_cidr = \"172.17.0.1/16\"\n", "config.docker_bridge_cidr = \"172.17.0.1/16\"\n",
"\n", "\n",
"# Create the compute target\n", "# Create the compute target\n",
"aks_target = ComputeTarget.create(workspace = ws,\n", "aks_target = ComputeTarget.create(workspace = ws,\n",
" name = \"myaks\",\n", " name = \"myaks\",\n",
" provisioning_configuration = config)\n", " provisioning_configuration = config)\n",
"'''" "'''"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Enable SSL on the AKS Cluster (optional)\n", "# Enable SSL on the AKS Cluster (optional)\n",
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-secure-web-service) for more details" "See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-secure-web-service) for more details"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# provisioning_config = AksCompute.provisioning_configuration(ssl_cert_pem_file=\"cert.pem\", ssl_key_pem_file=\"key.pem\", ssl_cname=\"www.contoso.com\")" "# provisioning_config = AksCompute.provisioning_configuration(ssl_cert_pem_file=\"cert.pem\", ssl_key_pem_file=\"key.pem\", ssl_cname=\"www.contoso.com\")"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%time\n", "%%time\n",
"aks_target.wait_for_completion(show_output = True)\n", "aks_target.wait_for_completion(show_output = True)\n",
"print(aks_target.provisioning_state)\n", "print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)" "print(aks_target.provisioning_errors)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Optional step: Attach existing AKS cluster\n", "## Optional step: Attach existing AKS cluster\n",
"\n", "\n",
"If you have existing AKS cluster in your Azure subscription, you can attach it to the Workspace." "If you have existing AKS cluster in your Azure subscription, you can attach it to the Workspace."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"'''\n", "'''\n",
"# Use the default configuration (can also provide parameters to customize)\n", "# Use the default configuration (can also provide parameters to customize)\n",
"resource_id = '/subscriptions/92c76a2f-0e1c-4216-b65e-abf7a3f34c1e/resourcegroups/raymondsdk0604/providers/Microsoft.ContainerService/managedClusters/my-aks-0605d37425356b7d01'\n", "resource_id = '/subscriptions/92c76a2f-0e1c-4216-b65e-abf7a3f34c1e/resourcegroups/raymondsdk0604/providers/Microsoft.ContainerService/managedClusters/my-aks-0605d37425356b7d01'\n",
"\n", "\n",
"create_name='my-existing-aks' \n", "create_name='my-existing-aks' \n",
"# Create the cluster\n", "# Create the cluster\n",
"attach_config = AksCompute.attach_configuration(resource_id=resource_id)\n", "attach_config = AksCompute.attach_configuration(resource_id=resource_id)\n",
"aks_target = ComputeTarget.attach(workspace=ws, name=create_name, attach_configuration=attach_config)\n", "aks_target = ComputeTarget.attach(workspace=ws, name=create_name, attach_configuration=attach_config)\n",
"# Wait for the operation to complete\n", "# Wait for the operation to complete\n",
"aks_target.wait_for_completion(True)\n", "aks_target.wait_for_completion(True)\n",
"'''" "'''"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Deploy web service to AKS" "# Deploy web service to AKS"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"#Set the web service configuration (using default here)\n", "#Set the web service configuration (using default here)\n",
"aks_config = AksWebservice.deploy_configuration()" "aks_config = AksWebservice.deploy_configuration()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%time\n", "%%time\n",
"aks_service_name ='aks-service-1'\n", "aks_service_name ='aks-service-1'\n",
"\n", "\n",
"aks_service = Webservice.deploy_from_image(workspace = ws, \n", "aks_service = Webservice.deploy_from_image(workspace = ws, \n",
" name = aks_service_name,\n", " name = aks_service_name,\n",
" image = image,\n", " image = image,\n",
" deployment_config = aks_config,\n", " deployment_config = aks_config,\n",
" deployment_target = aks_target)\n", " deployment_target = aks_target)\n",
"aks_service.wait_for_deployment(show_output = True)\n", "aks_service.wait_for_deployment(show_output = True)\n",
"print(aks_service.state)" "print(aks_service.state)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Test the web service\n", "# Test the web service\n",
"We test the web sevice by passing the test images content." "We test the web sevice by passing the test images content."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%time\n", "%%time\n",
"import requests\n", "import requests\n",
"key1, key2 = aks_service.get_keys()\n", "key1, key2 = aks_service.get_keys()\n",
"\n", "\n",
"headers = {'Content-Type':'application/json', 'Authorization': 'Bearer ' + key1}\n", "headers = {'Content-Type':'application/json', 'Authorization': 'Bearer ' + key1}\n",
"test_sampe = open('test_image.jpg', 'rb').read()\n", "test_sampe = open('test_image.jpg', 'rb').read()\n",
"resp = requests.post(aks_service.scoring_uri, test_sample, headers=headers)" "resp = requests.post(aks_service.scoring_uri, test_sample, headers=headers)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Clean up\n", "# Clean up\n",
"Delete the service, image, model and compute target" "Delete the service, image, model and compute target"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%time\n", "%%time\n",
"aks_service.delete()\n", "aks_service.delete()\n",
"image.delete()\n", "image.delete()\n",
"model.delete()\n", "model.delete()\n",
"aks_target.delete()" "aks_target.delete()"
] ]
} }
],
"metadata": {
"authors": [
{
"name": "aashishb"
}
], ],
"kernelspec": { "metadata": {
"display_name": "Python 3", "authors": [
"language": "python", {
"name": "python3" "name": "aashishb"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
}, },
"language_info": { "nbformat": 4,
"codemirror_mode": { "nbformat_minor": 2
"name": "ipython", }
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
name: production-deploy-to-aks
dependencies:
- pip:
- azureml-sdk
- matplotlib
- tqdm
- scipy
- sklearn

View File

@@ -0,0 +1,8 @@
name: register-model-create-image-deploy-service
dependencies:
- pip:
- azureml-sdk
- matplotlib
- tqdm
- scipy
- sklearn

View File

@@ -265,7 +265,7 @@
"from azureml.core.compute_target import ComputeTargetException\n", "from azureml.core.compute_target import ComputeTargetException\n",
"\n", "\n",
"# Choose a name for your CPU cluster\n", "# Choose a name for your CPU cluster\n",
"cpu_cluster_name = \"cpucluster\"\n", "cpu_cluster_name = \"cpu-cluster\"\n",
"\n", "\n",
"# Verify that cluster does not exist already\n", "# Verify that cluster does not exist already\n",
"try:\n", "try:\n",
@@ -370,7 +370,7 @@
"from azureml.core.compute_target import ComputeTargetException\n", "from azureml.core.compute_target import ComputeTargetException\n",
"\n", "\n",
"# Choose a name for your CPU cluster\n", "# Choose a name for your CPU cluster\n",
"cpu_cluster_name = \"cpucluster\"\n", "cpu_cluster_name = \"cpu-cluster\"\n",
"\n", "\n",
"# Verify that cluster does not exist already\n", "# Verify that cluster does not exist already\n",
"try:\n", "try:\n",
@@ -506,7 +506,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Delete () is used to deprovision and delete the AmlCompute target. Useful if you want to re-use the compute name \n", "# Delete () is used to deprovision and delete the AmlCompute target. Useful if you want to re-use the compute name \n",
"# 'cpucluster' in this case but use a different VM family for instance.\n", "# 'cpu-cluster' in this case but use a different VM family for instance.\n",
"\n", "\n",
"# cpu_cluster.delete()" "# cpu_cluster.delete()"
] ]

View File

@@ -0,0 +1,6 @@
name: regression-sklearn-on-amlcompute
dependencies:
- pip:
- azureml-sdk
- azureml-explain-model
- azureml-contrib-explain-model

View File

@@ -0,0 +1,6 @@
name: explain-local-sklearn-binary-classification
dependencies:
- pip:
- azureml-sdk
- azureml-explain-model
- azureml-contrib-explain-model

View File

@@ -0,0 +1,6 @@
name: explain-local-sklearn-multiclass-classification
dependencies:
- pip:
- azureml-sdk
- azureml-explain-model
- azureml-contrib-explain-model

View File

@@ -0,0 +1,6 @@
name: explain-local-sklearn-regression
dependencies:
- pip:
- azureml-sdk
- azureml-explain-model
- azureml-contrib-explain-model

View File

@@ -36,22 +36,6 @@
"4. Visualize the global and local explanations with the visualization dashboard." "4. Visualize the global and local explanations with the visualization dashboard."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This example needs sklearn-pandas. If it is not installed, uncomment and run the following line."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install sklearn-pandas"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -63,7 +47,6 @@
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.linear_model import LogisticRegression\n", "from sklearn.linear_model import LogisticRegression\n",
"from azureml.explain.model.tabular_explainer import TabularExplainer\n", "from azureml.explain.model.tabular_explainer import TabularExplainer\n",
"from sklearn_pandas import DataFrameMapper\n",
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np" "import numpy as np"
] ]
@@ -113,6 +96,13 @@
"x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"sklearn imports"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -121,7 +111,51 @@
"source": [ "source": [
"from sklearn.pipeline import Pipeline\n", "from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n", "from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can explain raw features by either using a `sklearn.compose.ColumnTransformer` or a list of fitted transformer tuples. The cell below uses `sklearn.compose.ColumnTransformer`. In case you want to run the example with the list of fitted transformer tuples, comment the cell below and uncomment the cell that follows after. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"\n",
"transformations = ColumnTransformer([\n",
" (\"age_fare\", Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')),\n",
" ('scaler', StandardScaler())\n",
" ]), [\"age\", \"fare\"]),\n",
" (\"embarked\", Pipeline(steps=[\n",
" (\"imputer\", SimpleImputer(strategy='constant', fill_value='missing')), \n",
" (\"encoder\", OneHotEncoder(sparse=False))]), [\"embarked\"]),\n",
" (\"sex_pclass\", OneHotEncoder(sparse=False), [\"sex\", \"pclass\"]) \n",
"])\n",
"\n",
"\n",
"# Append classifier to preprocessing pipeline.\n",
"# Now we have a full prediction pipeline.\n",
"clf = Pipeline(steps=[('preprocessor', transformations),\n",
" ('classifier', LogisticRegression(solver='lbfgs'))])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"# Uncomment below if sklearn-pandas is not installed\n",
"#!pip install sklearn-pandas\n",
"from sklearn_pandas import DataFrameMapper\n", "from sklearn_pandas import DataFrameMapper\n",
"\n", "\n",
"# Impute, standardize the numeric features and one-hot encode the categorical features. \n", "# Impute, standardize the numeric features and one-hot encode the categorical features. \n",
@@ -141,7 +175,8 @@
"# Append classifier to preprocessing pipeline.\n", "# Append classifier to preprocessing pipeline.\n",
"# Now we have a full prediction pipeline.\n", "# Now we have a full prediction pipeline.\n",
"clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),\n", "clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),\n",
" ('classifier', LogisticRegression(solver='lbfgs'))])" " ('classifier', LogisticRegression(solver='lbfgs'))])\n",
"'''"
] ]
}, },
{ {

View File

@@ -0,0 +1,7 @@
name: explain-sklearn-raw-features
dependencies:
- pip:
- azureml-sdk
- azureml-explain-model
- azureml-contrib-explain-model
- sklearn-pandas

View File

@@ -0,0 +1,6 @@
name: explain-run-history-sklearn-classification
dependencies:
- pip:
- azureml-sdk
- azureml-explain-model
- azureml-contrib-explain-model

View File

@@ -0,0 +1,6 @@
name: explain-run-history-sklearn-regression
dependencies:
- pip:
- azureml-sdk
- azureml-explain-model
- azureml-contrib-explain-model

View File

@@ -43,5 +43,6 @@ Take a look at [intro-to-pipelines](./intro-to-pipelines/) for the list of noteb
1. [pipeline-batch-scoring.ipynb](https://aka.ms/pl-batch-score): This notebook demonstrates how to run a batch scoring job using Azure Machine Learning pipelines. 1. [pipeline-batch-scoring.ipynb](https://aka.ms/pl-batch-score): This notebook demonstrates how to run a batch scoring job using Azure Machine Learning pipelines.
2. [pipeline-style-transfer.ipynb](https://aka.ms/pl-style-trans): This notebook demonstrates a multi-step pipeline that uses GPU compute. 2. [pipeline-style-transfer.ipynb](https://aka.ms/pl-style-trans): This notebook demonstrates a multi-step pipeline that uses GPU compute.
3. [nyc-taxi-data-regression-model-building.ipynb](https://aka.ms/pl-nyctaxi-tutorial): This notebook is an AzureML Pipelines version of the previously published two part sample.
![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/machine-learning-pipelines/README.png) ![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/machine-learning-pipelines/README.png)

View File

@@ -0,0 +1,5 @@
name: aml-pipelines-data-transfer
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -65,8 +65,6 @@
"import os\n", "import os\n",
"import azureml.core\n", "import azureml.core\n",
"from azureml.core import Workspace, Experiment, Datastore\n", "from azureml.core import Workspace, Experiment, Datastore\n",
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"from azureml.widgets import RunDetails\n", "from azureml.widgets import RunDetails\n",
"\n", "\n",
"# Check core SDK version number\n", "# Check core SDK version number\n",
@@ -116,36 +114,20 @@
"ws = Workspace.from_config()\n", "ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')\n", "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')\n",
"\n", "\n",
"# Default datastore (Azure file storage)\n", "# Default datastore\n",
"def_file_store = ws.get_default_datastore() \n", "def_blob_store = ws.get_default_datastore() \n",
"# The above call is equivalent to Datastore(ws, \"workspacefilestore\") or simply Datastore(ws)\n",
"print(\"Default datastore's name: {}\".format(def_file_store.name))\n",
"\n",
"# Blob storage associated with the workspace\n",
"# The following call GETS the Azure Blob Store associated with your workspace.\n", "# The following call GETS the Azure Blob Store associated with your workspace.\n",
"# Note that workspaceblobstore is **the name of this store and CANNOT BE CHANGED and must be used as is** \n", "# Note that workspaceblobstore is **the name of this store and CANNOT BE CHANGED and must be used as is** \n",
"def_blob_store = Datastore(ws, \"workspaceblobstore\")\n", "def_blob_store = Datastore(ws, \"workspaceblobstore\")\n",
"print(\"Blobstore's name: {}\".format(def_blob_store.name))" "print(\"Blobstore's name: {}\".format(def_blob_store.name))"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# project folder\n",
"project_folder = '.'\n",
" \n",
"print('Sample projects will be created in {}.'.format(os.path.realpath(project_folder)))"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Required data and script files for the the tutorial\n", "### Required data and script files for the the tutorial\n",
"Sample files required to finish this tutorial are already copied to the project folder specified above. Even though the .py provided in the samples don't have much \"ML work,\" as a data scientist, you will work on this extensively as part of your work. To complete this tutorial, the contents of these files are not very important. The one-line files are for demostration purpose only." "Sample files required to finish this tutorial are already copied to the corresponding source_directory locations. Even though the .py provided in the samples don't have much \"ML work,\" as a data scientist, you will work on this extensively as part of your work. To complete this tutorial, the contents of these files are not very important. The one-line files are for demostration purpose only."
] ]
}, },
{ {
@@ -176,19 +158,10 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# get_default_datastore() gets the default Azure File Store associated with your workspace.\n", "# get_default_datastore() gets the default Azure Blob Store associated with your workspace.\n",
"# Here we are reusing the def_file_store object we obtained earlier\n", "# Here we are reusing the def_blob_store object we obtained earlier\n",
"\n",
"# target_path is the directory at the destination\n",
"def_file_store.upload_files(['./20news.pkl'], \n",
" target_path = '20newsgroups', \n",
" overwrite = True, \n",
" show_progress = True)\n",
"\n",
"# Here we are reusing the def_blob_store we created earlier\n",
"def_blob_store.upload_files([\"./20news.pkl\"], target_path=\"20newsgroups\", overwrite=True)\n", "def_blob_store.upload_files([\"./20news.pkl\"], target_path=\"20newsgroups\", overwrite=True)\n",
"\n", "print(\"Upload call completed\")"
"print(\"Upload calls completed\")"
] ]
}, },
{ {
@@ -233,8 +206,15 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Retrieve default Azure Machine Learning compute\n", "#### Retrieve or create a Azure Machine Learning compute\n",
"Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's get the default Azure Machine Learning Compute in the current workspace. We will then run the training script on this compute target." "Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's create a new Azure Machine Learning Compute in the current workspace, if it doesn't already exist. We will then run the training script on this compute target.\n",
"\n",
"If we could not find the compute with the given name in the previous cell, then we will create a new compute here. We will create an Azure Machine Learning Compute containing **STANDARD_D2_V2 CPU VMs**. This process is broken down into the following steps:\n",
"\n",
"1. Create the configuration\n",
"2. Create the Azure Machine Learning compute\n",
"\n",
"**This process will take about 3 minutes and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell.**"
] ]
}, },
{ {
@@ -243,7 +223,23 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"aml_compute = ws.get_default_compute_target(\"CPU\")" "from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"aml_compute_target = \"cpu-cluster\"\n",
"try:\n",
" aml_compute = AmlCompute(ws, aml_compute_target)\n",
" print(\"found existing compute target.\")\n",
"except ComputeTargetException:\n",
" print(\"creating new compute target\")\n",
" \n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\",\n",
" min_nodes = 1, \n",
" max_nodes = 4) \n",
" aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)\n",
" aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
" \n",
"print(\"Azure Machine Learning Compute attached\")\n"
] ]
}, },
{ {
@@ -290,9 +286,10 @@
"- [**MpiStep**](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.mpi_step.mpistep?view=azure-ml-py): Adds a step to run a MPI job in a Pipeline.\n", "- [**MpiStep**](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.mpi_step.mpistep?view=azure-ml-py): Adds a step to run a MPI job in a Pipeline.\n",
"- [**AutoMLStep**](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlstep?view=azure-ml-py): Creates a AutoML step in a Pipeline.\n", "- [**AutoMLStep**](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlstep?view=azure-ml-py): Creates a AutoML step in a Pipeline.\n",
"\n", "\n",
"The following code will create a PythonScriptStep to be executed in the Azure Machine Learning Compute we created above using train.py, one of the files already made available in the project folder.\n", "The following code will create a PythonScriptStep to be executed in the Azure Machine Learning Compute we created above using train.py, one of the files already made available in the `source_directory`.\n",
"\n", "\n",
"A **PythonScriptStep** is a basic, built-in step to run a Python Script on a compute target. It takes a script name and optionally other parameters like arguments for the script, compute target, inputs and outputs. If no compute target is specified, default compute target for the workspace is used. You can also use a [**RunConfiguration**](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py) to specify requirements for the PythonScriptStep, such as conda dependencies and docker image." "A **PythonScriptStep** is a basic, built-in step to run a Python Script on a compute target. It takes a script name and optionally other parameters like arguments for the script, compute target, inputs and outputs. If no compute target is specified, default compute target for the workspace is used. You can also use a [**RunConfiguration**](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py) to specify requirements for the PythonScriptStep, such as conda dependencies and docker image.\n",
"> The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the `source_directory` for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the `source_directory` would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the `source_directory` of the step."
] ]
}, },
{ {
@@ -303,6 +300,9 @@
"source": [ "source": [
"# Uses default values for PythonScriptStep construct.\n", "# Uses default values for PythonScriptStep construct.\n",
"\n", "\n",
"source_directory = './train'\n",
"print('Source directory for the step is {}.'.format(os.path.realpath(source_directory)))\n",
"\n",
"# Syntax\n", "# Syntax\n",
"# PythonScriptStep(\n", "# PythonScriptStep(\n",
"# script_name, \n", "# script_name, \n",
@@ -321,7 +321,7 @@
"step1 = PythonScriptStep(name=\"train_step\",\n", "step1 = PythonScriptStep(name=\"train_step\",\n",
" script_name=\"train.py\", \n", " script_name=\"train.py\", \n",
" compute_target=aml_compute, \n", " compute_target=aml_compute, \n",
" source_directory=project_folder,\n", " source_directory=source_directory,\n",
" allow_reuse=True)\n", " allow_reuse=True)\n",
"print(\"Step1 created\")" "print(\"Step1 created\")"
] ]
@@ -351,12 +351,15 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# All steps use files already available in the project_folder\n", "# For this step, we use a different source_directory\n",
"source_directory = './compare'\n",
"print('Source directory for the step is {}.'.format(os.path.realpath(source_directory)))\n",
"\n",
"# All steps use the same Azure Machine Learning compute target as well\n", "# All steps use the same Azure Machine Learning compute target as well\n",
"step2 = PythonScriptStep(name=\"compare_step\",\n", "step2 = PythonScriptStep(name=\"compare_step\",\n",
" script_name=\"compare.py\", \n", " script_name=\"compare.py\", \n",
" compute_target=aml_compute, \n", " compute_target=aml_compute, \n",
" source_directory=project_folder)\n", " source_directory=source_directory)\n",
"\n", "\n",
"# Use a RunConfiguration to specify some additional requirements for this step.\n", "# Use a RunConfiguration to specify some additional requirements for this step.\n",
"from azureml.core.runconfig import RunConfiguration\n", "from azureml.core.runconfig import RunConfiguration\n",
@@ -378,10 +381,14 @@
"# specify CondaDependencies obj\n", "# specify CondaDependencies obj\n",
"run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n", "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n",
"\n", "\n",
"# For this step, we use yet another source_directory\n",
"source_directory = './extract'\n",
"print('Source directory for the step is {}.'.format(os.path.realpath(source_directory)))\n",
"\n",
"step3 = PythonScriptStep(name=\"extract_step\",\n", "step3 = PythonScriptStep(name=\"extract_step\",\n",
" script_name=\"extract.py\", \n", " script_name=\"extract.py\", \n",
" compute_target=aml_compute, \n", " compute_target=aml_compute, \n",
" source_directory=project_folder,\n", " source_directory=source_directory,\n",
" runconfig=run_config)\n", " runconfig=run_config)\n",
"\n", "\n",
"# list of steps to run\n", "# list of steps to run\n",
@@ -597,7 +604,7 @@
"metadata": { "metadata": {
"authors": [ "authors": [
{ {
"name": "diray" "name": "sanpil"
} }
], ],
"kernelspec": { "kernelspec": {

View File

@@ -0,0 +1,5 @@
name: aml-pipelines-getting-started
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -113,7 +113,25 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"batch_compute = ws.get_default_compute_target(\"CPU\")" "batch_compute_name = 'mybatchcompute' # Name to associate with new compute in workspace\n",
"\n",
"# Batch account details needed to attach as compute to workspace\n",
"batch_account_name = \"<batch_account_name>\" # Name of the Batch account\n",
"batch_resource_group = \"<batch_resource_group>\" # Name of the resource group which contains this account\n",
"\n",
"try:\n",
" # check if already attached\n",
" batch_compute = BatchCompute(ws, batch_compute_name)\n",
"except ComputeTargetException:\n",
" print('Attaching Batch compute...')\n",
" provisioning_config = BatchCompute.attach_configuration(resource_group=batch_resource_group, \n",
" account_name=batch_account_name)\n",
" batch_compute = ComputeTarget.attach(ws, batch_compute_name, provisioning_config)\n",
" batch_compute.wait_for_completion()\n",
" print(\"Provisioning state:{}\".format(batch_compute.provisioning_state))\n",
" print(\"Provisioning errors:{}\".format(batch_compute.provisioning_errors))\n",
"\n",
"print(\"Using Batch compute:{}\".format(batch_compute.cluster_resource_id))"
] ]
}, },
{ {

View File

@@ -76,8 +76,18 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Get default AmlCompute\n", "## Create or Attach existing AmlCompute\n",
"You can create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, you use default `AmlCompute` as your training compute resource." "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, you create `AmlCompute` as your training compute resource."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `STANDARD_NC6` GPU VMs. This process is broken down into 3 steps:\n",
"1. create the configuration (this step is local and only takes a second)\n",
"2. create the cluster (this step will take about **20 seconds**)\n",
"3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell"
] ]
}, },
{ {
@@ -86,7 +96,25 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"cpu_cluster = ws.get_default_compute_target(\"CPU\")\n", "from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = \"cpu-cluster\"\n",
"\n",
"try:\n",
" cpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', max_nodes=4)\n",
"\n",
" # create the cluster\n",
" cpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" # can poll for a minimum number of nodes and for a specific timeout. \n",
" # if no min node count is provided it uses the scale settings for the cluster\n",
" cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n", "\n",
"# use get_status() to get a detailed status for the current cluster. \n", "# use get_status() to get a detailed status for the current cluster. \n",
"print(cpu_cluster.get_status().serialize())" "print(cpu_cluster.get_status().serialize())"
@@ -96,7 +124,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Now that you have created the compute target, let's see what the workspace's `compute_targets` property returns. You should now see one entry named 'cpucluster' of type `AmlCompute`." "Now that you have created the compute target, let's see what the workspace's `compute_targets` property returns. You should now see one entry named 'cpu-cluster' of type `AmlCompute`."
] ]
}, },
{ {

View File

@@ -0,0 +1,5 @@
name: aml-pipelines-how-to-use-estimatorstep
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -22,9 +22,17 @@
"# Azure Machine Learning Pipeline with HyperDriveStep\n", "# Azure Machine Learning Pipeline with HyperDriveStep\n",
"\n", "\n",
"\n", "\n",
"This notebook is used to demonstrate the use of HyperDriveStep in AML Pipeline.\n", "This notebook is used to demonstrate the use of HyperDriveStep in AML Pipeline."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites and Azure Machine Learning Basics\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the configuration Notebook located at https://github.com/Azure/MachineLearningNotebooks first if you haven't. This sets you up with a working config file that has information on your workspace, subscription id, etc. \n",
"\n", "\n",
"## Azure Machine Learning and Pipeline SDK-specific imports\n" "## Azure Machine Learning and Pipeline SDK-specific imports"
] ]
}, },
{ {
@@ -33,19 +41,24 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n",
"import shutil\n",
"import urllib\n",
"import azureml.core\n", "import azureml.core\n",
"from azureml.core import Workspace, Experiment\n", "from azureml.core import Workspace, Experiment\n",
"from azureml.core.datastore import Datastore\n", "from azureml.core.datastore import Datastore\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\n", "from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.exceptions import ComputeTargetException\n", "from azureml.exceptions import ComputeTargetException\n",
"from azureml.data.data_reference import DataReference\n", "from azureml.data.data_reference import DataReference\n",
"from azureml.pipeline.steps import HyperDriveStep\n", "from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun\n",
"from azureml.pipeline.core import Pipeline, PipelineData\n", "from azureml.pipeline.core import Pipeline, PipelineData\n",
"from azureml.train.dnn import TensorFlow\n", "from azureml.train.dnn import TensorFlow\n",
"from azureml.train.hyperdrive import *\n", "# from azureml.train.hyperdrive import *\n",
"from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal\n",
"from azureml.train.hyperdrive import choice, loguniform\n",
"\n",
"import os\n",
"import shutil\n",
"import urllib\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n", "\n",
"# Check core SDK version number\n", "# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)" "print(\"SDK version:\", azureml.core.VERSION)"
@@ -87,7 +100,7 @@
"script_folder = './tf-mnist'\n", "script_folder = './tf-mnist'\n",
"os.makedirs(script_folder, exist_ok=True)\n", "os.makedirs(script_folder, exist_ok=True)\n",
"\n", "\n",
"exp = Experiment(workspace=ws, name='tf-mnist')" "exp = Experiment(workspace=ws, name='Hyperdrive_sample')"
] ]
}, },
{ {
@@ -112,6 +125,42 @@
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename = './data/mnist/test-labels.gz')" "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename = './data/mnist/test-labels.gz')"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show some sample images\n",
"Let's load the downloaded compressed file into numpy arrays using some utility functions included in the `utils.py` library file from the current folder. Then we use `matplotlib` to plot 30 random images from the dataset along with their labels."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from utils import load_data\n",
"\n",
"# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster.\n",
"X_train = load_data('./data/mnist/train-images.gz', False) / 255.0\n",
"y_train = load_data('./data/mnist/train-labels.gz', True).reshape(-1)\n",
"\n",
"X_test = load_data('./data/mnist/test-images.gz', False) / 255.0\n",
"y_test = load_data('./data/mnist/test-labels.gz', True).reshape(-1)\n",
"\n",
"count = 0\n",
"sample_size = 30\n",
"plt.figure(figsize = (16, 6))\n",
"for i in np.random.permutation(X_train.shape[0])[:sample_size]:\n",
" count = count + 1\n",
" plt.subplot(1, sample_size, count)\n",
" plt.axhline('')\n",
" plt.axvline('')\n",
" plt.text(x = 10, y = -10, s = y_train[i], fontsize = 18)\n",
" plt.imshow(X_train[i].reshape(28, 28), cmap = plt.cm.Greys)\n",
"plt.show()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@@ -135,7 +184,14 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Retrieve or create a Azure Machine Learning compute\n", "## Retrieve or create a Azure Machine Learning compute\n",
"Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's get the default Azure Machine Learning Compute in the current workspace. We will then run the training script on this compute target." "Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's create a new Azure Machine Learning Compute in the current workspace, if it doesn't already exist. We will then run the training script on this compute target.\n",
"\n",
"If we could not find the compute with the given name in the previous cell, then we will create a new compute here. This process is broken down into the following steps:\n",
"\n",
"1. Create the configuration\n",
"2. Create the Azure Machine Learning compute\n",
"\n",
"**This process will take a few minutes and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell.**\n"
] ]
}, },
{ {
@@ -144,7 +200,20 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"compute_target = ws.get_default_compute_target(\"GPU\")" "cluster_name = \"gpu-cluster\"\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target {}.'.format(cluster_name))\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",\n",
" max_nodes=4)\n",
"\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
" compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)\n",
"\n",
"print(\"Azure Machine Learning Compute attached\")"
] ]
}, },
{ {
@@ -173,8 +242,12 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Create TensorFlow estimator\n", "## Create TensorFlow estimator\n",
"Next, we construct an `azureml.train.dnn.TensorFlow` estimator object, use the Batch AI cluster as compute target, and pass the mount-point of the datastore to the training code as a parameter.\n", "Next, we construct an [TensorFlow](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.dnn.tensorflow?view=azure-ml-py) estimator object.\n",
"The TensorFlow estimator is providing a simple way of launching a TensorFlow training job on a compute target. It will automatically provide a docker image that has TensorFlow installed -- if additional pip or conda packages are required, their names can be passed in via the `pip_packages` and `conda_packages` arguments and they will be included in the resulting docker." "The TensorFlow estimator is providing a simple way of launching a TensorFlow training job on a compute target. It will automatically provide a docker image that has TensorFlow installed -- if additional pip or conda packages are required, their names can be passed in via the `pip_packages` and `conda_packages` arguments and they will be included in the resulting docker.\n",
"\n",
"The TensorFlow estimator also takes a `framework_version` parameter -- if no version is provided, the estimator will default to the latest version supported by AzureML. Use `TensorFlow.get_supported_versions()` to get a list of all versions supported by your current SDK version or see the [SDK documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.dnn?view=azure-ml-py) for the versions supported in the most current release.\n",
"\n",
"The TensorFlow estimator also takes a `framework_version` parameter -- if no version is provided, the estimator will default to the latest version supported by AzureML. Use `TensorFlow.get_supported_versions()` to get a list of all versions supported by your current SDK version or see the [SDK documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.dnn?view=azure-ml-py) for the versions supported in the most current release."
] ]
}, },
{ {
@@ -186,7 +259,8 @@
"est = TensorFlow(source_directory=script_folder, \n", "est = TensorFlow(source_directory=script_folder, \n",
" compute_target=compute_target,\n", " compute_target=compute_target,\n",
" entry_script='tf_mnist.py', \n", " entry_script='tf_mnist.py', \n",
" use_gpu=True)" " use_gpu=True,\n",
" framework_version='1.13')"
] ]
}, },
{ {
@@ -194,7 +268,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Intelligent hyperparameter tuning\n", "## Intelligent hyperparameter tuning\n",
"We have trained the model with one set of hyperparameters, now let's how we can do hyperparameter tuning by launching multiple runs on the cluster. First let's define the parameter space using random sampling.\n", "Now let's try hyperparameter tuning by launching multiple runs on the cluster. First let's define the parameter space using random sampling.\n",
"\n", "\n",
"In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, the best validation accuracy (`validation_acc`)." "In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, the best validation accuracy (`validation_acc`)."
] ]
@@ -251,8 +325,8 @@
" policy=early_termination_policy,\n", " policy=early_termination_policy,\n",
" primary_metric_name='validation_acc', \n", " primary_metric_name='validation_acc', \n",
" primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, \n", " primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, \n",
" max_total_runs=1,\n", " max_total_runs=10,\n",
" max_concurrent_runs=1)" " max_concurrent_runs=4)"
] ]
}, },
{ {
@@ -261,6 +335,7 @@
"source": [ "source": [
"## Add HyperDrive as a step of pipeline\n", "## Add HyperDrive as a step of pipeline\n",
"\n", "\n",
"### Setup an input for the hypderdrive step\n",
"Let's setup a data reference for inputs of hyperdrive step." "Let's setup a data reference for inputs of hyperdrive step."
] ]
}, },
@@ -302,8 +377,9 @@
" datastore=ds,\n", " datastore=ds,\n",
" pipeline_output_name=metrics_output_name)\n", " pipeline_output_name=metrics_output_name)\n",
"\n", "\n",
"hd_step_name='hd_step01'\n",
"hd_step = HyperDriveStep(\n", "hd_step = HyperDriveStep(\n",
" name=\"hyperdrive_module\",\n", " name=hd_step_name,\n",
" hyperdrive_config=hd_config,\n", " hyperdrive_config=hd_config,\n",
" estimator_entry_script_arguments=['--data-folder', data_folder],\n", " estimator_entry_script_arguments=['--data-folder', data_folder],\n",
" inputs=[data_folder],\n", " inputs=[data_folder],\n",
@@ -324,7 +400,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"pipeline = Pipeline(workspace=ws, steps=[hd_step])\n", "pipeline = Pipeline(workspace=ws, steps=[hd_step])\n",
"pipeline_run = Experiment(ws, 'Hyperdrive_Test').submit(pipeline)" "pipeline_run = exp.submit(pipeline)"
] ]
}, },
{ {
@@ -357,7 +433,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"pipeline_run.wait_for_completion()" "# PUBLISHONLY\n",
"# pipeline_run.wait_for_completion()"
] ]
}, },
{ {
@@ -374,8 +451,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)\n", "# PUBLISHONLY\n",
"num_file_downloaded = metrics_output.download('.', show_progress=True)" "# metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)\n",
"# num_file_downloaded = metrics_output.download('.', show_progress=True)"
] ]
}, },
{ {
@@ -384,14 +462,374 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import pandas as pd\n", "# PUBLISHONLY\n",
"import json\n", "# import pandas as pd\n",
"with open(metrics_output._path_on_datastore) as f: \n", "# import json\n",
" metrics_output_result = f.read()\n", "# with open(metrics_output._path_on_datastore) as f: \n",
"# metrics_output_result = f.read()\n",
" \n", " \n",
"deserialized_metrics_output = json.loads(metrics_output_result)\n", "# deserialized_metrics_output = json.loads(metrics_output_result)\n",
"df = pd.DataFrame(deserialized_metrics_output)\n", "# df = pd.DataFrame(deserialized_metrics_output)\n",
"df" "# df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find and register best model\n",
"When all the jobs finish, we can find out the one that has the highest accuracy."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# hd_step_run = HyperDriveStepRun(step_run=pipeline_run.find_step_run(hd_step_name)[0])\n",
"# best_run = hd_step_run.get_best_run_by_primary_metric()\n",
"# best_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's list the model files uploaded during the run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# print(best_run.get_file_names())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can then register the folder (and all files in it) as a model named `tf-dnn-mnist` under the workspace for deployment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# model = best_run.register_model(model_name='tf-dnn-mnist', model_path='outputs/model')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy the model in ACI\n",
"Now we are ready to deploy the model as a web service running in Azure Container Instance [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.\n",
"### Create score.py\n",
"First, we will create a scoring script that will be invoked by the web service call. \n",
"\n",
"* Note that the scoring script must have two required functions, `init()` and `run(input_data)`. \n",
" * In `init()` function, you typically load the model into a global object. This function is executed only once when the Docker container is started. \n",
" * In `run(input_data)` function, the model is used to predict a value based on the input data. The input and output to `run` typically use JSON as serialization and de-serialization format but you are not limited to that."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import json\n",
"import numpy as np\n",
"import os\n",
"import tensorflow as tf\n",
"\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global X, output, sess\n",
" tf.reset_default_graph()\n",
" model_root = Model.get_model_path('tf-dnn-mnist')\n",
" saver = tf.train.import_meta_graph(os.path.join(model_root, 'mnist-tf.model.meta'))\n",
" X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n",
" output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n",
" \n",
" sess = tf.Session()\n",
" saver.restore(sess, os.path.join(model_root, 'mnist-tf.model'))\n",
"\n",
"def run(raw_data):\n",
" data = np.array(json.loads(raw_data)['data'])\n",
" # make prediction\n",
" out = output.eval(session=sess, feed_dict={X: data})\n",
" y_hat = np.argmax(out, axis=1)\n",
" return y_hat.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create myenv.yml\n",
"We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify packages `numpy`, `tensorflow`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# from azureml.core.runconfig import CondaDependencies\n",
"\n",
"# cd = CondaDependencies.create()\n",
"# cd.add_conda_package('numpy')\n",
"# cd.add_tensorflow_conda_package()\n",
"# cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n",
"\n",
"# print(cd.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy to ACI\n",
"We are almost ready to deploy. Create a deployment configuration and specify the number of CPUs and gigbyte of RAM needed for your ACI container. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# from azureml.core.webservice import AciWebservice\n",
"\n",
"# aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n",
"# memory_gb=1, \n",
"# tags={'name':'mnist', 'framework': 'TensorFlow DNN'},\n",
"# description='Tensorflow DNN on MNIST')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Deployment Process\n",
"Now we can deploy. **This cell will run for about 7-8 minutes**. Behind the scene, it will do the following:\n",
"1. **Register model** \n",
"Take the local `model` folder (which contains our previously downloaded trained model files) and register it (and the files inside that folder) as a model named `model` under the workspace. Azure ML will register the model directory or model file(s) we specify to the `model_paths` parameter of the `Webservice.deploy` call.\n",
"2. **Build Docker image** \n",
"Build a Docker image using the scoring file (`score.py`), the environment file (`myenv.yml`), and the `model` folder containing the TensorFlow model files. \n",
"3. **Register image** \n",
"Register that image under the workspace. \n",
"4. **Ship to ACI** \n",
"And finally ship the image to the ACI infrastructure, start up a container in ACI using that image, and expose an HTTP endpoint to accept REST client calls."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# from azureml.core.image import ContainerImage\n",
"\n",
"# imgconfig = ContainerImage.image_configuration(execution_script=\"score.py\", \n",
"# runtime=\"python\", \n",
"# conda_file=\"myenv.yml\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# %%time\n",
"# from azureml.core.webservice import Webservice\n",
"\n",
"# service = Webservice.deploy_from_model(workspace=ws,\n",
"# name='tf-mnist-svc',\n",
"# deployment_config=aciconfig,\n",
"# models=[model],\n",
"# image_config=imgconfig)\n",
"\n",
"# service.wait_for_deployment(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Tip: If something goes wrong with the deployment, the first thing to look at is the logs from the service by running the following command:**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# print(service.get_logs())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is the scoring web service endpoint:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# print(service.scoring_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the deployed model\n",
"Let's test the deployed model. Pick 30 random samples from the test set, and send it to the web service hosted in ACI. Note here we are using the `run` API in the SDK to invoke the service. You can also make raw HTTP calls using any HTTP tool such as curl.\n",
"\n",
"After the invocation, we print the returned predictions and plot them along with the input images. Use red font color and inversed image (white on black) to highlight the misclassified samples. Note since the model accuracy is pretty high, you might have to run the below cell a few times before you can see a misclassified sample."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# import json\n",
"\n",
"# # find 30 random samples from test set\n",
"# n = 30\n",
"# sample_indices = np.random.permutation(X_test.shape[0])[0:n]\n",
"\n",
"# test_samples = json.dumps({\"data\": X_test[sample_indices].tolist()})\n",
"# test_samples = bytes(test_samples, encoding='utf8')\n",
"\n",
"# # predict using the deployed model\n",
"# result = service.run(input_data=test_samples)\n",
"\n",
"# # compare actual value vs. the predicted values:\n",
"# i = 0\n",
"# plt.figure(figsize = (20, 1))\n",
"\n",
"# for s in sample_indices:\n",
"# plt.subplot(1, n, i + 1)\n",
"# plt.axhline('')\n",
"# plt.axvline('')\n",
" \n",
"# # use different color for misclassified sample\n",
"# font_color = 'red' if y_test[s] != result[i] else 'black'\n",
"# clr_map = plt.cm.gray if y_test[s] != result[i] else plt.cm.Greys\n",
" \n",
"# plt.text(x=10, y=-10, s=y_hat[s], fontsize=18, color=font_color)\n",
"# plt.imshow(X_test[s].reshape(28, 28), cmap=clr_map)\n",
" \n",
"# i = i + 1\n",
"# plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can also send raw HTTP request to the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# import requests\n",
"\n",
"# # send a random row from the test set to score\n",
"# random_index = np.random.randint(0, len(X_test)-1)\n",
"# input_data = \"{\\\"data\\\": [\" + str(list(X_test[random_index])) + \"]}\"\n",
"\n",
"# headers = {'Content-Type':'application/json'}\n",
"\n",
"# resp = requests.post(service.scoring_uri, input_data, headers=headers)\n",
"\n",
"# print(\"POST to url\", service.scoring_uri)\n",
"# print(\"input data:\", input_data)\n",
"# print(\"label:\", y_test[random_index])\n",
"# print(\"prediction:\", resp.text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's look at the workspace after the web service was deployed. You should see \n",
"* a registered model named 'model' and with the id 'model:1'\n",
"* an image called 'tf-mnist' and with a docker image location pointing to your workspace's Azure Container Registry (ACR) \n",
"* a webservice called 'tf-mnist' with some scoring URL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# models = ws.models\n",
"# for name, model in models.items():\n",
"# print(\"Model: {}, ID: {}\".format(name, model.id))\n",
" \n",
"# images = ws.images\n",
"# for name, image in images.items():\n",
"# print(\"Image: {}, location: {}\".format(name, image.image_location))\n",
" \n",
"# webservices = ws.webservices\n",
"# for name, webservice in webservices.items():\n",
"# print(\"Webservice: {}, scoring URI: {}\".format(name, webservice.scoring_uri))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean up\n",
"You can delete the ACI deployment with a simple delete API call."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# service.delete()"
] ]
} }
], ],

View File

@@ -79,7 +79,20 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"aml_compute = ws.get_default_compute_target(\"CPU\")" "from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"aml_compute_target = \"cpu-cluster\"\n",
"try:\n",
" aml_compute = AmlCompute(ws, aml_compute_target)\n",
" print(\"found existing compute target.\")\n",
"except ComputeTargetException:\n",
" print(\"creating new compute target\")\n",
" \n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\",\n",
" min_nodes = 1, \n",
" max_nodes = 4) \n",
" aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)\n",
" aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n"
] ]
}, },
{ {

View File

@@ -0,0 +1,6 @@
name: aml-pipelines-publish-and-run-using-rest-endpoint
dependencies:
- pip:
- azureml-sdk
- azureml-widgets
- requests

View File

@@ -54,7 +54,7 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Compute Targets\n", "### Compute Targets\n",
"#### Retrieve the default Azure Machine Learning Compute" "#### Retrieve an already attached Azure Machine Learning Compute"
] ]
}, },
{ {
@@ -63,7 +63,31 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"aml_compute_target = ws.get_default_compute_target(\"CPU\")" "from azureml.core import Run, Experiment, Datastore\n",
"\n",
"from azureml.widgets import RunDetails\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
"aml_compute_target = \"cpu-cluster\"\n",
"try:\n",
" aml_compute = AmlCompute(ws, aml_compute_target)\n",
" print(\"Found existing compute target: {}\".format(aml_compute_target))\n",
"except:\n",
" print(\"Creating new compute target: {}\".format(aml_compute_target))\n",
" \n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\",\n",
" min_nodes = 1, \n",
" max_nodes = 4) \n",
" aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)\n",
" aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)"
] ]
}, },
{ {

View File

@@ -85,10 +85,24 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from azureml.core import Run, Experiment, Datastore\n",
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
"from azureml.pipeline.steps import PythonScriptStep\n", "from azureml.pipeline.steps import PythonScriptStep\n",
"from azureml.pipeline.core import Pipeline\n", "from azureml.pipeline.core import Pipeline\n",
"\n", "\n",
"aml_compute = ws.get_default_compute_target(\"CPU\")\n", "#Retrieve an already attached Azure Machine Learning Compute\n",
"aml_compute_target = \"cpu-cluster\"\n",
"try:\n",
" aml_compute = AmlCompute(ws, aml_compute_target)\n",
" print(\"Found existing compute target: {}\".format(aml_compute_target))\n",
"except:\n",
" print(\"Creating new compute target: {}\".format(aml_compute_target))\n",
" \n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\",\n",
" min_nodes = 1, \n",
" max_nodes = 4) \n",
" aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)\n",
" aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n", "\n",
"# source_directory\n", "# source_directory\n",
"source_directory = '.'\n", "source_directory = '.'\n",

View File

@@ -139,7 +139,31 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"compute_target = ws.get_default_compute_target(\"CPU\")" "# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"cpu-cluster\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 4)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = 1, timeout_in_minutes = 10)\n",
" \n",
" # For a more detailed view of current AmlCompute status, use get_status()."
] ]
}, },
{ {

View File

@@ -0,0 +1,8 @@
name: aml-pipelines-with-automated-machine-learning-step
dependencies:
- pip:
- azureml-sdk
- azureml-train-automl
- azureml-widgets
- matplotlib
- pandas_ml

View File

@@ -127,7 +127,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Retrieve or create a Aml compute\n", "#### Retrieve or create an Aml compute\n",
"Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's get the default Aml Compute in the current workspace. We will then run the training script on this compute target." "Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's get the default Aml Compute in the current workspace. We will then run the training script on this compute target."
] ]
}, },
@@ -137,7 +137,22 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"aml_compute = ws.get_default_compute_target(\"CPU\")\n" "from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"aml_compute_target = \"cpu-cluster\"\n",
"try:\n",
" aml_compute = AmlCompute(ws, aml_compute_target)\n",
" print(\"found existing compute target.\")\n",
"except ComputeTargetException:\n",
" print(\"creating new compute target\")\n",
" \n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\",\n",
" min_nodes = 1, \n",
" max_nodes = 4) \n",
" aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)\n",
" aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
" \n",
"print(\"Aml Compute attached\")\n"
] ]
}, },
{ {
@@ -418,6 +433,77 @@
"RunDetails(pipeline_run1).show()" "RunDetails(pipeline_run1).show()"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Wait for pipeline run to complete"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_run1.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### See Outputs\n",
"\n",
"See where outputs of each pipeline step are located on your datastore.\n",
"\n",
"***Wait for pipeline run to complete, to make sure all the outputs are ready***"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get Steps\n",
"for step in pipeline_run1.get_steps():\n",
" print(\"Outputs of step \" + step.name)\n",
" \n",
" # Get a dictionary of StepRunOutputs with the output name as the key \n",
" output_dict = step.get_outputs()\n",
" \n",
" for name, output in output_dict.items():\n",
" \n",
" output_reference = output.get_port_data_reference() # Get output port data reference\n",
" print(\"\\tname: \" + name)\n",
" print(\"\\tdatastore: \" + output_reference.datastore_name)\n",
" print(\"\\tpath on datastore: \" + output_reference.path_on_datastore)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download Outputs\n",
"\n",
"We can download the output of any step to our local machine using the SDK."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Retrieve the step runs by name 'train.py'\n",
"train_step = pipeline_run1.find_step_run('train.py')\n",
"\n",
"if train_step:\n",
" train_step_obj = train_step[0] # since we have only one step by name 'train.py'\n",
" train_step_obj.get_output_data('processed_data1').download(\"./outputs\") # download the output to current directory"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

View File

@@ -0,0 +1,5 @@
name: aml-pipelines-with-data-dependency-steps
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -0,0 +1,24 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license.
import argparse
import os
print("In compare.py")
print("As a data scientist, this is where I use my compare code.")
parser = argparse.ArgumentParser("compare")
parser.add_argument("--compare_data1", type=str, help="compare_data1 data")
parser.add_argument("--compare_data2", type=str, help="compare_data2 data")
parser.add_argument("--output_compare", type=str, help="output_compare directory")
parser.add_argument("--pipeline_param", type=int, help="pipeline parameter")
args = parser.parse_args()
print("Argument 1: %s" % args.compare_data1)
print("Argument 2: %s" % args.compare_data2)
print("Argument 3: %s" % args.output_compare)
print("Argument 4: %s" % args.pipeline_param)
if not (args.output_compare is None):
os.makedirs(args.output_compare, exist_ok=True)
print("%s created" % args.output_compare)

View File

@@ -0,0 +1,21 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license.
import argparse
import os
print("In extract.py")
print("As a data scientist, this is where I use my extract code.")
parser = argparse.ArgumentParser("extract")
parser.add_argument("--input_extract", type=str, help="input_extract data")
parser.add_argument("--output_extract", type=str, help="output_extract directory")
args = parser.parse_args()
print("Argument 1: %s" % args.input_extract)
print("Argument 2: %s" % args.output_extract)
if not (args.output_extract is None):
os.makedirs(args.output_extract, exist_ok=True)
print("%s created" % args.output_extract)

View File

@@ -0,0 +1,22 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license.
import argparse
import os
print("In train.py")
print("As a data scientist, this is where I use my training code.")
parser = argparse.ArgumentParser("train")
parser.add_argument("--input_data", type=str, help="input data")
parser.add_argument("--output_train", type=str, help="output_train directory")
args = parser.parse_args()
print("Argument 1: %s" % args.input_data)
print("Argument 2: %s" % args.output_train)
if not (args.output_train is None):
os.makedirs(args.output_train, exist_ok=True)
print("%s created" % args.output_train)

View File

@@ -0,0 +1,9 @@
name: nyc-taxi-data-regression-model-building
dependencies:
- pip:
- azureml-sdk
- azureml-widgets
- azureml-contrib-opendatasets
- azureml-dataprep
- azureml-train-automl
- matplotlib

View File

@@ -0,0 +1,58 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license.
import argparse
import os
import pandas as pd
import azureml.dataprep as dprep
def get_dict(dict_str):
pairs = dict_str.strip("{}").split("\;")
new_dict = {}
for pair in pairs:
key, value = pair.strip('\\').split(":")
new_dict[key.strip().strip("'")] = value.strip().strip("'")
return new_dict
print("Cleans the input data")
parser = argparse.ArgumentParser("cleanse")
parser.add_argument("--input_cleanse", type=str, help="raw taxi data")
parser.add_argument("--output_cleanse", type=str, help="cleaned taxi data directory")
parser.add_argument("--useful_columns", type=str, help="useful columns to keep")
parser.add_argument("--columns", type=str, help="rename column pattern")
args = parser.parse_args()
print("Argument 1(input taxi data path): %s" % args.input_cleanse)
print("Argument 2(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;")))
print("Argument 3(columns renaming mapping): %s" % str(args.columns.strip("{}").split("\;")))
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)
raw_df = dprep.read_csv(path=args.input_cleanse, header=dprep.PromoteHeadersMode.GROUPED)
# These functions ensure that null data is removed from the data set,
# which will help increase machine learning model accuracy.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep
# for more details
useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;")]
columns = get_dict(args.columns)
all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)]
new_df = (raw_df
.replace_na(columns=all_columns)
.drop_nulls(*drop_if_all_null)
.rename_columns(column_pairs=columns)
.keep_columns(columns=useful_columns))
if not (args.output_cleanse is None):
os.makedirs(args.output_cleanse, exist_ok=True)
print("%s created" % args.output_cleanse)
write_df = new_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_cleanse))
write_df.run_local()

View File

@@ -0,0 +1,55 @@
import argparse
import os
import azureml.dataprep as dprep
print("Filters out coordinates for locations that are outside the city border.",
"Chain the column filter commands within the filter() function",
"and define the minimum and maximum bounds for each field.")
parser = argparse.ArgumentParser("filter")
parser.add_argument("--input_filter", type=str, help="merged taxi data directory")
parser.add_argument("--output_filter", type=str, help="filter out out of city locations")
args = parser.parse_args()
print("Argument 1(input taxi data path): %s" % args.input_filter)
print("Argument 2(output filtered taxi data path): %s" % args.output_filter)
combined_df = dprep.read_csv(args.input_filter + '/part-*')
# These functions filter out coordinates for locations that are outside the city border.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details
# Create a condensed view of the dataflow to just show the lat/long fields,
# which makes it easier to evaluate missing or out-of-scope coordinates
decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)
combined_df = combined_df.set_column_types(type_conversions={
"pickup_longitude": decimal_type,
"pickup_latitude": decimal_type,
"dropoff_longitude": decimal_type,
"dropoff_latitude": decimal_type
})
# Filter out coordinates for locations that are outside the city border.
# Chain the column filter commands within the filter() function
# and define the minimum and maximum bounds for each field
latlong_filtered_df = (combined_df
.drop_nulls(columns=["pickup_longitude",
"pickup_latitude",
"dropoff_longitude",
"dropoff_latitude"],
column_relationship=dprep.ColumnRelationship(dprep.ColumnRelationship.ANY))
.filter(dprep.f_and(dprep.col("pickup_longitude") <= -73.72,
dprep.col("pickup_longitude") >= -74.09,
dprep.col("pickup_latitude") <= 40.88,
dprep.col("pickup_latitude") >= 40.53,
dprep.col("dropoff_longitude") <= -73.72,
dprep.col("dropoff_longitude") >= -74.09,
dprep.col("dropoff_latitude") <= 40.88,
dprep.col("dropoff_latitude") >= 40.53)))
if not (args.output_filter is None):
os.makedirs(args.output_filter, exist_ok=True)
print("%s created" % args.output_filter)
write_df = latlong_filtered_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_filter))
write_df.run_local()

View File

@@ -0,0 +1,29 @@
import argparse
import os
import azureml.dataprep as dprep
print("Merge Green and Yellow taxi data")
parser = argparse.ArgumentParser("merge")
parser.add_argument("--input_green_merge", type=str, help="cleaned green taxi data directory")
parser.add_argument("--input_yellow_merge", type=str, help="cleaned yellow taxi data directory")
parser.add_argument("--output_merge", type=str, help="green and yellow taxi data merged")
args = parser.parse_args()
print("Argument 1(input green taxi data path): %s" % args.input_green_merge)
print("Argument 2(input yellow taxi data path): %s" % args.input_yellow_merge)
print("Argument 3(output merge taxi data path): %s" % args.output_merge)
green_df = dprep.read_csv(args.input_green_merge + '/part-*')
yellow_df = dprep.read_csv(args.input_yellow_merge + '/part-*')
# Appending yellow data to green data
combined_df = green_df.append_rows([yellow_df])
if not (args.output_merge is None):
os.makedirs(args.output_merge, exist_ok=True)
print("%s created" % args.output_merge)
write_df = combined_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_merge))
write_df.run_local()

View File

@@ -0,0 +1,47 @@
import argparse
import os
import azureml.dataprep as dprep
print("Replace undefined values to relavant values and rename columns to meaningful names")
parser = argparse.ArgumentParser("normalize")
parser.add_argument("--input_normalize", type=str, help="combined and converted taxi data")
parser.add_argument("--output_normalize", type=str, help="replaced undefined values and renamed columns")
args = parser.parse_args()
print("Argument 1(input taxi data path): %s" % args.input_normalize)
print("Argument 2(output normalized taxi data path): %s" % args.output_normalize)
combined_converted_df = dprep.read_csv(args.input_normalize + '/part-*')
# These functions replace undefined values and rename to use meaningful names.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details
replaced_stfor_vals_df = combined_converted_df.replace(columns="store_forward",
find="0",
replace_with="N").fill_nulls("store_forward", "N")
replaced_distance_vals_df = replaced_stfor_vals_df.replace(columns="distance",
find=".00",
replace_with=0).fill_nulls("distance", 0)
replaced_distance_vals_df = replaced_distance_vals_df.to_number(["distance"])
time_split_df = (replaced_distance_vals_df
.split_column_by_example(source_column="pickup_datetime")
.split_column_by_example(source_column="dropoff_datetime"))
# Split the pickup and dropoff datetime values into the respective date and time columns
renamed_col_df = (time_split_df
.rename_columns(column_pairs={
"pickup_datetime_1": "pickup_date",
"pickup_datetime_2": "pickup_time",
"dropoff_datetime_1": "dropoff_date",
"dropoff_datetime_2": "dropoff_time"}))
if not (args.output_normalize is None):
os.makedirs(args.output_normalize, exist_ok=True)
print("%s created" % args.output_normalize)
write_df = renamed_col_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_normalize))
write_df.run_local()

View File

@@ -0,0 +1,88 @@
import argparse
import os
import azureml.dataprep as dprep
print("Transforms the renamed taxi data to the required format")
parser = argparse.ArgumentParser("transform")
parser.add_argument("--input_transform", type=str, help="renamed taxi data")
parser.add_argument("--output_transform", type=str, help="transformed taxi data")
args = parser.parse_args()
print("Argument 1(input taxi data path): %s" % args.input_transform)
print("Argument 2(output final transformed taxi data): %s" % args.output_transform)
renamed_df = dprep.read_csv(args.input_transform + '/part-*')
# These functions transform the renamed data to be used finally for training.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details
# Split the pickup and dropoff date further into the day of the week, day of the month, and month values.
# To get the day of the week value, use the derive_column_by_example() function.
# The function takes an array parameter of example objects that define the input data,
# and the preferred output. The function automatically determines your preferred transformation.
# For the pickup and dropoff time columns, split the time into the hour, minute, and second by using
# the split_column_by_example() function with no example parameter. After you generate the new features,
# use the drop_columns() function to delete the original fields as the newly generated features are preferred.
# Rename the rest of the fields to use meaningful descriptions.
transformed_features_df = (renamed_df
.derive_column_by_example(
source_columns="pickup_date",
new_column_name="pickup_weekday",
example_data=[("2009-01-04", "Sunday"), ("2013-08-22", "Thursday")])
.derive_column_by_example(
source_columns="dropoff_date",
new_column_name="dropoff_weekday",
example_data=[("2013-08-22", "Thursday"), ("2013-11-03", "Sunday")])
.split_column_by_example(source_column="pickup_time")
.split_column_by_example(source_column="dropoff_time")
.split_column_by_example(source_column="pickup_time_1")
.split_column_by_example(source_column="dropoff_time_1")
.drop_columns(columns=[
"pickup_date", "pickup_time", "dropoff_date", "dropoff_time",
"pickup_date_1", "dropoff_date_1", "pickup_time_1", "dropoff_time_1"])
.rename_columns(column_pairs={
"pickup_date_2": "pickup_month",
"pickup_date_3": "pickup_monthday",
"pickup_time_1_1": "pickup_hour",
"pickup_time_1_2": "pickup_minute",
"pickup_time_2": "pickup_second",
"dropoff_date_2": "dropoff_month",
"dropoff_date_3": "dropoff_monthday",
"dropoff_time_1_1": "dropoff_hour",
"dropoff_time_1_2": "dropoff_minute",
"dropoff_time_2": "dropoff_second"}))
# Drop the pickup_datetime and dropoff_datetime columns because they're
# no longer needed (granular time features like hour,
# minute and second are more useful for model training).
processed_df = transformed_features_df.drop_columns(columns=["pickup_datetime", "dropoff_datetime"])
# Use the type inference functionality to automatically check the data type of each field,
# and display the inference results.
type_infer = processed_df.builders.set_column_types()
type_infer.learn()
# The inference results look correct based on the data. Now apply the type conversions to the dataflow.
type_converted_df = type_infer.to_dataflow()
# Before you package the dataflow, run two final filters on the data set.
# To eliminate incorrectly captured data points,
# filter the dataflow on records where both the cost and distance variable values are greater than zero.
# This step will significantly improve machine learning model accuracy,
# because data points with a zero cost or distance represent major outliers that throw off prediction accuracy.
final_df = type_converted_df.filter(dprep.col("distance") > 0)
final_df = final_df.filter(dprep.col("cost") > 0)
# Writing the final dataframe to use for training in the following steps
if not (args.output_transform is None):
os.makedirs(args.output_transform, exist_ok=True)
print("%s created" % args.output_transform)
write_df = final_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_transform))
write_df.run_local()

Some files were not shown because too many files have changed in this diff Show More