mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-22 18:42:41 -05:00
Added setup and configuration files
This commit is contained in:
@@ -183,9 +183,13 @@ bash automl_setup_linux.sh
|
|||||||
- Simple example of using Auto ML for classification with whitelisting tensorflow models.checkout
|
- Simple example of using Auto ML for classification with whitelisting tensorflow models.checkout
|
||||||
- Uses local compute for training
|
- Uses local compute for training
|
||||||
|
|
||||||
- [auto-ml-timeseries.ipynb](timeseries/auto-ml-timeseries.ipynb)
|
- [auto-ml-forecasting-a.ipynb](forecasting-a/auto-ml-forecasting-a.ipynb)
|
||||||
- Dataset: NYC energy demanding data
|
- Dataset: [NYC energy demand data](forecasting-a/nyc_energy.csv)
|
||||||
- Example of using AutoML for timeseries data training
|
- Example of using AutoML for training a forecasting model
|
||||||
|
|
||||||
|
- [auto-ml-forecasting-b.ipynb](forecasting-b/auto-ml-forecasting-b.ipynb)
|
||||||
|
- Dataset: [Dominick's grocery sales of orange juice](forecasting-b/dominicks_OJ.csv)
|
||||||
|
- Example of training an AutoML forecasting model on multiple time-series
|
||||||
|
|
||||||
<a name="documentation"></a>
|
<a name="documentation"></a>
|
||||||
# Documentation
|
# Documentation
|
||||||
|
|||||||
32
how-to-use-azureml/automated-machine-learning/automl_env.yml
Normal file
32
how-to-use-azureml/automated-machine-learning/automl_env.yml
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
name: azure_automl
|
||||||
|
dependencies:
|
||||||
|
# The python interpreter version.
|
||||||
|
# Currently Azure ML only supports 3.5.2 and later.
|
||||||
|
- python=3.6
|
||||||
|
- nb_conda
|
||||||
|
- matplotlib==2.1.0
|
||||||
|
- numpy>=1.11.0,<1.15.0
|
||||||
|
- cython
|
||||||
|
- urllib3<1.24
|
||||||
|
- scipy>=1.0.0,<=1.1.0
|
||||||
|
- scikit-learn>=0.18.0,<=0.19.1
|
||||||
|
- pandas>=0.22.0,<0.23.0
|
||||||
|
- tensorflow>=1.12.0
|
||||||
|
|
||||||
|
# Required for azuremlftk
|
||||||
|
- dill
|
||||||
|
- pyodbc
|
||||||
|
- statsmodels
|
||||||
|
- numexpr
|
||||||
|
- keras
|
||||||
|
- distributed>=1.21.5,<1.24
|
||||||
|
|
||||||
|
- pip:
|
||||||
|
|
||||||
|
# Required for azuremlftk
|
||||||
|
- https://azuremlpackages.blob.core.windows.net/forecasting/azuremlftk-0.1.18323.5a1-py3-none-any.whl
|
||||||
|
|
||||||
|
# Required packages for AzureML execution, history, and data preparation.
|
||||||
|
- azureml-sdk[automl,notebooks,explain]
|
||||||
|
- pandas_ml
|
||||||
|
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
name: azure_automl
|
||||||
|
dependencies:
|
||||||
|
# The python interpreter version.
|
||||||
|
# Currently Azure ML only supports 3.5.2 and later.
|
||||||
|
- python=3.6
|
||||||
|
- nb_conda
|
||||||
|
- matplotlib==2.1.0
|
||||||
|
- numpy>=1.15.3
|
||||||
|
- cython
|
||||||
|
- urllib3<1.24
|
||||||
|
- scipy>=1.0.0,<=1.1.0
|
||||||
|
- scikit-learn>=0.18.0,<=0.19.1
|
||||||
|
- pandas>=0.22.0,<0.23.0
|
||||||
|
- tensorflow>=1.12.0
|
||||||
|
|
||||||
|
# Required for azuremlftk
|
||||||
|
- dill
|
||||||
|
- pyodbc
|
||||||
|
- statsmodels
|
||||||
|
- numexpr
|
||||||
|
- keras
|
||||||
|
- distributed>=1.21.5,<1.24
|
||||||
|
|
||||||
|
- pip:
|
||||||
|
|
||||||
|
# Required for azuremlftk
|
||||||
|
- https://azuremlpackages.blob.core.windows.net/forecasting/azuremlftk-0.1.18323.5a1-py3-none-any.whl
|
||||||
|
|
||||||
|
# Required packages for AzureML execution, history, and data preparation.
|
||||||
|
- azureml-sdk[automl,notebooks,explain]
|
||||||
|
- pandas_ml
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
@echo off
REM Creates (or upgrades) the conda environment used by the AutoML sample
REM notebooks, registers the Jupyter kernel and the azureml widgets
REM extension, then starts Jupyter.
REM
REM Arguments (both optional):
REM   first  - conda environment name (default: "azure_automl")
REM   second - conda environment file (default: "automl_env.yml")
set conda_env_name=%1
set automl_env_file=%2
set PIP_NO_WARN_SCRIPT_LOCATION=0

IF "%conda_env_name%"=="" SET conda_env_name="azure_automl"
IF "%automl_env_file%"=="" SET automl_env_file="automl_env.yml"

IF NOT EXIST %automl_env_file% GOTO YmlMissing

REM A successful activation is taken to mean the environment already exists,
REM in which case the SDK is upgraded in place; otherwise it is created.
call conda activate %conda_env_name% 2>nul:

if not errorlevel 1 (
  echo Upgrading azureml-sdk[automl] in existing conda environment %conda_env_name%
  call pip install --upgrade azureml-sdk[automl,notebooks]
  if errorlevel 1 goto ErrorExit
) else (
  call conda env create -f %automl_env_file% -n %conda_env_name%
  if errorlevel 1 goto ErrorExit
)

call conda activate %conda_env_name% 2>nul:
if errorlevel 1 goto ErrorExit

call pip install psutil

REM Stop on kernel registration failure, matching the bash setup scripts
REM where this step is part of the && chain.
call python -m ipykernel install --user --name %conda_env_name% --display-name "Python (%conda_env_name%)"
if errorlevel 1 goto ErrorExit

call jupyter nbextension install --py azureml.widgets --user
if errorlevel 1 goto ErrorExit

call jupyter nbextension enable --py azureml.widgets --user
if errorlevel 1 goto ErrorExit

echo.
echo.
echo ***************************************
echo * AutoML setup completed successfully *
echo ***************************************
echo.
echo Starting jupyter notebook - please run the configuration notebook
echo.
jupyter notebook --log-level=50

goto End

REM Falls through to ErrorExit so that "Install failed" is printed as well.
:YmlMissing
echo File %automl_env_file% not found.

:ErrorExit
echo Install failed

:End
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
#!/bin/bash
#
# Creates (or upgrades) the conda environment used by the AutoML sample
# notebooks. When the environment is newly created, the Jupyter kernel and
# the azureml widgets extension are registered and Jupyter is started.
#
# Arguments (both optional):
#   $1  conda environment name (default: azure_automl)
#   $2  conda environment file (default: automl_env.yml)
#
# Exits with status 1 when the environment file does not exist. Any other
# failure is reported with the message "Installation failed".

CONDA_ENV_NAME=$1
AUTOML_ENV_FILE=$2
# Exported so that the pip processes started below actually see the setting;
# a plain assignment is not passed on to child processes.
export PIP_NO_WARN_SCRIPT_LOCATION=0

if [ "$CONDA_ENV_NAME" == "" ]
then
  CONDA_ENV_NAME="azure_automl"
fi

if [ "$AUTOML_ENV_FILE" == "" ]
then
  AUTOML_ENV_FILE="automl_env.yml"
fi

# Quoted so that a path containing spaces is tested as a single file name.
if [ ! -f "$AUTOML_ENV_FILE" ]; then
  echo "File $AUTOML_ENV_FILE not found"
  exit 1
fi

# A successful activation is taken to mean the environment already exists,
# in which case only the SDK is upgraded.
if source activate "$CONDA_ENV_NAME" 2> /dev/null
then
  echo "Upgrading azureml-sdk[automl] in existing conda environment" "$CONDA_ENV_NAME"
  # Quoted so the brackets are never treated as a glob pattern.
  pip install --upgrade "azureml-sdk[automl,notebooks]"
else
  # Every step is chained with && so the success banner and Jupyter only
  # run when all preceding steps succeeded.
  conda env create -f "$AUTOML_ENV_FILE" -n "$CONDA_ENV_NAME" &&
  source activate "$CONDA_ENV_NAME" &&
  python -m ipykernel install --user --name "$CONDA_ENV_NAME" --display-name "Python ($CONDA_ENV_NAME)" &&
  jupyter nbextension install --py azureml.widgets --user &&
  jupyter nbextension enable --py azureml.widgets --user &&
  echo "" &&
  echo "" &&
  echo "***************************************" &&
  echo "* AutoML setup completed successfully *" &&
  echo "***************************************" &&
  echo "" &&
  echo "Starting jupyter notebook - please run the configuration notebook" &&
  echo "" &&
  jupyter notebook --log-level=50
fi

# $? is the status of the last command run inside the if/else above.
if [ $? -gt 0 ]
then
  echo "Installation failed"
fi
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,51 @@
|
|||||||
|
#!/bin/bash
#
# Creates (or upgrades) the conda environment used by the AutoML sample
# notebooks. When the environment is newly created, lightgbm is installed
# from conda-forge, the Jupyter kernel and the azureml widgets extension are
# registered, numpy is pinned, and Jupyter is started.
#
# Arguments (both optional):
#   $1  conda environment name (default: azure_automl)
#   $2  conda environment file (default: automl_env_mac.yml)
#
# Exits with status 1 when the environment file does not exist. Any other
# failure is reported with the message "Installation failed".

CONDA_ENV_NAME=$1
AUTOML_ENV_FILE=$2
# Exported so that the pip processes started below actually see the setting;
# a plain assignment is not passed on to child processes.
export PIP_NO_WARN_SCRIPT_LOCATION=0

if [ "$CONDA_ENV_NAME" == "" ]
then
  CONDA_ENV_NAME="azure_automl"
fi

if [ "$AUTOML_ENV_FILE" == "" ]
then
  AUTOML_ENV_FILE="automl_env_mac.yml"
fi

# Quoted so that a path containing spaces is tested as a single file name.
if [ ! -f "$AUTOML_ENV_FILE" ]; then
  echo "File $AUTOML_ENV_FILE not found"
  exit 1
fi

# A successful activation is taken to mean the environment already exists,
# in which case only the SDK is upgraded.
if source activate "$CONDA_ENV_NAME" 2> /dev/null
then
  echo "Upgrading azureml-sdk[automl] in existing conda environment" "$CONDA_ENV_NAME"
  # Quoted so the brackets are never treated as a glob pattern.
  pip install --upgrade "azureml-sdk[automl,notebooks]"
else
  # Every step, including the numpy pin, is chained with && so the success
  # banner and Jupyter only run when all preceding steps succeeded.
  conda env create -f "$AUTOML_ENV_FILE" -n "$CONDA_ENV_NAME" &&
  source activate "$CONDA_ENV_NAME" &&
  conda install lightgbm -c conda-forge -y &&
  python -m ipykernel install --user --name "$CONDA_ENV_NAME" --display-name "Python ($CONDA_ENV_NAME)" &&
  jupyter nbextension install --py azureml.widgets --user &&
  jupyter nbextension enable --py azureml.widgets --user &&
  pip install numpy==1.15.3 &&
  echo "" &&
  echo "" &&
  echo "***************************************" &&
  echo "* AutoML setup completed successfully *" &&
  echo "***************************************" &&
  echo "" &&
  echo "Starting jupyter notebook - please run the configuration notebook" &&
  echo "" &&
  jupyter notebook --log-level=50
fi

# $? is the status of the last command run inside the if/else above.
if [ $? -gt 0 ]
then
  echo "Installation failed"
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,568 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||||
|
"\n",
|
||||||
|
"Licensed under the MIT License."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Automated Machine Learning: Classification local on Azure DataBricks\n",
|
||||||
|
"\n",
|
||||||
|
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n",
|
||||||
|
"\n",
|
||||||
|
"In this notebook you will learn how to:\n",
|
||||||
|
"1. Create Azure Machine Learning Workspace object and initialize your notebook directory to easily reload this object from a configuration file.\n",
|
||||||
|
"2. Create an `Experiment` in an existing `Workspace`.\n",
|
||||||
|
"3. Configure AutoML using `AutoMLConfig`.\n",
|
||||||
|
"4. Train the model using AzureDataBricks.\n",
|
||||||
|
"5. Explore the results.\n",
|
||||||
|
"6. Test the best fitted model.\n",
|
||||||
|
"\n",
|
||||||
|
"Prerequisites:\n",
|
||||||
|
"Before running this notebook, run the install instructions described in README.md."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Register Machine Learning Services Resource Provider\n",
|
||||||
|
"Microsoft.MachineLearningServices only needs to be registered once in the subscription. To register it:\n",
|
||||||
|
"Start the Azure portal.\n",
|
||||||
|
"Select All services and then Subscriptions.\n",
|
||||||
|
"Select the subscription that you want to use.\n",
|
||||||
|
"Click on Resource providers\n",
|
||||||
|
"Click the Register link next to Microsoft.MachineLearningServices"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Check the Azure ML Core SDK Version to Validate Your Installation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import azureml.core\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"SDK Version:\", azureml.core.VERSION)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Initialize an Azure ML Workspace\n",
|
||||||
|
"### What is an Azure ML Workspace and Why Do I Need One?\n",
|
||||||
|
"\n",
|
||||||
|
"An Azure ML workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, operationalization, and the monitoring of operationalized models.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"### What do I Need?\n",
|
||||||
|
"\n",
|
||||||
|
"To create or access an Azure ML workspace, you will need to import the Azure ML library and specify the following information:\n",
|
||||||
|
"* A name for your workspace. You can choose one.\n",
|
||||||
|
"* Your subscription id. Use the `id` value from the `az account show` command output above.\n",
|
||||||
|
"* The resource group name. The resource group organizes Azure resources and provides a default region for the resources in the group. The resource group will be created if it doesn't exist. Resource groups can be created and viewed in the [Azure portal](https://portal.azure.com)\n",
|
||||||
|
"* Supported regions include `eastus2`, `eastus`,`westcentralus`, `southeastasia`, `westeurope`, `australiaeast`, `westus2`, `southcentralus`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"subscription_id = \"<SubscriptionId>\"\n",
|
||||||
|
"resource_group = \"myrg\"\n",
|
||||||
|
"workspace_name = \"myws\"\n",
|
||||||
|
"workspace_region = \"eastus2\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating a Workspace\n",
|
||||||
|
"If you already have access to an Azure ML workspace you want to use, you can skip this cell. Otherwise, this cell will create an Azure ML workspace for you in the specified subscription, provided you have the correct permissions for the given `subscription_id`.\n",
|
||||||
|
"\n",
|
||||||
|
"This will fail when:\n",
|
||||||
|
"1. The workspace already exists.\n",
|
||||||
|
"2. You do not have permission to create a workspace in the resource group.\n",
|
||||||
|
"3. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription.\n",
|
||||||
|
"\n",
|
||||||
|
"If workspace creation fails for any reason other than already existing, please work with your IT administrator to provide you with the appropriate permissions or to provision the required resources.\n",
|
||||||
|
"\n",
|
||||||
|
"**Note:** Creation of a new workspace can take several minutes."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the Workspace class and check the Azure ML SDK version.\n",
|
||||||
|
"from azureml.core import Workspace\n",
|
||||||
|
"\n",
|
||||||
|
"ws = Workspace.create(name = workspace_name,\n",
|
||||||
|
" subscription_id = subscription_id,\n",
|
||||||
|
" resource_group = resource_group, \n",
|
||||||
|
" location = workspace_region,\n",
|
||||||
|
" exist_ok=True)\n",
|
||||||
|
"ws.get_details()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configuring Your Local Environment\n",
|
||||||
|
"You can validate that you have access to the specified workspace and write a configuration file to the default configuration location, `./aml_config/config.json`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.core import Workspace\n",
|
||||||
|
"\n",
|
||||||
|
"ws = Workspace(workspace_name = workspace_name,\n",
|
||||||
|
" subscription_id = subscription_id,\n",
|
||||||
|
" resource_group = resource_group)\n",
|
||||||
|
"\n",
|
||||||
|
"# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
|
||||||
|
"ws.write_config()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Create a Folder to Host Sample Projects\n",
|
||||||
|
"Finally, create a folder where all the sample projects will be hosted."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"sample_projects_folder = './sample_projects'\n",
|
||||||
|
"\n",
|
||||||
|
"if not os.path.isdir(sample_projects_folder):\n",
|
||||||
|
" os.mkdir(sample_projects_folder)\n",
|
||||||
|
" \n",
|
||||||
|
"print('Sample projects will be created in {}.'.format(sample_projects_folder))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Create an Experiment\n",
|
||||||
|
"\n",
|
||||||
|
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import logging\n",
|
||||||
|
"import os\n",
|
||||||
|
"import random\n",
|
||||||
|
"import time\n",
|
||||||
|
"\n",
|
||||||
|
"from matplotlib import pyplot as plt\n",
|
||||||
|
"from matplotlib.pyplot import imshow\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"import azureml.core\n",
|
||||||
|
"from azureml.core.experiment import Experiment\n",
|
||||||
|
"from azureml.core.workspace import Workspace\n",
|
||||||
|
"from azureml.train.automl import AutoMLConfig\n",
|
||||||
|
"from azureml.train.automl.run import AutoMLRun"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"ws = Workspace.from_config()\n",
|
||||||
|
"\n",
|
||||||
|
"# Choose a name for the experiment and specify the project folder.\n",
|
||||||
|
"experiment_name = 'automl-local-classification'\n",
|
||||||
|
"project_folder = './sample_projects/automl-local-classification'\n",
|
||||||
|
"\n",
|
||||||
|
"experiment = Experiment(ws, experiment_name)\n",
|
||||||
|
"\n",
|
||||||
|
"output = {}\n",
|
||||||
|
"output['SDK version'] = azureml.core.VERSION\n",
|
||||||
|
"output['Subscription ID'] = ws.subscription_id\n",
|
||||||
|
"output['Workspace Name'] = ws.name\n",
|
||||||
|
"output['Resource Group'] = ws.resource_group\n",
|
||||||
|
"output['Location'] = ws.location\n",
|
||||||
|
"output['Project Directory'] = project_folder\n",
|
||||||
|
"output['Experiment Name'] = experiment.name\n",
|
||||||
|
"pd.set_option('display.max_colwidth', -1)\n",
|
||||||
|
"pd.DataFrame(data = output, index = ['']).T"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Diagnostics\n",
|
||||||
|
"\n",
|
||||||
|
"Opt-in diagnostics for better experience, quality, and security of future releases."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.telemetry import set_diagnostics_collection\n",
|
||||||
|
"set_diagnostics_collection(send_diagnostics = True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Load Training Data Using DataPrep\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import azureml.dataprep as dprep\n",
|
||||||
|
"# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
|
||||||
|
"# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n",
|
||||||
|
"simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
|
||||||
|
"X = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
|
||||||
|
"\n",
|
||||||
|
"# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
|
||||||
|
"# and convert column types manually.\n",
|
||||||
|
"# Here we read a comma delimited file and convert all columns to integers.\n",
|
||||||
|
"y = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"## Review the Data Preparation Result\n",
|
||||||
|
"You can peek at the result of a Dataflow at any range using skip(i) and head(j). Doing so evaluates only j records for all the steps in the Dataflow, which makes it fast even against large datasets."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X.skip(1).head(5)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configure AutoML\n",
|
||||||
|
"\n",
|
||||||
|
"Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n",
|
||||||
|
"\n",
|
||||||
|
"|Property|Description|\n",
|
||||||
|
"|-|-|\n",
|
||||||
|
"|**task**|classification or regression|\n",
|
||||||
|
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
|
||||||
|
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
|
||||||
|
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
|
||||||
|
"|**n_cross_validations**|Number of cross validation splits.|\n",
|
||||||
|
"|**spark_context**|Spark Context object.|\n",
|
||||||
|
"|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be less than the number of cores on the ADB.|\n",
|
||||||
|
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
|
||||||
|
"|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]<br>Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers.|\n",
|
||||||
|
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"automl_settings = {\n",
|
||||||
|
" \"iteration_timeout_minutes\": 10,\n",
|
||||||
|
" \"iterations\": 10,\n",
|
||||||
|
" \"n_cross_validations\": 5,\n",
|
||||||
|
" \"primary_metric\": 'AUC_weighted',\n",
|
||||||
|
" \"preprocess\": False,\n",
|
||||||
|
" \"max_concurrent_iterations\": 2,\n",
|
||||||
|
" \"verbosity\": logging.INFO,\n",
|
||||||
|
" \"spark_context\": sc\n",
|
||||||
|
"}\n",
|
||||||
|
" \n",
|
||||||
|
"automl_config = AutoMLConfig(task = 'classification',\n",
|
||||||
|
" debug_log = 'automl_errors.log',\n",
|
||||||
|
" path = project_folder, \n",
|
||||||
|
" X = X, \n",
|
||||||
|
" y = y,\n",
|
||||||
|
" **automl_settings\n",
|
||||||
|
" )\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Train the Models\n",
|
||||||
|
"\n",
|
||||||
|
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
|
||||||
|
"In this example, we specify `show_output = False`; the running iterations are printed later by calling `wait_for_completion(show_output = True)`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"local_run = experiment.submit(automl_config, show_output = False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Explore the Results"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Portal URL for Monitoring Runs\n",
|
||||||
|
"\n",
|
||||||
|
"The following will provide a link to the web interface to explore individual run details and status."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"print(local_run.get_portal_url())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"The following will show the child runs and waits for the parent run to complete."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"local_run.wait_for_completion(show_output = True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Retrieve All Child Runs\n",
|
||||||
|
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"children = list(local_run.get_children())\n",
|
||||||
|
"metricslist = {}\n",
|
||||||
|
"for run in children:\n",
|
||||||
|
" properties = run.get_properties()\n",
|
||||||
|
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
|
||||||
|
" metricslist[int(properties['iteration'])] = metrics\n",
|
||||||
|
"\n",
|
||||||
|
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
|
||||||
|
"rundata"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Retrieve the Best Model\n",
|
||||||
|
"\n",
|
||||||
|
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"best_run, fitted_model = local_run.get_output()\n",
|
||||||
|
"print(best_run)\n",
|
||||||
|
"print(fitted_model)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Best Model Based on Any Other Metric\n",
|
||||||
|
"Show the run and the model that has the smallest `log_loss` value:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"lookup_metric = \"log_loss\"\n",
|
||||||
|
"best_run, fitted_model = local_run.get_output(metric = lookup_metric)\n",
|
||||||
|
"print(best_run)\n",
|
||||||
|
"print(fitted_model)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Model from a Specific Iteration\n",
|
||||||
|
"Show the run and the model from the third iteration:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"iteration = 3\n",
|
||||||
|
"third_run, third_model = local_run.get_output(iteration = iteration)\n",
|
||||||
|
"print(third_run)\n",
|
||||||
|
"print(third_model)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Test the Best Fitted Model\n",
|
||||||
|
"\n",
|
||||||
|
"#### Load Test Data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"digits = datasets.load_digits()\n",
|
||||||
|
"X_test = digits.data[:10, :]\n",
|
||||||
|
"y_test = digits.target[:10]\n",
|
||||||
|
"images = digits.images[:10]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Testing Our Best Fitted Model\n",
|
||||||
|
"We will try to predict 2 digits and see how our model works."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Randomly select digits and test.\n",
|
||||||
|
"for index in np.random.choice(len(y_test), 2, replace = False):\n",
|
||||||
|
" print(index)\n",
|
||||||
|
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
|
||||||
|
" label = y_test[index]\n",
|
||||||
|
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
|
||||||
|
" fig = plt.figure(1, figsize = (3,3))\n",
|
||||||
|
" ax1 = fig.add_axes((0,0,.8,.8))\n",
|
||||||
|
" ax1.set_title(title)\n",
|
||||||
|
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
|
||||||
|
" display(fig)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "savitam"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python [conda env:AutoML_ADB]",
|
||||||
|
"language": "python",
|
||||||
|
"name": "conda-env-AutoML_ADB-py"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.6"
|
||||||
|
},
|
||||||
|
"name": "auto-ml-classification-local-adb",
|
||||||
|
"notebookId": 3742842704905931
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 1
|
||||||
|
}
|
||||||
@@ -0,0 +1,154 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||||
|
"\n",
|
||||||
|
"Licensed under the MIT License."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Automated Machine Learning Configuration\n",
|
||||||
|
"\n",
|
||||||
|
"In this example you will create an Azure Machine Learning `Workspace` object and initialize your notebook directory to easily reload this object from a configuration file. Typically you will only need to run this once per notebook directory, and all other notebooks in this directory or any sub-directories will automatically use the settings you indicate here.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Check the Azure ML Core SDK Version to Validate Your Installation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import azureml.core\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"SDK Version:\", azureml.core.VERSION)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Initialize an Azure ML Workspace\n",
|
||||||
|
"### What is an Azure ML Workspace and Why Do I Need One?\n",
|
||||||
|
"\n",
|
||||||
|
"An Azure ML workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, operationalization, and the monitoring of operationalized models.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"### What do I Need?\n",
|
||||||
|
"\n",
|
||||||
|
"To create or access an Azure ML workspace, you will need to import the Azure ML library and specify the following information:\n",
|
||||||
|
"* A name for your workspace. You can choose one.\n",
|
||||||
|
"* Your subscription id. Use the `id` value from the output of the `az account show` command.\n",
|
||||||
|
"* The resource group name. The resource group organizes Azure resources and provides a default region for the resources in the group. The resource group will be created if it doesn't exist. Resource groups can be created and viewed in the [Azure portal](https://portal.azure.com).\n",
|
||||||
|
"* A region for your workspace. Supported regions include `eastus2`, `eastus`, `westcentralus`, `southeastasia`, `westeurope`, `australiaeast`, `westus2`, `southcentralus`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"subscription_id = \"<subscription_id>\"\n",
|
||||||
|
"resource_group = \"myrg\"\n",
|
||||||
|
"workspace_name = \"myws\"\n",
|
||||||
|
"workspace_region = \"eastus2\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating a Workspace\n",
|
||||||
|
"If you already have access to an Azure ML workspace you want to use, you can skip this cell. Otherwise, this cell will create an Azure ML workspace for you in the specified subscription, provided you have the correct permissions for the given `subscription_id`.\n",
|
||||||
|
"\n",
|
||||||
|
"This will fail when:\n",
|
||||||
|
"1. The workspace already exists.\n",
|
||||||
|
"2. You do not have permission to create a workspace in the resource group.\n",
|
||||||
|
"3. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription.\n",
|
||||||
|
"\n",
|
||||||
|
"If workspace creation fails for any reason other than already existing, please work with your IT administrator to provide you with the appropriate permissions or to provision the required resources.\n",
|
||||||
|
"\n",
|
||||||
|
"**Note:** Creation of a new workspace can take several minutes."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the Workspace class and create a new workspace.\n",
|
||||||
|
"from azureml.core import Workspace\n",
|
||||||
|
"\n",
|
||||||
|
"ws = Workspace.create(name = workspace_name,\n",
|
||||||
|
" subscription_id = subscription_id,\n",
|
||||||
|
" resource_group = resource_group, \n",
|
||||||
|
" location = workspace_region)\n",
|
||||||
|
"ws.get_details()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configuring Your Local Environment\n",
|
||||||
|
"You can validate that you have access to the specified workspace and write a configuration file to the default configuration location, `./aml_config/config.json`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.core import Workspace\n",
|
||||||
|
"\n",
|
||||||
|
"ws = Workspace(workspace_name = workspace_name,\n",
|
||||||
|
" subscription_id = subscription_id,\n",
|
||||||
|
" resource_group = resource_group)\n",
|
||||||
|
"\n",
|
||||||
|
"# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
|
||||||
|
"ws.write_config()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "savitam"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.6",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python36"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@@ -210,7 +210,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"The AutoMLConfig object defines the settings and data for an AutoML training job. Here, we set necessary inputs like the task type, the number of AutoML iterations to try, and the training and validation data. \n",
|
"The AutoMLConfig object defines the settings and data for an AutoML training job. Here, we set necessary inputs like the task type, the number of AutoML iterations to try, and the training and validation data. \n",
|
||||||
"\n",
|
"\n",
|
||||||
"For forecasting tasks, there are some additional parameters that can be set: the name of the input data column, holding the date/time and the grain column names. A time column is required for forecasting, while the grain is optional. If a grain is not given, the forecaster assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak. \n",
|
"For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time and the grain column names. A time column is required for forecasting, while the grain is optional. If a grain is not given, the forecaster assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak. \n",
|
||||||
"\n",
|
"\n",
|
||||||
"|Property|Description|\n",
|
"|Property|Description|\n",
|
||||||
"|-|-|\n",
|
"|-|-|\n",
|
||||||
|
|||||||
467
tutorials/regression-part2-automated-ml.ipynb
Normal file
467
tutorials/regression-part2-automated-ml.ipynb
Normal file
@@ -0,0 +1,467 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Copyright (c) Microsoft Corporation. All rights reserved."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tutorial #2: Train a regression model with automated machine learning\n",
|
||||||
|
"\n",
|
||||||
|
"This tutorial is **part two of a two-part tutorial series**. In the previous tutorial, you [prepared the NYC taxi data for regression modeling](regression-part1-data-prep.ipynb).\n",
|
||||||
|
"\n",
|
||||||
|
"Now, you're ready to start building your model with Azure Machine Learning service. In this part of the tutorial, you will use the prepared data and automatically generate a regression model to predict taxi fare prices. Using the automated ML capabilities of the service, you define your machine learning goals and constraints, launch the automated machine learning process and then allow the algorithm selection and hyperparameter-tuning to happen for you. The automated ML technique iterates over many combinations of algorithms and hyperparameters until it finds the best model based on your criterion.\n",
|
||||||
|
"\n",
|
||||||
|
"In this tutorial, you learn how to:\n",
|
||||||
|
"\n",
|
||||||
|
"> * Set up a Python environment and import the SDK packages\n",
|
||||||
|
"> * Configure an Azure Machine Learning service workspace\n",
|
||||||
|
"> * Auto-train a regression model \n",
|
||||||
|
"> * Run the model locally with custom parameters\n",
|
||||||
|
"> * Explore the results\n",
|
||||||
|
"> * Register the best model\n",
|
||||||
|
"\n",
|
||||||
|
"If you don’t have an Azure subscription, create a [free account](https://aka.ms/AMLfree) before you begin. \n",
|
||||||
|
"\n",
|
||||||
|
"> Code in this article was tested with Azure Machine Learning SDK version 1.0.0\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Prerequisites\n",
|
||||||
|
"\n",
|
||||||
|
"> * [Run the data preparation tutorial](regression-part1-data-prep.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"> * An environment configured for automated machine learning, e.g. Azure Notebooks, a local Python environment, or a Data Science Virtual Machine. [Set up](https://docs.microsoft.com/azure/machine-learning/service/samples-notebooks) automated machine learning."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Import packages\n",
|
||||||
|
"Import Python packages you need in this tutorial."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import azureml.core\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from azureml.core.workspace import Workspace\n",
|
||||||
|
"from azureml.train.automl.run import AutoMLRun\n",
|
||||||
|
"import time\n",
|
||||||
|
"import logging"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Configure workspace\n",
|
||||||
|
"\n",
|
||||||
|
"Create a workspace object from the existing workspace. A `Workspace` is a class that accepts your Azure subscription and resource information, and creates a cloud resource to monitor and track your model runs. `Workspace.from_config()` reads the file **aml_config/config.json** and loads the details into an object named `ws`. `ws` is used throughout the rest of the code in this tutorial.\n",
|
||||||
|
"\n",
|
||||||
|
"Once you have a workspace object, specify a name for the experiment and create and register a local directory with the workspace. The history of all runs is recorded under the specified experiment."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"ws = Workspace.from_config()\n",
|
||||||
|
"# choose a name for the run history container in the workspace\n",
|
||||||
|
"experiment_name = 'automated-ml-regression'\n",
|
||||||
|
"# project folder\n",
|
||||||
|
"project_folder = './automated-ml-regression'\n",
|
||||||
|
"\n",
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"output = {}\n",
|
||||||
|
"output['SDK version'] = azureml.core.VERSION\n",
|
||||||
|
"output['Subscription ID'] = ws.subscription_id\n",
|
||||||
|
"output['Workspace'] = ws.name\n",
|
||||||
|
"output['Resource Group'] = ws.resource_group\n",
|
||||||
|
"output['Location'] = ws.location\n",
|
||||||
|
"output['Project Directory'] = project_folder\n",
|
||||||
|
"pd.set_option('display.max_colwidth', -1)\n",
|
||||||
|
"pd.DataFrame(data=output, index=['']).T"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Explore data\n",
|
||||||
|
"\n",
|
||||||
|
"Utilize the data flow object created in the previous tutorial. Open and execute the data flow and review the results."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import azureml.dataprep as dprep\n",
|
||||||
|
"package_saved = dprep.Package.open(\".\\dflow\")\n",
|
||||||
|
"dflow_prepared = package_saved.dataflows[0]\n",
|
||||||
|
"dflow_prepared.get_profile()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"You prepare the data for the experiment by keeping the columns in `dflow_X` that will be the features for model creation. You define `dflow_y` to be the value to predict, `cost`.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dflow_X = dflow_prepared.keep_columns(['pickup_weekday', 'dropoff_latitude', 'dropoff_longitude','pickup_hour','pickup_longitude','pickup_latitude','passengers'])\n",
|
||||||
|
"dflow_y = dflow_prepared.keep_columns('cost')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Split data into train and test sets\n",
|
||||||
|
"\n",
|
||||||
|
"Now you split the data into training and test sets using the `train_test_split` function in the `sklearn` library. This function splits both the x (features) data set and the y (values to predict) data set into a training portion, used to train the model, and a test portion, held out for testing. The `test_size` parameter determines the fraction of the data to allocate to testing. The `random_state` parameter sets a seed for the random generator, so that your train-test splits are deterministic."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"x_df = dflow_X.to_pandas_dataframe()\n",
|
||||||
|
"y_df = dflow_y.to_pandas_dataframe()\n",
|
||||||
|
"\n",
|
||||||
|
"x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=123)\n",
|
||||||
|
"# flatten y_train to 1d array\n",
|
||||||
|
"y_train.values.flatten()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"You now have the necessary packages and data ready for automatically training your model. \n",
|
||||||
|
"\n",
|
||||||
|
"## Automatically train a model\n",
|
||||||
|
"\n",
|
||||||
|
"To automatically train a model:\n",
|
||||||
|
"1. Define settings for the experiment run\n",
|
||||||
|
"1. Submit the experiment for model tuning\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"### Define settings for autogeneration and tuning\n",
|
||||||
|
"\n",
|
||||||
|
"Define the experiment parameters and models settings for autogeneration and tuning. View the full list of [settings](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train).\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"|Property| Value in this tutorial |Description|\n",
|
||||||
|
"|----|----|---|\n",
|
||||||
|
"|**iteration_timeout_minutes**|10|Time limit in minutes for each iteration|\n",
|
||||||
|
"|**iterations**|30|Number of iterations. In each iteration, the model trains with the data with a specific pipeline|\n",
|
||||||
|
"|**primary_metric**|spearman_correlation | Metric that you want to optimize.|\n",
|
||||||
|
"|**preprocess**| True | True enables experiment to perform preprocessing on the input.|\n",
|
||||||
|
"|**verbosity**| logging.INFO | Controls the level of logging.|\n",
|
||||||
|
"|**n_cross_validations**|5|Number of cross validation splits|\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"automl_settings = {\n",
|
||||||
|
" \"iteration_timeout_minutes\" : 10,\n",
|
||||||
|
" \"iterations\" : 30,\n",
|
||||||
|
" \"primary_metric\" : 'spearman_correlation',\n",
|
||||||
|
" \"preprocess\" : True,\n",
|
||||||
|
" \"verbosity\" : logging.INFO,\n",
|
||||||
|
" \"n_cross_validations\": 5\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"tags": [
|
||||||
|
"configure automl"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.train.automl import AutoMLConfig\n",
|
||||||
|
"\n",
|
||||||
|
"# local compute \n",
|
||||||
|
"automated_ml_config = AutoMLConfig(task = 'regression',\n",
|
||||||
|
" debug_log = 'automated_ml_errors.log',\n",
|
||||||
|
" path = project_folder,\n",
|
||||||
|
" X = x_train.values,\n",
|
||||||
|
" y = y_train.values.flatten(),\n",
|
||||||
|
" **automl_settings)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Train the automatic regression model\n",
|
||||||
|
"\n",
|
||||||
|
"Start the experiment to run locally. Pass the defined `automated_ml_config` object to the experiment, and set `show_output` to `True` to view progress during the experiment."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"tags": [
|
||||||
|
"local submitted run",
|
||||||
|
"automl"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.core.experiment import Experiment\n",
|
||||||
|
"experiment=Experiment(ws, experiment_name)\n",
|
||||||
|
"local_run = experiment.submit(automated_ml_config, show_output=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Explore the results\n",
|
||||||
|
"\n",
|
||||||
|
"Explore the results of automatic training with a Jupyter widget or by examining the experiment history.\n",
|
||||||
|
"\n",
|
||||||
|
"### Option 1: Add a Jupyter widget to see results\n",
|
||||||
|
"\n",
|
||||||
|
"Use the Jupyter notebook widget to see a graph and a table of all results."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"tags": [
|
||||||
|
"use notebook widget"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.widgets import RunDetails\n",
|
||||||
|
"RunDetails(local_run).show()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Option 2: Get and examine all run iterations in Python\n",
|
||||||
|
"\n",
|
||||||
|
"Alternatively, you can retrieve the history of each experiment and explore the individual metrics for each iteration run."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"tags": [
|
||||||
|
"get metrics",
|
||||||
|
"query history"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"children = list(local_run.get_children())\n",
|
||||||
|
"metricslist = {}\n",
|
||||||
|
"for run in children:\n",
|
||||||
|
" properties = run.get_properties()\n",
|
||||||
|
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
|
||||||
|
" metricslist[int(properties['iteration'])] = metrics\n",
|
||||||
|
"\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
|
||||||
|
"rundata"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Retrieve the best model\n",
|
||||||
|
"\n",
|
||||||
|
"Select the best pipeline from our iterations. The `get_output` method on `local_run` returns the best run and the fitted model for the last fit invocation. There are overloads on `get_output` that allow you to retrieve the best run and fitted model for any logged metric or a particular iteration."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"best_run, fitted_model = local_run.get_output()\n",
|
||||||
|
"print(best_run)\n",
|
||||||
|
"print(fitted_model)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Register the model\n",
|
||||||
|
"\n",
|
||||||
|
"Register the model in your Azure Machine Learning Workspace."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"description = 'Automated Machine Learning Model'\n",
|
||||||
|
"tags = None\n",
|
||||||
|
"local_run.register_model(description=description, tags=tags)\n",
|
||||||
|
"local_run.model_id # Use this id to deploy the model as a web service in Azure"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test the best model accuracy\n",
|
||||||
|
"\n",
|
||||||
|
"Use the best model to run predictions on the test data set. The function `predict` uses the best model, and predicts the values of y (trip cost) from the `x_test` data set. Print the first 10 predicted cost values from `y_predict`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y_predict = fitted_model.predict(x_test.values) \n",
|
||||||
|
"print(y_predict[:10])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Compare the predicted cost values with the actual cost values. Use the `y_test` dataframe, and convert it to a list to compare to the predicted values. The function `mean_absolute_error` takes two arrays of values, and calculates the average absolute value error between them. In this example, a mean absolute error of 3.5 would mean that on average, the model predicts the cost within plus or minus 3.5 of the actual value."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.metrics import mean_absolute_error\n",
|
||||||
|
"\n",
|
||||||
|
"y_actual = y_test.values.flatten().tolist()\n",
|
||||||
|
"mean_absolute_error(y_actual, y_predict)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Run the following code to calculate MAPE (mean absolute percent error) using the full `y_actual` and `y_predict` data sets. This metric calculates an absolute difference between each predicted and actual value, sums all the differences, and then expresses that sum as a percent of the total of the actual values."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sum_actuals = sum_errors = 0\n",
|
||||||
|
"\n",
|
||||||
|
"for actual_val, predict_val in zip(y_actual, y_predict):\n",
|
||||||
|
" abs_error = actual_val - predict_val\n",
|
||||||
|
" if abs_error < 0:\n",
|
||||||
|
" abs_error = abs_error * -1\n",
|
||||||
|
" \n",
|
||||||
|
" sum_errors = sum_errors + abs_error\n",
|
||||||
|
" sum_actuals = sum_actuals + actual_val\n",
|
||||||
|
" \n",
|
||||||
|
"mean_abs_percent_error = sum_errors / sum_actuals\n",
|
||||||
|
"print(\"Model MAPE:\")\n",
|
||||||
|
"print(mean_abs_percent_error)\n",
|
||||||
|
"print()\n",
|
||||||
|
"print(\"Model Accuracy:\")\n",
|
||||||
|
"print(1 - mean_abs_percent_error)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Next steps\n",
|
||||||
|
"\n",
|
||||||
|
"In this automated machine learning tutorial, you:\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"> * Configured a workspace and prepared data for an experiment\n",
|
||||||
|
"> * Trained using an automated regression model locally with custom parameters\n",
|
||||||
|
"> * Explored and reviewed training results\n",
|
||||||
|
"> * Registered the best model\n",
|
||||||
|
"\n",
|
||||||
|
"[Deploy your model](02.deploy-models.ipynb) with Azure Machine Learning."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "jeffshep"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
},
|
||||||
|
"msauthor": "sgilley"
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user