mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-20 01:27:06 -05:00
Compare commits
49 Commits
azureml-sd
...
jeffshep/p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fcc882174b | ||
|
|
6872d8a3bb | ||
|
|
a2cb4c3589 | ||
|
|
15008962b2 | ||
|
|
9414b51fac | ||
|
|
80ac414582 | ||
|
|
cbc151660b | ||
|
|
0024abc6e3 | ||
|
|
fa13385860 | ||
|
|
0c5f6daf52 | ||
|
|
c11e9fc1da | ||
|
|
280150713e | ||
|
|
bb11c80b1b | ||
|
|
d0961b98bf | ||
|
|
302589b7f9 | ||
|
|
cc85949d6d | ||
|
|
3a1824e3ad | ||
|
|
579643326d | ||
|
|
14f76f227e | ||
|
|
25baf5203a | ||
|
|
1178fcb0ba | ||
|
|
e4d84c8e45 | ||
|
|
7a3ab1e44c | ||
|
|
598a293dfa | ||
|
|
40b3068462 | ||
|
|
0ecbbbce75 | ||
|
|
9b1e130d18 | ||
|
|
0e17b33d2a | ||
|
|
34d80abd26 | ||
|
|
249278ab77 | ||
|
|
25fdb17f80 | ||
|
|
3a02a27f1e | ||
|
|
4eed9d529f | ||
|
|
f344d410a2 | ||
|
|
9dc1228063 | ||
|
|
4404e62f58 | ||
|
|
38d5743bbb | ||
|
|
0814eee151 | ||
|
|
f45b815221 | ||
|
|
bd629ae454 | ||
|
|
41de75a584 | ||
|
|
96a426dc36 | ||
|
|
824dd40f7e | ||
|
|
fa2e649fe8 | ||
|
|
e25e8e3a41 | ||
|
|
aa3670a902 | ||
|
|
ef1f9205ac | ||
|
|
3228bbfc63 | ||
|
|
f18a0dfc4d |
@@ -1,6 +1,8 @@
|
||||
# Azure Machine Learning Python SDK notebooks
|
||||
|
||||
> a community-driven repository of examples using mlflow for tracking can be found at https://github.com/Azure/azureml-examples
|
||||
|
||||
** **With the introduction of AzureML SDK v2, this samples repository for the v1 SDK is now deprecated and will not be monitored or updated. Users are encouraged to visit the [v2 SDK samples repository](https://github.com/Azure/azureml-examples) instead for up-to-date and enhanced examples of how to build, train, and deploy machine learning models with AzureML's newest features.** **
|
||||
|
||||
|
||||
Welcome to the Azure Machine Learning Python SDK notebooks repository!
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.45.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -329,7 +329,7 @@
|
||||
" print(\"Creating new gpu-cluster\")\n",
|
||||
" \n",
|
||||
" # Specify the configuration for the new cluster\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size=\"Standard_NC6s_v3\",\n",
|
||||
" min_nodes=0,\n",
|
||||
" max_nodes=4)\n",
|
||||
" # Create the cluster with the specified name and configuration\n",
|
||||
@@ -367,9 +367,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -174,7 +174,7 @@
|
||||
"else:\n",
|
||||
" print(\"creating new cluster\")\n",
|
||||
" # vm_size parameter below could be modified to one of the RAPIDS-supported VM types\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"Standard_NC6s_v2\", min_nodes=1, max_nodes = 1)\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"Standard_NC6s_v3\", min_nodes=1, max_nodes = 1)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, provisioning_config)\n",
|
||||
@@ -398,7 +398,7 @@
|
||||
"# run_config.target = gpu_cluster_name\n",
|
||||
"# run_config.environment.docker.enabled = True\n",
|
||||
"# run_config.environment.docker.gpu_support = True\n",
|
||||
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n",
|
||||
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu20.04\"\n",
|
||||
"# # run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub\n",
|
||||
"# # run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images\n",
|
||||
"# # run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images\n",
|
||||
@@ -525,9 +525,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -599,9 +599,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -3,10 +3,11 @@ dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-contrib-fairness
|
||||
- fairlearn>=0.6.2
|
||||
- fairlearn>=0.6.2,<=0.7.0
|
||||
- joblib
|
||||
- liac-arff
|
||||
- raiwidgets~=0.21.0
|
||||
- raiwidgets~=0.28.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- protobuf==3.20.0
|
||||
- numpy<1.24.0
|
||||
|
||||
@@ -523,9 +523,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -3,10 +3,11 @@ dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-contrib-fairness
|
||||
- fairlearn>=0.6.2
|
||||
- fairlearn>=0.6.2,<=0.7.0
|
||||
- joblib
|
||||
- liac-arff
|
||||
- raiwidgets~=0.21.0
|
||||
- raiwidgets~=0.28.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- protobuf==3.20.0
|
||||
- numpy<1.24.0
|
||||
|
||||
@@ -5,32 +5,23 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip==20.2.4
|
||||
- python>=3.6,<3.9
|
||||
- matplotlib==3.2.1
|
||||
- py-xgboost==1.3.3
|
||||
- pytorch::pytorch=1.4.0
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- cudatoolkit=10.1.243
|
||||
- scipy==1.5.3
|
||||
- notebook
|
||||
- pywin32==227
|
||||
- PySocks==1.7.1
|
||||
- conda-forge::pyqt==5.12.3
|
||||
- jsonschema==4.15.0
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
- tqdm==4.64.0
|
||||
# Azure ML only supports 3.7.0 and later.
|
||||
- pip==22.3.1
|
||||
- python>=3.8,<3.9
|
||||
- holidays==0.10.3
|
||||
- pandas==1.3.5
|
||||
- scipy==1.10.1
|
||||
- Cython==0.29.14
|
||||
- tqdm==4.66.1
|
||||
- scikit-learn<1.1
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.45.0
|
||||
- azureml-defaults~=1.45.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.45.0/validated_win32_requirements.txt [--no-deps]
|
||||
- arch==4.14
|
||||
- wasabi==0.9.1
|
||||
- azureml-widgets~=1.53.0
|
||||
- azureml-defaults~=1.53.0
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.53.2/validated_win32_requirements.txt [--no-deps]
|
||||
- matplotlib==3.7.1
|
||||
- xgboost==1.3.3
|
||||
- prophet==1.1.4
|
||||
- cmdstanpy==1.1.0
|
||||
- setuptools-git==1.2
|
||||
|
||||
@@ -5,32 +5,29 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip==20.2.4
|
||||
- python>=3.6,<3.9
|
||||
- boto3==1.20.19
|
||||
- botocore<=1.23.19
|
||||
- matplotlib==3.2.1
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
# Azure ML only supports 3.7 and later.
|
||||
- pip==22.3.1
|
||||
- python>=3.8,<3.9
|
||||
- matplotlib==3.7.1
|
||||
- numpy==1.22.3
|
||||
- cython==0.29.14
|
||||
- urllib3==1.26.7
|
||||
- scipy>=1.4.1,<=1.5.3
|
||||
- scikit-learn==0.22.1
|
||||
- scipy==1.10.1
|
||||
- scikit-learn==1.1.3
|
||||
- py-xgboost<=1.3.3
|
||||
- holidays==0.10.3
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- pytorch::pytorch=1.4.0
|
||||
- conda-forge::prophet==1.1.4
|
||||
- pytorch::pytorch=1.11.0
|
||||
- cudatoolkit=10.1.243
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
- notebook
|
||||
- scikit-learn<1.1
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.45.0
|
||||
- azureml-defaults~=1.45.0
|
||||
- azureml-widgets~=1.53.0
|
||||
- azureml-defaults~=1.53.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.45.0/validated_linux_requirements.txt [--no-deps]
|
||||
- arch==4.14
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.53.2/validated_linux_requirements.txt [--no-deps]
|
||||
|
||||
@@ -5,33 +5,25 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip==20.2.4
|
||||
- nomkl
|
||||
- python>=3.6,<3.9
|
||||
- boto3==1.20.19
|
||||
- botocore<=1.23.19
|
||||
- matplotlib==3.2.1
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
# Currently Azure ML only supports 3.7 and later.
|
||||
- pip==22.3.1
|
||||
- python>=3.8,<3.9
|
||||
- numpy==1.22.3
|
||||
- cython==0.29.14
|
||||
- urllib3==1.26.7
|
||||
- scipy>=1.4.1,<=1.5.3
|
||||
- scikit-learn==0.22.1
|
||||
- py-xgboost<=1.3.3
|
||||
- scipy==1.10.1
|
||||
- scikit-learn==1.1.3
|
||||
- holidays==0.10.3
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- pytorch::pytorch=1.4.0
|
||||
- cudatoolkit=9.0
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
- scikit-learn<1.1
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.45.0
|
||||
- azureml-defaults~=1.45.0
|
||||
- azureml-widgets~=1.53.0
|
||||
- azureml-defaults~=1.53.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- prophet==1.1.4
|
||||
- xgboost==1.3.3
|
||||
- matplotlib==3.7.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.45.0/validated_darwin_requirements.txt [--no-deps]
|
||||
- arch==4.14
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.53.2/validated_darwin_requirements.txt [--no-deps]
|
||||
|
||||
@@ -33,6 +33,8 @@ if not errorlevel 1 (
|
||||
call conda env create -f %automl_env_file% -n %conda_env_name%
|
||||
)
|
||||
|
||||
python "%conda_prefix%\scripts\pywin32_postinstall.py" -install
|
||||
|
||||
call conda activate %conda_env_name% 2>nul:
|
||||
if errorlevel 1 goto ErrorExit
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from distutils.version import LooseVersion
|
||||
from setuptools._vendor.packaging import version
|
||||
import platform
|
||||
|
||||
try:
|
||||
@@ -17,7 +17,7 @@ if architecture != "64bit":
|
||||
|
||||
minimumVersion = "4.7.8"
|
||||
|
||||
versionInvalid = (LooseVersion(conda.__version__) < LooseVersion(minimumVersion))
|
||||
versionInvalid = (version.parse(conda.__version__) < version.parse(minimumVersion))
|
||||
|
||||
if versionInvalid:
|
||||
print('Setup requires conda version ' + minimumVersion + ' or higher.')
|
||||
|
||||
@@ -1,5 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -712,7 +728,9 @@
|
||||
"from azureml.core.model import Model\n",
|
||||
"from azureml.core.environment import Environment\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script=script_file_name)\n",
|
||||
"inference_config = InferenceConfig(\n",
|
||||
" environment=best_run.get_environment(), entry_script=script_file_name\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(\n",
|
||||
" cpu_cores=2,\n",
|
||||
@@ -828,9 +846,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib notebook\n",
|
||||
@@ -1060,9 +1076,9 @@
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,5 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -456,9 +472,9 @@
|
||||
"friendly_name": "Classification of credit card fraudulent transactions using Automated ML",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,5 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -139,8 +155,8 @@
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_NC6\", # CPU for BiLSTM, such as \"STANDARD_D2_V2\"\n",
|
||||
" # To use BERT (this is recommended for best performance), select a GPU such as \"STANDARD_NC6\"\n",
|
||||
" vm_size=\"Standard_NC6s_v3\", # CPU for BiLSTM, such as \"STANDARD_D2_V2\"\n",
|
||||
" # To use BERT (this is recommended for best performance), select a GPU such as \"Standard_NC6s_v3\"\n",
|
||||
" # or similar GPU option\n",
|
||||
" # available in your workspace\n",
|
||||
" idle_seconds_before_scaledown=60,\n",
|
||||
@@ -336,7 +352,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For local inferencing, you can load the model locally via. the method `remote_run.get_output()`. For more information on the arguments expected by this method, you can run `remote_run.get_output??`.\n",
|
||||
"Note that when the model contains BERT, this step will require pytorch and pytorch-transformers installed in your local environment. The exact versions of these packages can be found in the **automl_env.yml** file located in the local copy of your azureml-examples folder here: \"azureml-examples/python-sdk/tutorials/automl-with-azureml\""
|
||||
"Note that when the model contains BERT, this step will require pytorch and pytorch-transformers installed in your local environment. The exact versions of these packages can be found in the **automl_env.yml** file located in the local copy of your MachineLearningNotebooks folder here: \"MachineLearningNotebooks\\how-to-use-azureml\\automated-machine-learning\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -567,9 +583,9 @@
|
||||
"friendly_name": "DNN Text Featurization",
|
||||
"index_order": 2,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
import pandas as pd
|
||||
from azureml.core import Environment, ScriptRunConfig
|
||||
from azureml.core.run import Run
|
||||
@@ -13,7 +14,16 @@ def run_inference(
|
||||
model_name,
|
||||
):
|
||||
|
||||
inference_env = train_run.get_environment()
|
||||
try:
|
||||
inference_env = train_run.get_environment()
|
||||
except BaseException:
|
||||
run_details = train_run.get_details()
|
||||
run_def = run_details.get("runDefinition")
|
||||
env = run_def.get("environment")
|
||||
if env is None:
|
||||
raise
|
||||
json.dump(env, open("azureml_environment.json", "w"))
|
||||
inference_env = Environment.load_from_directory(".")
|
||||
|
||||
est = ScriptRunConfig(
|
||||
source_directory=script_folder,
|
||||
|
||||
@@ -3,7 +3,7 @@ import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
|
||||
from azureml.automl.runtime.shared.score import scoring, constants
|
||||
from azureml.core import Run, Dataset
|
||||
|
||||
@@ -1,5 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -564,9 +580,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -31,7 +31,7 @@ try:
|
||||
model = Model(ws, args.model_name)
|
||||
last_train_time = model.created_time
|
||||
print("Model was last trained on {0}.".format(last_train_time))
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
print("Could not get last model train time.")
|
||||
last_train_time = datetime.min.replace(tzinfo=pytz.UTC)
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.45.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -324,9 +324,9 @@
|
||||
"hash": "adb464b67752e4577e3dc163235ced27038d19b7d88def00d75d1975bde5d9ab"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.45.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -454,10 +454,13 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Note:** Not all datasets produce a y_transformer. The dataset used in the current notebook requires a transformer as the y column data is categorical."
|
||||
"**Note:** Not all datasets produce a y_transformer. The dataset used in the current notebook requires a transformer as the y column data is categorical. \n",
|
||||
"\n",
|
||||
"We will go ahead and download the mlflow transformer model and use it to transform test data that can be used for further experimentation below. To run the commented code, make sure the environment requirement is satisfied. You can go ahead and create the environment from the `conda.yaml` file under `/outputs/featurization/pipeline/` and run the given code in it."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -466,7 +469,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import Transformers\n",
|
||||
"''' from azureml.automl.core.shared.constants import Transformers\n",
|
||||
"\n",
|
||||
"transformers = mlflow.sklearn.load_model(uri) # Using method 1\n",
|
||||
"data_transformers = transformers.get_transformers()\n",
|
||||
@@ -474,14 +477,15 @@
|
||||
"y_transformer = data_transformers[Transformers.Y_TRANSFORMER]\n",
|
||||
"\n",
|
||||
"X_test = x_transformer.transform(X_test_data)\n",
|
||||
"y_test = y_transformer.transform(y_test_data)"
|
||||
"y_test = y_transformer.transform(y_test_data) '''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Run the following cell to see the featurization summary of X and y transformers. "
|
||||
"Run the following cell to see the featurization summary of X and y transformers. Uncomment to use. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -490,10 +494,10 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_data_summary = x_transformer.get_featurization_summary(is_user_friendly=False)\n",
|
||||
"''' X_data_summary = x_transformer.get_featurization_summary(is_user_friendly=False)\n",
|
||||
"\n",
|
||||
"summary_df = pd.DataFrame.from_records(X_data_summary)\n",
|
||||
"summary_df"
|
||||
"summary_df '''"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -544,10 +548,11 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Another way to load the data is to go to the above autofeaturization experiment and check for the featurized dataset ids under `Output datasets`. Uncomment and replace them accordingly below to use."
|
||||
"Another way to load the data is to go to the above autofeaturization experiment and check for the featurized dataset ids under `Output datasets`. Uncomment and replace them accordingly below, to use."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -597,10 +602,20 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Here we are passing our training data to the lightgbm classifier, any custom model can be used with your data."
|
||||
"Here we are passing our training data to the lightgbm classifier, any custom model can be used with your data. Let us first install lightgbm."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install lightgbm"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -612,11 +627,27 @@
|
||||
"import lightgbm as lgb\n",
|
||||
"\n",
|
||||
"model = lgb.LGBMClassifier(learning_rate=0.08,max_depth=-5,random_state=42)\n",
|
||||
"model.fit(X_train, y_train, sample_weight=sample_weight, eval_set=[(X_test, y_test),(X_train, y_train)],\n",
|
||||
" verbose=20,eval_metric='logloss')\n",
|
||||
"\n",
|
||||
"model.fit(X_train, y_train, sample_weight=sample_weight)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once training is done, the test data obtained after transforming from the above downloaded transformer can be used to calculate the accuracy "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('Training accuracy {:.4f}'.format(model.score(X_train, y_train)))\n",
|
||||
"print('Testing accuracy {:.4f}'.format(model.score(X_test, y_test)))"
|
||||
"\n",
|
||||
"# Uncomment below to test the model on test data \n",
|
||||
"# print('Testing accuracy {:.4f}'.format(model.score(X_test, y_test)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -654,45 +685,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Calculate metrics for the prediction\n",
|
||||
"\n",
|
||||
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
|
||||
"from the trained model that was returned."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import confusion_matrix\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"import itertools\n",
|
||||
"\n",
|
||||
"cf =confusion_matrix(y_test,y_pred)\n",
|
||||
"plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')\n",
|
||||
"plt.colorbar()\n",
|
||||
"plt.title('Confusion Matrix')\n",
|
||||
"plt.xlabel('Predicted')\n",
|
||||
"plt.ylabel('Actual')\n",
|
||||
"class_labels = ['False','True']\n",
|
||||
"tick_marks = np.arange(len(class_labels))\n",
|
||||
"plt.xticks(tick_marks,class_labels)\n",
|
||||
"plt.yticks([-0.5,0,1,1.5],['','False','True',''])\n",
|
||||
"# plotting text value inside cells\n",
|
||||
"thresh = cf.max() / 2.\n",
|
||||
"for i,j in itertools.product(range(cf.shape[0]),range(cf.shape[1])):\n",
|
||||
" plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')\n",
|
||||
"plt.show()"
|
||||
"# Uncomment below to test the model on test data\n",
|
||||
"# y_pred = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -713,9 +707,9 @@
|
||||
"hash": "adb464b67752e4577e3dc163235ced27038d19b7d88def00d75d1975bde5d9ab"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,20 +1,12 @@
|
||||
name: azure_automl_experimental
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip<=20.2.4
|
||||
- python>=3.6.0,<3.10
|
||||
- cython==0.29.14
|
||||
- urllib3==1.26.7
|
||||
- PyJWT < 2.0.0
|
||||
- numpy==1.21.6
|
||||
- pywin32==227
|
||||
- cryptography<37.0.0
|
||||
# Currently Azure ML only supports 3.7.0 and later.
|
||||
- pip<=22.3.1
|
||||
- python>=3.7.0,<3.11
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azure-core==1.24.1
|
||||
- azure-identity==1.7.0
|
||||
- azureml-defaults
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
|
||||
@@ -4,14 +4,13 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
# Currently Azure ML only supports 3.7.0 and later.
|
||||
- pip<=20.2.4
|
||||
- nomkl
|
||||
- python>=3.6.0,<3.10
|
||||
- python>=3.7.0,<3.11
|
||||
- urllib3==1.26.7
|
||||
- PyJWT < 2.0.0
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
- cryptography<37.0.0
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
|
||||
@@ -92,7 +92,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.45.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -354,7 +354,7 @@
|
||||
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u0192\u00c2\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
|
||||
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u0192\u00c2\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net and the page of the DefeatFraud project\n",
|
||||
"Please cite the following works: \n",
|
||||
"\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a2\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
|
||||
"\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a2\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
|
||||
@@ -389,9 +389,9 @@
|
||||
"friendly_name": "Classification of credit card fraudulent transactions using Automated ML",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -91,7 +91,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.45.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.53.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -448,9 +448,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -122,7 +122,10 @@ def calculate_scores_and_build_plots(
|
||||
input_dir: str, output_dir: str, automl_settings: Dict[str, Any]
|
||||
):
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
grains = automl_settings.get(constants.TimeSeries.TIME_SERIES_ID_COLUMN_NAMES)
|
||||
grains = automl_settings.get(
|
||||
constants.TimeSeries.TIME_SERIES_ID_COLUMN_NAMES,
|
||||
automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES, None),
|
||||
)
|
||||
time_column_name = automl_settings.get(constants.TimeSeries.TIME_COLUMN_NAME)
|
||||
if grains is None:
|
||||
grains = []
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -33,6 +33,7 @@
|
||||
"For this notebook we are using a synthetic dataset to demonstrate the back testing in many model scenario. This allows us to check historical performance of AutoML on a historical data. To do that we step back on the backtesting period by the data set several times and split the data to train and test sets. Then these data sets are used for training and evaluation of model.<br>\n",
|
||||
"\n",
|
||||
"Thus, it is a quick way of evaluating AutoML as if it was in production. Here, we do not test historical performance of a particular model, for this see the [notebook](../forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb). Instead, the best model for every backtest iteration can be different since AutoML chooses the best model for a given training set.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**NOTE: There are limits on how many runs we can do in parallel per workspace, and we currently recommend to set the parallelism to maximum of 320 runs per experiment per workspace. If users want to have more parallelism and increase this limit they might encounter Too Many Requests errors (HTTP 429).**"
|
||||
@@ -43,7 +44,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisites\n",
|
||||
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
|
||||
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -313,22 +314,37 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition. Please note, that in this case we are setting grain_column_names to be the time series ID column plus iteration, because we want to train a separate model for each time series and iteration.\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name(s) definition.\n",
|
||||
"\n",
|
||||
"#### ``ForecastingParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **task** | forecasting |\n",
|
||||
"| **primary_metric** | This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>normalized_root_mean_squared_error</i><br><i>normalized_mean_absolute_error</i> |\n",
|
||||
"| **primary_metric** | This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i> |\n",
|
||||
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
|
||||
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### ``ManyModelsTrainParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
|
||||
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
|
||||
]
|
||||
},
|
||||
@@ -345,22 +361,30 @@
|
||||
"from azureml.train.automl.runtime._many_models.many_models_parameters import (\n",
|
||||
" ManyModelsTrainParameters,\n",
|
||||
")\n",
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
|
||||
"\n",
|
||||
"partition_column_names = [TIME_SERIES_ID_COLNAME, \"backtest_iteration\"]\n",
|
||||
"automl_settings = {\n",
|
||||
" \"task\": \"forecasting\",\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"iteration_timeout_minutes\": 10, # This needs to be changed based on the dataset. We ask customer to explore how long training is taking before settings this value\n",
|
||||
" \"iterations\": 15,\n",
|
||||
" \"experiment_timeout_hours\": 0.25, # This also needs to be changed based on the dataset. For larger data set this number needs to be bigger.\n",
|
||||
" \"label_column_name\": TARGET_COLNAME,\n",
|
||||
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" \"cv_step_size\": \"auto\",\n",
|
||||
" \"time_column_name\": TIME_COLNAME,\n",
|
||||
" \"forecast_horizon\": 6,\n",
|
||||
" \"time_series_id_column_names\": partition_column_names,\n",
|
||||
" \"track_child_runs\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=TIME_COLNAME,\n",
|
||||
" forecast_horizon=6,\n",
|
||||
" time_series_id_column_names=partition_column_names,\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_settings = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" iteration_timeout_minutes=10,\n",
|
||||
" iterations=15,\n",
|
||||
" experiment_timeout_hours=0.25,\n",
|
||||
" label_column_name=TARGET_COLNAME,\n",
|
||||
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" track_child_runs=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"mm_paramters = ManyModelsTrainParameters(\n",
|
||||
" automl_settings=automl_settings, partition_column_names=partition_column_names\n",
|
||||
@@ -387,8 +411,16 @@
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
|
||||
"\n",
|
||||
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
|
||||
"where,\n",
|
||||
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
|
||||
"- $ p $ is ``process_count_per_node``\n",
|
||||
"- $ n $ is ``node_count``\n",
|
||||
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -406,7 +438,7 @@
|
||||
" compute_target=compute_target,\n",
|
||||
" node_count=2,\n",
|
||||
" process_count_per_node=2,\n",
|
||||
" run_invocation_timeout=920,\n",
|
||||
" run_invocation_timeout=1200,\n",
|
||||
" train_pipeline_parameters=mm_paramters,\n",
|
||||
")"
|
||||
]
|
||||
@@ -491,25 +523,31 @@
|
||||
"source": [
|
||||
"For many models we need to provide the ManyModelsInferenceParameters object.\n",
|
||||
"\n",
|
||||
"#### ManyModelsInferenceParameters arguments\n",
|
||||
"#### ``ManyModelsInferenceParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **target_column_name** | \\[Optional\\] Column name only if the inference dataset has the target. |\n",
|
||||
"| **time_column_name** | Column name only if it is timeseries. |\n",
|
||||
"| **many_models_run_id** | \\[Optional\\] Many models pipeline run id where models were trained. |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **target_column_name** | \\[Optional] Column name only if the inference dataset has the target. |\n",
|
||||
"| **time_column_name** | \\[Optional] Time column name only if it is timeseries. |\n",
|
||||
"| **inference_type** | \\[Optional] Which inference method to use on the model. Possible values are 'forecast', 'predict_proba', and 'predict'. |\n",
|
||||
"| **forecast_mode** | \\[Optional] The type of forecast to be used, either 'rolling' or 'recursive'; defaults to 'recursive'. |\n",
|
||||
"| **step** | \\[Optional] Number of periods to advance the forecasting window in each iteration **(for rolling forecast only)**; defaults to 1. |\n",
|
||||
"\n",
|
||||
"#### get_many_models_batch_inference_steps arguments\n",
|
||||
"#### ``get_many_models_batch_inference_steps`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline.|\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** | The number of processes per node.\n",
|
||||
"| **train_run_id** | \\[Optional\\] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional\\] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **process_count_per_node** | \\[Optional\\] The number of processes per node, by default it's 4. |"
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
|
||||
"| **inference_pipeline_parameters** | \\[Optional] The ``ManyModelsInferenceParameters`` object defined above. |\n",
|
||||
"| **append_row_file_name** | \\[Optional] The name of the output file (optional, default value is 'parallel_run_step.txt'). Supports 'txt' and 'csv' file extension. A 'txt' file extension generates the output in 'txt' format with space as separator without column names. A 'csv' file extension generates the output in 'csv' format with comma as separator and with column names. |\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **output_datastore** | \\[Optional] The ``Datastore`` or ``OutputDatasetConfig`` to be used for output. If specified any pipeline output will be written to that location. If unspecified the default datastore will be used. |\n",
|
||||
"| **arguments** | \\[Optional] Arguments to be passed to inference script. Possible argument is '--forecast_quantiles' followed by quantile values. |"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -529,6 +567,8 @@
|
||||
" target_column_name=TARGET_COLNAME,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"output_file_name = \"parallel_run_step.csv\"\n",
|
||||
"\n",
|
||||
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
|
||||
" experiment=experiment,\n",
|
||||
" inference_data=test_data,\n",
|
||||
@@ -540,6 +580,7 @@
|
||||
" train_run_id=training_run.id,\n",
|
||||
" train_experiment_name=training_run.experiment.name,\n",
|
||||
" inference_pipeline_parameters=mm_parameters,\n",
|
||||
" append_row_file_name=output_file_name,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -587,18 +628,21 @@
|
||||
"source": [
|
||||
"from azureml.contrib.automl.pipeline.steps.utilities import get_output_from_mm_pipeline\n",
|
||||
"\n",
|
||||
"PREDICTION_COLNAME = \"Predictions\"\n",
|
||||
"forecasting_results_name = \"forecasting_results\"\n",
|
||||
"forecasting_output_name = \"many_models_inference_output\"\n",
|
||||
"forecast_file = get_output_from_mm_pipeline(\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
|
||||
")\n",
|
||||
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None, parse_dates=[0])\n",
|
||||
"df.columns = list(X_train.columns) + [\"predicted_level\"]\n",
|
||||
"df = pd.read_csv(forecast_file, parse_dates=[0])\n",
|
||||
"print(\n",
|
||||
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
|
||||
")\n",
|
||||
"# Save the scv file with header to read it in the next step.\n",
|
||||
"df.rename(columns={TARGET_COLNAME: \"actual_level\"}, inplace=True)\n",
|
||||
"# Save the csv file to read it in the next step.\n",
|
||||
"df.rename(\n",
|
||||
" columns={TARGET_COLNAME: \"actual_level\", PREDICTION_COLNAME: \"predicted_level\"},\n",
|
||||
" inplace=True,\n",
|
||||
")\n",
|
||||
"df.to_csv(os.path.join(forecasting_results_name, \"forecast.csv\"), index=False)\n",
|
||||
"df.head(10)"
|
||||
]
|
||||
@@ -622,7 +666,9 @@
|
||||
"backtesting_results = \"backtesting_mm_results\"\n",
|
||||
"os.makedirs(backtesting_results, exist_ok=True)\n",
|
||||
"calculate_scores_and_build_plots(\n",
|
||||
" forecasting_results_name, backtesting_results, automl_settings\n",
|
||||
" forecasting_results_name,\n",
|
||||
" backtesting_results,\n",
|
||||
" automl_settings.as_serializable_dict(),\n",
|
||||
")\n",
|
||||
"pd.DataFrame({\"File\": os.listdir(backtesting_results)})"
|
||||
]
|
||||
@@ -706,9 +752,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -43,11 +43,20 @@ def init():
|
||||
global output_dir
|
||||
global automl_settings
|
||||
global model_uid
|
||||
global forecast_quantiles
|
||||
|
||||
logger.info("Initialization of the run.")
|
||||
parser = argparse.ArgumentParser("Parsing input arguments.")
|
||||
parser.add_argument("--output-dir", dest="out", required=True)
|
||||
parser.add_argument("--model-name", dest="model", default=None)
|
||||
parser.add_argument("--model-uid", dest="model_uid", default=None)
|
||||
parser.add_argument(
|
||||
"--forecast_quantiles",
|
||||
nargs="*",
|
||||
type=float,
|
||||
help="forecast quantiles list",
|
||||
default=None,
|
||||
)
|
||||
|
||||
parsed_args, _ = parser.parse_known_args()
|
||||
model_name = parsed_args.model
|
||||
@@ -55,6 +64,7 @@ def init():
|
||||
target_column_name = automl_settings.get("label_column_name")
|
||||
output_dir = parsed_args.out
|
||||
model_uid = parsed_args.model_uid
|
||||
forecast_quantiles = parsed_args.forecast_quantiles
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
os.environ["AUTOML_IGNORE_PACKAGE_VERSION_INCOMPATIBILITIES".lower()] = "True"
|
||||
|
||||
@@ -126,23 +136,18 @@ def run_backtest(data_input_name: str, file_name: str, experiment: Experiment):
|
||||
)
|
||||
print(f"The model {best_run.properties['model_name']} was registered.")
|
||||
|
||||
_, x_pred = fitted_model.forecast(X_test)
|
||||
x_pred.reset_index(inplace=True, drop=False)
|
||||
columns = [automl_settings[constants.TimeSeries.TIME_COLUMN_NAME]]
|
||||
if automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES):
|
||||
# We know that fitted_model.grain_column_names is a list.
|
||||
columns.extend(fitted_model.grain_column_names)
|
||||
columns.append(constants.TimeSeriesInternal.DUMMY_TARGET_COLUMN)
|
||||
# Remove featurized columns.
|
||||
x_pred = x_pred[columns]
|
||||
x_pred.rename(
|
||||
{constants.TimeSeriesInternal.DUMMY_TARGET_COLUMN: "predicted_level"},
|
||||
axis=1,
|
||||
inplace=True,
|
||||
)
|
||||
# By default we will have forecast quantiles of 0.5, which is our target
|
||||
if forecast_quantiles:
|
||||
if 0.5 not in forecast_quantiles:
|
||||
forecast_quantiles.append(0.5)
|
||||
fitted_model.quantiles = forecast_quantiles
|
||||
|
||||
x_pred = fitted_model.forecast_quantiles(X_test)
|
||||
x_pred["actual_level"] = y_test
|
||||
x_pred["backtest_iteration"] = f"iteration_{last_training_date}"
|
||||
x_pred.rename({0.5: "predicted_level"}, axis=1, inplace=True)
|
||||
date_safe = RE_INVALID_SYMBOLS.sub("_", last_training_date)
|
||||
|
||||
x_pred.to_csv(os.path.join(output_dir, f"iteration_{date_safe}.csv"), index=False)
|
||||
return x_pred
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License.\n",
|
||||
""
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -365,6 +365,7 @@
|
||||
" step_size=BACKTESTING_PERIOD,\n",
|
||||
" step_number=NUMBER_OF_BACKTESTS,\n",
|
||||
" model_uid=model_uid,\n",
|
||||
" forecast_quantiles=[0.025, 0.975], # Optional\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -590,6 +591,7 @@
|
||||
" step_size=BACKTESTING_PERIOD,\n",
|
||||
" step_number=NUMBER_OF_BACKTESTS,\n",
|
||||
" model_name=model_name,\n",
|
||||
" forecast_quantiles=[0.025, 0.975],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -700,9 +702,9 @@
|
||||
"Azure ML AutoML"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -31,6 +31,7 @@ def get_backtest_pipeline(
|
||||
step_number: int,
|
||||
model_name: Optional[str] = None,
|
||||
model_uid: Optional[str] = None,
|
||||
forecast_quantiles: Optional[list] = None,
|
||||
) -> Pipeline:
|
||||
"""
|
||||
:param experiment: The experiment used to run the pipeline.
|
||||
@@ -44,6 +45,7 @@ def get_backtest_pipeline(
|
||||
:param step_size: The number of periods to step back in backtesting.
|
||||
:param step_number: The number of backtesting iterations.
|
||||
:param model_uid: The uid to mark models from this run of the experiment.
|
||||
:param forecast_quantiles: The forecast quantiles that are required in the inference.
|
||||
:return: The pipeline to be used for model retraining.
|
||||
**Note:** The output will be uploaded in the pipeline output
|
||||
called 'score'.
|
||||
@@ -135,6 +137,9 @@ def get_backtest_pipeline(
|
||||
if model_uid is not None:
|
||||
prs_args.append("--model-uid")
|
||||
prs_args.append(model_uid)
|
||||
if forecast_quantiles:
|
||||
prs_args.append("--forecast_quantiles")
|
||||
prs_args.extend(forecast_quantiles)
|
||||
backtest_prs = ParallelRunStep(
|
||||
name=parallel_step_name,
|
||||
parallel_run_config=back_test_config,
|
||||
|
||||
@@ -16,6 +16,13 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-bike-share)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -42,7 +49,7 @@
|
||||
"\n",
|
||||
"AutoML highlights here include built-in holiday featurization, accessing engineered feature names, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"Notebook synopsis:\n",
|
||||
"1. Creating an Experiment in an existing Workspace\n",
|
||||
@@ -61,7 +68,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1680248038565
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
@@ -170,25 +181,6 @@
|
||||
"source": [
|
||||
"## Data\n",
|
||||
"\n",
|
||||
"The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with the storage account, which contains the default data store. We will use it to upload the bike share data and create [tabular dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./bike-no.csv\"], target_path=\"dataset/\", overwrite=True, show_progress=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's set up what we know about the dataset. \n",
|
||||
"\n",
|
||||
"**Target column** is what we want to forecast.\n",
|
||||
@@ -206,25 +198,50 @@
|
||||
"time_column_name = \"date\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"You are now ready to load the historical bike share data. We will load the CSV file into a plain pandas DataFrame."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"dataset/bike-no.csv\")]\n",
|
||||
").with_timestamp_columns(fine_grain_timestamp=time_column_name)\n",
|
||||
"all_data = pd.read_csv(\"bike-no.csv\", parse_dates=[time_column_name])\n",
|
||||
"\n",
|
||||
"# Drop the columns 'casual' and 'registered' as these columns are a breakdown of the total and therefore a leak.\n",
|
||||
"dataset = dataset.drop_columns(columns=[\"casual\", \"registered\"])\n",
|
||||
"\n",
|
||||
"dataset.take(5).to_pandas_dataframe().reset_index(drop=True)"
|
||||
"all_data.drop([\"casual\", \"registered\"], axis=1, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Split the data\n",
|
||||
"\n",
|
||||
@@ -234,22 +251,63 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1680247376789
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# select data that occurs before a specified date\n",
|
||||
"train = dataset.time_before(datetime(2012, 8, 31), include_boundary=True)\n",
|
||||
"train.to_pandas_dataframe().tail(5).reset_index(drop=True)"
|
||||
"train = all_data[all_data[time_column_name] <= pd.Timestamp(\"2012-08-31\")].copy()\n",
|
||||
"test = all_data[all_data[time_column_name] >= pd.Timestamp(\"2012-09-01\")].copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload data to datastore\n",
|
||||
"\n",
|
||||
"The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with the storage account, which contains the default data store. We will use it to upload the bike share data and create [tabular dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test = dataset.time_after(datetime(2012, 9, 1), include_boundary=True)\n",
|
||||
"test.to_pandas_dataframe().head(5).reset_index(drop=True)"
|
||||
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"\n",
|
||||
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" train, target=(datastore, \"dataset/\"), name=\"bike_no_train\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" test, target=(datastore, \"dataset/\"), name=\"bike_no_test\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -360,7 +418,7 @@
|
||||
" featurization=featurization_config,\n",
|
||||
" blocked_models=[\"ExtremeRandomTrees\"],\n",
|
||||
" experiment_timeout_hours=0.3,\n",
|
||||
" training_data=train,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" enable_early_stopping=True,\n",
|
||||
@@ -546,7 +604,7 @@
|
||||
"from run_forecast import run_rolling_forecast\n",
|
||||
"\n",
|
||||
"remote_run = run_rolling_forecast(\n",
|
||||
" test_experiment, compute_target, best_run, test, target_column_name\n",
|
||||
" test_experiment, compute_target, best_run, test_dataset, target_column_name\n",
|
||||
")\n",
|
||||
"remote_run"
|
||||
]
|
||||
@@ -575,7 +633,32 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remote_run.download_file(\"outputs/predictions.csv\", \"predictions.csv\")\n",
|
||||
"df_all = pd.read_csv(\"predictions.csv\")"
|
||||
"fcst_df = pd.read_csv(\"predictions.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that the rolling forecast can contain multiple predictions for each date, each from a different forecast origin. For example, consider 2012-09-05:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fcst_df[fcst_df.date == \"2012-09-05\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Here, the forecast origin refers to the latest date of actuals available for a given forecast. The earliest origin in the rolling forecast, 2012-08-31, is the last day in the training data. For origin date 2012-09-01, the forecasts use actual recorded counts from the training data *and* the actual count recorded on 2012-09-01. Note that the model is not retrained for origin dates later than 2012-08-31, but the values for model features, such as lagged values of daily count, are updated.\n",
|
||||
"\n",
|
||||
"Let's calculate the metrics over all rolling forecasts:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -587,29 +670,17 @@
|
||||
"from azureml.automl.core.shared import constants\n",
|
||||
"from azureml.automl.runtime.shared.score import scoring\n",
|
||||
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"\n",
|
||||
"# use automl metrics module\n",
|
||||
"scores = scoring.score_regression(\n",
|
||||
" y_test=df_all[target_column_name],\n",
|
||||
" y_pred=df_all[\"predicted\"],\n",
|
||||
" y_test=fcst_df[target_column_name],\n",
|
||||
" y_pred=fcst_df[\"predicted\"],\n",
|
||||
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"[Test data scores]\\n\")\n",
|
||||
"for key, value in scores.items():\n",
|
||||
" print(\"{}: {:.3f}\".format(key, value))\n",
|
||||
"\n",
|
||||
"# Plot outputs\n",
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(df_all[target_column_name], df_all[\"predicted\"], color=\"b\")\n",
|
||||
"test_test = plt.scatter(\n",
|
||||
" df_all[target_column_name], df_all[target_column_name], color=\"g\"\n",
|
||||
")\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
"plt.show()"
|
||||
" print(\"{}: {:.3f}\".format(key, value))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -618,36 +689,15 @@
|
||||
"source": [
|
||||
"For more details on what metrics are included and how they are calculated, please refer to [supported metrics](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#regressionforecasting-metrics). You could also calculate residuals, like described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#residuals).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Since we did a rolling evaluation on the test set, we can analyze the predictions by their forecast horizon relative to the rolling origin. The model was initially trained at a forecast horizon of 14, so each prediction from the model is associated with a horizon value from 1 to 14. The horizon values are in a column named, \"horizon_origin,\" in the prediction set. For example, we can calculate some of the error metrics grouped by the horizon:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from metrics_helper import MAPE, APE\n",
|
||||
"\n",
|
||||
"df_all.groupby(\"horizon_origin\").apply(\n",
|
||||
" lambda df: pd.Series(\n",
|
||||
" {\n",
|
||||
" \"MAPE\": MAPE(df[target_column_name], df[\"predicted\"]),\n",
|
||||
" \"RMSE\": np.sqrt(\n",
|
||||
" mean_squared_error(df[target_column_name], df[\"predicted\"])\n",
|
||||
" ),\n",
|
||||
" \"MAE\": mean_absolute_error(df[target_column_name], df[\"predicted\"]),\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
")"
|
||||
"The rolling forecast metric values are very high in comparison to the validation metrics reported by the AutoML job. What's going on here? We will investigate in the following cells!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To drill down more, we can look at the distributions of APE (absolute percentage error) by horizon. From the chart, it is clear that the overall MAPE is being skewed by one particular point where the actual value is of small absolute value."
|
||||
"### Forecast versus actuals plot\n",
|
||||
"We will plot predictions and actuals on a time series plot. Since there are many forecasts for each date, we select the 14-day-ahead forecast from each forecast origin for our comparison."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -656,21 +706,55 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_all_APE = df_all.assign(APE=APE(df_all[target_column_name], df_all[\"predicted\"]))\n",
|
||||
"APEs = [\n",
|
||||
" df_all_APE[df_all[\"horizon_origin\"] == h].APE.values\n",
|
||||
" for h in range(1, forecast_horizon + 1)\n",
|
||||
"]\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"plt.boxplot(APEs)\n",
|
||||
"plt.yscale(\"log\")\n",
|
||||
"plt.xlabel(\"horizon\")\n",
|
||||
"plt.ylabel(\"APE (%)\")\n",
|
||||
"plt.title(\"Absolute Percentage Errors by Forecast Horizon\")\n",
|
||||
"\n",
|
||||
"fcst_df_h14 = (\n",
|
||||
" fcst_df.groupby(\"forecast_origin\", as_index=False)\n",
|
||||
" .last()\n",
|
||||
" .drop(columns=[\"forecast_origin\"])\n",
|
||||
")\n",
|
||||
"fcst_df_h14.set_index(time_column_name, inplace=True)\n",
|
||||
"plt.plot(fcst_df_h14[[target_column_name, \"predicted\"]])\n",
|
||||
"plt.xticks(rotation=45)\n",
|
||||
"plt.title(f\"Predicted vs. Actuals\")\n",
|
||||
"plt.legend([\"actual\", \"14-day-ahead forecast\"])\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Looking at the plot, there are two clear issues:\n",
|
||||
"1. An anomalously low count value on October 29th, 2012.\n",
|
||||
"2. End-of-year holidays (Thanksgiving and Christmas) in late November and late December.\n",
|
||||
"\n",
|
||||
"What happened on Oct. 29th, 2012? That day, Hurricane Sandy brought severe storm surge flooding to the east coast of the United States, particularly around New York City. This is certainly an anomalous event that the model did not account for!\n",
|
||||
"\n",
|
||||
"As for the late year holidays, the model apparently did not learn to account for the full reduction of bike share rentals on these major holidays. The training data covers 2011 and early 2012, so the model fit only had access to a single occurrence of these holidays. This makes it challenging to resolve holiday effects; however, a larger AutoML model search may result in a better model that is more holiday-aware.\n",
|
||||
"\n",
|
||||
"If we filter the predictions prior to the Thanksgiving holiday and remove the anomalous day of 2012-10-29, the metrics are closer to validation levels:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"date_filter = (fcst_df.date != \"2012-10-29\") & (fcst_df.date < \"2012-11-22\")\n",
|
||||
"scores = scoring.score_regression(\n",
|
||||
" y_test=fcst_df[date_filter][target_column_name],\n",
|
||||
" y_pred=fcst_df[date_filter][\"predicted\"],\n",
|
||||
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"[Test data scores (filtered)]\\n\")\n",
|
||||
"for key, value in scores.items():\n",
|
||||
" print(\"{}: {:.3f}\".format(key, value))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -696,10 +780,13 @@
|
||||
],
|
||||
"friendly_name": "Forecasting BikeShare Demand",
|
||||
"index_order": 1,
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -711,11 +798,19 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"npconvert_exporter": "python",
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
},
|
||||
"pygments_lexer": "ipython3",
|
||||
"tags": [
|
||||
"Forecasting"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import argparse
|
||||
from azureml.core import Dataset, Run
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
@@ -36,18 +36,18 @@ y_test_df = (
|
||||
|
||||
fitted_model = joblib.load("model.pkl")
|
||||
|
||||
y_pred, X_trans = fitted_model.rolling_evaluation(X_test_df, y_test_df.values)
|
||||
X_rf = fitted_model.rolling_forecast(X_test_df, y_test_df.values, step=1)
|
||||
|
||||
# Add predictions, actuals, and horizon relative to rolling origin to the test feature data
|
||||
assign_dict = {
|
||||
"horizon_origin": X_trans["horizon_origin"].values,
|
||||
"predicted": y_pred,
|
||||
target_column_name: y_test_df[target_column_name].values,
|
||||
fitted_model.forecast_origin_column_name: "forecast_origin",
|
||||
fitted_model.forecast_column_name: "predicted",
|
||||
fitted_model.actual_column_name: target_column_name,
|
||||
}
|
||||
df_all = X_test_df.assign(**assign_dict)
|
||||
X_rf.rename(columns=assign_dict, inplace=True)
|
||||
|
||||
file_name = "outputs/predictions.csv"
|
||||
export_csv = df_all.to_csv(file_name, header=True)
|
||||
export_csv = X_rf.to_csv(file_name, header=True)
|
||||
|
||||
# Upload the predictions into artifacts
|
||||
run.upload_file(name=file_name, path_or_stream=file_name)
|
||||
|
||||
@@ -16,6 +16,13 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-energy-demand/automl-forecasting-task-energy-demand-advanced-mlflow.ipynb)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -43,7 +50,7 @@
|
||||
"\n",
|
||||
"In this example we use the associated New York City energy demand dataset to showcase how you can use AutoML for a simple forecasting problem and explore the results. The goal is predict the energy demand for the next 48 hours based on historic time-series data.\n",
|
||||
"\n",
|
||||
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n",
|
||||
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n",
|
||||
"\n",
|
||||
"In this notebook you will learn how to:\n",
|
||||
"1. Creating an Experiment using an existing Workspace\n",
|
||||
@@ -260,8 +267,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# split into train based on time\n",
|
||||
"train = dataset.time_before(datetime(2017, 8, 8, 5), include_boundary=True)\n",
|
||||
"train.to_pandas_dataframe().reset_index(drop=True).sort_values(time_column_name).tail(5)"
|
||||
"train = (\n",
|
||||
" dataset.time_before(datetime(2017, 8, 8, 5), include_boundary=True)\n",
|
||||
" .to_pandas_dataframe()\n",
|
||||
" .reset_index(drop=True)\n",
|
||||
")\n",
|
||||
"train.sort_values(time_column_name).tail(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -271,8 +282,39 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# split into test based on time\n",
|
||||
"test = dataset.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))\n",
|
||||
"test.to_pandas_dataframe().reset_index(drop=True).head(5)"
|
||||
"test = (\n",
|
||||
" dataset.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))\n",
|
||||
" .to_pandas_dataframe()\n",
|
||||
" .reset_index(drop=True)\n",
|
||||
")\n",
|
||||
"test.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# register the splitted train and test data in workspace storage\n",
|
||||
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" train, target=(datastore, \"dataset/\"), name=\"nyc_energy_train\"\n",
|
||||
")\n",
|
||||
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" test, target=(datastore, \"dataset/\"), name=\"nyc_energy_test\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -361,7 +403,7 @@
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" blocked_models=[\"ExtremeRandomTrees\", \"AutoArima\", \"Prophet\"],\n",
|
||||
" experiment_timeout_hours=0.3,\n",
|
||||
" training_data=train,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" enable_early_stopping=True,\n",
|
||||
@@ -521,7 +563,7 @@
|
||||
" test_experiment=test_experiment,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" train_run=best_run,\n",
|
||||
" test_dataset=test,\n",
|
||||
" test_dataset=test_dataset,\n",
|
||||
" target_column_name=target_column_name,\n",
|
||||
")\n",
|
||||
"remote_run_infer.wait_for_completion(show_output=False)\n",
|
||||
@@ -627,7 +669,7 @@
|
||||
" \"Prophet\",\n",
|
||||
" ], # These models are blocked for tutorial purposes, remove this for real use cases.\n",
|
||||
" experiment_timeout_hours=0.3,\n",
|
||||
" training_data=train,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" enable_early_stopping=True,\n",
|
||||
@@ -698,7 +740,7 @@
|
||||
" test_experiment=test_experiment_advanced,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" train_run=best_run_lags,\n",
|
||||
" test_dataset=test,\n",
|
||||
" test_dataset=test_dataset,\n",
|
||||
" target_column_name=target_column_name,\n",
|
||||
" inference_folder=\"./forecast_advanced\",\n",
|
||||
")\n",
|
||||
@@ -766,10 +808,13 @@
|
||||
"how-to-use-azureml",
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernel_info": {
|
||||
"name": "python3"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -781,7 +826,15 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
@@ -790,5 +843,5 @@
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -6,7 +6,7 @@ compute instance.
|
||||
|
||||
import argparse
|
||||
from azureml.core import Dataset, Run
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@@ -52,7 +52,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Please make sure you have followed the `configuration.ipynb` notebook so that your ML workspace information is saved in the config file."
|
||||
"Please make sure you have followed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) so that your ML workspace information is saved in the config file."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -758,7 +758,15 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Forecasting farther than the forecast horizon <a id=\"recursive forecasting\"></a>\n",
|
||||
"When the forecast destination, or the latest date in the prediction data frame, is farther into the future than the specified forecast horizon, the `forecast()` function will still make point predictions out to the later date using a recursive operation mode. Internally, the method recursively applies the regular forecaster to generate context so that we can forecast further into the future. \n",
|
||||
"When the forecast destination, or the latest date in the prediction data frame, is farther into the future than the specified forecast horizon, the forecaster must be iteratively applied. Here, we advance the forecast origin on each iteration over the prediction window, predicting `max_horizon` periods ahead on each iteration. There are two choices for the context data to use as the forecaster advances into the prediction window:\n",
|
||||
"\n",
|
||||
"1. We can use forecasted values from previous iterations (recursive forecast),\n",
|
||||
"2. We can use known, actual values of the target if they are available (rolling forecast).\n",
|
||||
"\n",
|
||||
"The first method is useful in a true forecasting scenario when we do not yet know the actual target values while the second is useful in an evaluation scenario where we want to compute accuracy metrics for the `max_horizon`-period-ahead forecaster over a long test set. We refer to the first as a **recursive forecast** since we apply the forecaster recursively over the prediction window and the second as a **rolling forecast** since we roll forward over known actuals.\n",
|
||||
"\n",
|
||||
"### Recursive forecasting\n",
|
||||
"By default, the `forecast()` function will make point predictions out to the later date using a recursive operation mode. Internally, the method recursively applies the regular forecaster to generate context so that we can forecast further into the future. \n",
|
||||
"\n",
|
||||
"To illustrate the use-case and operation of recursive forecasting, we'll consider an example with a single time-series where the forecasting period directly follows the training period and is twice as long as the forecasting horizon given at training time.\n",
|
||||
"\n",
|
||||
@@ -818,6 +826,35 @@
|
||||
"np.array_equal(y_pred_all, y_pred_long)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Rolling forecasts\n",
|
||||
"A rolling forecast is a similar concept to the recursive forecasts described above except that we use known actual values of the target for our context data. We have provided a different, public method for this called `rolling_forecast`. In addition to test data and actuals (`X_test` and `y_test`), `rolling_forecast` also accepts an optional `step` parameter that controls how far the origin advances on each iteration. The recursive forecast mode uses a fixed step of `max_horizon` while `rolling_forecast` defaults to a step size of 1, but can be set to any integer from 1 to `max_horizon`, inclusive.\n",
|
||||
"\n",
|
||||
"Let's see what the rolling forecast looks like on the long test set with the step set to 1:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_rf = fitted_model.rolling_forecast(X_test_long, y_test_long, step=1)\n",
|
||||
"X_rf.head(n=12)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Notice that `rolling_forecast` has returned a single DataFrame containing all results and has generated some new columns: `_automl_forecast_origin`, `_automl_forecast_y`, and `_automl_actual_y`. These are the origin date for each forecast, the forecasted value and the actual value, respectively. Note that \"y\" in the forecast and actual column names will generally be replaced by the target column name supplied to AutoML.\n",
|
||||
"\n",
|
||||
"The output above shows forecasts for two prediction windows, the first with origin at the end of the training set and the second including the first observation in the test set (2000-01-01 06:00:00). Since the forecast windows overlap, there are multiple forecasts for most dates which are associated with different origin dates."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -866,9 +903,9 @@
|
||||
"friendly_name": "Forecasting away from training data",
|
||||
"index_order": 3,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -880,7 +917,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.7.13"
|
||||
},
|
||||
"tags": [
|
||||
"Forecasting",
|
||||
@@ -894,5 +931,5 @@
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -19,7 +19,14 @@
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
""
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-github-dau)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -52,7 +59,7 @@
|
||||
"\n",
|
||||
"AutoML highlights here include using Deep Learning forecasts, Arima, Prophet, Remote Execution and Remote Inferencing, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"Make sure you have executed the [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"Notebook synopsis:\n",
|
||||
"\n",
|
||||
@@ -325,7 +332,7 @@
|
||||
"source": [
|
||||
"### Setting forecaster maximum horizon \n",
|
||||
"\n",
|
||||
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 12 periods (i.e. 12 months). Notice that this is much shorter than the number of months in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
|
||||
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 14 periods (i.e. 14 days). Notice that this is much shorter than the number of months in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -337,7 +344,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"forecast_horizon = 12"
|
||||
"forecast_horizon = 14"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -382,7 +389,7 @@
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" experiment_timeout_hours=1,\n",
|
||||
" experiment_timeout_hours=1.5,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" validation_data=valid_dataset,\n",
|
||||
@@ -681,9 +688,9 @@
|
||||
],
|
||||
"hide_code_all_hidden": false,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -695,9 +702,9 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
"version": "3.8.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -4,8 +4,7 @@ import os
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
||||
|
||||
from azureml.automl.runtime.shared.score import scoring, constants
|
||||
@@ -19,219 +18,8 @@ except ImportError:
|
||||
_torch_present = False
|
||||
|
||||
|
||||
def align_outputs(
|
||||
y_predicted,
|
||||
X_trans,
|
||||
X_test,
|
||||
y_test,
|
||||
predicted_column_name="predicted",
|
||||
horizon_colname="horizon_origin",
|
||||
):
|
||||
"""
|
||||
Demonstrates how to get the output aligned to the inputs
|
||||
using pandas indexes. Helps understand what happened if
|
||||
the output's shape differs from the input shape, or if
|
||||
the data got re-sorted by time and grain during forecasting.
|
||||
|
||||
Typical causes of misalignment are:
|
||||
* we predicted some periods that were missing in actuals -> drop from eval
|
||||
* model was asked to predict past max_horizon -> increase max horizon
|
||||
* data at start of X_test was needed for lags -> provide previous periods
|
||||
"""
|
||||
if horizon_colname in X_trans:
|
||||
df_fcst = pd.DataFrame(
|
||||
{
|
||||
predicted_column_name: y_predicted,
|
||||
horizon_colname: X_trans[horizon_colname],
|
||||
}
|
||||
)
|
||||
else:
|
||||
df_fcst = pd.DataFrame({predicted_column_name: y_predicted})
|
||||
|
||||
# y and X outputs are aligned by forecast() function contract
|
||||
df_fcst.index = X_trans.index
|
||||
|
||||
# align original X_test to y_test
|
||||
X_test_full = X_test.copy()
|
||||
X_test_full[target_column_name] = y_test
|
||||
|
||||
# X_test_full's index does not include origin, so reset for merge
|
||||
df_fcst.reset_index(inplace=True)
|
||||
X_test_full = X_test_full.reset_index().drop(columns="index")
|
||||
together = df_fcst.merge(X_test_full, how="right")
|
||||
|
||||
# drop rows where prediction or actuals are nan
|
||||
# happens because of missing actuals
|
||||
# or at edges of time due to lags/rolling windows
|
||||
clean = together[
|
||||
together[[target_column_name, predicted_column_name]].notnull().all(axis=1)
|
||||
]
|
||||
return clean
|
||||
|
||||
|
||||
def do_rolling_forecast_with_lookback(
|
||||
fitted_model, X_test, y_test, max_horizon, X_lookback, y_lookback, freq="D"
|
||||
):
|
||||
"""
|
||||
Produce forecasts on a rolling origin over the given test set.
|
||||
|
||||
Each iteration makes a forecast for the next 'max_horizon' periods
|
||||
with respect to the current origin, then advances the origin by the
|
||||
horizon time duration. The prediction context for each forecast is set so
|
||||
that the forecaster uses the actual target values prior to the current
|
||||
origin time for constructing lag features.
|
||||
|
||||
This function returns a concatenated DataFrame of rolling forecasts.
|
||||
"""
|
||||
print("Using lookback of size: ", y_lookback.size)
|
||||
df_list = []
|
||||
origin_time = X_test[time_column_name].min()
|
||||
X = X_lookback.append(X_test)
|
||||
y = np.concatenate((y_lookback, y_test), axis=0)
|
||||
while origin_time <= X_test[time_column_name].max():
|
||||
# Set the horizon time - end date of the forecast
|
||||
horizon_time = origin_time + max_horizon * to_offset(freq)
|
||||
|
||||
# Extract test data from an expanding window up-to the horizon
|
||||
expand_wind = X[time_column_name] < horizon_time
|
||||
X_test_expand = X[expand_wind]
|
||||
y_query_expand = np.zeros(len(X_test_expand)).astype(float)
|
||||
y_query_expand.fill(np.NaN)
|
||||
|
||||
if origin_time != X[time_column_name].min():
|
||||
# Set the context by including actuals up-to the origin time
|
||||
test_context_expand_wind = X[time_column_name] < origin_time
|
||||
context_expand_wind = X_test_expand[time_column_name] < origin_time
|
||||
y_query_expand[context_expand_wind] = y[test_context_expand_wind]
|
||||
|
||||
# Print some debug info
|
||||
print(
|
||||
"Horizon_time:",
|
||||
horizon_time,
|
||||
" origin_time: ",
|
||||
origin_time,
|
||||
" max_horizon: ",
|
||||
max_horizon,
|
||||
" freq: ",
|
||||
freq,
|
||||
)
|
||||
print("expand_wind: ", expand_wind)
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
print("X_test")
|
||||
print(X)
|
||||
print("X_test_expand")
|
||||
print(X_test_expand)
|
||||
print("Type of X_test_expand: ", type(X_test_expand))
|
||||
print("Type of y_query_expand: ", type(y_query_expand))
|
||||
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
|
||||
# Make a forecast out to the maximum horizon
|
||||
# y_fcst, X_trans = y_query_expand, X_test_expand
|
||||
y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)
|
||||
|
||||
print("y_fcst")
|
||||
print(y_fcst)
|
||||
|
||||
# Align forecast with test set for dates within
|
||||
# the current rolling window
|
||||
trans_tindex = X_trans.index.get_level_values(time_column_name)
|
||||
trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
|
||||
test_roll_wind = expand_wind & (X[time_column_name] >= origin_time)
|
||||
df_list.append(
|
||||
align_outputs(
|
||||
y_fcst[trans_roll_wind],
|
||||
X_trans[trans_roll_wind],
|
||||
X[test_roll_wind],
|
||||
y[test_roll_wind],
|
||||
)
|
||||
)
|
||||
|
||||
# Advance the origin time
|
||||
origin_time = horizon_time
|
||||
|
||||
return pd.concat(df_list, ignore_index=True)
|
||||
|
||||
|
||||
def do_rolling_forecast(fitted_model, X_test, y_test, max_horizon, freq="D"):
|
||||
"""
|
||||
Produce forecasts on a rolling origin over the given test set.
|
||||
|
||||
Each iteration makes a forecast for the next 'max_horizon' periods
|
||||
with respect to the current origin, then advances the origin by the
|
||||
horizon time duration. The prediction context for each forecast is set so
|
||||
that the forecaster uses the actual target values prior to the current
|
||||
origin time for constructing lag features.
|
||||
|
||||
This function returns a concatenated DataFrame of rolling forecasts.
|
||||
"""
|
||||
df_list = []
|
||||
origin_time = X_test[time_column_name].min()
|
||||
while origin_time <= X_test[time_column_name].max():
|
||||
# Set the horizon time - end date of the forecast
|
||||
horizon_time = origin_time + max_horizon * to_offset(freq)
|
||||
|
||||
# Extract test data from an expanding window up-to the horizon
|
||||
expand_wind = X_test[time_column_name] < horizon_time
|
||||
X_test_expand = X_test[expand_wind]
|
||||
y_query_expand = np.zeros(len(X_test_expand)).astype(float)
|
||||
y_query_expand.fill(np.NaN)
|
||||
|
||||
if origin_time != X_test[time_column_name].min():
|
||||
# Set the context by including actuals up-to the origin time
|
||||
test_context_expand_wind = X_test[time_column_name] < origin_time
|
||||
context_expand_wind = X_test_expand[time_column_name] < origin_time
|
||||
y_query_expand[context_expand_wind] = y_test[test_context_expand_wind]
|
||||
|
||||
# Print some debug info
|
||||
print(
|
||||
"Horizon_time:",
|
||||
horizon_time,
|
||||
" origin_time: ",
|
||||
origin_time,
|
||||
" max_horizon: ",
|
||||
max_horizon,
|
||||
" freq: ",
|
||||
freq,
|
||||
)
|
||||
print("expand_wind: ", expand_wind)
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
print("X_test")
|
||||
print(X_test)
|
||||
print("X_test_expand")
|
||||
print(X_test_expand)
|
||||
print("Type of X_test_expand: ", type(X_test_expand))
|
||||
print("Type of y_query_expand: ", type(y_query_expand))
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
|
||||
# Make a forecast out to the maximum horizon
|
||||
y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)
|
||||
|
||||
print("y_fcst")
|
||||
print(y_fcst)
|
||||
|
||||
# Align forecast with test set for dates within the
|
||||
# current rolling window
|
||||
trans_tindex = X_trans.index.get_level_values(time_column_name)
|
||||
trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
|
||||
test_roll_wind = expand_wind & (X_test[time_column_name] >= origin_time)
|
||||
df_list.append(
|
||||
align_outputs(
|
||||
y_fcst[trans_roll_wind],
|
||||
X_trans[trans_roll_wind],
|
||||
X_test[test_roll_wind],
|
||||
y_test[test_roll_wind],
|
||||
)
|
||||
)
|
||||
|
||||
# Advance the origin time
|
||||
origin_time = horizon_time
|
||||
|
||||
return pd.concat(df_list, ignore_index=True)
|
||||
def map_location_cuda(storage, loc):
|
||||
return storage.cuda()
|
||||
|
||||
|
||||
def APE(actual, pred):
|
||||
@@ -254,10 +42,6 @@ def MAPE(actual, pred):
|
||||
return np.mean(APE(actual_safe, pred_safe))
|
||||
|
||||
|
||||
def map_location_cuda(storage, loc):
|
||||
return storage.cuda()
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--max_horizon",
|
||||
@@ -303,7 +87,6 @@ print(model_path)
|
||||
run = Run.get_context()
|
||||
# get input dataset by name
|
||||
test_dataset = run.input_datasets["test_data"]
|
||||
lookback_dataset = run.input_datasets["lookback_data"]
|
||||
|
||||
grain_column_names = []
|
||||
|
||||
@@ -312,15 +95,8 @@ df = test_dataset.to_pandas_dataframe()
|
||||
print("Read df")
|
||||
print(df)
|
||||
|
||||
X_test_df = test_dataset.drop_columns(columns=[target_column_name])
|
||||
y_test_df = test_dataset.with_timestamp_columns(None).keep_columns(
|
||||
columns=[target_column_name]
|
||||
)
|
||||
|
||||
X_lookback_df = lookback_dataset.drop_columns(columns=[target_column_name])
|
||||
y_lookback_df = lookback_dataset.with_timestamp_columns(None).keep_columns(
|
||||
columns=[target_column_name]
|
||||
)
|
||||
X_test_df = df
|
||||
y_test = df.pop(target_column_name).to_numpy()
|
||||
|
||||
_, ext = os.path.splitext(model_path)
|
||||
if ext == ".pt":
|
||||
@@ -336,37 +112,20 @@ else:
|
||||
# Load the sklearn pipeline.
|
||||
fitted_model = joblib.load(model_path)
|
||||
|
||||
if hasattr(fitted_model, "get_lookback"):
|
||||
lookback = fitted_model.get_lookback()
|
||||
df_all = do_rolling_forecast_with_lookback(
|
||||
fitted_model,
|
||||
X_test_df.to_pandas_dataframe(),
|
||||
y_test_df.to_pandas_dataframe().values.T[0],
|
||||
max_horizon,
|
||||
X_lookback_df.to_pandas_dataframe()[-lookback:],
|
||||
y_lookback_df.to_pandas_dataframe().values.T[0][-lookback:],
|
||||
freq,
|
||||
)
|
||||
else:
|
||||
df_all = do_rolling_forecast(
|
||||
fitted_model,
|
||||
X_test_df.to_pandas_dataframe(),
|
||||
y_test_df.to_pandas_dataframe().values.T[0],
|
||||
max_horizon,
|
||||
freq,
|
||||
)
|
||||
X_rf = fitted_model.rolling_forecast(X_test_df, y_test, step=1)
|
||||
assign_dict = {
|
||||
fitted_model.forecast_origin_column_name: "forecast_origin",
|
||||
fitted_model.forecast_column_name: "predicted",
|
||||
fitted_model.actual_column_name: target_column_name,
|
||||
}
|
||||
X_rf.rename(columns=assign_dict, inplace=True)
|
||||
|
||||
print(df_all)
|
||||
|
||||
print("target values:::")
|
||||
print(df_all[target_column_name])
|
||||
print("predicted values:::")
|
||||
print(df_all["predicted"])
|
||||
print(X_rf.head())
|
||||
|
||||
# Use the AutoML scoring module
|
||||
regression_metrics = list(constants.REGRESSION_SCALAR_SET)
|
||||
y_test = np.array(df_all[target_column_name])
|
||||
y_pred = np.array(df_all["predicted"])
|
||||
y_test = np.array(X_rf[target_column_name])
|
||||
y_pred = np.array(X_rf["predicted"])
|
||||
scores = scoring.score_regression(y_test, y_pred, regression_metrics)
|
||||
|
||||
print("scores:")
|
||||
@@ -376,11 +135,11 @@ for key, value in scores.items():
|
||||
run.log(key, value)
|
||||
|
||||
print("Simple forecasting model")
|
||||
rmse = np.sqrt(mean_squared_error(df_all[target_column_name], df_all["predicted"]))
|
||||
rmse = np.sqrt(mean_squared_error(X_rf[target_column_name], X_rf["predicted"]))
|
||||
print("[Test Data] \nRoot Mean squared error: %.2f" % rmse)
|
||||
mae = mean_absolute_error(df_all[target_column_name], df_all["predicted"])
|
||||
mae = mean_absolute_error(X_rf[target_column_name], X_rf["predicted"])
|
||||
print("mean_absolute_error score: %.2f" % mae)
|
||||
print("MAPE: %.2f" % MAPE(df_all[target_column_name], df_all["predicted"]))
|
||||
print("MAPE: %.2f" % MAPE(X_rf[target_column_name], X_rf["predicted"]))
|
||||
|
||||
run.log("rmse", rmse)
|
||||
run.log("mae", mae)
|
||||
|
||||
@@ -16,6 +16,13 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-hierarchical-timeseries-in-pipeline)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -40,7 +47,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisites\n",
|
||||
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
|
||||
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -251,8 +258,17 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"This dictionary defines the AutoML and hierarchy settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``HTSTrainParameters`` objects. For the forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
|
||||
"\n",
|
||||
"#### ``ForecastingParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **task** | forecasting |\n",
|
||||
@@ -260,20 +276,22 @@
|
||||
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
|
||||
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
|
||||
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **time_series_id_column_name** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |"
|
||||
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |\n",
|
||||
"\n",
|
||||
"#### ``HTSTrainParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
|
||||
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
|
||||
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
|
||||
"| **enable_engineered_explanations** | The switch controls engineered explanations. |"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -287,6 +305,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.train.automl.runtime._hts.hts_parameters import HTSTrainParameters\n",
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"model_explainability = True\n",
|
||||
"\n",
|
||||
@@ -300,24 +321,26 @@
|
||||
"label_column_name = \"quantity\"\n",
|
||||
"forecast_horizon = 7\n",
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=time_column_name,\n",
|
||||
" forecast_horizon=forecast_horizon,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_settings = {\n",
|
||||
" \"task\": \"forecasting\",\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"label_column_name\": label_column_name,\n",
|
||||
" \"time_column_name\": time_column_name,\n",
|
||||
" \"forecast_horizon\": forecast_horizon,\n",
|
||||
" \"hierarchy_column_names\": hierarchy,\n",
|
||||
" \"hierarchy_training_level\": training_level,\n",
|
||||
" \"track_child_runs\": False,\n",
|
||||
" \"pipeline_fetch_max_batch_size\": 15,\n",
|
||||
" \"model_explainability\": model_explainability,\n",
|
||||
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" \"cv_step_size\": \"auto\",\n",
|
||||
"automl_settings = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" experiment_timeout_hours=1,\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" track_child_runs=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
" pipeline_fetch_max_batch_size=15,\n",
|
||||
" model_explainability=model_explainability,\n",
|
||||
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
" # The following settings are specific to this sample and should be adjusted according to your own needs.\n",
|
||||
" \"iteration_timeout_minutes\": 10,\n",
|
||||
" \"iterations\": 10,\n",
|
||||
"}\n",
|
||||
" iteration_timeout_minutes=10,\n",
|
||||
" iterations=15,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"hts_parameters = HTSTrainParameters(\n",
|
||||
" automl_settings=automl_settings,\n",
|
||||
@@ -338,15 +361,25 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Parallel run step is leveraged to train the hierarchy. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The `process_count_per_node` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The ``process_count_per_node`` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"\n",
|
||||
"* **experiment:** The experiment used for training.\n",
|
||||
"* **train_data:** The tabular dataset to be used as input to the training run.\n",
|
||||
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long.\n",
|
||||
"* **process_count_per_node:** Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance.\n",
|
||||
"* **train_pipeline_parameters:** The set of configuration parameters defined in the previous section. \n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for training. |\n",
|
||||
"| **train_data** | The file dataset to be used as input to the training run. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node for optimal performance. |\n",
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
|
||||
"\n",
|
||||
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
|
||||
"where,\n",
|
||||
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
|
||||
"- $ p $ is ``process_count_per_node``\n",
|
||||
"- $ n $ is ``node_count``\n",
|
||||
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -365,6 +398,7 @@
|
||||
" node_count=2,\n",
|
||||
" process_count_per_node=8,\n",
|
||||
" train_pipeline_parameters=hts_parameters,\n",
|
||||
" run_invocation_timeout=3900,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -509,19 +543,24 @@
|
||||
"source": [
|
||||
"## 5.0 Forecasting\n",
|
||||
"For hierarchical forecasting we need to provide the HTSInferenceParameters object.\n",
|
||||
"#### HTSInferenceParameters arguments\n",
|
||||
"* **hierarchy_forecast_level:** The default level of the hierarchy to produce prediction/forecast on.\n",
|
||||
"* **allocation_method:** \\[Optional] The disaggregation method to use if the hierarchy forecast level specified is below the define hierarchy training level. <br><i>(average historical proportions) 'average_historical_proportions'</i><br><i>(proportions of the historical averages) 'proportions_of_historical_average'</i>\n",
|
||||
"#### ``HTSInferenceParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **hierarchy_forecast_level:** | The default level of the hierarchy to produce prediction/forecast on. |\n",
|
||||
"| **allocation_method:** | \\[Optional] The disaggregation method to use if the hierarchy forecast level specified is below the define hierarchy training level. <br><i>(average historical proportions) 'average_historical_proportions'</i><br><i>(proportions of the historical averages) 'proportions_of_historical_average'</i> |\n",
|
||||
"\n",
|
||||
"#### get_many_models_batch_inference_steps arguments\n",
|
||||
"* **experiment:** The experiment used for inference run.\n",
|
||||
"* **inference_data:** The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"* **compute_target:** The compute target that runs the inference pipeline.\n",
|
||||
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku).\n",
|
||||
"* **process_count_per_node:** The number of processes per node.\n",
|
||||
"* **train_run_id:** \\[Optional] The run id of the hierarchy training, by default it is the latest successful training hts run in the experiment.\n",
|
||||
"* **train_experiment_name:** \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline.\n",
|
||||
"* **process_count_per_node:** \\[Optional] The number of processes per node, by default it's 4."
|
||||
"#### ``get_many_models_batch_inference_steps`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
|
||||
"| **inference_pipeline_parameters** | \\[Optional] The ``HTSInferenceParameters`` object defined above. |\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -620,9 +659,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -634,7 +673,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
"version": "3.8.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -16,6 +16,13 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-many-models-in-pipeline)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -40,7 +47,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisites\n",
|
||||
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
|
||||
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -306,7 +313,7 @@
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"\n",
|
||||
"# Name your cluster\n",
|
||||
"compute_name = \"mm-compute\"\n",
|
||||
"compute_name = \"mm-compute-v1\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if compute_name in ws.compute_targets:\n",
|
||||
@@ -316,7 +323,7 @@
|
||||
"else:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_D16S_V3\", max_nodes=20\n",
|
||||
" vm_size=\"STANDARD_D14_V2\", max_nodes=20\n",
|
||||
" )\n",
|
||||
" # Create the compute target\n",
|
||||
" compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)\n",
|
||||
@@ -379,8 +386,17 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name(s) definition.\n",
|
||||
"\n",
|
||||
"#### ``ForecastingParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **task** | forecasting |\n",
|
||||
@@ -388,17 +404,19 @@
|
||||
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
|
||||
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### ``ManyModelsTrainParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
|
||||
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
|
||||
]
|
||||
},
|
||||
@@ -415,23 +433,29 @@
|
||||
"from azureml.train.automl.runtime._many_models.many_models_parameters import (\n",
|
||||
" ManyModelsTrainParameters,\n",
|
||||
")\n",
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
|
||||
"\n",
|
||||
"partition_column_names = [\"Store\", \"Brand\"]\n",
|
||||
"automl_settings = {\n",
|
||||
" \"task\": \"forecasting\",\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"iteration_timeout_minutes\": 10, # This needs to be changed based on the dataset. We ask customer to explore how long training is taking before settings this value\n",
|
||||
" \"iterations\": 15,\n",
|
||||
" \"experiment_timeout_hours\": 0.25,\n",
|
||||
" \"label_column_name\": \"Quantity\",\n",
|
||||
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" \"cv_step_size\": \"auto\",\n",
|
||||
" \"time_column_name\": \"WeekStarting\",\n",
|
||||
" \"drop_column_names\": \"Revenue\",\n",
|
||||
" \"forecast_horizon\": 6,\n",
|
||||
" \"time_series_id_column_names\": partition_column_names,\n",
|
||||
" \"track_child_runs\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=\"WeekStarting\",\n",
|
||||
" forecast_horizon=6,\n",
|
||||
" time_series_id_column_names=partition_column_names,\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_settings = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" iteration_timeout_minutes=10,\n",
|
||||
" iterations=15,\n",
|
||||
" experiment_timeout_hours=0.25,\n",
|
||||
" label_column_name=\"Quantity\",\n",
|
||||
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" track_child_runs=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"mm_paramters = ManyModelsTrainParameters(\n",
|
||||
" automl_settings=automl_settings, partition_column_names=partition_column_names\n",
|
||||
@@ -451,7 +475,9 @@
|
||||
"\n",
|
||||
"Reuse of previous results (``allow_reuse``) is key when using pipelines in a collaborative environment since eliminating unnecessary reruns offers agility. Reuse is the default behavior when the ``script_name``, ``inputs``, and the parameters of a step remain the same. When reuse is allowed, results from the previous run are immediately sent to the next step. If ``allow_reuse`` is set to False, a new run will always be generated for this step during pipeline execution.\n",
|
||||
"\n",
|
||||
"> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input."
|
||||
"> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input.\n",
|
||||
"\n",
|
||||
"> Note that we **drop column** \"Revenue\" from the dataset in this step to avoid information leak as \"Quantity\" = \"Revenue\" / \"Price\". **Please modify the logic based on your data**."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -489,17 +515,25 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The process_count_per_node is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The ``process_count_per_node`` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for training. |\n",
|
||||
"| **train_data** | The file dataset to be used as input to the training run. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node for optimal performance. |\n",
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
|
||||
"\n",
|
||||
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
|
||||
"where,\n",
|
||||
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
|
||||
"- $ p $ is ``process_count_per_node``\n",
|
||||
"- $ n $ is ``node_count``\n",
|
||||
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -517,7 +551,7 @@
|
||||
" compute_target=compute_target,\n",
|
||||
" node_count=2,\n",
|
||||
" process_count_per_node=8,\n",
|
||||
" run_invocation_timeout=920,\n",
|
||||
" run_invocation_timeout=1200,\n",
|
||||
" train_pipeline_parameters=mm_paramters,\n",
|
||||
")"
|
||||
]
|
||||
@@ -598,7 +632,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 7.2 Schedule the pipeline\n",
|
||||
"### 5.2 Schedule the pipeline\n",
|
||||
"You can also [schedule the pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-schedule-pipelines) to run on a time-based or change-based schedule. This could be used to automatically retrain models every month or based on another trigger such as data drift."
|
||||
]
|
||||
},
|
||||
@@ -654,25 +688,31 @@
|
||||
"source": [
|
||||
"For many models we need to provide the ManyModelsInferenceParameters object.\n",
|
||||
"\n",
|
||||
"#### ManyModelsInferenceParameters arguments\n",
|
||||
"#### ``ManyModelsInferenceParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **target_column_name** | \\[Optional] Column name only if the inference dataset has the target. |\n",
|
||||
"| **time_column_name** | \\[Optional] Column name only if it is timeseries. |\n",
|
||||
"| **many_models_run_id** | \\[Optional] Many models run id where models were trained. |\n",
|
||||
"| **time_column_name** | \\[Optional] Time column name only if it is timeseries. |\n",
|
||||
"| **inference_type** | \\[Optional] Which inference method to use on the model. Possible values are 'forecast', 'predict_proba', and 'predict'. |\n",
|
||||
"| **forecast_mode** | \\[Optional] The type of forecast to be used, either 'rolling' or 'recursive'; defaults to 'recursive'. |\n",
|
||||
"| **step** | \\[Optional] Number of periods to advance the forecasting window in each iteration **(for rolling forecast only)**; defaults to 1. |\n",
|
||||
"\n",
|
||||
"#### get_many_models_batch_inference_steps arguments\n",
|
||||
"#### ``get_many_models_batch_inference_steps`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** The compute target that runs the inference pipeline.|\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** The number of processes per node.\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
|
||||
"| **inference_pipeline_parameters** | \\[Optional] The ``ManyModelsInferenceParameters`` object defined above. |\n",
|
||||
"| **append_row_file_name** | \\[Optional] The name of the output file (optional, default value is 'parallel_run_step.txt'). Supports 'txt' and 'csv' file extension. A 'txt' file extension generates the output in 'txt' format with space as separator without column names. A 'csv' file extension generates the output in 'csv' format with comma as separator and with column names. |\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node, by default it's 4. |"
|
||||
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **output_datastore** | \\[Optional] The ``Datastore`` or ``OutputDatasetConfig`` to be used for output. If specified any pipeline output will be written to that location. If unspecified the default datastore will be used. |\n",
|
||||
"| **arguments** | \\[Optional] Arguments to be passed to inference script. Possible argument is '--forecast_quantiles' followed by quantile values. |"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -692,6 +732,8 @@
|
||||
" target_column_name=\"Quantity\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"output_file_name = \"parallel_run_step.csv\"\n",
|
||||
"\n",
|
||||
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
|
||||
" experiment=experiment,\n",
|
||||
" inference_data=inference_ds_small,\n",
|
||||
@@ -703,6 +745,8 @@
|
||||
" train_run_id=training_run.id,\n",
|
||||
" train_experiment_name=training_run.experiment.name,\n",
|
||||
" inference_pipeline_parameters=mm_parameters,\n",
|
||||
" append_row_file_name=output_file_name,\n",
|
||||
" arguments=[\"--forecast_quantiles\", 0.1, 0.9],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -737,7 +781,7 @@
|
||||
"\n",
|
||||
"The following code snippet:\n",
|
||||
"1. Downloads the contents of the output folder that is passed in the parallel run step \n",
|
||||
"2. Reads the parallel_run_step.txt file that has the predictions as pandas dataframe and \n",
|
||||
"2. Reads the output file that has the predictions as pandas dataframe and \n",
|
||||
"3. Displays the top 10 rows of the predictions"
|
||||
]
|
||||
},
|
||||
@@ -752,19 +796,9 @@
|
||||
"forecasting_results_name = \"forecasting_results\"\n",
|
||||
"forecasting_output_name = \"many_models_inference_output\"\n",
|
||||
"forecast_file = get_output_from_mm_pipeline(\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
|
||||
")\n",
|
||||
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None)\n",
|
||||
"df.columns = [\n",
|
||||
" \"Week Starting\",\n",
|
||||
" \"Store\",\n",
|
||||
" \"Brand\",\n",
|
||||
" \"Quantity\",\n",
|
||||
" \"Advert\",\n",
|
||||
" \"Price\",\n",
|
||||
" \"Revenue\",\n",
|
||||
" \"Predicted\",\n",
|
||||
"]\n",
|
||||
"df = pd.read_csv(forecast_file)\n",
|
||||
"print(\n",
|
||||
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
|
||||
")\n",
|
||||
@@ -837,9 +871,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -851,7 +885,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
||||
|
Before Width: | Height: | Size: 2.6 MiB After Width: | Height: | Size: 2.6 MiB |
@@ -11,6 +11,12 @@ def main(args):
|
||||
dataset = run_context.input_datasets["train_10_models"]
|
||||
df = dataset.to_pandas_dataframe()
|
||||
|
||||
# Drop the column "Revenue" from the dataset to avoid information leak as
|
||||
# "Quantity" = "Revenue" / "Price". Please modify the logic based on your data.
|
||||
drop_column_name = "Revenue"
|
||||
if drop_column_name in df.columns:
|
||||
df.drop(drop_column_name, axis=1, inplace=True)
|
||||
|
||||
# Apply any data pre-processing techniques here
|
||||
|
||||
df.to_parquet(output / "data_prepared_result.parquet", compression=None)
|
||||
|
||||
@@ -16,6 +16,13 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -40,7 +47,7 @@
|
||||
"## Introduction<a id=\"introduction\"></a>\n",
|
||||
"In this example, we use AutoML to train, select, and operationalize a time-series forecasting model for multiple time-series.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area."
|
||||
]
|
||||
@@ -242,7 +249,9 @@
|
||||
" time_series_id_column_names, group_keys=False\n",
|
||||
" )\n",
|
||||
" df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-n])\n",
|
||||
" df_head.reset_index(inplace=True, drop=True)\n",
|
||||
" df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-n:])\n",
|
||||
" df_tail.reset_index(inplace=True, drop=True)\n",
|
||||
" return df_head, df_tail\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -715,7 +724,7 @@
|
||||
" description=\"Automl forecasting sample service\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aci_service_name = \"automl-oj-forecast-01\"\n",
|
||||
"aci_service_name = \"automl-oj-forecast-03\"\n",
|
||||
"print(aci_service_name)\n",
|
||||
"aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)\n",
|
||||
"aci_service.wait_for_deployment(True)\n",
|
||||
@@ -792,7 +801,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"serv = Webservice(ws, \"automl-oj-forecast-01\")\n",
|
||||
"serv = Webservice(ws, \"automl-oj-forecast-03\")\n",
|
||||
"serv.delete() # don't do it accidentally"
|
||||
]
|
||||
}
|
||||
@@ -821,9 +830,9 @@
|
||||
"friendly_name": "Forecasting orange juice sales with deployment",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -835,7 +844,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"tags": [
|
||||
"None"
|
||||
|
||||
@@ -6,7 +6,7 @@ compute instance.
|
||||
|
||||
import argparse
|
||||
from azureml.core import Dataset, Run
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@@ -1,5 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1h_automl_in_pipeline/automl-forecasting-in-pipeline)).</font>\n",
|
||||
"</br>\n",
|
||||
"</br>\n",
|
||||
"<font color=\"red\" size=\"5\">\n",
|
||||
"For examples illustrating how to build pipelines with components, please use the following links:</font>\n",
|
||||
"<ul>\n",
|
||||
" <li><a href=\"https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-many-models-in-pipeline\">Many Models</a></li>\n",
|
||||
" <li><a href=\"https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-hierarchical-timeseries-in-pipeline\">Hierarchical Time Series</a></li>\n",
|
||||
" <li><a href=\"https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-distributed-tcn\">Distributed TCN</a></li>\n",
|
||||
"</ul>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -13,7 +29,7 @@
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"\n",
|
||||
"In this notebook, we demonstrate how to use piplines to train and inference on AutoML Forecasting model. Two pipelines will be created: one for training AutoML model, and the other is for inference on AutoML model. We'll also demonstrate how to schedule the inference pipeline so you can get inference results periodically (with refreshed test dataset). Make sure you have executed the configuration notebook before running this notebook. In this notebook you will learn how to:\n",
|
||||
"In this notebook, we demonstrate how to use piplines to train and inference on AutoML Forecasting model. Two pipelines will be created: one for training AutoML model, and the other is for inference on AutoML model. We'll also demonstrate how to schedule the inference pipeline so you can get inference results periodically (with refreshed test dataset). Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook. In this notebook you will learn how to:\n",
|
||||
"\n",
|
||||
"- Configure AutoML using AutoMLConfig for forecasting tasks using pipeline AutoMLSteps.\n",
|
||||
"- Create and register an AutoML model using AzureML pipeline.\n",
|
||||
@@ -555,40 +571,15 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Model\n",
|
||||
"from azureml.train.automl.run import AutoMLRun\n",
|
||||
"\n",
|
||||
"model = Model(ws, model_name_str)\n",
|
||||
"download_path = model.download(model_name_str, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"After all the files are downloaded, we can generate the run config for inference runs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment, RunConfiguration\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"for step in training_pipeline_run.get_steps():\n",
|
||||
" if step.properties.get(\"StepType\") == \"AutoMLStep\":\n",
|
||||
" automl_run = AutoMLRun(experiment, step.id)\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"env_file = os.path.join(download_path, \"conda_env_v_1_0_0.yml\")\n",
|
||||
"inference_env = Environment(\"oj-inference-env\")\n",
|
||||
"inference_env.python.conda_dependencies = CondaDependencies(\n",
|
||||
" conda_dependencies_file_path=env_file\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"[Optional] The enviroment can also be assessed from the training run using `get_environment()` API."
|
||||
"best_run = automl_run.get_best_child()\n",
|
||||
"inference_env = best_run.get_environment()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -799,9 +790,9 @@
|
||||
"friendly_name": "Forecasting orange juice sales with deployment",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -6,7 +6,7 @@ import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
||||
|
||||
from azureml.data.dataset_factory import TabularDatasetFactory
|
||||
@@ -30,7 +30,7 @@ def infer_forecasting_dataset_tcn(
|
||||
|
||||
run = Run.get_context()
|
||||
|
||||
registered_train = TabularDatasetFactory.register_pandas_dataframe(
|
||||
TabularDatasetFactory.register_pandas_dataframe(
|
||||
df_all,
|
||||
target=(
|
||||
run.experiment.workspace.get_default_datastore(),
|
||||
|
||||
@@ -20,7 +20,14 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook we will explore the univaraite time-series data to determine the settings for an automated ML experiment. We will follow the thought process depicted in the following diagram:<br/>\n",
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook we will explore the univariate time-series data to determine the settings for an automated ML experiment. We will follow the thought process depicted in the following diagram:<br/>\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"The objective is to answer the following questions:\n",
|
||||
@@ -32,11 +39,11 @@
|
||||
" </ul>\n",
|
||||
" <li>Is the data stationary? </li>\n",
|
||||
" <ul style=\"margin-top:-1px; list-style-type:none\"> \n",
|
||||
" <li> Importance: In the absense of features that capture trend behavior, ML models (regression and tree based) are not well equiped to predict stochastic trends. Working with stationary data solves this problem. </li>\n",
|
||||
" <li> Importance: In the absence of features that capture trend behavior, ML models (regression and tree based) are not well equipped to predict stochastic trends. Working with stationary data solves this problem. </li>\n",
|
||||
" </ul>\n",
|
||||
" <li>Is there a detectable auto-regressive pattern in the stationary data? </li>\n",
|
||||
" <ul style=\"margin-top:-1px; list-style-type:none\"> \n",
|
||||
" <li> Importance: The accuracy of ML models can be improved if serial correlation is modeled by including lags of the dependent/target varaible as features. Including target lags in every experiment by default will result in a regression in accuracy scores if such setting is not warranted. </li>\n",
|
||||
" <li> Importance: The accuracy of ML models can be improved if serial correlation is modeled by including lags of the dependent/target variable as features. Including target lags in every experiment by default will result in a regression in accuracy scores if such setting is not warranted. </li>\n",
|
||||
" </ul>\n",
|
||||
"</ol>\n",
|
||||
"\n",
|
||||
@@ -109,7 +116,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The graph plots the alcohol sales in the United States. Because the data is trending, it can be difficult to see cycles, seasonality or other interestng behaviors due to the scaling issues. For example, if there is a seasonal pattern, which we will discuss later, we cannot see them on the trending data. In such case, it is worth plotting the same data in first differences."
|
||||
"The graph plots the alcohol sales in the United States. Because the data is trending, it can be difficult to see cycles, seasonality or other interesting behaviors due to the scaling issues. For example, if there is a seasonal pattern, which we will discuss later, we cannot see them on the trending data. In such case, it is worth plotting the same data in first differences."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -335,8 +342,8 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 3 Check if there is a clear autoregressive pattern\n",
|
||||
"We need to determine if we should include lags of the target variable as features in order to improve forecast accuracy. To do this, we will examine the ACF and partial ACF (PACF) plots of the stationary series. In our case, it is a series in first diffrences.\n",
|
||||
"# 3 Check if there is a clear auto-regressive pattern\n",
|
||||
"We need to determine if we should include lags of the target variable as features in order to improve forecast accuracy. To do this, we will examine the ACF and partial ACF (PACF) plots of the stationary series. In our case, it is a series in first differences.\n",
|
||||
"\n",
|
||||
"<ul>\n",
|
||||
" <li> Question: What is an Auto-regressive pattern? What are we looking for? </li>\n",
|
||||
@@ -418,11 +425,11 @@
|
||||
" </li>\n",
|
||||
" where $\\sigma_{xzy}$ is the covariance between two random variables $X$ and $Z$; $\\sigma_x$ and $\\sigma_z$ is the variance for $X$ and $Z$, respectively. The correlation coefficient measures the strength of linear relationship between two random variables. This metric can take any value from -1 to 1. <li/>\n",
|
||||
" <br/>\n",
|
||||
" <li> The auto-correlation coefficient $\\rho_{Y_{t} Y_{t-k}}$ is the time series equivalent of the correlation coefficient, except instead of measuring linear association between two random variables $X$ and $Z$, it measures the strength of a linear relationship between a random variable $Y_t$ and its lag $Y_{t-k}$ for any positive interger value of $k$. </li> \n",
|
||||
" <li> The auto-correlation coefficient $\\rho_{Y_{t} Y_{t-k}}$ is the time series equivalent of the correlation coefficient, except instead of measuring linear association between two random variables $X$ and $Z$, it measures the strength of a linear relationship between a random variable $Y_t$ and its lag $Y_{t-k}$ for any positive integer value of $k$. </li> \n",
|
||||
" <br />\n",
|
||||
" <li> To visualize the ACF for a particular lag, say lag 2, plot the second lag of a series $y_{t-2}$ on the x-axis, and plot the series itself $y_t$ on the y-axis. The autocorrelation coefficient is the slope of the best fitted regression line and can be interpreted as follows. A one unit increase in the lag of a variable one period ago leads to a $\\rho_{Y_{t} Y_{t-2}}$ units change in the variable in the current period. This interpreation can be applied to any lag. </li> \n",
|
||||
" <li> To visualize the ACF for a particular lag, say lag 2, plot the second lag of a series $y_{t-2}$ on the x-axis, and plot the series itself $y_t$ on the y-axis. The autocorrelation coefficient is the slope of the best fitted regression line and can be interpreted as follows. A one unit increase in the lag of a variable one period ago leads to a $\\rho_{Y_{t} Y_{t-2}}$ units change in the variable in the current period. This interpretation can be applied to any lag. </li> \n",
|
||||
" <br />\n",
|
||||
" <li> In the interpretation posted above we need to be careful not to confuse the word \"leads\" with \"causes\" since these are not the same thing. We do not know the lagged value of the varaible causes it to change. Afterall, there are probably many other features that may explain the movement in $Y_t$. All we are trying to do in this section is to identify situations when the variable contains the strong auto-regressive components that needs to be included in the model to improve forecast accuracy. </li>\n",
|
||||
" <li> In the interpretation posted above we need to be careful not to confuse the word \"leads\" with \"causes\" since these are not the same thing. We do not know the lagged value of the variable causes it to change. After all, there are probably many other features that may explain the movement in $Y_t$. All we are trying to do in this section is to identify situations when the variable contains the strong auto-regressive components that needs to be included in the model to improve forecast accuracy. </li>\n",
|
||||
" </ul>\n",
|
||||
"</ul>"
|
||||
]
|
||||
@@ -434,7 +441,7 @@
|
||||
"<ul>\n",
|
||||
" <li> Question: What is the PACF? </li>\n",
|
||||
" <ul style=\"list-style-type:none;\">\n",
|
||||
" <li> When describing the ACF we essentially running a regression between a partigular lag of a series, say, lag 4, and the series itself. What this implies is the regression coefficient for lag 4 captures the impact of everything that happens in lags 1, 2 and 3. In other words, if lag 1 is the most important lag and we exclude it from the regression, naturally, the regression model will assign the importance of the 1st lag to the 4th one. Partial auto-correlation function fixes this problem since it measures the contribution of each lag accounting for the information added by the intermediary lags. If we were to illustrate ACF and PACF for the fourth lag using the regression analogy, the difference is a follows: \n",
|
||||
" <li> When describing the ACF we essentially running a regression between a particular lag of a series, say, lag 4, and the series itself. What this implies is the regression coefficient for lag 4 captures the impact of everything that happens in lags 1, 2 and 3. In other words, if lag 1 is the most important lag and we exclude it from the regression, naturally, the regression model will assign the importance of the 1st lag to the 4th one. Partial auto-correlation function fixes this problem since it measures the contribution of each lag accounting for the information added by the intermediary lags. If we were to illustrate ACF and PACF for the fourth lag using the regression analogy, the difference is a follows: \n",
|
||||
" \\begin{align}\n",
|
||||
" Y_{t} &= a_{0} + a_{4} Y_{t-4} + e_{t} \\\\\n",
|
||||
" Y_{t} &= b_{0} + b_{1} Y_{t-1} + b_{2} Y_{t-2} + b_{3} Y_{t-3} + b_{4} Y_{t-4} + \\varepsilon_{t} \\\\\n",
|
||||
@@ -442,7 +449,7 @@
|
||||
" </li>\n",
|
||||
" <br/>\n",
|
||||
" <li>\n",
|
||||
" Here, you can think of $a_4$ and $b_{4}$ as the auto- and partial auto-correlation coefficients for lag 4. Notice, in the second equation we explicitely accounting for the intermediate lags by adding them as regrerssors.\n",
|
||||
" Here, you can think of $a_4$ and $b_{4}$ as the auto- and partial auto-correlation coefficients for lag 4. Notice, in the second equation we explicitly accounting for the intermediate lags by adding them as regressors.\n",
|
||||
" </li>\n",
|
||||
" </ul>\n",
|
||||
"</ul>"
|
||||
@@ -455,11 +462,11 @@
|
||||
"<ul>\n",
|
||||
" <li> Question: Auto-regressive pattern? What are we looking for? </li>\n",
|
||||
" <ul style=\"list-style-type:none;\">\n",
|
||||
" <li> We are looking for a classical profiles for an AR(p) process such as an exponential decay of an ACF and a the first $p$ significant lags of the PACF. Let's examine the ACF/PACF profiles of the same simulated AR(2) shown in Section 3, and check if the ACF/PACF explanation are refelcted in these plots. <li/>\n",
|
||||
" <li> We are looking for a classical profiles for an AR(p) process such as an exponential decay of an ACF and a the first $p$ significant lags of the PACF. Let's examine the ACF/PACF profiles of the same simulated AR(2) shown in Section 3, and check if the ACF/PACF explanation are reflected in these plots. <li/>\n",
|
||||
" <li><img src=\"figures/ACF_PACF_for_AR2.png\" class=\"img_class\">\n",
|
||||
" <li> The autocorrelation coefficient for the 3rd lag is 0.6, which can be interpreted that a one unit increase in the value of the target varaible three periods ago leads to 0.6 units increase in the current period. However, the PACF plot shows that the partial autocorrealtion coefficient is zero (from a statistical point of view since it lies within the shaded region). This is happening because the 1st and 2nd lags are good predictors of the target variable. Ommiting these two lags from the regression results in the misleading conclusion that the third lag is a good prediciton. <li/>\n",
|
||||
" <li> The autocorrelation coefficient for the 3rd lag is 0.6, which can be interpreted that a one unit increase in the value of the target variable three periods ago leads to 0.6 units increase in the current period. However, the PACF plot shows that the partial autocorrelation coefficient is zero (from a statistical point of view since it lies within the shaded region). This is happening because the 1st and 2nd lags are good predictors of the target variable. Omitting these two lags from the regression results in the misleading conclusion that the third lag is a good prediction. <li/>\n",
|
||||
" <br/>\n",
|
||||
" <li> This is why it is important to examine both the ACF and the PACF plots when tring to determine the auto regressive order for the variable in question. <li/>\n",
|
||||
" <li> This is why it is important to examine both the ACF and the PACF plots when trying to determine the auto regressive order for the variable in question. <li/>\n",
|
||||
" </ul>\n",
|
||||
"</ul> "
|
||||
]
|
||||
@@ -471,10 +478,13 @@
|
||||
"name": "vlbejan"
|
||||
}
|
||||
],
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -486,7 +496,15 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -16,6 +16,13 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font color=\"red\" size=\"5\"><strong>!Important!</strong> </br>This notebook is outdated and is not supported by the AutoML Team. Please use the supported version ([link](https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate)).</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -300,27 +307,14 @@
|
||||
"df_train.to_csv(\"train.csv\", index=False)\n",
|
||||
"df_test.to_csv(\"test.csv\", index=False)\n",
|
||||
"\n",
|
||||
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./train.csv\"],\n",
|
||||
" target_path=\"uni-recipe-dataset/tabular/\",\n",
|
||||
" overwrite=True,\n",
|
||||
" show_progress=True,\n",
|
||||
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" df_train, target=(datastore, \"dataset/\"), name=\"train\"\n",
|
||||
")\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./test.csv\"],\n",
|
||||
" target_path=\"uni-recipe-dataset/tabular/\",\n",
|
||||
" overwrite=True,\n",
|
||||
" show_progress=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"train_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"uni-recipe-dataset/tabular/train.csv\")]\n",
|
||||
")\n",
|
||||
"test_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"uni-recipe-dataset/tabular/test.csv\")]\n",
|
||||
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" df_test, target=(datastore, \"dataset/\"), name=\"test\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# print the first 5 rows of the Dataset\n",
|
||||
@@ -571,10 +565,13 @@
|
||||
"name": "vlbejan"
|
||||
}
|
||||
],
|
||||
"kernel_info": {
|
||||
"name": "python3"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -586,7 +583,15 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
||||
@@ -7,7 +7,7 @@ compute instance.
|
||||
import argparse
|
||||
from azureml.core import Dataset, Run
|
||||
from azureml.automl.core.shared.constants import TimeSeriesInternal
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
|
||||
@@ -870,9 +870,9 @@
|
||||
"friendly_name": "Classification of credit card fraudulent transactions using Automated ML",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,5 +1,27 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -495,6 +517,30 @@
|
||||
"#### Create conda configuration for model explanations experiment from automl_run object"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_environment_safe(parent_run):\n",
|
||||
" \"\"\"Get the environment from parent run\"\"\"\n",
|
||||
" try:\n",
|
||||
" return parent_run.get_environment()\n",
|
||||
" except BaseException:\n",
|
||||
" run_details = parent_run.get_details()\n",
|
||||
" run_def = run_details.get(\"runDefinition\")\n",
|
||||
" env = run_def.get(\"environment\")\n",
|
||||
" if env is None:\n",
|
||||
" raise\n",
|
||||
" json.dump(env, open(\"azureml_environment.json\", \"w\"))\n",
|
||||
" return Environment.load_from_directory(\".\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -502,8 +548,6 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.runconfig import RunConfiguration\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"import pkg_resources\n",
|
||||
"\n",
|
||||
"# create a new RunConfig object\n",
|
||||
"conda_run_config = RunConfiguration(framework=\"python\")\n",
|
||||
@@ -513,7 +557,7 @@
|
||||
"conda_run_config.environment.docker.enabled = True\n",
|
||||
"\n",
|
||||
"# specify CondaDependencies obj\n",
|
||||
"conda_run_config.environment = automl_run.get_environment()"
|
||||
"conda_run_config.environment = get_environment_safe(automl_run)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -686,7 +730,7 @@
|
||||
" description=\"Get local explanations for Machine test data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"myenv = automl_run.get_environment()\n",
|
||||
"myenv = get_environment_safe(automl_run)\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score_explain.py\", environment=myenv)\n",
|
||||
"\n",
|
||||
"# Use configs and models generated above\n",
|
||||
@@ -859,8 +903,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
|
||||
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
@@ -895,9 +939,9 @@
|
||||
"friendly_name": "Automated ML run with featurization and model explainability.",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -909,7 +953,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.7"
|
||||
"version": "3.8.7"
|
||||
},
|
||||
"tags": [
|
||||
"featurization",
|
||||
|
||||
@@ -422,8 +422,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
|
||||
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
@@ -449,9 +449,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -429,9 +429,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -557,9 +557,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -161,9 +161,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -215,9 +215,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -482,9 +482,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -302,9 +302,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -86,7 +86,7 @@
|
||||
"source": [
|
||||
"In this example, we will be using and registering two models. \n",
|
||||
"\n",
|
||||
"First we will train two simple models on the [diabetes dataset](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset) included with scikit-learn, serializing them to files in the current directory."
|
||||
"First we will train two simple models on the [diabetes dataset](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) included with scikit-learn, serializing them to files in the current directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -239,7 +239,7 @@
|
||||
"\n",
|
||||
"env = Environment(\"deploytocloudenv\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"joblib\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"numpy\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"numpy==1.23\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"scikit-learn=={}\".format(sklearn.__version__))"
|
||||
]
|
||||
},
|
||||
@@ -373,9 +373,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
# Model Deployment with Azure ML service
|
||||
You can use Azure Machine Learning to package, debug, validate and deploy inference containers to a variety of compute targets. This process is known as "MLOps" (ML operationalization).
|
||||
For more information please check out this article: https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where
|
||||
|
||||
## Get Started
|
||||
To begin, you will need an ML workspace.
|
||||
For more information please check out this article: https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace
|
||||
|
||||
## Deploy to the cloud
|
||||
You can deploy to the cloud using the Azure ML CLI or the Azure ML SDK.
|
||||
- CLI example: https://aka.ms/azmlcli
|
||||
- Notebook example: [model-register-and-deploy](./model-register-and-deploy.ipynb).
|
||||
@@ -1,597 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Register model and deploy as webservice in ACI\n",
|
||||
"\n",
|
||||
"Following this notebook, you will:\n",
|
||||
"\n",
|
||||
" - Learn how to register a model in your Azure Machine Learning Workspace.\n",
|
||||
" - Deploy your model as a web service in an Azure Container Instance."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"\n",
|
||||
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the [configuration notebook](../../../configuration.ipynb) to install the Azure Machine Learning Python SDK and create a workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Check core SDK version number.\n",
|
||||
"print('SDK version:', azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"\n",
|
||||
"Create a [Workspace](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace%28class%29?view=azure-ml-py) object from your persisted configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"create workspace"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create trained model\n",
|
||||
"\n",
|
||||
"For this example, we will train a small model on scikit-learn's [diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import joblib\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import load_diabetes\n",
|
||||
"from sklearn.linear_model import Ridge\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"dataset_x, dataset_y = load_diabetes(return_X_y=True)\n",
|
||||
"\n",
|
||||
"model = Ridge().fit(dataset_x, dataset_y)\n",
|
||||
"\n",
|
||||
"joblib.dump(model, 'sklearn_regression_model.pkl')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register input and output datasets\n",
|
||||
"\n",
|
||||
"Here, you will register the data used to create the model in your workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"np.savetxt('features.csv', dataset_x, delimiter=',')\n",
|
||||
"np.savetxt('labels.csv', dataset_y, delimiter=',')\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"datastore.upload_files(files=['./features.csv', './labels.csv'],\n",
|
||||
" target_path='sklearn_regression/',\n",
|
||||
" overwrite=True)\n",
|
||||
"\n",
|
||||
"input_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, 'sklearn_regression/features.csv')])\n",
|
||||
"output_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, 'sklearn_regression/labels.csv')])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register model\n",
|
||||
"\n",
|
||||
"Register a file or folder as a model by calling [Model.register()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py#register-workspace--model-path--model-name--tags-none--properties-none--description-none--datasets-none--model-framework-none--model-framework-version-none--child-paths-none-).\n",
|
||||
"\n",
|
||||
"In addition to the content of the model file itself, your registered model will also store model metadata -- model description, tags, and framework information -- that will be useful when managing and deploying models in your workspace. Using tags, for instance, you can categorize your models and apply filters when listing models in your workspace. Also, marking this model with the scikit-learn framework will simplify deploying it as a web service, as we'll see later."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"register model from file",
|
||||
"sample-model-register"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sklearn\n",
|
||||
"\n",
|
||||
"from azureml.core import Model\n",
|
||||
"from azureml.core.resource_configuration import ResourceConfiguration\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"model = Model.register(workspace=ws,\n",
|
||||
" model_name='my-sklearn-model', # Name of the registered model in your workspace.\n",
|
||||
" model_path='./sklearn_regression_model.pkl', # Local file to upload and register as a model.\n",
|
||||
" model_framework=Model.Framework.SCIKITLEARN, # Framework used to create the model.\n",
|
||||
" model_framework_version=sklearn.__version__, # Version of scikit-learn used to create the model.\n",
|
||||
" sample_input_dataset=input_dataset,\n",
|
||||
" sample_output_dataset=output_dataset,\n",
|
||||
" resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),\n",
|
||||
" description='Ridge regression model to predict diabetes progression.',\n",
|
||||
" tags={'area': 'diabetes', 'type': 'regression'})\n",
|
||||
"\n",
|
||||
"print('Name:', model.name)\n",
|
||||
"print('Version:', model.version)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deploy model\n",
|
||||
"\n",
|
||||
"Deploy your model as a web service using [Model.deploy()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py#deploy-workspace--name--models--inference-config--deployment-config-none--deployment-target-none-). Web services take one or more models, load them in an environment, and run them on one of several supported deployment targets. For more information on all your options when deploying models, see the [next steps](#Next-steps) section at the end of this notebook.\n",
|
||||
"\n",
|
||||
"For this example, we will deploy your scikit-learn model to an Azure Container Instance (ACI)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Use a default environment (for supported models)\n",
|
||||
"\n",
|
||||
"The Azure Machine Learning service provides a default environment for supported model frameworks, including scikit-learn, based on the metadata you provided when registering your model. This is the easiest way to deploy your model.\n",
|
||||
"\n",
|
||||
"Even when you deploy your model to ACI with a default environment you can still customize the deploy configuration (i.e. the number of cores and amount of memory made available for the deployment) using the [AciWebservice.deploy_configuration()](https://docs.microsoft.com/python/api/azureml-core/azureml.core.webservice.aci.aciwebservice#deploy-configuration-cpu-cores-none--memory-gb-none--tags-none--properties-none--description-none--location-none--auth-enabled-none--ssl-enabled-none--enable-app-insights-none--ssl-cert-pem-file-none--ssl-key-pem-file-none--ssl-cname-none--dns-name-label-none--). Look at the \"Use a custom environment\" section of this notebook for more information on deploy configuration.\n",
|
||||
"\n",
|
||||
"**Note**: This step can take several minutes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"service_name = 'my-sklearn-service'\n",
|
||||
"\n",
|
||||
"service = Model.deploy(ws, service_name, [model], overwrite=True)\n",
|
||||
"service.wait_for_deployment(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"After your model is deployed, perform a call to the web service using [service.run()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice%28class%29?view=azure-ml-py#run-input-)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"input_payload = json.dumps({\n",
|
||||
" 'data': dataset_x[0:2].tolist(),\n",
|
||||
" 'method': 'predict' # If you have a classification model, you can get probabilities by changing this to 'predict_proba'.\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"output = service.run(input_payload)\n",
|
||||
"\n",
|
||||
"print(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When you are finished testing your service, clean up the deployment with [service.delete()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice%28class%29?view=azure-ml-py#delete--)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"service.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Use a custom environment\n",
|
||||
"\n",
|
||||
"If you want more control over how your model is run, if it uses another framework, or if it has special runtime requirements, you can instead specify your own environment and scoring method. Custom environments can be used for any model you want to deploy.\n",
|
||||
"\n",
|
||||
"Specify the model's runtime environment by creating an [Environment](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.environment%28class%29?view=azure-ml-py) object and providing the [CondaDependencies](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.conda_dependencies.condadependencies?view=azure-ml-py) needed by your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"environment = Environment('my-sklearn-environment')\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
|
||||
" 'pip==20.2.4'],\n",
|
||||
" pip_packages=[\n",
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'joblib',\n",
|
||||
" 'numpy',\n",
|
||||
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
|
||||
"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When using a custom environment, you must also provide Python code for initializing and running your model. An example script is included with this notebook."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('score.py') as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Deploy your model in the custom environment by providing an [InferenceConfig](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.inferenceconfig?view=azure-ml-py) object to [Model.deploy()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py#deploy-workspace--name--models--inference-config--deployment-config-none--deployment-target-none-). In this case we are also using the [AciWebservice.deploy_configuration()](https://docs.microsoft.com/python/api/azureml-core/azureml.core.webservice.aci.aciwebservice#deploy-configuration-cpu-cores-none--memory-gb-none--tags-none--properties-none--description-none--location-none--auth-enabled-none--ssl-enabled-none--enable-app-insights-none--ssl-cert-pem-file-none--ssl-key-pem-file-none--ssl-cname-none--dns-name-label-none--) method to generate a custom deploy configuration.\n",
|
||||
"\n",
|
||||
"**Note**: This step can take several minutes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"azuremlexception-remarks-sample",
|
||||
"sample-aciwebservice-deploy-config"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"from azureml.core.webservice import AciWebservice\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"service_name = 'my-custom-env-service'\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script='score.py', environment=environment)\n",
|
||||
"aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)\n",
|
||||
"\n",
|
||||
"service = Model.deploy(workspace=ws,\n",
|
||||
" name=service_name,\n",
|
||||
" models=[model],\n",
|
||||
" inference_config=inference_config,\n",
|
||||
" deployment_config=aci_config,\n",
|
||||
" overwrite=True)\n",
|
||||
"service.wait_for_deployment(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"After your model is deployed, make a call to the web service using [service.run()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice%28class%29?view=azure-ml-py#run-input-)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_payload = json.dumps({\n",
|
||||
" 'data': dataset_x[0:2].tolist()\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"output = service.run(input_payload)\n",
|
||||
"\n",
|
||||
"print(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When you are finished testing your service, clean up the deployment with [service.delete()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice%28class%29?view=azure-ml-py#delete--)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"service.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Model Profiling\n",
|
||||
"\n",
|
||||
"Profile your model to understand how much CPU and memory the service, created as a result of its deployment, will need. Profiling returns information such as CPU usage, memory usage, and response latency. It also provides a CPU and memory recommendation based on the resource usage. You can profile your model (or more precisely the service built based on your model) on any CPU and/or memory combination where 0.1 <= CPU <= 3.5 and 0.1GB <= memory <= 15GB. If you do not provide a CPU and/or memory requirement, we will test it on the default configuration of 3.5 CPU and 15GB memory.\n",
|
||||
"\n",
|
||||
"In order to profile your model you will need:\n",
|
||||
"- a registered model\n",
|
||||
"- an entry script\n",
|
||||
"- an inference configuration\n",
|
||||
"- a single column tabular dataset, where each row contains a string representing sample request data sent to the service.\n",
|
||||
"\n",
|
||||
"Please, note that profiling is a long running operation and can take up to 25 minutes depending on the size of the dataset.\n",
|
||||
"\n",
|
||||
"At this point we only support profiling of services that expect their request data to be a string, for example: string serialized json, text, string serialized image, etc. The content of each row of the dataset (string) will be put into the body of the HTTP request and sent to the service encapsulating the model for scoring.\n",
|
||||
"\n",
|
||||
"Below is an example of how you can construct an input dataset to profile a service which expects its incoming requests to contain serialized json. In this case we created a dataset based one hundred instances of the same request data. In real world scenarios however, we suggest that you use larger datasets with various inputs, especially if your model resource usage/behavior is input dependent."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You may want to register datasets using the register() method to your workspace so they can be shared with others, reused and referred to by name in your script.\n",
|
||||
"You can try get the dataset first to see if it's already registered."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Datastore\n",
|
||||
"from azureml.core.dataset import Dataset\n",
|
||||
"from azureml.data import dataset_type_definitions\n",
|
||||
"\n",
|
||||
"dataset_name='diabetes_sample_request_data'\n",
|
||||
"\n",
|
||||
"dataset_registered = False\n",
|
||||
"try:\n",
|
||||
" sample_request_data = Dataset.get_by_name(workspace = ws, name = dataset_name)\n",
|
||||
" dataset_registered = True\n",
|
||||
"except:\n",
|
||||
" print(\"The dataset {} is not registered in workspace yet.\".format(dataset_name))\n",
|
||||
"\n",
|
||||
"if not dataset_registered:\n",
|
||||
" # create a string that can be utf-8 encoded and\n",
|
||||
" # put in the body of the request\n",
|
||||
" serialized_input_json = json.dumps({\n",
|
||||
" 'data': [\n",
|
||||
" [ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235,\n",
|
||||
" -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613]\n",
|
||||
" ]\n",
|
||||
" })\n",
|
||||
" dataset_content = []\n",
|
||||
" for i in range(100):\n",
|
||||
" dataset_content.append(serialized_input_json)\n",
|
||||
" dataset_content = '\\n'.join(dataset_content)\n",
|
||||
" file_name = \"{}.txt\".format(dataset_name)\n",
|
||||
" f = open(file_name, 'w')\n",
|
||||
" f.write(dataset_content)\n",
|
||||
" f.close()\n",
|
||||
"\n",
|
||||
" # upload the txt file created above to the Datastore and create a dataset from it\n",
|
||||
" data_store = Datastore.get_default(ws)\n",
|
||||
" data_store.upload_files(['./' + file_name], target_path='sample_request_data')\n",
|
||||
" datastore_path = [(data_store, 'sample_request_data' +'/' + file_name)]\n",
|
||||
" sample_request_data = Dataset.Tabular.from_delimited_files(\n",
|
||||
" datastore_path,\n",
|
||||
" separator='\\n',\n",
|
||||
" infer_column_types=True,\n",
|
||||
" header=dataset_type_definitions.PromoteHeadersBehavior.NO_HEADERS)\n",
|
||||
" sample_request_data = sample_request_data.register(workspace=ws,\n",
|
||||
" name=dataset_name,\n",
|
||||
" create_new_version=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now that we have an input dataset we are ready to go ahead with profiling. In this case we are testing the previously introduced sklearn regression model on 1 CPU and 0.5 GB memory. The memory usage and recommendation presented in the result is measured in Gigabytes. The CPU usage and recommendation is measured in CPU cores."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"environment = Environment('my-sklearn-environment')\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
|
||||
" 'pip==20.2.4'],\n",
|
||||
" pip_packages=[\n",
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'joblib',\n",
|
||||
" 'numpy',\n",
|
||||
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
|
||||
"])\n",
|
||||
"inference_config = InferenceConfig(entry_script='score.py', environment=environment)\n",
|
||||
"# if cpu and memory_in_gb parameters are not provided\n",
|
||||
"# the model will be profiled on default configuration of\n",
|
||||
"# 3.5CPU and 15GB memory\n",
|
||||
"profile = Model.profile(ws,\n",
|
||||
" 'rgrsn-%s' % datetime.now().strftime('%m%d%Y-%H%M%S'),\n",
|
||||
" [model],\n",
|
||||
" inference_config,\n",
|
||||
" input_dataset=sample_request_data,\n",
|
||||
" cpu=1.0,\n",
|
||||
" memory_in_gb=0.5)\n",
|
||||
"\n",
|
||||
"# profiling is a long running operation and may take up to 25 min\n",
|
||||
"profile.wait_for_completion(True)\n",
|
||||
"details = profile.get_details()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Model packaging\n",
|
||||
"\n",
|
||||
"If you want to build a Docker image that encapsulates your model and its dependencies, you can use the model packaging option. The output image will be pushed to your workspace's ACR.\n",
|
||||
"\n",
|
||||
"You must include an Environment object in your inference configuration to use `Model.package()`.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"package = Model.package(ws, [model], inference_config)\n",
|
||||
"package.wait_for_creation(show_output=True) # Or show_output=False to hide the Docker build logs.\n",
|
||||
"package.pull()\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Instead of a fully-built image, you can also generate a Dockerfile and download all the assets needed to build an image on top of your Environment.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"package = Model.package(ws, [model], inference_config, generate_dockerfile=True)\n",
|
||||
"package.wait_for_creation(show_output=True)\n",
|
||||
"package.save(\"./local_context_dir\")\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next steps\n",
|
||||
"\n",
|
||||
" - To run a production-ready web service, see the [notebook on deployment to Azure Kubernetes Service](../production-deploy-to-aks/production-deploy-to-aks.ipynb).\n",
|
||||
" - To run a local web service, see the [notebook on deployment to a local Docker container](../deploy-to-local/register-model-deploy-local.ipynb).\n",
|
||||
" - For more information on datasets, see the [notebook on training with datasets](../../work-with-data/datasets-tutorial/train-with-datasets/train-with-datasets.ipynb).\n",
|
||||
" - For more information on environments, see the [notebook on using environments](../../training/using-environments/using-environments.ipynb).\n",
|
||||
" - For information on all the available deployment targets, see [“How and where to deploy models”](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#choose-a-compute-target)."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "vaidyas"
|
||||
}
|
||||
],
|
||||
"category": "deployment",
|
||||
"compute": [
|
||||
"None"
|
||||
],
|
||||
"datasets": [
|
||||
"Diabetes"
|
||||
],
|
||||
"deployment": [
|
||||
"Azure Container Instance"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"Scikit-learn"
|
||||
],
|
||||
"friendly_name": "Register model and deploy as webservice",
|
||||
"index_order": 3,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
},
|
||||
"star_tag": [
|
||||
"featured"
|
||||
],
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Deploy a model with Azure Machine Learning"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,6 +0,0 @@
|
||||
name: model-register-and-deploy
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- numpy
|
||||
- scikit-learn
|
||||
@@ -1,38 +0,0 @@
|
||||
import joblib
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
from inference_schema.schema_decorators import input_schema, output_schema
|
||||
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
|
||||
|
||||
|
||||
# The init() method is called once, when the web service starts up.
|
||||
#
|
||||
# Typically you would deserialize the model file, as shown here using joblib,
|
||||
# and store it in a global variable so your run() method can access it later.
|
||||
def init():
|
||||
global model
|
||||
|
||||
# The AZUREML_MODEL_DIR environment variable indicates
|
||||
# a directory containing the model file you registered.
|
||||
model_filename = 'sklearn_regression_model.pkl'
|
||||
model_path = os.path.join(os.environ['AZUREML_MODEL_DIR'], model_filename)
|
||||
|
||||
model = joblib.load(model_path)
|
||||
|
||||
|
||||
# The run() method is called each time a request is made to the scoring API.
|
||||
#
|
||||
# Shown here are the optional input_schema and output_schema decorators
|
||||
# from the inference-schema pip package. Using these decorators on your
|
||||
# run() method parses and validates the incoming payload against
|
||||
# the example input you provide here. This will also generate a Swagger
|
||||
# API document for your web service.
|
||||
@input_schema('data', NumpyParameterType(np.array([[0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9, 9.0]])))
|
||||
@output_schema(NumpyParameterType(np.array([4429.929236457418])))
|
||||
def run(data):
|
||||
# Use the model object loaded by init().
|
||||
result = model.predict(data)
|
||||
|
||||
# You can return any JSON-serializable object.
|
||||
return result.tolist()
|
||||
@@ -473,9 +473,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -529,9 +529,9 @@
|
||||
"friendly_name": "Register a model and deploy locally",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,373 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Deploy models to Azure Kubernetes Service (AKS) using controlled roll out\n",
|
||||
"This notebook will show you how to deploy mulitple AKS webservices with the same scoring endpoint and how to roll out your models in a controlled manner by configuring % of scoring traffic going to each webservice. If you are using a Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) to install the Azure Machine Learning Python SDK and create an Azure ML Workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check for latest version\n",
|
||||
"import azureml.core\n",
|
||||
"print(azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"Create a [Workspace](https://docs.microsoft.com/python/api/azureml-core/azureml.core.workspace%28class%29?view=azure-ml-py) object from your persisted configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register the model\n",
|
||||
"Register a file or folder as a model by calling [Model.register()](https://docs.microsoft.com/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py#register-workspace--model-path--model-name--tags-none--properties-none--description-none--datasets-none--model-framework-none--model-framework-version-none--child-paths-none-).\n",
|
||||
"In addition to the content of the model file itself, your registered model will also store model metadata -- model description, tags, and framework information -- that will be useful when managing and deploying models in your workspace. Using tags, for instance, you can categorize your models and apply filters when listing models in your workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Model\n",
|
||||
"\n",
|
||||
"model = Model.register(workspace=ws,\n",
|
||||
" model_name='sklearn_regression_model.pkl', # Name of the registered model in your workspace.\n",
|
||||
" model_path='./sklearn_regression_model.pkl', # Local file to upload and register as a model.\n",
|
||||
" model_framework=Model.Framework.SCIKITLEARN, # Framework used to create the model.\n",
|
||||
" model_framework_version='0.19.1', # Version of scikit-learn used to create the model.\n",
|
||||
" description='Ridge regression model to predict diabetes progression.',\n",
|
||||
" tags={'area': 'diabetes', 'type': 'regression'})\n",
|
||||
"\n",
|
||||
"print('Name:', model.name)\n",
|
||||
"print('Version:', model.version)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register an environment (for all models)\n",
|
||||
"\n",
|
||||
"If you control over how your model is run, or if it has special runtime requirements, you can specify your own environment and scoring method.\n",
|
||||
"\n",
|
||||
"Specify the model's runtime environment by creating an [Environment](https://docs.microsoft.com/python/api/azureml-core/azureml.core.environment%28class%29?view=azure-ml-py) object and providing the [CondaDependencies](https://docs.microsoft.com/python/api/azureml-core/azureml.core.conda_dependencies.condadependencies?view=azure-ml-py) needed by your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"\n",
|
||||
"environment=Environment('my-sklearn-environment')\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
|
||||
" 'pip==20.2.4'],\n",
|
||||
" pip_packages=[\n",
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'numpy',\n",
|
||||
" 'scikit-learn==0.22.1',\n",
|
||||
" 'scipy'\n",
|
||||
"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When using a custom environment, you must also provide Python code for initializing and running your model. An example script is included with this notebook."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('score.py') as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create the InferenceConfig\n",
|
||||
"Create the inference configuration to reference your environment and entry script during deployment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script='score.py', \n",
|
||||
" source_directory='.',\n",
|
||||
" environment=environment)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Provision the AKS Cluster\n",
|
||||
"If you already have an AKS cluster attached to this workspace, skip the step below and provide the name of the cluster.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AksCompute\n",
|
||||
"from azureml.core.compute import ComputeTarget\n",
|
||||
"# Use the default configuration (can also provide parameters to customize)\n",
|
||||
"prov_config = AksCompute.provisioning_configuration()\n",
|
||||
"\n",
|
||||
"aks_name = 'my-aks' \n",
|
||||
"# Create the cluster\n",
|
||||
"aks_target = ComputeTarget.create(workspace = ws, \n",
|
||||
" name = aks_name, \n",
|
||||
" provisioning_configuration = prov_config) \n",
|
||||
"aks_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create an Endpoint and add a version (AKS service)\n",
|
||||
"This creates a new endpoint and adds a version behind it. By default the first version added is the default version. You can specify the traffic percentile a version takes behind an endpoint. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# deploying the model and create a new endpoint\n",
|
||||
"from azureml.core.webservice import AksEndpoint\n",
|
||||
"# from azureml.core.compute import ComputeTarget\n",
|
||||
"\n",
|
||||
"#select a created compute\n",
|
||||
"compute = ComputeTarget(ws, 'my-aks')\n",
|
||||
"namespace_name=\"endpointnamespace\"\n",
|
||||
"# define the endpoint name\n",
|
||||
"endpoint_name = \"myendpoint1\"\n",
|
||||
"# define the service name\n",
|
||||
"version_name= \"versiona\"\n",
|
||||
"\n",
|
||||
"endpoint_deployment_config = AksEndpoint.deploy_configuration(tags = {'modelVersion':'firstversion', 'department':'finance'}, \n",
|
||||
" description = \"my first version\", namespace = namespace_name, \n",
|
||||
" version_name = version_name, traffic_percentile = 40)\n",
|
||||
"\n",
|
||||
"endpoint = Model.deploy(ws, endpoint_name, [model], inference_config, endpoint_deployment_config, compute)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"endpoint.get_logs()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Add another version of the service to an existing endpoint\n",
|
||||
"This adds another version behind an existing endpoint. You can specify the traffic percentile the new version takes. If no traffic_percentile is specified then it defaults to 0. All the unspecified traffic percentile (in this example 50) across all versions goes to default version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Adding a new version to an existing Endpoint.\n",
|
||||
"version_name_add=\"versionb\" \n",
|
||||
"\n",
|
||||
"endpoint.create_version(version_name = version_name_add, inference_config=inference_config, models=[model], tags = {'modelVersion':'secondversion', 'department':'finance'}, \n",
|
||||
" description = \"my second version\", traffic_percentile = 10)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Update an existing version in an endpoint\n",
|
||||
"There are two types of versions: control and treatment. An endpoint contains one or more treatment versions but only one control version. This categorization helps compare the different versions against the defined control version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"endpoint.update_version(version_name=endpoint.versions[version_name_add].name, description=\"my second version update\", traffic_percentile=40, is_default=True, is_control_version_type=True)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test the web service using run method\n",
|
||||
"Test the web sevice by passing in data. Run() method retrieves API keys behind the scenes to make sure that call is authenticated."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Scoring on endpoint\n",
|
||||
"import json\n",
|
||||
"test_sample = json.dumps({'data': [\n",
|
||||
" [1,2,3,4,5,6,7,8,9,10], \n",
|
||||
" [10,9,8,7,6,5,4,3,2,1]\n",
|
||||
"]})\n",
|
||||
"\n",
|
||||
"test_sample_encoded = bytes(test_sample, encoding='utf8')\n",
|
||||
"prediction = endpoint.run(input_data=test_sample_encoded)\n",
|
||||
"print(prediction)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Delete Resources"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# deleting a version in an endpoint\n",
|
||||
"endpoint.delete_version(version_name=version_name)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# deleting an endpoint, this will delete all versions in the endpoint and the endpoint itself\n",
|
||||
"endpoint.delete()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "shipatel"
|
||||
}
|
||||
],
|
||||
"category": "deployment",
|
||||
"compute": [
|
||||
"None"
|
||||
],
|
||||
"datasets": [
|
||||
"Diabetes"
|
||||
],
|
||||
"deployment": [
|
||||
"Azure Kubernetes Service"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"Scikit-learn"
|
||||
],
|
||||
"friendly_name": "Deploy models to AKS using controlled roll out",
|
||||
"index_order": 3,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.0"
|
||||
},
|
||||
"star_tag": [
|
||||
"featured"
|
||||
],
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Deploy a model with Azure Machine Learning"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
name: deploy-aks-with-controlled-rollout
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -1,28 +0,0 @@
|
||||
import pickle
|
||||
import json
|
||||
import numpy
|
||||
from sklearn.externals import joblib
|
||||
from sklearn.linear_model import Ridge
|
||||
from azureml.core.model import Model
|
||||
|
||||
|
||||
def init():
|
||||
global model
|
||||
# note here "sklearn_regression_model.pkl" is the name of the model registered under
|
||||
# this is a different behavior than before when the code is run locally, even though the code is the same.
|
||||
model_path = Model.get_model_path('sklearn_regression_model.pkl')
|
||||
# deserialize the model file back into a sklearn model
|
||||
model = joblib.load(model_path)
|
||||
|
||||
|
||||
# note you can pass in multiple rows for scoring
|
||||
def run(raw_data):
|
||||
try:
|
||||
data = json.loads(raw_data)['data']
|
||||
data = numpy.array(data)
|
||||
result = model.predict(data)
|
||||
# you can return any data type as long as it is JSON-serializable
|
||||
return result.tolist()
|
||||
except Exception as e:
|
||||
error = str(e)
|
||||
return error
|
||||
Binary file not shown.
@@ -123,7 +123,7 @@
|
||||
"import pickle\n",
|
||||
"import json\n",
|
||||
"import numpy\n",
|
||||
"from sklearn.externals import joblib\n",
|
||||
"import joblib\n",
|
||||
"from sklearn.linear_model import Ridge\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
@@ -476,9 +476,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -405,9 +405,9 @@
|
||||
"friendly_name": "Convert and deploy TinyYolo with ONNX Runtime",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -773,9 +773,9 @@
|
||||
"friendly_name": "Deploy Facial Expression Recognition (FER+) with ONNX Runtime",
|
||||
"index_order": 2,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -750,9 +750,9 @@
|
||||
"friendly_name": "Deploy MNIST digit recognition with ONNX Runtime",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -206,9 +206,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -389,9 +389,9 @@
|
||||
"friendly_name": "Deploy ResNet50 with ONNX Runtime",
|
||||
"index_order": 4,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -105,7 +105,7 @@
|
||||
" print('Found existing compute target.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print('Creating a new compute target...')\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
|
||||
" max_nodes=6)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
@@ -564,9 +564,9 @@
|
||||
"friendly_name": "Train MNIST in PyTorch, convert, and deploy with ONNX Runtime",
|
||||
"index_order": 3,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -620,7 +620,7 @@
|
||||
},
|
||||
"manual": null
|
||||
},
|
||||
"vm_size": "STANDARD_NC6"
|
||||
"vm_size": "Standard_NC6s_v3"
|
||||
},
|
||||
"error": "",
|
||||
"layout": "IPY_MODEL_c899ddfc2b134ca9b89a4f278ac7c997",
|
||||
|
||||
@@ -136,6 +136,9 @@
|
||||
"# Choose a name for your GPU cluster\n",
|
||||
"gpu_cluster_name = \"aks-gpu-cluster\"\n",
|
||||
"\n",
|
||||
"# Choose a location for your GPU cluster\n",
|
||||
"gpu_cluster_location = \"eastus\"\n",
|
||||
"\n",
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
|
||||
@@ -146,7 +149,8 @@
|
||||
" # Specify the configuration for the new cluster\n",
|
||||
" compute_config = AksCompute.provisioning_configuration(cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST,\n",
|
||||
" agent_count=1,\n",
|
||||
" vm_size=\"Standard_NV6\")\n",
|
||||
" vm_size=\"Standard_NC6s_v3\",\n",
|
||||
" location=gpu_cluster_location)\n",
|
||||
" # Create the cluster with the specified name and configuration\n",
|
||||
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
@@ -170,7 +174,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%writefile score.py\n",
|
||||
"import tensorflow as tf\n",
|
||||
"import tensorflow.compat.v1 as tf\n",
|
||||
"import numpy as np\n",
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
@@ -240,8 +244,8 @@
|
||||
"# Please see [Azure ML Containers repository](https://github.com/Azure/AzureML-Containers#featured-tags)\n",
|
||||
"# for open-sourced GPU base images.\n",
|
||||
"env.docker.base_image = DEFAULT_GPU_IMAGE\n",
|
||||
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.6.2\", \n",
|
||||
" conda_packages=['tensorflow-gpu==1.12.0','numpy'],\n",
|
||||
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.8\", pin_sdk_version=False,\n",
|
||||
" conda_packages=['tensorflow-gpu','numpy'],\n",
|
||||
" pip_packages=['azureml-contrib-services', 'azureml-defaults'])\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=env)\n",
|
||||
@@ -329,9 +333,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -343,7 +347,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.6"
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -2,4 +2,3 @@ name: production-deploy-to-aks-gpu
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- tensorflow
|
||||
|
||||
@@ -154,7 +154,7 @@
|
||||
"import pickle\n",
|
||||
"import json\n",
|
||||
"import numpy\n",
|
||||
"from sklearn.externals import joblib\n",
|
||||
"import joblib\n",
|
||||
"from sklearn.linear_model import Ridge\n",
|
||||
"from inference_schema.schema_decorators import input_schema, output_schema\n",
|
||||
"from inference_schema.parameter_types.standard_py_parameter_type import StandardPythonParameterType\n",
|
||||
@@ -213,7 +213,7 @@
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-secure-web-service) for more details"
|
||||
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/azure/machine-learning/v1/how-to-secure-web-service) for more details"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -334,9 +334,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -5,4 +5,4 @@ dependencies:
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
- scikit-learn
|
||||
|
||||
@@ -154,7 +154,7 @@
|
||||
"import pickle\n",
|
||||
"import json\n",
|
||||
"import numpy\n",
|
||||
"from sklearn.externals import joblib\n",
|
||||
"import joblib\n",
|
||||
"from sklearn.linear_model import Ridge\n",
|
||||
"\n",
|
||||
"def init():\n",
|
||||
@@ -366,7 +366,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Create AKS Cluster in an existing virtual network (optional)\n",
|
||||
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-enable-virtual-network#use-azure-kubernetes-service) for more details."
|
||||
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-network-security-overview) for more details."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -397,7 +397,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Enable SSL on the AKS Cluster (optional)\n",
|
||||
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-secure-web-service) for more details"
|
||||
"See code snippet below. Check the documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-network-security-overview#secure-the-inferencing-environment-v1) for more details"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -603,9 +603,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -5,4 +5,4 @@ dependencies:
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
- scikit-learn
|
||||
|
||||
Binary file not shown.
@@ -1 +0,0 @@
|
||||
{"class":"org.apache.spark.ml.classification.LogisticRegressionModel","timestamp":1570147252329,"sparkVersion":"2.4.0","uid":"LogisticRegression_5df3978caaf3","paramMap":{"regParam":0.01},"defaultParamMap":{"aggregationDepth":2,"threshold":0.5,"rawPredictionCol":"rawPrediction","featuresCol":"features","labelCol":"label","predictionCol":"prediction","family":"auto","regParam":0.0,"tol":1.0E-6,"probabilityCol":"probability","standardization":true,"elasticNetParam":0.0,"maxIter":100,"fitIntercept":true}}
|
||||
@@ -1,349 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Register Spark Model and deploy as Webservice\n",
|
||||
"\n",
|
||||
"This example shows how to deploy a Webservice in step-by-step fashion:\n",
|
||||
"\n",
|
||||
" 1. Register Spark Model\n",
|
||||
" 2. Deploy Spark Model as Webservice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the [configuration](../../../configuration.ipynb) Notebook first if you haven't."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize Workspace\n",
|
||||
"\n",
|
||||
"Initialize a workspace object from persisted configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"create workspace"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Register Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can add tags and descriptions to your Models. Note you need to have a `iris.model` file in the current directory. This model file is generated using [train in spark](../training/train-in-spark/train-in-spark.ipynb) notebook. The below call registers that file as a Model with the same name `iris.model` in the workspace.\n",
|
||||
"\n",
|
||||
"Using tags, you can track useful information such as the name and version of the machine learning library used to train the model. Note that tags must be alphanumeric."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"register model from file"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import Model\n",
|
||||
"\n",
|
||||
"model = Model.register(model_path=\"iris.model\",\n",
|
||||
" model_name=\"iris.model\",\n",
|
||||
" tags={'type': \"regression\"},\n",
|
||||
" description=\"Logistic regression model to predict iris species\",\n",
|
||||
" workspace=ws)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Fetch Environment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can now create and/or use an Environment object when deploying a Webservice. The Environment can have been previously registered with your Workspace, or it will be registered with it as a part of the Webservice deployment.\n",
|
||||
"\n",
|
||||
"More information can be found in our [using environments notebook](../training/using-environments/using-environments.ipynb)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\r\n",
|
||||
"from azureml.core.environment import SparkPackage\r\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\r\n",
|
||||
"\r\n",
|
||||
"myenv = Environment('my-pyspark-environment')\r\n",
|
||||
"myenv.docker.base_image = \"mcr.microsoft.com/mmlspark/release:0.15\"\r\n",
|
||||
"myenv.inferencing_stack_version = \"latest\"\r\n",
|
||||
"myenv.python.conda_dependencies = CondaDependencies.create(pip_packages=[\"azureml-core\",\"azureml-defaults\",\"azureml-telemetry\",\"azureml-train-restclients-hyperdrive\",\"azureml-train-core\"], python_version=\"3.6.2\")\r\n",
|
||||
"myenv.python.conda_dependencies.add_channel(\"conda-forge\")\r\n",
|
||||
"myenv.spark.packages = [SparkPackage(\"com.microsoft.ml.spark\", \"mmlspark_2.11\", \"0.15\"), SparkPackage(\"com.microsoft.azure\", \"azure-storage\", \"2.0.0\"), SparkPackage(\"org.apache.hadoop\", \"hadoop-azure\", \"2.7.0\")]\r\n",
|
||||
"myenv.spark.repositories = [\"https://mmlspark.azureedge.net/maven\"]\r\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create Inference Configuration\n",
|
||||
"\n",
|
||||
"There is now support for a source directory, you can upload an entire folder from your local machine as dependencies for the Webservice.\n",
|
||||
"Note: in that case, your entry_script is relative path to the source_directory path.\n",
|
||||
"\n",
|
||||
"Sample code for using a source directory:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"inference_config = InferenceConfig(source_directory=\"C:/abc\",\n",
|
||||
" entry_script=\"x/y/score.py\",\n",
|
||||
" environment=environment)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
" - source_directory = holds source path as string, this entire folder gets added in image so its really easy to access any files within this folder or subfolder\n",
|
||||
" - entry_script = contains logic specific to initializing your model and running predictions\n",
|
||||
" - environment = An environment object to use for the deployment. Doesn't have to be registered"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"create image"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=myenv)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Deploy Model as Webservice on Azure Container Instance\n",
|
||||
"\n",
|
||||
"Note that the service creation can take few minutes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"azuremlexception-remarks-sample"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.webservice import AciWebservice, Webservice\n",
|
||||
"from azureml.exceptions import WebserviceException\n",
|
||||
"\n",
|
||||
"deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)\n",
|
||||
"aci_service_name = 'aciservice1'\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" # if you want to get existing service below is the command\n",
|
||||
" # since aci name needs to be unique in subscription deleting existing aci if any\n",
|
||||
" # we use aci_service_name to create azure aci\n",
|
||||
" service = Webservice(ws, name=aci_service_name)\n",
|
||||
" if service:\n",
|
||||
" service.delete()\n",
|
||||
"except WebserviceException as e:\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
"service = Model.deploy(ws, aci_service_name, [model], inference_config, deployment_config)\n",
|
||||
"\n",
|
||||
"service.wait_for_deployment(True)\n",
|
||||
"print(service.state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Test web service"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"test_sample = json.dumps({'features':{'type':1,'values':[4.3,3.0,1.1,0.1]},'label':2.0})\n",
|
||||
"\n",
|
||||
"test_sample_encoded = bytes(test_sample, encoding='utf8')\n",
|
||||
"prediction = service.run(input_data=test_sample_encoded)\n",
|
||||
"print(prediction)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Delete ACI to clean up"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"deploy service",
|
||||
"aci"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"service.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Model Profiling\n",
|
||||
"\n",
|
||||
"You can also take advantage of the profiling feature to estimate CPU and memory requirements for models.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"profile = Model.profile(ws, \"profilename\", [model], inference_config, test_sample)\n",
|
||||
"profile.wait_for_profiling(True)\n",
|
||||
"profiling_results = profile.get_results()\n",
|
||||
"print(profiling_results)\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Model Packaging\n",
|
||||
"\n",
|
||||
"If you want to build a Docker image that encapsulates your model and its dependencies, you can use the model packaging option. The output image will be pushed to your workspace's ACR.\n",
|
||||
"\n",
|
||||
"You must include an Environment object in your inference configuration to use `Model.package()`.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"package = Model.package(ws, [model], inference_config)\n",
|
||||
"package.wait_for_creation(show_output=True) # Or show_output=False to hide the Docker build logs.\n",
|
||||
"package.pull()\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Instead of a fully-built image, you can also generate a Dockerfile and download all the assets needed to build an image on top of your Environment.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"package = Model.package(ws, [model], inference_config, generate_dockerfile=True)\n",
|
||||
"package.wait_for_creation(show_output=True)\n",
|
||||
"package.save(\"./local_context_dir\")\n",
|
||||
"```"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "vaidyas"
|
||||
}
|
||||
],
|
||||
"category": "deployment",
|
||||
"compute": [
|
||||
"None"
|
||||
],
|
||||
"datasets": [
|
||||
"Iris"
|
||||
],
|
||||
"deployment": [
|
||||
"Azure Container Instance"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"PySpark"
|
||||
],
|
||||
"friendly_name": "Register Spark model and deploy as webservice",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
name: model-register-and-deploy-spark
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -1,37 +0,0 @@
|
||||
import traceback
|
||||
from pyspark.ml.linalg import VectorUDT
|
||||
from azureml.core.model import Model
|
||||
from pyspark.ml.classification import LogisticRegressionModel
|
||||
from pyspark.sql.types import StructType, StructField
|
||||
from pyspark.sql.types import DoubleType
|
||||
from pyspark.sql import SQLContext
|
||||
from pyspark import SparkContext
|
||||
|
||||
sc = SparkContext.getOrCreate()
|
||||
sqlContext = SQLContext(sc)
|
||||
spark = sqlContext.sparkSession
|
||||
|
||||
input_schema = StructType([StructField("features", VectorUDT()), StructField("label", DoubleType())])
|
||||
reader = spark.read
|
||||
reader.schema(input_schema)
|
||||
|
||||
|
||||
def init():
|
||||
global model
|
||||
# note here "iris.model" is the name of the model registered under the workspace
|
||||
# this call should return the path to the model.pkl file on the local disk.
|
||||
model_path = Model.get_model_path('iris.model')
|
||||
# Load the model file back into a LogisticRegression model
|
||||
model = LogisticRegressionModel.load(model_path)
|
||||
|
||||
|
||||
def run(data):
|
||||
try:
|
||||
input_df = reader.json(sc.parallelize([data]))
|
||||
result = model.transform(input_df)
|
||||
# you can return any datatype as long as it is JSON-serializable
|
||||
return result.collect()[0]['prediction']
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
error = str(e)
|
||||
return error
|
||||
@@ -1,59 +0,0 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
from azureml.core.run import Run
|
||||
from azureml.interpret import ExplanationClient
|
||||
from interpret_community.adapter import ExplanationAdapter
|
||||
import joblib
|
||||
import os
|
||||
import shap
|
||||
import xgboost
|
||||
|
||||
OUTPUT_DIR = './outputs/'
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
run = Run.get_context()
|
||||
client = ExplanationClient.from_run(run)
|
||||
|
||||
# get a dataset on income prediction
|
||||
X, y = shap.datasets.adult()
|
||||
features = X.columns.values
|
||||
|
||||
# train an XGBoost model (but any other tree model type should work)
|
||||
model = xgboost.XGBClassifier()
|
||||
model.fit(X, y)
|
||||
|
||||
explainer = shap.explainers.GPUTree(model, X)
|
||||
X_shap = X[:100]
|
||||
shap_values = explainer(X_shap)
|
||||
|
||||
print("computed shap values:")
|
||||
print(shap_values)
|
||||
|
||||
# Use the explanation adapter to convert the importances into an interpret-community
|
||||
# style explanation which can be uploaded to AzureML or visualized in the
|
||||
# ExplanationDashboard widget
|
||||
adapter = ExplanationAdapter(features, classification=True)
|
||||
global_explanation = adapter.create_global(shap_values.values, X_shap, expected_values=shap_values.base_values)
|
||||
|
||||
# write X_shap out as a pickle file for later visualization
|
||||
x_shap_pkl = 'x_shap.pkl'
|
||||
with open(x_shap_pkl, 'wb') as file:
|
||||
joblib.dump(value=X_shap, filename=os.path.join(OUTPUT_DIR, x_shap_pkl))
|
||||
run.upload_file('x_shap_adult_census.pkl', os.path.join(OUTPUT_DIR, x_shap_pkl))
|
||||
|
||||
model_file_name = 'xgboost_.pkl'
|
||||
# save model in the outputs folder so it automatically gets uploaded
|
||||
with open(model_file_name, 'wb') as file:
|
||||
joblib.dump(value=model, filename=os.path.join(OUTPUT_DIR,
|
||||
model_file_name))
|
||||
|
||||
# register the model
|
||||
run.upload_file('xgboost_model.pkl', os.path.join('./outputs/', model_file_name))
|
||||
original_model = run.register_model(model_name='xgboost_with_gpu_tree_explainer',
|
||||
model_path='xgboost_model.pkl')
|
||||
|
||||
# Uploading model explanation data for storage or visualization in webUX
|
||||
# The explanation can then be downloaded on any compute
|
||||
comment = 'Global explanation on classification model trained on adult census income dataset'
|
||||
client.upload_model_explanation(global_explanation, comment=comment, model_id=original_model.id)
|
||||
@@ -1,503 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Explain tree-based models on GPU using GPUTreeExplainer\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"_**This notebook illustrates how to use shap's GPUTreeExplainer on an Azure GPU machine.**_\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Problem: Train a tree-based model and explain the model on an Azure GPU machine using the GPUTreeExplainer.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Table of Contents\n",
|
||||
"\n",
|
||||
"1. [Introduction](#Introduction)\n",
|
||||
"1. [Setup](#Setup)\n",
|
||||
"1. [Run model explainer locally at training time](#Explain)\n",
|
||||
" 1. Apply feature transformations\n",
|
||||
" 1. Train a binary classification model\n",
|
||||
" 1. Explain the model on raw features\n",
|
||||
" 1. Generate global explanations\n",
|
||||
" 1. Generate local explanations\n",
|
||||
"1. [Visualize explanations](#Visualize)\n",
|
||||
"1. [Deploy model and scoring explainer](#Deploy)\n",
|
||||
"1. [Next steps](#Next)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"This notebook demonstrates how to use the GPUTreeExplainer on some simple datasets. Like the TreeExplainer, the GPUTreeExplainer is specifically designed for tree-based machine learning models, but it is designed to accelerate the computations using NVIDIA GPUs.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"Notebook synopsis:\n",
|
||||
"\n",
|
||||
"1. Creating an Experiment in an existing Workspace\n",
|
||||
"2. Configuration and remote run with a GPU machine"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import logging\n",
|
||||
"import os\n",
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"import azureml.core\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"from azureml.core.dataset import Dataset\n",
|
||||
"from azureml.core.compute import AmlCompute\n",
|
||||
"from azureml.core.compute import ComputeTarget\n",
|
||||
"from azureml.core.run import Run\n",
|
||||
"from azureml.core.model import Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.45.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As part of the setup you have already created a <b>Workspace</b>. To run the script, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# Choose an experiment name.\n",
|
||||
"experiment_name = 'gpu-tree-explainer'\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace Name'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Experiment Name'] = experiment.name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create project directory\n",
|
||||
"\n",
|
||||
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"project_folder = './azureml-shap-gpu-tree-explainer'\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)\n",
|
||||
"shutil.copy('gpu_tree_explainer.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Set up a compute cluster\n",
|
||||
"This section uses a user-provided compute cluster (named \"gpu-shap-cluster\" in this example). If a cluster with this name does not exist in the user's workspace, the below code will create a new cluster. You can choose the parameters of the cluster as mentioned in the comments."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"num_nodes = 1\n",
|
||||
"\n",
|
||||
"# Choose a name for your cluster.\n",
|
||||
"amlcompute_cluster_name = \"gpu-shap-cluster\"\n",
|
||||
"\n",
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\",\n",
|
||||
" # To use GPUTreeExplainer, select a GPU such as \"STANDARD_NC6\" \n",
|
||||
" # or similar GPU option\n",
|
||||
" # available in your workspace\n",
|
||||
" max_nodes = num_nodes)\n",
|
||||
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure & Run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.runconfig import RunConfiguration\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"\n",
|
||||
"# Create a new RunConfig object\n",
|
||||
"run_config = RunConfiguration(framework=\"python\")\n",
|
||||
"\n",
|
||||
"# Set compute target to AmlCompute target created in previous step\n",
|
||||
"run_config.target = amlcompute_cluster_name\n",
|
||||
"\n",
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"environment_name = \"shapgpu\"\n",
|
||||
"env = Environment(environment_name)\n",
|
||||
"\n",
|
||||
"env.docker.enabled = True\n",
|
||||
"env.docker.base_image = None\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Note: this is to pin the pandas and xgboost versions to be same as notebook.\n",
|
||||
"# In production scenario user would choose their dependencies\n",
|
||||
"import pkg_resources\n",
|
||||
"available_packages = pkg_resources.working_set\n",
|
||||
"pandas_ver = None\n",
|
||||
"numpy_ver = None\n",
|
||||
"for dist in list(available_packages):\n",
|
||||
" if dist.key == 'pandas':\n",
|
||||
" pandas_ver = dist.version\n",
|
||||
"pandas_dep = 'pandas'\n",
|
||||
"numpy_dep = 'numpy'\n",
|
||||
"if pandas_ver:\n",
|
||||
" pandas_dep = 'pandas=={}'.format(pandas_ver)\n",
|
||||
"if numpy_ver:\n",
|
||||
" numpy_dep = 'numpy=={}'.format(numpy_ver)\n",
|
||||
"\n",
|
||||
"# Note: we build shap at commit 690245 for Tesla K80 GPUs\n",
|
||||
"env.docker.base_dockerfile = f\"\"\"\n",
|
||||
"FROM nvidia/cuda:10.2-devel-ubuntu18.04\n",
|
||||
"ENV PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
|
||||
"ARG PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
|
||||
"RUN apt-get update && \\\n",
|
||||
"apt-get install -y fuse && \\\n",
|
||||
"apt-get install -y build-essential && \\\n",
|
||||
"apt-get install -y python3-dev && \\\n",
|
||||
"apt-get install -y wget && \\\n",
|
||||
"apt-get install -y git && \\\n",
|
||||
"rm -rf /var/lib/apt/lists/* && \\\n",
|
||||
"wget \\\n",
|
||||
"https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \\\n",
|
||||
"mkdir /root/.conda && \\\n",
|
||||
"bash Miniconda3-latest-Linux-x86_64.sh -b && \\\n",
|
||||
"rm -f Miniconda3-latest-Linux-x86_64.sh && \\\n",
|
||||
"conda init bash && \\\n",
|
||||
". ~/.bashrc && \\\n",
|
||||
"conda create -n shapgpu python=3.8 && \\\n",
|
||||
"conda activate shapgpu && \\\n",
|
||||
"apt-get install -y g++ && \\\n",
|
||||
"printenv && \\\n",
|
||||
"echo \"which nvcc: \" && \\\n",
|
||||
"which nvcc && \\\n",
|
||||
"pip install azureml-defaults && \\\n",
|
||||
"pip install azureml-telemetry && \\\n",
|
||||
"pip install azureml-interpret && \\\n",
|
||||
"pip install {pandas_dep} && \\\n",
|
||||
"cd /usr/local/src && \\\n",
|
||||
"git clone https://github.com/slundberg/shap.git --single-branch && \\\n",
|
||||
"cd shap && \\\n",
|
||||
"git reset --hard 690245c6ab043edf40cfce3d8438a62e29ab599f && \\\n",
|
||||
"mkdir build && \\\n",
|
||||
"python setup.py install --user && \\\n",
|
||||
"pip uninstall -y xgboost && \\\n",
|
||||
"conda install py-xgboost==1.3.3 && \\\n",
|
||||
"pip uninstall -y numpy && \\\n",
|
||||
"conda install {numpy_dep} \\\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"env.python.user_managed_dependencies = True\n",
|
||||
"env.python.interpreter_path = '/root/miniconda3/envs/shapgpu/bin/python'\n",
|
||||
"\n",
|
||||
"from azureml.core import Run\n",
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory=project_folder, \n",
|
||||
" script='gpu_tree_explainer.py', \n",
|
||||
" compute_target=amlcompute_cluster_name,\n",
|
||||
" environment=env) \n",
|
||||
"run = experiment.submit(config=src)\n",
|
||||
"run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note: if you need to cancel a run, you can follow [these instructions](https://aka.ms/aml-docs-cancel-run)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"# Shows output of the run on stdout.\n",
|
||||
"run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run.get_metrics()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download \n",
|
||||
"1. Download model explanation data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.interpret import ExplanationClient\n",
|
||||
"\n",
|
||||
"# Get model explanation data\n",
|
||||
"client = ExplanationClient.from_run(run)\n",
|
||||
"global_explanation = client.download_model_explanation()\n",
|
||||
"local_importance_values = global_explanation.local_importance_values\n",
|
||||
"expected_values = global_explanation.expected_values"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get the top k (e.g., 4) most important features with their importance values\n",
|
||||
"global_explanation_topk = client.download_model_explanation(top_k=4)\n",
|
||||
"global_importance_values = global_explanation_topk.get_ranked_global_values()\n",
|
||||
"global_importance_names = global_explanation_topk.get_ranked_global_names()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('global importance values: {}'.format(global_importance_values))\n",
|
||||
"print('global importance names: {}'.format(global_importance_names))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Download model file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieve model for visualization and deployment\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"import joblib\n",
|
||||
"original_model = Model(ws, 'xgboost_with_gpu_tree_explainer')\n",
|
||||
"model_path = original_model.download(exist_ok=True)\n",
|
||||
"original_model = joblib.load(model_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"3. Download test dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieve x_test for visualization\n",
|
||||
"x_test_path = './x_shap_adult_census.pkl'\n",
|
||||
"run.download_file('x_shap_adult_census.pkl', output_file_path=x_test_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_test = joblib.load('x_shap_adult_census.pkl')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualize\n",
|
||||
"Load the visualization dashboard"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from raiwidgets import ExplanationDashboard"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from interpret_community.common.model_wrapper import wrap_model\n",
|
||||
"from interpret_community.dataset.dataset_wrapper import DatasetWrapper\n",
|
||||
"# note we need to wrap the XGBoost model to output predictions and probabilities in the scikit-learn format\n",
|
||||
"class WrappedXGBoostModel(object):\n",
|
||||
" \"\"\"A class for wrapping an XGBoost model to output integer predicted classes.\"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, model):\n",
|
||||
" self.model = model\n",
|
||||
"\n",
|
||||
" def predict(self, dataset):\n",
|
||||
" return self.model.predict(dataset).astype(int)\n",
|
||||
"\n",
|
||||
" def predict_proba(self, dataset):\n",
|
||||
" return self.model.predict_proba(dataset)\n",
|
||||
"\n",
|
||||
"wrapped_model = WrappedXGBoostModel(wrap_model(original_model, DatasetWrapper(x_test), model_task='classification'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ExplanationDashboard(global_explanation, wrapped_model, dataset=x_test)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "ilmat"
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
name: train-explain-model-gpu-tree-explainer
|
||||
dependencies:
|
||||
- py-xgboost==1.3.3
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-interpret
|
||||
- flask
|
||||
- flask-cors
|
||||
- gevent>=1.3.6
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.21.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
- protobuf==3.20.0
|
||||
- jinja2==3.0.3
|
||||
@@ -496,9 +496,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -8,9 +8,8 @@ dependencies:
|
||||
- gevent>=1.3.6
|
||||
- ipython
|
||||
- matplotlib
|
||||
- azureml-dataset-runtime
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.21.0
|
||||
- raiwidgets~=0.28.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -595,9 +595,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -9,7 +9,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.21.0
|
||||
- raiwidgets~=0.28.0
|
||||
- packaging>=20.9
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
|
||||
@@ -516,9 +516,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -9,7 +9,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.21.0
|
||||
- raiwidgets~=0.28.0
|
||||
- packaging>=20.9
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
|
||||
@@ -576,9 +576,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -8,10 +8,9 @@ dependencies:
|
||||
- gevent>=1.3.6
|
||||
- ipython
|
||||
- matplotlib
|
||||
- azureml-dataset-runtime
|
||||
- azureml-core
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.21.0
|
||||
- raiwidgets~=0.28.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -175,7 +175,7 @@
|
||||
"store_name=os.getenv(\"ADL_STORENAME_62\", \"<my-datastore-name>\") # ADLS account name\n",
|
||||
"tenant_id=os.getenv(\"ADL_TENANT_62\", \"<my-tenant-id>\") # tenant id of service principal\n",
|
||||
"client_id=os.getenv(\"ADL_CLIENTID_62\", \"<my-client-id>\") # client id of service principal\n",
|
||||
"client_secret=os.getenv(\"ADL_CLIENT_SECRET_62\", \"<my-client-secret>\") # the secret of service principal\n",
|
||||
"client_st=os.getenv(\"ADL_CLIENT_SECRET_62\", \"<my-client-secret>\") # the secret of service principal\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" adls_datastore = Datastore.get(ws, datastore_name)\n",
|
||||
@@ -189,7 +189,7 @@
|
||||
" store_name=store_name, # ADLS account name\n",
|
||||
" tenant_id=tenant_id, # tenant id of service principal\n",
|
||||
" client_id=client_id, # client id of service principal\n",
|
||||
" client_secret=client_secret) # the secret of service principal\n",
|
||||
" client_secret=client_st) # the secret of service principal\n",
|
||||
" print(\"Registered datastore with name: %s\" % datastore_name)\n",
|
||||
"\n",
|
||||
"adls_data_ref = DataReference(\n",
|
||||
@@ -579,9 +579,9 @@
|
||||
],
|
||||
"friendly_name": "Azure Machine Learning Pipeline with DataTranferStep",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user