notebook patches
This commit is contained in:
@@ -0,0 +1,117 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
# Adapted from:
|
||||
# https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py
|
||||
# ====================================================================
|
||||
"""Train a CNN model on the MNIST dataset via distributed training."""
|
||||
|
||||
from __future__ import print_function
|
||||
import numpy as np
|
||||
import os
|
||||
import cntk as C
|
||||
import argparse
|
||||
from cntk.train.training_session import CheckpointConfig, TestConfig
|
||||
|
||||
|
||||
def create_reader(path, is_training, input_dim, label_dim, total_number_of_samples):
    """Build a MinibatchSource over a CTF-format data file.

    Args:
        path: Location of the CTF text file to read.
        is_training: When True the samples are shuffled (training reader);
            evaluation readers keep file order.
        input_dim: Dimensionality of the 'features' stream.
        label_dim: Dimensionality of the 'labels' stream.
        total_number_of_samples: Cap on how many samples the source may emit.

    Returns:
        A cntk.io.MinibatchSource exposing 'features' and 'labels' streams.
    """
    stream_defs = C.io.StreamDefs(
        features=C.io.StreamDef(field='features', shape=input_dim),
        labels=C.io.StreamDef(field='labels', shape=label_dim),
    )
    deserializer = C.io.CTFDeserializer(path, stream_defs)
    return C.io.MinibatchSource(
        deserializer,
        randomize=is_training,
        max_samples=total_number_of_samples,
    )
|
||||
|
||||
|
||||
def convnet_mnist(max_epochs, output_dir, data_dir, debug_output=False, epoch_size=60000, minibatch_size=64):
    """Creates and trains a feedforward classification model for MNIST images.

    Builds a small convolutional network, reads CTF-format MNIST data from
    ``data_dir``, and trains with a data-parallel distributed learner so the
    script can run under MPI across multiple workers.

    Args:
        max_epochs: Number of passes over the training set.
        output_dir: Directory where checkpoints ("ConvNet_MNIST") are written.
        data_dir: Directory containing Train-28x28_cntk_text.txt and
            Test-28x28_cntk_text.txt.
        debug_output: Unused in this body; kept for interface compatibility.
        epoch_size: Samples per epoch (MNIST training set has 60000).
        minibatch_size: Per-worker minibatch size.
    """
    image_height = 28
    image_width = 28
    num_channels = 1  # MNIST is grayscale
    input_dim = image_height * image_width * num_channels
    num_output_classes = 10

    # Input variables denoting the features and label data
    input_var = C.ops.input_variable((num_channels, image_height, image_width), np.float32)
    label_var = C.ops.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model.
    # 0.00390625 == 1/256: rescales raw 8-bit pixel values into [0, 1).
    scaled_input = C.ops.element_times(C.ops.constant(0.00390625), input_var)

    # Conv/pool stack; default_options applies ReLU and no padding unless a
    # layer overrides it (conv1 explicitly pads).
    with C.layers.default_options(activation=C.ops.relu, pad=False):
        conv1 = C.layers.Convolution2D((5, 5), 32, pad=True)(scaled_input)
        pool1 = C.layers.MaxPooling((3, 3), (2, 2))(conv1)
        conv2 = C.layers.Convolution2D((3, 3), 48)(pool1)
        pool2 = C.layers.MaxPooling((3, 3), (2, 2))(conv2)
        conv3 = C.layers.Convolution2D((3, 3), 64)(pool2)
        f4 = C.layers.Dense(96)(conv3)
        drop4 = C.layers.Dropout(0.5)(f4)
        # Final layer has no activation: cross_entropy_with_softmax below
        # applies softmax itself.
        z = C.layers.Dense(num_output_classes, activation=None)(drop4)

    ce = C.losses.cross_entropy_with_softmax(z, label_var)
    pe = C.metrics.classification_error(z, label_var)

    # Load train data (capped at max_epochs * epoch_size total samples)
    reader_train = create_reader(os.path.join(data_dir, 'Train-28x28_cntk_text.txt'), True,
                                 input_dim, num_output_classes, max_epochs * epoch_size)
    # Load test data (single full sweep, no shuffling)
    reader_test = create_reader(os.path.join(data_dir, 'Test-28x28_cntk_text.txt'), False,
                                input_dim, num_output_classes, C.io.FULL_DATA_SWEEP)

    # Set learning parameters: per-sample LR decays in steps of epoch_size;
    # momentum switches on after 5 epochs.
    lr_per_sample = [0.001] * 10 + [0.0005] * 10 + [0.0001]
    lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size)
    mms = [0] * 5 + [0.9990239141819757]
    mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size)

    # Instantiate the trainer object to drive the model training
    local_learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    progress_printer = C.logging.ProgressPrinter(
        tag='Training',
        rank=C.train.distributed.Communicator.rank(),  # per-worker rank in logs
        num_epochs=max_epochs,
    )

    # Wrap the local learner so gradients are aggregated across MPI workers.
    learner = C.train.distributed.data_parallel_distributed_learner(local_learner)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map_train = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    input_map_test = {
        input_var: reader_test.streams.features,
        label_var: reader_test.streams.labels
    }

    C.logging.log_number_of_parameters(z)
    print()

    # Run training; checkpoints once per epoch and evaluates on the test
    # reader when training finishes.
    C.train.training_session(
        trainer=trainer,
        mb_source=reader_train,
        model_inputs_to_streams=input_map_train,
        mb_size=minibatch_size,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(frequency=epoch_size,
                                           filename=os.path.join(output_dir, "ConvNet_MNIST")),
        test_config=TestConfig(reader_test, minibatch_size=minibatch_size,
                               model_inputs_to_streams=input_map_test)
    ).train()

    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: parse arguments, train, then shut down MPI.
    parser = argparse.ArgumentParser()
    # default is an int to match type=int (the original passed the string
    # '40'; argparse coerces string defaults, but an int default is the
    # idiomatic and unambiguous form).
    parser.add_argument('--num_epochs', help='Total number of epochs to train', type=int, default=40)
    parser.add_argument('--output_dir', help='Output directory', required=False, default='outputs')
    parser.add_argument('--data_dir', help='Directory with training data')
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    convnet_mnist(args.num_epochs, args.output_dir, args.data_dir)

    # Must call MPI finalize when process exits without exceptions
    C.train.distributed.Communicator.finalize()
|
||||
@@ -0,0 +1,394 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Distributed CNTK using custom docker images\n",
|
||||
"In this tutorial, you will train a CNTK model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using a custom docker image and distributed training."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning\n",
|
||||
"* Go through the [00.configuration.ipynb]() notebook to:\n",
|
||||
" * install the AML SDK\n",
|
||||
" * create a workspace and its configuration file (`config.json`)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Diagnostics\n",
|
||||
"Opt-in diagnostics for better experience, quality, and security of future releases."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"Diagnostics"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.telemetry import set_diagnostics_collection\n",
|
||||
"\n",
|
||||
"set_diagnostics_collection(send_diagnostics=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"\n",
|
||||
"Initialize a [Workspace](https://review.docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture?branch=release-ignite-aml#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or Attach existing AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
|
||||
"\n",
|
||||
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# choose a name for your cluster\n",
|
||||
"cluster_name = \"gpucluster\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
|
||||
" print('Found existing compute target.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print('Creating a new compute target...')\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
|
||||
" max_nodes=4)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
" compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# Use the 'status' property to get a detailed status for the current AmlCompute. \n",
|
||||
"print(compute_target.status.serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Upload training data\n",
|
||||
"For this tutorial, we will be using the MNIST dataset.\n",
|
||||
"\n",
|
||||
"First, let's download the dataset. We've included the `install_mnist.py` script to download the data and convert it to a CNTK-supported format. Our data files will get written to a directory named `'mnist'`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import install_mnist\n",
|
||||
"\n",
|
||||
"install_mnist.main('mnist')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To make the data accessible for remote training, you will need to upload the data from your local machine to the cloud. AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data, and interact with it from your remote compute targets. \n",
|
||||
"\n",
|
||||
"Each workspace is associated with a default datastore. In this tutorial, we will upload the training data to this default datastore, which we will then mount on the remote compute for training in the next section."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"print(ds.datastore_type, ds.account_name, ds.container_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following code will upload the training data to the path `./mnist` on the default datastore."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ds.upload(src_dir='./mnist', target_path='./mnist')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's get a reference to the path on the datastore with the training data. We can do so using the `path` method. In the next section, we can then pass this reference to our training script's `--data_dir` argument. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path_on_datastore = 'mnist'\n",
|
||||
"ds_data = ds.path(path_on_datastore)\n",
|
||||
"print(ds_data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train model on the remote compute\n",
|
||||
"Now that we have the cluster ready to go, let's run our distributed training job."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a project directory\n",
|
||||
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"project_folder = './cntk-distr'\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copy the training script `cntk_distr_mnist.py` into this project directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"shutil.copy('cntk_distr_mnist.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"Create an [experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed CNTK tutorial. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'cntk-distr'\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an Estimator\n",
|
||||
"The AML SDK's base Estimator enables you to easily submit custom scripts for both single-node and distributed runs. You should this generic estimator for training code using frameworks such as sklearn or CNTK that don't have corresponding custom estimators. For more information on using the generic estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-ml-models)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.train.estimator import *\n",
|
||||
"\n",
|
||||
"script_params = {\n",
|
||||
" '--num_epochs': 20,\n",
|
||||
" '--data_dir': ds_data.as_mount(),\n",
|
||||
" '--output_dir': './outputs'\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"estimator = Estimator(source_directory=project_folder,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" entry_script='cntk_distr_mnist.py',\n",
|
||||
" script_params=script_params,\n",
|
||||
" node_count=2,\n",
|
||||
" process_count_per_node=1,\n",
|
||||
" distributed_backend='mpi', \n",
|
||||
" pip_packages=['cntk-gpu==2.6'],\n",
|
||||
" custom_docker_base_image='microsoft/mmlspark:gpu-0.12',\n",
|
||||
" use_gpu=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We would like to train our model using a [pre-built Docker container](https://hub.docker.com/r/microsoft/mmlspark/). To do so, specify the name of the docker image to the argument `custom_docker_base_image`. You can only provide images available in public docker repositories such as Docker Hub using this argument. To use an image from a private docker repository, use the constructor's `environment_definition` parameter instead. Finally, we provide the `cntk` package to `pip_packages` to install CNTK 2.6 on our custom image.\n",
|
||||
"\n",
|
||||
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to run distributed CNTK, which uses MPI, you must provide the argument `distributed_backend='mpi'`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit job\n",
|
||||
"Run your experiment by submitting your estimator object. Note that this call is asynchronous."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = experiment.submit(estimator)\n",
|
||||
"print(run)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor your run\n",
|
||||
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Alternatively, you can block until the script has completed training before running more code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "minxia"
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,96 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
# Script from:
|
||||
# https://github.com/Microsoft/CNTK/blob/master/Examples/Image/DataSets/MNIST/install_mnist.py
|
||||
|
||||
from __future__ import print_function
|
||||
try:
|
||||
from urllib.request import urlretrieve
|
||||
except ImportError:
|
||||
from urllib import urlretrieve
|
||||
import gzip
|
||||
import os
|
||||
import struct
|
||||
import numpy as np
|
||||
|
||||
|
||||
def loadData(src, cimg):
    """Download and parse a gzipped MNIST image file (IDX format).

    Args:
        src: URL of the gzipped IDX image file.
        cimg: Expected number of images in the file.

    Returns:
        numpy uint8 array of shape (cimg, 28*28), one flattened image per row.

    Raises:
        Exception: If the magic number, entry count, or image size is wrong.
    """
    print('Downloading ' + src)
    gzfname, h = urlretrieve(src, './delete.me')
    print('Done.')
    try:
        with gzip.open(gzfname) as gz:
            # IDX headers are big-endian; read the magic number explicitly as
            # '>I' (0x00000803 for image files). The original used a
            # native-endian 'I' read compared against 0x3080000, which only
            # matches on little-endian machines.
            n = struct.unpack('>I', gz.read(4))[0]
            if n != 0x803:
                raise Exception('Invalid file: unexpected magic number.')
            # Read number of entries.
            n = struct.unpack('>I', gz.read(4))[0]
            if n != cimg:
                raise Exception('Invalid file: expected {0} entries.'.format(cimg))
            crow = struct.unpack('>I', gz.read(4))[0]
            ccol = struct.unpack('>I', gz.read(4))[0]
            if crow != 28 or ccol != 28:
                raise Exception('Invalid file: expected 28 rows/cols per image.')
            # Read data. np.frombuffer replaces the deprecated np.fromstring,
            # which no longer accepts bytes input in modern NumPy.
            res = np.frombuffer(gz.read(cimg * crow * ccol), dtype=np.uint8)
    finally:
        # Always remove the temporary download, even on a parse failure.
        os.remove(gzfname)
    return res.reshape((cimg, crow * ccol))
|
||||
|
||||
|
||||
def loadLabels(src, cimg):
    """Download and parse a gzipped MNIST label file (IDX format).

    Args:
        src: URL of the gzipped IDX label file.
        cimg: Expected number of labels in the file.

    Returns:
        numpy uint8 array of shape (cimg, 1), one label per row.

    Raises:
        Exception: If the magic number or entry count is wrong.
    """
    print('Downloading ' + src)
    gzfname, h = urlretrieve(src, './delete.me')
    print('Done.')
    try:
        with gzip.open(gzfname) as gz:
            # IDX headers are big-endian; read the magic number explicitly as
            # '>I' (0x00000801 for label files). The original used a
            # native-endian 'I' read compared against 0x1080000, which only
            # matches on little-endian machines.
            n = struct.unpack('>I', gz.read(4))[0]
            if n != 0x801:
                raise Exception('Invalid file: unexpected magic number.')
            # Read number of entries.
            n = struct.unpack('>I', gz.read(4))[0]
            if n != cimg:
                raise Exception('Invalid file: expected {0} rows.'.format(cimg))
            # Read labels. np.frombuffer replaces the deprecated np.fromstring,
            # which no longer accepts bytes input in modern NumPy.
            res = np.frombuffer(gz.read(cimg), dtype=np.uint8)
    finally:
        # Always remove the temporary download, even on a parse failure.
        os.remove(gzfname)
    return res.reshape((cimg, 1))
|
||||
|
||||
|
||||
def load(dataSrc, labelsSrc, cimg):
    """Download images and labels and join them into a single array.

    Returns:
        numpy array of shape (cimg, 785): 784 pixel columns followed by one
        label column (the layout savetxt expects).
    """
    return np.hstack((loadData(dataSrc, cimg), loadLabels(labelsSrc, cimg)))
|
||||
|
||||
|
||||
def savetxt(filename, ndarray):
    """Write rows of (pixels..., label) to `filename` in CNTK CTF text format.

    Each row's last column is the class index; every other column is a
    feature. One output line per row, shaped like:
        |labels <one-hot of last column> |features <remaining columns>
    """
    # Pre-render the ten possible one-hot label strings once.
    one_hot = np.eye(10, dtype=np.uint).astype(str)
    label_lines = [' '.join(bits) for bits in one_hot]
    with open(filename, 'w') as out:
        for row in ndarray:
            as_text = row.astype(str)
            features = ' '.join(as_text[:-1])
            out.write('|labels {} |features {}\n'.format(label_lines[row[-1]], features))
|
||||
|
||||
|
||||
def main(data_dir):
    """Download the MNIST train/test sets and write them as CNTK CTF text files.

    Creates `data_dir` if needed, then writes Train-28x28_cntk_text.txt
    (60000 samples) and Test-28x28_cntk_text.txt (10000 samples) into it.
    """
    os.makedirs(data_dir, exist_ok=True)
    # Training set: 60k images + labels.
    training = load('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
                    'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 60000)
    print('Writing train text file...')
    savetxt(os.path.join(data_dir, 'Train-28x28_cntk_text.txt'), training)
    print('Done.')
    # Test set: 10k images + labels.
    testing = load('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
                   'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 10000)
    print('Writing test text file...')
    savetxt(os.path.join(data_dir, 'Test-28x28_cntk_text.txt'), testing)
    print('Done.')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main('mnist')
|
||||
@@ -13,60 +13,58 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 01. Train in the Notebook & Deploy Model to ACI\n",
|
||||
"# Train and deploy a model\n",
|
||||
"_**Create and deploy a model directly from a notebook**_\n",
|
||||
"\n",
|
||||
"* Load workspace\n",
|
||||
"* Train a simple regression model directly in the Notebook python kernel\n",
|
||||
"* Record run history\n",
|
||||
"* Find the best model in run history and download it.\n",
|
||||
"* Deploy the model as an Azure Container Instance (ACI)"
|
||||
"---\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Contents\n",
|
||||
"1. [Introduction](#Introduction)\n",
|
||||
"1. [Setup](#Setup)\n",
|
||||
"1. [Data](#Data)\n",
|
||||
"1. [Train](#Train)\n",
|
||||
" 1. Viewing run results\n",
|
||||
" 1. Simple parameter sweep\n",
|
||||
" 1. Viewing experiment results\n",
|
||||
" 1. Select the best model\n",
|
||||
"1. [Deploy](#Deploy)\n",
|
||||
" 1. Register the model\n",
|
||||
" 1. Create a scoring file\n",
|
||||
" 1. Describe your environment\n",
|
||||
" 1. Descrice your target compute\n",
|
||||
" 1. Deploy your webservice\n",
|
||||
" 1. Test your webservice\n",
|
||||
" 1. Clean up\n",
|
||||
"1. [Next Steps](#Next%20Steps)\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Introduction\n",
|
||||
"Azure Machine Learning provides capabilities to control all aspects of model training and deployment directly from a notebook using the AML Python SDK. In this notebook we will\n",
|
||||
"* connect to our AML Workspace\n",
|
||||
"* create an experiment that contains multiple runs with tracked metrics\n",
|
||||
"* choose the best model created across all runs\n",
|
||||
"* deploy that model as a service\n",
|
||||
"\n",
|
||||
"In the end we will have a model deployed as a web service which we can call from an HTTP endpoint"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"1. Make sure you go through the [Configuration](../../../configuration.ipynb) Notebook first if you haven't. \n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"2. Install following pre-requisite libraries to your conda environment and restart notebook.\n",
|
||||
"## Setup\n",
|
||||
"Make sure you have completed the [Configuration](..\\..\\configuration.ipnyb) notebook to set up your Azure Machine Learning workspace and ensure other common prerequisites are met. From the configuration, the important sections are the workspace configuration and ACI regristration.\n",
|
||||
"\n",
|
||||
"We will also need the following libraries install to our conda environment. If these are not installed, use the following command to do so and restart the notebook.\n",
|
||||
"```shell\n",
|
||||
"(myenv) $ conda install -y matplotlib tqdm scikit-learn\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"3. Check that ACI is registered for your Azure Subscription. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!az provider show -n Microsoft.ContainerInstance -o table"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If ACI is not registered, run following command to register it. Note that you have to be a subscription owner, or this command will fail."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!az provider register -n Microsoft.ContainerInstance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Validate Azure ML SDK installation and get version number for debugging purposes"
|
||||
"For this notebook we need the Azure ML SDK and access to our workspace. The following cell imports the SDK, checks the version, and accesses our already configured AzureML workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -83,28 +81,11 @@
|
||||
"from azureml.core import Experiment, Run, Workspace\n",
|
||||
"\n",
|
||||
"# Check core SDK version number\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize Workspace\n",
|
||||
"print(\"This notebook was created using version 1.0.2 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")\n",
|
||||
"print(\"\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Initialize a workspace object from persisted configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"create workspace"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
@@ -116,8 +97,10 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Set experiment name\n",
|
||||
"Choose a name for experiment."
|
||||
"---\n",
|
||||
"\n",
|
||||
"## Data\n",
|
||||
"We will use the diabetes dataset for this experiement, a well-known small dataset that comes with scikit-learn. This cell loads the dataset and splits it into random training and testing sets.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -126,23 +109,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment_name = 'train-in-notebook'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Start a training run in local Notebook"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load diabetes dataset, a well-known small dataset that comes with scikit-learn\n",
|
||||
"from sklearn.datasets import load_diabetes\n",
|
||||
"from sklearn.linear_model import Ridge\n",
|
||||
"from sklearn.metrics import mean_squared_error\n",
|
||||
@@ -155,36 +121,25 @@
|
||||
"data = {\n",
|
||||
" \"train\":{\"X\": X_train, \"y\": y_train}, \n",
|
||||
" \"test\":{\"X\": X_test, \"y\": y_test}\n",
|
||||
"}"
|
||||
"}\n",
|
||||
"\n",
|
||||
"print (\"Data contains\", len(data['train']['X']), \"training samples and\",len(data['test']['X']), \"test samples\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Train a simple Ridge model\n",
|
||||
"Train a very simple Ridge regression model in scikit-learn, and save it as a pickle file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"reg = Ridge(alpha = 0.03)\n",
|
||||
"reg.fit(X=data['train']['X'], y=data['train']['y'])\n",
|
||||
"preds = reg.predict(data['test']['X'])\n",
|
||||
"print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))\n",
|
||||
"joblib.dump(value=reg, filename='model.pkl');"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Add experiment tracking\n",
|
||||
"Now, let's add Azure ML experiment logging, and upload persisted model into run record as well."
|
||||
"---\n",
|
||||
"## Train\n",
|
||||
"\n",
|
||||
"Let's use scikit-learn to train a simple Ridge regression model. We use AML to record interesting information about the model in an Experiment. An Experiment contains a series of trials called Runs. During this trial we use AML in the following way:\n",
|
||||
"* We access an experiment from our AML workspace by name, which will be created if it doesn't exist\n",
|
||||
"* We use `start_logging` to create a new run in this experiment\n",
|
||||
"* We use `run.log()` to record a parameter, alpha, and an accuracy measure - the Mean Squared Error (MSE) to the run. We will be able to review and compare these measures in the Azure Portal at a later time.\n",
|
||||
"* We store the resulting model in the **outputs** directory, which is automatically captured by AML when the run is complete.\n",
|
||||
"* We use `run.take_snapshot()` to capture *this* notebook so we can reproduce this experiment at a later time.\n",
|
||||
"* We use `run.complete()` to indicate that the run is over and results can be captured and finalized"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -198,18 +153,29 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
|
||||
"run = experiment.start_logging()\n",
|
||||
"# Get an experiment object from Azure Machine Learning\n",
|
||||
"experiment = Experiment(workspace=ws, name=\"train-within-notebook\")\n",
|
||||
"\n",
|
||||
"run.tag(\"Description\",\"My first run!\")\n",
|
||||
"# Create a run object in the experiment\n",
|
||||
"run = experiment.start_logging()# Log the algorithm parameter alpha to the run\n",
|
||||
"run.log('alpha', 0.03)\n",
|
||||
"reg = Ridge(alpha=0.03)\n",
|
||||
"reg.fit(data['train']['X'], data['train']['y'])\n",
|
||||
"preds = reg.predict(data['test']['X'])\n",
|
||||
"run.log('mse', mean_squared_error(data['test']['y'], preds))\n",
|
||||
"joblib.dump(value=reg, filename='model.pkl')\n",
|
||||
"run.upload_file(name='outputs/model.pkl', path_or_stream='./model.pkl')\n",
|
||||
"\n",
|
||||
"# Create, fit, and test the scikit-learn Ridge regression model\n",
|
||||
"regression_model = Ridge(alpha=0.03)\n",
|
||||
"regression_model.fit(data['train']['X'], data['train']['y'])\n",
|
||||
"preds = regression_model.predict(data['test']['X'])\n",
|
||||
"\n",
|
||||
"# Output the Mean Squared Error to the notebook and to the run\n",
|
||||
"print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))\n",
|
||||
"run.log('mse', mean_squared_error(data['test']['y'], preds))\n",
|
||||
"\n",
|
||||
"# Save the model to the outputs directory for capture\n",
|
||||
"joblib.dump(value=regression_model, filename='outputs/model.pkl')\n",
|
||||
"\n",
|
||||
"# Take a snapshot of the directory containing this notebook\n",
|
||||
"run.take_snapshot('./')\n",
|
||||
"\n",
|
||||
"# Complete the run\n",
|
||||
"run.complete()"
|
||||
]
|
||||
},
|
||||
@@ -217,7 +183,8 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can browse to the recorded run. Please make sure you use Chrome to navigate the run history page."
|
||||
"### Viewing run results\n",
|
||||
"Azure Machine Learning stores all the details about the run in the Azure cloud. Let's access those details by retrieving a link to the run using the default run output. Clicking on the resulting link will take you to an interactive page presenting all run information."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -234,7 +201,11 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Simple parameter sweep\n",
|
||||
"Sweep over alpha values of a sklearn ridge model, and capture metrics and trained model in the Azure ML experiment."
|
||||
"Now let's take the same concept from above and modify the **alpha** parameter. For each value of alpha we will create a run that will store metrics and the resulting model. In the end we can use the captured run history to determine which model was the best for us to deploy. \n",
|
||||
"\n",
|
||||
"Note that by using `with experiment.start_logging() as run` AML will automatically call `run.complete()` at the end of each loop.\n",
|
||||
"\n",
|
||||
"This example also uses the **tqdm** library to provide a thermometer feedback"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -257,24 +228,28 @@
|
||||
" # create a bunch of runs, each train a model with a different alpha value\n",
|
||||
" with experiment.start_logging() as run:\n",
|
||||
" # Use Ridge algorithm to build a regression model\n",
|
||||
" reg = Ridge(alpha=alpha)\n",
|
||||
" reg.fit(X=data[\"train\"][\"X\"], y=data[\"train\"][\"y\"])\n",
|
||||
" preds = reg.predict(X=data[\"test\"][\"X\"])\n",
|
||||
" regression_model = Ridge(alpha=alpha)\n",
|
||||
" regression_model.fit(X=data[\"train\"][\"X\"], y=data[\"train\"][\"y\"])\n",
|
||||
" preds = regression_model.predict(X=data[\"test\"][\"X\"])\n",
|
||||
" mse = mean_squared_error(y_true=data[\"test\"][\"y\"], y_pred=preds)\n",
|
||||
"\n",
|
||||
" # log alpha, mean_squared_error and feature names in run history\n",
|
||||
" run.log(name=\"alpha\", value=alpha)\n",
|
||||
" run.log(name=\"mse\", value=mse)\n",
|
||||
" run.log_list(name=\"columns\", value=columns)\n",
|
||||
"\n",
|
||||
" with open(model_name, \"wb\") as file:\n",
|
||||
" joblib.dump(value=reg, filename=file)\n",
|
||||
" # Save the model to the outputs directory for capture\n",
|
||||
" joblib.dump(value=regression_model, filename='outputs/model.pkl')\n",
|
||||
" \n",
|
||||
" # upload the serialized model into run history record\n",
|
||||
" run.upload_file(name=\"outputs/\" + model_name, path_or_stream=model_name)\n",
|
||||
"\n",
|
||||
" # now delete the serialized model from local folder since it is already uploaded to run history \n",
|
||||
" os.remove(path=model_name)"
|
||||
" # Capture this notebook with the run\n",
|
||||
" run.take_snapshot('./')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Viewing experiment results\n",
|
||||
"Similar to viewing the run, we can also view the entire experiment. The experiment report view in the Azure portal lets us view all the runs in a table, and also allows us to customize charts. This way, we can see how the alpha parameter impacts the quality of the model"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -291,8 +266,12 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Select best model from the experiment\n",
|
||||
"Load all experiment run metrics recursively from the experiment into a dictionary object."
|
||||
"### Select the best model \n",
|
||||
"Now that we've created many runs with different parameters, we need to determine which model is the best for deployment. For this, we will iterate over the set of runs. From each run we will take the *run id* using the `id` property, and examine the metrics by calling `run.get_metrics()`. \n",
|
||||
"\n",
|
||||
"Since each run may be different, we do need to check if the run has the metric that we are looking for, in this case, **mse**. To find the best run, we create a dictionary mapping the run id's to the metrics.\n",
|
||||
"\n",
|
||||
"Finally, we use the `tag` method to mark the best run to make it easier to find later. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -304,106 +283,45 @@
|
||||
"runs = {}\n",
|
||||
"run_metrics = {}\n",
|
||||
"\n",
|
||||
"# Create dictionaries containing the runs and the metrics for all runs containing the 'mse' metric\n",
|
||||
"for r in tqdm(experiment.get_runs()):\n",
|
||||
" metrics = r.get_metrics()\n",
|
||||
" if 'mse' in metrics.keys():\n",
|
||||
" runs[r.id] = r\n",
|
||||
" run_metrics[r.id] = metrics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now find the run with the lowest Mean Squared Error value"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
" run_metrics[r.id] = metrics\n",
|
||||
"\n",
|
||||
"# Find the run with the best (lowest) mean squared error and display the id and metrics\n",
|
||||
"best_run_id = min(run_metrics, key = lambda k: run_metrics[k]['mse'])\n",
|
||||
"best_run = runs[best_run_id]\n",
|
||||
"print('Best run is:', best_run_id)\n",
|
||||
"print('Metrics:', run_metrics[best_run_id])"
|
||||
"print('Metrics:', run_metrics[best_run_id])\n",
|
||||
"\n",
|
||||
"# Tag the best run for identification later\n",
|
||||
"best_run.tag(\"Best Run\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can add tags to your runs to make them easier to catalog"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"query history"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run.tag(key=\"Description\", value=\"The best one\")\n",
|
||||
"best_run.get_tags()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Plot MSE over alpha\n",
|
||||
"\n",
|
||||
"Let's observe the best model visually by plotting the MSE values over alpha values:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"best_alpha = run_metrics[best_run_id]['alpha']\n",
|
||||
"min_mse = run_metrics[best_run_id]['mse']\n",
|
||||
"\n",
|
||||
"alpha_mse = np.array([(run_metrics[k]['alpha'], run_metrics[k]['mse']) for k in run_metrics.keys()])\n",
|
||||
"sorted_alpha_mse = alpha_mse[alpha_mse[:,0].argsort()]\n",
|
||||
"\n",
|
||||
"plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'r--')\n",
|
||||
"plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'bo')\n",
|
||||
"\n",
|
||||
"plt.xlabel('alpha', fontsize = 14)\n",
|
||||
"plt.ylabel('mean squared error', fontsize = 14)\n",
|
||||
"plt.title('MSE over alpha', fontsize = 16)\n",
|
||||
"\n",
|
||||
"# plot arrow\n",
|
||||
"plt.arrow(x = best_alpha, y = min_mse + 39, dx = 0, dy = -26, ls = '-', lw = 0.4,\n",
|
||||
" width = 0, head_width = .03, head_length = 8)\n",
|
||||
"\n",
|
||||
"# plot \"best run\" text\n",
|
||||
"plt.text(x = best_alpha - 0.08, y = min_mse + 50, s = 'Best Run', fontsize = 14)\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register the best model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Find the model file saved in the run record of best run."
|
||||
"---\n",
|
||||
"## Deploy\n",
|
||||
"Now that we have trained a set of models and identified the run containing the best model, we want to deploy the model for real time inferencing. The process of deploying a model involves\n",
|
||||
"* registering a model in your workspace\n",
|
||||
"* creating a scoring file containing init and run methods\n",
|
||||
"* creating an environment dependency file describing packages necessary for your scoring file\n",
|
||||
"* creating a docker image containing a properly described environment, your model, and your scoring file\n",
|
||||
"* deploying that docker image as a web service"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Register a model\n",
|
||||
"We have already identified which run contains the \"best model\" by our evaluation criteria. Each run has a file structure associated with it that contains various files collected during the run. Since a run can have many outputs we need to tell AML which file from those outputs represents the model that we want to use for our deployment. We can use the `run.get_file_names()` method to list the files associated with the run, and then use the `run.register_model()` method to place the model in the workspace's model registry.\n",
|
||||
"\n",
|
||||
"When using `run.register_model()` we supply a `model_name` that is meaningful for our scenario and the `model_path` of the model relative to the run. In this case, the model path is what is returned from `run.get_file_names()`"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -416,27 +334,11 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# View the files in the run\n",
|
||||
"for f in best_run.get_file_names():\n",
|
||||
" print(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can register this model in the model registry of the workspace"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"register model from history"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
" print(f)\n",
|
||||
" \n",
|
||||
"# Register the model with the workspace\n",
|
||||
"model = best_run.register_model(model_name='best_model', model_path='outputs/model.pkl')"
|
||||
]
|
||||
},
|
||||
@@ -444,7 +346,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Verify that the model has been registered properly. If you have done this several times you'd see the version number auto-increases each time."
|
||||
"Once a model is registered, it is accessible from the list of models on the AML workspace. If you register models with the same name multiple times, AML keeps a version history of those models for you. The `Model.list()` lists all models in a workspace, and can be filtered by name, tags, or model properties. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -457,8 +359,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Find all models called \"best_model\" and display their version numbers\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"models = Model.list(workspace=ws, name='best_model')\n",
|
||||
"models = Model.list(ws, name='best_model')\n",
|
||||
"for m in models:\n",
|
||||
" print(m.name, m.version)"
|
||||
]
|
||||
@@ -467,54 +370,22 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also download the registered model. Afterwards, you should see a `model.pkl` file in the current directory. You can then use it for local testing if you'd like."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"download file"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# remove the model file if it is already on disk\n",
|
||||
"if os.path.isfile('model.pkl'): \n",
|
||||
" os.remove('model.pkl')\n",
|
||||
"# download the model\n",
|
||||
"model.download(target_dir=\"./\")"
|
||||
"### Create a scoring file\n",
|
||||
"\n",
|
||||
"Since your model file can essentially be anything you want it to be, you need to supply a scoring script that can load your model and then apply the model to new data. This script is your 'scoring file'. This scoring file is a python program containing, at a minimum, two methods `init()` and `run()`. The `init()` method is called once when your deployment is started so you can load your model and any other required objects. This method uses the `get_model_path` function to locate the registered model inside the docker container. The `run()` method is called interactively when the web service is called with one or more data samples to predict.\n",
|
||||
"\n",
|
||||
"The scoring file used for this exercise is [here](score.py). \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Scoring script\n",
|
||||
"### Describe your environment\n",
|
||||
"\n",
|
||||
"Now we are ready to build a Docker image and deploy the model in it as a web service. The first step is creating the scoring script. For convenience, we have created the scoring script for you. It is printed below as text, but you can also run `%pfile ./score.py` in a cell to show the file.\n",
|
||||
"Each modelling process may require a unique set of packages. Therefore we need to create a dependency file providing instructions to AML on how to contstruct a docker image that can support the models and any other objects required for inferencing. In the following cell, we create a environment dependency file, *myenv.yml* that specifies which libraries are needed by the scoring script. You can create this file manually, or use the `CondaDependencies` class to create it for you.\n",
|
||||
"\n",
|
||||
"Tbe scoring script consists of two functions: `init` that is used to load the model to memory when starting the container, and `run` that makes the prediction when web service is called. Please pay special attention to how the model is loaded in the `init()` function. When Docker image is built for this model, the actual model file is downloaded and placed on disk, and `get_model_path` function returns the local path where the model is placed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('./score.py', 'r') as scoring_script:\n",
|
||||
" print(scoring_script.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create environment dependency file\n",
|
||||
"\n",
|
||||
"We need a environment dependency file `myenv.yml` to specify which libraries are needed by the scoring script when building the Docker image for web service deployment. We can manually create this file, or we can use the `CondaDependencies` API to automatically create this file."
|
||||
"Next we use this environment file to describe the docker container that we need to create in order to deploy our model. This container is created using our environment description and includes our scoring script."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -524,24 +395,33 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.conda_dependencies import CondaDependencies \n",
|
||||
"from azureml.core.image import ContainerImage\n",
|
||||
"\n",
|
||||
"myenv = CondaDependencies.create(conda_packages=[\"scikit-learn\"])\n",
|
||||
"print(myenv.serialize_to_string())\n",
|
||||
"# Create an empty conda environment and add the scikit-learn package\n",
|
||||
"env = CondaDependencies()\n",
|
||||
"env.add_conda_package(\"scikit-learn\")\n",
|
||||
"\n",
|
||||
"# Display the environment\n",
|
||||
"print(env.serialize_to_string())\n",
|
||||
"\n",
|
||||
"# Write the environment to disk\n",
|
||||
"with open(\"myenv.yml\",\"w\") as f:\n",
|
||||
" f.write(myenv.serialize_to_string())"
|
||||
" f.write(env.serialize_to_string())\n",
|
||||
"\n",
|
||||
"# Create a configuration object indicating how our deployment container needs to be created\n",
|
||||
"image_config = ContainerImage.image_configuration(execution_script=\"score.py\", \n",
|
||||
" runtime=\"python\", \n",
|
||||
" conda_file=\"myenv.yml\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deploy web service into an Azure Container Instance\n",
|
||||
"The deployment process takes the registered model and your scoring scrip, and builds a Docker image. It then deploys the Docker image into Azure Container Instance as a running container with an HTTP endpoint readying for scoring calls. Read more about [Azure Container Instance](https://azure.microsoft.com/en-us/services/container-instances/).\n",
|
||||
"### Describe your target compute\n",
|
||||
"In addition to the container, we also need to describe the type of compute we want to allocate for our webservice. In in this example we are using an [Azure Container Instance](https://azure.microsoft.com/en-us/services/container-instances/) which is a good choice for quick and cost-effective dev/test deployment scenarios. ACI instances require the number of cores you want to run and memory you need. Tags and descriptions are available for you to identify the instances in AML when viewing the Compute tab in the AML Portal.\n",
|
||||
"\n",
|
||||
"Note ACI is great for quick and cost-effective dev/test deployment scenarios. For production workloads, please use [Azure Kubernentes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/) instead. Please follow in struction in [this notebook](11.production-deploy-to-aks.ipynb) to see how that can be done from Azure ML.\n",
|
||||
" \n",
|
||||
"** Note: ** The web service creation can take 6-7 minutes."
|
||||
"For production workloads, it is better to use [Azure Kubernentes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/) instead. Try [this notebook](11.production-deploy-to-aks.ipynb) to see how that can be done from Azure ML.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -555,7 +435,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.webservice import AciWebservice, Webservice\n",
|
||||
"from azureml.core.webservice import AciWebservice\n",
|
||||
"\n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n",
|
||||
" memory_gb=1, \n",
|
||||
@@ -567,26 +447,22 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note the below `WebService.deploy_from_model()` function takes a model object registered under the workspace. It then bakes the model file in the Docker image so it can be looked-up using the `Model.get_model_path()` function in `score.py`. \n",
|
||||
"### Deploy your webservice\n",
|
||||
"The final step to deploying your webservice is to call `WebService.deploy_from_model()`. This function uses the deployment and image configurations created above to perform the following:\n",
|
||||
"* Build a docker image\n",
|
||||
"* Deploy to the docker image to an Azure Container Instance\n",
|
||||
"* Copy your model files to the Azure Container Instance\n",
|
||||
"* Call the `init()` function in your scoring file\n",
|
||||
"* Provide an HTTP endpoint for scoring calls\n",
|
||||
"\n",
|
||||
"If you have a local model file instead of a registered model object, you can also use the `WebService.deploy()` function which would register the model and then deploy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"deploy service",
|
||||
"aci"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.image import ContainerImage\n",
|
||||
"image_config = ContainerImage.image_configuration(execution_script=\"score.py\", \n",
|
||||
" runtime=\"python\", \n",
|
||||
" conda_file=\"myenv.yml\")"
|
||||
"The `deploy_from_model` method requires the following parameters\n",
|
||||
"* `workspace` - the workspace containing the service\n",
|
||||
"* `name` - a unique named used to identify the service in the workspace\n",
|
||||
"* `models` - an array of models to be deployed into the container\n",
|
||||
"* `image_config` - a configuration object describing the image environment\n",
|
||||
"* `deployment_config` - a configuration object describing the compute type\n",
|
||||
" \n",
|
||||
"**Note:** The web service creation can take several minutes. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -601,14 +477,16 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"# this will take 5-10 minutes to finish\n",
|
||||
"# you can also use \"az container list\" command to find the ACI being deployed\n",
|
||||
"from azureml.core.webservice import Webservice\n",
|
||||
"\n",
|
||||
"# Create the webservice using all of the precreated configurations and our best model\n",
|
||||
"service = Webservice.deploy_from_model(name='my-aci-svc',\n",
|
||||
" deployment_config=aciconfig,\n",
|
||||
" models=[model],\n",
|
||||
" image_config=image_config,\n",
|
||||
" workspace=ws)\n",
|
||||
"\n",
|
||||
"# Wait for the service deployment to complete while displaying log output\n",
|
||||
"service.wait_for_deployment(show_output=True)"
|
||||
]
|
||||
},
|
||||
@@ -617,28 +495,14 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"## Test web service"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"deploy service",
|
||||
"aci"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('web service is hosted in ACI:', service.scoring_uri)"
|
||||
"### Test your webservice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Use the `run` API to call the web service with one row of data to get a prediction."
|
||||
"Now that your web service is runing you can send JSON data directly to the service using the `run` method. This cell pulls the first test sample from the original dataset into JSON and then sends it to the service."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -653,8 +517,10 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"# score the first row from the test set.\n",
|
||||
"# scrape the first row from the test set.\n",
|
||||
"test_samples = json.dumps({\"data\": X_test[0:1, :].tolist()})\n",
|
||||
"\n",
|
||||
"#score on our service\n",
|
||||
"service.run(input_data = test_samples)"
|
||||
]
|
||||
},
|
||||
@@ -662,7 +528,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Feed the entire test set and calculate the errors (residual values)."
|
||||
"This cell shows how you can send multiple rows to the webservice at once. It then calculates the residuals - that is, the errors - by subtracting out the actual values from the results. These residuals are used later to show a plotted result."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -687,7 +553,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also send raw HTTP request to test the web service."
|
||||
"This cell shows how you can use the `service.scoring_uri` property to access the HTTP endpoint of the service and call it using standard POST operations."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -704,16 +570,14 @@
|
||||
"import requests\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"# 2 rows of input data, each with 10 made-up numerical features\n",
|
||||
"input_data = \"{\\\"data\\\": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]}\"\n",
|
||||
"# use the first row from the test set again\n",
|
||||
"test_samples = json.dumps({\"data\": X_test[0:1, :].tolist()})\n",
|
||||
"\n",
|
||||
"# create the required header\n",
|
||||
"headers = {'Content-Type':'application/json'}\n",
|
||||
"\n",
|
||||
"# for AKS deployment you'd need to the service key in the header as well\n",
|
||||
"# api_key = service.get_key()\n",
|
||||
"# headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)} \n",
|
||||
"\n",
|
||||
"resp = requests.post(service.scoring_uri, input_data, headers = headers)\n",
|
||||
"# post the request to the service and display the result\n",
|
||||
"resp = requests.post(service.scoring_uri, test_samples, headers = headers)\n",
|
||||
"print(resp.text)"
|
||||
]
|
||||
},
|
||||
@@ -721,8 +585,10 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Residual graph\n",
|
||||
"Plot a residual value graph to chart the errors on the entire test set. Observe the nice bell curve."
|
||||
"### Residual graph\n",
|
||||
"One way to understand the behavior of your model is to see how the data performs against data with known results. This cell uses matplotlib to create a histogram of the residual values, or errors, created from scoring the test samples.\n",
|
||||
"\n",
|
||||
"A good model should have residual values that cluster around 0 - that is, no error. Observing the resulting histogram can also show you if the model is skewed in any particular direction."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -731,6 +597,10 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw={'width_ratios':[3, 1], 'wspace':0, 'hspace': 0})\n",
|
||||
"f.suptitle('Residual Values', fontsize = 18)\n",
|
||||
"\n",
|
||||
@@ -753,14 +623,14 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Delete ACI to clean up"
|
||||
"### Clean up"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Deleting ACI is super fast!"
|
||||
"Delete the ACI instance to stop the compute and any associated billing."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -777,6 +647,36 @@
|
||||
"%%time\n",
|
||||
"service.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Next Steps"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this example, you created a series of models inside the notebook using local data, stored them inside an AML experiment, found the best one and deployed it as a live service! From here you can continue to use Azure Machine Learning in this regard to run your own experiments and deploy your own models, or you can expand into further capabilities of AML!\n",
|
||||
"\n",
|
||||
"If you have a model that is difficult to process locally, either because the data is remote or the model is large, try the [train-on-remote-vm](../train-on-remote-vm) notebook to learn about submitting remote jobs.\n",
|
||||
"\n",
|
||||
"If you want to take advantage of multiple cloud machines to perform large parameter sweeps try the [train-hyperparameter-tune-deploy-with-pytorch](../../training-with-deep-learning/train-hyperparameter-tune-deploy-with-pytorch\n",
|
||||
") sample.\n",
|
||||
"\n",
|
||||
"If you want to deploy models to a production cluster try the [production-deploy-to-aks](../../deployment/production-deploy-to-aks\n",
|
||||
") notebook."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -786,7 +686,7 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python [Python 3.6]",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user