diff --git a/01.getting-started/01.train-within-notebook/01.train-within-notebook.ipynb b/01.getting-started/01.train-within-notebook/01.train-within-notebook.ipynb index 0acf1392..159d16c8 100644 --- a/01.getting-started/01.train-within-notebook/01.train-within-notebook.ipynb +++ b/01.getting-started/01.train-within-notebook/01.train-within-notebook.ipynb @@ -1,816 +1,809 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 01. Train in the Notebook & Deploy Model to ACI\n", - "\n", - "* Load workspace\n", - "* Train a simple regression model directly in the Notebook python kernel\n", - "* Record run history\n", - "* Find the best model in run history and download it.\n", - "* Deploy the model as an Azure Container Instance (ACI)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "1. Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't. \n", - "\n", - "2. Install following pre-requisite libraries to your conda environment and restart notebook.\n", - "```shell\n", - "(myenv) $ conda install -y matplotlib tqdm scikit-learn\n", - "```\n", - "\n", - "3. Check that ACI is registered for your Azure Subscription. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!az provider show -n Microsoft.ContainerInstance -o table" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If ACI is not registered, run following command to register it. Note that you have to be a subscription owner, or this command will fail." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!az provider register -n Microsoft.ContainerInstance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Validate Azure ML SDK installation and get version number for debugging purposes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "install" - ] - }, - "outputs": [], - "source": [ - "from azureml.core import Experiment, Run, Workspace\n", - "import azureml.core\n", - "\n", - "# Check core SDK version number\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Workspace\n", - "\n", - "Initialize a workspace object from persisted configuration." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "create workspace" - ] - }, - "outputs": [], - "source": [ - "ws = Workspace.from_config()\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set experiment name\n", - "Choose a name for experiment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "experiment_name = 'train-in-notebook'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Start a training run in local Notebook" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load diabetes dataset, a well-known small dataset that comes with scikit-learn\n", - "from sklearn.datasets import load_diabetes\n", - "from sklearn.linear_model import Ridge\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.externals import joblib\n", - "\n", - "X, y = load_diabetes(return_X_y = True)\n", - "columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", - "data = {\n", - " \"train\":{\"X\": X_train, \"y\": y_train}, \n", - " \"test\":{\"X\": X_test, \"y\": y_test}\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Train a simple Ridge model\n", - "Train a very simple Ridge regression model in scikit-learn, and save it as a pickle file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "reg = Ridge(alpha = 0.03)\n", - "reg.fit(X=data['train']['X'], y=data['train']['y'])\n", - "preds = reg.predict(data['test']['X'])\n", - "print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))\n", - "joblib.dump(value=reg, filename='model.pkl');" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Add experiment tracking\n", - "Now, let's add Azure ML experiment logging, and upload persisted model into run record as well." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "local run", - "outputs upload" - ] - }, - "outputs": [], - "source": [ - "experiment = Experiment(workspace=ws, name=experiment_name)\n", - "run = experiment.start_logging()\n", - "\n", - "run.tag(\"Description\",\"My first run!\")\n", - "run.log('alpha', 0.03)\n", - "reg = Ridge(alpha=0.03)\n", - "reg.fit(data['train']['X'], data['train']['y'])\n", - "preds = reg.predict(data['test']['X'])\n", - "run.log('mse', mean_squared_error(data['test']['y'], preds))\n", - "joblib.dump(value=reg, filename='model.pkl')\n", - "run.upload_file(name='outputs/model.pkl', path_or_stream='./model.pkl')\n", - "\n", - "run.complete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can browse to the recorded run. Please make sure you use Chrome to navigate the run history page." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Simple parameter sweep\n", - "Sweep over alpha values of a sklearn ridge model, and capture metrics and trained model in the Azure ML experiment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import os\n", - "from tqdm import tqdm\n", - "\n", - "model_name = \"model.pkl\"\n", - "\n", - "# list of numbers from 0 to 1.0 with a 0.05 interval\n", - "alphas = np.arange(0.0, 1.0, 0.05)\n", - "\n", - "# try a bunch of alpha values in a Linear Regression (Ridge) model\n", - "for alpha in tqdm(alphas):\n", - " # create a bunch of runs, each train a model with a different alpha value\n", - " with experiment.start_logging() as run:\n", - " # Use Ridge algorithm to build a regression model\n", - " reg = Ridge(alpha=alpha)\n", - " reg.fit(X=data[\"train\"][\"X\"], y=data[\"train\"][\"y\"])\n", - " preds = reg.predict(X=data[\"test\"][\"X\"])\n", - " mse = mean_squared_error(y_true=data[\"test\"][\"y\"], y_pred=preds)\n", - "\n", - " # log alpha, mean_squared_error and feature names in run history\n", - " run.log(name=\"alpha\", value=alpha)\n", - " run.log(name=\"mse\", value=mse)\n", - " run.log_list(name=\"columns\", value=columns)\n", - "\n", - " with open(model_name, \"wb\") as file:\n", - " joblib.dump(value=reg, filename=file)\n", - " \n", - " # upload the serialized model into run history record\n", - " run.upload_file(name=\"outputs/\" + model_name, path_or_stream=model_name)\n", - "\n", - " # now delete the serialized model from local folder since it is already uploaded to run history \n", - " os.remove(path=model_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# now let's take a look at the experiment in Azure portal.\n", - "experiment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Select best model from the experiment\n", - "Load all experiment run metrics recursively from the experiment into a dictionary object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "runs = {}\n", - "run_metrics = {}\n", - "\n", - "for r in tqdm(experiment.get_runs()):\n", - " metrics = r.get_metrics()\n", - " if 'mse' in metrics.keys():\n", - " runs[r.id] = r\n", - " run_metrics[r.id] = metrics" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now find the run with the lowest Mean Squared Error value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_run_id = min(run_metrics, key = lambda k: run_metrics[k]['mse'])\n", - "best_run = runs[best_run_id]\n", - "print('Best run is:', best_run_id)\n", - "print('Metrics:', run_metrics[best_run_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can add tags to your runs to make them easier to catalog" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "query history" - ] - }, - "outputs": [], - "source": [ - "best_run.tag(key=\"Description\", value=\"The best one\")\n", - "best_run.get_tags()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot MSE over alpha\n", - "\n", - "Let's observe the best model visually by plotting the MSE values over alpha values:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "\n", - "best_alpha = run_metrics[best_run_id]['alpha']\n", - "min_mse = run_metrics[best_run_id]['mse']\n", - "\n", - "alpha_mse = np.array([(run_metrics[k]['alpha'], run_metrics[k]['mse']) for k in run_metrics.keys()])\n", - "sorted_alpha_mse = alpha_mse[alpha_mse[:,0].argsort()]\n", - "\n", - "plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'r--')\n", - "plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'bo')\n", - "\n", - "plt.xlabel('alpha', fontsize = 14)\n", - "plt.ylabel('mean squared error', fontsize = 14)\n", - "plt.title('MSE over alpha', fontsize = 16)\n", - "\n", - "# plot arrow\n", - "plt.arrow(x = best_alpha, y = min_mse + 39, dx = 0, dy = -26, ls = '-', lw = 0.4,\n", - " width = 0, head_width = .03, head_length = 8)\n", - "\n", - "# plot \"best run\" text\n", - "plt.text(x = best_alpha - 0.08, y = min_mse + 50, s = 'Best Run', fontsize = 14)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Register the best model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Find the model file saved in the run record of best run." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "query history" - ] - }, - "outputs": [], - "source": [ - "for f in best_run.get_file_names():\n", - " print(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can register this model in the model registry of the workspace" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "register model from history" - ] - }, - "outputs": [], - "source": [ - "model = best_run.register_model(model_name='best_model', model_path='outputs/model.pkl')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Verify that the model has been registered properly. If you have done this several times you'd see the version number auto-increases each time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "register model from history" - ] - }, - "outputs": [], - "source": [ - "from azureml.core.model import Model\n", - "models = Model.list(workspace=ws, name='best_model')\n", - "for m in models:\n", - " print(m.name, m.version)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also download the registered model. Afterwards, you should see a `model.pkl` file in the current directory. You can then use it for local testing if you'd like." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "download file" - ] - }, - "outputs": [], - "source": [ - "# remove the model file if it is already on disk\n", - "if os.path.isfile('model.pkl'): \n", - " os.remove('model.pkl')\n", - "# download the model\n", - "model.download(target_dir=\"./\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Scoring script\n", - "\n", - "Now we are ready to build a Docker image and deploy the model in it as a web service. The first step is creating the scoring script. For convenience, we have created the scoring script for you. It is printed below as text, but you can also run `%pfile ./score.py` in a cell to show the file.\n", - "\n", - "Tbe scoring script consists of two functions: `init` that is used to load the model to memory when starting the container, and `run` that makes the prediction when web service is called. Please pay special attention to how the model is loaded in the `init()` function. When Docker image is built for this model, the actual model file is downloaded and placed on disk, and `get_model_path` function returns the local path where the model is placed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('./score.py', 'r') as scoring_script:\n", - " print(scoring_script.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create environment dependency file\n", - "\n", - "We need a environment dependency file `myenv.yml` to specify which libraries are needed by the scoring script when building the Docker image for web service deployment. We can manually create this file, or we can use the `CondaDependencies` API to automatically create this file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.conda_dependencies import CondaDependencies \n", - "\n", - "myenv = CondaDependencies()\n", - "myenv.add_conda_package(\"scikit-learn\")\n", - "print(myenv.serialize_to_string())\n", - "\n", - "with open(\"myenv.yml\",\"w\") as f:\n", - " f.write(myenv.serialize_to_string())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy web service into an Azure Container Instance\n", - "The deployment process takes the registered model and your scoring scrip, and builds a Docker image. It then deploys the Docker image into Azure Container Instance as a running container with an HTTP endpoint readying for scoring calls. Read more about [Azure Container Instance](https://azure.microsoft.com/en-us/services/container-instances/).\n", - "\n", - "Note ACI is great for quick and cost-effective dev/test deployment scenarios. For production workloads, please use [Azure Kubernentes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/) instead. Please follow in struction in [this notebook](11.production-deploy-to-aks.ipynb) to see how that can be done from Azure ML.\n", - " \n", - "** Note: ** The web service creation can take 6-7 minutes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "from azureml.core.webservice import AciWebservice, Webservice\n", - "\n", - "aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n", - " memory_gb=1, \n", - " tags={'sample name': 'AML 101'}, \n", - " description='This is a great example.')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note the below `WebService.deploy_from_model()` function takes a model object registered under the workspace. It then bakes the model file in the Docker image so it can be looked-up using the `Model.get_model_path()` function in `score.py`. \n", - "\n", - "If you have a local model file instead of a registered model object, you can also use the `WebService.deploy()` function which would register the model and then deploy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "from azureml.core.image import ContainerImage\n", - "image_config = ContainerImage.image_configuration(execution_script=\"score.py\", \n", - " runtime=\"python\", \n", - " conda_file=\"myenv.yml\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "%%time\n", - "# this will take 5-10 minutes to finish\n", - "# you can also use \"az container list\" command to find the ACI being deployed\n", - "service = Webservice.deploy_from_model(name='my-aci-svc',\n", - " deployment_config=aciconfig,\n", - " models=[model],\n", - " image_config=image_config,\n", - " workspace=ws)\n", - "\n", - "service.wait_for_deployment(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test web service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "print('web service is hosted in ACI:', service.scoring_uri)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the `run` API to call the web service with one row of data to get a prediction." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "import json\n", - "# score the first row from the test set.\n", - "test_samples = json.dumps({\"data\": X_test[0:1, :].tolist()})\n", - "service.run(input_data = test_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Feed the entire test set and calculate the errors (residual values)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "# score the entire test set.\n", - "test_samples = json.dumps({'data': X_test.tolist()})\n", - "\n", - "result = json.loads(service.run(input_data = test_samples))['result']\n", - "residual = result - y_test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also send raw HTTP request to test the web service." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "\n", - "# 2 rows of input data, each with 10 made-up numerical features\n", - "input_data = \"{\\\"data\\\": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]}\"\n", - "\n", - "headers = {'Content-Type':'application/json'}\n", - "\n", - "# for AKS deployment you'd need to the service key in the header as well\n", - "# api_key = service.get_key()\n", - "# headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)} \n", - "\n", - "resp = requests.post(service.scoring_uri, input_data, headers = headers)\n", - "print(resp.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Residual graph\n", - "Plot a residual value graph to chart the errors on the entire test set. Observe the nice bell curve." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f, (a0, a1) = plt.subplots(1, 2, gridspec_kw={'width_ratios':[3, 1], 'wspace':0, 'hspace': 0})\n", - "f.suptitle('Residual Values', fontsize = 18)\n", - "\n", - "f.set_figheight(6)\n", - "f.set_figwidth(14)\n", - "\n", - "a0.plot(residual, 'bo', alpha=0.4);\n", - "a0.plot([0,90], [0,0], 'r', lw=2)\n", - "a0.set_ylabel('residue values', fontsize=14)\n", - "a0.set_xlabel('test data set', fontsize=14)\n", - "\n", - "a1.hist(residual, orientation='horizontal', color='blue', bins=10, histtype='step');\n", - "a1.hist(residual, orientation='horizontal', color='blue', alpha=0.2, bins=10);\n", - "a1.set_yticklabels([])\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Delete ACI to clean up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Deleting ACI is super fast!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "deploy service", - "aci" - ] - }, - "outputs": [], - "source": [ - "%%time\n", - "service.delete()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "authors": [ - { - "name": "roastala" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 01. Train in the Notebook & Deploy Model to ACI\n", + "\n", + "* Load workspace\n", + "* Train a simple regression model directly in the Notebook python kernel\n", + "* Record run history\n", + "* Find the best model in run history and download it.\n", + "* Deploy the model as an Azure Container Instance (ACI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "1. Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't. \n", + "\n", + "2. Install following pre-requisite libraries to your conda environment and restart notebook.\n", + "```shell\n", + "(myenv) $ conda install -y matplotlib tqdm scikit-learn\n", + "```\n", + "\n", + "3. Check that ACI is registered for your Azure Subscription. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!az provider show -n Microsoft.ContainerInstance -o table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If ACI is not registered, run following command to register it. Note that you have to be a subscription owner, or this command will fail." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!az provider register -n Microsoft.ContainerInstance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validate Azure ML SDK installation and get version number for debugging purposes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "install" + ] + }, + "outputs": [], + "source": [ + "from azureml.core import Experiment, Run, Workspace\n", + "import azureml.core\n", + "\n", + "# Check core SDK version number\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Workspace\n", + "\n", + "Initialize a workspace object from persisted configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "create workspace" + ] + }, + "outputs": [], + "source": [ + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set experiment name\n", + "Choose a name for experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "experiment_name = 'train-in-notebook'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start a training run in local Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load diabetes dataset, a well-known small dataset that comes with scikit-learn\n", + "from sklearn.datasets import load_diabetes\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.externals import joblib\n", + "\n", + "X, y = load_diabetes(return_X_y = True)\n", + "columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", + "data = {\n", + " \"train\":{\"X\": X_train, \"y\": y_train}, \n", + " \"test\":{\"X\": X_test, \"y\": y_test}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train a simple Ridge model\n", + "Train a very simple Ridge regression model in scikit-learn, and save it as a pickle file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reg = Ridge(alpha = 0.03)\n", + "reg.fit(X=data['train']['X'], y=data['train']['y'])\n", + "preds = reg.predict(data['test']['X'])\n", + "print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))\n", + "joblib.dump(value=reg, filename='model.pkl');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add experiment tracking\n", + "Now, let's add Azure ML experiment logging, and upload persisted model into run record as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "local run", + "outputs upload" + ] + }, + "outputs": [], + "source": [ + "experiment = Experiment(workspace=ws, name=experiment_name)\n", + "run = experiment.start_logging()\n", + "\n", + "run.tag(\"Description\",\"My first run!\")\n", + "run.log('alpha', 0.03)\n", + "reg = Ridge(alpha=0.03)\n", + "reg.fit(data['train']['X'], data['train']['y'])\n", + "preds = reg.predict(data['test']['X'])\n", + "run.log('mse', mean_squared_error(data['test']['y'], preds))\n", + "joblib.dump(value=reg, filename='model.pkl')\n", + "run.upload_file(name='outputs/model.pkl', path_or_stream='./model.pkl')\n", + "\n", + "run.complete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can browse to the recorded run. Please make sure you use Chrome to navigate the run history page." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simple parameter sweep\n", + "Sweep over alpha values of a sklearn ridge model, and capture metrics and trained model in the Azure ML experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "from tqdm import tqdm\n", + "\n", + "model_name = \"model.pkl\"\n", + "\n", + "# list of numbers from 0 to 1.0 with a 0.05 interval\n", + "alphas = np.arange(0.0, 1.0, 0.05)\n", + "\n", + "# try a bunch of alpha values in a Linear Regression (Ridge) model\n", + "for alpha in tqdm(alphas):\n", + " # create a bunch of runs, each train a model with a different alpha value\n", + " with experiment.start_logging() as run:\n", + " # Use Ridge algorithm to build a regression model\n", + " reg = Ridge(alpha=alpha)\n", + " reg.fit(X=data[\"train\"][\"X\"], y=data[\"train\"][\"y\"])\n", + " preds = reg.predict(X=data[\"test\"][\"X\"])\n", + " mse = mean_squared_error(y_true=data[\"test\"][\"y\"], y_pred=preds)\n", + "\n", + " # log alpha, mean_squared_error and feature names in run history\n", + " run.log(name=\"alpha\", value=alpha)\n", + " run.log(name=\"mse\", value=mse)\n", + " run.log_list(name=\"columns\", value=columns)\n", + "\n", + " with open(model_name, \"wb\") as file:\n", + " joblib.dump(value=reg, filename=file)\n", + " \n", + " # upload the serialized model into run history record\n", + " run.upload_file(name=\"outputs/\" + model_name, path_or_stream=model_name)\n", + "\n", + " # now delete the serialized model from local folder since it is already uploaded to run history \n", + " os.remove(path=model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# now let's take a look at the experiment in Azure portal.\n", + "experiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select best model from the experiment\n", + "Load all experiment run metrics recursively from the experiment into a dictionary object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "runs = {}\n", + "run_metrics = {}\n", + "\n", + "for r in tqdm(experiment.get_runs()):\n", + " metrics = r.get_metrics()\n", + " if 'mse' in metrics.keys():\n", + " runs[r.id] = r\n", + " run_metrics[r.id] = metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now find the run with the lowest Mean Squared Error value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_run_id = min(run_metrics, key = lambda k: run_metrics[k]['mse'])\n", + "best_run = runs[best_run_id]\n", + "print('Best run is:', best_run_id)\n", + "print('Metrics:', run_metrics[best_run_id])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can add tags to your runs to make them easier to catalog" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "query history" + ] + }, + "outputs": [], + "source": [ + "best_run.tag(key=\"Description\", value=\"The best one\")\n", + "best_run.get_tags()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot MSE over alpha\n", + "\n", + "Let's observe the best model visually by plotting the MSE values over alpha values:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "best_alpha = run_metrics[best_run_id]['alpha']\n", + "min_mse = run_metrics[best_run_id]['mse']\n", + "\n", + "alpha_mse = np.array([(run_metrics[k]['alpha'], run_metrics[k]['mse']) for k in run_metrics.keys()])\n", + "sorted_alpha_mse = alpha_mse[alpha_mse[:,0].argsort()]\n", + "\n", + "plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'r--')\n", + "plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'bo')\n", + "\n", + "plt.xlabel('alpha', fontsize = 14)\n", + "plt.ylabel('mean squared error', fontsize = 14)\n", + "plt.title('MSE over alpha', fontsize = 16)\n", + "\n", + "# plot arrow\n", + "plt.arrow(x = best_alpha, y = min_mse + 39, dx = 0, dy = -26, ls = '-', lw = 0.4,\n", + " width = 0, head_width = .03, head_length = 8)\n", + "\n", + "# plot \"best run\" text\n", + "plt.text(x = best_alpha - 0.08, y = min_mse + 50, s = 'Best Run', fontsize = 14)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Register the best model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find the model file saved in the run record of best run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "query history" + ] + }, + "outputs": [], + "source": [ + "for f in best_run.get_file_names():\n", + " print(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can register this model in the model registry of the workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "register model from history" + ] + }, + "outputs": [], + "source": [ + "model = best_run.register_model(model_name='best_model', model_path='outputs/model.pkl')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Verify that the model has been registered properly. If you have done this several times you'd see the version number auto-increases each time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "register model from history" + ] + }, + "outputs": [], + "source": [ + "from azureml.core.model import Model\n", + "models = Model.list(workspace=ws, name='best_model')\n", + "for m in models:\n", + " print(m.name, m.version)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also download the registered model. Afterwards, you should see a `model.pkl` file in the current directory. You can then use it for local testing if you'd like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "download file" + ] + }, + "outputs": [], + "source": [ + "# remove the model file if it is already on disk\n", + "if os.path.isfile('model.pkl'): \n", + " os.remove('model.pkl')\n", + "# download the model\n", + "model.download(target_dir=\"./\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scoring script\n", + "\n", + "Now we are ready to build a Docker image and deploy the model in it as a web service. The first step is creating the scoring script. For convenience, we have created the scoring script for you. It is printed below as text, but you can also run `%pfile ./score.py` in a cell to show the file.\n", + "\n", + "Tbe scoring script consists of two functions: `init` that is used to load the model to memory when starting the container, and `run` that makes the prediction when web service is called. Please pay special attention to how the model is loaded in the `init()` function. When Docker image is built for this model, the actual model file is downloaded and placed on disk, and `get_model_path` function returns the local path where the model is placed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('./score.py', 'r') as scoring_script:\n", + " print(scoring_script.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create environment dependency file\n", + "\n", + "We need a environment dependency file `myenv.yml` to specify which libraries are needed by the scoring script when building the Docker image for web service deployment. We can manually create this file, or we can use the `CondaDependencies` API to automatically create this file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.conda_dependencies import CondaDependencies \n", + "\n", + "myenv = CondaDependencies()\n", + "myenv.add_conda_package(\"scikit-learn\")\n", + "print(myenv.serialize_to_string())\n", + "\n", + "with open(\"myenv.yml\",\"w\") as f:\n", + " f.write(myenv.serialize_to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy web service into an Azure Container Instance\n", + "The deployment process takes the registered model and your scoring scrip, and builds a Docker image. It then deploys the Docker image into Azure Container Instance as a running container with an HTTP endpoint readying for scoring calls. Read more about [Azure Container Instance](https://azure.microsoft.com/en-us/services/container-instances/).\n", + "\n", + "Note ACI is great for quick and cost-effective dev/test deployment scenarios. For production workloads, please use [Azure Kubernentes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/) instead. Please follow in struction in [this notebook](11.production-deploy-to-aks.ipynb) to see how that can be done from Azure ML.\n", + " \n", + "** Note: ** The web service creation can take 6-7 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "from azureml.core.webservice import AciWebservice, Webservice\n", + "\n", + "aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n", + " memory_gb=1, \n", + " tags={'sample name': 'AML 101'}, \n", + " description='This is a great example.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note the below `WebService.deploy_from_model()` function takes a model object registered under the workspace. It then bakes the model file in the Docker image so it can be looked-up using the `Model.get_model_path()` function in `score.py`. \n", + "\n", + "If you have a local model file instead of a registered model object, you can also use the `WebService.deploy()` function which would register the model and then deploy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "from azureml.core.image import ContainerImage\n", + "image_config = ContainerImage.image_configuration(execution_script=\"score.py\", \n", + " runtime=\"python\", \n", + " conda_file=\"myenv.yml\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "%%time\n", + "# this will take 5-10 minutes to finish\n", + "# you can also use \"az container list\" command to find the ACI being deployed\n", + "service = Webservice.deploy_from_model(name='my-aci-svc',\n", + " deployment_config=aciconfig,\n", + " models=[model],\n", + " image_config=image_config,\n", + " workspace=ws)\n", + "\n", + "service.wait_for_deployment(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Test web service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "print('web service is hosted in ACI:', service.scoring_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the `run` API to call the web service with one row of data to get a prediction." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "import json\n", + "# score the first row from the test set.\n", + "test_samples = json.dumps({\"data\": X_test[0:1, :].tolist()})\n", + "service.run(input_data = test_samples)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Feed the entire test set and calculate the errors (residual values)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "# score the entire test set.\n", + "test_samples = json.dumps({'data': X_test.tolist()})\n", + "\n", + "result = json.loads(service.run(input_data = test_samples))\n", + "residual = result - y_test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also send raw HTTP request to test the web service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "import requests\n", + "import json\n", + "\n", + "# 2 rows of input data, each with 10 made-up numerical features\n", + "input_data = \"{\\\"data\\\": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]}\"\n", + "\n", + "headers = {'Content-Type':'application/json'}\n", + "\n", + "# for AKS deployment you'd need to the service key in the header as well\n", + "# api_key = service.get_key()\n", + "# headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)} \n", + "\n", + "resp = requests.post(service.scoring_uri, input_data, headers = headers)\n", + "print(resp.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Residual graph\n", + "Plot a residual value graph to chart the errors on the entire test set. Observe the nice bell curve." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f, (a0, a1) = plt.subplots(1, 2, gridspec_kw={'width_ratios':[3, 1], 'wspace':0, 'hspace': 0})\n", + "f.suptitle('Residual Values', fontsize = 18)\n", + "\n", + "f.set_figheight(6)\n", + "f.set_figwidth(14)\n", + "\n", + "a0.plot(residual, 'bo', alpha=0.4);\n", + "a0.plot([0,90], [0,0], 'r', lw=2)\n", + "a0.set_ylabel('residue values', fontsize=14)\n", + "a0.set_xlabel('test data set', fontsize=14)\n", + "\n", + "a1.hist(residual, orientation='horizontal', color='blue', bins=10, histtype='step');\n", + "a1.hist(residual, orientation='horizontal', color='blue', alpha=0.2, bins=10);\n", + "a1.set_yticklabels([])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Delete ACI to clean up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Deleting ACI is super fast!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "deploy service", + "aci" + ] + }, + "outputs": [], + "source": [ + "%%time\n", + "service.delete()" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "roastala" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/01.getting-started/01.train-within-notebook/score.py b/01.getting-started/01.train-within-notebook/score.py index 16661594..b0b95ac4 100644 --- a/01.getting-started/01.train-within-notebook/score.py +++ b/01.getting-started/01.train-within-notebook/score.py @@ -21,7 +21,9 @@ def run(raw_data): data = json.loads(raw_data)['data'] data = np.array(data) result = model.predict(data) - return json.dumps({"result": result.tolist()}) + + # you can return any data type as long as it is JSON-serializable + return result.tolist() except Exception as e: result = str(e) - return json.dumps({"error": result}) + return result diff --git a/01.getting-started/02.train-on-local/.gitignore b/01.getting-started/02.train-on-local/.gitignore new file mode 100644 index 00000000..b1c8e444 --- /dev/null +++ b/01.getting-started/02.train-on-local/.gitignore @@ -0,0 +1 @@ +/samples/ diff --git a/01.getting-started/02.train-on-local/02.train-on-local.ipynb b/01.getting-started/02.train-on-local/02.train-on-local.ipynb index 97f076c6..0bacd35b 100644 --- a/01.getting-started/02.train-on-local/02.train-on-local.ipynb +++ b/01.getting-started/02.train-on-local/02.train-on-local.ipynb @@ -1,470 +1,477 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 02. Train locally\n", - "* Create or load workspace.\n", - "* Create scripts locally.\n", - "* Create `train.py` in a folder, along with a `my.lib` file.\n", - "* Configure & execute a local run in a user-managed Python environment.\n", - "* Configure & execute a local run in a system-managed Python environment.\n", - "* Configure & execute a local run in a Docker environment.\n", - "* Query run metrics to find the best model\n", - "* Register model for operationalization." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check core SDK version number\n", - "import azureml.core\n", - "\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Workspace\n", - "\n", - "Initialize a workspace object from persisted configuration." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.workspace import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create An Experiment\n", - "**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import Experiment\n", - "experiment_name = 'train-on-local'\n", - "exp = Experiment(workspace=ws, name=experiment_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View `train.py`\n", - "\n", - "`train.py` is already created for you." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('./train.py', 'r') as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note `train.py` also references a `mylib.py` file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('./mylib.py', 'r') as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure & Run\n", - "### User-managed environment\n", - "Below, we use a user-managed run, which means you are responsible to ensure all the necessary packages are available in the Python environment you choose to run the script." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import RunConfiguration\n", - "\n", - "# Editing a run configuration property on-fly.\n", - "run_config_user_managed = RunConfiguration()\n", - "\n", - "run_config_user_managed.environment.python.user_managed_dependencies = True\n", - "\n", - "# You can choose a specific Python environment by pointing to a Python path \n", - "#run_config.environment.python.interpreter_path = '/home/johndoe/miniconda3/envs/sdk2/bin/python'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Submit script to run in the user-managed environment\n", - "Note whole script folder is submitted for execution, including the `mylib.py` file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import ScriptRunConfig\n", - "\n", - "src = ScriptRunConfig(source_directory='./', script='train.py', run_config=run_config_user_managed)\n", - "run = exp.submit(src)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Get run history details" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Block to wait till run finishes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### System-managed environment\n", - "You can also ask the system to build a new conda environment and execute your scripts in it. The environment is built once and will be reused in subsequent executions as long as the conda dependencies remain unchanged. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import RunConfiguration\n", - "from azureml.core.conda_dependencies import CondaDependencies\n", - "\n", - "run_config_system_managed = RunConfiguration()\n", - "\n", - "run_config_system_managed.environment.python.user_managed_dependencies = False\n", - "run_config_system_managed.auto_prepare_environment = True\n", - "\n", - "# Specify conda dependencies with scikit-learn\n", - "cd = CondaDependencies.create(conda_packages=['scikit-learn'])\n", - "run_config_system_managed.environment.python.conda_dependencies = cd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Submit script to run in the system-managed environment\n", - "A new conda environment is built based on the conda dependencies object. If you are running this for the first time, this might take up to 5 mninutes. But this conda environment is reused so long as you don't change the conda dependencies." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "src = ScriptRunConfig(source_directory=\"./\", script='train.py', run_config=run_config_system_managed)\n", - "run = exp.submit(src)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Get run history details" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Block and wait till run finishes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Docker-based execution\n", - "**IMPORTANT**: You must have Docker engine installed locally in order to use this execution mode. If your kernel is already running in a Docker container, such as **Azure Notebooks**, this mode will **NOT** work.\n", - "\n", - "You can also ask the system to pull down a Docker image and execute your scripts in it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import RunConfiguration\n", - "from azureml.core.conda_dependencies import CondaDependencies\n", - "\n", - "run_config_docker = RunConfiguration()\n", - "\n", - "run_config_docker.environment.python.user_managed_dependencies = False\n", - "run_config_docker.auto_prepare_environment = True\n", - "run_config_docker.environment.docker.enabled = True\n", - "run_config_docker.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n", - "\n", - "# Specify conda dependencies with scikit-learn\n", - "cd = CondaDependencies.create(conda_packages=['scikit-learn'])\n", - "run_config_docker.environment.python.conda_dependencies = cd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Submit script to run in the system-managed environment\n", - "A new conda environment is built based on the conda dependencies object. If you are running this for the first time, this might take up to 5 mninutes. But this conda environment is reused so long as you don't change the conda dependencies.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "src = ScriptRunConfig(source_directory=\"./\", script='train.py', run_config=run_config_docker)\n", - "run = exp.submit(src)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Get run history details\n", - "run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Query run metrics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "query history", - "get metrics" - ] - }, - "outputs": [], - "source": [ - "# get all metris logged in the run\n", - "run.get_metrics()\n", - "metrics = run.get_metrics()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's find the model that has the lowest MSE value logged." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "best_alpha = metrics['alpha'][np.argmin(metrics['mse'])]\n", - "\n", - "print('When alpha is {1:0.2f}, we have min MSE {0:0.2f}.'.format(\n", - " min(metrics['mse']), \n", - " best_alpha\n", - "))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also list all the files that are associated with this run record" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.get_file_names()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We know the model `ridge_0.40.pkl` is the best performing model from the eariler queries. So let's register it with the workspace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# supply a model name, and the full path to the serialized model file.\n", - "model = run.register_model(model_name='best_ridge_model', model_path='./outputs/ridge_0.40.pkl')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(model.name, model.version, model.url)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now you can deploy this model following the example in the 01 notebook." - ] - } - ], - "metadata": { - "authors": [ - { - "name": "roastala" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 02. Train locally\n", + "* Create or load workspace.\n", + "* Create scripts locally.\n", + "* Create `train.py` in a folder, along with a `my.lib` file.\n", + "* Configure & execute a local run in a user-managed Python environment.\n", + "* Configure & execute a local run in a system-managed Python environment.\n", + "* Configure & execute a local run in a Docker environment.\n", + "* Query run metrics to find the best model\n", + "* Register model for operationalization." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Workspace\n", + "\n", + "Initialize a workspace object from persisted configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create An Experiment\n", + "**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "experiment_name = 'train-on-local'\n", + "exp = Experiment(workspace=ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View `train.py`\n", + "\n", + "`train.py` is already created for you." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('./train.py', 'r') as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note `train.py` also references a `mylib.py` file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('./mylib.py', 'r') as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure & Run\n", + "### User-managed environment\n", + "Below, we use a user-managed run, which means you are responsible to ensure all the necessary packages are available in the Python environment you choose to run the script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "\n", + "# Editing a run configuration property on-fly.\n", + "run_config_user_managed = RunConfiguration()\n", + "\n", + "run_config_user_managed.environment.python.user_managed_dependencies = True\n", + "\n", + "# You can choose a specific Python environment by pointing to a Python path \n", + "#run_config.environment.python.interpreter_path = '/home/johndoe/miniconda3/envs/sdk2/bin/python'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Submit script to run in the user-managed environment\n", + "Note whole script folder is submitted for execution, including the `mylib.py` file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import ScriptRunConfig\n", + "\n", + "src = ScriptRunConfig(source_directory='./', script='train.py', run_config=run_config_user_managed)\n", + "run = exp.submit(src)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get run history details" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Block to wait till run finishes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### System-managed environment\n", + "You can also ask the system to build a new conda environment and execute your scripts in it. The environment is built once and will be reused in subsequent executions as long as the conda dependencies remain unchanged. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "run_config_system_managed = RunConfiguration()\n", + "\n", + "run_config_system_managed.environment.python.user_managed_dependencies = False\n", + "run_config_system_managed.auto_prepare_environment = True\n", + "\n", + "# Specify conda dependencies with scikit-learn\n", + "cd = CondaDependencies.create(conda_packages=['scikit-learn'])\n", + "run_config_system_managed.environment.python.conda_dependencies = cd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Submit script to run in the system-managed environment\n", + "A new conda environment is built based on the conda dependencies object. If you are running this for the first time, this might take up to 5 mninutes. But this conda environment is reused so long as you don't change the conda dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "src = ScriptRunConfig(source_directory=\"./\", script='train.py', run_config=run_config_system_managed)\n", + "run = exp.submit(src)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get run history details" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Block and wait till run finishes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Docker-based execution\n", + "**IMPORTANT**: You must have Docker engine installed locally in order to use this execution mode. If your kernel is already running in a Docker container, such as **Azure Notebooks**, this mode will **NOT** work.\n", + "\n", + "You can also ask the system to pull down a Docker image and execute your scripts in it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_config_docker = RunConfiguration()\n", + "run_config_docker.environment.python.user_managed_dependencies = False\n", + "run_config_docker.auto_prepare_environment = True\n", + "run_config_docker.environment.docker.enabled = True\n", + "run_config_docker.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n", + "\n", + "# Specify conda dependencies with scikit-learn\n", + "cd = CondaDependencies.create(conda_packages=['scikit-learn'])\n", + "run_config_docker.environment.python.conda_dependencies = cd\n", + "\n", + "src = ScriptRunConfig(source_directory=\"./\", script='train.py', run_config=run_config_docker)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit script to run in the system-managed environment\n", + "A new conda environment is built based on the conda dependencies object. If you are running this for the first time, this might take up to 5 mninutes. But this conda environment is reused so long as you don't change the conda dependencies.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "# Check if Docker is installed and Linux containers are enables\n", + "if subprocess.run(\"docker -v\", shell=True) == 0:\n", + " out = subprocess.check_output(\"docker system info\", shell=True, encoding=\"ascii\").split(\"\\n\")\n", + " if not \"OSType: linux\" in out:\n", + " print(\"Switch Docker engine to use Linux containers.\")\n", + " else:\n", + " run = exp.submit(src)\n", + "else:\n", + " print(\"Docker engine not installed.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Get run history details\n", + "run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query run metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "query history", + "get metrics" + ] + }, + "outputs": [], + "source": [ + "# get all metris logged in the run\n", + "run.get_metrics()\n", + "metrics = run.get_metrics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's find the model that has the lowest MSE value logged." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "best_alpha = metrics['alpha'][np.argmin(metrics['mse'])]\n", + "\n", + "print('When alpha is {1:0.2f}, we have min MSE {0:0.2f}.'.format(\n", + " min(metrics['mse']), \n", + " best_alpha\n", + "))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also list all the files that are associated with this run record" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.get_file_names()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We know the model `ridge_0.40.pkl` is the best performing model from the eariler queries. So let's register it with the workspace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# supply a model name, and the full path to the serialized model file.\n", + "model = run.register_model(model_name='best_ridge_model', model_path='./outputs/ridge_0.40.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(model.name, model.version, model.url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can deploy this model following the example in the 01 notebook." + ] + } + ], + "metadata": { + "authors": [ + { + "name": "roastala" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/01.getting-started/03.train-on-aci/03.train-on-aci.ipynb b/01.getting-started/03.train-on-aci/03.train-on-aci.ipynb index 25fb573f..8218f62d 100644 --- a/01.getting-started/03.train-on-aci/03.train-on-aci.ipynb +++ b/01.getting-started/03.train-on-aci/03.train-on-aci.ipynb @@ -1,289 +1,289 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 03. Train on Azure Container Instance\n", - "\n", - "* Create Workspace\n", - "* Create `train.py` in the project folder.\n", - "* Configure an ACI (Azure Container Instance) run\n", - "* Execute in ACI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check core SDK version number\n", - "import azureml.core\n", - "\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Workspace\n", - "\n", - "Initialize a workspace object from persisted configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "create workspace" - ] - }, - "outputs": [], - "source": [ - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create An Experiment\n", - "\n", - "**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import Experiment\n", - "experiment_name = 'train-on-aci'\n", - "experiment = Experiment(workspace = ws, name = experiment_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Remote execution on ACI\n", - "\n", - "The training script `train.py` is already created for you. Let's have a look." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('./train.py', 'r') as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure for using ACI\n", - "Linux-based ACI is available in `West US`, `East US`, `West Europe`, `North Europe`, `West US 2`, `Southeast Asia`, `Australia East`, `East US 2`, and `Central US` regions. See details [here](https://docs.microsoft.com/en-us/azure/container-instances/container-instances-quotas#region-availability)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "configure run" - ] - }, - "outputs": [], - "source": [ - "from azureml.core.runconfig import RunConfiguration\n", - "from azureml.core.conda_dependencies import CondaDependencies\n", - "\n", - "# create a new runconfig object\n", - "run_config = RunConfiguration()\n", - "\n", - "# signal that you want to use ACI to execute script.\n", - "run_config.target = \"containerinstance\"\n", - "\n", - "# ACI container group is only supported in certain regions, which can be different than the region the Workspace is in.\n", - "run_config.container_instance.region = 'eastus2'\n", - "\n", - "# set the ACI CPU and Memory \n", - "run_config.container_instance.cpu_cores = 1\n", - "run_config.container_instance.memory_gb = 2\n", - "\n", - "# enable Docker \n", - "run_config.environment.docker.enabled = True\n", - "\n", - "# set Docker base image to the default CPU-based image\n", - "run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n", - "\n", - "# use conda_dependencies.yml to create a conda environment in the Docker image for execution\n", - "run_config.environment.python.user_managed_dependencies = False\n", - "\n", - "# auto-prepare the Docker image when used for execution (if it is not already prepared)\n", - "run_config.auto_prepare_environment = True\n", - "\n", - "# specify CondaDependencies obj\n", - "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submit the Experiment\n", - "Finally, run the training job on the ACI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "remote run", - "aci" - ] - }, - "outputs": [], - "source": [ - "%%time \n", - "from azureml.core.script_run_config import ScriptRunConfig\n", - "\n", - "script_run_config = ScriptRunConfig(source_directory='./',\n", - " script='train.py',\n", - " run_config=run_config)\n", - "\n", - "run = experiment.submit(script_run_config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "query history" - ] - }, - "outputs": [], - "source": [ - "# Show run details\n", - "run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "remote run", - "aci" - ] - }, - "outputs": [], - "source": [ - "%%time\n", - "# Shows output of the run on stdout.\n", - "run.wait_for_completion(show_output=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "get metrics" - ] - }, - "outputs": [], - "source": [ - "# get all metris logged in the run\n", - "run.get_metrics()\n", - "metrics = run.get_metrics()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "print('When alpha is {1:0.2f}, we have min MSE {0:0.2f}.'.format(\n", - " min(metrics['mse']), \n", - " metrics['alpha'][np.argmin(metrics['mse'])]\n", - "))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# show all the files stored within the run record\n", - "run.get_file_names()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now you can take a model produced here, register it and then deploy as a web service." - ] - } - ], - "metadata": { - "authors": [ - { - "name": "roastala" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 03. Train on Azure Container Instance\n", + "\n", + "* Create Workspace\n", + "* Create `train.py` in the project folder.\n", + "* Configure an ACI (Azure Container Instance) run\n", + "* Execute in ACI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Workspace\n", + "\n", + "Initialize a workspace object from persisted configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "create workspace" + ] + }, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create An Experiment\n", + "\n", + "**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "experiment_name = 'train-on-aci'\n", + "experiment = Experiment(workspace = ws, name = experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Remote execution on ACI\n", + "\n", + "The training script `train.py` is already created for you. Let's have a look." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('./train.py', 'r') as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure for using ACI\n", + "Linux-based ACI is available in `West US`, `East US`, `West Europe`, `North Europe`, `West US 2`, `Southeast Asia`, `Australia East`, `East US 2`, and `Central US` regions. See details [here](https://docs.microsoft.com/en-us/azure/container-instances/container-instances-quotas#region-availability)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "configure run" + ] + }, + "outputs": [], + "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "# create a new runconfig object\n", + "run_config = RunConfiguration()\n", + "\n", + "# signal that you want to use ACI to execute script.\n", + "run_config.target = \"containerinstance\"\n", + "\n", + "# ACI container group is only supported in certain regions, which can be different than the region the Workspace is in.\n", + "run_config.container_instance.region = 'eastus2'\n", + "\n", + "# set the ACI CPU and Memory \n", + "run_config.container_instance.cpu_cores = 1\n", + "run_config.container_instance.memory_gb = 2\n", + "\n", + "# enable Docker \n", + "run_config.environment.docker.enabled = True\n", + "\n", + "# set Docker base image to the default CPU-based image\n", + "run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n", + "\n", + "# use conda_dependencies.yml to create a conda environment in the Docker image for execution\n", + "run_config.environment.python.user_managed_dependencies = False\n", + "\n", + "# auto-prepare the Docker image when used for execution (if it is not already prepared)\n", + "run_config.auto_prepare_environment = True\n", + "\n", + "# specify CondaDependencies obj\n", + "run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit the Experiment\n", + "Finally, run the training job on the ACI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remote run", + "aci" + ] + }, + "outputs": [], + "source": [ + "%%time \n", + "from azureml.core.script_run_config import ScriptRunConfig\n", + "\n", + "script_run_config = ScriptRunConfig(source_directory='./',\n", + " script='train.py',\n", + " run_config=run_config)\n", + "\n", + "run = experiment.submit(script_run_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "query history" + ] + }, + "outputs": [], + "source": [ + "# Show run details\n", + "run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remote run", + "aci" + ] + }, + "outputs": [], + "source": [ + "%%time\n", + "# Shows output of the run on stdout.\n", + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "get metrics" + ] + }, + "outputs": [], + "source": [ + "# get all metris logged in the run\n", + "run.get_metrics()\n", + "metrics = run.get_metrics()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "print('When alpha is {1:0.2f}, we have min MSE {0:0.2f}.'.format(\n", + " min(metrics['mse']), \n", + " metrics['alpha'][np.argmin(metrics['mse'])]\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show all the files stored within the run record\n", + "run.get_file_names()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can take a model produced here, register it and then deploy as a web service." + ] + } + ], + "metadata": { + "authors": [ + { + "name": "roastala" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/AzureMlCli b/AzureMlCli new file mode 160000 index 00000000..09e29b09 --- /dev/null +++ b/AzureMlCli @@ -0,0 +1 @@ +Subproject commit 09e29b09694e27aeffa1daad51ee9850d5941c3c diff --git a/training/03.train-hyperparameter-tune-deploy-with-tensorflow/.gitignore b/training/03.train-hyperparameter-tune-deploy-with-tensorflow/.gitignore new file mode 100644 index 00000000..82f0c3ac --- /dev/null +++ b/training/03.train-hyperparameter-tune-deploy-with-tensorflow/.gitignore @@ -0,0 +1 @@ +/data/ diff --git a/training/03.train-hyperparameter-tune-deploy-with-tensorflow/03.train-hyperparameter-tune-deploy-with-tensorflow.ipynb b/training/03.train-hyperparameter-tune-deploy-with-tensorflow/03.train-hyperparameter-tune-deploy-with-tensorflow.ipynb index 3d754f61..24453e60 100644 --- a/training/03.train-hyperparameter-tune-deploy-with-tensorflow/03.train-hyperparameter-tune-deploy-with-tensorflow.ipynb +++ b/training/03.train-hyperparameter-tune-deploy-with-tensorflow/03.train-hyperparameter-tune-deploy-with-tensorflow.ipynb @@ -1,1655 +1,1673 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "bf74d2e9-2708-49b1-934b-e0ede342f475" - } - }, - "source": [ - "# 03. Training, hyperparameter tune, and deploy with TensorFlow\n", - "\n", - "## Introduction\n", - "This tutorial shows how to train a simple deep neural network using the MNIST dataset and TensorFlow on Azure Machine Learning. MNIST is a popular dataset consisting of 70,000 grayscale images. Each image is a handwritten digit of `28x28` pixels, representing number from 0 to 9. The goal is to create a multi-class classifier to identify the digit each image represents, and deploy it as a web service in Azure.\n", - "\n", - "For more information about the MNIST dataset, please visit [Yan LeCun's website](http://yann.lecun.com/exdb/mnist/).\n", - "\n", - "## Prerequisite:\n", - "* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning\n", - "* Go through the [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook to:\n", - " * install the AML SDK\n", - " * create a workspace and its configuration file (`config.json`)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's get started. First let's import some Python libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbpresent": { - "id": "c377ea0c-0cd9-4345-9be2-e20fb29c94c3" - } - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import numpy as np\n", - "import os\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbpresent": { - "id": "edaa7f2f-2439-4148-b57a-8c794c0945ec" - } - }, - "outputs": [], - "source": [ - "import azureml\n", - "from azureml.core import Workspace, Run\n", - "\n", - "# check core SDK version number\n", - "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Diagnostics\n", - "Opt-in diagnostics for better experience, quality, and security of future releases." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "Diagnostics" - ] - }, - "outputs": [], - "source": [ - "from azureml.telemetry import set_diagnostics_collection\n", - "set_diagnostics_collection(send_diagnostics=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize workspace\n", - "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.workspace import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "59f52294-4a25-4c92-bab8-3b07f0f44d15" - } - }, - "source": [ - "## Create an Azure ML experiment\n", - "Let's create an experiment named \"tf-mnist\" and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbpresent": { - "id": "bc70f780-c240-4779-96f3-bc5ef9a37d59" - } - }, - "outputs": [], - "source": [ - "from azureml.core import Experiment\n", - "\n", - "script_folder = './tf-mnist'\n", - "os.makedirs(script_folder, exist_ok=True)\n", - "\n", - "exp = Experiment(workspace=ws, name='tf-mnist')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "defe921f-8097-44c3-8336-8af6700804a7" - } - }, - "source": [ - "## Download MNIST dataset\n", - "In order to train on the MNIST dataset we will first need to download it from Yan LeCun's web site directly and save them in a `data` folder locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import urllib\n", - "\n", - "os.makedirs('./data/mnist', exist_ok=True)\n", - "\n", - "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', filename = './data/mnist/train-images.gz')\n", - "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', filename = './data/mnist/train-labels.gz')\n", - "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', filename = './data/mnist/test-images.gz')\n", - "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename = './data/mnist/test-labels.gz')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "c3f2f57c-7454-4d3e-b38d-b0946cf066ea" - } - }, - "source": [ - "## Show some sample images\n", - "Let's load the downloaded compressed file into numpy arrays using some utility functions included in the `utils.py` library file from the current folder. Then we use `matplotlib` to plot 30 random images from the dataset along with their labels." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbpresent": { - "id": "396d478b-34aa-4afa-9898-cdce8222a516" - } - }, - "outputs": [], - "source": [ - "from utils import load_data\n", - "\n", - "# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster.\n", - "X_train = load_data('./data/mnist/train-images.gz', False) / 255.0\n", - "y_train = load_data('./data/mnist/train-labels.gz', True).reshape(-1)\n", - "\n", - "X_test = load_data('./data/mnist/test-images.gz', False) / 255.0\n", - "y_test = load_data('./data/mnist/test-labels.gz', True).reshape(-1)\n", - "\n", - "count = 0\n", - "sample_size = 30\n", - "plt.figure(figsize = (16, 6))\n", - "for i in np.random.permutation(X_train.shape[0])[:sample_size]:\n", - " count = count + 1\n", - " plt.subplot(1, sample_size, count)\n", - " plt.axhline('')\n", - " plt.axvline('')\n", - " plt.text(x = 10, y = -10, s = y_train[i], fontsize = 18)\n", - " plt.imshow(X_train[i].reshape(28, 28), cmap = plt.cm.Greys)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Upload MNIST dataset to default datastore \n", - "A [datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data) is a place where data can be stored that is then made accessible to a Run either by means of mounting or copying the data to the compute target. A datastore can either be backed by an Azure Blob Storage or and Azure File Share (ADLS will be supported in the future). For simple data handling, each workspace provides a default datastore that can be used, in case the data is not already in Blob Storage or File Share.\n", - "\n", - "In this next step, we will upload the training and test set into the workspace's default datastore, which we will then later be mount on a Batch AI cluster for training.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ds = ws.get_default_datastore()\n", - "ds.upload(src_dir='./data/mnist', target_path='mnist', overwrite=True, show_progress=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Batch AI cluster as compute target\n", - "[Batch AI](https://docs.microsoft.com/en-us/azure/batch-ai/overview) is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's create a new Batch AI cluster in the current workspace, if it doesn't already exist. We will then run the training script on this compute target." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we could not find the cluster with the given name in the previous cell, then we will create a new cluster here. We will create a Batch AI Cluster of `STANDARD_D2_V2` CPU VMs. This process is broken down into 3 steps:\n", - "1. create the configuration (this step is local and only takes a second)\n", - "2. create the Batch AI cluster (this step will take about **20 seconds**)\n", - "3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.compute import ComputeTarget, BatchAiCompute\n", - "from azureml.core.compute_target import ComputeTargetException\n", - "\n", - "# choose a name for your cluster\n", - "cluster_name = \"gpucluster\"\n", - "\n", - "try:\n", - " # look for the existing cluster by name\n", - " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", - " if type(compute_target) is BatchAiCompute:\n", - " print('Found existing compute target {}.'.format(cluster_name))\n", - " else:\n", - " print('{} exists but it is not a Batch AI cluster. Please choose a different name.'.format(cluster_name))\n", - "except ComputeTargetException:\n", - " print('Creating a new compute target...')\n", - " compute_config = BatchAiCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\", # GPU-based VM\n", - " #vm_priority='lowpriority', # optional\n", - " autoscale_enabled=True,\n", - " cluster_min_nodes=0, \n", - " cluster_max_nodes=4)\n", - "\n", - " # create the cluster\n", - " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", - " \n", - " # can poll for a minimum number of nodes and for a specific timeout. \n", - " # if no min node count is provided it uses the scale settings for the cluster\n", - " compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", - " \n", - " # Use the 'status' property to get a detailed status for the current cluster. \n", - " print(compute_target.status.serialize())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that you have created the compute target, let's see what the workspace's `compute_targets()` function returns. You should now see one entry named 'gpucluster' of type BatchAI." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "compute_targets = ws.compute_targets()\n", - "for name, ct in compute_targets.items():\n", - " print(name, ct.type, ct.provisioning_state)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Copy the training files into the script folder\n", - "The TensorFlow training script is already created for you. You can simply copy it into the script folder, together with the utility library used to load compressed data file into numpy array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import shutil\n", - "# the training logic is in the tf_mnist.py file.\n", - "shutil.copy('./tf_mnist.py', script_folder)\n", - "\n", - "# the utils.py just helps loading data from the downloaded MNIST dataset into numpy arrays.\n", - "shutil.copy('./utils.py', script_folder)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "2039d2d5-aca6-4f25-a12f-df9ae6529cae" - } - }, - "source": [ - "## Construct neural network in TensorFlow\n", - "In the training script `tf_mnist.py`, it creates a very simple DNN (deep neural network), with just 2 hidden layers. The input layer has 28 * 28 = 784 neurons, each representing a pixel in an image. The first hidden layer has 300 neurons, and the second hidden layer has 100 neurons. The output layer has 10 neurons, each representing a targeted label from 0 to 9.\n", - "\n", - "![DNN](nn.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Azure ML concepts \n", - "Please note the following three things in the code below:\n", - "1. The script accepts arguments using the argparse package. In this case there is one argument `--data_folder` which specifies the file system folder in which the script can find the MNIST data\n", - "```\n", - " parser = argparse.ArgumentParser()\n", - " parser.add_argument('--data_folder')\n", - "```\n", - "2. The script is accessing the Azure ML `Run` object by executing `run = Run.get_context()`. Further down the script is using the `run` to report the training accuracy and the validation accuracy as training progresses.\n", - "```\n", - " run.log('training_acc', np.float(acc_train))\n", - " run.log('validation_acc', np.float(acc_val))\n", - "```\n", - "3. When running the script on Azure ML, you can write files out to a folder `./outputs` that is relative to the root directory. This folder is specially tracked by Azure ML in the sense that any files written to that folder during script execution on the remote target will be picked up by Run History; these files (known as artifacts) will be available as part of the run history record." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next cell will print out the training code for you to inspect it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(os.path.join(script_folder, './tf_mnist.py'), 'r') as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create TensorFlow estimator\n", - "Next, we construct an `azureml.train.dnn.TensorFlow` estimator object, use the Batch AI cluster as compute target, and pass the mount-point of the datastore to the training code as a parameter.\n", - "The TensorFlow estimator is providing a simple way of launching a TensorFlow training job on a compute target. It will automatically provide a docker image that has TensorFlow installed -- if additional pip or conda packages are required, their names can be passed in via the `pip_packages` and `conda_packages` arguments and they will be included in the resulting docker." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.dnn import TensorFlow\n", - "\n", - "script_params = {\n", - " '--data-folder': ws.get_default_datastore().as_mount(),\n", - " '--batch-size': 50,\n", - " '--first-layer-neurons': 300,\n", - " '--second-layer-neurons': 100,\n", - " '--learning-rate': 0.01\n", - "}\n", - "\n", - "est = TensorFlow(source_directory=script_folder,\n", - " script_params=script_params,\n", - " compute_target=compute_target,\n", - " entry_script='tf_mnist.py', \n", - " use_gpu=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submit job to run\n", - "Calling the `fit` function on the estimator submits the job to Azure ML for execution. Submitting the job should only take a few seconds." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = exp.submit(config=est)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Monitor the Run\n", - "As the Run is executed, it will go through the following stages:\n", - "1. Preparing: A docker image is created matching the Python environment specified by the TensorFlow estimator and it will be uploaded to the workspace's Azure Container Registry. This step will only happen once for each Python environment -- the container will then be cached for subsequent runs. Creating and uploading the image takes about **5 minutes**. While the job is preparing, logs are streamed to the run history and can be viewed to monitor the progress of the image creation.\n", - "\n", - "2. Scaling: If the compute needs to be scaled up (i.e. the Batch AI cluster requires more nodes to execute the run than currently available), the Batch AI cluster will attempt to scale up in order to make the required amount of nodes available. Scaling typically takes about **5 minutes**.\n", - "\n", - "3. Running: All scripts in the script folder are uploaded to the compute target, data stores are mounted/copied and the `entry_script` is executed. While the job is running, stdout and the `./logs` folder are streamed to the run history and can be viewed to monitor the progress of the run.\n", - "\n", - "4. Post-Processing: The `./outputs` folder of the run is copied over to the run history\n", - "\n", - "There are multiple ways to check the progress of a running job. We can use a Jupyter notebook widget. \n", - "\n", - "**Note: The widget will automatically update ever 10-15 seconds, always showing you the most up-to-date information about the run**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.widgets import RunDetails\n", - "RunDetails(run).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also periodically check the status of the run object, and navigate to Azure portal to monitor the run." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The Run object\n", - "The Run object provides the interface to the run history -- both to the job and to the control plane (this notebook), and both while the job is running and after it has completed. It provides a number of interesting features for instance:\n", - "* `run.get_details()`: Provides a rich set of properties of the run\n", - "* `run.get_metrics()`: Provides a dictionary with all the metrics that were reported for the Run\n", - "* `run.get_file_names()`: List all the files that were uploaded to the run history for this Run. This will include the `outputs` and `logs` folder, azureml-logs and other logs, as well as files that were explicitly uploaded to the run using `run.upload_file()`\n", - "\n", - "Below are some examples -- please run through them and inspect their output. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.get_details()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.get_metrics()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.get_file_names()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Plot accuracy over epochs\n", - "Since we can retrieve the metrics from the run, we can easily make plots using `matplotlib` in the notebook. Then we can add the plotted image to the run using `run.log_image()`, so all information about the run is kept together." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.makedirs('./imgs', exist_ok = True)\n", - "metrics = run.get_metrics()\n", - "\n", - "plt.figure(figsize = (13,5))\n", - "plt.plot(metrics['validation_acc'], 'r-', lw = 4, alpha = .6)\n", - "plt.plot(metrics['training_acc'], 'b--', alpha = 0.5)\n", - "plt.legend(['Full evaluation set', 'Training set mini-batch'])\n", - "plt.xlabel('epochs', fontsize = 14)\n", - "plt.ylabel('accuracy', fontsize = 14)\n", - "plt.title('Accuracy over Epochs', fontsize = 16)\n", - "run.log_image(name = 'acc_over_epochs.png', plot = plt)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the saved model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the training script, a TensorFlow `saver` object is used to persist the model in a local folder (local to the compute target). The model was saved to the `./outputs` folder on the disk of the Batch AI cluster node where the job is run. Azure ML automatically uploaded anything written in the `./outputs` folder into run history file store. Subsequently, we can use the `Run` object to download the model files the `saver` object saved. They are under the the `outputs/model` folder in the run history file store, and are downloaded into a local folder named `model`. Note the TensorFlow model consists of four files in binary format and they are not human-readable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a model folder in the current directory\n", - "os.makedirs('./model', exist_ok = True)\n", - "\n", - "for f in run.get_file_names():\n", - " if f.startswith('outputs/model'):\n", - " output_file_path = os.path.join('./model', f.split('/')[-1])\n", - " print('Downloading from {} to {} ...'.format(f, output_file_path))\n", - " run.download_file(name = f, output_file_path = output_file_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Predict on the test set\n", - "Now load the saved TensorFlow graph, and list all operations under the `network` scope. This way we can discover the input tensor `network/X:0` and the output tensor `network/output/MatMul:0`, and use them in the scoring script in the next step.\n", - "\n", - "Note: if your local TensorFlow version is different than the version running in the cluster where the model is trained, you might see a \"compiletime version mismatch\" warning. You can ignore it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "tf.reset_default_graph()\n", - "\n", - "saver = tf.train.import_meta_graph(\"./model/mnist-tf.model.meta\")\n", - "graph = tf.get_default_graph()\n", - "\n", - "for op in graph.get_operations():\n", - " if op.name.startswith('network'):\n", - " print(op.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Feed test dataset to the persisted model to get predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# input tensor. this is an array of 784 elements, each representing the intensity of a pixel in the digit image.\n", - "X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n", - "# output tensor. this is an array of 10 elements, each representing the probability of predicted value of the digit.\n", - "output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n", - "\n", - "with tf.Session() as sess:\n", - " saver.restore(sess, './model/mnist-tf.model')\n", - " k = output.eval(feed_dict = {X : X_test})\n", - "# get the prediction, which is the index of the element that has the largest probability value.\n", - "y_hat = np.argmax(k, axis = 1)\n", - "\n", - "# print the first 30 labels and predictions\n", - "print('labels: \\t', y_test[:30])\n", - "print('predictions:\\t', y_hat[:30])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the overall accuracy by comparing the predicted value against the test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Accuracy on the test set:\", np.average(y_hat == y_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Intelligent hyperparameter tuning\n", - "We have trained the model with one set of hyperparameters, now let's how we can do hyperparameter tuning by launching multiple runs on the cluster. First let's define the parameter space using random sampling." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.hyperdrive import *\n", - "\n", - "ps = RandomParameterSampling(\n", - " {\n", - " '--batch-size': choice(25, 50, 100),\n", - " '--first-layer-neurons': choice(10, 50, 200, 300, 500),\n", - " '--second-layer-neurons': choice(10, 50, 200, 500),\n", - " '--learning-rate': loguniform(-6, -1)\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we will create a new estimator without the above parameters since they will be passed in later. Note we still need to keep the `data-folder` parameter since that's not a hyperparamter we will sweep." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "est = TensorFlow(source_directory=script_folder,\n", - " script_params={'--data-folder': ws.get_default_datastore().as_mount()},\n", - " compute_target=compute_target,\n", - " entry_script='tf_mnist.py', \n", - " use_gpu=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we will define an early termnination policy. The `BanditPolicy` basically states to check the job every 2 iterations. If the primary metric (defined later) falls outside of the top 10% range, Azure ML terminate the job. This saves us from continuing to explore hyperparameters that don't show promise of helping reach our target metric." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to configure a run configuration object, and specify the primary metric `validation_acc` that's recorded in your training runs. If you go back to visit the training script, you will notice that this value is being logged after every epoch (a full batch set). We also want to tell the service that we are looking to maximizing this value. We also set the number of samples to 20, and maximal concurrent job to 4, which is the same as the number of nodes in our computer cluster." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "htc = HyperDriveRunConfig(estimator=est, \n", - " hyperparameter_sampling=ps, \n", - " primary_metric_name='validation_acc', \n", - " primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, \n", - " max_total_runs=20,\n", - " max_concurrent_runs=4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, let's launch the hyperparameter tuning job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "htr = exp.submit(config=htc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use a run history widget to show the progress. Be patient as this might take a while to complete." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RunDetails(htr).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Find and register best model\n", - "When all the jobs finish, we can find out the one that has the highest accuracy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_run = htr.get_best_run_by_primary_metric()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's list the model files uploaded during the run." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(best_run.get_file_names()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then register the folder (and all files in it) as a model named `tf-dnn-mnist` under the workspace for deployment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = best_run.register_model(model_name='tf-dnn-mnist', model_path='outputs/model')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the model in ACI\n", - "Now we are ready to deploy the model as a web service running in Azure Container Instance [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.\n", - "### Create score.py\n", - "First, we will create a scoring script that will be invoked by the web service call. \n", - "\n", - "* Note that the scoring script must have two required functions, `init()` and `run(input_data)`. \n", - " * In `init()` function, you typically load the model into a global object. This function is executed only once when the Docker container is started. \n", - " * In `run(input_data)` function, the model is used to predict a value based on the input data. The input and output to `run` typically use JSON as serialization and de-serialization format but you are not limited to that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile score.py\n", - "import json\n", - "import numpy as np\n", - "import os\n", - "import tensorflow as tf\n", - "\n", - "from azureml.core.model import Model\n", - "\n", - "def init():\n", - " global X, output, sess\n", - " tf.reset_default_graph()\n", - " model_root = Model.get_model_path('tf-dnn-mnist')\n", - " saver = tf.train.import_meta_graph(os.path.join(model_root, 'mnist-tf.model.meta'))\n", - " X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n", - " output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n", - " \n", - " sess = tf.Session()\n", - " saver.restore(sess, os.path.join(model_root, 'mnist-tf.model'))\n", - "\n", - "def run(raw_data):\n", - " data = np.array(json.loads(raw_data)['data'])\n", - " # make prediction\n", - " out = output.eval(session = sess, feed_dict = {X: data})\n", - " y_hat = np.argmax(out, axis = 1)\n", - " return json.dumps(y_hat.tolist())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create myenv.yml\n", - "We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify packages `numpy`, `tensorflow`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import CondaDependencies\n", - "cd = CondaDependencies.create()\n", - "cd.add_conda_package('numpy')\n", - "cd.add_tensorflow_conda_package()\n", - "cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n", - "\n", - "print(cd.serialize_to_string())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Deploy to ACI\n", - "We are almost ready to deploy. Create a deployment configuration and specify the number of CPUs and gigbyte of RAM needed for your ACI container. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.webservice import AciWebservice\n", - "\n", - "aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n", - " memory_gb=1, \n", - " tags={'name':'mnist', 'framework': 'TensorFlow DNN'},\n", - " description='Tensorflow DNN on MNIST')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Deployment Process\n", - "Now we can deploy. **This cell will run for about 7-8 minutes**. Behind the scene, it will do the following:\n", - "1. **Register model** \n", - "Take the local `model` folder (which contains our previously downloaded trained model files) and register it (and the files inside that folder) as a model named `model` under the workspace. Azure ML will register the model directory or model file(s) we specify to the `model_paths` parameter of the `Webservice.deploy` call.\n", - "2. **Build Docker image** \n", - "Build a Docker image using the scoring file (`score.py`), the environment file (`myenv.yml`), and the `model` folder containing the TensorFlow model files. \n", - "3. **Register image** \n", - "Register that image under the workspace. \n", - "4. **Ship to ACI** \n", - "And finally ship the image to the ACI infrastructure, start up a container in ACI using that image, and expose an HTTP endpoint to accept REST client calls." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.image import ContainerImage\n", - "imgconfig = ContainerImage.image_configuration(execution_script=\"score.py\", \n", - " runtime=\"python\", \n", - " conda_file=\"myenv.yml\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "from azureml.core.webservice import Webservice\n", - "\n", - "service = Webservice.deploy_from_model(workspace=ws,\n", - " name='tf-mnist-svc',\n", - " deployment_config=aciconfig,\n", - " models=[model],\n", - " image_config=imgconfig)\n", - "\n", - "service.wait_for_deployment(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Tip: If something goes wrong with the deployment, the first thing to look at is the logs from the service by running the following command:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(service.get_logs())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is the scoring web service endpoint:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(service.scoring_uri)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed model\n", - "Let's test the deployed model. Pick 30 random samples from the test set, and send it to the web service hosted in ACI. Note here we are using the `run` API in the SDK to invoke the service. You can also make raw HTTP calls using any HTTP tool such as curl.\n", - "\n", - "After the invocation, we print the returned predictions and plot them along with the input images. Use red font color and inversed image (white on black) to highlight the misclassified samples. Note since the model accuracy is pretty high, you might have to run the below cell a few times before you can see a misclassified sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "# find 30 random samples from test set\n", - "n = 30\n", - "sample_indices = np.random.permutation(X_test.shape[0])[0:n]\n", - "\n", - "test_samples = json.dumps({\"data\": X_test[sample_indices].tolist()})\n", - "test_samples = bytes(test_samples, encoding = 'utf8')\n", - "\n", - "# predict using the deployed model\n", - "result = json.loads(service.run(input_data = test_samples))\n", - "\n", - "# compare actual value vs. the predicted values:\n", - "i = 0\n", - "plt.figure(figsize = (20, 1))\n", - "\n", - "for s in sample_indices:\n", - " plt.subplot(1, n, i + 1)\n", - " plt.axhline('')\n", - " plt.axvline('')\n", - " \n", - " # use different color for misclassified sample\n", - " font_color = 'red' if y_test[s] != result[i] else 'black'\n", - " clr_map = plt.cm.gray if y_test[s] != result[i] else plt.cm.Greys\n", - " \n", - " plt.text(x = 10, y = -10, s = y_hat[s], fontsize = 18, color = font_color)\n", - " plt.imshow(X_test[s].reshape(28, 28), cmap = clr_map)\n", - " \n", - " i = i + 1\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also send raw HTTP request to the service." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "\n", - "# send a random row from the test set to score\n", - "random_index = np.random.randint(0, len(X_test)-1)\n", - "input_data = \"{\\\"data\\\": [\" + str(list(X_test[random_index])) + \"]}\"\n", - "\n", - "headers = {'Content-Type':'application/json'}\n", - "\n", - "resp = requests.post(service.scoring_uri, input_data, headers=headers)\n", - "\n", - "print(\"POST to url\", service.scoring_uri)\n", - "#print(\"input data:\", input_data)\n", - "print(\"label:\", y_test[random_index])\n", - "print(\"prediction:\", resp.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's look at the workspace after the web service was deployed. You should see \n", - "* a registered model named 'model' and with the id 'model:1'\n", - "* an image called 'tf-mnist' and with a docker image location pointing to your workspace's Azure Container Registry (ACR) \n", - "* a webservice called 'tf-mnist' with some scoring URL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "models = ws.models()\n", - "for name, model in models.items():\n", - " print(\"Model: {}, ID: {}\".format(name, model.id))\n", - " \n", - "images = ws.images()\n", - "for name, image in images.items():\n", - " print(\"Image: {}, location: {}\".format(name, image.image_location))\n", - " \n", - "webservices = ws.webservices()\n", - "for name, webservice in webservices.items():\n", - " print(\"Webservice: {}, scoring URI: {}\".format(name, webservice.scoring_uri))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Clean up\n", - "You can delete the ACI deployment with a simple delete API call." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "service.delete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also delete the computer cluster. But remember if you set the `cluster_min_nodes` value to 0 when you created the cluster, once the jobs are finished, all nodes are deleted automatically. So you don't have to delete the cluster itself since it won't incur any cost. Next time you submit jobs to it, the cluster will then automatically \"grow\" up to the `cluster_min_nodes` which is set to 4." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# delete the cluster if you need to.\n", - "compute_target.delete()" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "minxia" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - }, - "nbpresent": { - "slides": { - "05bb34ad-74b0-42b3-9654-8357d1ba9c99": { - "id": "05bb34ad-74b0-42b3-9654-8357d1ba9c99", - "prev": "851089af-9725-40c9-8f0b-9bf892b2b1fe", - "regions": { - "23fb396d-50f9-4770-adb3-0d6abcb40767": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "2039d2d5-aca6-4f25-a12f-df9ae6529cae", - "part": "whole" - }, - "id": "23fb396d-50f9-4770-adb3-0d6abcb40767" - } - } - }, - "11bebe14-d1dc-476d-a31a-5828b9c3adf0": { - "id": "11bebe14-d1dc-476d-a31a-5828b9c3adf0", - "prev": "502648cb-26fe-496b-899f-84c8fe1dcbc0", - "regions": { - "a42499db-623e-4414-bea2-ff3617fd8fc5": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "4788c040-27a2-4dc1-8ed0-378a99b3a255", - "part": "whole" - }, - "id": "a42499db-623e-4414-bea2-ff3617fd8fc5" - } - } - }, - "134f92d0-6389-4226-af51-1134ae8e8278": { - "id": "134f92d0-6389-4226-af51-1134ae8e8278", - "prev": "36b8728c-32ad-4941-be03-5cef51cdc430", - "regions": { - "b6d82a77-2d58-4b9e-a375-3103214b826c": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "7ab0e6d0-1f1c-451b-8ac5-687da44a8287", - "part": "whole" - }, - "id": "b6d82a77-2d58-4b9e-a375-3103214b826c" - } - } - }, - "282a2421-697b-4fd0-9485-755abf5a0c18": { - "id": "282a2421-697b-4fd0-9485-755abf5a0c18", - "prev": "a8b9ceb9-b38f-4489-84df-b644c6fe28f2", - "regions": { - "522fec96-abe7-4a34-bd34-633733afecc8": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "d58e7785-c2ee-4a45-8e3d-4c538bf8075a", - "part": "whole" - }, - "id": "522fec96-abe7-4a34-bd34-633733afecc8" - } - } - }, - "2dfec088-8a70-411a-9199-904ef3fa2383": { - "id": "2dfec088-8a70-411a-9199-904ef3fa2383", - "prev": "282a2421-697b-4fd0-9485-755abf5a0c18", - "regions": { - "0535fcb6-3a2b-4b46-98a7-3ebb1a38c47e": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "c377ea0c-0cd9-4345-9be2-e20fb29c94c3", - "part": "whole" - }, - "id": "0535fcb6-3a2b-4b46-98a7-3ebb1a38c47e" - } - } - }, - "36a814c9-c540-4a6d-92d9-c03553d3d2c2": { - "id": "36a814c9-c540-4a6d-92d9-c03553d3d2c2", - "prev": "b52e4d09-5186-44e5-84db-3371c087acde", - "regions": { - "8bfba503-9907-43f0-b1a6-46a0b4311793": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "d5e4a56c-dfac-4346-be83-1c15b503deac", - "part": "whole" - }, - "id": "8bfba503-9907-43f0-b1a6-46a0b4311793" - } - } - }, - "36b8728c-32ad-4941-be03-5cef51cdc430": { - "id": "36b8728c-32ad-4941-be03-5cef51cdc430", - "prev": "05bb34ad-74b0-42b3-9654-8357d1ba9c99", - "regions": { - "a36a5bdf-7f62-49b0-8634-e155a98851dc": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "e33dfc47-e7df-4623-a7a6-ab6bcf944629", - "part": "whole" - }, - "id": "a36a5bdf-7f62-49b0-8634-e155a98851dc" - } - } - }, - "3f136f2a-f14c-4a4b-afea-13380556a79c": { - "id": "3f136f2a-f14c-4a4b-afea-13380556a79c", - "prev": "54cb8dfd-a89c-4922-867b-3c87d8b67cd3", - "regions": { - "80ecf237-d1b0-401e-83d2-6d04b7fcebd3": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "7debeb2b-ecea-414f-9b50-49657abb3e6a", - "part": "whole" - }, - "id": "80ecf237-d1b0-401e-83d2-6d04b7fcebd3" - } - } - }, - "502648cb-26fe-496b-899f-84c8fe1dcbc0": { - "id": "502648cb-26fe-496b-899f-84c8fe1dcbc0", - "prev": "3f136f2a-f14c-4a4b-afea-13380556a79c", - "regions": { - "4c83bb4d-2a52-41ba-a77f-0c6efebd83a6": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "dbd22f6b-6d49-4005-b8fe-422ef8ef1d42", - "part": "whole" - }, - "id": "4c83bb4d-2a52-41ba-a77f-0c6efebd83a6" - } - } - }, - "54cb8dfd-a89c-4922-867b-3c87d8b67cd3": { - "id": "54cb8dfd-a89c-4922-867b-3c87d8b67cd3", - "prev": "aa224267-f885-4c0c-95af-7bacfcc186d9", - "regions": { - "0848f0a7-032d-46c7-b35c-bfb69c83f961": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "3c32c557-d0e8-4bb3-a61a-aa51a767cd4e", - "part": "whole" - }, - "id": "0848f0a7-032d-46c7-b35c-bfb69c83f961" - } - } - }, - "636b563c-faee-4c9e-a6a3-f46a905bfa82": { - "id": "636b563c-faee-4c9e-a6a3-f46a905bfa82", - "prev": "c5f59b98-a227-4344-9d6d-03abdd01c6aa", - "regions": { - "9c64f662-05dc-4b14-9cdc-d450b96f4368": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "70640ac0-7041-47a8-9a7f-e871defd74b2", - "part": "whole" - }, - "id": "9c64f662-05dc-4b14-9cdc-d450b96f4368" - } - } - }, - "793cec2f-8413-484d-aa1e-388fd2b53a45": { - "id": "793cec2f-8413-484d-aa1e-388fd2b53a45", - "prev": "c66f3dfd-2d27-482b-be78-10ba733e826b", - "regions": { - "d08f9cfa-3b8d-4fb4-91ba-82d9858ea93e": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "dd56113e-e3db-41ae-91b7-2472ed194308", - "part": "whole" - }, - "id": "d08f9cfa-3b8d-4fb4-91ba-82d9858ea93e" - } - } - }, - "83e912ff-260a-4391-8a12-331aba098506": { - "id": "83e912ff-260a-4391-8a12-331aba098506", - "prev": "fe5a0732-69f5-462a-8af6-851f84a9fdec", - "regions": { - "2fefcf5f-ea20-4604-a528-5e6c91bcb100": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "c3f2f57c-7454-4d3e-b38d-b0946cf066ea", - "part": "whole" - }, - "id": "2fefcf5f-ea20-4604-a528-5e6c91bcb100" - } - } - }, - "851089af-9725-40c9-8f0b-9bf892b2b1fe": { - "id": "851089af-9725-40c9-8f0b-9bf892b2b1fe", - "prev": "636b563c-faee-4c9e-a6a3-f46a905bfa82", - "regions": { - "31c9dda5-fdf4-45e2-bcb7-12aa0f30e1d8": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "8408b90e-6cdd-44d1-86d3-648c23f877ac", - "part": "whole" - }, - "id": "31c9dda5-fdf4-45e2-bcb7-12aa0f30e1d8" - } - } - }, - "87ab653d-e804-470f-bde9-c67caaa0f354": { - "id": "87ab653d-e804-470f-bde9-c67caaa0f354", - "prev": "a8c2d446-caee-42c8-886a-ed98f4935d78", - "regions": { - "bc3aeb56-c465-4868-a1ea-2de82584de98": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "59f52294-4a25-4c92-bab8-3b07f0f44d15", - "part": "whole" - }, - "id": "bc3aeb56-c465-4868-a1ea-2de82584de98" - } - } - }, - "8b887c97-83bc-4395-83ac-f6703cbe243d": { - "id": "8b887c97-83bc-4395-83ac-f6703cbe243d", - "prev": "36a814c9-c540-4a6d-92d9-c03553d3d2c2", - "regions": { - "9d0bc72a-cb13-483f-a572-2bf60d0d145f": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "75499c85-d0a1-43db-8244-25778b9b2736", - "part": "whole" - }, - "id": "9d0bc72a-cb13-483f-a572-2bf60d0d145f" - } - } - }, - "a8b9ceb9-b38f-4489-84df-b644c6fe28f2": { - "id": "a8b9ceb9-b38f-4489-84df-b644c6fe28f2", - "prev": null, - "regions": { - "f741ed94-3f24-4427-b615-3ab8753e5814": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "bf74d2e9-2708-49b1-934b-e0ede342f475", - "part": "whole" - }, - "id": "f741ed94-3f24-4427-b615-3ab8753e5814" - } - } - }, - "a8c2d446-caee-42c8-886a-ed98f4935d78": { - "id": "a8c2d446-caee-42c8-886a-ed98f4935d78", - "prev": "2dfec088-8a70-411a-9199-904ef3fa2383", - "regions": { - "f03457d8-b2a7-4e14-9a73-cab80c5b815d": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "edaa7f2f-2439-4148-b57a-8c794c0945ec", - "part": "whole" - }, - "id": "f03457d8-b2a7-4e14-9a73-cab80c5b815d" - } - } - }, - "aa224267-f885-4c0c-95af-7bacfcc186d9": { - "id": "aa224267-f885-4c0c-95af-7bacfcc186d9", - "prev": "793cec2f-8413-484d-aa1e-388fd2b53a45", - "regions": { - "0d7ac442-5e1d-49a5-91b3-1432d72449d8": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "4d6826fe-2cb8-4468-85ed-a242a1ce7155", - "part": "whole" - }, - "id": "0d7ac442-5e1d-49a5-91b3-1432d72449d8" - } - } - }, - "b52e4d09-5186-44e5-84db-3371c087acde": { - "id": "b52e4d09-5186-44e5-84db-3371c087acde", - "prev": "134f92d0-6389-4226-af51-1134ae8e8278", - "regions": { - "7af7d997-80b2-497d-bced-ef8341763439": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "376882ec-d469-4fad-9462-18e4bbea64ca", - "part": "whole" - }, - "id": "7af7d997-80b2-497d-bced-ef8341763439" - } - } - }, - "c5f59b98-a227-4344-9d6d-03abdd01c6aa": { - "id": "c5f59b98-a227-4344-9d6d-03abdd01c6aa", - "prev": "83e912ff-260a-4391-8a12-331aba098506", - "regions": { - "7268abff-0540-4c06-aefc-c386410c0953": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "396d478b-34aa-4afa-9898-cdce8222a516", - "part": "whole" - }, - "id": "7268abff-0540-4c06-aefc-c386410c0953" - } - } - }, - "c66f3dfd-2d27-482b-be78-10ba733e826b": { - "id": "c66f3dfd-2d27-482b-be78-10ba733e826b", - "prev": "8b887c97-83bc-4395-83ac-f6703cbe243d", - "regions": { - "6cbe8e0e-8645-41a1-8a38-e44acb81be4b": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "7594c7c7-b808-48f7-9500-d7830a07968a", - "part": "whole" - }, - "id": "6cbe8e0e-8645-41a1-8a38-e44acb81be4b" - } - } - }, - "d22045e5-7e3e-452e-bc7b-c6c4a893da8e": { - "id": "d22045e5-7e3e-452e-bc7b-c6c4a893da8e", - "prev": "ec41f96a-63a3-4825-9295-f4657a440ddb", - "regions": { - "24e2a3a9-bf65-4dab-927f-0bf6ffbe581d": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "defe921f-8097-44c3-8336-8af6700804a7", - "part": "whole" - }, - "id": "24e2a3a9-bf65-4dab-927f-0bf6ffbe581d" - } - } - }, - "d24c958c-e419-4e4d-aa9c-d228a8ca55e4": { - "id": "d24c958c-e419-4e4d-aa9c-d228a8ca55e4", - "prev": "11bebe14-d1dc-476d-a31a-5828b9c3adf0", - "regions": { - "25312144-9faa-4680-bb8e-6307ea71370f": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "bed09a92-9a7a-473b-9464-90e479883a3e", - "part": "whole" - }, - "id": "25312144-9faa-4680-bb8e-6307ea71370f" - } - } - }, - "ec41f96a-63a3-4825-9295-f4657a440ddb": { - "id": "ec41f96a-63a3-4825-9295-f4657a440ddb", - "prev": "87ab653d-e804-470f-bde9-c67caaa0f354", - "regions": { - "22e8be98-c254-4d04-b0e4-b9b5ae46eefe": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "bc70f780-c240-4779-96f3-bc5ef9a37d59", - "part": "whole" - }, - "id": "22e8be98-c254-4d04-b0e4-b9b5ae46eefe" - } - } - }, - "fe5a0732-69f5-462a-8af6-851f84a9fdec": { - "id": "fe5a0732-69f5-462a-8af6-851f84a9fdec", - "prev": "d22045e5-7e3e-452e-bc7b-c6c4a893da8e", - "regions": { - "671b89f5-fa9c-4bc1-bdeb-6e0a4ce8939b": { - "attrs": { - "height": 0.8, - "width": 0.8, - "x": 0.1, - "y": 0.1 - }, - "content": { - "cell": "fd46e2ab-4ab6-4001-b536-1f323525d7d3", - "part": "whole" - }, - "id": "671b89f5-fa9c-4bc1-bdeb-6e0a4ce8939b" - } - } - } - }, - "themes": {} - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "bf74d2e9-2708-49b1-934b-e0ede342f475" + } + }, + "source": [ + "# 03. Training, hyperparameter tune, and deploy with TensorFlow\n", + "\n", + "## Introduction\n", + "This tutorial shows how to train a simple deep neural network using the MNIST dataset and TensorFlow on Azure Machine Learning. MNIST is a popular dataset consisting of 70,000 grayscale images. Each image is a handwritten digit of `28x28` pixels, representing number from 0 to 9. The goal is to create a multi-class classifier to identify the digit each image represents, and deploy it as a web service in Azure.\n", + "\n", + "For more information about the MNIST dataset, please visit [Yan LeCun's website](http://yann.lecun.com/exdb/mnist/).\n", + "\n", + "## Prerequisite:\n", + "* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning\n", + "* Go through the [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook to:\n", + " * install the AML SDK\n", + " * create a workspace and its configuration file (`config.json`)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get started. First let's import some Python libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbpresent": { + "id": "c377ea0c-0cd9-4345-9be2-e20fb29c94c3" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import numpy as np\n", + "import os\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbpresent": { + "id": "edaa7f2f-2439-4148-b57a-8c794c0945ec" + } + }, + "outputs": [], + "source": [ + "import azureml\n", + "from azureml.core import Workspace, Run\n", + "\n", + "# check core SDK version number\n", + "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Diagnostics\n", + "Opt-in diagnostics for better experience, quality, and security of future releases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "Diagnostics" + ] + }, + "outputs": [], + "source": [ + "from azureml.telemetry import set_diagnostics_collection\n", + "set_diagnostics_collection(send_diagnostics=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize workspace\n", + "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "59f52294-4a25-4c92-bab8-3b07f0f44d15" + } + }, + "source": [ + "## Create an Azure ML experiment\n", + "Let's create an experiment named \"tf-mnist\" and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbpresent": { + "id": "bc70f780-c240-4779-96f3-bc5ef9a37d59" + } + }, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "\n", + "script_folder = './tf-mnist'\n", + "os.makedirs(script_folder, exist_ok=True)\n", + "\n", + "exp = Experiment(workspace=ws, name='tf-mnist')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "defe921f-8097-44c3-8336-8af6700804a7" + } + }, + "source": [ + "## Download MNIST dataset\n", + "In order to train on the MNIST dataset we will first need to download it from Yan LeCun's web site directly and save them in a `data` folder locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import urllib\n", + "\n", + "os.makedirs('./data/mnist', exist_ok=True)\n", + "\n", + "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', filename = './data/mnist/train-images.gz')\n", + "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', filename = './data/mnist/train-labels.gz')\n", + "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', filename = './data/mnist/test-images.gz')\n", + "urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename = './data/mnist/test-labels.gz')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "c3f2f57c-7454-4d3e-b38d-b0946cf066ea" + } + }, + "source": [ + "## Show some sample images\n", + "Let's load the downloaded compressed file into numpy arrays using some utility functions included in the `utils.py` library file from the current folder. Then we use `matplotlib` to plot 30 random images from the dataset along with their labels." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbpresent": { + "id": "396d478b-34aa-4afa-9898-cdce8222a516" + } + }, + "outputs": [], + "source": [ + "from utils import load_data\n", + "\n", + "# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster.\n", + "X_train = load_data('./data/mnist/train-images.gz', False) / 255.0\n", + "y_train = load_data('./data/mnist/train-labels.gz', True).reshape(-1)\n", + "\n", + "X_test = load_data('./data/mnist/test-images.gz', False) / 255.0\n", + "y_test = load_data('./data/mnist/test-labels.gz', True).reshape(-1)\n", + "\n", + "count = 0\n", + "sample_size = 30\n", + "plt.figure(figsize = (16, 6))\n", + "for i in np.random.permutation(X_train.shape[0])[:sample_size]:\n", + " count = count + 1\n", + " plt.subplot(1, sample_size, count)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + " plt.text(x = 10, y = -10, s = y_train[i], fontsize = 18)\n", + " plt.imshow(X_train[i].reshape(28, 28), cmap = plt.cm.Greys)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload MNIST dataset to default datastore \n", + "A [datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data) is a place where data can be stored that is then made accessible to a Run either by means of mounting or copying the data to the compute target. A datastore can either be backed by an Azure Blob Storage or and Azure File Share (ADLS will be supported in the future). For simple data handling, each workspace provides a default datastore that can be used, in case the data is not already in Blob Storage or File Share.\n", + "\n", + "In this next step, we will upload the training and test set into the workspace's default datastore, which we will then later be mount on a Batch AI cluster for training.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = ws.get_default_datastore()\n", + "ds.upload(src_dir='./data/mnist', target_path='mnist', overwrite=True, show_progress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Batch AI cluster as compute target\n", + "[Batch AI](https://docs.microsoft.com/en-us/azure/batch-ai/overview) is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's create a new Batch AI cluster in the current workspace, if it doesn't already exist. We will then run the training script on this compute target." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we could not find the cluster with the given name in the previous cell, then we will create a new cluster here. We will create a Batch AI Cluster of `STANDARD_D2_V2` CPU VMs. This process is broken down into 3 steps:\n", + "1. create the configuration (this step is local and only takes a second)\n", + "2. create the Batch AI cluster (this step will take about **20 seconds**)\n", + "3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, BatchAiCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "# choose a name for your cluster\n", + "cluster_name = \"gpucluster\"\n", + "\n", + "try:\n", + " # look for the existing cluster by name\n", + " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", + " if type(compute_target) is BatchAiCompute:\n", + " print('Found existing compute target {}.'.format(cluster_name))\n", + " else:\n", + " print('{} exists but it is not a Batch AI cluster. Please choose a different name.'.format(cluster_name))\n", + "except ComputeTargetException:\n", + " print('Creating a new compute target...')\n", + " compute_config = BatchAiCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\", # GPU-based VM\n", + " #vm_priority='lowpriority', # optional\n", + " autoscale_enabled=True,\n", + " cluster_min_nodes=0, \n", + " cluster_max_nodes=4)\n", + "\n", + " # create the cluster\n", + " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", + " \n", + " # can poll for a minimum number of nodes and for a specific timeout. \n", + " # if no min node count is provided it uses the scale settings for the cluster\n", + " compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", + " \n", + " # Use the 'status' property to get a detailed status for the current cluster. \n", + " print(compute_target.status.serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that you have created the compute target, let's see what the workspace's `compute_targets` property returns. You should now see one entry named 'gpucluster' of type BatchAI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compute_targets = ws.compute_targets\n", + "for name, ct in compute_targets.items():\n", + " print(name, ct.type, ct.provisioning_state)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Copy the training files into the script folder\n", + "The TensorFlow training script is already created for you. You can simply copy it into the script folder, together with the utility library used to load compressed data file into numpy array." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "# the training logic is in the tf_mnist.py file.\n", + "shutil.copy('./tf_mnist.py', script_folder)\n", + "\n", + "# the utils.py just helps loading data from the downloaded MNIST dataset into numpy arrays.\n", + "shutil.copy('./utils.py', script_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "2039d2d5-aca6-4f25-a12f-df9ae6529cae" + } + }, + "source": [ + "## Construct neural network in TensorFlow\n", + "In the training script `tf_mnist.py`, it creates a very simple DNN (deep neural network), with just 2 hidden layers. The input layer has 28 * 28 = 784 neurons, each representing a pixel in an image. The first hidden layer has 300 neurons, and the second hidden layer has 100 neurons. The output layer has 10 neurons, each representing a targeted label from 0 to 9.\n", + "\n", + "![DNN](nn.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Azure ML concepts \n", + "Please note the following three things in the code below:\n", + "1. The script accepts arguments using the argparse package. In this case there is one argument `--data_folder` which specifies the file system folder in which the script can find the MNIST data\n", + "```\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument('--data_folder')\n", + "```\n", + "2. The script is accessing the Azure ML `Run` object by executing `run = Run.get_context()`. Further down the script is using the `run` to report the training accuracy and the validation accuracy as training progresses.\n", + "```\n", + " run.log('training_acc', np.float(acc_train))\n", + " run.log('validation_acc', np.float(acc_val))\n", + "```\n", + "3. When running the script on Azure ML, you can write files out to a folder `./outputs` that is relative to the root directory. This folder is specially tracked by Azure ML in the sense that any files written to that folder during script execution on the remote target will be picked up by Run History; these files (known as artifacts) will be available as part of the run history record." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell will print out the training code for you to inspect it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(script_folder, './tf_mnist.py'), 'r') as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create TensorFlow estimator\n", + "Next, we construct an `azureml.train.dnn.TensorFlow` estimator object, use the Batch AI cluster as compute target, and pass the mount-point of the datastore to the training code as a parameter.\n", + "The TensorFlow estimator is providing a simple way of launching a TensorFlow training job on a compute target. It will automatically provide a docker image that has TensorFlow installed -- if additional pip or conda packages are required, their names can be passed in via the `pip_packages` and `conda_packages` arguments and they will be included in the resulting docker." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.dnn import TensorFlow\n", + "\n", + "script_params = {\n", + " '--data-folder': ws.get_default_datastore().as_mount(),\n", + " '--batch-size': 50,\n", + " '--first-layer-neurons': 300,\n", + " '--second-layer-neurons': 100,\n", + " '--learning-rate': 0.01\n", + "}\n", + "\n", + "est = TensorFlow(source_directory=script_folder,\n", + " script_params=script_params,\n", + " compute_target=compute_target,\n", + " entry_script='tf_mnist.py', \n", + " use_gpu=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit job to run\n", + "Calling the `fit` function on the estimator submits the job to Azure ML for execution. Submitting the job should only take a few seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = exp.submit(config=est)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor the Run\n", + "As the Run is executed, it will go through the following stages:\n", + "1. Preparing: A docker image is created matching the Python environment specified by the TensorFlow estimator and it will be uploaded to the workspace's Azure Container Registry. This step will only happen once for each Python environment -- the container will then be cached for subsequent runs. Creating and uploading the image takes about **5 minutes**. While the job is preparing, logs are streamed to the run history and can be viewed to monitor the progress of the image creation.\n", + "\n", + "2. Scaling: If the compute needs to be scaled up (i.e. the Batch AI cluster requires more nodes to execute the run than currently available), the Batch AI cluster will attempt to scale up in order to make the required amount of nodes available. Scaling typically takes about **5 minutes**.\n", + "\n", + "3. Running: All scripts in the script folder are uploaded to the compute target, data stores are mounted/copied and the `entry_script` is executed. While the job is running, stdout and the `./logs` folder are streamed to the run history and can be viewed to monitor the progress of the run.\n", + "\n", + "4. Post-Processing: The `./outputs` folder of the run is copied over to the run history\n", + "\n", + "There are multiple ways to check the progress of a running job. We can use a Jupyter notebook widget. \n", + "\n", + "**Note: The widget will automatically update ever 10-15 seconds, always showing you the most up-to-date information about the run**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.widgets import RunDetails\n", + "RunDetails(run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also periodically check the status of the run object, and navigate to Azure portal to monitor the run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The Run object\n", + "The Run object provides the interface to the run history -- both to the job and to the control plane (this notebook), and both while the job is running and after it has completed. It provides a number of interesting features for instance:\n", + "* `run.get_details()`: Provides a rich set of properties of the run\n", + "* `run.get_metrics()`: Provides a dictionary with all the metrics that were reported for the Run\n", + "* `run.get_file_names()`: List all the files that were uploaded to the run history for this Run. This will include the `outputs` and `logs` folder, azureml-logs and other logs, as well as files that were explicitly uploaded to the run using `run.upload_file()`\n", + "\n", + "Below are some examples -- please run through them and inspect their output. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.get_details()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.get_metrics()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.get_file_names()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plot accuracy over epochs\n", + "Since we can retrieve the metrics from the run, we can easily make plots using `matplotlib` in the notebook. Then we can add the plotted image to the run using `run.log_image()`, so all information about the run is kept together." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.makedirs('./imgs', exist_ok = True)\n", + "metrics = run.get_metrics()\n", + "\n", + "plt.figure(figsize = (13,5))\n", + "plt.plot(metrics['validation_acc'], 'r-', lw = 4, alpha = .6)\n", + "plt.plot(metrics['training_acc'], 'b--', alpha = 0.5)\n", + "plt.legend(['Full evaluation set', 'Training set mini-batch'])\n", + "plt.xlabel('epochs', fontsize = 14)\n", + "plt.ylabel('accuracy', fontsize = 14)\n", + "plt.title('Accuracy over Epochs', fontsize = 16)\n", + "run.log_image(name = 'acc_over_epochs.png', plot = plt)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the saved model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the training script, a TensorFlow `saver` object is used to persist the model in a local folder (local to the compute target). The model was saved to the `./outputs` folder on the disk of the Batch AI cluster node where the job is run. Azure ML automatically uploaded anything written in the `./outputs` folder into run history file store. Subsequently, we can use the `Run` object to download the model files the `saver` object saved. They are under the the `outputs/model` folder in the run history file store, and are downloaded into a local folder named `model`. Note the TensorFlow model consists of four files in binary format and they are not human-readable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a model folder in the current directory\n", + "os.makedirs('./model', exist_ok = True)\n", + "\n", + "for f in run.get_file_names():\n", + " if f.startswith('outputs/model'):\n", + " output_file_path = os.path.join('./model', f.split('/')[-1])\n", + " print('Downloading from {} to {} ...'.format(f, output_file_path))\n", + " run.download_file(name = f, output_file_path = output_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predict on the test set\n", + "Now load the saved TensorFlow graph, and list all operations under the `network` scope. This way we can discover the input tensor `network/X:0` and the output tensor `network/output/MatMul:0`, and use them in the scoring script in the next step.\n", + "\n", + "Note: if your local TensorFlow version is different than the version running in the cluster where the model is trained, you might see a \"compiletime version mismatch\" warning. You can ignore it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "tf.reset_default_graph()\n", + "\n", + "saver = tf.train.import_meta_graph(\"./model/mnist-tf.model.meta\")\n", + "graph = tf.get_default_graph()\n", + "\n", + "for op in graph.get_operations():\n", + " if op.name.startswith('network'):\n", + " print(op.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Feed test dataset to the persisted model to get predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# input tensor. this is an array of 784 elements, each representing the intensity of a pixel in the digit image.\n", + "X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n", + "# output tensor. this is an array of 10 elements, each representing the probability of predicted value of the digit.\n", + "output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n", + "\n", + "with tf.Session() as sess:\n", + " saver.restore(sess, './model/mnist-tf.model')\n", + " k = output.eval(feed_dict = {X : X_test})\n", + "# get the prediction, which is the index of the element that has the largest probability value.\n", + "y_hat = np.argmax(k, axis = 1)\n", + "\n", + "# print the first 30 labels and predictions\n", + "print('labels: \\t', y_test[:30])\n", + "print('predictions:\\t', y_hat[:30])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculate the overall accuracy by comparing the predicted value against the test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Accuracy on the test set:\", np.average(y_hat == y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Intelligent hyperparameter tuning\n", + "We have trained the model with one set of hyperparameters, now let's how we can do hyperparameter tuning by launching multiple runs on the cluster. First let's define the parameter space using random sampling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.hyperdrive import *\n", + "\n", + "ps = RandomParameterSampling(\n", + " {\n", + " '--batch-size': choice(25, 50, 100),\n", + " '--first-layer-neurons': choice(10, 50, 200, 300, 500),\n", + " '--second-layer-neurons': choice(10, 50, 200, 500),\n", + " '--learning-rate': loguniform(-6, -1)\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will create a new estimator without the above parameters since they will be passed in later. Note we still need to keep the `data-folder` parameter since that's not a hyperparamter we will sweep." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "est = TensorFlow(source_directory=script_folder,\n", + " script_params={'--data-folder': ws.get_default_datastore().as_mount()},\n", + " compute_target=compute_target,\n", + " entry_script='tf_mnist.py', \n", + " use_gpu=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will define an early termnination policy. The `BanditPolicy` basically states to check the job every 2 iterations. If the primary metric (defined later) falls outside of the top 10% range, Azure ML terminate the job. This saves us from continuing to explore hyperparameters that don't show promise of helping reach our target metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to configure a run configuration object, and specify the primary metric `validation_acc` that's recorded in your training runs. If you go back to visit the training script, you will notice that this value is being logged after every epoch (a full batch set). We also want to tell the service that we are looking to maximizing this value. We also set the number of samples to 20, and maximal concurrent job to 4, which is the same as the number of nodes in our computer cluster." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "htc = HyperDriveRunConfig(estimator=est, \n", + " hyperparameter_sampling=ps, \n", + " primary_metric_name='validation_acc', \n", + " primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, \n", + " max_total_runs=20,\n", + " max_concurrent_runs=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's launch the hyperparameter tuning job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "htr = exp.submit(config=htc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use a run history widget to show the progress. Be patient as this might take a while to complete." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RunDetails(htr).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "htr.wait_for_completion(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find and register best model\n", + "When all the jobs finish, we can find out the one that has the highest accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_run = htr.get_best_run_by_primary_metric()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's list the model files uploaded during the run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(best_run.get_file_names()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then register the folder (and all files in it) as a model named `tf-dnn-mnist` under the workspace for deployment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = best_run.register_model(model_name='tf-dnn-mnist', model_path='outputs/model')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the model in ACI\n", + "Now we are ready to deploy the model as a web service running in Azure Container Instance [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.\n", + "### Create score.py\n", + "First, we will create a scoring script that will be invoked by the web service call. \n", + "\n", + "* Note that the scoring script must have two required functions, `init()` and `run(input_data)`. \n", + " * In `init()` function, you typically load the model into a global object. This function is executed only once when the Docker container is started. \n", + " * In `run(input_data)` function, the model is used to predict a value based on the input data. The input and output to `run` typically use JSON as serialization and de-serialization format but you are not limited to that." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile score.py\n", + "import json\n", + "import numpy as np\n", + "import os\n", + "import tensorflow as tf\n", + "\n", + "from azureml.core.model import Model\n", + "\n", + "def init():\n", + " global X, output, sess\n", + " tf.reset_default_graph()\n", + " model_root = Model.get_model_path('tf-dnn-mnist')\n", + " saver = tf.train.import_meta_graph(os.path.join(model_root, 'mnist-tf.model.meta'))\n", + " X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n", + " output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n", + " \n", + " sess = tf.Session()\n", + " saver.restore(sess, os.path.join(model_root, 'mnist-tf.model'))\n", + "\n", + "def run(raw_data):\n", + " data = np.array(json.loads(raw_data)['data'])\n", + " # make prediction\n", + " out = output.eval(session=sess, feed_dict={X: data})\n", + " y_hat = np.argmax(out, axis=1)\n", + " return y_hat.tolist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create myenv.yml\n", + "We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify packages `numpy`, `tensorflow`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import CondaDependencies\n", + "cd = CondaDependencies.create()\n", + "cd.add_conda_package('numpy')\n", + "cd.add_tensorflow_conda_package()\n", + "cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n", + "\n", + "print(cd.serialize_to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deploy to ACI\n", + "We are almost ready to deploy. Create a deployment configuration and specify the number of CPUs and gigbyte of RAM needed for your ACI container. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.webservice import AciWebservice\n", + "\n", + "aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n", + " memory_gb=1, \n", + " tags={'name':'mnist', 'framework': 'TensorFlow DNN'},\n", + " description='Tensorflow DNN on MNIST')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Deployment Process\n", + "Now we can deploy. **This cell will run for about 7-8 minutes**. Behind the scene, it will do the following:\n", + "1. **Register model** \n", + "Take the local `model` folder (which contains our previously downloaded trained model files) and register it (and the files inside that folder) as a model named `model` under the workspace. Azure ML will register the model directory or model file(s) we specify to the `model_paths` parameter of the `Webservice.deploy` call.\n", + "2. **Build Docker image** \n", + "Build a Docker image using the scoring file (`score.py`), the environment file (`myenv.yml`), and the `model` folder containing the TensorFlow model files. \n", + "3. **Register image** \n", + "Register that image under the workspace. \n", + "4. **Ship to ACI** \n", + "And finally ship the image to the ACI infrastructure, start up a container in ACI using that image, and expose an HTTP endpoint to accept REST client calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.image import ContainerImage\n", + "imgconfig = ContainerImage.image_configuration(execution_script=\"score.py\", \n", + " runtime=\"python\", \n", + " conda_file=\"myenv.yml\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "from azureml.core.webservice import Webservice\n", + "\n", + "service = Webservice.deploy_from_model(workspace=ws,\n", + " name='tf-mnist-svc',\n", + " deployment_config=aciconfig,\n", + " models=[model],\n", + " image_config=imgconfig)\n", + "\n", + "service.wait_for_deployment(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Tip: If something goes wrong with the deployment, the first thing to look at is the logs from the service by running the following command:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(service.get_logs())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the scoring web service endpoint:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(service.scoring_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the deployed model\n", + "Let's test the deployed model. Pick 30 random samples from the test set, and send it to the web service hosted in ACI. Note here we are using the `run` API in the SDK to invoke the service. You can also make raw HTTP calls using any HTTP tool such as curl.\n", + "\n", + "After the invocation, we print the returned predictions and plot them along with the input images. Use red font color and inversed image (white on black) to highlight the misclassified samples. Note since the model accuracy is pretty high, you might have to run the below cell a few times before you can see a misclassified sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# find 30 random samples from test set\n", + "n = 30\n", + "sample_indices = np.random.permutation(X_test.shape[0])[0:n]\n", + "\n", + "test_samples = json.dumps({\"data\": X_test[sample_indices].tolist()})\n", + "test_samples = bytes(test_samples, encoding='utf8')\n", + "\n", + "# predict using the deployed model\n", + "result = json.loads(service.run(input_data=test_samples))\n", + "\n", + "# compare actual value vs. the predicted values:\n", + "i = 0\n", + "plt.figure(figsize = (20, 1))\n", + "\n", + "for s in sample_indices:\n", + " plt.subplot(1, n, i + 1)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + " \n", + " # use different color for misclassified sample\n", + " font_color = 'red' if y_test[s] != result[i] else 'black'\n", + " clr_map = plt.cm.gray if y_test[s] != result[i] else plt.cm.Greys\n", + " \n", + " plt.text(x=10, y=-10, s=y_hat[s], fontsize=18, color=font_color)\n", + " plt.imshow(X_test[s].reshape(28, 28), cmap=clr_map)\n", + " \n", + " i = i + 1\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also send raw HTTP request to the service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import json\n", + "\n", + "# send a random row from the test set to score\n", + "random_index = np.random.randint(0, len(X_test)-1)\n", + "input_data = \"{\\\"data\\\": [\" + str(list(X_test[random_index])) + \"]}\"\n", + "\n", + "headers = {'Content-Type':'application/json'}\n", + "\n", + "resp = requests.post(service.scoring_uri, input_data, headers=headers)\n", + "\n", + "print(\"POST to url\", service.scoring_uri)\n", + "#print(\"input data:\", input_data)\n", + "print(\"label:\", y_test[random_index])\n", + "print(\"prediction:\", resp.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at the workspace after the web service was deployed. You should see \n", + "* a registered model named 'model' and with the id 'model:1'\n", + "* an image called 'tf-mnist' and with a docker image location pointing to your workspace's Azure Container Registry (ACR) \n", + "* a webservice called 'tf-mnist' with some scoring URL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "models = ws.models\n", + "for name, model in models.items():\n", + " print(\"Model: {}, ID: {}\".format(name, model.id))\n", + " \n", + "images = ws.images\n", + "for name, image in images.items():\n", + " print(\"Image: {}, location: {}\".format(name, image.image_location))\n", + " \n", + "webservices = ws.webservices\n", + "for name, webservice in webservices.items():\n", + " print(\"Webservice: {}, scoring URI: {}\".format(name, webservice.scoring_uri))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up\n", + "You can delete the ACI deployment with a simple delete API call." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "service.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also delete the computer cluster. But remember if you set the `cluster_min_nodes` value to 0 when you created the cluster, once the jobs are finished, all nodes are deleted automatically. So you don't have to delete the cluster itself since it won't incur any cost. Next time you submit jobs to it, the cluster will then automatically \"grow\" up to the `cluster_min_nodes` which is set to 4." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# delete the cluster if you need to.\n", + "compute_target.delete()" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "minxia" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "nbpresent": { + "slides": { + "05bb34ad-74b0-42b3-9654-8357d1ba9c99": { + "id": "05bb34ad-74b0-42b3-9654-8357d1ba9c99", + "prev": "851089af-9725-40c9-8f0b-9bf892b2b1fe", + "regions": { + "23fb396d-50f9-4770-adb3-0d6abcb40767": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "2039d2d5-aca6-4f25-a12f-df9ae6529cae", + "part": "whole" + }, + "id": "23fb396d-50f9-4770-adb3-0d6abcb40767" + } + } + }, + "11bebe14-d1dc-476d-a31a-5828b9c3adf0": { + "id": "11bebe14-d1dc-476d-a31a-5828b9c3adf0", + "prev": "502648cb-26fe-496b-899f-84c8fe1dcbc0", + "regions": { + "a42499db-623e-4414-bea2-ff3617fd8fc5": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "4788c040-27a2-4dc1-8ed0-378a99b3a255", + "part": "whole" + }, + "id": "a42499db-623e-4414-bea2-ff3617fd8fc5" + } + } + }, + "134f92d0-6389-4226-af51-1134ae8e8278": { + "id": "134f92d0-6389-4226-af51-1134ae8e8278", + "prev": "36b8728c-32ad-4941-be03-5cef51cdc430", + "regions": { + "b6d82a77-2d58-4b9e-a375-3103214b826c": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "7ab0e6d0-1f1c-451b-8ac5-687da44a8287", + "part": "whole" + }, + "id": "b6d82a77-2d58-4b9e-a375-3103214b826c" + } + } + }, + "282a2421-697b-4fd0-9485-755abf5a0c18": { + "id": "282a2421-697b-4fd0-9485-755abf5a0c18", + "prev": "a8b9ceb9-b38f-4489-84df-b644c6fe28f2", + "regions": { + "522fec96-abe7-4a34-bd34-633733afecc8": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "d58e7785-c2ee-4a45-8e3d-4c538bf8075a", + "part": "whole" + }, + "id": "522fec96-abe7-4a34-bd34-633733afecc8" + } + } + }, + "2dfec088-8a70-411a-9199-904ef3fa2383": { + "id": "2dfec088-8a70-411a-9199-904ef3fa2383", + "prev": "282a2421-697b-4fd0-9485-755abf5a0c18", + "regions": { + "0535fcb6-3a2b-4b46-98a7-3ebb1a38c47e": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "c377ea0c-0cd9-4345-9be2-e20fb29c94c3", + "part": "whole" + }, + "id": "0535fcb6-3a2b-4b46-98a7-3ebb1a38c47e" + } + } + }, + "36a814c9-c540-4a6d-92d9-c03553d3d2c2": { + "id": "36a814c9-c540-4a6d-92d9-c03553d3d2c2", + "prev": "b52e4d09-5186-44e5-84db-3371c087acde", + "regions": { + "8bfba503-9907-43f0-b1a6-46a0b4311793": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "d5e4a56c-dfac-4346-be83-1c15b503deac", + "part": "whole" + }, + "id": "8bfba503-9907-43f0-b1a6-46a0b4311793" + } + } + }, + "36b8728c-32ad-4941-be03-5cef51cdc430": { + "id": "36b8728c-32ad-4941-be03-5cef51cdc430", + "prev": "05bb34ad-74b0-42b3-9654-8357d1ba9c99", + "regions": { + "a36a5bdf-7f62-49b0-8634-e155a98851dc": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "e33dfc47-e7df-4623-a7a6-ab6bcf944629", + "part": "whole" + }, + "id": "a36a5bdf-7f62-49b0-8634-e155a98851dc" + } + } + }, + "3f136f2a-f14c-4a4b-afea-13380556a79c": { + "id": "3f136f2a-f14c-4a4b-afea-13380556a79c", + "prev": "54cb8dfd-a89c-4922-867b-3c87d8b67cd3", + "regions": { + "80ecf237-d1b0-401e-83d2-6d04b7fcebd3": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "7debeb2b-ecea-414f-9b50-49657abb3e6a", + "part": "whole" + }, + "id": "80ecf237-d1b0-401e-83d2-6d04b7fcebd3" + } + } + }, + "502648cb-26fe-496b-899f-84c8fe1dcbc0": { + "id": "502648cb-26fe-496b-899f-84c8fe1dcbc0", + "prev": "3f136f2a-f14c-4a4b-afea-13380556a79c", + "regions": { + "4c83bb4d-2a52-41ba-a77f-0c6efebd83a6": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "dbd22f6b-6d49-4005-b8fe-422ef8ef1d42", + "part": "whole" + }, + "id": "4c83bb4d-2a52-41ba-a77f-0c6efebd83a6" + } + } + }, + "54cb8dfd-a89c-4922-867b-3c87d8b67cd3": { + "id": "54cb8dfd-a89c-4922-867b-3c87d8b67cd3", + "prev": "aa224267-f885-4c0c-95af-7bacfcc186d9", + "regions": { + "0848f0a7-032d-46c7-b35c-bfb69c83f961": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "3c32c557-d0e8-4bb3-a61a-aa51a767cd4e", + "part": "whole" + }, + "id": "0848f0a7-032d-46c7-b35c-bfb69c83f961" + } + } + }, + "636b563c-faee-4c9e-a6a3-f46a905bfa82": { + "id": "636b563c-faee-4c9e-a6a3-f46a905bfa82", + "prev": "c5f59b98-a227-4344-9d6d-03abdd01c6aa", + "regions": { + "9c64f662-05dc-4b14-9cdc-d450b96f4368": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "70640ac0-7041-47a8-9a7f-e871defd74b2", + "part": "whole" + }, + "id": "9c64f662-05dc-4b14-9cdc-d450b96f4368" + } + } + }, + "793cec2f-8413-484d-aa1e-388fd2b53a45": { + "id": "793cec2f-8413-484d-aa1e-388fd2b53a45", + "prev": "c66f3dfd-2d27-482b-be78-10ba733e826b", + "regions": { + "d08f9cfa-3b8d-4fb4-91ba-82d9858ea93e": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "dd56113e-e3db-41ae-91b7-2472ed194308", + "part": "whole" + }, + "id": "d08f9cfa-3b8d-4fb4-91ba-82d9858ea93e" + } + } + }, + "83e912ff-260a-4391-8a12-331aba098506": { + "id": "83e912ff-260a-4391-8a12-331aba098506", + "prev": "fe5a0732-69f5-462a-8af6-851f84a9fdec", + "regions": { + "2fefcf5f-ea20-4604-a528-5e6c91bcb100": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "c3f2f57c-7454-4d3e-b38d-b0946cf066ea", + "part": "whole" + }, + "id": "2fefcf5f-ea20-4604-a528-5e6c91bcb100" + } + } + }, + "851089af-9725-40c9-8f0b-9bf892b2b1fe": { + "id": "851089af-9725-40c9-8f0b-9bf892b2b1fe", + "prev": "636b563c-faee-4c9e-a6a3-f46a905bfa82", + "regions": { + "31c9dda5-fdf4-45e2-bcb7-12aa0f30e1d8": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "8408b90e-6cdd-44d1-86d3-648c23f877ac", + "part": "whole" + }, + "id": "31c9dda5-fdf4-45e2-bcb7-12aa0f30e1d8" + } + } + }, + "87ab653d-e804-470f-bde9-c67caaa0f354": { + "id": "87ab653d-e804-470f-bde9-c67caaa0f354", + "prev": "a8c2d446-caee-42c8-886a-ed98f4935d78", + "regions": { + "bc3aeb56-c465-4868-a1ea-2de82584de98": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "59f52294-4a25-4c92-bab8-3b07f0f44d15", + "part": "whole" + }, + "id": "bc3aeb56-c465-4868-a1ea-2de82584de98" + } + } + }, + "8b887c97-83bc-4395-83ac-f6703cbe243d": { + "id": "8b887c97-83bc-4395-83ac-f6703cbe243d", + "prev": "36a814c9-c540-4a6d-92d9-c03553d3d2c2", + "regions": { + "9d0bc72a-cb13-483f-a572-2bf60d0d145f": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "75499c85-d0a1-43db-8244-25778b9b2736", + "part": "whole" + }, + "id": "9d0bc72a-cb13-483f-a572-2bf60d0d145f" + } + } + }, + "a8b9ceb9-b38f-4489-84df-b644c6fe28f2": { + "id": "a8b9ceb9-b38f-4489-84df-b644c6fe28f2", + "prev": null, + "regions": { + "f741ed94-3f24-4427-b615-3ab8753e5814": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "bf74d2e9-2708-49b1-934b-e0ede342f475", + "part": "whole" + }, + "id": "f741ed94-3f24-4427-b615-3ab8753e5814" + } + } + }, + "a8c2d446-caee-42c8-886a-ed98f4935d78": { + "id": "a8c2d446-caee-42c8-886a-ed98f4935d78", + "prev": "2dfec088-8a70-411a-9199-904ef3fa2383", + "regions": { + "f03457d8-b2a7-4e14-9a73-cab80c5b815d": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "edaa7f2f-2439-4148-b57a-8c794c0945ec", + "part": "whole" + }, + "id": "f03457d8-b2a7-4e14-9a73-cab80c5b815d" + } + } + }, + "aa224267-f885-4c0c-95af-7bacfcc186d9": { + "id": "aa224267-f885-4c0c-95af-7bacfcc186d9", + "prev": "793cec2f-8413-484d-aa1e-388fd2b53a45", + "regions": { + "0d7ac442-5e1d-49a5-91b3-1432d72449d8": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "4d6826fe-2cb8-4468-85ed-a242a1ce7155", + "part": "whole" + }, + "id": "0d7ac442-5e1d-49a5-91b3-1432d72449d8" + } + } + }, + "b52e4d09-5186-44e5-84db-3371c087acde": { + "id": "b52e4d09-5186-44e5-84db-3371c087acde", + "prev": "134f92d0-6389-4226-af51-1134ae8e8278", + "regions": { + "7af7d997-80b2-497d-bced-ef8341763439": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "376882ec-d469-4fad-9462-18e4bbea64ca", + "part": "whole" + }, + "id": "7af7d997-80b2-497d-bced-ef8341763439" + } + } + }, + "c5f59b98-a227-4344-9d6d-03abdd01c6aa": { + "id": "c5f59b98-a227-4344-9d6d-03abdd01c6aa", + "prev": "83e912ff-260a-4391-8a12-331aba098506", + "regions": { + "7268abff-0540-4c06-aefc-c386410c0953": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "396d478b-34aa-4afa-9898-cdce8222a516", + "part": "whole" + }, + "id": "7268abff-0540-4c06-aefc-c386410c0953" + } + } + }, + "c66f3dfd-2d27-482b-be78-10ba733e826b": { + "id": "c66f3dfd-2d27-482b-be78-10ba733e826b", + "prev": "8b887c97-83bc-4395-83ac-f6703cbe243d", + "regions": { + "6cbe8e0e-8645-41a1-8a38-e44acb81be4b": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "7594c7c7-b808-48f7-9500-d7830a07968a", + "part": "whole" + }, + "id": "6cbe8e0e-8645-41a1-8a38-e44acb81be4b" + } + } + }, + "d22045e5-7e3e-452e-bc7b-c6c4a893da8e": { + "id": "d22045e5-7e3e-452e-bc7b-c6c4a893da8e", + "prev": "ec41f96a-63a3-4825-9295-f4657a440ddb", + "regions": { + "24e2a3a9-bf65-4dab-927f-0bf6ffbe581d": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "defe921f-8097-44c3-8336-8af6700804a7", + "part": "whole" + }, + "id": "24e2a3a9-bf65-4dab-927f-0bf6ffbe581d" + } + } + }, + "d24c958c-e419-4e4d-aa9c-d228a8ca55e4": { + "id": "d24c958c-e419-4e4d-aa9c-d228a8ca55e4", + "prev": "11bebe14-d1dc-476d-a31a-5828b9c3adf0", + "regions": { + "25312144-9faa-4680-bb8e-6307ea71370f": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "bed09a92-9a7a-473b-9464-90e479883a3e", + "part": "whole" + }, + "id": "25312144-9faa-4680-bb8e-6307ea71370f" + } + } + }, + "ec41f96a-63a3-4825-9295-f4657a440ddb": { + "id": "ec41f96a-63a3-4825-9295-f4657a440ddb", + "prev": "87ab653d-e804-470f-bde9-c67caaa0f354", + "regions": { + "22e8be98-c254-4d04-b0e4-b9b5ae46eefe": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "bc70f780-c240-4779-96f3-bc5ef9a37d59", + "part": "whole" + }, + "id": "22e8be98-c254-4d04-b0e4-b9b5ae46eefe" + } + } + }, + "fe5a0732-69f5-462a-8af6-851f84a9fdec": { + "id": "fe5a0732-69f5-462a-8af6-851f84a9fdec", + "prev": "d22045e5-7e3e-452e-bc7b-c6c4a893da8e", + "regions": { + "671b89f5-fa9c-4bc1-bdeb-6e0a4ce8939b": { + "attrs": { + "height": 0.8, + "width": 0.8, + "x": 0.1, + "y": 0.1 + }, + "content": { + "cell": "fd46e2ab-4ab6-4001-b536-1f323525d7d3", + "part": "whole" + }, + "id": "671b89f5-fa9c-4bc1-bdeb-6e0a4ce8939b" + } + } + } + }, + "themes": {} + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}