Update notebooks

Roope Astala
2018-09-14 15:14:43 -04:00
parent 01a12c0b74
commit 8178484586
40 changed files with 14985 additions and 67 deletions


@@ -0,0 +1,864 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 01. Train in the Notebook & Deploy Model to ACI\n",
"\n",
"* Load workspace\n",
"* Train a simple regression model directly in the Notebook python kernel\n",
"* Record run history\n",
"* Find the best model in run history and download it.\n",
"* Deploy the model as an Azure Container Instance (ACI)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"1. Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't. \n",
"\n",
"2. Install following pre-requisite libraries to your conda environment and restart notebook.\n",
"```shell\n",
"(myenv) $ conda install -y matplotlib tqdm scikit-learn\n",
"```\n",
"\n",
"3. Check that ACI is registered for your Azure Subscription. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az provider show -n Microsoft.ContainerInstance -o table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If ACI is not registered, run following command to register it. Note that you have to be a subscription owner, or this command will fail."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az provider register -n Microsoft.ContainerInstance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Validate Azure ML SDK installation and get version number for debugging purposes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"install"
]
},
"outputs": [],
"source": [
"from azureml.core import Experiment, Run, Workspace\n",
"import azureml.core\n",
"\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create workspace"
]
},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set experiment name\n",
"Choose a name for experiment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = 'train-in-notebook'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Start a training run in local Notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load diabetes dataset, a well-known small dataset that comes with scikit-learn\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.externals import joblib\n",
"\n",
"X, y = load_diabetes(return_X_y = True)\n",
"columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)\n",
"data = {\n",
" \"train\":{\"X\": X_train, \"y\": y_train}, \n",
" \"test\":{\"X\": X_test, \"y\": y_test}\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train a simple Ridge model\n",
"Train a very simple Ridge regression model in scikit-learn, and save it as a pickle file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"reg = Ridge(alpha = 0.03)\n",
"reg.fit(data['train']['X'], data['train']['y'])\n",
"preds = reg.predict(data['test']['X'])\n",
"print('Mean Squared Error is', mean_squared_error(preds, data['test']['y']))\n",
"joblib.dump(value = reg, filename = 'model.pkl');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add experiment tracking\n",
"Now, let's add Azure ML experiment logging, and upload persisted model into run record as well."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"local run",
"outputs upload"
]
},
"outputs": [],
"source": [
"experiment = Experiment(workspace = ws, name = experiment_name)\n",
"run = experiment.start_logging()\n",
"run.tag(\"Description\",\"My first run!\")\n",
"run.log('alpha', 0.03)\n",
"reg = Ridge(alpha = 0.03)\n",
"reg.fit(data['train']['X'], data['train']['y'])\n",
"preds = reg.predict(data['test']['X'])\n",
"run.log('mse', mean_squared_error(preds, data['test']['y']))\n",
"joblib.dump(value = reg, filename = 'model.pkl')\n",
"run.upload_file(name = 'outputs/model.pkl', path_or_stream = './model.pkl')\n",
"\n",
"run.complete()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can browse to the recorded run. Please make sure you use Chrome to navigate the run history page."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Simple parameter sweep\n",
"Sweep over alpha values of a sklearn ridge model, and capture metrics and trained model in the Azure ML experiment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"local run",
"outputs upload"
]
},
"outputs": [],
"source": [
"import numpy as np\n",
"import os\n",
"from tqdm import tqdm\n",
"\n",
"model_name = \"model.pkl\"\n",
"\n",
"# start a training run\n",
"root_run = experiment.start_logging()\n",
"\n",
"# list of numbers from 0 to 1.0 with a 0.05 interval\n",
"alphas = np.arange(0.0, 1.0, 0.05)\n",
"\n",
"# try a bunch of alpha values in a Linear Regression (Ridge) model\n",
"for alpha in tqdm(alphas):\n",
" # create a bunch of child runs\n",
" with root_run.child_run(\"alpha-\" + str(alpha)) as run:\n",
" # Use Ridge algorithm to build a regression model\n",
" reg = Ridge(alpha=alpha)\n",
" reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])\n",
" preds = reg.predict(data[\"test\"][\"X\"])\n",
" mse = mean_squared_error(preds, data[\"test\"][\"y\"])\n",
"\n",
" # log alpha, mean_squared_error and feature names in run history\n",
" run.log(\"alpha\", alpha)\n",
" run.log(\"mse\", mse)\n",
" run.log_list(\"columns\", columns)\n",
"\n",
" with open(model_name, \"wb\") as file:\n",
" joblib.dump(value=reg, filename=file)\n",
" \n",
" # upload the serialized model into run history record\n",
" run.upload_file(name=\"outputs/\" + model_name, path_or_stream=model_name)\n",
"\n",
" # now delete the serialized model from local folder since it is already uploaded to run history \n",
" os.remove(model_name)\n",
" \n",
"# Declare run completed\n",
"root_run.complete()\n",
"root_run_id = root_run.id\n",
"print (\"run id:\", root_run.id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now you can reconstruct this run object from captured run id in a different Notebook session."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"query history"
]
},
"outputs": [],
"source": [
"rr = Run(experiment=experiment, run_id=root_run_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select best model from the experiment\n",
"Load all child run metrics recursively from the experiment into a dictionary object."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"query history",
"get metrics"
]
},
"outputs": [],
"source": [
"child_run_metrics = rr.get_metrics(recursive=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now find the run with the lowest Mean Squared Error value"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run_id = min(child_run_metrics, key = lambda k: child_run_metrics[k]['mse'])\n",
"best_run = Run(experiment=experiment, run_id=best_run_id)\n",
"print('Best run is:', best_run_id)\n",
"print('Metrics:', child_run_metrics[best_run_id])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can add tags to your runs to make them easier to catalog"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"query history"
]
},
"outputs": [],
"source": [
"best_run.tag(key=\"Description\", value=\"The best one\")\n",
"best_run.get_tags()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Plot MSE over alpha\n",
"\n",
"Let's observe the best model visually by plotting the MSE values over alpha values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"\n",
"best_alpha = child_run_metrics[best_run_id]['alpha']\n",
"min_mse = child_run_metrics[best_run_id]['mse']\n",
"\n",
"alpha_mse = np.array([(child_run_metrics[k]['alpha'], child_run_metrics[k]['mse']) for k in child_run_metrics.keys()])\n",
"sorted_alpha_mse = alpha_mse[alpha_mse[:,0].argsort()]\n",
"\n",
"plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'r--')\n",
"plt.plot(sorted_alpha_mse[:,0], sorted_alpha_mse[:,1], 'bo')\n",
"\n",
"plt.xlabel('alpha', fontsize = 14)\n",
"plt.ylabel('mean squared error', fontsize = 14)\n",
"plt.title('MSE over alpha', fontsize = 16)\n",
"\n",
"# plot arrow\n",
"plt.arrow(x = best_alpha, y = min_mse + 39, dx = 0, dy = -26, ls = '-', lw = 0.4,\n",
" width = 0, head_width = .03, head_length = 8)\n",
"\n",
"# plot \"best run\" text\n",
"plt.text(x = best_alpha - 0.08, y = min_mse + 50, s = 'Best Run', fontsize = 14)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register the best model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Find the model file saved in the run record of best run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"query history"
]
},
"outputs": [],
"source": [
"for f in best_run.get_file_names():\n",
" print(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can register this model in the model registry of the workspace"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from history"
]
},
"outputs": [],
"source": [
"model = best_run.register_model(model_name='best_model', model_path='outputs/model.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Verify that the model has been registered properly. If you have done this several times you'd see the version number auto-increases each time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from history"
]
},
"outputs": [],
"source": [
"for m in ws.models(name='best_model'):\n",
" print(m.name, m.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also download the registered model. Afterwards, you should see a `model.pkl` file in the current directory. You can then use it for local testing if you'd like."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"download file"
]
},
"outputs": [],
"source": [
"model.download(target_dir='.')"
]
},
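{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick local sanity check, you can reload the downloaded pickle and re-score the test set with it. The cell below assumes the earlier cells in this notebook have already been run in the same session, so `data` and `mean_squared_error` are still available."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the downloaded model file and score the local test set with it\n",
"downloaded_model = joblib.load('model.pkl')\n",
"downloaded_preds = downloaded_model.predict(data['test']['X'])\n",
"print('MSE of downloaded model:', mean_squared_error(downloaded_preds, data['test']['y']))"
]
},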
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create scoring script\n",
"\n",
"The scoring script consists of two functions: `init` that is used to load the model to memory when starting the container, and `run` that makes the prediction when web service is called."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `%%writefile` cell magic is used to write the scoring function to a local file. Pay special attention to how the model is loaded in the `init()` function. When Docker image is built for this model, the actual model file is downloaded and placed on disk, and `get_model_path` function returns the local path where the model is placed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy as np\n",
"from sklearn.externals import joblib\n",
"from sklearn.linear_model import Ridge\n",
"from azureml.core.model import Model\n",
"\n",
"\n",
"def init():\n",
" global model\n",
" # note here \"best_model\" is the name of the model registered under the workspace\n",
" # this call should return the path to the model.pkl file on the local disk.\n",
" model_path = Model.get_model_path(model_name='best_model')\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
" \n",
"# note you can pass in multiple rows for scoring\n",
"def run(raw_data):\n",
" try:\n",
" data = json.loads(raw_data)['data']\n",
" data = np.array(data)\n",
" result = model.predict(data)\n",
" return json.dumps({\"result\": result.tolist()})\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"error\": result})\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create conda dependency file\n",
"\n",
"This `myenv.yml` file is used to specify which library dependencies to install on the web service. Note that the CondaDependencies API automatically adds necessary Azure ML dependencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies \n",
"\n",
"myenv = CondaDependencies()\n",
"myenv.add_conda_package(\"scikit-learn\")\n",
"\n",
"with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the `myenv.yml` file written."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pfile myenv.yml"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy web service into an Azure Container Instance\n",
"The deployment process takes the registered model and your scoring scrip, and builds a Docker image. It then deploys the Docker image into Azure Container Instance as a running container with an HTTP endpoint readying for scoring calls. Read more about [Azure Container Instance](https://azure.microsoft.com/en-us/services/container-instances/).\n",
"\n",
"Note ACI is great for quick and cost-effective dev/test deployment scenarios. For production workloads, please use [Azure Kubernentes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/) instead. Please follow in struction in [this notebook](11.production-deploy-to-aks.ipynb) to see how that can be done from Azure ML.\n",
" \n",
"** Note: ** The web service creation can take 6-7 minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice, Webservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n",
" memory_gb=1, \n",
" tags={'sample name': 'AML 101'}, \n",
" description='This is a great example.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note the below `WebService.deploy_from_model()` function takes a model object registered under the workspace. It then bakes the model file in the Docker image so it can be looked-up using the `Model.get_model_path()` function in `score.py`. \n",
"\n",
"If you have a local model file instead of a registered model object, you can also use the `WebService.deploy()` function which would register the model and then deploy."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"from azureml.core.image import ContainerImage\n",
"image_config = ContainerImage.image_configuration(execution_script=\"score.py\", \n",
" runtime=\"python\", \n",
" conda_file=\"myenv.yml\")"
]
},
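{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference only, the alternative `Webservice.deploy()` path mentioned above might look roughly like the sketch below. It is not executed in this notebook, and the parameter names (in particular `model_paths`) are an assumption about this SDK version, so check the `Webservice.deploy` reference before relying on it.\n",
"```python\n",
"# hypothetical sketch: register a local model file and deploy it in one call\n",
"service_from_file = Webservice.deploy(workspace=ws,\n",
"                                      name='my-aci-svc-from-file',\n",
"                                      model_paths=['./model.pkl'],\n",
"                                      image_config=image_config,\n",
"                                      deployment_config=aciconfig)\n",
"```"
]
},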
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"%%time\n",
"# this will take 5-10 minutes to finish\n",
"# you can also use \"az container list\" command to find the ACI being deployed\n",
"service = Webservice.deploy_from_model(name='my-aci-svc',\n",
" deployment_config=aciconfig,\n",
" models=[model],\n",
" image_config=image_config,\n",
" workspace=ws)\n",
"\n",
"service.wait_for_deployment(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Test web service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"print('web service is hosted in ACI:', service.scoring_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the `run` API to call the web service with one row of data to get a prediction."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"import json\n",
"# score the first row from the test set.\n",
"test_samples = json.dumps({\"data\": X_test[0:1, :].tolist()})\n",
"service.run(input_data = test_samples)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Feed the entire test set and calculate the errors (residual values)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"# score the entire test set.\n",
"test_samples = json.dumps({'data': X_test.tolist()})\n",
"\n",
"result = json.loads(service.run(input_data = test_samples))['result']\n",
"residual = result - y_test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also send raw HTTP request to test the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"\n",
"# 2 rows of input data, each with 10 made-up numerical features\n",
"input_data = \"{\\\"data\\\": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]}\"\n",
"\n",
"headers = {'Content-Type':'application/json'}\n",
"\n",
"# for AKS deployment you'd need to the service key in the header as well\n",
"# api_key = service.get_key()\n",
"# headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)} \n",
"\n",
"resp = requests.post(service.scoring_uri, input_data, headers = headers)\n",
"print(resp.text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Residual graph\n",
"Plot a residual value graph to chart the errors on the entire test set. Observe the nice bell curve."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw={'width_ratios':[3, 1], 'wspace':0, 'hspace': 0})\n",
"f.suptitle('Residual Values', fontsize = 18)\n",
"\n",
"f.set_figheight(6)\n",
"f.set_figwidth(14)\n",
"\n",
"a0.plot(residual, 'bo', alpha=0.4);\n",
"a0.plot([0,90], [0,0], 'r', lw=2)\n",
"a0.set_ylabel('residue values', fontsize=14)\n",
"a0.set_xlabel('test data set', fontsize=14)\n",
"\n",
"a1.hist(residual, orientation='horizontal', color='blue', bins=10, histtype='step');\n",
"a1.hist(residual, orientation='horizontal', color='blue', alpha=0.2, bins=10);\n",
"a1.set_yticklabels([])\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Delete ACI to clean up"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Deleting ACI is super fast!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"%%time\n",
"service.delete()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,432 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 02. Train locally\n",
"* Create or load workspace.\n",
"* Create scripts locally.\n",
"* Create `train.py` in a folder, along with a `my.lib` file.\n",
"* Configure & execute a local run in a user-managed Python environment.\n",
"* Configure & execute a local run in a system-managed Python environment.\n",
"* Configure & execute a local run in a Docker environment.\n",
"* Query run metrics to find the best model\n",
"* Register model for operationalization."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create An Experiment\n",
"**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"experiment_name = 'train-on-local'\n",
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a folder to store the training script."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"script_folder = './samples/train-on-local'\n",
"os.makedirs(script_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create `train.py`\n",
"\n",
"Use `%%writefile` magic to write training code to `train.py` file under your script folder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $script_folder/train.py\n",
"\n",
"import os\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import train_test_split\n",
"from azureml.core.run import Run\n",
"from sklearn.externals import joblib\n",
"\n",
"# example of referencing another script\n",
"import mylib\n",
"\n",
"X, y = load_diabetes(return_X_y=True)\n",
"\n",
"run = Run.get_submitted_run()\n",
"\n",
"X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)\n",
"data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
" \"test\": {\"X\": X_test, \"y\": y_test}}\n",
"\n",
"# example of referencing another script\n",
"alphas = mylib.get_alphas()\n",
"\n",
"for alpha in alphas:\n",
" # Use Ridge algorithm to create a regression model\n",
" reg = Ridge(alpha=alpha)\n",
" reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])\n",
"\n",
" preds = reg.predict(data[\"test\"][\"X\"])\n",
" mse = mean_squared_error(preds, data[\"test\"][\"y\"])\n",
" run.log('alpha', alpha)\n",
" run.log('mse', mse)\n",
"\n",
" model_file_name='ridge_{0:.2f}.pkl'.format(alpha)\n",
" # save model in the outputs folder so it automatically get uploaded\n",
" with open(model_file_name, \"wb\") as file:\n",
" joblib.dump(value=reg, filename=model_file_name)\n",
" \n",
" # upload the model file explicitly into artifacts \n",
" run.upload_file(name=model_file_name, path_or_stream=model_file_name)\n",
" \n",
" # register the model\n",
" run.register_model(model_name='diabetes-model', model_path=model_file_name)\n",
"\n",
" print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`train.py` also references a `mylib.py` file. So let's create that too."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $script_folder/mylib.py\n",
"import numpy as np\n",
"\n",
"def get_alphas():\n",
" # list of numbers from 0.0 to 1.0 with a 0.05 interval\n",
" return np.arange(0.0, 1.0, 0.05)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure & Run\n",
"### User-managed environment\n",
"Below, we use a user-managed run, which means you are responsible to ensure all the necessary packages are available in the Python environment you choose to run the script."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"\n",
"# Editing a run configuration property on-fly.\n",
"run_config_user_managed = RunConfiguration()\n",
"\n",
"run_config_user_managed.environment.python.user_managed_dependencies = True\n",
"\n",
"# You can choose a specific Python environment by pointing to a Python path \n",
"#run_config.environment.python.interpreter_path = '/home/ninghai/miniconda3/envs/sdk2/bin/python'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit script to run in the user-managed environment\n",
"Note whole script folder is submitted for execution, including the `mylib.py` file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"\n",
"src = ScriptRunConfig(source_directory=script_folder, script='train.py', run_config=run_config_user_managed)\n",
"run = exp.submit(src)\n",
"run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get run history details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### System-managed environment\n",
"You can also ask the system to build a new conda environment and execute your scripts in it. The environment is built once and will be reused in subsequent executions as long as the conda dependencies remain unchanged. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"run_config_system_managed = RunConfiguration()\n",
"\n",
"run_config_system_managed.environment.python.user_managed_dependencies = False\n",
"run_config_system_managed.prepare_environment = True\n",
"\n",
"# Specify conda dependencies with scikit-learn\n",
"cd = CondaDependencies.create(conda_packages=['scikit-learn'])\n",
"run_config_system_managed.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit script to run in the system-managed environment\n",
"A new conda environment is built based on the conda dependencies object. If you are running this for the first time, this might take up to 5 mninutes. But this conda environment is reused so long as you don't change the conda dependencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"src = ScriptRunConfig(source_directory=script_folder, script='train.py', run_config=run_config_system_managed)\n",
"run = exp.submit(src)\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get run history details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query run metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"query history",
"get metrics"
]
},
"outputs": [],
"source": [
"# get all metris logged in the run\n",
"run.get_metrics()\n",
"metrics = run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's find the model that has the lowest MSE value logged."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"best_alpha = metrics['alpha'][np.argmin(metrics['mse'])]\n",
"\n",
"print('When alpha is {1:0.2f}, we have min MSE {0:0.2f}.'.format(\n",
" min(metrics['mse']), \n",
" best_alpha\n",
"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also list all the files that are associated with this run record"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.get_file_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We know the model `ridge_0.40.pkl` is the best performing model from the eariler queries. So let's register it with the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# supply a model name, and the full path to the serialized model file.\n",
"model = run.register_model(model_name='best_ridge_model', model_path='ridge_0.40.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(model.name, model.version, model.url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now you can deploy this model following the example in the 01 notebook."
]
}
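{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch, assuming the same scoring script and environment setup as the 01 notebook (the `score.py` there would need to load `best_ridge_model` instead of `best_model`):\n",
"```python\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core.image import ContainerImage\n",
"from azureml.core.webservice import AciWebservice, Webservice\n",
"\n",
"# conda environment for the scoring container\n",
"myenv = CondaDependencies()\n",
"myenv.add_conda_package('scikit-learn')\n",
"with open('myenv.yml', 'w') as f:\n",
"    f.write(myenv.serialize_to_string())\n",
"\n",
"# Docker image and ACI deployment configuration\n",
"image_config = ContainerImage.image_configuration(execution_script='score.py',\n",
"                                                  runtime='python',\n",
"                                                  conda_file='myenv.yml')\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)\n",
"\n",
"# deploy the registered model as an ACI web service\n",
"service = Webservice.deploy_from_model(name='ridge-aci-svc',\n",
"                                       deployment_config=aciconfig,\n",
"                                       models=[model],\n",
"                                       image_config=image_config,\n",
"                                       workspace=ws)\n",
"service.wait_for_deployment(show_output=True)\n",
"```"
]
}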
],
"metadata": {
"celltoolbar": "Edit Metadata",
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,45 @@
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
from sklearn.externals import joblib
import numpy as np
# os.makedirs('./outputs', exist_ok = True)
X, y = load_diabetes(return_X_y=True)
run = Run.get_submitted_run()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
data = {"train": {"X": X_train, "y": y_train},
"test": {"X": X_test, "y": y_test}}
# list of numbers from 0.0 to 1.0 with a 0.05 interval
alphas = np.arange(0.0, 1.0, 0.05)
for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])

    preds = reg.predict(data["test"]["X"])
    mse = mean_squared_error(preds, data["test"]["y"])
    run.log('alpha', alpha)
    run.log('mse', mse)

    model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
    # serialize the model to a local file
    with open(model_file_name, "wb") as file:
        joblib.dump(value=reg, filename=model_file_name)

    # upload the model file explicitly into artifacts
    run.upload_file(name=model_file_name, path_or_stream=model_file_name)

    # register the model
    # commented out for now until a bug is fixed
    # run.register_model(file_name = model_file_name)

    print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))


@@ -0,0 +1,342 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 03. Train on Azure Container Instance (EXPERIMENTAL)\n",
"\n",
"* Create Workspace\n",
"* Create Project\n",
"* Create `train.py` in the project folder.\n",
"* Configure an ACI (Azure Container Instance) run\n",
"* Execute in ACI"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create workspace"
]
},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create An Experiment\n",
"\n",
"**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"experiment_name = 'train-on-aci'\n",
"experiment = Experiment(workspace = ws, name = experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a folder to store the training script."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"script_folder = './samples/train-on-aci'\n",
"os.makedirs(script_folder, exist_ok = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Remote execution on ACI\n",
"\n",
"Use `%%writefile` magic to write training code to `train.py` file under the project folder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $script_folder/train.py\n",
"\n",
"import os\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import train_test_split\n",
"from azureml.core.run import Run\n",
"from sklearn.externals import joblib\n",
"\n",
"import numpy as np\n",
"\n",
"os.makedirs('./outputs', exist_ok=True)\n",
"\n",
"X, y = load_diabetes(return_X_y = True)\n",
"\n",
"run = Run.get_submitted_run()\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)\n",
"data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
" \"test\": {\"X\": X_test, \"y\": y_test}}\n",
"\n",
"# list of numbers from 0.0 to 1.0 with a 0.05 interval\n",
"alphas = np.arange(0.0, 1.0, 0.05)\n",
"\n",
"for alpha in alphas:\n",
" # Use Ridge algorithm to create a regression model\n",
" reg = Ridge(alpha = alpha)\n",
" reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])\n",
"\n",
" preds = reg.predict(data[\"test\"][\"X\"])\n",
" mse = mean_squared_error(preds, data[\"test\"][\"y\"])\n",
" run.log('alpha', alpha)\n",
" run.log('mse', mse)\n",
" \n",
" model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)\n",
" with open(model_file_name, \"wb\") as file:\n",
" joblib.dump(value = reg, filename = 'outputs/' + model_file_name)\n",
"\n",
" print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure for using ACI\n",
"Linux-based ACI is available in `westus`, `eastus`, `westeurope`, `northeurope`, `westus2` and `southeastasia` regions. See details [here](https://docs.microsoft.com/en-us/azure/container-instances/container-instances-quotas#region-availability)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"configure run"
]
},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new runconfig object\n",
"run_config = RunConfiguration()\n",
"\n",
"# signal that you want to use ACI to execute script.\n",
"run_config.target = \"containerinstance\"\n",
"\n",
"# ACI container group is only supported in certain regions, which can be different than the region the Workspace is in.\n",
"run_config.container_instance.region = 'eastus'\n",
"\n",
"# set the ACI CPU and Memory \n",
"run_config.container_instance.cpu_cores = 1\n",
"run_config.container_instance.memory_gb = 2\n",
"\n",
"# enable Docker \n",
"run_config.environment.docker.enabled = True\n",
"\n",
"# set Docker base image to the default CPU-based image\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"#run_config.environment.docker.base_image = 'microsoft/mmlspark:plus-0.9.9'\n",
"\n",
"# use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# auto-prepare the Docker image when used for execution (if it is not already prepared)\n",
"run_config.auto_prepare_environment = True\n",
"\n",
"# specify CondaDependencies obj\n",
"run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Submit the Experiment\n",
"Finally, run the training job on the ACI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remote run",
"aci"
]
},
"outputs": [],
"source": [
"%%time \n",
"from azureml.core.script_run_config import ScriptRunConfig\n",
"\n",
"script_run_config = ScriptRunConfig(source_directory = script_folder,\n",
" script= 'train.py',\n",
" run_config = run_config)\n",
"\n",
"run = experiment.submit(script_run_config)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remote run",
"aci"
]
},
"outputs": [],
"source": [
"%%time\n",
"# Shows output of the run on stdout.\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"query history"
]
},
"outputs": [],
"source": [
"# Show run details\n",
"\n",
"run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Navigate to the above URL using Chrome, and you should see a graph of alpha values, and a graph of MSE.\n",
"\n",
"![graphs](../images/mse.png)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"get metrics"
]
},
"outputs": [],
"source": [
"# get all metris logged in the run\n",
"run.get_metrics()\n",
"metrics = run.get_metrics()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"print('When alpha is {1:0.2f}, we have min MSE {0:0.2f}.'.format(\n",
" min(metrics['mse']), \n",
" metrics['alpha'][np.argmin(metrics['mse'])]\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,347 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 04. Train in a remote VM (MLC managed DSVM)\n",
"* Create Workspace\n",
"* Create Project\n",
"* Create `train.py` file\n",
"* Create DSVM as Machine Learning Compute (MLC) resource\n",
"* Configure & execute a run in a conda environment in the default miniconda Docker container on DSVM"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Experiment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = 'train-on-remote-vm'\n",
"script_folder = './samples/train-on-remote-vm'\n",
"\n",
"import os\n",
"os.makedirs(script_folder, exist_ok = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"exp = Experiment(workspace = ws, name = experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create `train.py`\n",
"\n",
"Use `%%writefile` magic to write training code to `train.py` file under your project folder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $script_folder/train.py\n",
"\n",
"import os\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import train_test_split\n",
"from azureml.core.run import Run\n",
"from sklearn.externals import joblib\n",
"\n",
"import numpy as np\n",
"\n",
"os.makedirs('./outputs', exist_ok=True)\n",
"\n",
"X, y = load_diabetes(return_X_y = True)\n",
"\n",
"run = Run.get_submitted_run()\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)\n",
"data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
" \"test\": {\"X\": X_test, \"y\": y_test}}\n",
"\n",
"# list of numbers from 0.0 to 1.0 with a 0.05 interval\n",
"alphas = np.arange(0.0, 1.0, 0.05)\n",
"\n",
"for alpha in alphas:\n",
" # Use Ridge algorithm to create a regression model\n",
" reg = Ridge(alpha = alpha)\n",
" reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])\n",
"\n",
" preds = reg.predict(data[\"test\"][\"X\"])\n",
" mse = mean_squared_error(preds, data[\"test\"][\"y\"])\n",
" run.log('alpha', alpha)\n",
" run.log('mse', mse)\n",
" \n",
" model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)\n",
" with open(model_file_name, \"wb\") as file:\n",
" joblib.dump(value = reg, filename = 'outputs/' + model_file_name)\n",
"\n",
" print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Linux DSVM as a compute target\n",
"\n",
"**Note**: If creation fails with a message about Marketplace purchase eligibilty, go to portal.azure.com, start creating DSVM there, and select \"Want to create programmatically\" to enable programmatic creation. Once you've enabled it, you can exit without actually creating VM.\n",
" \n",
"**Note**: By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you switch to a different port (such as 5022), you can append the port number to the address like the example below. [Read more](../../documentation/sdk/ssh-issue.md) on this."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import DsvmCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"compute_target_name = 'mydsvm'\n",
"\n",
"try:\n",
" dsvm_compute = DsvmCompute(workspace = ws, name = compute_target_name)\n",
" print('found existing:', dsvm_compute.name)\n",
"except ComputeTargetException:\n",
" print('creating new.')\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2_v2\")\n",
" dsvm_compute = DsvmCompute.create(ws, name = compute_target_name, provisioning_configuration = dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure & Run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure a Docker run with new conda environment on the VM\n",
"You can execute in a Docker container in the VM. If you choose this route, you don't need to install anything on the VM yourself. Azure ML execution service will take care of it for you."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"\n",
"# Load the \"cpu-dsvm.runconfig\" file (created by the above attach operation) in memory\n",
"run_config = RunConfiguration(framework = \"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"run_config.target = compute_target_name\n",
"\n",
"# Use Docker in the remote VM\n",
"run_config.environment.docker.enabled = True\n",
"\n",
"# Use CPU base image from DockerHub\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"print('Base Docker image is:', run_config.environment.docker.base_image)\n",
"\n",
"# Ask system to provision a new one based on the conda_dependencies.yml file\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# Prepare the Docker and conda environment automatically when executingfor the first time.\n",
"run_config.prepare_environment = True\n",
"\n",
"# specify CondaDependencies obj\n",
"run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit the Experiment\n",
"Submit script to run in the Docker image in the remote VM. If you run this for the first time, the system will download the base image, layer in packages specified in the `conda_dependencies.yml` file on top of the base image, create a container and then execute the script in the container."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Run\n",
"from azureml.core import ScriptRunConfig\n",
"\n",
"src = ScriptRunConfig(source_directory = script_folder, script = 'train.py', run_config = run_config)\n",
"run = exp.submit(src)\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View run history details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find the best run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get all metris logged in the run\n",
"run.get_metrics()\n",
"metrics = run.get_metrics()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"print('When alpha is {1:0.2f}, we have min MSE {0:0.2f}.'.format(\n",
" min(metrics['mse']), \n",
" metrics['alpha'][np.argmin(metrics['mse'])]\n",
"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean up compute resource"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dsvm_compute.delete()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,39 @@
import os
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from azureml.core import Run
from sklearn.externals import joblib
import numpy as np
os.makedirs('./outputs', exist_ok=True)
X, y = load_diabetes(return_X_y=True)
run = Run.get_submitted_run()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
data = {"train": {"X": X_train, "y": y_train},
"test": {"X": X_test, "y": y_test}}
# list of numbers from 0.0 to 1.0 with a 0.05 interval
alphas = np.arange(0.0, 1.0, 0.05)
for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])

    preds = reg.predict(data["test"]["X"])
    mse = mean_squared_error(preds, data["test"]["y"])
    run.log('alpha', alpha)
    run.log('mse', mse)

    model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
    # save model to the outputs folder so it is automatically uploaded into the run record
    joblib.dump(value=reg, filename=os.path.join('outputs', model_file_name))

    print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))


@@ -0,0 +1,470 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 05. Train in Spark\n",
"* Create Workspace\n",
"* Create Project\n",
"* Create `train-spark.py` file in the project folder\n",
"* Execute a PySpark script in ACI.\n",
"* Execute a PySpark script in a Docker container on remote DSVM\n",
"* Execute a PySpark script in HDI"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Project and Associate with Run History\n",
"**Project** is a local folder that contains files for your Azure ML experiments. It is associated with a **run history**, a cloud container of run metrics and output artifacts from your experiments. You can either attach a local folder as a new project, or load a local folder as a project if it has been attached before."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# choose a name for the run history container in the workspace\n",
"experiment_name = 'train-on-spark'\n",
"\n",
"# project folder\n",
"project_folder = './sample_projects/train-on-spark'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.project.project import Project\n",
"\n",
"project = Project.attach(workspace_object = ws,\n",
" experiment_name = experiment_name,\n",
" directory = project_folder)\n",
"\n",
"print(project.project_directory, project.history.name, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Copy files\n",
"\n",
"\n",
"Copy `train-spark.py` and `iris.csv` into the project folde"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from shutil import copyfile\n",
"\n",
"# copy iris dataset in to project folder\n",
"copyfile('./iris.csv', os.path.join(project_folder, 'iris.csv'))\n",
"\n",
"# copy train-spark.py file into project folder\n",
"# train-spark.py trains a simple LogisticRegression model using Spark.ML algorithm\n",
"copyfile('./train-spark.py', os.path.join(project_folder, 'train-spark.py'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Review the train-spark.py file in the project folder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(os.path.join(project_folder, 'train-spark.py'), 'r') as fin:\n",
" print(fin.read())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure & Run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure ACI target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new runconfig object\n",
"run_config = RunConfiguration()\n",
"\n",
"# signal that you want to use ACI to execute script.\n",
"run_config.target = \"containerinstance\"\n",
"\n",
"# ACI container group is only supported in certain regions, which can be different than the region the Workspace is in.\n",
"run_config.container_instance.region = 'eastus'\n",
"\n",
"# set the ACI CPU and Memory \n",
"run_config.container_instance.cpu_cores = 1\n",
"run_config.container_instance.memory_gb = 2\n",
"\n",
"# enable Docker \n",
"run_config.environment.docker.enabled = True\n",
"\n",
"# set Docker base image to the default CPU-based image\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n",
"print('base image is', run_config.environment.docker.base_image)\n",
"#run_config.environment.docker.base_image = 'microsoft/mmlspark:plus-0.9.9'\n",
"\n",
"# use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
"# please update this file if you need additional packages.\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# auto-prepare the Docker image when used for execution (if it is not already prepared)\n",
"run_config.auto_prepare_environment = True\n",
"\n",
"cd = CondaDependencies()\n",
"# add numpy as a dependency\n",
"cd.add_conda_package('numpy')\n",
"# overwrite the default conda_dependencies.yml file\n",
"cd.save_to_file(base_directory = project_folder, conda_file_path='aml_config/conda_dependencies.yml')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run Spark job in ACI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time \n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.script_run_config import ScriptRunConfig\n",
"\n",
"experiment = Experiment(project_object.workspace_object, project_object.history.name)\n",
"script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Show the run in the web UI\n",
"**IMPORTANT**: Please use Chrome to navigate to the URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import helpers.py\n",
"import helpers\n",
"\n",
"# get the URL of the run history web page\n",
"print(helpers.get_run_history_url(run))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach a remote Linux VM\n",
"To use remote docker commpute target:\n",
" 1. Create a Linux DSVM in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n",
" 2. Enter the IP address, username and password below\n",
" \n",
"**Note**: the below example use port 5022. By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you switch to a different port (such as 5022), you can append the port number to the address like the example below. [Read more](../../documentation/sdk/ssh-issue.md) on this."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import RemoteCompute\n",
"\n",
"try:\n",
" # Attaches a remote docker on a remote vm as a compute target.\n",
" RemoteCompute.attach(workspace,name = \"cpu-dsvm\", username = \"ninghai\", \n",
" address = \"hai2.eastus2.cloudapp.azure.com:5022\", \n",
" ssh-port=22\n",
" password = \"<password>\"))\n",
"except UserErrorException as e:\n",
" print(\"Caught = {}\".format(e.message))\n",
" print(\"Compute config already attached.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure a Spark Docker run on the VM\n",
"Execute in the Spark engine in a Docker container in the VM. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the \"cpu-dsvm.runconfig\" file (created by the above attach operation) in memory\n",
"run_config = RunConfiguration.load(path = project_folder, name = \"cpu-dsvm\")\n",
"\n",
"# set framework to PySpark\n",
"run_config.framework = \"PySpark\"\n",
"\n",
"# Use Docker in the remote VM\n",
"run_config.environment.docker.enabled = True\n",
"\n",
"# Use the MMLSpark CPU based image.\n",
"# https://hub.docker.com/r/microsoft/mmlspark/\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n",
"print('base image is:', run_config.environment.docker.base_image)\n",
"\n",
"# signal use the user-managed environment\n",
"# do NOT provision a new one based on the conda.yml file\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# Prepare the Docker and conda environment automatically when execute for the first time.\n",
"run_config.auto_prepare_environment = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit the Experiment\n",
"Submit script to run in the Spark engine in the Docker container in the remote VM."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)\n",
"\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the URL of the run history web page\n",
"print(helpers.get_run_history_url(run))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach an HDI cluster\n",
"To use HDI commpute target:\n",
" 1. Create an Spark for HDI cluster in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n",
" 2. Enter the IP address, username and password below"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import HDInsightCompute\n",
"\n",
"try:\n",
" # Attaches a HDI cluster as a compute target.\n",
" HDInsightCompute.attach(ws, name = \"myhdi\",\n",
" username = \"ninghai\", \n",
" address = \"sparkhai-ssh.azurehdinsight.net\", \n",
" password = \"<pwd>\"))\n",
"except UserErrorException as e:\n",
" print(\"Caught = {}\".format(e.message))\n",
" print(\"Compute config already attached.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure HDI run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operaton above.\n",
"run_config = RunConfiguration.load(path = project_folder, name = 'myhdi')\n",
"\n",
"# ask system to prepare the conda environment automatically when executed for the first time\n",
"run_config.auto_prepare_environment = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit the script to HDI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)\n",
"\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the URL of the run history web page\n",
"print(helpers.get_run_history_url(run))"
]
},
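{
"cell_type": "markdown",
"metadata": {},
"source": [
"Besides metrics, a run also keeps the files it produced (for example anything written to its `./outputs` folder). A minimal sketch of listing and downloading them, assuming these run methods are available in your SDK version; the file name below is only a hypothetical example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# list the files stored with the run\n",
"print(run.get_file_names())\n",
"\n",
"# download one of them locally (hypothetical file name, adjust to what the list above shows)\n",
"# run.download_file(name = 'outputs/my_output.txt', output_file_path = 'my_output.txt')"
]
},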
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get all metris logged in the run\n",
"metrics = run.get_metrics()\n",
"print(metrics)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,150 @@
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica

View File

@@ -0,0 +1,92 @@
import numpy as np
import pyspark
import os
import urllib
import sys
from pyspark.sql.functions import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType
from azureml.core.run import Run
# initialize logger
run = Run.get_submitted_run()
# start Spark session
spark = pyspark.sql.SparkSession.builder.appName('Iris').getOrCreate()
# print runtime versions
print('****************')
print('Python version: {}'.format(sys.version))
print('Spark version: {}'.format(spark.version))
print('****************')
# load iris.csv into Spark dataframe
schema = StructType([
StructField("sepal-length", DoubleType()),
StructField("sepal-width", DoubleType()),
StructField("petal-length", DoubleType()),
StructField("petal-width", DoubleType()),
StructField("class", StringType())
])
data = spark.read.csv('iris.csv', header=False, schema=schema)
print("First 10 rows of Iris dataset:")
data.show(10)
# vectorize all numerical columns into a single feature column
feature_cols = data.columns[:-1]
assembler = pyspark.ml.feature.VectorAssembler(
inputCols=feature_cols, outputCol='features')
data = assembler.transform(data)
# convert text labels into indices
data = data.select(['features', 'class'])
label_indexer = pyspark.ml.feature.StringIndexer(
inputCol='class', outputCol='label').fit(data)
data = label_indexer.transform(data)
# only select the features and label column
data = data.select(['features', 'label'])
print("Reading for machine learning")
data.show(10)
# change regularization rate and you will likely get a different accuracy.
reg = 0.01
# load regularization rate from argument if present
if len(sys.argv) > 1:
reg = float(sys.argv[1])
# log regularization rate
run.log("Regularization Rate", reg)
# use Logistic Regression to train on the training set
train, test = data.randomSplit([0.70, 0.30])
lr = pyspark.ml.classification.LogisticRegression(regParam=reg)
model = lr.fit(train)
# predict on the test set
prediction = model.transform(test)
print("Prediction")
prediction.show(10)
# evaluate the accuracy of the model using the test set
evaluator = pyspark.ml.evaluation.MulticlassClassificationEvaluator(
metricName='accuracy')
accuracy = evaluator.evaluate(prediction)
print()
print('#####################################')
print('Regularization rate is {}'.format(reg))
print("Accuracy is {}".format(accuracy))
print('#####################################')
print()
# log accuracy
run.log('Accuracy', accuracy)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,52 @@
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
import os
import argparse
# Import Run from azureml.core,
from azureml.core.run import Run
parser = argparse.ArgumentParser()
parser.add_argument('--alpha', type=float, dest='alpha',
default=0.5, help='regularization strength')
args = parser.parse_args()
# Get handle of current run for logging and history purposes
run = Run.get_submitted_run()
X, y = load_diabetes(return_X_y=True)
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
x_train, x_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0)
data = {"train": {"x": x_train, "y": y_train},
"test": {"x": x_test, "y": y_test}}
alpha = args.alpha
print('alpha value is:', alpha)
reg = Ridge(alpha=alpha)
reg.fit(data["train"]["x"], data["train"]["y"])
print('Ridge model fitted.')
preds = reg.predict(data["test"]["x"])
mse = mean_squared_error(preds, data["test"]["y"])
# Log metrics
run.log("alpha", alpha)
run.log("mse", mse)
os.makedirs('./outputs', exist_ok=True)
model_file_name = "model.pkl"
# Save model as part of the run history
# joblib.dump writes the file itself, so save the model directly into the outputs folder
joblib.dump(reg, os.path.join('outputs', model_file_name))
print('Mean Squared Error is:', mse)

Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


View File

@@ -0,0 +1,103 @@
from __future__ import print_function
import tensorflow as tf
import numpy as np
import os
import json
import base64
from io import BytesIO
from PIL import Image
##############################################
# helper functions
##############################################
def build_model(x, y_, keep_prob):
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
x_image = tf.reshape(x, [-1, 28, 28, 1])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
return y_conv
def base64ToImg(base64ImgString):
if base64ImgString.startswith('b\''):
base64ImgString = base64ImgString[2:-1]
base64Img = base64ImgString.encode('utf-8')
decoded_img = base64.b64decode(base64Img)
img_buffer = BytesIO(decoded_img)
img = Image.open(img_buffer)
return img
##############################################
# API init() and run() methods
##############################################
def init():
global x, keep_prob, y_conv, sess
g = tf.Graph()
with g.as_default():
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
keep_prob = tf.placeholder(tf.float32)
y_conv = build_model(x, y_, keep_prob)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
model_dir = os.path.join('sample_projects', 'outputs')
saved_model_path = os.path.join(model_dir, 'model.ckpt')
sess = tf.Session(graph=g)
sess.run(init_op)
saver.restore(sess, saved_model_path)
def run(input_data):
img = base64ToImg(json.loads(input_data)['data'])
img_data = np.array(img, dtype=np.float32).flatten()
img_data.resize((1, 784))
y_pred = sess.run(y_conv, feed_dict={x: img_data, keep_prob: 1.0})
predicted_label = np.argmax(y_pred[0])
outJsonString = json.dumps({"label": str(predicted_label)})
return str(outJsonString)

View File

@@ -0,0 +1,151 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# Load MNIST Data
from tensorflow.examples.tutorials.mnist import input_data
import os
import argparse
from azureml.core.run import Run
# the following 10 lines can be removed once BUG# 241943 is fixed
def get_logger():
try:
return Run.get_submitted_run()
except Exception:
return LocalLogger()
class LocalLogger:
def log(self, key, value):
print("AML-Log:", key, value)
def build_model(x, y_, keep_prob):
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
x_image = tf.reshape(x, [-1, 28, 28, 1])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
return y_conv
def main():
# Get command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('--learning_rate', type=float,
default=0.0001, help='learning rate')
parser.add_argument('--minibatch_size', type=int,
default=50, help='minibatch size')
parser.add_argument('--keep_probability', type=float,
default=0.5, help='keep probability for dropout layer')
parser.add_argument('--num_iterations', type=int,
default=1000, help='number of iterations')
parser.add_argument('--output_dir', type=str, default='./outputs',
help='output directory to write checkpoints to')
args = parser.parse_args()
# log parameters
run_logger = get_logger()
run_logger.log("learning_rate", args.learning_rate)
run_logger.log("minibatch_size", args.minibatch_size)
run_logger.log("keep_probability", args.keep_probability)
run_logger.log("num_iterations", args.num_iterations)
# Load MNIST data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
sess = tf.InteractiveSession()
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
keep_prob = tf.placeholder(tf.float32)
y_conv = build_model(x, y_, keep_prob)
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(
args.learning_rate).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.global_variables_initializer())
for i in range(args.num_iterations):
batch = mnist.train.next_batch(args.minibatch_size)
if i % 100 == 0:
test_acc = accuracy.eval(
feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
train_accuracy = accuracy.eval(
feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
print("step %d, training accuracy %g, test accuracy, %g" %
(i, train_accuracy, test_acc))
# log test accuracy to AML
run_logger.log("Accuracy", float(test_acc))
run_logger.log("Iterations", i)
sess.run(train_step, feed_dict={
x: batch[0], y_: batch[1], keep_prob: args.keep_probability})
# Save the trained model
model_dir = args.output_dir
model_file = 'model.ckpt'
os.makedirs(model_dir, exist_ok=True)
saver = tf.train.Saver()
saver.save(sess, os.path.join(model_dir, model_file))
final_test_acc = sess.run(accuracy, feed_dict={
x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
run_logger.log("Accuracy", float(final_test_acc))
run_logger.log("Iterations", args.num_iterations)
print("test accuracy %g" % final_test_acc)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,420 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 10. Register Model, Create Image and Deploy Service\n",
"\n",
"This example shows how to deploy a web service in step-by-step fashion:\n",
"\n",
" 1. Register model\n",
" 2. Query versions of models and select one to deploy\n",
" 3. Create Docker image\n",
" 4. Query versions of images\n",
" 5. Deploy the image as web service\n",
" \n",
"**IMPORTANT**:\n",
" * This notebook requires you to first complete \"01.SDK-101-Train-and-Deploy-to-ACI.ipynb\" Notebook\n",
" \n",
"The 101 Notebook taught you how to deploy a web service directly from model in one step. This Notebook shows a more advanced approach that gives you more control over model versions and Docker image versions. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create workspace"
]
},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can add tags and descriptions to your models. Note you need to have a `sklearn_linreg_model.pkl` file in the current directory. This file is generated by the 01 notebook. The below call registers that file as a model with the same name `sklearn_linreg_model.pkl` in the workspace.\n",
"\n",
"Using tags, you can track useful information such as the name and version of the machine learning library used to train the model. Note that tags must be alphanumeric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"from azureml.core.model import Model\n",
"import sklearn\n",
"\n",
"library_version = \"sklearn\"+sklearn.__version__.replace(\".\",\"x\")\n",
"\n",
"model = Model.register(model_path = \"sklearn_regression_model.pkl\",\n",
" model_name = \"sklearn_regression_model.pkl\",\n",
" tags = {'area': \"diabetes\", 'type': \"regression\", 'version': library_version},\n",
" description = \"Ridge regression model to predict diabetes\",\n",
" workspace = ws)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can explore the registered models within your workspace and query by tag. Models are versioned. If you call the register_model command many times with same model name, you will get multiple versions of the model with increasing version numbers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"regression_models = ws.models(tags=['area'])\n",
"for m in regression_models:\n",
" print(\"Name:\", m.name,\"\\tVersion:\", m.version, \"\\tDescription:\", m.description, m.tags)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can pick a specific model to deploy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(model.name, model.description, model.version, sep = '\\t')"
]
},
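{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want a particular registered version instead of the object returned by the last `register` call, you can retrieve it by name and version. A minimal sketch, assuming version 1 of the model exists in your workspace:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# retrieve a specific registered version by name (version 1 is just an example)\n",
"specific_model = Model(ws, name = 'sklearn_regression_model.pkl', version = 1)\n",
"print(specific_model.name, specific_model.version)"
]
},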
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Docker Image"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Show `score.py`. Note that the `sklearn_regression_model.pkl` in the `get_model_path` call is referring to a model named `sklearn_linreg_model.pkl` registered under the workspace. It is NOT referenceing the local file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"from sklearn.externals import joblib\n",
"from sklearn.linear_model import Ridge\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" # note here \"sklearn_regression_model.pkl\" is the name of the model registered under\n",
" # this is a different behavior than before when the code is run locally, even though the code is the same.\n",
" model_path = Model.get_model_path('sklearn_regression_model.pkl')\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"# note you can pass in multiple rows for scoring\n",
"def run(raw_data):\n",
" try:\n",
" data = json.loads(raw_data)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"result\": result.tolist()})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies \n",
"\n",
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'])\n",
"\n",
"with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that following command can take few minutes. \n",
"\n",
"You can add tags and descriptions to images. Also, an image can contain multiple models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create image"
]
},
"outputs": [],
"source": [
"from azureml.core.image import Image, ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(runtime= \"python\",\n",
" execution_script=\"score.py\",\n",
" conda_file=\"myenv.yml\",\n",
" tags = {'area': \"diabetes\", 'type': \"regression\"},\n",
" description = \"Image with ridge regression model\")\n",
"\n",
"image = Image.create(name = \"myimage1\",\n",
" # this is the model object \n",
" models = [model],\n",
" image_config = image_config, \n",
" workspace = ws)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create image"
]
},
"outputs": [],
"source": [
"image.wait_for_creation(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"List images by tag and find out the detailed build log for debugging."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create image"
]
},
"outputs": [],
"source": [
"for i in Image.list(workspace = ws,tags = [\"area\"]):\n",
" print('{}(v.{} [{}]) stored at {} with build log {}'.format(i.name, i.version, i.creation_state, i.image_location, i.image_build_log_uri))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy image as web service on Azure Container Instance\n",
"\n",
"Note that the service creation can take few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {'area': \"diabetes\", 'type': \"regression\"}, \n",
" description = 'Predict diabetes using regression model')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"\n",
"aci_service_name = 'my-aci-service-2'\n",
"print(aci_service_name)\n",
"aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n",
" image = image,\n",
" name = aci_service_name,\n",
" workspace = ws)\n",
"aci_service.wait_for_deployment(True)\n",
"print(aci_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test web service"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the web service with some dummy input data to get a prediction."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"import json\n",
"\n",
"test_sample = json.dumps({'data': [\n",
" [1,2,3,4,5,6,7,8,9,10], \n",
" [10,9,8,7,6,5,4,3,2,1]\n",
"]})\n",
"test_sample = bytes(test_sample,encoding = 'utf8')\n",
"\n",
"prediction = aci_service.run(input_data = test_sample)\n",
"print(prediction)"
]
},
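{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also call the service over plain HTTP using its scoring URI. A minimal sketch, assuming the `requests` package is installed and the ACI service was deployed without authentication (the default):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"# the scoring endpoint exposed by the ACI service\n",
"print(aci_service.scoring_uri)\n",
"\n",
"headers = {'Content-Type': 'application/json'}\n",
"resp = requests.post(aci_service.scoring_uri, data = test_sample, headers = headers)\n",
"print(resp.text)"
]
},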
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete ACI to clean up"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"deploy service",
"aci"
]
},
"outputs": [],
"source": [
"aci_service.delete()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,335 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploying a web service to Azure Kubernetes Service (AKS)\n",
"This notebook shows the steps for deploying a service: registering a model, creating an image, provisioning a cluster (one time action), and deploying a service to it. \n",
"We then test and delete the service, image and model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"from azureml.core.image import Image\n",
"from azureml.core.model import Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"print(azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get workspace\n",
"Load existing workspace from the config file info."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Register the model\n",
"Register an existing trained model, add descirption and tags."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Register the model\n",
"from azureml.core.model import Model\n",
"model = Model.register(model_path = \"sklearn_regression_model.pkl\", # this points to a local file\n",
" model_name = \"sklearn_regression_model.pkl\", # this is the name the model is registered as\n",
" tags = {'area': \"diabetes\", 'type': \"regression\"},\n",
" description = \"Ridge regression model to predict diabetes\",\n",
" workspace = ws)\n",
"\n",
"print(model.name, model.description, model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create an image\n",
"Create an image using the registered model the script that will load and run the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy\n",
"from sklearn.externals import joblib\n",
"from sklearn.linear_model import Ridge\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" # note here \"sklearn_regression_model.pkl\" is the name of the model registered under\n",
" # this is a different behavior than before when the code is run locally, even though the code is the same.\n",
" model_path = Model.get_model_path('sklearn_regression_model.pkl')\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
"\n",
"# note you can pass in multiple rows for scoring\n",
"def run(raw_data):\n",
" try:\n",
" data = json.loads(raw_data)['data']\n",
" data = numpy.array(data)\n",
" result = model.predict(data)\n",
" except Exception as e:\n",
" result = str(e)\n",
" return json.dumps({\"result\": result.tolist()})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies \n",
"\n",
"myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'])\n",
"\n",
"with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(execution_script = \"score.py\",\n",
" runtime = \"python\",\n",
" conda_file = \"myenv.yml\",\n",
" description = \"Image with ridge regression model\",\n",
" tags = {'area': \"diabetes\", 'type': \"regression\"}\n",
" )\n",
"\n",
"image = ContainerImage.create(name = \"myimage1\",\n",
" # this is the model object\n",
" models = [model],\n",
" image_config = image_config,\n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Provision the AKS Cluster\n",
"This is a one time setup. You can reuse this cluster for multiple deployments after it has been created. If you delete the cluster or the resource group that contains it, then you would have to recreate it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use the default configuration (can also provide parameters to customize)\n",
"prov_config = AksCompute.provisioning_configuration()\n",
"\n",
"aks_name = 'my-aks-9' \n",
"# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace = ws, \n",
" name = aks_name, \n",
" provisioning_configuration = prov_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.wait_for_completion(show_output = True)\n",
"print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)"
]
},
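{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cluster above uses the default configuration. As a sketch, and assuming these parameters are supported by your SDK version, you can customize the node count and VM size when provisioning a new cluster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a sketch of a customized provisioning configuration (not used by the cells above)\n",
"custom_prov_config = AksCompute.provisioning_configuration(agent_count = 3,\n",
"                                                           vm_size = 'Standard_D3_v2')\n",
"# pass custom_prov_config to ComputeTarget.create the same way as the default configuration"
]
},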
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Optional step: Attach existing AKS cluster\n",
"\n",
"If you have existing AKS cluster in your Azure subscription, you can attach it to the Workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"# Use the default configuration (can also provide parameters to customize)\n",
"resource_id = '/subscriptions/92c76a2f-0e1c-4216-b65e-abf7a3f34c1e/resourcegroups/raymondsdk0604/providers/Microsoft.ContainerService/managedClusters/my-aks-0605d37425356b7d01'\n",
"\n",
"create_name='my-existing-aks' \n",
"# Create the cluster\n",
"aks_target = AksCompute.attach(workspace=ws, name=create_name, resource_id=resource_id)\n",
"# Wait for the operation to complete\n",
"aks_target.wait_for_completion(True)\n",
"'''"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploy web service to AKS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Set the web service configuration (using default here)\n",
"aks_config = AksWebservice.deploy_configuration()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_service_name ='aks-service-1'\n",
"\n",
"aks_service = Webservice.deploy_from_image(workspace = ws, \n",
" name = aks_service_name,\n",
" image = image,\n",
" deployment_config = aks_config,\n",
" deployment_target = aks_target)\n",
"aks_service.wait_for_deployment(show_output = True)\n",
"print(aks_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test the web service\n",
"We test the web sevice by passing data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"import json\n",
"\n",
"test_sample = json.dumps({'data': [\n",
" [1,2,3,4,5,6,7,8,9,10], \n",
" [10,9,8,7,6,5,4,3,2,1]\n",
"]})\n",
"test_sample = bytes(test_sample,encoding = 'utf8')\n",
"\n",
"prediction = aks_service.run(input_data = test_sample)\n",
"print(prediction)"
]
},
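{
"cell_type": "markdown",
"metadata": {},
"source": [
"AKS services are secured with authentication keys by default. A minimal sketch of calling the same service over plain HTTP, assuming the `requests` package is installed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"# retrieve the primary and secondary authentication keys for the service\n",
"primary_key, secondary_key = aks_service.get_keys()\n",
"\n",
"headers = {'Content-Type': 'application/json',\n",
"           'Authorization': 'Bearer ' + primary_key}\n",
"resp = requests.post(aks_service.scoring_uri, data = test_sample, headers = headers)\n",
"print(resp.text)"
]
},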
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Clean up\n",
"Delete the service, image and model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_service.delete()\n",
"image.delete()\n",
"model.delete()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,438 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Enabling Data Collection for Models in Production\n",
"With this notebook, you can learn how to collect input model data from your Azure Machine Learning service in an Azure Blob storage. Once enabled, this data collected gives you the opportunity:\n",
"\n",
"* Monitor data drifts as production data enters your model\n",
"* Make better decisions on when to retrain or optimize your model\n",
"* Retrain your model with the data collected\n",
"\n",
"## What data is collected?\n",
"* Model input data (voice, images, and video are not supported) from services deployed in Azure Kubernetes Cluster (AKS)\n",
"* Model predictions using production input data.\n",
"\n",
"**Note:** pre-aggregation or pre-calculations on this data are done by user and not included in this version of the product.\n",
"\n",
"## What is different compared to standard production deployment process?\n",
"1. Update scoring file.\n",
"2. Update yml file with new dependency.\n",
"3. Update aks configuration.\n",
"4. Build new image and deploy it. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Import your dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace, Run\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"from azureml.core.image import Image\n",
"from azureml.core.model import Model\n",
"\n",
"import azureml.core\n",
"print(azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Set up your configuration and create a workspace\n",
"Follow Notebook 00 instructions to do this.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Register Model\n",
"Register an existing trained model, add descirption and tags."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Register the model\n",
"from azureml.core.model import Model\n",
"model = Model.register(model_path = 'sklearn_regression_model.pkl', # this points to a local file\n",
" model_name = \"best_model\", # this is the name the model is registered as\n",
" tags = {'area': \"diabetes\", 'type': \"regression\"},\n",
" description = \"Ridge regression model to predict diabetes\",\n",
" workspace = ws)\n",
"\n",
"print(model.name, model.description, model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Update your scoring file with Data Collection\n",
"The file below, compared to the file used in notebook 11, has the following changes:\n",
"### a. Import the module\n",
"<code> from azureml.monitoring import ModelDataCollector </code>\n",
"### b. In your init function add:\n",
"<code> global inputs_dc, prediction_d\n",
" inputs_dc = ModelDataCollector(\"best_model\", identifier=\"inputs\", feature_names=[\"feat1\", \"feat2\", \"feat3\". \"feat4\", \"feat5\", \"Feat6\"])\n",
" prediction_dc = ModelDataCollector(\"best_model\", identifier=\"predictions\", feature_names=[\"prediction1\", \"prediction2\"])</code>\n",
" \n",
"* Identifier: Identifier is later used for building the folder structure in your Blob, it can be used to divide “raw” data versus “processed”.\n",
"* CorrelationId: is an optional parameter, you do not need to set it up if your model doesnt require it. Having a correlationId in place does help you for easier mapping with other data. (Examples include: LoanNumber, CustomerId, etc.)\n",
"* Feature Names: These need to be set up in the order of your features in order for them to have column names when the .csv is created.\n",
"\n",
"### c. In your run function add:\n",
"<code> inputs_dc.collect(data)\n",
" prediction_dc.collect(result) </code>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import pickle\n",
"import json\n",
"import numpy as np\n",
"from sklearn.externals import joblib\n",
"from sklearn.linear_model import Ridge\n",
"from azureml.core.model import Model\n",
"from azureml.monitoring import ModelDataCollector\n",
"import time\n",
"\n",
"def init():\n",
" global model\n",
" #print (\"model initialized\" + time.strftime(\"%H:%M:%S\"))\n",
" # note here \"best_model\" is the name of the model registered under the workspace\n",
" # this call should return the path to the model.pkl file on the local disk.\n",
" model_path = Model.get_model_path(model_name = 'best_model')\n",
" # deserialize the model file back into a sklearn model\n",
" model = joblib.load(model_path)\n",
" global inputs_dc, prediction_dc\n",
" # this setup will help us save our inputs under the \"inputs\" path in our Azure Blob\n",
" inputs_dc = ModelDataCollector(model_name=\"best_model\", identifier=\"inputs\", feature_names=[\"feat1\", \"feat2\", \"feat3\",\"feat4\", \"feat5\",\"feat6\"]) \n",
" # this setup will help us save our ipredictions under the \"predictions\" path in our Azure Blob\n",
" prediction_dc = ModelDataCollector(\"best_model\", identifier=\"predictions\", feature_names=[\"prediction1\", \"prediction2\"]) \n",
" \n",
"# note you can pass in multiple rows for scoring\n",
"def run(raw_data):\n",
" global inputs_dc, prediction_dc\n",
" try:\n",
" data = json.loads(raw_data)['data']\n",
" data = np.array(data)\n",
" result = model.predict(data)\n",
" inputs_dc.collect(data) #this call is saving our input data into our blob\n",
" prediction_dc.collect(result)#this call is saving our prediction data into our blob\n",
" return json.dumps({\"result\": result.tolist()})\n",
" except Exception as e:\n",
" result = str(e)\n",
" #print (result + time.strftime(\"%H:%M:%S\"))\n",
" return json.dumps({\"error\": result})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Update your myenv.yml file with the required module"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile myenv.yml\n",
"name: myenv\n",
"channels:\n",
" - defaults\n",
"dependencies:\n",
" - pip:\n",
" - numpy\n",
" - scikit-learn\n",
" # Required packages for AzureML execution, history, and data preparation.\n",
" - --extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/Preview/E7501C02541B433786111FE8E140CAA1\n",
" - azureml-core\n",
" - azureml-monitoring"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Create your new Image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(execution_script = \"score.py\",\n",
" runtime = \"python\",\n",
" conda_file = \"myenv.yml\",\n",
" description = \"Image with ridge regression model\",\n",
" tags = {'area': \"diabetes\", 'type': \"regression\"}\n",
" )\n",
"\n",
"image = ContainerImage.create(name = \"myimage1\",\n",
" # this is the model object\n",
" models = [model],\n",
" image_config = image_config,\n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(model.name, model.description, model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Deploy to AKS service\n",
"For this step you would need to have an AKS cluster setup (Notebook 11).\n",
"In this case we are attaching to a previously created service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"resource_id = '/subscriptions/92c76a2f-0e1c-4216-b65e-abf7a3f34c1e/resourcegroups/marthateresource_groupjw/providers/Microsoft.ContainerService/managedClusters/my-aks-colfb348092fd3a760'\n",
"create_name= 'my-existing-aks'\n",
"aks_target = AksCompute.attach(workspace = ws, \n",
" name = create_name, \n",
" resource_id=resource_id)\n",
"# Wait for the operation to complete\n",
"aks_target.wait_for_completion(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### a. Activate Data Collection and App Insights\n",
"In order to enable Data Collection and App Insights in your service you will need to update your AKS configuration file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Set the web service configuration (using default here)\n",
"aks_config = AksWebservice.deploy_configuration(collect_model_data=True, enable_app_insights=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### b. Deploy your service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_service_name ='aks-w-collv5'\n",
"\n",
"aks_service = Webservice.deploy_from_image(workspace = ws, \n",
" name = aks_service_name,\n",
" image = image,\n",
" deployment_config = aks_config,\n",
" deployment_target = aks_target\n",
" )\n",
"aks_service.wait_for_deployment(show_output = True)\n",
"print(aks_service.state)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Test your service and send some data\n",
"**Note**: It will take around 15 mins for your data to appear in your blob.\n",
"The data will appear in your Azure Blob following this format:\n",
"\n",
"/modeldata/subscriptionid/resourcegroupname/workspacename/webservicename/modelname/modelversion/identifier/year/month/day/data.csv "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"import json\n",
"\n",
"test_sample = json.dumps({'data': [\n",
" [1,2,3,4,54,6,7,8,88,10], \n",
" [10,9,8,37,36,45,4,33,2,1]\n",
"]})\n",
"test_sample = bytes(test_sample,encoding = 'utf8')\n",
"\n",
"prediction = aks_service.run(input_data = test_sample)\n",
"print(prediction)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_sample = json.dumps({'data': [\n",
" [1,22,3,4,5,68,7,98,95,310], \n",
" [10,92,8,7,6,53,84,23,323,1]\n",
"]})\n",
"test_sample = bytes(test_sample,encoding = 'utf8')\n",
"\n",
"prediction = aks_service.run(input_data = test_sample)\n",
"print(prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9. Validate you data and analyze it\n",
"You can look into your data following this path format in your Azure Blob:\n",
"\n",
"/modeldata/**subscriptionid>**/**resourcegroupname>**/**workspacename>**/**webservicename>**/**modelname>**/**modelversion>>**/**identifier>**/*year/month/day*/data.csv \n",
"\n",
"For doing further analysis you have multiple options:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### a. Create DataBricks cluster and connect it to your blob\n",
"https://docs.microsoft.com/en-us/azure/azure-databricks/quickstart-create-databricks-workspace-portal or in your databricks workspace you can look for the template \"Azure Blob Storage Import Example Notebook\".\n",
"\n",
"\n",
"Here is an example for setting up the file location to extract the relevant data:\n",
"\n",
"<code> file_location = \"wasbs://mycontainer@testmartstoragendbblgwy.blob.core.windows.net/unknown/unknown/unknown-bigdataset-unknown/my_iterate_parking_inputs/2018/&deg;/&deg;/data.csv\" \n",
"file_type = \"csv\"</code>\n"
]
},
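{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of reading the collected files into a Spark DataFrame inside a Databricks notebook, assuming blob access is configured as described above; the storage account name and path segments below are placeholders, and a header row in the collected .csv files is assumed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# run inside a Databricks notebook after configuring access to the storage account\n",
"# the path below is a placeholder; replace the account and path segments with your own\n",
"file_location = \"wasbs://modeldata@<yourstorageaccount>.blob.core.windows.net/<subscriptionid>/<resourcegroupname>/<workspacename>/<webservicename>/<modelname>/<modelversion>/inputs/*/*/*/data.csv\"\n",
"\n",
"df = (spark.read\n",
"      .format(\"csv\")\n",
"      .option(\"header\", \"true\")\n",
"      .option(\"inferSchema\", \"true\")\n",
"      .load(file_location))\n",
"display(df)"
]
},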
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### b. Connect Blob to Power Bi (Small Data only)\n",
"1. Download and Open PowerBi Desktop\n",
"2. Select “Get Data” and click on “Azure Blob Storage” >> Connect\n",
"3. Add your storage account and enter your storage key.\n",
"4. Select the container where your Data Collection is stored and click on Edit. \n",
"5. In the query editor, click under “Name” column and add your Storage account Model path into the filter. Note: if you want to only look into files from a specific year or month, just expand the filter path. For example, just look into March data: /modeldata/subscriptionid>/resourcegroupname>/workspacename>/webservicename>/modelname>/modelversion>/identifier>/year>/3\n",
"6. Click on the double arrow aside the “Content” column to combine the files. \n",
"7. Click OK and the data will preload.\n",
"8. You can now click Close and Apply and start building your custom reports on your Model Input data."
]
},
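{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick local alternative, here is a minimal sketch for inspecting one of the collected files with pandas. It assumes you have already downloaded a `data.csv` from the path above into the notebook's working directory (for example with Azure Storage Explorer); the file name is a placeholder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: assumes a collected data.csv has been downloaded locally first\n",
"import pandas as pd\n",
"\n",
"collected = pd.read_csv('data.csv')\n",
"print(collected.shape)\n",
"collected.head()"
]
},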
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Disable Data Collection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service.update(collect_model_data=False)"
]
},
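{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to turn collection back on later, the same `update()` call works with the flag flipped (shown here commented out so it does not run accidentally)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Re-enable model data collection on the same service if needed\n",
"# aks_service.update(collect_model_data=True)"
]
},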
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,529 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.core import Workspace, Run, Experiment\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')\n",
"\n",
"# Also create a Project and attach to Workspace\n",
"project_folder = \"sample_projects\"\n",
"run_history_name = project_folder\n",
"\n",
"if not os.path.isdir(project_folder):\n",
" os.mkdir(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import BatchAiCompute, ComputeTarget\n",
"from azureml.core.datastore import Datastore\n",
"from azureml.data.data_reference import DataReference\n",
"from azureml.pipeline.core import Pipeline, PipelineData\n",
"from azureml.pipeline.steps import PythonScriptStep\n",
"from azureml.core.runconfig import CondaDependencies, RunConfiguration"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create and attach Compute targets\n",
"Use the below code to create and attach Compute targets. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Batch AI compute\n",
"cluster_name = \"gpu_cluster\"\n",
"try:\n",
" cluster = BatchAiCompute(ws, cluster_name)\n",
" print(\"found existing cluster.\")\n",
"except:\n",
" print(\"creating new cluster\")\n",
" provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\",\n",
" autoscale_enabled = True,\n",
" cluster_min_nodes = 0, \n",
" cluster_max_nodes = 1)\n",
"\n",
" # create the cluster\n",
" cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)\n",
" cluster.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Python scripts to run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Python script that runs the batch scoring. `batchai_score.py` takes input images from `dataset_path`, a pretrained model from `model_dir`, and writes a `result-labels.txt` file to `output_dir`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/batchai_score.py\n",
"import os\n",
"import argparse\n",
"import datetime,time\n",
"import tensorflow as tf\n",
"from math import ceil\n",
"import numpy as np\n",
"import shutil\n",
"from tensorflow.contrib.slim.python.slim.nets import inception_v3\n",
"\n",
"slim = tf.contrib.slim\n",
"\n",
"parser = argparse.ArgumentParser(description=\"Start a tensorflow model serving\")\n",
"parser.add_argument('--model_dir', dest=\"model_dir\", required=True)\n",
"parser.add_argument('--dataset_path', dest=\"dataset_path\", required=True)\n",
"parser.add_argument('--output_dir', dest=\"output_dir\", required=True)\n",
"parser.add_argument('--batch_size', dest=\"batch_size\", type=int, required=True)\n",
"\n",
"args = parser.parse_args()\n",
"\n",
"image_size = 299\n",
"num_channel = 3\n",
"\n",
"# create output directory if it does not exist\n",
"os.makedirs(args.output_dir, exist_ok=True)\n",
"\n",
"def get_class_label_dict(label_file):\n",
" label = []\n",
" proto_as_ascii_lines = tf.gfile.GFile(label_file).readlines()\n",
" for l in proto_as_ascii_lines:\n",
" label.append(l.rstrip())\n",
" return label\n",
"\n",
"\n",
"class DataIterator:\n",
" def __init__(self, data_dir):\n",
" self.file_paths = []\n",
" image_list = os.listdir(data_dir)\n",
" total_size = len(image_list)\n",
" self.file_paths = [data_dir + '/' + file_name.rstrip() for file_name in image_list ]\n",
"\n",
" self.labels = [1 for file_name in self.file_paths]\n",
"\n",
" @property\n",
" def size(self):\n",
" return len(self.labels)\n",
"\n",
" def input_pipeline(self, batch_size):\n",
" images_tensor = tf.convert_to_tensor(self.file_paths, dtype=tf.string)\n",
" labels_tensor = tf.convert_to_tensor(self.labels, dtype=tf.int64)\n",
" input_queue = tf.train.slice_input_producer([images_tensor, labels_tensor], shuffle=False)\n",
" labels = input_queue[1]\n",
" images_content = tf.read_file(input_queue[0])\n",
"\n",
" image_reader = tf.image.decode_jpeg(images_content, channels=num_channel, name=\"jpeg_reader\")\n",
" float_caster = tf.cast(image_reader, tf.float32)\n",
" new_size = tf.constant([image_size, image_size], dtype=tf.int32)\n",
" images = tf.image.resize_images(float_caster, new_size)\n",
" images = tf.divide(tf.subtract(images, [0]), [255])\n",
"\n",
" image_batch, label_batch = tf.train.batch([images, labels], batch_size=batch_size, capacity=5 * batch_size)\n",
" return image_batch\n",
"\n",
"def main(_):\n",
" start_time = datetime.datetime.now()\n",
" label_file_name = os.path.join(args.model_dir, \"labels.txt\")\n",
" label_dict = get_class_label_dict(label_file_name)\n",
" classes_num = len(label_dict)\n",
" test_feeder = DataIterator(data_dir=args.dataset_path)\n",
" total_size = len(test_feeder.labels)\n",
" count = 0\n",
" with tf.Session() as sess:\n",
" test_images = test_feeder.input_pipeline(batch_size=args.batch_size)\n",
" with slim.arg_scope(inception_v3.inception_v3_arg_scope()):\n",
" input_images = tf.placeholder(tf.float32, [args.batch_size, image_size, image_size, num_channel])\n",
" logits, _ = inception_v3.inception_v3(input_images,\n",
" num_classes=classes_num,\n",
" is_training=False)\n",
" probabilities = tf.argmax(logits, 1)\n",
"\n",
" sess.run(tf.global_variables_initializer())\n",
" sess.run(tf.local_variables_initializer())\n",
" coord = tf.train.Coordinator()\n",
" threads = tf.train.start_queue_runners(sess=sess, coord=coord)\n",
" saver = tf.train.Saver()\n",
" model_path = os.path.join(args.model_dir, \"inception_v3.ckpt\")\n",
" saver.restore(sess, model_path)\n",
" out_filename = os.path.join(args.output_dir, \"result-labels.txt\")\n",
" with open(out_filename, \"w\") as result_file:\n",
" i = 0\n",
" while count < total_size and not coord.should_stop():\n",
" test_images_batch = sess.run(test_images)\n",
" file_names_batch = test_feeder.file_paths[i*args.batch_size: min(test_feeder.size, (i+1)*args.batch_size)]\n",
" results = sess.run(probabilities, feed_dict={input_images: test_images_batch})\n",
" new_add = min(args.batch_size, total_size-count)\n",
" count += new_add\n",
" i += 1\n",
" for j in range(new_add):\n",
" result_file.write(os.path.basename(file_names_batch[j]) + \": \" + label_dict[results[j]] + \"\\n\")\n",
" result_file.flush()\n",
" coord.request_stop()\n",
" coord.join(threads)\n",
" \n",
" # copy the file to artifacts\n",
" shutil.copy(out_filename, \"./outputs/\")\n",
" # Move the processed data out of the blob so that the next run can process the data.\n",
"\n",
"if __name__ == \"__main__\":\n",
" tf.app.run()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"account_name = \"pipelinedata\"\n",
"sample_data = Datastore.register_azure_blob_container(ws, \"sampledata\", \"sampledata\", \n",
" account_name=account_name, \n",
" overwrite=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Output datastore"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We write the outputs to the default datastore"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"default_ds = \"workspaceblobstore\""
]
},
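{
"cell_type": "markdown",
"metadata": {},
"source": [
"`PipelineData` below only needs the datastore name, but if you want to work with the datastore object itself you can fetch it from the workspace. This is an optional sketch; it assumes the workspace still has its default blob datastore named `workspaceblobstore`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: retrieve the default datastore object and confirm its name\n",
"default_datastore = ws.get_default_datastore()\n",
"print(default_datastore.name)"
]
},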
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Specify where the data is stored or will be written to"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.data.data_reference import DataReference\n",
"from azureml.pipeline.core import Pipeline, PipelineData\n",
"from azureml.core import Datastore\n",
"from azureml.core import Experiment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"input_images = DataReference(datastore=sample_data, \n",
" data_reference_name=\"input_images\",\n",
" path_on_datastore=\"batchscoring/images\",\n",
" mode=\"download\"\n",
" )\n",
"model_dir = DataReference(datastore=sample_data, \n",
" data_reference_name=\"input_model\",\n",
" path_on_datastore=\"batchscoring/models\",\n",
" mode=\"download\" \n",
" )\n",
"output_dir = PipelineData(name=\"scores\", \n",
" datastore_name=default_ds, \n",
" output_path_on_compute=\"batchscoring/results\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Specify environment to run the script"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cd = CondaDependencies.create(pip_packages=[\"tensorflow-gpu==1.4.0\", \"azureml-defaults\"])\n",
"\n",
"# Runconfig\n",
"batchai_run_config = RunConfiguration(conda_dependencies=cd)\n",
"batchai_run_config.environment.docker.enabled = True\n",
"batchai_run_config.environment.docker.gpu_support = True\n",
"batchai_run_config.environment.docker.base_image = \"microsoft/mmlspark:gpu-0.12\"\n",
"batchai_run_config.environment.spark.precache_packages = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Steps to run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"step = PythonScriptStep(\n",
" name=\"batch ai scoring\",\n",
" script_name=\"batchai_score.py\",\n",
" arguments=[\"--dataset_path\", input_images, \"--model_dir\", model_dir, \"--output_dir\", output_dir, \"--batch_size\", 20],\n",
" target=cluster,\n",
" inputs=[input_images, model_dir],\n",
" outputs=[output_dir],\n",
" runconfig=batchai_run_config,\n",
" source_directory=project_folder,\n",
" allow_reuse=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline = Pipeline(workspace=ws, steps=[step])\n",
"pipeline_run = Experiment(ws, 'batch_scoring').submit(pipeline)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Monitor run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.widgets import RunDetails\n",
"RunDetails(pipeline_run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"node_run = list(pipeline_run.get_children())[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"node_run.download_file(\"./outputs/result-labels.txt\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Display a few results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\"result-labels.txt\", delimiter=\":\", header=None)\n",
"df.columns = [\"Filename\", \"Prediction\"]\n",
"df.head()"
]
},
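{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, a quick look at how the predicted labels are distributed across the scored images (a small sketch on top of the `df` loaded above)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count how many images were assigned to each predicted label\n",
"df[\"Prediction\"].value_counts().head(10)"
]
},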
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create template and rerun the pipeline using a REST call"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create template"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"template = pipeline_run.create_template(name=\"batch score\", description=\"scores images kept in container sampledata\",\n",
" version=\"1.0\")\n",
"template_id = template.template_id"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Rerun using REST call"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get AAD token"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.authentication import AzureCliAuthentication\n",
"import requests\n",
"\n",
"cli_auth = AzureCliAuthentication()\n",
"aad_token = cli_auth.get_authentication_header()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hit the REST endpoint"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.pipeline.core import Template\n",
"\n",
"rest_endpoint = Template.get_template_endpoint(template_id, ws)\n",
"response = requests.post(rest_endpoint, headers=aad_token, json={})\n",
"run_id = response.json()[\"Id\"]"
]
},
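{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before monitoring the new run, it can help to sanity-check the REST response. This is just a defensive sketch around the `requests` call above; it raises if the endpoint returned an error status."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fail fast if the REST call did not succeed, and show what came back\n",
"response.raise_for_status()\n",
"print(\"Status code:\", response.status_code)\n",
"print(\"Submitted run id:\", run_id)"
]
},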
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monitor the template run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.pipeline.core.run import PipelineRun\n",
"template_run = PipelineRun(ws.experiments()[\"batch_scoring\"], run_id)\n",
"\n",
"RunDetails(template_run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,502 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 40. Tensorboard Integration with Run History\n",
"\n",
"1. Run a Tensorflow job locally and view its TB output live.\n",
"2. The same, for a DSVM.\n",
"3. And once more, with Batch AI.\n",
"4. Finally, we'll collect all of these historical runs together into a single Tensorboard graph."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set experiment name and create project\n",
"Choose a name for your run history container in the workspace, and create a folder for the project."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from os import path, makedirs\n",
"experiment_name = 'tensorboard-demo'\n",
"\n",
"# experiment folder\n",
"exp_dir = './sample_projects/' + experiment_name\n",
"\n",
"if not path.exists(exp_dir):\n",
" makedirs(exp_dir)\n",
"\n",
"# runs we started in this session, for the finale\n",
"runs = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Tensorflow Tensorboard demo code\n",
"\n",
"Tensorflow's repository has an MNIST demo with extensive Tensorboard instrumentation. We'll use it here for our purposes.\n",
"\n",
"Note that we don't need to make any code changes at all - the code works without modification from the Tensorflow repository."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import os\n",
"import tempfile\n",
"tf_code = requests.get(\"https://raw.githubusercontent.com/tensorflow/tensorflow/r1.8/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py\")\n",
"with open(os.path.join(exp_dir, \"mnist_with_summaries.py\"), \"w\") as file:\n",
" file.write(tf_code.text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure and run locally\n",
"\n",
"We'll start by running this locally. While it might not initially seem that useful to use this for a local run - why not just run TB against the files generated locally? - even in this case there is some value to using this feature. Your local run will be registered in the run history, and your Tensorboard logs will be uploaded to the artifact store associated with this run. Later, you'll be able to restore the logs from any run, regardless of where it happened.\n",
"\n",
"Note that for this run, you will need to install Tensorflow on your local machine by yourself. Further, the Tensorboard module (that is, the one included with Tensorflow) must be accessible to this notebook's kernel, as the local machine is what runs Tensorboard."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"\n",
"# Create a run configuration.\n",
"run_config = RunConfiguration()\n",
"run_config.environment.python.user_managed_dependencies = True\n",
"\n",
"# You can choose a specific Python environment by pointing to a Python path \n",
"#run_config.environment.python.interpreter_path = '/home/ninghai/miniconda3/envs/sdk2/bin/python'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment, Run\n",
"from azureml.core.script_run_config import ScriptRunConfig\n",
"import tensorflow as tf\n",
"\n",
"logs_dir = os.curdir + os.sep + \"logs\"\n",
"tensorflow_logs_dir = os.path.join(logs_dir, \"tensorflow\")\n",
"\n",
"if not path.exists(tensorflow_logs_dir):\n",
" makedirs(tensorflow_logs_dir)\n",
"\n",
"os.environ[\"TEST_TMPDIR\"] = logs_dir\n",
"\n",
"# Writing logs to ./logs results in their being uploaded to Artifact Service,\n",
"# and thus, made accessible to our Tensorboard instance.\n",
"arguments_list = [\"--log_dir\", logs_dir]\n",
"\n",
"# Create an experiment\n",
"exp = Experiment(ws, experiment_name)\n",
"\n",
"script = ScriptRunConfig(exp_dir,\n",
" script=\"mnist_with_summaries.py\",\n",
" run_config=run_config)\n",
"\n",
"# If you would like the run to go for longer, add --max_steps 5000 to the arguments list:\n",
"# arguments_list += [\"--max_steps\", \"5000\"]\n",
"kwargs = {}\n",
"kwargs['arguments_list'] = arguments_list\n",
"run = exp.submit(script, kwargs)\n",
"# You can also wait for the run to complete\n",
"# run.wait_for_completion(show_output=True)\n",
"runs.append(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Start Tensorboard\n",
"\n",
"Now, while the run is in progress, we just need to start Tensorboard with the run as its target, and it will begin streaming logs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.contrib.tensorboard import Tensorboard\n",
"\n",
"# The Tensorboard constructor takes an array of runs, so be sure and pass it in as a single-element array here\n",
"tb = Tensorboard([run])\n",
"\n",
"# If successful, start() returns a string with the URI of the instance.\n",
"tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stop Tensorboard\n",
"\n",
"When you're done, make sure to call the `stop()` method of the Tensorboard object, or it will stay running even after your job completes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tb.stop()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Now, with a DSVM\n",
"\n",
"Tensorboard uploading works with all compute targets. Here we demonstrate it from a DSVM.\n",
"Note that the Tensorboard instance itself will be run by the notebook kernel. Again, this means this notebook's kernel must have access to the Tensorboard module.\n",
"\n",
"If you are unfamiliar with DSVM configuration, check [04. Train in a remote VM (Ubuntu DSVM)](04.train-on-remote-vm.ipynb) for a more detailed breakdown."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import DsvmCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"compute_target_name = 'cpu-dsvm'\n",
"\n",
"try:\n",
" compute_target = DsvmCompute(workspace = ws, name = compute_target_name)\n",
" print('found existing:', compute_target.name)\n",
"except ComputeTargetException:\n",
" print('creating new.')\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2_v2\")\n",
" compute_target = DsvmCompute.create(ws, name = compute_target_name, provisioning_configuration = dsvm_config)\n",
" compute_target.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Submit run using TensorFlow estimator\n",
"\n",
"Instead of manually configuring the DSVM environment, we can use the TensorFlow estimator and everything is set up automatically."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.dnn import TensorFlow\n",
"\n",
"script_params = {\"--log_dir\": \"./logs\"}\n",
"\n",
"# If you want the run to go longer, set --max-steps to a higher number.\n",
"# script_params[\"--max_steps\"] = \"5000\"\n",
"\n",
"tf_estimator = TensorFlow(source_directory=exp_dir,\n",
" compute_target=compute_target,\n",
" entry_script='mnist_with_summaries.py',\n",
" script_params=script_params)\n",
"\n",
"run = exp.submit(tf_estimator)\n",
"\n",
"runs.append(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Start Tensorboard with this run\n",
"\n",
"Just like before."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The Tensorboard constructor takes an array of runs, so be sure and pass it in as a single-element array here\n",
"tb = Tensorboard([run])\n",
"\n",
"# If successful, start() returns a string with the URI of the instance.\n",
"tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stop Tensorboard\n",
"\n",
"When you're done, make sure to call the `stop()` method of the Tensorboard object, or it will stay running even after your job completes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tb.stop()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Once more, with a Batch AI cluster\n",
"\n",
"Just to prove we can, let's create a Batch AI cluster using MLC, and run our demo there, as well."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import BatchAiCompute\n",
"\n",
"clust_name = ws.name + \"cpu\"\n",
"\n",
"try:\n",
" # If you already have a cluster named this, we don't need to make a new one.\n",
" compute_target = [ct for ct in ws.compute_targets() if ct.name == clust_name and ct.type == 'BatchAI'][0]\n",
"except:\n",
" # Let's make a new one here.\n",
" provisioning_config = BatchAiCompute.provisioning_configuration(cluster_max_nodes=2, \n",
" autoscale_enabled=True, \n",
" cluster_min_nodes=1,\n",
" vm_size='Standard_D11_V2')\n",
" \n",
" compute_target = BatchAiCompute.create(ws, clust_name, provisioning_config)\n",
" compute_target.wait_for_completion(show_output=True, min_node_count=1, timeout_in_minutes=20)\n",
"print(compute_target.name)\n",
" # For a more detailed view of current BatchAI cluster status, use the 'status' property \n",
" # print(compute_target.status.serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Submit run using TensorFlow estimator\n",
"\n",
"Again, we can use the TensorFlow estimator and everything is set up automatically."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_params = {\"--log_dir\": \"./logs\"}\n",
"\n",
"# If you want the run to go longer, set --max-steps to a higher number.\n",
"# script_params[\"--max_steps\"] = \"5000\"\n",
"\n",
"tf_estimator = TensorFlow(source_directory=exp_dir,\n",
" compute_target=compute_target,\n",
" entry_script='mnist_with_summaries.py',\n",
" script_params=script_params)\n",
"\n",
"run = exp.submit(tf_estimator)\n",
"\n",
"runs.append(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Start Tensorboard with this run\n",
"\n",
"Once more..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The Tensorboard constructor takes an array of runs, so be sure and pass it in as a single-element array here\n",
"tb = Tensorboard([run])\n",
"\n",
"# If successful, start() returns a string with the URI of the instance.\n",
"tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stop Tensorboard\n",
"\n",
"When you're done, make sure to call the `stop()` method of the Tensorboard object, or it will stay running even after your job completes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tb.stop()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Finale\n",
"\n",
"If you've paid close attention, you'll have noticed that we've been saving the run objects in an array as we went along. We can start a Tensorboard instance that combines all of these run objects into a single process. This way, you can compare historical runs. You can even do this with live runs; if you made some of those previous runs longer via the `--max_steps` parameter, they might still be running, and you'll see them live in this instance as well."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The Tensorboard constructor takes an array of runs...\n",
"# and it turns out that we have been building one of those all along.\n",
"tb = Tensorboard(runs)\n",
"\n",
"# If successful, start() returns a string with the URI of the instance.\n",
"tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stop Tensorboard\n",
"\n",
"As you might already know, make sure to call the `stop()` method of the Tensorboard object, or it will stay running (until you kill the kernel associated with this notebook, at least)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tb.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 41. Export Run History as Tensorboard logs\n",
"\n",
"1. Run some training and log some metrics into Run History\n",
"2. Export the run history to some directory as Tensorboard logs\n",
"3. Launch a local Tensorboard to view the run history"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace, Run, Experiment\n",
"\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set experiment name and start the run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = 'export-to-tensorboard'\n",
"exp = Experiment(ws, experiment_name)\n",
"root_run = exp.start_logging()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load diabetes dataset, a well-known built-in small dataset that comes with scikit-learn\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X, y = load_diabetes(return_X_y=True)\n",
"\n",
"columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']\n",
"\n",
"x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
"data = {\n",
" \"train\":{\"x\":x_train, \"y\":y_train}, \n",
" \"test\":{\"x\":x_test, \"y\":y_test}\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example experiment\n",
"from tqdm import tqdm\n",
"\n",
"alphas = [.1, .2, .3, .4, .5, .6 , .7]\n",
"\n",
"# try a bunch of alpha values in a Linear Regression (Ridge) model\n",
"for alpha in tqdm(alphas):\n",
" # create a bunch of child runs\n",
" with root_run.child_run(\"alpha\" + str(alpha)) as run:\n",
" # More data science stuff\n",
" reg = Ridge(alpha=alpha)\n",
" reg.fit(data[\"train\"][\"x\"], data[\"train\"][\"y\"])\n",
" # TODO save model\n",
" preds = reg.predict(data[\"test\"][\"x\"])\n",
" mse = mean_squared_error(preds, data[\"test\"][\"y\"])\n",
" # End train and eval\n",
"\n",
" # log alpha, mean_squared_error and feature names in run history\n",
" root_run.log(\"alpha\", alpha)\n",
" root_run.log(\"mse\", mse)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Export Run History to Tensorboard logs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export Run History to Tensorboard logs\n",
"from azureml.contrib.tensorboard.export import export_to_tensorboard\n",
"import os\n",
"import tensorflow as tf\n",
"\n",
"logdir = 'exportedTBlogs'\n",
"log_path = os.path.join(os.getcwd(), logdir)\n",
"try:\n",
" os.stat(log_path)\n",
"except os.error:\n",
" os.mkdir(log_path)\n",
"print(logdir)\n",
"\n",
"# export run history for the project\n",
"export_to_tensorboard(root_run, logdir)\n",
"\n",
"# or export a particular run\n",
"# export_to_tensorboard(run, logdir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"root_run.complete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Start Tensorboard\n",
"\n",
"You can also start Tensorboard outside this notebook to view the result."
]
},
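{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, assuming TensorFlow (which bundles Tensorboard) is installed locally, you can run `tensorboard --logdir exportedTBlogs --port 6006` from a terminal and open the printed URL in your browser."
]
},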
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.contrib.tensorboard import Tensorboard\n",
"\n",
"# The Tensorboard constructor takes an array of runs; here we pass an empty array and point it at the exported logs via local_root\n",
"tb = Tensorboard([], local_root=logdir, port=6006)\n",
"\n",
"# If successful, start() returns a string with the URI of the instance.\n",
"tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stop Tensorboard\n",
"\n",
"When you're done, make sure to call the `stop()` method of the Tensorboard object."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tb.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,500 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 50. Distributed Tensorflow Horovod\n",
"\n",
"In this tutorial we demonstrate how to use the Azure ML Training SDK to train a TensorFlow model in a distributed manner using the Horovod framework.\n",
"\n",
"# Prerequisites\n",
"\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"from azureml.core.experiment import Experiment\n",
"\n",
"username = getpass.getuser().replace('-','')\n",
"\n",
"# choose a name for the run history container in the workspace\n",
"experiment = Experiment(ws, username + '-horovod')\n",
"\n",
"# project folder name\n",
"project_folder = './samples/distributed-tensorflow-horovod'\n",
"os.makedirs(project_folder, exist_ok = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This recipe uses an MLC-managed Batch AI cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import BatchAiCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"batchai_cluster_name='gpucluster'\n",
"\n",
"\n",
"try:\n",
" # Check for existing cluster\n",
" compute_target = ComputeTarget(ws,batchai_cluster_name)\n",
" print('Found existing compute target')\n",
"except:\n",
" # Else, create new one\n",
" print('Creating a new compute target...')\n",
" provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n",
" #vm_priority = 'lowpriority', # optional\n",
" autoscale_enabled = True,\n",
" cluster_min_nodes = 0, \n",
" cluster_max_nodes = 4)\n",
" compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n",
" # can poll for a minimum number of nodes and for a specific timeout. \n",
" # if no min node count is provided it will use the scale settings for the cluster\n",
" compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
" # For a more detailed view of current BatchAI cluster status, use the 'status' property \n",
"print(compute_target.status.serialize())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile {project_folder}/word2vec.py\n",
"\n",
"# Copyright 2015 The TensorFlow Authors. All Rights Reserved.\n",
"# Modifications copyright (C) 2017 Uber Technologies, Inc.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# ==============================================================================\n",
"\"\"\"Basic word2vec example.\"\"\"\n",
"\n",
"from __future__ import absolute_import\n",
"from __future__ import division\n",
"from __future__ import print_function\n",
"\n",
"import collections\n",
"import math\n",
"import os\n",
"import random\n",
"import zipfile\n",
"import argparse\n",
"\n",
"import numpy as np\n",
"from six.moves import urllib\n",
"from six.moves import xrange # pylint: disable=redefined-builtin\n",
"import tensorflow as tf\n",
"import horovod.tensorflow as hvd\n",
"from azureml.core.run import Run\n",
"\n",
"# Horovod: initialize Horovod.\n",
"hvd.init()\n",
"\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument('--data_dir', type=str, help='input directory')\n",
"\n",
"args = parser.parse_args()\n",
"\n",
"data_dir = args.data_dir\n",
"print(\"the input data_dir is %s\" % data_dir)\n",
"\n",
"# Step 1: Download the data.\n",
"url = 'http://mattmahoney.net/dc/text8.zip'\n",
"\n",
"\n",
"def maybe_download(filename, expected_bytes):\n",
" \"\"\"Download a file if not present, and make sure it's the right size.\"\"\"\n",
" if not filename:\n",
" filename = \"text8.zip\"\n",
" if not os.path.exists(filename):\n",
" print(\"Downloading the data from http://mattmahoney.net/dc/text8.zip\")\n",
" filename, _ = urllib.request.urlretrieve(url, filename)\n",
" else:\n",
" print(\"Use the data from the input data_dir %s\" % data_dir)\n",
" statinfo = os.stat(filename)\n",
" if statinfo.st_size == expected_bytes:\n",
" print('Found and verified', filename)\n",
" else:\n",
" print(statinfo.st_size)\n",
" raise Exception(\n",
" 'Failed to verify ' + url + '. Can you get to it with a browser?')\n",
" return filename\n",
"\n",
"filename = maybe_download(data_dir, 31344016)\n",
"\n",
"\n",
"# Read the data into a list of strings.\n",
"def read_data(filename):\n",
" \"\"\"Extract the first file enclosed in a zip file as a list of words.\"\"\"\n",
" with zipfile.ZipFile(filename) as f:\n",
" data = tf.compat.as_str(f.read(f.namelist()[0])).split()\n",
" return data\n",
"\n",
"vocabulary = read_data(filename)\n",
"print('Data size', len(vocabulary))\n",
"\n",
"# Step 2: Build the dictionary and replace rare words with UNK token.\n",
"vocabulary_size = 50000\n",
"\n",
"\n",
"def build_dataset(words, n_words):\n",
" \"\"\"Process raw inputs into a dataset.\"\"\"\n",
" count = [['UNK', -1]]\n",
" count.extend(collections.Counter(words).most_common(n_words - 1))\n",
" dictionary = dict()\n",
" for word, _ in count:\n",
" dictionary[word] = len(dictionary)\n",
" data = list()\n",
" unk_count = 0\n",
" for word in words:\n",
" if word in dictionary:\n",
" index = dictionary[word]\n",
" else:\n",
" index = 0 # dictionary['UNK']\n",
" unk_count += 1\n",
" data.append(index)\n",
" count[0][1] = unk_count\n",
" reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
" return data, count, dictionary, reversed_dictionary\n",
"\n",
"data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,\n",
" vocabulary_size)\n",
"del vocabulary # Hint to reduce memory.\n",
"print('Most common words (+UNK)', count[:5])\n",
"print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])\n",
"\n",
"\n",
"# Step 3: Function to generate a training batch for the skip-gram model.\n",
"def generate_batch(batch_size, num_skips, skip_window):\n",
" assert num_skips <= 2 * skip_window\n",
" # Adjust batch_size to match num_skips\n",
" batch_size = batch_size // num_skips * num_skips\n",
" span = 2 * skip_window + 1 # [ skip_window target skip_window ]\n",
" # Backtrack a little bit to avoid skipping words in the end of a batch\n",
" data_index = random.randint(0, len(data) - span - 1)\n",
" batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
" labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
" buffer = collections.deque(maxlen=span)\n",
" for _ in range(span):\n",
" buffer.append(data[data_index])\n",
" data_index = (data_index + 1) % len(data)\n",
" for i in range(batch_size // num_skips):\n",
" target = skip_window # target label at the center of the buffer\n",
" targets_to_avoid = [skip_window]\n",
" for j in range(num_skips):\n",
" while target in targets_to_avoid:\n",
" target = random.randint(0, span - 1)\n",
" targets_to_avoid.append(target)\n",
" batch[i * num_skips + j] = buffer[skip_window]\n",
" labels[i * num_skips + j, 0] = buffer[target]\n",
" buffer.append(data[data_index])\n",
" data_index = (data_index + 1) % len(data)\n",
" return batch, labels\n",
"\n",
"batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n",
"for i in range(8):\n",
" print(batch[i], reverse_dictionary[batch[i]],\n",
" '->', labels[i, 0], reverse_dictionary[labels[i, 0]])\n",
"\n",
"# Step 4: Build and train a skip-gram model.\n",
"\n",
"max_batch_size = 128\n",
"embedding_size = 128 # Dimension of the embedding vector.\n",
"skip_window = 1 # How many words to consider left and right.\n",
"num_skips = 2 # How many times to reuse an input to generate a label.\n",
"\n",
"# We pick a random validation set to sample nearest neighbors. Here we limit the\n",
"# validation samples to the words that have a low numeric ID, which by\n",
"# construction are also the most frequent.\n",
"valid_size = 16 # Random set of words to evaluate similarity on.\n",
"valid_window = 100 # Only pick dev samples in the head of the distribution.\n",
"valid_examples = np.random.choice(valid_window, valid_size, replace=False)\n",
"num_sampled = 64 # Number of negative examples to sample.\n",
"\n",
"graph = tf.Graph()\n",
"\n",
"with graph.as_default():\n",
"\n",
" # Input data.\n",
" train_inputs = tf.placeholder(tf.int32, shape=[None])\n",
" train_labels = tf.placeholder(tf.int32, shape=[None, 1])\n",
" valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
"\n",
" # Look up embeddings for inputs.\n",
" embeddings = tf.Variable(\n",
" tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
" embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n",
"\n",
" # Construct the variables for the NCE loss\n",
" nce_weights = tf.Variable(\n",
" tf.truncated_normal([vocabulary_size, embedding_size],\n",
" stddev=1.0 / math.sqrt(embedding_size)))\n",
" nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
"\n",
" # Compute the average NCE loss for the batch.\n",
" # tf.nce_loss automatically draws a new sample of the negative labels each\n",
" # time we evaluate the loss.\n",
" loss = tf.reduce_mean(\n",
" tf.nn.nce_loss(weights=nce_weights,\n",
" biases=nce_biases,\n",
" labels=train_labels,\n",
" inputs=embed,\n",
" num_sampled=num_sampled,\n",
" num_classes=vocabulary_size))\n",
"\n",
" # Horovod: adjust learning rate based on number of GPUs.\n",
" optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size())\n",
"\n",
" # Horovod: add Horovod Distributed Optimizer.\n",
" optimizer = hvd.DistributedOptimizer(optimizer)\n",
"\n",
" train_op = optimizer.minimize(loss)\n",
"\n",
" # Compute the cosine similarity between minibatch examples and all embeddings.\n",
" norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
" normalized_embeddings = embeddings / norm\n",
" valid_embeddings = tf.nn.embedding_lookup(\n",
" normalized_embeddings, valid_dataset)\n",
" similarity = tf.matmul(\n",
" valid_embeddings, normalized_embeddings, transpose_b=True)\n",
"\n",
" # Add variable initializer.\n",
" init = tf.global_variables_initializer()\n",
"\n",
" # Horovod: broadcast initial variable states from rank 0 to all other processes.\n",
" # This is necessary to ensure consistent initialization of all workers when\n",
" # training is started with random weights or restored from a checkpoint.\n",
" bcast = hvd.broadcast_global_variables(0)\n",
"\n",
"# Step 5: Begin training.\n",
"\n",
"# Horovod: adjust number of steps based on number of GPUs.\n",
"num_steps = 4000 // hvd.size() + 1\n",
"\n",
"# Horovod: pin GPU to be used to process local rank (one GPU per process)\n",
"config = tf.ConfigProto()\n",
"config.gpu_options.allow_growth = True\n",
"config.gpu_options.visible_device_list = str(hvd.local_rank())\n",
"\n",
"with tf.Session(graph=graph, config=config) as session:\n",
" # We must initialize all variables before we use them.\n",
" init.run()\n",
" bcast.run()\n",
" print('Initialized')\n",
" run = Run.get_submitted_run()\n",
" average_loss = 0\n",
" for step in xrange(num_steps):\n",
" # simulate various sentence length by randomization\n",
" batch_size = random.randint(max_batch_size // 2, max_batch_size)\n",
" batch_inputs, batch_labels = generate_batch(\n",
" batch_size, num_skips, skip_window)\n",
" feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}\n",
"\n",
" # We perform one update step by evaluating the optimizer op (including it\n",
" # in the list of returned values for session.run()\n",
" _, loss_val = session.run([train_op, loss], feed_dict=feed_dict)\n",
" average_loss += loss_val\n",
"\n",
" if step % 2000 == 0:\n",
" if step > 0:\n",
" average_loss /= 2000\n",
" # The average loss is an estimate of the loss over the last 2000 batches.\n",
" print('Average loss at step ', step, ': ', average_loss)\n",
" run.log(\"Loss\", average_loss)\n",
" average_loss = 0\n",
" final_embeddings = normalized_embeddings.eval()\n",
"\n",
" # Evaluate similarity in the end on worker 0.\n",
" if hvd.rank() == 0:\n",
" sim = similarity.eval()\n",
" for i in xrange(valid_size):\n",
" valid_word = reverse_dictionary[valid_examples[i]]\n",
" top_k = 8 # number of nearest neighbors\n",
" nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n",
" log_str = 'Nearest to %s:' % valid_word\n",
" for k in xrange(top_k):\n",
" close_word = reverse_dictionary[nearest[k]]\n",
" log_str = '%s %s,' % (log_str, close_word)\n",
" print(log_str)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Upload http://mattmahoney.net/dc/text8.zip to Azure Blob storage."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds = ws.get_default_datastore()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import urllib\n",
"\n",
"os.makedirs('./data', exist_ok = True)\n",
"\n",
"urllib.request.urlretrieve('http://mattmahoney.net/dc/text8.zip', filename = './data/text8.zip')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds.upload(src_dir = 'data', target_path = 'data', overwrite=True, show_progress = True)\n",
"\n",
"path_on_datastore = \"/data/text8.zip\"\n",
"ds_data = ds.path(path_on_datastore)\n",
"print(ds_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.dnn import TensorFlow\n",
"\n",
"script_params={\n",
" \"--data_dir\": ds_data\n",
"}\n",
"tf_estimator = TensorFlow(source_directory=project_folder,\n",
" compute_target=compute_target,\n",
" entry_script='word2vec.py',\n",
" script_params=script_params,\n",
" node_count=2,\n",
" process_count_per_node=1,\n",
" distributed_backend=\"mpi\",\n",
" use_gpu=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(tf_estimator)\n",
"print(run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.widgets import RunDetails\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,473 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 51. Distributed TensorFlow using Parameter Server\n",
"In this tutorial we demonstrate how to use the Azure ML Training SDK to train a TensorFlow model in a distributed manner using a parameter server.\n",
"\n",
"# Prerequisites\n",
"\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"from azureml.core.experiment import Experiment\n",
"\n",
"username = getpass.getuser().replace('-','')\n",
"\n",
"# choose a name for the run history container in the workspace\n",
"run_history_name = username + '-tf_ps'\n",
"\n",
"experiment = Experiment(ws, run_history_name)\n",
"\n",
"# project folder name\n",
"project_folder = './' + run_history_name\n",
"\n",
"print(project_folder)\n",
"os.makedirs(project_folder, exist_ok = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This recipe uses an MLC-managed Batch AI cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import BatchAiCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"batchai_cluster_name='gpucluster'\n",
"\n",
"\n",
"try:\n",
" # Check for existing cluster\n",
" compute_target = ComputeTarget(ws,batchai_cluster_name)\n",
" print('Found existing compute target')\n",
"except:\n",
" # Else, create new one\n",
" print('Creating a new compute target...')\n",
" provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n",
" #vm_priority = 'lowpriority', # optional\n",
" autoscale_enabled = True,\n",
" cluster_min_nodes = 0, \n",
" cluster_max_nodes = 4)\n",
" compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n",
" # can poll for a minimum number of nodes and for a specific timeout. \n",
" # if no min node count is provided it will use the scale settings for the cluster\n",
" compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
" # For a more detailed view of current BatchAI cluster status, use the 'status' property \n",
"print(compute_target.status.serialize())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile {project_folder}/mnist_replica.py\n",
"\n",
"# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# ==============================================================================\n",
"\"\"\"Distributed MNIST training and validation, with model replicas.\n",
"A simple softmax model with one hidden layer is defined. The parameters\n",
"(weights and biases) are located on one parameter server (ps), while the ops\n",
"are executed on two worker nodes by default. The TF sessions also run on the\n",
"worker node.\n",
"Multiple invocations of this script can be done in parallel, with different\n",
"values for --task_index. There should be exactly one invocation with\n",
"--task_index, which will create a master session that carries out variable\n",
"initialization. The other, non-master, sessions will wait for the master\n",
"session to finish the initialization before proceeding to the training stage.\n",
"The coordination between the multiple worker invocations occurs due to\n",
"the definition of the parameters on the same ps devices. The parameter updates\n",
"from one worker is visible to all other workers. As such, the workers can\n",
"perform forward computation and gradient calculation in parallel, which\n",
"should lead to increased training speed for the simple model.\n",
"\"\"\"\n",
"\n",
"from __future__ import absolute_import\n",
"from __future__ import division\n",
"from __future__ import print_function\n",
"\n",
"import os\n",
"import math\n",
"import sys\n",
"import tempfile\n",
"import time\n",
"import json\n",
"\n",
"import tensorflow as tf\n",
"from tensorflow.examples.tutorials.mnist import input_data\n",
"from azureml.core.run import Run\n",
"\n",
"flags = tf.app.flags\n",
"flags.DEFINE_string(\"data_dir\", \"/tmp/mnist-data\",\n",
" \"Directory for storing mnist data\")\n",
"flags.DEFINE_boolean(\"download_only\", False,\n",
" \"Only perform downloading of data; Do not proceed to \"\n",
" \"session preparation, model definition or training\")\n",
"flags.DEFINE_integer(\"num_gpus\", 0, \"Total number of gpus for each machine.\"\n",
" \"If you don't use GPU, please set it to '0'\")\n",
"flags.DEFINE_integer(\"replicas_to_aggregate\", None,\n",
" \"Number of replicas to aggregate before parameter update \"\n",
" \"is applied (For sync_replicas mode only; default: \"\n",
" \"num_workers)\")\n",
"flags.DEFINE_integer(\"hidden_units\", 100,\n",
" \"Number of units in the hidden layer of the NN\")\n",
"flags.DEFINE_integer(\"train_steps\", 200,\n",
" \"Number of (global) training steps to perform\")\n",
"flags.DEFINE_integer(\"batch_size\", 100, \"Training batch size\")\n",
"flags.DEFINE_float(\"learning_rate\", 0.01, \"Learning rate\")\n",
"flags.DEFINE_boolean(\n",
" \"sync_replicas\", False,\n",
" \"Use the sync_replicas (synchronized replicas) mode, \"\n",
" \"wherein the parameter updates from workers are aggregated \"\n",
" \"before applied to avoid stale gradients\")\n",
"flags.DEFINE_boolean(\n",
" \"existing_servers\", False, \"Whether servers already exists. If True, \"\n",
" \"will use the worker hosts via their GRPC URLs (one client process \"\n",
" \"per worker host). Otherwise, will create an in-process TensorFlow \"\n",
" \"server.\")\n",
"\n",
"FLAGS = flags.FLAGS\n",
"\n",
"IMAGE_PIXELS = 28\n",
"\n",
"\n",
"def main(unused_argv):\n",
" data_root = os.path.join(\"outputs\", \"MNIST\")\n",
" mnist = None\n",
" tf_config = os.environ.get(\"TF_CONFIG\")\n",
" if not tf_config or tf_config == \"\":\n",
" raise ValueError(\"TF_CONFIG not found.\")\n",
" tf_config_json = json.loads(tf_config)\n",
" cluster = tf_config_json.get('cluster')\n",
" job_name = tf_config_json.get('task', {}).get('type')\n",
" task_index = tf_config_json.get('task', {}).get('index')\n",
" job_name = \"worker\" if job_name == \"master\" else job_name\n",
" sentinel_path = os.path.join(data_root, \"complete.txt\") \n",
" if job_name==\"worker\" and task_index==0:\n",
" mnist = input_data.read_data_sets(data_root, one_hot=True)\n",
" path = os.path.join(data_root, \"complete.txt\") \n",
" with open(sentinel_path, 'w+') as f:\n",
" f.write(\"download complete\")\n",
" else:\n",
" while not os.path.exists(sentinel_path):\n",
" time.sleep(0.01)\n",
" mnist = input_data.read_data_sets(data_root, one_hot=True)\n",
" \n",
" if FLAGS.download_only:\n",
" sys.exit(0)\n",
"\n",
" print(\"job name = %s\" % job_name)\n",
" print(\"task index = %d\" % task_index)\n",
" print(\"number of GPUs = %d\" % FLAGS.num_gpus)\n",
"\n",
" #Construct the cluster and start the server\n",
" cluster_spec = tf.train.ClusterSpec(cluster)\n",
" \n",
" # Get the number of workers.\n",
" num_workers = len(cluster_spec.task_indices(\"worker\"))\n",
"\n",
" if not FLAGS.existing_servers:\n",
" # Not using existing servers. Create an in-process server.\n",
" server = tf.train.Server(\n",
" cluster_spec, job_name=job_name, task_index=task_index)\n",
" if job_name == \"ps\":\n",
" server.join()\n",
"\n",
" is_chief = (task_index == 0)\n",
" if FLAGS.num_gpus > 0:\n",
" # Avoid gpu allocation conflict: now allocate task_num -> #gpu\n",
" # for each worker in the corresponding machine\n",
" gpu = (task_index % FLAGS.num_gpus)\n",
" worker_device = \"/job:worker/task:%d/gpu:%d\" % (task_index, gpu)\n",
" elif FLAGS.num_gpus == 0:\n",
" # Just allocate the CPU to worker server\n",
" cpu = 0\n",
" worker_device = \"/job:worker/task:%d/cpu:%d\" % (task_index, cpu)\n",
" # The device setter will automatically place Variables ops on separate\n",
" # parameter servers (ps). The non-Variable ops will be placed on the workers.\n",
" # The ps use CPU and workers use corresponding GPU\n",
" with tf.device(\n",
" tf.train.replica_device_setter(\n",
" worker_device=worker_device,\n",
" ps_device=\"/job:ps/cpu:0\",\n",
" cluster=cluster)):\n",
" global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n",
"\n",
" # Variables of the hidden layer\n",
" hid_w = tf.Variable(\n",
" tf.truncated_normal(\n",
" [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],\n",
" stddev=1.0 / IMAGE_PIXELS),\n",
" name=\"hid_w\")\n",
" hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name=\"hid_b\")\n",
"\n",
" # Variables of the softmax layer\n",
" sm_w = tf.Variable(\n",
" tf.truncated_normal(\n",
" [FLAGS.hidden_units, 10],\n",
" stddev=1.0 / math.sqrt(FLAGS.hidden_units)),\n",
" name=\"sm_w\")\n",
" sm_b = tf.Variable(tf.zeros([10]), name=\"sm_b\")\n",
"\n",
" # Ops: located on the worker specified with task_index\n",
" x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])\n",
" y_ = tf.placeholder(tf.float32, [None, 10])\n",
"\n",
" hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)\n",
" hid = tf.nn.relu(hid_lin)\n",
"\n",
" y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))\n",
" cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))\n",
"\n",
" opt = tf.train.AdamOptimizer(FLAGS.learning_rate)\n",
"\n",
" if FLAGS.sync_replicas:\n",
" if FLAGS.replicas_to_aggregate is None:\n",
" replicas_to_aggregate = num_workers\n",
" else:\n",
" replicas_to_aggregate = FLAGS.replicas_to_aggregate\n",
"\n",
" opt = tf.train.SyncReplicasOptimizer(\n",
" opt,\n",
" replicas_to_aggregate=replicas_to_aggregate,\n",
" total_num_replicas=num_workers,\n",
" name=\"mnist_sync_replicas\")\n",
"\n",
" train_step = opt.minimize(cross_entropy, global_step=global_step)\n",
"\n",
" if FLAGS.sync_replicas:\n",
" local_init_op = opt.local_step_init_op\n",
" if is_chief:\n",
" local_init_op = opt.chief_init_op\n",
"\n",
" ready_for_local_init_op = opt.ready_for_local_init_op\n",
"\n",
" # Initial token and chief queue runners required by the sync_replicas mode\n",
" chief_queue_runner = opt.get_chief_queue_runner()\n",
" sync_init_op = opt.get_init_tokens_op()\n",
"\n",
" init_op = tf.global_variables_initializer()\n",
" train_dir = tempfile.mkdtemp()\n",
"\n",
" if FLAGS.sync_replicas:\n",
" sv = tf.train.Supervisor(\n",
" is_chief=is_chief,\n",
" logdir=train_dir,\n",
" init_op=init_op,\n",
" local_init_op=local_init_op,\n",
" ready_for_local_init_op=ready_for_local_init_op,\n",
" recovery_wait_secs=1,\n",
" global_step=global_step)\n",
" else:\n",
" sv = tf.train.Supervisor(\n",
" is_chief=is_chief,\n",
" logdir=train_dir,\n",
" init_op=init_op,\n",
" recovery_wait_secs=1,\n",
" global_step=global_step)\n",
"\n",
" sess_config = tf.ConfigProto(\n",
" allow_soft_placement=True,\n",
" log_device_placement=False,\n",
" device_filters=[\"/job:ps\",\n",
" \"/job:worker/task:%d\" % task_index])\n",
"\n",
" # The chief worker (task_index==0) session will prepare the session,\n",
" # while the remaining workers will wait for the preparation to complete.\n",
" if is_chief:\n",
" print(\"Worker %d: Initializing session...\" % task_index)\n",
" else:\n",
" print(\"Worker %d: Waiting for session to be initialized...\" %\n",
" task_index)\n",
"\n",
" if FLAGS.existing_servers:\n",
" server_grpc_url = \"grpc://\" + worker_spec[task_index]\n",
" print(\"Using existing server at: %s\" % server_grpc_url)\n",
"\n",
" sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)\n",
" else:\n",
" sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)\n",
"\n",
" print(\"Worker %d: Session initialization complete.\" % task_index)\n",
"\n",
" if FLAGS.sync_replicas and is_chief:\n",
" # Chief worker will start the chief queue runner and call the init op.\n",
" sess.run(sync_init_op)\n",
" sv.start_queue_runners(sess, [chief_queue_runner])\n",
"\n",
" # Perform training\n",
" time_begin = time.time()\n",
" print(\"Training begins @ %f\" % time_begin)\n",
"\n",
" local_step = 0\n",
" while True:\n",
" # Training feed\n",
" batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)\n",
" train_feed = {x: batch_xs, y_: batch_ys}\n",
"\n",
" _, step = sess.run([train_step, global_step], feed_dict=train_feed)\n",
" local_step += 1\n",
"\n",
" now = time.time()\n",
" print(\"%f: Worker %d: training step %d done (global step: %d)\" %\n",
" (now, task_index, local_step, step))\n",
"\n",
" if step >= FLAGS.train_steps:\n",
" break\n",
"\n",
" time_end = time.time()\n",
" print(\"Training ends @ %f\" % time_end)\n",
" training_time = time_end - time_begin\n",
" print(\"Training elapsed time: %f s\" % training_time)\n",
"\n",
" # Validation feed\n",
" val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}\n",
" val_xent = sess.run(cross_entropy, feed_dict=val_feed)\n",
" print(\"After %d training step(s), validation cross entropy = %g\" %\n",
" (FLAGS.train_steps, val_xent))\n",
" if job_name==\"worker\" and task_index==0:\n",
" run = Run.get_submitted_run()\n",
" run.log(\"CrossEntropy\", val_xent)\n",
"\n",
"if __name__ == \"__main__\":\n",
" tf.app.run()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.dnn import *\n",
"tf_estimator = TensorFlow(source_directory=project_folder,\n",
" compute_target=compute_target,\n",
" entry_script='mnist_replica.py',\n",
" node_count=2,\n",
" worker_count=2,\n",
" parameter_server_count=1, \n",
" distributed_backend=\"ps\",\n",
" use_gpu=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(tf_estimator)\n",
"print(run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.widgets import RunDetails\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,509 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 52. Distributed CNTK\n",
"In this tutorial we demonstrate how to use the Azure ML Training SDK to train CNTK model in a distributed manner.\n",
"\n",
"# Prerequisites\n",
"\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"from azureml.core.experiment import Experiment\n",
"\n",
"username = getpass.getuser().replace('-','')\n",
"\n",
"# choose a name for the run history container in the workspace\n",
"run_history_name = username + '-cntk-distrib'\n",
"\n",
"experiment = Experiment(ws, run_history_name)\n",
"\n",
"# project folder name\n",
"project_folder = './' + run_history_name\n",
"\n",
"print(project_folder)\n",
"os.makedirs(project_folder, exist_ok = True)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This recipe is using a MLC-managed Batch AI cluster. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import BatchAiCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"batchai_cluster_name='gpucluster'\n",
"\n",
"\n",
"try:\n",
" # Check for existing cluster\n",
" compute_target = ComputeTarget(ws,batchai_cluster_name)\n",
" print('Found existing compute target')\n",
"except:\n",
" # Else, create new one\n",
" print('Creating a new compute target...')\n",
" provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n",
" #vm_priority = 'lowpriority', # optional\n",
" autoscale_enabled = True,\n",
" cluster_min_nodes = 0, \n",
" cluster_max_nodes = 4)\n",
" compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n",
" # can poll for a minimum number of nodes and for a specific timeout. \n",
" # if no min node count is provided it will use the scale settings for the cluster\n",
" compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
" # For a more detailed view of current BatchAI cluster status, use the 'status' property \n",
"print(compute_target.status.serialize())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile {project_folder}/cntk_mnist.py\n",
"\n",
"# This code is adapted from CNTK MNIST tutorials: \n",
"# 1. https://github.com/Microsoft/CNTK/blob/v2.0/Tutorials/CNTK_103A_MNIST_DataLoader.ipynb\n",
"# 2. https://github.com/Microsoft/CNTK/blob/v2.0/Tutorials/CNTK_103C_MNIST_MultiLayerPerceptron.ipynb\n",
"\n",
"# Import the relevant modules to be used later\n",
"from __future__ import print_function\n",
"import gzip\n",
"import numpy as np\n",
"import os\n",
"import shutil\n",
"import struct\n",
"import sys\n",
"import time\n",
"import pandas \n",
"\n",
"import cntk as C\n",
"from azureml.core.run import Run\n",
"import argparse\n",
"\n",
"run = Run.get_submitted_run()\n",
"\n",
"parser=argparse.ArgumentParser()\n",
"\n",
"parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate')\n",
"parser.add_argument('--num_hidden_layers', type=int, default=2, help='number of hidden layers')\n",
"parser.add_argument('--minibatch_size', type=int, default=64, help='minibatchsize')\n",
"\n",
"args=parser.parse_args() \n",
"\n",
"# Functions to load MNIST images and unpack into train and test set.\n",
"# - loadData reads image data and formats into a 28x28 long array\n",
"# - loadLabels reads the corresponding labels data, 1 for each image\n",
"# - load packs the downloaded image and labels data into a combined format to be read later by \n",
"# CNTK text reader \n",
"def loadData(src, cimg):\n",
" print ('Downloading ' + src)\n",
" gzfname, h = urlretrieve(src, './delete.me')\n",
" print ('Done.')\n",
" try:\n",
" with gzip.open(gzfname) as gz:\n",
" n = struct.unpack('I', gz.read(4))\n",
" # Read magic number.\n",
" if n[0] != 0x3080000:\n",
" raise Exception('Invalid file: unexpected magic number.')\n",
" # Read number of entries.\n",
" n = struct.unpack('>I', gz.read(4))[0]\n",
" if n != cimg:\n",
" raise Exception('Invalid file: expected {0} entries.'.format(cimg))\n",
" crow = struct.unpack('>I', gz.read(4))[0]\n",
" ccol = struct.unpack('>I', gz.read(4))[0]\n",
" if crow != 28 or ccol != 28:\n",
" raise Exception('Invalid file: expected 28 rows/cols per image.')\n",
" # Read data.\n",
" res = np.fromstring(gz.read(cimg * crow * ccol), dtype = np.uint8)\n",
" finally:\n",
" os.remove(gzfname)\n",
" return res.reshape((cimg, crow * ccol))\n",
"\n",
"def loadLabels(src, cimg):\n",
" print ('Downloading ' + src)\n",
" gzfname, h = urlretrieve(src, './delete.me')\n",
" print ('Done.')\n",
" try:\n",
" with gzip.open(gzfname) as gz:\n",
" n = struct.unpack('I', gz.read(4))\n",
" # Read magic number.\n",
" if n[0] != 0x1080000:\n",
" raise Exception('Invalid file: unexpected magic number.')\n",
" # Read number of entries.\n",
" n = struct.unpack('>I', gz.read(4))\n",
" if n[0] != cimg:\n",
" raise Exception('Invalid file: expected {0} rows.'.format(cimg))\n",
" # Read labels.\n",
" res = np.fromstring(gz.read(cimg), dtype = np.uint8)\n",
" finally:\n",
" os.remove(gzfname)\n",
" return res.reshape((cimg, 1))\n",
"\n",
"def try_download(dataSrc, labelsSrc, cimg):\n",
" data = loadData(dataSrc, cimg)\n",
" labels = loadLabels(labelsSrc, cimg)\n",
" return np.hstack((data, labels))\n",
"\n",
"# Save the data files into a format compatible with CNTK text reader\n",
"def savetxt(filename, ndarray):\n",
" dir = os.path.dirname(filename)\n",
"\n",
" if not os.path.exists(dir):\n",
" os.makedirs(dir)\n",
"\n",
" if not os.path.isfile(filename):\n",
" print(\"Saving\", filename )\n",
" with open(filename, 'w') as f:\n",
" labels = list(map(' '.join, np.eye(10, dtype=np.uint).astype(str)))\n",
" for row in ndarray:\n",
" row_str = row.astype(str)\n",
" label_str = labels[row[-1]]\n",
" feature_str = ' '.join(row_str[:-1])\n",
" f.write('|labels {} |features {}\\n'.format(label_str, feature_str))\n",
" else:\n",
" print(\"File already exists\", filename)\n",
"\n",
"# Read a CTF formatted text (as mentioned above) using the CTF deserializer from a file\n",
"def create_reader(path, is_training, input_dim, num_label_classes):\n",
" return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(\n",
" labels = C.io.StreamDef(field='labels', shape=num_label_classes, is_sparse=False),\n",
" features = C.io.StreamDef(field='features', shape=input_dim, is_sparse=False)\n",
" )), randomize = is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)\n",
"\n",
"# Defines a utility that prints the training progress\n",
"def print_training_progress(trainer, mb, frequency, verbose=1):\n",
" training_loss = \"NA\"\n",
" eval_error = \"NA\"\n",
"\n",
" if mb%frequency == 0:\n",
" training_loss = trainer.previous_minibatch_loss_average\n",
" eval_error = trainer.previous_minibatch_evaluation_average\n",
" if verbose: \n",
" print (\"Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%\".format(mb, training_loss, eval_error*100))\n",
" \n",
" return mb, training_loss, eval_error\n",
"\n",
"# Create the network architecture\n",
"def create_model(features):\n",
" with C.layers.default_options(init = C.layers.glorot_uniform(), activation = C.ops.relu):\n",
" h = features\n",
" for _ in range(num_hidden_layers):\n",
" h = C.layers.Dense(hidden_layers_dim)(h)\n",
" r = C.layers.Dense(num_output_classes, activation = None)(h)\n",
" return r\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
" run = Run.get_submitted_run()\n",
"\n",
" try: \n",
" from urllib.request import urlretrieve \n",
" except ImportError: \n",
" from urllib import urlretrieve\n",
"\n",
" # Select the right target device when this script is being used:\n",
" if 'TEST_DEVICE' in os.environ:\n",
" if os.environ['TEST_DEVICE'] == 'cpu':\n",
" C.device.try_set_default_device(C.device.cpu())\n",
" else:\n",
" C.device.try_set_default_device(C.device.gpu(0))\n",
"\n",
" # URLs for the train image and labels data\n",
" url_train_image = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'\n",
" url_train_labels = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'\n",
" num_train_samples = 60000\n",
"\n",
" print(\"Downloading train data\")\n",
" train = try_download(url_train_image, url_train_labels, num_train_samples)\n",
"\n",
" url_test_image = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'\n",
" url_test_labels = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'\n",
" num_test_samples = 10000\n",
"\n",
" print(\"Downloading test data\")\n",
" test = try_download(url_test_image, url_test_labels, num_test_samples)\n",
"\n",
"\n",
" # Save the train and test files (prefer our default path for the data\n",
" rank = os.environ.get(\"OMPI_COMM_WORLD_RANK\") \n",
" data_dir = os.path.join(\"outputs\", \"MNIST\")\n",
" sentinel_path = os.path.join(data_dir, \"complete.txt\") \n",
" if rank == '0': \n",
" print ('Writing train text file...')\n",
" savetxt(os.path.join(data_dir, \"Train-28x28_cntk_text.txt\"), train)\n",
"\n",
" print ('Writing test text file...')\n",
" savetxt(os.path.join(data_dir, \"Test-28x28_cntk_text.txt\"), test)\n",
" with open(sentinel_path, 'w+') as f:\n",
" f.write(\"download complete\")\n",
"\n",
" print('Done with downloading data.')\n",
" else:\n",
" while not os.path.exists(sentinel_path):\n",
" time.sleep(0.01)\n",
" \n",
"\n",
" # Ensure we always get the same amount of randomness\n",
" np.random.seed(0)\n",
"\n",
" # Define the data dimensions\n",
" input_dim = 784\n",
" num_output_classes = 10\n",
"\n",
" # Ensure the training and test data is generated and available for this tutorial.\n",
" # We search in two locations in the toolkit for the cached MNIST data set.\n",
" data_found = False\n",
" for data_dir in [os.path.join(\"..\", \"Examples\", \"Image\", \"DataSets\", \"MNIST\"),\n",
" os.path.join(\"data_\" + str(rank), \"MNIST\"),\n",
" os.path.join(\"outputs\", \"MNIST\")]:\n",
" train_file = os.path.join(data_dir, \"Train-28x28_cntk_text.txt\")\n",
" test_file = os.path.join(data_dir, \"Test-28x28_cntk_text.txt\")\n",
" if os.path.isfile(train_file) and os.path.isfile(test_file):\n",
" data_found = True\n",
" break\n",
" if not data_found:\n",
" raise ValueError(\"Please generate the data by completing CNTK 103 Part A\")\n",
" print(\"Data directory is {0}\".format(data_dir))\n",
"\n",
" num_hidden_layers = args.num_hidden_layers\n",
" hidden_layers_dim = 400\n",
"\n",
" input = C.input_variable(input_dim)\n",
" label = C.input_variable(num_output_classes)\n",
"\n",
" \n",
" z = create_model(input)\n",
" # Scale the input to 0-1 range by dividing each pixel by 255.\n",
" z = create_model(input/255.0)\n",
"\n",
" loss = C.cross_entropy_with_softmax(z, label)\n",
" label_error = C.classification_error(z, label)\n",
"\n",
"\n",
" # Instantiate the trainer object to drive the model training\n",
" learning_rate = args.learning_rate\n",
" lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)\n",
" learner = C.sgd(z.parameters, lr_schedule)\n",
" trainer = C.Trainer(z, (loss, label_error), [learner])\n",
"\n",
"\n",
" # Initialize the parameters for the trainer\n",
" minibatch_size = args.minibatch_size\n",
" num_samples_per_sweep = 60000\n",
" num_sweeps_to_train_with = 10\n",
" num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size\n",
"\n",
" # Create the reader to training data set\n",
" reader_train = create_reader(train_file, True, input_dim, num_output_classes)\n",
"\n",
" # Map the data streams to the input and labels.\n",
" input_map = {\n",
" label : reader_train.streams.labels,\n",
" input : reader_train.streams.features\n",
" } \n",
"\n",
" # Run the trainer on and perform model training\n",
" training_progress_output_freq = 500\n",
" \n",
" errors = []\n",
" losses = []\n",
" for i in range(0, int(num_minibatches_to_train)): \n",
" # Read a mini batch from the training data file\n",
" data = reader_train.next_minibatch(minibatch_size, input_map = input_map)\n",
" \n",
" trainer.train_minibatch(data)\n",
" batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1)\n",
" if (error != 'NA') and (loss != 'NA'):\n",
" errors.append(float(error))\n",
" losses.append(float(loss))\n",
" \n",
" # log the losses\n",
" if rank == '0': \n",
" run.log_list(\"Loss\", losses)\n",
" run.log_list(\"Error\",errors)\n",
"\n",
" # Read the training data\n",
" reader_test = create_reader(test_file, False, input_dim, num_output_classes)\n",
"\n",
" test_input_map = {\n",
" label : reader_test.streams.labels,\n",
" input : reader_test.streams.features,\n",
" }\n",
"\n",
" # Test data for trained model\n",
" test_minibatch_size = 512\n",
" num_samples = 10000\n",
" num_minibatches_to_test = num_samples // test_minibatch_size\n",
" test_result = 0.0\n",
"\n",
" \n",
" for i in range(num_minibatches_to_test): \n",
" # We are loading test data in batches specified by test_minibatch_size\n",
" # Each data point in the minibatch is a MNIST digit image of 784 dimensions \n",
" # with one pixel per dimension that we will encode / decode with the \n",
" # trained model.\n",
" data = reader_test.next_minibatch(test_minibatch_size,\n",
" input_map = test_input_map)\n",
"\n",
" eval_error = trainer.test_minibatch(data)\n",
" test_result = test_result + eval_error\n",
" \n",
"\n",
" # Average of evaluation errors of all test minibatches\n",
" print(\"Average test error: {0:.2f}%\".format(test_result*100 / num_minibatches_to_test))\n",
"\n",
" out = C.softmax(z)\n",
"\n",
" # Read the data for evaluation\n",
" reader_eval = create_reader(test_file, False, input_dim, num_output_classes)\n",
"\n",
" eval_minibatch_size = 25\n",
" eval_input_map = {input: reader_eval.streams.features} \n",
"\n",
" data = reader_test.next_minibatch(eval_minibatch_size, input_map = test_input_map)\n",
"\n",
" img_label = data[label].asarray()\n",
" img_data = data[input].asarray()\n",
" predicted_label_prob = [out.eval(img_data[i]) for i in range(len(img_data))]\n",
"\n",
" # Find the index with the maximum value for both predicted as well as the ground truth\n",
" pred = [np.argmax(predicted_label_prob[i]) for i in range(len(predicted_label_prob))]\n",
" gtlabel = [np.argmax(img_label[i]) for i in range(len(img_label))]\n",
"\n",
" print(\"Label :\", gtlabel[:25])\n",
" print(\"Predicted:\", pred)\n",
" \n",
" # save model to outputs folder\n",
" z.save('outputs/cntk.model')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.estimator import *\n",
"pip_packages=['cntk==2.5.1', 'pandas==0.23.4']\n",
"cntk_estimator = Estimator(source_directory=project_folder,\n",
" compute_target=compute_target,\n",
" entry_script='cntk_mnist.py',\n",
" node_count=2,\n",
" process_count_per_node=1,\n",
" distributed_backend=\"mpi\", \n",
" pip_packages=pip_packages,\n",
" custom_docker_base_image=\"microsoft/mmlspark:0.12\",\n",
" use_gpu=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(cntk_estimator)\n",
"print(run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.widgets import RunDetails\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,376 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PyTorch Distributed Demo\n",
"\n",
"In this demo, we will run a sample PyTorch job using Horovod on a multi-node Batch AI cluster."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set experiment name and create project\n",
"Choose a name for your run history container in the workspace, and create a folder for the project."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"experiment_name = 'pytorch-dist-hvd'\n",
"\n",
"# project folder\n",
"project_folder = './sample_projects/pytorch-dist-hvd'\n",
"os.makedirs(project_folder, exist_ok = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Write demo PyTorch code\n",
"\n",
"We will use a distributed PyTorch implementation of the classic MNIST problem. The following cell writes the main implementation to the project folder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile {project_folder}/pytorch_horovod_mnist.py\n",
"\n",
"from __future__ import print_function\n",
"import argparse\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torchvision import datasets, transforms\n",
"from torch.autograd import Variable\n",
"import torch.utils.data.distributed\n",
"import horovod.torch as hvd\n",
"\n",
"# Training settings\n",
"parser = argparse.ArgumentParser(description='PyTorch MNIST Example')\n",
"parser.add_argument('--batch-size', type=int, default=64, metavar='N',\n",
" help='input batch size for training (default: 64)')\n",
"parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',\n",
" help='input batch size for testing (default: 1000)')\n",
"parser.add_argument('--epochs', type=int, default=10, metavar='N',\n",
" help='number of epochs to train (default: 10)')\n",
"parser.add_argument('--lr', type=float, default=0.01, metavar='LR',\n",
" help='learning rate (default: 0.01)')\n",
"parser.add_argument('--momentum', type=float, default=0.5, metavar='M',\n",
" help='SGD momentum (default: 0.5)')\n",
"parser.add_argument('--no-cuda', action='store_true', default=False,\n",
" help='disables CUDA training')\n",
"parser.add_argument('--seed', type=int, default=42, metavar='S',\n",
" help='random seed (default: 42)')\n",
"parser.add_argument('--log-interval', type=int, default=10, metavar='N',\n",
" help='how many batches to wait before logging training status')\n",
"args = parser.parse_args()\n",
"args.cuda = not args.no_cuda and torch.cuda.is_available()\n",
"\n",
"hvd.init()\n",
"torch.manual_seed(args.seed)\n",
"\n",
"if args.cuda:\n",
" # Horovod: pin GPU to local rank.\n",
" torch.cuda.set_device(hvd.local_rank())\n",
" torch.cuda.manual_seed(args.seed)\n",
"\n",
"\n",
"kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}\n",
"train_dataset = \\\n",
" datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,\n",
" transform=transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.1307,), (0.3081,))\n",
" ]))\n",
"train_sampler = torch.utils.data.distributed.DistributedSampler(\n",
" train_dataset, num_replicas=hvd.size(), rank=hvd.rank())\n",
"train_loader = torch.utils.data.DataLoader(\n",
" train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)\n",
"\n",
"test_dataset = \\\n",
" datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.1307,), (0.3081,))\n",
" ]))\n",
"test_sampler = torch.utils.data.distributed.DistributedSampler(\n",
" test_dataset, num_replicas=hvd.size(), rank=hvd.rank())\n",
"test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size,\n",
" sampler=test_sampler, **kwargs)\n",
"\n",
"\n",
"class Net(nn.Module):\n",
" def __init__(self):\n",
" super(Net, self).__init__()\n",
" self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n",
" self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n",
" self.conv2_drop = nn.Dropout2d()\n",
" self.fc1 = nn.Linear(320, 50)\n",
" self.fc2 = nn.Linear(50, 10)\n",
"\n",
" def forward(self, x):\n",
" x = F.relu(F.max_pool2d(self.conv1(x), 2))\n",
" x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n",
" x = x.view(-1, 320)\n",
" x = F.relu(self.fc1(x))\n",
" x = F.dropout(x, training=self.training)\n",
" x = self.fc2(x)\n",
" return F.log_softmax(x)\n",
"\n",
"\n",
"model = Net()\n",
"\n",
"if args.cuda:\n",
" # Move model to GPU.\n",
" model.cuda()\n",
"\n",
"# Horovod: broadcast parameters.\n",
"hvd.broadcast_parameters(model.state_dict(), root_rank=0)\n",
"\n",
"# Horovod: scale learning rate by the number of GPUs.\n",
"optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),\n",
" momentum=args.momentum)\n",
"\n",
"# Horovod: wrap optimizer with DistributedOptimizer.\n",
"optimizer = hvd.DistributedOptimizer(\n",
" optimizer, named_parameters=model.named_parameters())\n",
"\n",
"\n",
"def train(epoch):\n",
" model.train()\n",
" train_sampler.set_epoch(epoch)\n",
" for batch_idx, (data, target) in enumerate(train_loader):\n",
" if args.cuda:\n",
" data, target = data.cuda(), target.cuda()\n",
" data, target = Variable(data), Variable(target)\n",
" optimizer.zero_grad()\n",
" output = model(data)\n",
" loss = F.nll_loss(output, target)\n",
" loss.backward()\n",
" optimizer.step()\n",
" if batch_idx % args.log_interval == 0:\n",
" print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n",
" epoch, batch_idx * len(data), len(train_sampler),\n",
" 100. * batch_idx / len(train_loader), loss.data[0]))\n",
"\n",
"\n",
"def metric_average(val, name):\n",
" tensor = torch.FloatTensor([val])\n",
" avg_tensor = hvd.allreduce(tensor, name=name)\n",
" return avg_tensor[0]\n",
"\n",
"\n",
"def test():\n",
" model.eval()\n",
" test_loss = 0.\n",
" test_accuracy = 0.\n",
" for data, target in test_loader:\n",
" if args.cuda:\n",
" data, target = data.cuda(), target.cuda()\n",
" data, target = Variable(data, volatile=True), Variable(target)\n",
" output = model(data)\n",
" # sum up batch loss\n",
" test_loss += F.nll_loss(output, target, size_average=False).data[0]\n",
" # get the index of the max log-probability\n",
" pred = output.data.max(1, keepdim=True)[1]\n",
" test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum()\n",
"\n",
" test_loss /= len(test_sampler)\n",
" test_accuracy /= len(test_sampler)\n",
"\n",
" test_loss = metric_average(test_loss, 'avg_loss')\n",
" test_accuracy = metric_average(test_accuracy, 'avg_accuracy')\n",
"\n",
" if hvd.rank() == 0:\n",
" print('\\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\\n'.format(\n",
" test_loss, 100. * test_accuracy))\n",
"\n",
"\n",
"for epoch in range(1, args.epochs + 1):\n",
" train(epoch)\n",
" test()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy Batch AI cluster\n",
"\n",
"To run this in a distributed context, we'll need a Batch AI cluster with at least two nodes.\n",
"\n",
"Here, we use exactly two CPU nodes, to conserve resources. If you want to try it with some other number or SKU, just change the relevant values in the following code block."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import BatchAiCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"batchai_cluster_name='gpucluster'\n",
"\n",
"\n",
"try:\n",
" # Check for existing cluster\n",
" compute_target = ComputeTarget(ws,batchai_cluster_name)\n",
" print('Found existing compute target')\n",
"except:\n",
" # Else, create new one\n",
" print('Creating a new compute target...')\n",
" provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n",
" #vm_priority = 'lowpriority', # optional\n",
" autoscale_enabled = True,\n",
" cluster_min_nodes = 0, \n",
" cluster_max_nodes = 4)\n",
" compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n",
" # can poll for a minimum number of nodes and for a specific timeout. \n",
" # if no min node count is provided it will use the scale settings for the cluster\n",
" compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
" # For a more detailed view of current BatchAI cluster status, use the 'status' property \n",
"print(compute_target.status.serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Submit job\n",
"\n",
"Now that we have a cluster ready to go, let's submit our job.\n",
"\n",
"We need to use a custom estimator here, and specify that we want the `pytorch`, `horovod` and `torchvision` packages installed to our image."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.dnn import PyTorch\n",
"\n",
"estimator = PyTorch(source_directory=project_folder,\n",
" compute_target=compute_target,\n",
" entry_script='pytorch_horovod_mnist.py',\n",
" node_count=2,\n",
" process_count_per_node=1,\n",
" distributed_backend=\"mpi\",\n",
" use_gpu=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.experiment import Experiment\n",
"\n",
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
"run = experiment.submit(estimator)\n",
"print(run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.widgets import RunDetails\n",
"RunDetails(run).show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -110,7 +110,7 @@
"experiment_name = 'sklearn-mnist'\n",
"\n",
"from azureml.core import Experiment\n",
"exp = Experiment(workspace = ws, name = experiment_name)"
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
{
@@ -143,25 +143,25 @@
"\n",
"try:\n",
" # look for the existing cluster by name\n",
" compute_target = ComputeTarget(workspace = ws, name = batchai_cluster_name)\n",
" compute_target = ComputeTarget(workspace=ws, name=batchai_cluster_name)\n",
" if compute_target is BatchAiCompute:\n",
" print('found compute target {}, just use it.'.format(batchai_cluster_name))\n",
" else:\n",
" print('{} exists but it is not a Batch AI cluster. Please choose a different name.'.format(batchai_cluster_name))\n",
"except ComputeTargetException:\n",
" print('creating a new compute target...')\n",
" compute_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # small CPU-based VM\n",
" #vm_priority = 'lowpriority', # optional\n",
" autoscale_enabled = True,\n",
" cluster_min_nodes = 0, \n",
" cluster_max_nodes = 4)\n",
" compute_config = BatchAiCompute.provisioning_configuration(vm_size=\"STANDARD_D2_V2\", # small CPU-based VM\n",
" #vm_priority='lowpriority', # optional\n",
" autoscale_enabled=True,\n",
" cluster_min_nodes=0, \n",
" cluster_max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, batchai_cluster_name, compute_config)\n",
" \n",
" # can poll for a minimum number of nodes and for a specific timeout. \n",
" # if no min node count is provided it uses the scale settings for the cluster\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
" \n",
" # Use the 'status' property to get a detailed status for the current cluster. \n",
" print(compute_target.status.serialize())"
@@ -197,10 +197,10 @@
"\n",
"os.makedirs('./data', exist_ok = True)\n",
"\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', filename = './data/train-images.gz')\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', filename = './data/train-labels.gz')\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', filename = './data/test-images.gz')\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename = './data/test-labels.gz')"
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', filename='./data/train-images.gz')\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', filename='./data/train-labels.gz')\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', filename='./data/test-images.gz')\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename='./data/test-labels.gz')"
]
},
{
@@ -237,8 +237,8 @@
" plt.subplot(1, sample_size, count)\n",
" plt.axhline('')\n",
" plt.axvline('')\n",
" plt.text(x = 10, y = -10, s = y_train[i], fontsize = 18)\n",
" plt.imshow(X_train[i].reshape(28, 28), cmap = plt.cm.Greys)\n",
" plt.text(x=10, y=-10, s=y_train[i], fontsize=18)\n",
" plt.imshow(X_train[i].reshape(28, 28), cmap=plt.cm.Greys)\n",
"plt.show()"
]
},
@@ -264,7 +264,7 @@
"ds = ws.get_default_datastore()\n",
"print(ds.datastore_type, ds.account_name, ds.container_name)\n",
"\n",
"ds.upload(src_dir = './data', target_path = 'mnist', overwrite = True, show_progress = True)"
"ds.upload(src_dir='./data', target_path='mnist', overwrite=True, show_progress=True)"
]
},
{
@@ -339,7 +339,7 @@
"source": [
"import os\n",
"script_folder = './sklearn-mnist'\n",
"os.makedirs(script_folder, exist_ok = True)"
"os.makedirs(script_folder, exist_ok=True)"
]
},
{
@@ -371,8 +371,8 @@
"\n",
"# let user feed in 2 parameters, the location of the data files (from datastore), and the regularization rate of the logistic regression model\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument('--data-folder', type = str, dest = 'data_folder', help = 'data folder mounting point')\n",
"parser.add_argument('--regularization', type = float, dest = 'reg', default = 0.01, help = 'regularization rate')\n",
"parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')\n",
"parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate')\n",
"args = parser.parse_args()\n",
"\n",
"data_folder = os.path.join(args.data_folder, 'mnist')\n",
@@ -389,25 +389,23 @@
"# get hold of the current run\n",
"run = Run.get_submitted_run()\n",
"\n",
"# train a logistic regression model with specified regularization rate\n",
"print('Train a logistic regression model with regularizaion rate of', args.reg)\n",
"clf = LogisticRegression(C = 1.0/args.reg, random_state = 42)\n",
"clf = LogisticRegression(C=1.0/args.reg, random_state=42)\n",
"clf.fit(X_train, y_train)\n",
"\n",
"print('Predict the test set')\n",
"# predict on the test set\n",
"y_hat = clf.predict(X_test)\n",
"\n",
"# calculate accuracy on the prediction\n",
"acc = np.average(y_hat == y_test)\n",
"print('Accuracy is', acc)\n",
"\n",
"# log regularization rate and accuracy \n",
"run.log('regularization rate', np.float(args.reg))\n",
"run.log('accuracy', np.float(acc))\n",
"\n",
"os.makedirs('outputs', exist_ok = True)\n",
"joblib.dump(value = clf, filename = 'outputs/sklearn_mnist_model.pkl')"
"os.makedirs('outputs', exist_ok=True)\n",
"# note file saved in the outputs folder is automatically uploaded into experiment record\n",
"joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')"
]
},
{
@@ -417,7 +415,7 @@
"Notice how the script gets data and saves models:\n",
"\n",
"+ The training script reads an argument to find the directory containing the data. When you submit the job later, you point to the datastore for this argument:\n",
"`parser.add_argument('--data-folder', type = str, dest = 'data_folder', help = 'data directory mounting point')`"
"`parser.add_argument('--data-folder', type=str, dest='data_folder', help='data directory mounting point')`"
]
},
{
@@ -426,7 +424,7 @@
"source": [
"\n",
"+ The training script saves your model into a directory named outputs. <br/>\n",
"`joblib.dump(value = clf, filename = 'outputs/sklearn_mnist_model.pkl')`<br/>\n",
"`joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')`<br/>\n",
"Anything written in this directory is automatically uploaded into your workspace. You'll access your model from this directory later in the tutorial."
]
},
@@ -477,11 +475,11 @@
" '--regularization': 0.8\n",
"}\n",
"\n",
"est = Estimator(source_directory = script_folder,\n",
" script_params = script_params,\n",
" compute_target = compute_target,\n",
" entry_script = 'train.py',\n",
" conda_packages = ['scikit-learn'])"
"est = Estimator(source_directory=script_folder,\n",
" script_params=script_params,\n",
" compute_target=compute_target,\n",
" entry_script='train.py',\n",
" conda_packages=['scikit-learn'])"
]
},
{
@@ -562,7 +560,7 @@
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output = True) # specify True for a verbose log"
"run.wait_for_completion(show_output=True) # specify True for a verbose log"
]
},
{
@@ -623,7 +621,7 @@
"outputs": [],
"source": [
"# register model \n",
"model = run.register_model(model_name = 'sklearn_mnist', model_path = 'outputs/sklearn_mnist_model.pkl')\n",
"model = run.register_model(model_name='sklearn_mnist', model_path='outputs/sklearn_mnist_model.pkl')\n",
"print(model.name, model.id, model.version, sep = '\\t')"
]
},


@@ -34,7 +34,45 @@
"\n",
"Complete the model training in the [Tutorial #1: Train an image classification model with Azure Machine Learning](01.train-models.ipynb) notebook. \n",
"\n",
"If you did NOT complete the tutorial, you can instead run this cell to create a model and download the data needed for this tutorial:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# These prerequisites are created in the training tutorial\n",
"# Feel free to skip this cell if you completed the training tutorial \n",
"\n",
"# register a model\n",
"from azureml.core import Workspace\n",
"ws = Workspace.from_config()\n",
"\n",
"from azureml.core.model import Model\n",
"\n",
"model_name = \"sklearn_mnist\"\n",
"model = Model.register(model_path=\"sklearn_mnist_model.pkl\",\n",
" model_name=model_name,\n",
" tags={\"data\": \"mnist\", \"model\": \"classification\"},\n",
" description=\"Mnist handwriting recognition\",\n",
" workspace=ws)\n",
"\n",
"# download test data\n",
"import os\n",
"import urllib.request\n",
"\n",
"os.makedirs('./data', exist_ok=True)\n",
"\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', filename='./data/test-images.gz')\n",
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename='./data/test-labels.gz')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up the environment\n",
"\n",
"Start by setting up a testing environment.\n",
@@ -113,9 +151,8 @@
"from utils import load_data\n",
"\n",
"# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster\n",
"\n",
"X_test = load_data('./data/test-images.gz', False) / 255.0\n",
"y_test = load_data('./data/test-labels.gz', True).reshape(-1)\n"
"y_test = load_data('./data/test-labels.gz', True).reshape(-1)"
]
},
{
@@ -175,13 +212,14 @@
"metadata": {},
"outputs": [],
"source": [
"row_sums = conf_mx.sum(axis = 1, keepdims = True)\n",
"# normalize the diagnal cells so that they don't overpower the rest of the cells when visualized\n",
"row_sums = conf_mx.sum(axis=1, keepdims=True)\n",
"norm_conf_mx = conf_mx / row_sums\n",
"np.fill_diagonal(norm_conf_mx, 0)\n",
"\n",
"fig = plt.figure(figsize = (8,5))\n",
"fig = plt.figure(figsize=(8,5))\n",
"ax = fig.add_subplot(111)\n",
"cax = ax.matshow(norm_conf_mx, cmap = plt.cm.bone)\n",
"cax = ax.matshow(norm_conf_mx, cmap=plt.cm.bone)\n",
"ticks = np.arange(0, 10, 1)\n",
"ax.set_xticks(ticks)\n",
"ax.set_yticks(ticks)\n",
@@ -232,12 +270,11 @@
"from sklearn.externals import joblib\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"#from azureml.assets.persistence.persistence import get_model_path\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global model\n",
" # retreive the local path to the model using the model name\n",
" # retreive the path to the model file using the model name\n",
" model_path = Model.get_model_path('sklearn_mnist')\n",
" model = joblib.load(model_path)\n",
"\n",
@@ -263,16 +300,29 @@
"metadata": {},
"outputs": [],
"source": [
"%%writefile myenv.yml\n",
"name: myenv\n",
"channels:\n",
" - defaults\n",
"dependencies:\n",
" - scikit-learn\n",
" - pip:\n",
" # Required packages for AzureML execution, history, and data preparation.\n",
" - --extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/Preview/E7501C02541B433786111FE8E140CAA1\n",
" - azureml-core"
"from azureml.core.conda_dependencies import CondaDependencies \n",
"\n",
"myenv = CondaDependencies()\n",
"myenv.add_conda_package(\"scikit-learn\")\n",
"\n",
"with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Review the content of the file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pfile myenv.yml"
]
},
{
@@ -281,7 +331,7 @@
"source": [
"### Create configuration file\n",
"\n",
"Create a deployment configuration file and specify the number of CPUs and gigabyte of RAM needed for your ACI container. While it depends on your model, the default of 1 core and 1 gigabyte of RAM is usually sufficient for many models. If you feel you need more later, you would have to recreate the image and redeploy the service."
"Create a deployment configuration file and specify the number of CPUs and gigabyte of RAM needed for your ACI container. While it depends on your model, the default of 1 core and 1 gigabyte of RAM is usually sufficient for many models. If you feel you need more later, you can always modify the configuration and redeploy the service."
]
},
{
@@ -292,10 +342,10 @@
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
" memory_gb = 1, \n",
" tags = {\"data\": \"MNIST\", \"method\" : \"sklearn\"}, \n",
" description = 'Predict MNIST with sklearn')"
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n",
" memory_gb=1, \n",
" tags={\"data\": \"MNIST\", \"method\" : \"sklearn\"}, \n",
" description='Predict MNIST with sklearn')"
]
},
{
@@ -328,17 +378,17 @@
"from azureml.core.image import ContainerImage\n",
"\n",
"# configure the image\n",
"image_config = ContainerImage.image_configuration(execution_script = \"score.py\", \n",
" runtime = \"python\", \n",
" conda_file = \"myenv.yml\")\n",
"image_config = ContainerImage.image_configuration(execution_script=\"score.py\", \n",
" runtime=\"python\", \n",
" conda_file=\"myenv.yml\")\n",
"\n",
"service = Webservice.deploy_from_model(workspace = ws,\n",
" name = 'sklearn-mnist-model',\n",
" deployment_config = aciconfig,\n",
" models = [model],\n",
" image_config = image_config)\n",
"service = Webservice.deploy_from_model(workspace=ws,\n",
" name='sklearn-mnist-model',\n",
" deployment_config=aciconfig,\n",
" models=[model],\n",
" image_config=image_config)\n",
"\n",
"service.wait_for_deployment(show_output = True)"
"service.wait_for_deployment(show_output=True)"
]
},
{
@@ -391,7 +441,7 @@
"test_samples = bytes(test_samples, encoding = 'utf8')\n",
"\n",
"# predict using the deployed model\n",
"result = json.loads(service.run(input_data = test_samples))\n",
"result = json.loads(service.run(input_data=test_samples))\n",
"\n",
"# compare actual value vs. the predicted values:\n",
"i = 0\n",
@@ -406,8 +456,8 @@
" font_color = 'red' if y_test[s] != result[i] else 'black'\n",
" clr_map = plt.cm.gray if y_test[s] != result[i] else plt.cm.Greys\n",
" \n",
" plt.text(x = 10, y = -10, s = result[i], fontsize = 18, color = font_color)\n",
" plt.imshow(X_test[s].reshape(28, 28), cmap = clr_map)\n",
" plt.text(x=10, y =-10, s=result[i], fontsize=18, color=font_color)\n",
" plt.imshow(X_test[s].reshape(28, 28), cmap=clr_map)\n",
" \n",
" i = i + 1\n",
"plt.show()"

Binary file not shown.