version 1.0.43

This commit is contained in:
Roope Astala
2019-06-12 10:08:35 -04:00
parent 5e91c836b9
commit 79c9f50c15
33 changed files with 7263 additions and 3153 deletions

View File

@@ -102,7 +102,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.0.41 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.0.43 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -204,7 +204,7 @@
"\n",
"If you don't have an existing workspace and are the owner of the subscription or resource group, you can create a new workspace. If you don't have a resource group, the create workspace command will create one for you using the name you provide.\n",
"\n",
"**Note**: The Workspace creation command will create default CPU and GPU compute clusters for you. As with other Azure services, there are limits on certain resources (for example AmlCompute quota) associated with the Azure ML service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota.\n",
"**Note**: As with other Azure services, there are limits on certain resources (for example AmlCompute quota) associated with the Azure ML service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota.\n",
"\n",
"This cell will create an Azure ML workspace for you in a subscription provided you have the correct permissions.\n",
"\n",
@@ -233,8 +233,6 @@
" subscription_id = subscription_id,\n",
" resource_group = resource_group, \n",
" location = workspace_region,\n",
" default_cpu_compute_target=Workspace.DEFAULT_CPU_CLUSTER_CONFIGURATION,\n",
" default_gpu_compute_target=Workspace.DEFAULT_GPU_CLUSTER_CONFIGURATION,\n",
" create_resource_group = True,\n",
" exist_ok = True)\n",
"ws.get_details()\n",

View File

@@ -192,7 +192,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create or Attach a Remote Linux DSVM"
"### Create or Attach an AmlCompute cluster"
]
},
{
@@ -201,21 +201,36 @@
"metadata": {},
"outputs": [],
"source": [
"dsvm_name = 'mydsvmb'\n",
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"try:\n",
" while ws.compute_targets[dsvm_name].provisioning_state == 'Creating':\n",
" time.sleep(1)\n",
" \n",
" dsvm_compute = DsvmCompute(ws, dsvm_name)\n",
" print('Found existing DVSM.')\n",
"except:\n",
" print('Creating a new DSVM.')\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2_v2\")\n",
" dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output = True)\n",
" print(\"Waiting one minute for ssh to be accessible\")\n",
" time.sleep(90) # Wait for ssh to be accessible"
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"cpucluster\"\n",
"\n",
"found = False\n",
"\n",
"# Check if this compute target already exists in the workspace.\n",
"\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
"\n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\\n\",\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
"\n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
"\n",
" # For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
@@ -227,9 +242,13 @@
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"conda_run_config.target = dsvm_compute\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
@@ -294,6 +313,27 @@
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -1,565 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-attach/auto-ml-remote-attach.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution using attach**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the scikit-learn's [20newsgroup](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html) to showcase how you can use AutoML to handle text data with remote attach.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you will learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Attach an existing DSVM to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using the DSVM.\n",
"5. Explore the results.\n",
"6. Viewing the engineered names for featurized data and featurization summary for all raw features.\n",
"7. Test the best fitted model.\n",
"\n",
"In addition this notebook showcases the following features\n",
"- **Parallel** executions for iterations\n",
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`\n",
"- Handling **text** data using the `preprocess` flag"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-attach'\n",
"project_folder = './sample_projects/automl-remote-attach'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach a Remote Linux DSVM\n",
"To use a remote Docker compute target:\n",
"1. Create a Linux DSVM in Azure, following these [instructions](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro). Make sure you use the Ubuntu flavor (not CentOS). Make sure that disk space is available under `/tmp` because AutoML creates files under `/tmp/azureml_run`s. The DSVM should have more cores than the number of parallel runs that you plan to enable. It should also have at least 4GB per core.\n",
"2. Enter the IP address, user name and password below.\n",
"\n",
"**Note:** By default, SSH runs on port 22 and you don't need to change the port number below. If you've configured SSH to use a different port, change `dsvm_ssh_port` accordinglyaddress. [Read more](https://docs.microsoft.com/en-us/azure/virtual-machines/troubleshooting/detailed-troubleshoot-ssh-connection) on changing SSH ports for security reasons."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, RemoteCompute\n",
"import time\n",
"\n",
"# Add your VM information below\n",
"# If a compute with the specified compute_name already exists, it will be used and the dsvm_ip_addr, dsvm_ssh_port, \n",
"# dsvm_username and dsvm_password will be ignored.\n",
"compute_name = 'mydsvmb'\n",
"dsvm_ip_addr = '<<ip_addr>>'\n",
"dsvm_ssh_port = 22\n",
"dsvm_username = '<<username>>'\n",
"dsvm_password = '<<password>>'\n",
"\n",
"if compute_name in ws.compute_targets:\n",
" print('Using existing compute.')\n",
" dsvm_compute = ws.compute_targets[compute_name]\n",
"else:\n",
" attach_config = RemoteCompute.attach_configuration(address=dsvm_ip_addr, username=dsvm_username, password=dsvm_password, ssh_port=dsvm_ssh_port)\n",
" ComputeTarget.attach(workspace=ws, name=compute_name, attach_configuration=attach_config)\n",
"\n",
" while ws.compute_targets[compute_name].provisioning_state == 'Creating':\n",
" time.sleep(1)\n",
"\n",
" dsvm_compute = ws.compute_targets[compute_name]\n",
" \n",
" if dsvm_compute.provisioning_state == 'Failed':\n",
" print('Attached failed.')\n",
" print(dsvm_compute.provisioning_errors)\n",
" dsvm_compute.detach()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"\n",
"pandas_dependency = 'pandas==' + pkg_resources.get_distribution(\"pandas\").version\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80',pandas_dependency])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"For remote executions you should author a `get_data.py` file containing a `get_data()` function. This file should be in the root directory of the project. You can encapsulate code to read data either from a blob storage or local disk in this file.\n",
"In this example, the `get_data()` function returns a [dictionary](README.md#getdata)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import numpy as np\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"def get_data():\n",
" remove = ('headers', 'footers', 'quotes')\n",
" categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
" data_train = fetch_20newsgroups(subset = 'train', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
" \n",
" X_train = np.array(data_train.data).reshape((len(data_train.data),1))\n",
" y_train = np.array(data_train.target)\n",
" \n",
" return { \"X\" : X_train, \"y\" : y_train }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n",
"**Note:** When using Remote DSVM, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM.|\n",
"|**preprocess**|Setting this to *True* enables AutoML to perform preprocessing on the input to handle *missing data*, and to perform some common *feature extraction*.|\n",
"|**enable_cache**|Setting this to *True* enables preprocess done once and reuse the same preprocessed data for all the iterations. Default value is True.\n",
"|**max_cores_per_iteration**|Indicates how many cores on the compute target would be used to train a single pipeline.<br>Default is *1*; you can set it to *-1* to use all cores.|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 60,\n",
" \"iterations\": 4,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_cores_per_iteration\": 2\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pre-process cache cleanup\n",
"The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the engineered names for featurized data\n",
"Below we display the engineered feature names generated for the featurized data using the preprocessing featurization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['datatransformer'].get_engineered_feature_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### View the featurization summary\n",
"Below we display the featurization that was performed on different raw features in the user data. For each raw feature in the user data, the following information is displayed:-\n",
"- Raw feature name\n",
"- Number of engineered features formed out of this raw feature\n",
"- Type detected\n",
"- If feature was dropped\n",
"- List of feature transformations for the raw feature"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fitted_model.named_steps['datatransformer'].get_featurization_summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `accuracy` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lookup_metric = \"accuracy\"\n",
"# best_run, fitted_model = remote_run.get_output(metric = lookup_metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 0\n",
"zero_run, zero_model = remote_run.get_output(iteration = iteration)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test data.\n",
"from pandas_ml import ConfusionMatrix\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"remove = ('headers', 'footers', 'quotes')\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"\n",
"data_test = fetch_20newsgroups(subset = 'test', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
"\n",
"X_test = np.array(data_test.data).reshape((len(data_test.data),1))\n",
"y_test = data_test.target\n",
"\n",
"# Test our best pipeline.\n",
"\n",
"y_pred = fitted_model.predict(X_test)\n",
"y_pred_strings = [data_test.target_names[i] for i in y_pred]\n",
"y_test_strings = [data_test.target_names[i] for i in y_test]\n",
"\n",
"cm = ConfusionMatrix(y_test_strings, y_pred_strings)\n",
"print(cm)\n",
"cm.plot()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,555 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution using AmlCompute**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you would see\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Create or Attach existing AmlCompute to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using AmlCompute\n",
"5. Explore the results.\n",
"6. Test the best fitted model.\n",
"\n",
"In addition this notebook showcases the following features\n",
"- **Parallel** executions for iterations\n",
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import csv\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-amlcompute'\n",
"project_folder = './project'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create or Attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"\n",
"# Choose a name for your cluster.\n",
"amlcompute_cluster_name = \"automlcl\"\n",
"\n",
"found = False\n",
"# Check if this compute target already exists in the workspace.\n",
"cts = ws.compute_targets\n",
"if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n",
" found = True\n",
" print('Found existing compute target.')\n",
" compute_target = cts[amlcompute_cluster_name]\n",
" \n",
"if not found:\n",
" print('Creating a new compute target...')\n",
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n",
" #vm_priority = 'lowpriority', # optional\n",
" max_nodes = 6)\n",
"\n",
" # Create the cluster.\n",
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n",
" \n",
" # Can poll for a minimum number of nodes and for a specific timeout.\n",
" # If no min_node_count is provided, it will use the scale settings for the cluster.\n",
" compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n",
" \n",
" # For a more detailed view of current AmlCompute status, use get_status()."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"For remote executions, you need to make the data accessible from the remote compute.\n",
"This can be done by uploading the data to DataStore.\n",
"In this example, we upload scikit-learn's [load_digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = datasets.load_digits()\n",
"\n",
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")\n",
"\n",
"ds = ws.get_default_datastore()\n",
"ds.upload(src_dir='./data', target_path='bai_data', overwrite=True, show_progress=True)\n",
"\n",
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='bai_data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to AmlCompute\n",
"conda_run_config.target = compute_target\n",
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/bai_data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n",
"**Note:** When using AmlCompute, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM.|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 2,\n",
" \"iterations\": 20,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": False,\n",
" \"max_concurrent_iterations\": 5,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder,\n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run.\n",
"In this example, we specify `show_output = False` to suppress console output while the run is in progress."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"\n",
"#### Loading executed runs\n",
"In case you need to load a previously executed run, enable the cell below and replace the `run_id` value."
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"remote_run = AutoMLRun(experiment = experiment, run_id = 'AutoML_5db13491-c92a-4f1d-b622-8ab8d973a058')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}\n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `log_loss` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"log_loss\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration\n",
"Show the run and the model from the third iteration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration=iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"#### Load Test Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"digits = datasets.load_digits()\n",
"X_test = digits.data[:10, :]\n",
"y_test = digits.target[:10]\n",
"images = digits.images[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Testing Our Best Fitted Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Randomly select digits and test.\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n",
" print(index)\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(1, figsize=(3,3))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,593 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-execution-with-datastore/auto-ml-remote-execution-with-datastore.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution with DataStore**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"This sample accesses a data file on a remote DSVM through DataStore. Advantages of using data store are:\n",
"1. DataStore secures the access details.\n",
"2. DataStore supports read, write to blob and file store\n",
"3. AutoML natively supports copying data from DataStore to DSVM\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you would see\n",
"1. Storing data in DataStore.\n",
"2. get_data returning data from DataStore."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import time\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import azureml.core\n",
"from azureml.core.compute import DsvmCompute\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# choose a name for experiment\n",
"experiment_name = 'automl-remote-datastore-file'\n",
"# project folder\n",
"project_folder = './sample_projects/automl-remote-datastore-file'\n",
"\n",
"experiment=Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Remote Linux DSVM\n",
"Note: If creation fails with a message about Marketplace purchase eligibilty, go to portal.azure.com, start creating DSVM there, and select \"Want to create programmatically\" to enable programmatic creation. Once you've enabled it, you can exit without actually creating VM.\n",
"\n",
"**Note**: By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you can switch to a different port (such as 5022), you can append the port number to the address. [Read more](https://docs.microsoft.com/en-us/azure/virtual-machines/troubleshooting/detailed-troubleshoot-ssh-connection) on this."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compute_target_name = 'mydsvmc'\n",
"\n",
"try:\n",
" while ws.compute_targets[compute_target_name].provisioning_state == 'Creating':\n",
" time.sleep(1)\n",
" \n",
" dsvm_compute = DsvmCompute(workspace=ws, name=compute_target_name)\n",
" print('found existing:', dsvm_compute.name)\n",
"except:\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size=\"Standard_D2_v2\")\n",
" dsvm_compute = DsvmCompute.create(ws, name=compute_target_name, provisioning_configuration=dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output=True)\n",
" print(\"Waiting one minute for ssh to be accessible\")\n",
" time.sleep(90) # Wait for ssh to be accessible"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"\n",
"### Copy data file to local\n",
"\n",
"Download the data file.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir('data'):\n",
" os.mkdir('data') "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"import csv\n",
"\n",
"remove = ('headers', 'footers', 'quotes')\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"data_train = fetch_20newsgroups(subset = 'train', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload data to the cloud"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now make the data accessible remotely by uploading that data from your local machine into Azure so it can be accessed for remote training. The datastore is a convenient construct associated with your workspace for you to upload/download data, and interact with it from your remote compute targets. It is backed by Azure blob storage account.\n",
"\n",
"The data.tsv files are uploaded into a directory named data at the root of the datastore."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#blob_datastore = Datastore(ws, blob_datastore_name)\n",
"ds = ws.get_default_datastore()\n",
"print(ds.datastore_type, ds.account_name, ds.container_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ds.upload_files(\"data.tsv\")\n",
"ds.upload(src_dir='./data', target_path='data', overwrite=True, show_progress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure & Run\n",
"\n",
"First let's create a DataReferenceConfigruation object to inform the system what data folder to download to the compute target.\n",
"The path_on_compute should be an absolute path to ensure that the data files are downloaded only once. The get_data method should use this same path to access the data files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"import pkg_resources\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"pandas_dependency = 'pandas==' + pkg_resources.get_distribution(\"pandas\").version\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80',pandas_dependency])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Get Data File\n",
"For remote executions you should author a get_data.py file containing a get_data() function. This file should be in the root directory of the project. You can encapsulate code to read data either from a blob storage or local disk in this file.\n",
"\n",
"The *get_data()* function returns a [dictionary](README.md#getdata).\n",
"\n",
"The read_csv uses the path_on_compute value specified in the DataReferenceConfiguration call plus the path_on_datastore folder and then the actual file name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify automl_settings as **kwargs** as well. Also note that you can use the get_data() symantic for local excutions too. \n",
"\n",
"<i>Note: For Remote DSVM and Batch AI you cannot pass Numpy arrays directly to AutoMLConfig.</i>\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration|\n",
"|**iterations**|Number of iterations. In each iteration Auto ML trains a specific pipeline with the data|\n",
"|**n_cross_validations**|Number of cross validation splits|\n",
"|**max_concurrent_iterations**|Max number of iterations that would be executed in parallel. This should be less than the number of cores on the DSVM\n",
"|**preprocess**| *True/False* <br>Setting this to *True* enables Auto ML to perform preprocessing <br>on the input to handle *missing data*, and perform some common *feature extraction*|\n",
"|**enable_cache**|Setting this to *True* enables preprocess done once and reuse the same preprocessed data for all the iterations. Default value is True.|\n",
"|**max_cores_per_iteration**| Indicates how many cores on the compute target would be used to train a single pipeline.<br> Default is *1*, you can set it to *-1* to use all cores|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 60,\n",
" \"iterations\": 4,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": True,\n",
" \"max_cores_per_iteration\": 1,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path=project_folder,\n",
" run_configuration=conda_run_config,\n",
" #compute_target = dsvm_compute,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets/models even when the experiment is running to retreive the best model up to that point. Once you are satisfied with the model you can cancel a particular iteration or the whole run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"#### Widget for monitoring runs\n",
"\n",
"The widget will sit on \"loading\" until the first iteration completed, then you will see an auto-updating graph and table show up. It refreshed once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under /tmp/azureml_run/{iterationid}/azureml-logs\n",
"\n",
"NOTE: The widget displays a link at the bottom. This links to a web-ui to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use sdk methods to fetch all the child runs and see individual metrics that we log. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Canceling Runs\n",
"You can cancel ongoing remote runs using the *cancel()* and *cancel_iteration()* functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pre-process cache cleanup\n",
"The preprocess data gets cache at user default file store. When the run is completed the cache can be cleaned by running below cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run.clean_preprocessor_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The *get_output* method returns the best run and the fitted model. There are overloads on *get_output* that allow you to retrieve the best run and fitted model for *any* logged metric or a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model based on any other metric"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lookup_metric = \"accuracy\"\n",
"# best_run, fitted_model = remote_run.get_output(metric=lookup_metric)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a specific iteration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# iteration = 1\n",
"# best_run, fitted_model = remote_run.get_output(iteration=iteration)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test data.\n",
"from pandas_ml import ConfusionMatrix\n",
"\n",
"data_test = fetch_20newsgroups(subset = 'test', categories = categories,\n",
" shuffle = True, random_state = 42,\n",
" remove = remove)\n",
"\n",
"X_test = np.array(data_test.data).reshape((len(data_test.data),1))\n",
"y_test = data_test.target\n",
"\n",
"# Test our best pipeline.\n",
"\n",
"y_pred = fitted_model.predict(X_test)\n",
"y_pred_strings = [data_test.target_names[i] for i in y_pred]\n",
"y_test_strings = [data_test.target_names[i] for i in y_test]\n",
"\n",
"cm = ConfusionMatrix(y_test_strings, y_pred_strings)\n",
"print(cm)\n",
"cm.plot()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,534 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/remote-execution/auto-ml-remote-execution.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automated Machine Learning\n",
"_**Remote Execution using DSVM (Ubuntu)**_\n",
"\n",
"## Contents\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Data](#Data)\n",
"1. [Train](#Train)\n",
"1. [Results](#Results)\n",
"1. [Test](#Test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"\n",
"In this notebook you wiil learn how to:\n",
"1. Create an `Experiment` in an existing `Workspace`.\n",
"2. Attach an existing DSVM to a workspace.\n",
"3. Configure AutoML using `AutoMLConfig`.\n",
"4. Train the model using the DSVM.\n",
"5. Explore the results.\n",
"6. Test the best fitted model.\n",
"\n",
"In addition, this notebook showcases the following features:\n",
"- **Parallel** executions for iterations\n",
"- **Asynchronous** tracking of progress\n",
"- **Cancellation** of individual iterations or the entire run\n",
"- Retrieving models for any iteration or logged metric\n",
"- Specifying AutoML settings as `**kwargs`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"import time\n",
"import csv\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.train.automl import AutoMLConfig"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"\n",
"# Choose a name for the run history container in the workspace.\n",
"experiment_name = 'automl-remote-dsvm'\n",
"project_folder = './project'\n",
"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"output = {}\n",
"output['SDK version'] = azureml.core.VERSION\n",
"output['Subscription ID'] = ws.subscription_id\n",
"output['Workspace Name'] = ws.name\n",
"output['Resource Group'] = ws.resource_group\n",
"output['Location'] = ws.location\n",
"output['Project Directory'] = project_folder\n",
"output['Experiment Name'] = experiment.name\n",
"pd.set_option('display.max_colwidth', -1)\n",
"outputDf = pd.DataFrame(data = output, index = [''])\n",
"outputDf.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a Remote Linux DSVM\n",
"**Note:** If creation fails with a message about Marketplace purchase eligibilty, start creation of a DSVM through the [Azure portal](https://portal.azure.com), and select \"Want to create programmatically\" to enable programmatic creation. Once you've enabled this setting, you can exit the portal without actually creating the DSVM, and creation of the DSVM through the notebook should work.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import DsvmCompute\n",
"\n",
"dsvm_name = 'mydsvma'\n",
"try:\n",
" dsvm_compute = DsvmCompute(ws, dsvm_name)\n",
" print('Found an existing DSVM.')\n",
"except:\n",
" print('Creating a new DSVM.')\n",
" dsvm_config = DsvmCompute.provisioning_configuration(vm_size = \"Standard_D2_v2\")\n",
" dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)\n",
" dsvm_compute.wait_for_completion(show_output = True)\n",
" print(\"Waiting one minute for ssh to be accessible\")\n",
" time.sleep(90) # Wait for ssh to be accessible"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data\n",
"For remote executions, you need to make the data accessible from the remote compute.\n",
"This can be done by uploading the data to DataStore.\n",
"In this example, we upload scikit-learn's [load_digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = datasets.load_digits()\n",
"\n",
"if not os.path.isdir('data'):\n",
" os.mkdir('data')\n",
" \n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)\n",
" \n",
"pd.DataFrame(data_train.data).to_csv(\"data/X_train.tsv\", index=False, header=False, quoting=csv.QUOTE_ALL, sep=\"\\t\")\n",
"pd.DataFrame(data_train.target).to_csv(\"data/y_train.tsv\", index=False, header=False, sep=\"\\t\")\n",
"\n",
"ds = ws.get_default_datastore()\n",
"ds.upload(src_dir='./data', target_path='re_data', overwrite=True, show_progress=True)\n",
"\n",
"from azureml.core.runconfig import DataReferenceConfiguration\n",
"dr = DataReferenceConfiguration(datastore_name=ds.name, \n",
" path_on_datastore='re_data', \n",
" path_on_compute='/tmp/azureml_runs',\n",
" mode='download', # download files from datastore to compute target\n",
" overwrite=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new RunConfig object\n",
"conda_run_config = RunConfiguration(framework=\"python\")\n",
"\n",
"# Set compute target to the Linux DSVM\n",
"conda_run_config.target = dsvm_compute\n",
"\n",
"# set the data reference of the run coonfiguration\n",
"conda_run_config.data_references = {ds.name: dr}\n",
"\n",
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])\n",
"conda_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
"\n",
"import pandas as pd\n",
"\n",
"def get_data():\n",
" X_train = pd.read_csv(\"/tmp/azureml_runs/re_data/X_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
" y_train = pd.read_csv(\"/tmp/azureml_runs/re_data/y_train.tsv\", delimiter=\"\\t\", header=None, quotechar='\"')\n",
"\n",
" return { \"X\" : X_train.values, \"y\" : y_train[0].values }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train\n",
"\n",
"You can specify `automl_settings` as `**kwargs` as well. Also note that you can use a `get_data()` function for local excutions too.\n",
"\n",
"**Note:** When using Remote DSVM, you can't pass Numpy arrays directly to the fit method.\n",
"\n",
"|Property|Description|\n",
"|-|-|\n",
"|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|\n",
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
"|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n",
"|**n_cross_validations**|Number of cross validation splits.|\n",
"|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be less than the number of cores on the DSVM.|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"iteration_timeout_minutes\": 10,\n",
" \"iterations\": 20,\n",
" \"n_cross_validations\": 5,\n",
" \"primary_metric\": 'AUC_weighted',\n",
" \"preprocess\": False,\n",
" \"max_concurrent_iterations\": 2,\n",
" \"verbosity\": logging.INFO\n",
"}\n",
"\n",
"automl_config = AutoMLConfig(task = 'classification',\n",
" debug_log = 'automl_errors.log',\n",
" path = project_folder, \n",
" run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" **automl_settings\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note:** The first run on a new DSVM may take several minutes to prepare the environment."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Call the `submit` method on the experiment object and pass the run configuration. For remote runs the execution is asynchronous, so you will see the iterations get populated as they complete. You can interact with the widgets and models even when the experiment is running to retrieve the best model up to that point. Once you are satisfied with the model, you can cancel a particular iteration or the whole run.\n",
"\n",
"In this example, we specify `show_output = False` to suppress console output while the run is in progress."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run = experiment.submit(automl_config, show_output = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"remote_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n",
"\n",
"#### Loading Executed Runs\n",
"In case you need to load a previously executed run, enable the cell below and replace the `run_id` value."
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"remote_run = AutoMLRun(experiment=experiment, run_id = 'AutoML_480d3ed6-fc94-44aa-8f4e-0b945db9d3ef')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Widget for Monitoring Runs\n",
"\n",
"The widget will first report a \"loading\" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"\n",
"You can click on a pipeline to see run properties and output logs. Logs are also available on the DSVM under `/tmp/azureml_run/{iterationid}/azureml-logs`\n",
"\n",
"**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"RunDetails(remote_run).show() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait until the run finishes.\n",
"remote_run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"#### Retrieve All Child Runs\n",
"You can also use SDK methods to fetch all the child runs and see individual metrics that we log."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"children = list(remote_run.get_children())\n",
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
"rundata = pd.DataFrame(metricslist).sort_index(1)\n",
"rundata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancelling Runs\n",
"\n",
"You can cancel ongoing remote runs using the `cancel` and `cancel_iteration` functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cancel the ongoing experiment and stop scheduling new iterations.\n",
"# remote_run.cancel()\n",
"\n",
"# Cancel iteration 1 and move onto iteration 2.\n",
"# remote_run.cancel_iteration(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve the Best Model\n",
"\n",
"Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run, fitted_model = remote_run.get_output()\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Best Model Based on Any Other Metric\n",
"Show the run and the model which has the smallest `log_loss` value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lookup_metric = \"log_loss\"\n",
"best_run, fitted_model = remote_run.get_output(metric = lookup_metric)\n",
"print(best_run)\n",
"print(fitted_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Model from a Specific Iteration\n",
"Show the run and the model from the third iteration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iteration = 3\n",
"third_run, third_model = remote_run.get_output(iteration = iteration)\n",
"print(third_run)\n",
"print(third_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test\n",
"\n",
"#### Load Test Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"digits = datasets.load_digits()\n",
"X_test = digits.data[:10, :]\n",
"y_test = digits.target[:10]\n",
"images = digits.images[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Test Our Best Fitted Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Randomly select digits and test.\n",
"for index in np.random.choice(len(y_test), 2, replace = False):\n",
" print(index)\n",
" predicted = fitted_model.predict(X_test[index:index + 1])[0]\n",
" label = y_test[index]\n",
" title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n",
" fig = plt.figure(1, figsize=(3,3))\n",
" ax1 = fig.add_axes((0,0,.8,.8))\n",
" ax1.set_title(title)\n",
" plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n",
" plt.show()"
]
}
],
"metadata": {
"authors": [
{
"name": "savitam"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -43,5 +43,6 @@ Take a look at [intro-to-pipelines](./intro-to-pipelines/) for the list of noteb
1. [pipeline-batch-scoring.ipynb](https://aka.ms/pl-batch-score): This notebook demonstrates how to run a batch scoring job using Azure Machine Learning pipelines.
2. [pipeline-style-transfer.ipynb](https://aka.ms/pl-style-trans): This notebook demonstrates a multi-step pipeline that uses GPU compute.
3. [nyc-taxi-data-regression-model-building.ipynb](https://aka.ms/pl-nyctaxi-tutorial): This notebook is an AzureML Pipelines version of the previously published two-part sample.
![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/machine-learning-pipelines/README.png)

View File

@@ -65,8 +65,6 @@
"import os\n",
"import azureml.core\n",
"from azureml.core import Workspace, Experiment, Datastore\n",
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"from azureml.widgets import RunDetails\n",
"\n",
"# Check core SDK version number\n",
@@ -116,36 +114,20 @@
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')\n",
"\n",
"# Default datastore (Azure file storage)\n",
"def_file_store = ws.get_default_datastore() \n",
"# The above call is equivalent to Datastore(ws, \"workspacefilestore\") or simply Datastore(ws)\n",
"print(\"Default datastore's name: {}\".format(def_file_store.name))\n",
"\n",
"# Blob storage associated with the workspace\n",
"# Default datastore\n",
"def_blob_store = ws.get_default_datastore() \n",
"# The following call GETS the Azure Blob Store associated with your workspace.\n",
"# Note that workspaceblobstore is **the name of this store and CANNOT BE CHANGED and must be used as is** \n",
"def_blob_store = Datastore(ws, \"workspaceblobstore\")\n",
"print(\"Blobstore's name: {}\".format(def_blob_store.name))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# project folder\n",
"project_folder = '.'\n",
" \n",
"print('Sample projects will be created in {}.'.format(os.path.realpath(project_folder)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Required data and script files for the the tutorial\n",
"Sample files required to finish this tutorial are already copied to the project folder specified above. Even though the .py provided in the samples don't have much \"ML work,\" as a data scientist, you will work on this extensively as part of your work. To complete this tutorial, the contents of these files are not very important. The one-line files are for demostration purpose only."
"Sample files required to finish this tutorial are already copied to the corresponding source_directory locations. Even though the .py provided in the samples don't have much \"ML work,\" as a data scientist, you will work on this extensively as part of your work. To complete this tutorial, the contents of these files are not very important. The one-line files are for demostration purpose only."
]
},
{
@@ -176,19 +158,10 @@
"metadata": {},
"outputs": [],
"source": [
"# get_default_datastore() gets the default Azure File Store associated with your workspace.\n",
"# Here we are reusing the def_file_store object we obtained earlier\n",
"\n",
"# target_path is the directory at the destination\n",
"def_file_store.upload_files(['./20news.pkl'], \n",
" target_path = '20newsgroups', \n",
" overwrite = True, \n",
" show_progress = True)\n",
"\n",
"# Here we are reusing the def_blob_store we created earlier\n",
"# get_default_datastore() gets the default Azure Blob Store associated with your workspace.\n",
"# Here we are reusing the def_blob_store object we obtained earlier\n",
"def_blob_store.upload_files([\"./20news.pkl\"], target_path=\"20newsgroups\", overwrite=True)\n",
"\n",
"print(\"Upload calls completed\")"
"print(\"Upload call completed\")"
]
},
{
@@ -290,9 +263,10 @@
"- [**MpiStep**](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.mpi_step.mpistep?view=azure-ml-py): Adds a step to run a MPI job in a Pipeline.\n",
"- [**AutoMLStep**](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlstep?view=azure-ml-py): Creates a AutoML step in a Pipeline.\n",
"\n",
"The following code will create a PythonScriptStep to be executed in the Azure Machine Learning Compute we created above using train.py, one of the files already made available in the project folder.\n",
"The following code will create a PythonScriptStep to be executed in the Azure Machine Learning Compute we created above using train.py, one of the files already made available in the `source_directory`.\n",
"\n",
"A **PythonScriptStep** is a basic, built-in step to run a Python Script on a compute target. It takes a script name and optionally other parameters like arguments for the script, compute target, inputs and outputs. If no compute target is specified, default compute target for the workspace is used. You can also use a [**RunConfiguration**](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py) to specify requirements for the PythonScriptStep, such as conda dependencies and docker image."
"A **PythonScriptStep** is a basic, built-in step to run a Python Script on a compute target. It takes a script name and optionally other parameters like arguments for the script, compute target, inputs and outputs. If no compute target is specified, default compute target for the workspace is used. You can also use a [**RunConfiguration**](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py) to specify requirements for the PythonScriptStep, such as conda dependencies and docker image.\n",
"> The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the `source_directory` for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the `source_directory` would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the `source_directory` of the step."
]
},
{
@@ -303,6 +277,9 @@
"source": [
"# Uses default values for PythonScriptStep construct.\n",
"\n",
"source_directory = './train'\n",
"print('Source directory for the step is {}.'.format(os.path.realpath(source_directory)))\n",
"\n",
"# Syntax\n",
"# PythonScriptStep(\n",
"# script_name, \n",
@@ -321,7 +298,7 @@
"step1 = PythonScriptStep(name=\"train_step\",\n",
" script_name=\"train.py\", \n",
" compute_target=aml_compute, \n",
" source_directory=project_folder,\n",
" source_directory=source_directory,\n",
" allow_reuse=True)\n",
"print(\"Step1 created\")"
]
@@ -351,12 +328,15 @@
"metadata": {},
"outputs": [],
"source": [
"# All steps use files already available in the project_folder\n",
"# For this step, we use a different source_directory\n",
"source_directory = './compare'\n",
"print('Source directory for the step is {}.'.format(os.path.realpath(source_directory)))\n",
"\n",
"# All steps use the same Azure Machine Learning compute target as well\n",
"step2 = PythonScriptStep(name=\"compare_step\",\n",
" script_name=\"compare.py\", \n",
" compute_target=aml_compute, \n",
" source_directory=project_folder)\n",
" source_directory=source_directory)\n",
"\n",
"# Use a RunConfiguration to specify some additional requirements for this step.\n",
"from azureml.core.runconfig import RunConfiguration\n",
@@ -378,10 +358,14 @@
"# specify CondaDependencies obj\n",
"run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n",
"\n",
"# For this step, we use yet another source_directory\n",
"source_directory = './extract'\n",
"print('Source directory for the step is {}.'.format(os.path.realpath(source_directory)))\n",
"\n",
"step3 = PythonScriptStep(name=\"extract_step\",\n",
" script_name=\"extract.py\", \n",
" compute_target=aml_compute, \n",
" source_directory=project_folder,\n",
" source_directory=source_directory,\n",
" runconfig=run_config)\n",
"\n",
"# list of steps to run\n",
@@ -597,7 +581,7 @@
"metadata": {
"authors": [
{
"name": "diray"
"name": "sanpil"
}
],
"kernelspec": {

View File

@@ -22,9 +22,17 @@
"# Azure Machine Learning Pipeline with HyperDriveStep\n",
"\n",
"\n",
"This notebook is used to demonstrate the use of HyperDriveStep in AML Pipeline.\n",
"This notebook is used to demonstrate the use of HyperDriveStep in AML Pipeline."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites and Azure Machine Learning Basics\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the configuration Notebook located at https://github.com/Azure/MachineLearningNotebooks first if you haven't. This sets you up with a working config file that has information on your workspace, subscription id, etc. \n",
"\n",
"## Azure Machine Learning and Pipeline SDK-specific imports\n"
"## Azure Machine Learning and Pipeline SDK-specific imports"
]
},
{
@@ -33,19 +41,24 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"import urllib\n",
"import azureml.core\n",
"from azureml.core import Workspace, Experiment\n",
"from azureml.core.datastore import Datastore\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.exceptions import ComputeTargetException\n",
"from azureml.data.data_reference import DataReference\n",
"from azureml.pipeline.steps import HyperDriveStep\n",
"from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun\n",
"from azureml.pipeline.core import Pipeline, PipelineData\n",
"from azureml.train.dnn import TensorFlow\n",
"from azureml.train.hyperdrive import *\n",
"# from azureml.train.hyperdrive import *\n",
"from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal\n",
"from azureml.train.hyperdrive import choice, loguniform\n",
"\n",
"import os\n",
"import shutil\n",
"import urllib\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
@@ -87,7 +100,7 @@
"script_folder = './tf-mnist'\n",
"os.makedirs(script_folder, exist_ok=True)\n",
"\n",
"exp = Experiment(workspace=ws, name='tf-mnist')"
"exp = Experiment(workspace=ws, name='Hyperdrive_sample')"
]
},
{
@@ -112,6 +125,42 @@
"urllib.request.urlretrieve('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', filename = './data/mnist/test-labels.gz')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show some sample images\n",
"Let's load the downloaded compressed file into numpy arrays using some utility functions included in the `utils.py` library file from the current folder. Then we use `matplotlib` to plot 30 random images from the dataset along with their labels."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from utils import load_data\n",
"\n",
"# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster.\n",
"X_train = load_data('./data/mnist/train-images.gz', False) / 255.0\n",
"y_train = load_data('./data/mnist/train-labels.gz', True).reshape(-1)\n",
"\n",
"X_test = load_data('./data/mnist/test-images.gz', False) / 255.0\n",
"y_test = load_data('./data/mnist/test-labels.gz', True).reshape(-1)\n",
"\n",
"count = 0\n",
"sample_size = 30\n",
"plt.figure(figsize = (16, 6))\n",
"for i in np.random.permutation(X_train.shape[0])[:sample_size]:\n",
" count = count + 1\n",
" plt.subplot(1, sample_size, count)\n",
" plt.axhline('')\n",
" plt.axvline('')\n",
" plt.text(x = 10, y = -10, s = y_train[i], fontsize = 18)\n",
" plt.imshow(X_train[i].reshape(28, 28), cmap = plt.cm.Greys)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -135,7 +184,8 @@
"metadata": {},
"source": [
"## Retrieve or create a Azure Machine Learning compute\n",
"Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's get the default Azure Machine Learning Compute in the current workspace. We will then run the training script on this compute target."
"Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads.\n",
"Let's check the available computes first."
]
},
{
@@ -144,7 +194,27 @@
"metadata": {},
"outputs": [],
"source": [
"compute_target = ws.get_default_compute_target(\"GPU\")"
"cts = ws.compute_targets\n",
"for name, ct in cts.items():\n",
" print(name, ct.type, ct.provisioning_state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's get the default Azure Machine Learning Compute in the current workspace. We will then run the training script on this compute target."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compute_target = ws.get_default_compute_target(\"GPU\")\n",
"\n",
"print(compute_target.get_status().serialize())"
]
},
{
@@ -173,8 +243,12 @@
"metadata": {},
"source": [
"## Create TensorFlow estimator\n",
"Next, we construct an `azureml.train.dnn.TensorFlow` estimator object, use the Batch AI cluster as compute target, and pass the mount-point of the datastore to the training code as a parameter.\n",
"The TensorFlow estimator is providing a simple way of launching a TensorFlow training job on a compute target. It will automatically provide a docker image that has TensorFlow installed -- if additional pip or conda packages are required, their names can be passed in via the `pip_packages` and `conda_packages` arguments and they will be included in the resulting docker."
"Next, we construct an [TensorFlow](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.dnn.tensorflow?view=azure-ml-py) estimator object.\n",
"The TensorFlow estimator is providing a simple way of launching a TensorFlow training job on a compute target. It will automatically provide a docker image that has TensorFlow installed -- if additional pip or conda packages are required, their names can be passed in via the `pip_packages` and `conda_packages` arguments and they will be included in the resulting docker.\n",
"\n",
"The TensorFlow estimator also takes a `framework_version` parameter -- if no version is provided, the estimator will default to the latest version supported by AzureML. Use `TensorFlow.get_supported_versions()` to get a list of all versions supported by your current SDK version or see the [SDK documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.dnn?view=azure-ml-py) for the versions supported in the most current release.\n",
"\n",
"The TensorFlow estimator also takes a `framework_version` parameter -- if no version is provided, the estimator will default to the latest version supported by AzureML. Use `TensorFlow.get_supported_versions()` to get a list of all versions supported by your current SDK version or see the [SDK documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.dnn?view=azure-ml-py) for the versions supported in the most current release."
]
},
{
@@ -186,7 +260,8 @@
"est = TensorFlow(source_directory=script_folder, \n",
" compute_target=compute_target,\n",
" entry_script='tf_mnist.py', \n",
" use_gpu=True)"
" use_gpu=True,\n",
" framework_version='1.13')"
]
},
{
@@ -194,7 +269,7 @@
"metadata": {},
"source": [
"## Intelligent hyperparameter tuning\n",
"We have trained the model with one set of hyperparameters, now let's how we can do hyperparameter tuning by launching multiple runs on the cluster. First let's define the parameter space using random sampling.\n",
"Now let's try hyperparameter tuning by launching multiple runs on the cluster. First let's define the parameter space using random sampling.\n",
"\n",
"In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, the best validation accuracy (`validation_acc`)."
]
@@ -251,8 +326,8 @@
" policy=early_termination_policy,\n",
" primary_metric_name='validation_acc', \n",
" primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, \n",
" max_total_runs=1,\n",
" max_concurrent_runs=1)"
" max_total_runs=10,\n",
" max_concurrent_runs=4)"
]
},
{
@@ -261,6 +336,7 @@
"source": [
"## Add HyperDrive as a step of pipeline\n",
"\n",
"### Setup an input for the hypderdrive step\n",
"Let's setup a data reference for inputs of hyperdrive step."
]
},
@@ -302,8 +378,9 @@
" datastore=ds,\n",
" pipeline_output_name=metrics_output_name)\n",
"\n",
"hd_step_name='hd_step01'\n",
"hd_step = HyperDriveStep(\n",
" name=\"hyperdrive_module\",\n",
" name=hd_step_name,\n",
" hyperdrive_config=hd_config,\n",
" estimator_entry_script_arguments=['--data-folder', data_folder],\n",
" inputs=[data_folder],\n",
@@ -324,7 +401,7 @@
"outputs": [],
"source": [
"pipeline = Pipeline(workspace=ws, steps=[hd_step])\n",
"pipeline_run = Experiment(ws, 'Hyperdrive_Test').submit(pipeline)"
"pipeline_run = exp.submit(pipeline)"
]
},
{
@@ -357,7 +434,8 @@
"metadata": {},
"outputs": [],
"source": [
"pipeline_run.wait_for_completion()"
"# PUBLISHONLY\n",
"# pipeline_run.wait_for_completion()"
]
},
{
@@ -374,8 +452,9 @@
"metadata": {},
"outputs": [],
"source": [
"metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)\n",
"num_file_downloaded = metrics_output.download('.', show_progress=True)"
"# PUBLISHONLY\n",
"# metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)\n",
"# num_file_downloaded = metrics_output.download('.', show_progress=True)"
]
},
{
@@ -384,14 +463,374 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"with open(metrics_output._path_on_datastore) as f: \n",
" metrics_output_result = f.read()\n",
"# PUBLISHONLY\n",
"# import pandas as pd\n",
"# import json\n",
"# with open(metrics_output._path_on_datastore) as f: \n",
"# metrics_output_result = f.read()\n",
" \n",
"deserialized_metrics_output = json.loads(metrics_output_result)\n",
"df = pd.DataFrame(deserialized_metrics_output)\n",
"df"
"# deserialized_metrics_output = json.loads(metrics_output_result)\n",
"# df = pd.DataFrame(deserialized_metrics_output)\n",
"# df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find and register best model\n",
"When all the jobs finish, we can find out the one that has the highest accuracy."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# hd_step_run = HyperDriveStepRun(step_run=pipeline_run.find_step_run(hd_step_name)[0])\n",
"# best_run = hd_step_run.get_best_run_by_primary_metric()\n",
"# best_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's list the model files uploaded during the run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# print(best_run.get_file_names())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can then register the folder (and all files in it) as a model named `tf-dnn-mnist` under the workspace for deployment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# model = best_run.register_model(model_name='tf-dnn-mnist', model_path='outputs/model')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy the model in ACI\n",
"Now we are ready to deploy the model as a web service running in Azure Container Instance [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.\n",
"### Create score.py\n",
"First, we will create a scoring script that will be invoked by the web service call. \n",
"\n",
"* Note that the scoring script must have two required functions, `init()` and `run(input_data)`. \n",
" * In `init()` function, you typically load the model into a global object. This function is executed only once when the Docker container is started. \n",
" * In `run(input_data)` function, the model is used to predict a value based on the input data. The input and output to `run` typically use JSON as serialization and de-serialization format but you are not limited to that."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import json\n",
"import numpy as np\n",
"import os\n",
"import tensorflow as tf\n",
"\n",
"from azureml.core.model import Model\n",
"\n",
"def init():\n",
" global X, output, sess\n",
" tf.reset_default_graph()\n",
" model_root = Model.get_model_path('tf-dnn-mnist')\n",
" saver = tf.train.import_meta_graph(os.path.join(model_root, 'mnist-tf.model.meta'))\n",
" X = tf.get_default_graph().get_tensor_by_name(\"network/X:0\")\n",
" output = tf.get_default_graph().get_tensor_by_name(\"network/output/MatMul:0\")\n",
" \n",
" sess = tf.Session()\n",
" saver.restore(sess, os.path.join(model_root, 'mnist-tf.model'))\n",
"\n",
"def run(raw_data):\n",
" data = np.array(json.loads(raw_data)['data'])\n",
" # make prediction\n",
" out = output.eval(session=sess, feed_dict={X: data})\n",
" y_hat = np.argmax(out, axis=1)\n",
" return y_hat.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create myenv.yml\n",
"We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify packages `numpy`, `tensorflow`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# from azureml.core.runconfig import CondaDependencies\n",
"\n",
"# cd = CondaDependencies.create()\n",
"# cd.add_conda_package('numpy')\n",
"# cd.add_tensorflow_conda_package()\n",
"# cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n",
"\n",
"# print(cd.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy to ACI\n",
"We are almost ready to deploy. Create a deployment configuration and specify the number of CPUs and gigbyte of RAM needed for your ACI container. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# from azureml.core.webservice import AciWebservice\n",
"\n",
"# aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n",
"# memory_gb=1, \n",
"# tags={'name':'mnist', 'framework': 'TensorFlow DNN'},\n",
"# description='Tensorflow DNN on MNIST')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Deployment Process\n",
"Now we can deploy. **This cell will run for about 7-8 minutes**. Behind the scene, it will do the following:\n",
"1. **Register model** \n",
"Take the local `model` folder (which contains our previously downloaded trained model files) and register it (and the files inside that folder) as a model named `model` under the workspace. Azure ML will register the model directory or model file(s) we specify to the `model_paths` parameter of the `Webservice.deploy` call.\n",
"2. **Build Docker image** \n",
"Build a Docker image using the scoring file (`score.py`), the environment file (`myenv.yml`), and the `model` folder containing the TensorFlow model files. \n",
"3. **Register image** \n",
"Register that image under the workspace. \n",
"4. **Ship to ACI** \n",
"And finally ship the image to the ACI infrastructure, start up a container in ACI using that image, and expose an HTTP endpoint to accept REST client calls."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# from azureml.core.image import ContainerImage\n",
"\n",
"# imgconfig = ContainerImage.image_configuration(execution_script=\"score.py\", \n",
"# runtime=\"python\", \n",
"# conda_file=\"myenv.yml\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# %%time\n",
"# from azureml.core.webservice import Webservice\n",
"\n",
"# service = Webservice.deploy_from_model(workspace=ws,\n",
"# name='tf-mnist-svc',\n",
"# deployment_config=aciconfig,\n",
"# models=[model],\n",
"# image_config=imgconfig)\n",
"\n",
"# service.wait_for_deployment(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Tip: If something goes wrong with the deployment, the first thing to look at is the logs from the service by running the following command:**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# print(service.get_logs())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is the scoring web service endpoint:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# print(service.scoring_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the deployed model\n",
"Let's test the deployed model. Pick 30 random samples from the test set, and send it to the web service hosted in ACI. Note here we are using the `run` API in the SDK to invoke the service. You can also make raw HTTP calls using any HTTP tool such as curl.\n",
"\n",
"After the invocation, we print the returned predictions and plot them along with the input images. Use red font color and inversed image (white on black) to highlight the misclassified samples. Note since the model accuracy is pretty high, you might have to run the below cell a few times before you can see a misclassified sample."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# import json\n",
"\n",
"# # find 30 random samples from test set\n",
"# n = 30\n",
"# sample_indices = np.random.permutation(X_test.shape[0])[0:n]\n",
"\n",
"# test_samples = json.dumps({\"data\": X_test[sample_indices].tolist()})\n",
"# test_samples = bytes(test_samples, encoding='utf8')\n",
"\n",
"# # predict using the deployed model\n",
"# result = service.run(input_data=test_samples)\n",
"\n",
"# # compare actual value vs. the predicted values:\n",
"# i = 0\n",
"# plt.figure(figsize = (20, 1))\n",
"\n",
"# for s in sample_indices:\n",
"# plt.subplot(1, n, i + 1)\n",
"# plt.axhline('')\n",
"# plt.axvline('')\n",
" \n",
"# # use different color for misclassified sample\n",
"# font_color = 'red' if y_test[s] != result[i] else 'black'\n",
"# clr_map = plt.cm.gray if y_test[s] != result[i] else plt.cm.Greys\n",
" \n",
"# plt.text(x=10, y=-10, s=y_hat[s], fontsize=18, color=font_color)\n",
"# plt.imshow(X_test[s].reshape(28, 28), cmap=clr_map)\n",
" \n",
"# i = i + 1\n",
"# plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can also send raw HTTP request to the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# import requests\n",
"\n",
"# # send a random row from the test set to score\n",
"# random_index = np.random.randint(0, len(X_test)-1)\n",
"# input_data = \"{\\\"data\\\": [\" + str(list(X_test[random_index])) + \"]}\"\n",
"\n",
"# headers = {'Content-Type':'application/json'}\n",
"\n",
"# resp = requests.post(service.scoring_uri, input_data, headers=headers)\n",
"\n",
"# print(\"POST to url\", service.scoring_uri)\n",
"# print(\"input data:\", input_data)\n",
"# print(\"label:\", y_test[random_index])\n",
"# print(\"prediction:\", resp.text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's look at the workspace after the web service was deployed. You should see \n",
"* a registered model named 'model' and with the id 'model:1'\n",
"* an image called 'tf-mnist' and with a docker image location pointing to your workspace's Azure Container Registry (ACR) \n",
"* a webservice called 'tf-mnist' with some scoring URL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# models = ws.models\n",
"# for name, model in models.items():\n",
"# print(\"Model: {}, ID: {}\".format(name, model.id))\n",
" \n",
"# images = ws.images\n",
"# for name, image in images.items():\n",
"# print(\"Image: {}, location: {}\".format(name, image.image_location))\n",
" \n",
"# webservices = ws.webservices\n",
"# for name, webservice in webservices.items():\n",
"# print(\"Webservice: {}, scoring URI: {}\".format(name, webservice.scoring_uri))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean up\n",
"You can delete the ACI deployment with a simple delete API call."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PUBLISHONLY\n",
"# service.delete()"
]
}
],

View File

@@ -127,7 +127,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Retrieve or create a Aml compute\n",
"#### Retrieve or create an Aml compute\n",
"Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's get the default Aml Compute in the current workspace. We will then run the training script on this compute target."
]
},
@@ -418,6 +418,59 @@
"RunDetails(pipeline_run1).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### See Outputs\n",
"\n",
"See where outputs of each pipeline step are located on your datastore."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get Steps\n",
"for step in pipeline_run1.get_steps():\n",
" print(\"Outputs of step \" + step.name)\n",
" \n",
" # Get a dictionary of StepRunOutputs with the output name as the key \n",
" output_dict = step.get_outputs()\n",
" \n",
" for name, output in output_dict.items():\n",
" \n",
" output_reference = output.get_port_data_reference() # Get output port data reference\n",
" print(\"\\tname: \" + name)\n",
" print(\"\\tdatastore: \" + output_reference.datastore_name)\n",
" print(\"\\tpath on datastore: \" + output_reference.path_on_datastore)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download Outputs\n",
"\n",
"We can download the output of any step to our local machine using the SDK."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Retrieve the step runs by name 'train.py'\n",
"train_step = pipeline_run1.find_step_run('train.py')\n",
"\n",
"if train_step:\n",
" train_step_obj = train_step[0] # since we have only one step by name 'train.py'\n",
" train_step_obj.get_output_data('processed_data1').download(\"./outputs\") # download the output to current directory"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -293,7 +293,7 @@
"source": [
"from azureml.core.runconfig import DEFAULT_GPU_IMAGE\n",
"\n",
"cd = CondaDependencies.create(pip_packages=[\"tensorflow-gpu==1.10.0\", \"azureml-defaults\"])\n",
"cd = CondaDependencies.create(pip_packages=[\"tensorflow-gpu==1.13.1\", \"azureml-defaults\"])\n",
"\n",
"# Runconfig\n",
"amlcompute_run_config = RunConfiguration(conda_dependencies=cd)\n",

View File

@@ -223,7 +223,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI, you must provide the argument `distributed_backend='mpi'`. Using this estimator with these settings, Chainer and its dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `Chainer` constructor's `pip_packages` or `conda_packages` parameters."
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI, you must provide the argument `distributed_training=MpiConfiguration()`. Using this estimator with these settings, Chainer and its dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `Chainer` constructor's `pip_packages` or `conda_packages` parameters."
]
},
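{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, here is a minimal sketch of a Chainer estimator using this argument. The names `script_folder`, `compute_target`, and `chainer_mnist.py` are assumptions, not taken from the tutorial's actual code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import MpiConfiguration\n",
"from azureml.train.dnn import Chainer\n",
"\n",
"# 2 nodes, one worker per node, MPI-based distributed training (hypothetical names).\n",
"estimator = Chainer(source_directory=script_folder,\n",
"                    compute_target=compute_target,\n",
"                    entry_script='chainer_mnist.py',\n",
"                    node_count=2,\n",
"                    distributed_training=MpiConfiguration(),\n",
"                    use_gpu=True)"
]
},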
{

View File

@@ -270,6 +270,7 @@
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import MpiConfiguration\n",
"from azureml.train.estimator import Estimator\n",
"\n",
"script_params = {\n",
@@ -283,8 +284,7 @@
" entry_script='cntk_distr_mnist.py',\n",
" script_params=script_params,\n",
" node_count=2,\n",
" process_count_per_node=1,\n",
" distributed_backend='mpi',\n",
" distributed_training=MpiConfiguration(),\n",
" pip_packages=['cntk-gpu==2.6'],\n",
" custom_docker_image='microsoft/mmlspark:gpu-0.12',\n",
" use_gpu=True)"
@@ -296,7 +296,7 @@
"source": [
"We would like to train our model using a [pre-built Docker container](https://hub.docker.com/r/microsoft/mmlspark/). To do so, specify the name of the docker image to the argument `custom_docker_image`. Finally, we provide the `cntk` package to `pip_packages` to install CNTK 2.6 on our custom image.\n",
"\n",
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to run distributed CNTK, which uses MPI, you must provide the argument `distributed_backend='mpi'`."
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to run distributed CNTK, which uses MPI, you must provide the argument `distributed_training=MpiConfiguration()`."
]
},
{

View File

@@ -236,7 +236,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI/Horovod, you must provide the argument `distributed_backend='mpi'`. Using this estimator with these settings, PyTorch, Horovod and their dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `PyTorch` constructor's `pip_packages` or `conda_packages` parameters."
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI/Horovod, you must provide the argument `distributed_training=MpiConfiguration()`. Using this estimator with these settings, PyTorch, Horovod and their dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `PyTorch` constructor's `pip_packages` or `conda_packages` parameters."
]
},
{

View File

@@ -304,7 +304,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI/Horovod, you must provide the argument `distributed_backend='mpi'`. Using this estimator with these settings, TensorFlow, Horovod and their dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `TensorFlow` constructor's `pip_packages` or `conda_packages` parameters.\n",
"The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI/Horovod, you must provide the argument `distributed_training=MpiConfiguration()`. Using this estimator with these settings, TensorFlow, Horovod and their dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `TensorFlow` constructor's `pip_packages` or `conda_packages` parameters.\n",
"\n",
"Note that we passed our training data reference `ds_data` to our script's `--input_data` argument. This will 1) mount our datastore on the remote compute and 2) provide the path to the data zip file on our datastore."
]

View File

@@ -220,7 +220,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code specifies that we will run our training script on `2` nodes, with two workers and one parameter server. In order to execute a native distributed TensorFlow run, you must provide the argument `distributed_backend='ps'`. Using this estimator with these settings, TensorFlow and its dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `TensorFlow` constructor's `pip_packages` or `conda_packages` parameters."
"The above code specifies that we will run our training script on `2` nodes, with two workers and one parameter server. In order to execute a native distributed TensorFlow run, you must provide the argument `distributed_training=TensorflowConfiguration()`. Using this estimator with these settings, TensorFlow and its dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `TensorFlow` constructor's `pip_packages` or `conda_packages` parameters."
]
},
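{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, here is a minimal sketch of such a configuration. The names `script_folder`, `compute_target`, and `tf_mnist.py` are assumptions, not taken from the tutorial's actual code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import TensorflowConfiguration\n",
"from azureml.train.dnn import TensorFlow\n",
"\n",
"# Two workers and one parameter server, as described above (hypothetical names).\n",
"distributed = TensorflowConfiguration()\n",
"distributed.worker_count = 2\n",
"distributed.parameter_server_count = 1\n",
"\n",
"estimator = TensorFlow(source_directory=script_folder,\n",
"                       compute_target=compute_target,\n",
"                       entry_script='tf_mnist.py',\n",
"                       node_count=2,\n",
"                       distributed_training=distributed,\n",
"                       use_gpu=True)"
]
},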
{

View File

@@ -312,6 +312,35 @@
"RunDetails(run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In addition to passing in a python file, you can also pass in a Jupyter notebook as the `entry_script`. [notebook_example.ipynb](notebook_example.ipynb) uses pm.record() to log key-value pairs which will appear in Azure Portal and shown in below widget.\n",
"\n",
"In order to run below, make sure `azureml-contrib-notebook` package is installed in current environment with `pip intall azureml-contrib-notebook`.\n",
"\n",
"This code snippet specifies the following parameters to the `Estimator` constructor. For more information on `Estimator`, please see [tutorial](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-ml-models) or [API doc](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.estimator.estimator?view=azure-ml-py).\n",
"\n",
"| Parameter | Description |\n",
"| ------------- | ------------- |\n",
"| source_directory | (str) Local directory that contains all of your code needed for the training job. This folder gets copied from your local machine to the remote compute |\n",
"| compute_target | ([AbstractComputeTarget](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute_target.abstractcomputetarget?view=azure-ml-py) or str) Remote compute target that your training script will run on, in this case a previously created persistent compute cluster (cpu_cluster) |\n",
"| entry_script | (str) Filepath (relative to the source_directory) of the training script/notebook to be run on the remote compute. This file, and any additional files it depends on, should be located in this folder |\n",
"| script_params | (dict) A dictionary containing parameters to the `entry_script`. This is useful for passing datastore reference, for example, see [train-hyperparameter-tune-deploy-with-tensorflow.ipynb](../train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"est = Estimator(source_directory='.', compute_target=cpu_cluster, entry_script='notebook_example.ipynb', pip_packages=['nteract-scrapbook', 'azureml-contrib-notebook'])\n",
"run = exp.submit(est)\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -424,7 +453,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"When all the runs complete, we can find the run with the best performance"
"When all the runs complete, we can find the run with the best performance."
]
},
{
@@ -475,7 +504,7 @@
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"msauthor": "minxia"
"msauthor": "jingywa"
},
"nbformat": 4,
"nbformat_minor": 2

View File

@@ -0,0 +1,57 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import scrapbook as sb\n",
"sb.glue('Fibonacci numbers', [0, 1, 1, 2, 3, 5, 8, 13, 21, 34])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"authors": [
{
"name": "jingywa"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"msauthor": "jingywa"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -425,7 +425,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor the Run\n",
"### Monitor the Run <a class=\"anchor\" id=\"monitor-run\"></a>\n",
"As the Run is executed, it will go through the following stages:\n",
"1. Preparing: A docker image is created matching the Python environment specified by the TensorFlow estimator and it will be uploaded to the workspace's Azure Container Registry. This step will only happen once for each Python environment -- the container will then be cached for subsequent runs. Creating and uploading the image takes about **5 minutes**. While the job is preparing, logs are streamed to the run history and can be viewed to monitor the progress of the image creation.\n",
"\n",
@@ -480,7 +480,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### The Run object\n",
"### The Run object <a class=\"anchor\" id=\"run-object\"></a>\n",
"The Run object provides the interface to the run history -- both to the job and to the control plane (this notebook), and both while the job is running and after it has completed. It provides a number of interesting features for instance:\n",
"* `run.get_details()`: Provides a rich set of properties of the run\n",
"* `run.get_metrics()`: Provides a dictionary with all the metrics that were reported for the Run\n",
@@ -778,7 +778,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find and register best model\n",
"## Find and register best model <a class=\"anchor\" id=\"register-model\"></a>\n",
"When all the jobs finish, we can find out the one that has the highest accuracy."
]
},
@@ -1140,7 +1140,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.6.8"
},
"msauthor": "minxia"
},

View File

@@ -100,7 +100,7 @@
"\n",
"# Check core SDK version number\n",
"\n",
"print(\"This notebook was created using SDK version 1.0.41, you are currently running version\", azureml.core.VERSION)"
"print(\"This notebook was created using SDK version 1.0.43, you are currently running version\", azureml.core.VERSION)"
]
},
{

View File

@@ -0,0 +1,8 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License
# A very simple script that demonstrates running in an environment:
# it prints a message passed in as an environment variable.
import os
print(os.environ.get("MESSAGE"))
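A hedged companion sketch (the experiment name, message value, and the assumption that this script is saved as example.py are all hypothetical): one way to supply MESSAGE is through the run configuration's environment variables before submitting the script.

from azureml.core import Experiment, ScriptRunConfig, Workspace

ws = Workspace.from_config()
src = ScriptRunConfig(source_directory='.', script='example.py')
# environment_variables is a plain dict on the run's environment definition
src.run_config.environment.environment_variables['MESSAGE'] = 'Hello, world!'   # hypothetical value
run = Experiment(ws, 'env-demo').submit(src)                                    # hypothetical experiment name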

View File

@@ -60,12 +60,12 @@
]
},
{
"cell_type": "raw",
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"from azureml.core import Workspace\n",
"\n",
"print(azureml.core.VERSION)"
]
},
@@ -75,6 +75,7 @@
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"ws = Workspace.from_config()\n",
"ws.get_details()"
]
@@ -176,7 +177,7 @@
"metadata": {},
"outputs": [],
"source": [
"runconfig = ScriptRunConfig(source_directory=\"example\", script=\"example.py\")\n",
"runconfig = ScriptRunConfig(source_directory=\".\", script=\"example.py\")\n",
"runconfig.run_config.target = \"local\"\n",
"runconfig.run_config.environment = myenv\n",
"run = myexp.submit(config=runconfig)\n",

View File

@@ -31,6 +31,35 @@ If you have any questions or feedback, send us an email at: [askamldataprep@micr
## Release Notes
### 2019-05-28 (version 1.1.4)
New features
- You can now use the following expression language functions to extract and parse datetime values into new columns.
- `RegEx.extract_record()` extracts datetime elements into a new record column.
- `create_datetime()` creates datetime objects from separate datetime elements.
- When calling `get_profile()`, you can now see that quantile columns are labeled as (est.) to clearly indicate that the values are approximations.
- You can now use ** globbing when reading from Azure Blob Storage.
- e.g. `dprep.read_csv(path='https://yourblob.blob.core.windows.net/yourcontainer/**/data/*.csv')`
Bug fixes
- Fixed a bug related to reading a Parquet file from a remote source (Azure Blob).
### 2019-05-08 (version 1.1.3)
New features
- Added support to read from a PostgreSQL database, either by calling `read_postgresql` or using a Datastore.
- See examples in how-to guides:
- [Data Ingestion notebook](https://aka.ms/aml-data-prep-ingestion-nb)
- [Datastore notebook](https://aka.ms/aml-data-prep-datastore-nb)
Bug fixes and improvements
- Fixed issues with column type conversion:
- Now correctly converts a boolean or numeric column to a boolean column.
- Now does not fail when attempting to set a date column to be date type.
- Improved JoinType types and accompanying reference documentation. When joining two dataflows, you can now specify one of these types of join:
- NONE, MATCH, INNER, UNMATCHLEFT, LEFTANTI, LEFTOUTER, UNMATCHRIGHT, RIGHTANTI, RIGHTOUTER, FULLANTI, FULL.
- Improved data type inferencing to recognize more date formats.
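The JoinType note above can be illustrated with a hedged sketch (the file paths, key column name, and the exact `Dataflow.join` signature here are assumptions): joining two dataflows on a shared key and keeping only matching rows.

import azureml.dataprep as dprep

left = dprep.auto_read_file('../data/left.csv')       # hypothetical inputs
right = dprep.auto_read_file('../data/right.csv')
joined = dprep.Dataflow.join(left_dataflow=left,
                             right_dataflow=right,
                             join_key_pairs=[('id', 'id')],   # hypothetical key column
                             join_type=dprep.JoinType.INNER)
joined.head(5)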
### 2019-04-17 (version 1.1.2)
Note: the Data Prep Python SDK no longer installs the `numpy` and `pandas` packages. See [updated installation instructions](https://aka.ms/aml-data-prep-installation).

View File

@@ -94,7 +94,7 @@
"metadata": {},
"outputs": [],
"source": [
"spark_df = df.take(5).to_pandas_dataframe()\n",
"spark_df = df.to_spark_dataframe()\n",
"spark_df.head(5)"
]
},

File diff suppressed because it is too large

View File

@@ -0,0 +1,11 @@
Stream Path
https://dataset.blob.core.windows.net/blobstore/container/2019/01/01/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/02/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/03/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/04/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/05/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/06/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/07/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/08/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/09/train.csv
https://dataset.blob.core.windows.net/blobstore/container/2019/01/10/train.csv

View File

@@ -139,6 +139,51 @@
"dflow_to_lower.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### `RegEx.extract_record()`\n",
"Using the `RegEx.extract_record()` expression, add a new record column \"Stream Date Record\", which contains the name capturing groups in the regex with value."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dflow_regex_extract_record = dprep.auto_read_file('../data/stream-path.csv')\n",
"regex = dprep.RegEx('\\/(?<year>\\d{4})\\/(?<month>\\d{2})\\/(?<day>\\d{2})\\/')\n",
"dflow_regex_extract_record = dflow_regex_extract_record.add_column(new_column_name='Stream Date Record',\n",
" prior_column='Stream Path',\n",
" expression=regex.extract_record(dflow_regex_extract_record['Stream Path']))\n",
"dflow_regex_extract_record.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### `create_datetime()`\n",
"Using the `create_datetime()` expression, add a new column \"Stream Date\", which contains datetime values constructed from year, month, day values extracted from a record column \"Stream Date Record\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"year = dprep.col('year', dflow_regex_extract_record['Stream Date Record'])\n",
"month = dprep.col('month', dflow_regex_extract_record['Stream Date Record'])\n",
"day = dprep.col('day', dflow_regex_extract_record['Stream Date Record'])\n",
"dflow_create_datetime = dflow_regex_extract_record.add_column(new_column_name='Stream Date',\n",
" prior_column='Stream Date Record',\n",
" expression=dprep.create_datetime(year, month, day))\n",
"dflow_create_datetime.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -39,6 +39,7 @@
"[Read Part Files Using Globbing](#globbing)<br>\n",
"[Read JSON](#json)<br>\n",
"[Read SQL](#sql)<br>\n",
"[Read PostgreSQL](#postgresql)<br>\n",
"[Read From Azure Blob](#azure-blob)<br>\n",
"[Read From ADLS](#adls)<br>\n",
"[Read Pandas DataFrame](#pandas-df)<br>"
@@ -110,7 +111,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"When reading delimited files, the only required parameter is `path`. Other parameters (e.g. separator, encoding, whether to use headers, etc.) are available to modify default behavior.In this case, you can read a file by specifying only its location, then retrieve the first 5 rows to evaluate the result."
"When reading delimited files, the only required parameter is `path`. Other parameters (e.g. separator, encoding, whether to use headers, etc.) are available to modify default behavior.\n",
"In this case, you can read a file by specifying only its location, then retrieve the first 5 rows to evaluate the result."
]
},
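A hedged sketch of the default-plus-override pattern just described (the file paths are assumptions): read with only `path` first, then pass `separator` explicitly when the default does not fit.

import azureml.dataprep as dprep

dflow = dprep.read_csv(path='../data/crime-spring.csv')                     # defaults only (hypothetical path)
dflow_semicolon = dprep.read_csv(path='../data/other.csv', separator=';')   # override the separator
dflow.head(5)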
{
@@ -632,6 +634,69 @@
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"postgresql\"></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read PostgreSQL"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data Prep can also fetch data from Azure PostgreSQL servers.\n",
"\n",
"To read data from a PostgreSQL server, first create a data source object that contains the connection information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"secret = dprep.register_secret(value=\"dpr3pTestU$er\", id=\"dprepPostgresqlUser\")\n",
"ds = dprep.PostgreSQLDataSource(server_name=\"dprep-postgresql-test.postgres.database.azure.com\",\n",
" database_name=\"dprep-postgresql-testdb\",\n",
" user_name=\"dprepPostgresqlReadOnlyUser@dprep-postgresql-test\",\n",
" password=secret)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As you can see, the password parameter of `PostgreSQLDataSource` accepts a Secret object as well.\n",
"Now that you have created a PostgreSQL data source object, you can proceed to read data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dflow = dprep.read_postgresql(ds, \"SELECT * FROM public.people\")\n",
"dflow.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dflow.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -899,7 +964,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.4"
}
},
"nbformat": 4,

View File

@@ -142,33 +142,6 @@
"source": [
"profile.columns['X Coordinate'].type_counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#TEST CELL: Profile.Compare\n",
"import azureml.dataprep as dprep\n",
"import math\n",
"\n",
"lhs_dflow = dprep.auto_read_file('../data/crime-spring.csv')\n",
"lhs_profile = lhs_dflow.get_profile(number_of_histogram_bins=100)\n",
"rhs_dflow = dprep.auto_read_file('../data/crime-winter.csv')\n",
"rhs_profile = rhs_dflow.get_profile(number_of_histogram_bins=100)\n",
"\n",
"diff = lhs_profile.compare(rhs_profile)\n",
"\n",
"expected_col1 = dprep.ColumnProfileDifference()\n",
"expected_col1.difference_in_count_in_percent = 0\n",
"expected_col1.difference_in_histograms = 135349.66146244822\n",
"\n",
"for actual, expected in zip(diff.column_profile_difference, [expected_col1]) :\n",
" assert math.isclose(actual.difference_in_count_in_percent, expected.difference_in_count_in_percent)\n",
" assert math.isclose(actual.difference_in_histograms, expected.difference_in_histograms)\n",
" break\n"
]
}
],
"metadata": {

View File

@@ -117,6 +117,24 @@
"dflow_sql.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also read from a PostgreSQL database. To do that, you will first get a PostgreSQL database datastore instance and pass it to Data Prep for reading."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"datastore = Datastore(workspace=workspace, name='postgre_test')\n",
"dflow_sql = dprep.read_postgresql(data_source=datastore, query='SELECT * FROM public.people')\n",
"dflow_sql.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -536,7 +536,7 @@
"outputs": [],
"source": [
"# specify show_output to True for a verbose log\n",
"run.wait_for_completion(show_output=False) "
"run.wait_for_completion(show_output=True) "
]
},
{