mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-23 20:00:06 -05:00
Compare commits
8 Commits
azureml-sd
...
jeffshep/w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e8a240a71 | ||
|
|
5b019e28de | ||
|
|
bf4cb1e86c | ||
|
|
eaa7c56590 | ||
|
|
8fc0fa040d | ||
|
|
56e13b0b9a | ||
|
|
785fe3c962 | ||
|
|
3c341f6e9a |
@@ -103,7 +103,7 @@
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.59.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -14,14 +14,13 @@ dependencies:
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.57.0
|
||||
- azureml-defaults~=1.57.0
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.57.0/validated_win32_requirements.txt [--no-deps]
|
||||
- azureml-widgets~=1.59.0
|
||||
- azureml-defaults~=1.59.0
|
||||
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.59.0/validated_win32_requirements.txt [--no-deps]
|
||||
- matplotlib==3.7.1
|
||||
- xgboost==1.5.2
|
||||
- prophet==1.1.4
|
||||
- pandas==1.3.5
|
||||
- cmdstanpy==1.1.0
|
||||
- onnx==1.16.1
|
||||
- setuptools-git==1.2
|
||||
- spacy==3.7.4
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-3.7.1.tar.gz
|
||||
|
||||
@@ -20,11 +20,11 @@ dependencies:
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.57.0
|
||||
- azureml-defaults~=1.57.0
|
||||
- azureml-widgets~=1.59.0
|
||||
- azureml-defaults~=1.59.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==3.7.4
|
||||
- xgboost==1.5.2
|
||||
- prophet==1.1.4
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-3.7.1.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.57.0/validated_linux_requirements.txt [--no-deps]
|
||||
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.59.0/validated_linux_requirements.txt [--no-deps]
|
||||
|
||||
@@ -15,12 +15,12 @@ dependencies:
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.57.0
|
||||
- azureml-defaults~=1.57.0
|
||||
- azureml-widgets~=1.59.0
|
||||
- azureml-defaults~=1.59.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- prophet==1.1.4
|
||||
- xgboost==1.5.2
|
||||
- spacy==3.7.4
|
||||
- matplotlib==3.7.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-3.7.1.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.57.0/validated_darwin_requirements.txt [--no-deps]
|
||||
- -r https://automlcesdkdataresources.blob.core.windows.net/validated-requirements/1.59.0/validated_darwin_requirements.txt [--no-deps]
|
||||
|
||||
@@ -93,7 +93,8 @@
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"from azureml.core.dataset import Dataset\n",
|
||||
"from azureml.train.automl import AutoMLConfig\n",
|
||||
"from azureml.interpret import ExplanationClient"
|
||||
"from azureml.interpret import ExplanationClient\n",
|
||||
"from azureml.data.datapath import DataPath"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -266,10 +267,12 @@
|
||||
"pd.DataFrame(data).to_csv(\"data/train_data.csv\", index=False)\n",
|
||||
"\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(\n",
|
||||
" src_dir=\"./data\", target_path=\"bankmarketing\", overwrite=True, show_progress=True\n",
|
||||
"target = DataPath(\n",
|
||||
" datastore=ds, path_on_datastore=\"bankmarketing/train_data.csv\", name=\"bankmarketing\"\n",
|
||||
")\n",
|
||||
"Dataset.File.upload_directory(\n",
|
||||
" src_dir=\"./data\", target=target, overwrite=True, show_progress=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Upload the training data as a tabular dataset for access during training on remote compute\n",
|
||||
"train_data = Dataset.Tabular.from_delimited_files(\n",
|
||||
@@ -1090,7 +1093,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
@@ -1104,5 +1107,5 @@
|
||||
"task": "Classification"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.59.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.59.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -91,7 +91,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.57.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.59.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -292,7 +292,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.16-cuda11')"
|
||||
"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.16-cuda12')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -1,753 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Neural style transfer on video\n",
|
||||
"Using modified code from `pytorch`'s neural style [example](https://pytorch.org/tutorials/advanced/neural_style_tutorial.html), we show how to setup a pipeline for doing style transfer on video. The pipeline has following steps:\n",
|
||||
"1. Split a video into images\n",
|
||||
"2. Run neural style on each image using one of the provided models (from `pytorch` pretrained models for this example).\n",
|
||||
"3. Stitch the image back into a video.\n",
|
||||
"\n",
|
||||
"> **Tip**\n",
|
||||
"If your system requires low-latency processing (to process a single document or small set of documents quickly), use [real-time scoring](https://docs.microsoft.com/en-us/azure/machine-learning/v1/how-to-consume-web-service) instead of batch prediction."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the configuration Notebook located at https://github.com/Azure/MachineLearningNotebooks first if you haven't. This sets you up with a working config file that has information on your workspace, subscription id, etc. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize Workspace\n",
|
||||
"\n",
|
||||
"Initialize a workspace object from persisted configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace, Experiment\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"from azureml.core import Datastore, Dataset\n",
|
||||
"from azureml.pipeline.core import Pipeline\n",
|
||||
"from azureml.pipeline.steps import PythonScriptStep\n",
|
||||
"from azureml.core.runconfig import CondaDependencies, RunConfiguration\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"from azureml.data import OutputFileDatasetConfig"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Download models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# create directory for model\n",
|
||||
"model_dir = 'models'\n",
|
||||
"if not os.path.isdir(model_dir):\n",
|
||||
" os.mkdir(model_dir)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import urllib.request\n",
|
||||
"\n",
|
||||
"def download_model(model_name):\n",
|
||||
" # downloaded models from https://pytorch.org/tutorials/advanced/neural_style_tutorial.html are kept here\n",
|
||||
" url = \"https://pipelinedata.blob.core.windows.net/styletransfer/saved_models/\" + model_name\n",
|
||||
" local_path = os.path.join(model_dir, model_name)\n",
|
||||
" urllib.request.urlretrieve(url, local_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Register all Models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import Model\n",
|
||||
"mosaic_model = None\n",
|
||||
"candy_model = None\n",
|
||||
"\n",
|
||||
"models = Model.list(workspace=ws, tags=['scenario'])\n",
|
||||
"for m in models:\n",
|
||||
" print(\"Name:\", m.name,\"\\tVersion:\", m.version, \"\\tDescription:\", m.description, m.tags)\n",
|
||||
" if m.name == 'mosaic' and mosaic_model is None:\n",
|
||||
" mosaic_model = m\n",
|
||||
" elif m.name == 'candy' and candy_model is None:\n",
|
||||
" candy_model = m\n",
|
||||
"\n",
|
||||
"if mosaic_model is None:\n",
|
||||
" print('Mosaic model does not exist, registering it')\n",
|
||||
" download_model('mosaic.pth')\n",
|
||||
" mosaic_model = Model.register(model_path = os.path.join(model_dir, \"mosaic.pth\"),\n",
|
||||
" model_name = \"mosaic\",\n",
|
||||
" tags = {'type': \"mosaic\", 'scenario': \"Style transfer using batch inference\"},\n",
|
||||
" description = \"Style transfer - Mosaic\",\n",
|
||||
" workspace = ws)\n",
|
||||
"else:\n",
|
||||
" print('Reusing existing mosaic model')\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"if candy_model is None:\n",
|
||||
" print('Candy model does not exist, registering it')\n",
|
||||
" download_model('candy.pth')\n",
|
||||
" candy_model = Model.register(model_path = os.path.join(model_dir, \"candy.pth\"),\n",
|
||||
" model_name = \"candy\",\n",
|
||||
" tags = {'type': \"candy\", 'scenario': \"Style transfer using batch inference\"},\n",
|
||||
" description = \"Style transfer - Candy\",\n",
|
||||
" workspace = ws)\n",
|
||||
"else:\n",
|
||||
" print('Reusing existing candy model')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Create or use existing compute\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# AmlCompute\n",
|
||||
"cpu_cluster_name = \"cpu-cluster\"\n",
|
||||
"try:\n",
|
||||
" cpu_cluster = AmlCompute(ws, cpu_cluster_name)\n",
|
||||
" print(\"found existing cluster.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"creating new cluster\")\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_v2\",\n",
|
||||
" max_nodes = 1)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
|
||||
" cpu_cluster.wait_for_completion(show_output=True)\n",
|
||||
" \n",
|
||||
"# AmlCompute\n",
|
||||
"gpu_cluster_name = \"gpu-cluster\"\n",
|
||||
"try:\n",
|
||||
" gpu_cluster = AmlCompute(ws, gpu_cluster_name)\n",
|
||||
" print(\"found existing cluster.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"creating new cluster\")\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"Standard_NC6s_v3\",\n",
|
||||
" max_nodes = 3)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, provisioning_config)\n",
|
||||
" gpu_cluster.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Python Scripts\n",
|
||||
"We use an edited version of `neural_style_mpi.py` (original is [here](https://github.com/pytorch/examples/blob/master/fast_neural_style/neural_style/neural_style.py)). Scripts to split and stitch the video are thin wrappers to calls to `ffmpeg`. \n",
|
||||
"\n",
|
||||
"We install `ffmpeg` through conda dependencies."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scripts_folder = \"scripts\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"process_video_script_file = \"process_video.py\"\n",
|
||||
"\n",
|
||||
"# peek at contents\n",
|
||||
"with open(os.path.join(scripts_folder, process_video_script_file)) as process_video_file:\n",
|
||||
" print(process_video_file.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"stitch_video_script_file = \"stitch_video.py\"\n",
|
||||
"\n",
|
||||
"# peek at contents\n",
|
||||
"with open(os.path.join(scripts_folder, stitch_video_script_file)) as stitch_video_file:\n",
|
||||
" print(stitch_video_file.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The sample video **organutan.mp4** is stored at a publicly shared datastore. We are registering the datastore below. If you want to take a look at the original video, click here. (https://pipelinedata.blob.core.windows.net/sample-videos/orangutan.mp4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# datastore for input video\n",
|
||||
"account_name = \"pipelinedata\"\n",
|
||||
"video_ds = Datastore.register_azure_blob_container(ws, \"videos\", \"sample-videos\",\n",
|
||||
" account_name=account_name, overwrite=True)\n",
|
||||
"\n",
|
||||
"# the default blob store attached to a workspace\n",
|
||||
"default_datastore = ws.get_default_datastore()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sample video"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"video_name=os.getenv(\"STYLE_TRANSFER_VIDEO_NAME\", \"orangutan.mp4\") \n",
|
||||
"orangutan_video = Dataset.File.from_files((video_ds,video_name))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cd = CondaDependencies.create(python_version=\"3.8\", conda_packages=['pip==20.2.4'])\n",
|
||||
"\n",
|
||||
"cd.add_channel(\"conda-forge\")\n",
|
||||
"cd.add_conda_package(\"ffmpeg==4.0.2\")\n",
|
||||
"\n",
|
||||
"# Runconfig\n",
|
||||
"amlcompute_run_config = RunConfiguration(conda_dependencies=cd)\n",
|
||||
"amlcompute_run_config.environment.docker.base_image = \"pytorch/pytorch\"\n",
|
||||
"amlcompute_run_config.environment.spark.precache_packages = False"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ffmpeg_audio = OutputFileDatasetConfig(name=\"ffmpeg_audio\")\n",
|
||||
"processed_images = OutputFileDatasetConfig(name=\"processed_images\")\n",
|
||||
"output_video = OutputFileDatasetConfig(name=\"output_video\")\n",
|
||||
"\n",
|
||||
"ffmpeg_images = OutputFileDatasetConfig(name=\"ffmpeg_images\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Define tweakable parameters to pipeline\n",
|
||||
"These parameters can be changed when the pipeline is published and rerun from a REST call.\n",
|
||||
"As part of ParallelRunStep following 2 pipeline parameters will be created which can be used to override values.\n",
|
||||
" node_count\n",
|
||||
" process_count_per_node"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.core.graph import PipelineParameter\n",
|
||||
"# create a parameter for style (one of \"candy\", \"mosaic\") to transfer the images to\n",
|
||||
"style_param = PipelineParameter(name=\"style\", default_value=\"mosaic\")\n",
|
||||
"# create a parameter for the number of nodes to use in step no. 2 (style transfer)\n",
|
||||
"nodecount_param = PipelineParameter(name=\"nodecount\", default_value=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"split_video_step = PythonScriptStep(\n",
|
||||
" name=\"split video\",\n",
|
||||
" script_name=\"process_video.py\",\n",
|
||||
" arguments=[\"--input_video\", orangutan_video.as_mount(),\n",
|
||||
" \"--output_audio\", ffmpeg_audio,\n",
|
||||
" \"--output_images\", ffmpeg_images],\n",
|
||||
" compute_target=cpu_cluster,\n",
|
||||
" runconfig=amlcompute_run_config,\n",
|
||||
" source_directory=scripts_folder\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"stitch_video_step = PythonScriptStep(\n",
|
||||
" name=\"stitch\",\n",
|
||||
" script_name=\"stitch_video.py\",\n",
|
||||
" arguments=[\"--images_dir\", processed_images.as_input(), \n",
|
||||
" \"--input_audio\", ffmpeg_audio.as_input(), \n",
|
||||
" \"--output_dir\", output_video],\n",
|
||||
" compute_target=cpu_cluster,\n",
|
||||
" runconfig=amlcompute_run_config,\n",
|
||||
" source_directory=scripts_folder\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Create environment, parallel step run config and parallel run step"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core.runconfig import DEFAULT_GPU_IMAGE\n",
|
||||
"\n",
|
||||
"parallel_cd = CondaDependencies.create(python_version=\"3.8\", conda_packages=['pip==20.2.4', 'numpy==1.19'])\n",
|
||||
"\n",
|
||||
"parallel_cd.add_channel(\"pytorch\")\n",
|
||||
"parallel_cd.add_conda_package(\"pytorch\")\n",
|
||||
"parallel_cd.add_conda_package(\"torchvision\")\n",
|
||||
"parallel_cd.add_conda_package(\"pillow<7\") # needed for torchvision==0.4.0\n",
|
||||
"\n",
|
||||
"styleenvironment = Environment(name=\"styleenvironment\")\n",
|
||||
"styleenvironment.python.conda_dependencies=parallel_cd\n",
|
||||
"styleenvironment.docker.base_image = DEFAULT_GPU_IMAGE"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.core import PipelineParameter\n",
|
||||
"from azureml.pipeline.steps import ParallelRunConfig\n",
|
||||
"\n",
|
||||
"parallel_run_config = ParallelRunConfig(\n",
|
||||
" environment=styleenvironment,\n",
|
||||
" entry_script='transform.py',\n",
|
||||
" output_action='summary_only',\n",
|
||||
" mini_batch_size=\"1\",\n",
|
||||
" error_threshold=1,\n",
|
||||
" source_directory=scripts_folder,\n",
|
||||
" compute_target=gpu_cluster, \n",
|
||||
" node_count=nodecount_param,\n",
|
||||
" process_count_per_node=2\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.steps import ParallelRunStep\n",
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"parallel_step_name = 'styletransfer-' + datetime.now().strftime('%Y%m%d%H%M')\n",
|
||||
"\n",
|
||||
"distributed_style_transfer_step = ParallelRunStep(\n",
|
||||
" name=parallel_step_name,\n",
|
||||
" inputs=[ffmpeg_images], # Input file share/blob container/file dataset\n",
|
||||
" output=processed_images, # Output file share/blob container\n",
|
||||
" arguments=[\"--style\", style_param],\n",
|
||||
" parallel_run_config=parallel_run_config,\n",
|
||||
" allow_reuse=False #[optional - default value True]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Run the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline = Pipeline(workspace=ws, steps=[stitch_video_step])\n",
|
||||
"\n",
|
||||
"pipeline.validate()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# submit the pipeline and provide values for the PipelineParameters used in the pipeline\n",
|
||||
"pipeline_run = Experiment(ws, 'styletransfer_parallel_mosaic').submit(pipeline)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Monitor pipeline run\n",
|
||||
"\n",
|
||||
"The pipeline run status could be checked in Azure Machine Learning portal (https://ml.azure.com). The link to the pipeline run could be retrieved by inspecting the `pipeline_run` object.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This will output information of the pipeline run, including the link to the details page of portal.\n",
|
||||
"pipeline_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Optional: View detailed logs (streaming) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Wait the run for completion and show output log to console\n",
|
||||
"pipeline_run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Download output video"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Downloads the video in `output_video` folder"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def download_video(run, target_dir=None):\n",
|
||||
" stitch_run = run.find_step_run(stitch_video_step.name)[0]\n",
|
||||
" port_data = stitch_run.get_details()['outputDatasets'][0]['dataset']\n",
|
||||
" port_data.download(target_dir)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_run.wait_for_completion()\n",
|
||||
"download_video(pipeline_run, \"output_video_mosaic\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Publish pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_name = \"style-transfer-batch-inference\"\n",
|
||||
"print(pipeline_name)\n",
|
||||
"\n",
|
||||
"published_pipeline = pipeline.publish(\n",
|
||||
" name=pipeline_name, \n",
|
||||
" description=pipeline_name)\n",
|
||||
"print(\"Newly published pipeline id: {}\".format(published_pipeline.id))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Get published pipeline\n",
|
||||
"This is another way to get the published pipeline."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.core import PublishedPipeline\n",
|
||||
"\n",
|
||||
"# You could retrieve all pipelines that are published, or \n",
|
||||
"# just get the published pipeline object that you have the ID for.\n",
|
||||
"\n",
|
||||
"# Get all published pipeline objects in the workspace\n",
|
||||
"all_pub_pipelines = PublishedPipeline.list(ws)\n",
|
||||
"\n",
|
||||
"# We will iterate through the list of published pipelines and \n",
|
||||
"# use the last ID in the list for Schelue operations: \n",
|
||||
"print(\"Published pipelines found in the workspace:\")\n",
|
||||
"for pub_pipeline in all_pub_pipelines:\n",
|
||||
" print(\"Name:\", pub_pipeline.name,\"\\tDescription:\", pub_pipeline.description, \"\\tId:\", pub_pipeline.id, \"\\tStatus:\", pub_pipeline.status)\n",
|
||||
" if(pub_pipeline.name == pipeline_name):\n",
|
||||
" published_pipeline = pub_pipeline\n",
|
||||
"\n",
|
||||
"print(\"Published pipeline id: {}\".format(published_pipeline.id))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Run pipeline through REST calls for other styles\n",
|
||||
"\n",
|
||||
"# Get AAD token"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.authentication import InteractiveLoginAuthentication\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"auth = InteractiveLoginAuthentication()\n",
|
||||
"aad_token = auth.get_authentication_header()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Get endpoint URL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"rest_endpoint = published_pipeline.endpoint\n",
|
||||
"print(\"Pipeline REST endpoing: {}\".format(rest_endpoint))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Send request and monitor"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment_name = 'styletransfer_parallel_candy'\n",
|
||||
"response = requests.post(rest_endpoint, \n",
|
||||
" headers=aad_token,\n",
|
||||
" json={\"ExperimentName\": experiment_name,\n",
|
||||
" \"ParameterAssignments\": {\"style\": \"candy\", \"NodeCount\": 3}})\n",
|
||||
"\n",
|
||||
"run_id = response.json()[\"Id\"]\n",
|
||||
"\n",
|
||||
"from azureml.pipeline.core.run import PipelineRun\n",
|
||||
"published_pipeline_run_candy = PipelineRun(ws.experiments[experiment_name], run_id)\n",
|
||||
"\n",
|
||||
"# Show detail information of run\n",
|
||||
"published_pipeline_run_candy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Download output from re-run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"published_pipeline_run_candy.wait_for_completion()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"download_video(published_pipeline_run_candy, target_dir=\"output_video_candy\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "sanpil joringer asraniwa pansav tracych"
|
||||
}
|
||||
],
|
||||
"category": "Other notebooks",
|
||||
"compute": [
|
||||
"AML Compute"
|
||||
],
|
||||
"datasets": [],
|
||||
"deployment": [
|
||||
"None"
|
||||
],
|
||||
"exclude_from_index": true,
|
||||
"framework": [
|
||||
"None"
|
||||
],
|
||||
"friendly_name": "Style transfer using ParallelRunStep",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
},
|
||||
"tags": [
|
||||
"Batch Inferencing",
|
||||
"Pipeline"
|
||||
],
|
||||
"task": "Style transfer"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
parser = argparse.ArgumentParser(description="Process input video")
|
||||
parser.add_argument('--input_video', required=True)
|
||||
parser.add_argument('--output_audio', required=True)
|
||||
parser.add_argument('--output_images', required=True)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
os.makedirs(args.output_audio, exist_ok=True)
|
||||
os.makedirs(args.output_images, exist_ok=True)
|
||||
|
||||
subprocess.run("ffmpeg -i {} {}/video.aac".format(args.input_video, args.output_audio),
|
||||
shell=True,
|
||||
check=True)
|
||||
|
||||
subprocess.run("ffmpeg -i {} {}/%05d_video.jpg -hide_banner".format(args.input_video, args.output_images),
|
||||
shell=True,
|
||||
check=True)
|
||||
@@ -1,22 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
# Command-line interface: the frame directory, the directory holding
# video.aac, and the directory that receives the stitched videos.
parser = argparse.ArgumentParser(description="Process input video")
parser.add_argument('--images_dir', required=True)
parser.add_argument('--input_audio', required=True)
parser.add_argument('--output_dir', required=True)

args = parser.parse_args()

os.makedirs(args.output_dir, exist_ok=True)

# ffmpeg is invoked with an argument list (no shell) so that paths containing
# spaces or shell metacharacters reach ffmpeg verbatim instead of being
# re-parsed by the shell. check=True raises CalledProcessError on failure.

# Step 1: encode the numbered JPEG frames into a silent H.264 video.
subprocess.run(
    ["ffmpeg", "-framerate", "30",
     "-i", "{}/%05d_video.jpg".format(args.images_dir),
     "-c:v", "libx264", "-profile:v", "high", "-crf", "20",
     "-pix_fmt", "yuv420p",
     "-y", "{}/video_without_audio.mp4".format(args.output_dir)],
    check=True)

# Step 2: combine the silent video with video.aac, copying both streams
# without re-encoding.
subprocess.run(
    ["ffmpeg",
     "-i", "{}/video_without_audio.mp4".format(args.output_dir),
     "-i", "{}/video.aac".format(args.input_audio),
     "-map", "0:0", "-map", "1:0",
     "-vcodec", "copy", "-acodec", "copy",
     "-y", "{}/video_with_audio.mp4".format(args.output_dir)],
    check=True)
|
||||
@@ -1,172 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import json
|
||||
import traceback
|
||||
from PIL import Image
|
||||
|
||||
import torch
|
||||
from torchvision import transforms
|
||||
|
||||
from azureml.core.model import Model
|
||||
|
||||
# Module-level handle to the style network: init() assigns it and run() calls it.
style_model = None
|
||||
|
||||
|
||||
class TransformerNet(torch.nn.Module):
    """Style network: 3 convolutions, 5 residual blocks, 2 upsampling
    convolutions and a final 3-channel convolution.

    Attribute names are part of the checkpoint format (they become the
    state_dict keys), so they must not be renamed.
    """

    def __init__(self):
        super().__init__()
        instance_norm = torch.nn.InstanceNorm2d
        # Initial convolution layers
        self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
        self.in1 = instance_norm(32, affine=True)
        self.conv2 = ConvLayer(32, 64, kernel_size=3, stride=2)
        self.in2 = instance_norm(64, affine=True)
        self.conv3 = ConvLayer(64, 128, kernel_size=3, stride=2)
        self.in3 = instance_norm(128, affine=True)
        # Residual layers
        self.res1 = ResidualBlock(128)
        self.res2 = ResidualBlock(128)
        self.res3 = ResidualBlock(128)
        self.res4 = ResidualBlock(128)
        self.res5 = ResidualBlock(128)
        # Upsampling Layers
        self.deconv1 = UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2)
        self.in4 = instance_norm(64, affine=True)
        self.deconv2 = UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2)
        self.in5 = instance_norm(32, affine=True)
        self.deconv3 = ConvLayer(32, 3, kernel_size=9, stride=1)
        # Non-linearities
        self.relu = torch.nn.ReLU()

    def forward(self, X):
        """Apply the layers in order; every stage except the residual blocks
        and the last convolution is conv -> instance norm -> ReLU."""
        activate = self.relu
        out = X
        # Downsampling stage.
        for conv, norm in ((self.conv1, self.in1),
                           (self.conv2, self.in2),
                           (self.conv3, self.in3)):
            out = activate(norm(conv(out)))
        # Residual stage.
        for block in (self.res1, self.res2, self.res3, self.res4, self.res5):
            out = block(out)
        # Upsampling stage.
        for conv, norm in ((self.deconv1, self.in4),
                           (self.deconv2, self.in5)):
            out = activate(norm(conv(out)))
        # The final convolution has no normalisation or activation.
        return self.deconv3(out)
|
||||
|
||||
|
||||
class ConvLayer(torch.nn.Module):
    """2-D convolution preceded by reflection padding of ``kernel_size // 2``."""

    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super().__init__()
        self.reflection_pad = torch.nn.ReflectionPad2d(kernel_size // 2)
        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)

    def forward(self, x):
        """Pad ``x`` by reflection, then convolve it."""
        return self.conv2d(self.reflection_pad(x))
|
||||
|
||||
|
||||
class ResidualBlock(torch.nn.Module):
    """Residual block: two 3x3 conv + instance-norm stages plus a skip connection.

    introduced in: https://arxiv.org/abs/1512.03385
    recommended architecture: http://torch.ch/blog/2016/02/04/resnets.html
    """

    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
        self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
        self.in2 = torch.nn.InstanceNorm2d(channels, affine=True)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return ``branch(x) + x``; only the first stage is followed by ReLU."""
        branch = self.conv1(x)
        branch = self.in1(branch)
        branch = self.relu(branch)
        branch = self.conv2(branch)
        branch = self.in2(branch)
        return branch + x
|
||||
|
||||
|
||||
class UpsampleConvLayer(torch.nn.Module):
    """Optional nearest-neighbour upsampling followed by a reflection-padded
    convolution.

    Upsamples the input and then does a convolution. This method gives better
    results compared to ConvTranspose2d.
    ref: http://distill.pub/2016/deconv-checkerboard/
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, upsample=None):
        super().__init__()
        self.upsample = upsample
        # The upsampling module only exists when a scale factor was given.
        if upsample:
            self.upsample_layer = torch.nn.Upsample(mode='nearest', scale_factor=upsample)
        self.reflection_pad = torch.nn.ReflectionPad2d(kernel_size // 2)
        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)

    def forward(self, x):
        """Upsample ``x`` when configured to, then pad and convolve."""
        scaled = self.upsample_layer(x) if self.upsample else x
        return self.conv2d(self.reflection_pad(scaled))
|
||||
|
||||
|
||||
def load_image(filename):
    """Open ``filename`` with PIL and return the resulting image object."""
    return Image.open(filename)
|
||||
|
||||
|
||||
def save_image(filename, data):
    """Write the tensor ``data`` to ``filename`` as an 8-bit image.

    Values are clamped to [0, 255] on a copy, so ``data`` itself is not
    modified. The axes are reordered with ``transpose(1, 2, 0)``.
    """
    # assumes data is a CPU tensor laid out (channels, height, width), as
    # passed by run() -- TODO confirm for other callers
    pixels = data.clone().clamp(0, 255).numpy()
    height_width_channels = pixels.transpose(1, 2, 0).astype("uint8")
    Image.fromarray(height_width_channels).save(filename)
|
||||
|
||||
|
||||
def init():
    """Prepare the module-level state that run() relies on.

    Sets the globals ``output_path`` (from the AZUREML_BI_OUTPUT_PATH
    environment variable), ``args`` (parsed ``--style`` option), ``device``
    (CUDA when available, otherwise CPU) and ``style_model`` (a
    TransformerNet loaded from the registered model named by ``--style``).

    Raises:
        KeyError: if AZUREML_BI_OUTPUT_PATH is not set.
    """
    global output_path, args
    global style_model, device
    output_path = os.environ['AZUREML_BI_OUTPUT_PATH']
    print(f'output path: {output_path}')
    print(f'Cuda available? {torch.cuda.is_available()}')

    arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
    arg_parser.add_argument("--style", type=str, help="style name")
    # parse_known_args tolerates extra arguments on the command line that
    # this script does not declare.
    args, _ = arg_parser.parse_known_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        style_model = TransformerNet()
        model_path = Model.get_model_path(args.style)
        # map_location lets a checkpoint saved from a GPU be loaded on a
        # CPU-only node (and vice versa) instead of failing at load time.
        state_dict = torch.load(model_path, map_location=device)
        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
        for k in list(state_dict.keys()):
            if re.search(r'in\d+\.running_(mean|var)$', k):
                del state_dict[k]
        style_model.load_state_dict(state_dict)
        style_model.to(device)
        print(f'Model loaded successfully. Path: {model_path}')
|
||||
|
||||
|
||||
def run(mini_batch):
    """Stylise every image in ``mini_batch`` and save it under ``output_path``.

    Args:
        mini_batch: iterable of image file paths.

    Returns:
        List of the output file paths, one per input, in input order. Each
        output keeps the base name of its input file.
    """
    # The preprocessing pipeline does not depend on the image, so it is built
    # once per call rather than once per image: convert to a tensor, then
    # multiply by 255.
    content_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.mul(255))
    ])

    result = []
    # Inference only: no autograd bookkeeping is needed for any image.
    with torch.no_grad():
        for image_file_path in mini_batch:
            img = load_image(image_file_path)

            # Add a leading batch dimension and move the tensor to the model's device.
            content_image = content_transform(img).unsqueeze(0).to(device)

            output = style_model(content_image).cpu()
            output_file_path = os.path.join(output_path, os.path.basename(image_file_path))
            save_image(output_file_path, output[0])
            result.append(output_file_path)

    return result
|
||||
@@ -293,7 +293,7 @@
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"pytorch_env = Environment.get(ws, name='azureml-acpt-pytorch-1.13-cuda11.7')"
|
||||
"pytorch_env = Environment.get(ws, name='azureml-acpt-pytorch-2.2-cuda12.1')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -1,378 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Distributed PyTorch with Horovod\n",
|
||||
"In this tutorial, you will train a PyTorch model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using distributed training via [Horovod](https://github.com/uber/horovod) across a GPU cluster."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [Configuration](../../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`\n",
|
||||
"* Review the [tutorial](../train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) on single-node PyTorch training using Azure Machine Learning"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Diagnostics\n",
|
||||
"Opt-in diagnostics for better experience, quality, and security of future releases."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"Diagnostics"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.telemetry import set_diagnostics_collection\n",
|
||||
"\n",
|
||||
"set_diagnostics_collection(send_diagnostics=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"\n",
|
||||
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
" 'Resource group: ' + ws.resource_group, sep='\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or attach existing AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates a `Standard_NC6s_v3` GPU cluster that autoscales from `0` to `4` nodes.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# choose a name for your cluster\n",
|
||||
"cluster_name = \"gpu-cluster\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
|
||||
" print('Found existing compute target.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print('Creating a new compute target...')\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3',\n",
|
||||
" max_nodes=4)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
" compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# use get_status() to get a detailed status for the current AmlCompute. \n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The above code creates GPU compute. If you instead want to create CPU compute, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train model on the remote compute\n",
|
||||
"Now that we have the AmlCompute ready to go, let's run our distributed training job."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a project directory\n",
|
||||
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"project_folder = './pytorch-distr-hvd'\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prepare training script\n",
|
||||
"Now you will need to create your training script. In this tutorial, the script for distributed training of MNIST is already provided for you at `pytorch_horovod_mnist.py`. In practice, you should be able to take any custom PyTorch training script as is and run it with Azure ML without having to modify your code.\n",
|
||||
"\n",
|
||||
"However, if you would like to use Azure ML's [metric logging](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#logging) capabilities, you will have to add a small amount of Azure ML logic inside your training script. In this example, at each logging interval, we will log the loss for that minibatch to our Azure ML run.\n",
|
||||
"\n",
|
||||
"To do so, in `pytorch_horovod_mnist.py`, we will first access the Azure ML `Run` object within the script:\n",
|
||||
"```Python\n",
|
||||
"from azureml.core.run import Run\n",
|
||||
"run = Run.get_context()\n",
|
||||
"```\n",
|
||||
"Later within the script, we log the loss metric to our run:\n",
|
||||
"```Python\n",
|
||||
"run.log('loss', loss.item())\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once your script is ready, copy the training script `pytorch_horovod_mnist.py` into the project directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"shutil.copy('pytorch_horovod_mnist.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'pytorch-distr-hvd'\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an environment\n",
|
||||
"\n",
|
||||
"In this tutorial, we will use one of Azure ML's curated PyTorch environments for training. [Curated environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments#use-a-curated-environment) are available in your workspace by default. Specifically, we will use the PyTorch 1.13 (CUDA 11.7) GPU curated environment. The curated environment includes the `torch`, `torchvision` and `horovod` packages required by the training script."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"pytorch_env = Environment.get(ws, name='AzureML-acpt-pytorch-1.13-cuda11.7')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure the training job\n",
|
||||
"\n",
|
||||
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
|
||||
"\n",
|
||||
"In order to execute a distributed run using MPI/Horovod, you must create an `MpiConfiguration` object and pass it to the `distributed_job_config` parameter of the ScriptRunConfig constructor. The below code will configure a 2-node distributed job running one process per node. If you would also like to run multiple processes per node (i.e. if your cluster SKU has multiple GPUs), additionally specify the `process_count_per_node` parameter in `MpiConfiguration` (the default is `1`)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"from azureml.core.runconfig import MpiConfiguration\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory=project_folder,\n",
|
||||
" script='pytorch_horovod_mnist.py',\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" environment=pytorch_env,\n",
|
||||
" distributed_job_config=MpiConfiguration(node_count=2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit job\n",
|
||||
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = experiment.submit(src)\n",
|
||||
"print(run)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor your run\n",
|
||||
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes. You can see that the widget automatically plots and visualizes the loss metric that we logged to the Azure ML run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Alternatively, you can block until the script has completed training before running more code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run.wait_for_completion(show_output=True) # this provides a verbose log"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "ninhu"
|
||||
}
|
||||
],
|
||||
"category": "training",
|
||||
"compute": [
|
||||
"AML Compute"
|
||||
],
|
||||
"datasets": [
|
||||
"MNIST"
|
||||
],
|
||||
"deployment": [
|
||||
"None"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"PyTorch"
|
||||
],
|
||||
"friendly_name": "Distributed PyTorch",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.18"
|
||||
},
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Train a model using the distributed training via Horovod"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,181 +0,0 @@
|
||||
# Copyright (c) 2017, PyTorch contributors
|
||||
# Modifications copyright (C) Microsoft Corporation
|
||||
# Licensed under the BSD license
|
||||
# Adapted from https://github.com/uber/horovod/blob/master/examples/pytorch_mnist.py
|
||||
|
||||
from __future__ import print_function
|
||||
import argparse
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from torchvision import datasets, transforms
|
||||
import torch.utils.data.distributed
|
||||
import horovod.torch as hvd
|
||||
|
||||
from azureml.core.run import Run
|
||||
# get the Azure ML run object
run = Run.get_context()

print("Torch version:", torch.__version__)

# Training settings, declared as (flag, add_argument keyword arguments) pairs
# and registered in the order listed.
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
for _flag, _options in (
    ('--batch-size', dict(type=int, default=64, metavar='N',
                          help='input batch size for training (default: 64)')),
    ('--test-batch-size', dict(type=int, default=1000, metavar='N',
                               help='input batch size for testing (default: 1000)')),
    ('--epochs', dict(type=int, default=10, metavar='N',
                      help='number of epochs to train (default: 10)')),
    ('--lr', dict(type=float, default=0.01, metavar='LR',
                  help='learning rate (default: 0.01)')),
    ('--momentum', dict(type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')),
    ('--no-cuda', dict(action='store_true', default=False,
                       help='disables CUDA training')),
    ('--seed', dict(type=int, default=42, metavar='S',
                    help='random seed (default: 42)')),
    ('--log-interval', dict(type=int, default=10, metavar='N',
                            help='how many batches to wait before logging training status')),
    ('--fp16-allreduce', dict(action='store_true', default=False,
                              help='use fp16 compression during allreduce')),
):
    parser.add_argument(_flag, **_options)
args = parser.parse_args()
# CUDA is used only when it was not disabled on the command line and a device exists.
args.cuda = not args.no_cuda and torch.cuda.is_available()

hvd.init()
torch.manual_seed(args.seed)

if args.cuda:
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(args.seed)
|
||||
|
||||
|
||||
# Extra keyword arguments forwarded to both DataLoader constructors.
kwargs = {}

# MNIST dataset
# Each entry pairs an archive file name with a 32-character hex digest
# (presumably its MD5 checksum -- verify against torchvision).
datasets.MNIST.resources = [
    ("train-images-idx3-ubyte.gz",
     "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
    ("train-labels-idx1-ubyte.gz",
     "d53e105ee54ea40749a09fcbcd1e9432"),
    ("t10k-images-idx3-ubyte.gz",
     "9fb629c4189551a2d022fa330f9573f3"),
    ("t10k-labels-idx1-ubyte.gz",
     "ec29112dd5afa0611ce80d1b7f02629c")
]

# Every rank reads from (and downloads into) its own data directory.
_data_dir = 'data-%d' % hvd.rank()


def _mnist_transform():
    """Build the preprocessing shared by the training and test splits."""
    return transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])


# Training split: downloaded on demand, sharded across ranks by the sampler.
train_dataset = datasets.MNIST(_data_dir, train=True, download=True,
                               transform=_mnist_transform())
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

# Test split: same directory and preprocessing, also sharded across ranks.
test_dataset = datasets.MNIST(_data_dir, train=False,
                              transform=_mnist_transform())
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs)
|
||||
|
||||
|
||||
class Net(nn.Module):
    """Small convolutional classifier: two conv/max-pool stages followed by
    two fully connected layers, producing 10 log-probabilities per sample."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10).

        The flatten to 320 features fixes the expected input size: with
        1x28x28 inputs the second pooling stage yields 20 maps of 4x4.
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        # Functional dropout must be told explicitly whether we are training.
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Normalise over the class dimension explicitly; relying on the
        # implicit dimension is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)
|
||||
|
||||
|
||||
model = Net()

if args.cuda:
    # Move model to GPU.
    model.cuda()

# Horovod: broadcast parameters.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)

# Horovod: scale learning rate by the number of GPUs.
_scaled_lr = args.lr * hvd.size()
_base_optimizer = optim.SGD(model.parameters(), lr=_scaled_lr,
                            momentum=args.momentum)

# Horovod: (optional) compression algorithm.
if args.fp16_allreduce:
    compression = hvd.Compression.fp16
else:
    compression = hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
    _base_optimizer,
    named_parameters=model.named_parameters(),
    compression=compression)
|
||||
|
||||
|
||||
def train(epoch):
    """Run one training pass over ``train_loader``.

    Every ``args.log_interval`` batches the current loss is printed and also
    logged to the Azure ML run under the metric name 'loss'.
    """
    model.train()
    # Pass the epoch number to the distributed sampler before iterating.
    train_sampler.set_epoch(epoch)
    for step, (inputs, labels) in enumerate(train_loader):
        if args.cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()

        optimizer.zero_grad()
        loss = F.nll_loss(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # Only report on logging steps.
        if step % args.log_interval != 0:
            continue

        loss_value = loss.item()
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_sampler),
            100. * step / len(train_loader), loss_value))

        # log the loss to the Azure ML run
        run.log('loss', loss_value)
|
||||
|
||||
|
||||
def metric_average(val, name):
    """Combine ``val`` across Horovod workers and return a Python number.

    ``val`` is wrapped in a tensor, passed through ``hvd.allreduce`` under the
    operation name ``name``, and the resulting scalar is unwrapped.
    """
    # NOTE(review): presumably allreduce averages by default -- confirm for
    # the installed Horovod version.
    return hvd.allreduce(torch.tensor(val), name=name).item()
|
||||
|
||||
|
||||
def test():
    """Evaluate the model on this worker's shard of the test set.

    The summed loss and the number of correct predictions are divided by the
    shard size, combined across workers with metric_average(), and printed
    by rank 0 only.
    """
    model.eval()
    test_loss = 0.
    test_accuracy = 0.
    # Evaluation needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss (reduction='sum' replaces the deprecated
            # size_average=False)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            # Accumulate a plain number so metric_average() receives a Python
            # scalar rather than a tensor.
            test_accuracy += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_sampler)
    test_accuracy /= len(test_sampler)

    test_loss = metric_average(test_loss, 'avg_loss')
    test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

    if hvd.rank() == 0:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
            test_loss, 100. * test_accuracy))
|
||||
|
||||
|
||||
# Epochs are numbered from 1; each one trains and then evaluates.
# NOTE(review): assumes test() belongs inside the loop -- confirm against the
# original layout, which is not visible in this view.
_first_epoch, _last_epoch = 1, args.epochs
for epoch in range(_first_epoch, _last_epoch + 1):
    train(epoch)
    test()
|
||||
@@ -273,7 +273,7 @@
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"pytorch_env = Environment.get(ws, name='azureml-acpt-pytorch-1.13-cuda11.7')"
|
||||
"pytorch_env = Environment.get(ws, name='azureml-acpt-pytorch-2.2-cuda12.1')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -1,344 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Distributed TensorFlow with Horovod\n",
|
||||
"In this tutorial, you will train a model in TensorFlow using distributed training via [Horovod](https://github.com/uber/horovod)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n",
|
||||
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../../configuration.ipynb) to:\n",
|
||||
" * install the AML SDK\n",
|
||||
" * create a workspace and its configuration file (`config.json`)\n",
|
||||
"* Review the [tutorial](../train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) on single-node TensorFlow training using the SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Diagnostics\n",
|
||||
"Opt-in diagnostics for better experience, quality, and security of future releases."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"Diagnostics"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.telemetry import set_diagnostics_collection\n",
|
||||
"\n",
|
||||
"set_diagnostics_collection(send_diagnostics=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
" 'Resource group: ' + ws.resource_group, sep='\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or Attach existing AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# choose a name for your cluster\n",
|
||||
"cluster_name = \"gpu-cluster\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
|
||||
" print('Found existing compute target')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print('Creating a new compute target...')\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC6s_v3', \n",
|
||||
" max_nodes=4)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
" compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# use get_status() to get a detailed status for the current cluster. \n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You may want to register datasets using the register() method to your workspace so that the dataset can be shared with others, reused across various experiments, and referred to by name in your training script."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train model on the remote compute"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed TensorFlow tutorial. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'tf-distr-hvd'\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an environment\n",
|
||||
"\n",
|
||||
"In this tutorial, we will use one of Azure ML's curated TensorFlow environments for training. [Curated environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments#use-a-curated-environment) are available in your workspace by default. Specifically, we will use the TensorFlow 1.13 GPU curated environment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"tf_env = Environment.get(ws, name='azureml-tensorflow-2.11-cuda11')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure the training job\n",
|
||||
"\n",
|
||||
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
|
||||
"\n",
|
||||
"In order to execute a distributed run using MPI/Horovod, you must create an `MpiConfiguration` object and pass it to the `distributed_job_config` parameter of the ScriptRunConfig constructor. The below code will configure a 2-node distributed job running one process per node. If you would also like to run multiple processes per node (i.e. if your cluster SKU has multiple GPUs), additionally specify the `process_count_per_node` parameter in `MpiConfiguration` (the default is `1`)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"from azureml.core.runconfig import MpiConfiguration\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory=\"src\",\n",
|
||||
" script='train.py',\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" environment=tf_env,\n",
|
||||
" distributed_job_config=MpiConfiguration(node_count=2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit job\n",
|
||||
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = experiment.submit(src)\n",
|
||||
"print(run)\n",
|
||||
"run.get_details()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor your run\n",
|
||||
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Alternatively, you can block until the script has completed training before running more code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "minxia"
|
||||
}
|
||||
],
|
||||
"category": "training",
|
||||
"compute": [
|
||||
"AML Compute"
|
||||
],
|
||||
"datasets": [
|
||||
"None"
|
||||
],
|
||||
"deployment": [
|
||||
"None"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"TensorFlow"
|
||||
],
|
||||
"friendly_name": "Distributed training using TensorFlow with Horovod",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
},
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Use the TensorFlow estimator to train a word2vec model"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,120 +0,0 @@
|
||||
# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Script adapted from: https://github.com/horovod/horovod/blob/master/examples/tensorflow2_keras_mnist.py
|
||||
# ==============================================================================
|
||||
|
||||
import tensorflow as tf
|
||||
import horovod.tensorflow.keras as hvd
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
# Command-line hyperparameters for the run.
parser = argparse.ArgumentParser()
parser.add_argument("--learning-rate", "-lr", default=0.001, type=float)
parser.add_argument("--epochs", default=24, type=int)
parser.add_argument("--steps-per-epoch", default=500, type=int)
args = parser.parse_args()

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    pinned_gpu = gpus[hvd.local_rank()]
    tf.config.experimental.set_visible_devices(pinned_gpu, "GPU")

# Each rank uses its own cache file name for the MNIST archive
# (presumably so workers sharing a filesystem do not collide).
mnist_cache = "mnist-%d.npz" % hvd.rank()
(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(path=mnist_cache)

# Add a trailing channel axis, scale pixels by 1/255, and pair with int64 labels.
features = tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32)
targets = tf.cast(mnist_labels, tf.int64)
dataset = tf.data.Dataset.from_tensor_slices((features, targets))
dataset = dataset.repeat().shuffle(10000).batch(128)

# Small convolutional classifier with a 10-way softmax output.
layer_stack = [
    tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
    tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation="softmax"),
]
mnist_model = tf.keras.Sequential(layer_stack)

# Horovod: adjust learning rate based on number of GPUs, then wrap the
# optimizer in Horovod's DistributedOptimizer.
scaled_lr = args.learning_rate * hvd.size()
opt = hvd.DistributedOptimizer(tf.optimizers.Adam(scaled_lr))

# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
# uses hvd.DistributedOptimizer() to compute gradients.
mnist_model.compile(
    optimizer=opt,
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
    experimental_run_tf_function=False,
)

# Horovod: broadcast initial variable states from rank 0 to all other
# processes so every worker starts from the same weights.
broadcast_cb = hvd.callbacks.BroadcastGlobalVariablesCallback(0)
# Horovod: average metrics among workers at the end of every epoch.
# Note: this callback must come before ReduceLROnPlateau, TensorBoard or
# other metrics-based callbacks.
metric_avg_cb = hvd.callbacks.MetricAverageCallback()
# Horovod: warm the learning rate up to the scaled value over the first three
# epochs instead of starting at it. See https://arxiv.org/abs/1706.02677.
warmup_cb = hvd.callbacks.LearningRateWarmupCallback(
    warmup_epochs=3, initial_lr=scaled_lr, verbose=1
)
callbacks = [broadcast_cb, metric_avg_cb, warmup_cb]

is_rank_zero = hvd.rank() == 0

# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
if is_rank_zero:
    output_dir = "./outputs"
    os.makedirs(output_dir, exist_ok=True)
    checkpoint_pattern = os.path.join(output_dir, "checkpoint-{epoch}.h5")
    callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_pattern))

# Horovod: write logs on worker 0.
verbose = 1 if is_rank_zero else 0

# Train the model.
# Horovod: adjust number of steps based on number of GPUs.
steps_for_this_worker = args.steps_per_epoch // hvd.size()
mnist_model.fit(
    dataset,
    steps_per_epoch=steps_for_this_worker,
    callbacks=callbacks,
    epochs=args.epochs,
    verbose=verbose,
)
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 119 KiB |
@@ -1,190 +0,0 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import tensorflow as tf
|
||||
import time
|
||||
import glob
|
||||
|
||||
from azureml.core import Run
|
||||
from utils import load_data
|
||||
from tensorflow.keras import Model, layers
|
||||
|
||||
|
||||
# Create TF Model.
|
||||
class NeuralNet(Model):
    """Fully connected classifier with two ReLU hidden layers.

    Layer widths come from the module-level ``n_h1``, ``n_h2`` and
    ``n_outputs`` values at construction time.
    """

    def __init__(self):
        super().__init__()
        # Two hidden layers followed by a linear output layer.
        self.h1 = layers.Dense(n_h1, activation=tf.nn.relu)
        self.h2 = layers.Dense(n_h2, activation=tf.nn.relu)
        self.out = layers.Dense(n_outputs)

    def call(self, x, is_training=False):
        """Run the forward pass.

        Returns raw output-layer values when ``is_training`` is true, and
        their softmax otherwise.
        """
        logits = self.out(self.h2(self.h1(x)))
        if is_training:
            return logits
        # Apply softmax when not training.
        return tf.nn.softmax(logits)
|
||||
|
||||
|
||||
def cross_entropy_loss(y, logits):
    """Return the batch-mean sparse softmax cross-entropy of ``logits`` vs ``y``.

    ``y`` is cast to int64 before being passed as the labels argument.
    """
    labels = tf.cast(y, tf.int64)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    return tf.reduce_mean(per_example)
|
||||
|
||||
|
||||
# Accuracy metric.
|
||||
def accuracy(y_pred, y_true):
    """Return the fraction of rows whose argmax over axis 1 equals ``y_true``."""
    predicted_class = tf.argmax(y_pred, 1)
    expected_class = tf.cast(y_true, tf.int64)
    hits = tf.cast(tf.equal(predicted_class, expected_class), tf.float32)
    return tf.reduce_mean(hits, axis=-1)
|
||||
|
||||
|
||||
# Optimization process.
|
||||
def run_optimization(x, y):
    """Apply one optimizer step to ``neural_net`` for the batch ``(x, y)``.

    Uses the module-level ``neural_net`` and ``optimizer``; returns nothing.
    """
    # Record the forward pass so the tape can differentiate the loss.
    with tf.GradientTape() as tape:
        loss = cross_entropy_loss(y, neural_net(x, is_training=True))

    params = neural_net.trainable_variables
    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))
|
||||
|
||||
|
||||
print("TensorFlow version:", tf.__version__)

# Command-line options: data location, batch size, hidden-layer widths,
# learning rate, and an optional checkpoint location to resume from.
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', dest='data_folder', type=str, default='data',
                    help='data folder mounting point')
parser.add_argument('--batch-size', dest='batch_size', type=int, default=128,
                    help='mini batch size for training')
parser.add_argument('--first-layer-neurons', dest='n_hidden_1', type=int, default=128,
                    help='# of neurons in the first layer')
parser.add_argument('--second-layer-neurons', dest='n_hidden_2', type=int, default=128,
                    help='# of neurons in the second layer')
parser.add_argument('--learning-rate', dest='learning_rate', type=float, default=0.01,
                    help='learning rate')
parser.add_argument('--resume-from', type=str, default=None,
                    help='location of the model or checkpoint files from where to resume the training')
args = parser.parse_args()
|
||||
|
||||
previous_model_location = args.resume_from
# You can also use environment variable to get the model/checkpoint files location
# previous_model_location = os.path.expandvars(os.getenv("AZUREML_DATAREFERENCE_MODEL_LOCATION", None))

data_folder = args.data_folder
print('Data folder:', data_folder)


def _load_mnist_file(file_name, is_label):
    """Load the first file named ``file_name`` found anywhere under ``data_folder``."""
    matches = glob.glob(os.path.join(data_folder, '**/' + file_name), recursive=True)
    return load_data(matches[0], is_label)


# load train and test set into numpy arrays
# note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster.
pixel_scale = np.float32(255.0)
X_train = _load_mnist_file('train-images-idx3-ubyte.gz', False) / pixel_scale
X_test = _load_mnist_file('t10k-images-idx3-ubyte.gz', False) / pixel_scale
y_train = _load_mnist_file('train-labels-idx1-ubyte.gz', True).reshape(-1)
y_test = _load_mnist_file('t10k-labels-idx1-ubyte.gz', True).reshape(-1)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

training_set_size = X_train.shape[0]

# Network and training hyperparameters.
n_inputs = 28 * 28
n_h1, n_h2 = args.n_hidden_1, args.n_hidden_2
n_outputs = 10
learning_rate = args.learning_rate
n_epochs = 20
batch_size = args.batch_size

# Build neural network model.
neural_net = NeuralNet()

# Stochastic gradient descent optimizer.
optimizer = tf.optimizers.SGD(learning_rate)

# start an Azure ML run
run = Run.get_context()
|
||||
|
||||
if previous_model_location:
    # Restore model and optimizer variables from the latest checkpoint found
    # in the location given by --resume-from.
    checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer)
    checkpoint_file_path = tf.train.latest_checkpoint(previous_model_location)
    if checkpoint_file_path is None:
        # latest_checkpoint() returns None when the directory holds no
        # checkpoint; fail with a clear message instead of the opaque
        # TypeError that os.path.basename(None) would raise below.
        raise FileNotFoundError(
            "No checkpoint found in '{}'".format(previous_model_location))
    checkpoint.restore(checkpoint_file_path)

    # The first run of digits in the checkpoint file name is reported as the
    # epoch being resumed.
    # NOTE(review): start_epoch is only printed here; the training loop that
    # follows still iterates from epoch 0 - confirm whether that is intended.
    checkpoint_filename = os.path.basename(checkpoint_file_path)
    num_found = re.search(r'\d+', checkpoint_filename)
    if num_found:
        start_epoch = int(num_found.group(0))
        print("Resuming from epoch {}".format(str(start_epoch)))
|
||||
|
||||
# Train for n_epochs, logging accuracies to the Azure ML run after each epoch
# and checkpointing every second epoch; then export the final model and
# report the elapsed wall-clock time.
start_time = time.perf_counter()
for epoch in range(0, n_epochs):

    # randomly shuffle training set
    indices = np.random.permutation(training_set_size)
    X_train = X_train[indices]
    y_train = y_train[indices]

    # batch index
    b_start = 0
    b_end = b_start + batch_size
    for _ in range(training_set_size // batch_size):
        # get a batch
        X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end]

        # update batch index for the next batch
        b_start = b_start + batch_size
        b_end = min(b_start + batch_size, training_set_size)

        # train
        run_optimization(X_batch, y_batch)

    # evaluate training set (only the last mini-batch of the epoch is scored)
    pred = neural_net(X_batch, is_training=False)
    acc_train = accuracy(pred, y_batch)

    # evaluate validation set
    pred = neural_net(X_test, is_training=False)
    acc_val = accuracy(pred, y_test)

    # log accuracies
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24, so convert with float() directly.
    run.log('training_acc', float(acc_train))
    run.log('validation_acc', float(acc_val))
    print(epoch, '-- Training accuracy:', acc_train, '\b Validation accuracy:', acc_val)

    # Save checkpoints in the "./outputs" folder so that they are automatically uploaded into run history.
    checkpoint_dir = './outputs/'
    checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer)

    if epoch % 2 == 0:
        checkpoint.save(checkpoint_dir)

run.log('final_acc', float(acc_val))
os.makedirs('./outputs/model', exist_ok=True)

# files saved in the "./outputs" folder are automatically uploaded into run history
# this is workaround for https://github.com/tensorflow/tensorflow/issues/33913 and will be fixed once we move to >tf2.1
neural_net._set_inputs(X_train)
tf.saved_model.save(neural_net, './outputs/model/')

stop_time = time.perf_counter()
training_time = (stop_time - start_time) * 1000
print("Total time in milliseconds for training: {}".format(str(training_time)))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,27 +0,0 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import gzip
|
||||
import numpy as np
|
||||
import struct
|
||||
|
||||
|
||||
# load compressed MNIST gz files and return numpy arrays
|
||||
def load_data(filename, label=False):
    """Read a gzip-compressed MNIST file into a uint8 numpy array.

    The first 4 bytes are read and discarded; the next 4 bytes are the
    big-endian item count. For image files (``label`` false) two more
    big-endian 32-bit values give rows and columns, and the result has shape
    ``(n_items, n_rows * n_cols)``. For label files the result has shape
    ``(n_items, 1)``.
    """
    with gzip.open(filename) as stream:
        def read_u32():
            # One big-endian unsigned 32-bit header field.
            return struct.unpack('>I', stream.read(4))[0]

        read_u32()  # leading header word is not used
        count = read_u32()
        if label:
            width = 1
        else:
            rows = read_u32()
            cols = read_u32()
            width = rows * cols
        payload = stream.read(count * width)
    return np.frombuffer(payload, dtype=np.uint8).reshape(count, width)
|
||||
|
||||
|
||||
# one-hot encode a 1-D array
|
||||
def one_hot_encode(array, num_of_classes):
    """Return one row of a ``num_of_classes`` identity matrix per entry of ``array``.

    ``array`` is flattened first, so the result has one row per element.
    """
    class_ids = array.reshape(-1)
    identity = np.eye(num_of_classes)
    return identity[class_ids]
|
||||
@@ -101,7 +101,7 @@
|
||||
"\n",
|
||||
"# Check core SDK version number\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using SDK version 1.57.0, you are currently running version\", azureml.core.VERSION)"
|
||||
"print(\"This notebook was created using SDK version 1.59.0, you are currently running version\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -186,7 +186,7 @@
|
||||
"\n",
|
||||
"# Specify conda dependencies with scikit-learn and temporary pointers to mlflow extensions\n",
|
||||
"cd = CondaDependencies.create(\n",
|
||||
" pip_packages=[\"azureml-mlflow\", \"scikit-learn\", \"matplotlib\", \"pandas\", \"numpy\"]\n",
|
||||
" pip_packages=[\"azureml-mlflow\", \"scikit-learn\", \"matplotlib\", \"pandas\", \"numpy\", \"protobuf==5.28.3\"]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"env.python.conda_dependencies = cd"
|
||||
|
||||
@@ -1,466 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Analyze data drift in Azure Machine Learning datasets \n",
|
||||
"\n",
|
||||
"In this tutorial, you will set up a data drift monitor on a weather dataset to:\n",
|
||||
"\n",
|
||||
"☑ Analyze historical data for drift\n",
|
||||
"\n",
|
||||
"☑ Set up a monitor to receive email alerts if data drift is detected going forward\n",
|
||||
"\n",
|
||||
"If your workspace is Enterprise level, view and explore the results in the Azure Machine Learning studio. The video below shows the results from this tutorial. \n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"If you are using an Azure Machine Learning Compute instance, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) if you haven't already established your connection to the AzureML Workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print('SDK version:', azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize Workspace\n",
|
||||
"\n",
|
||||
"Initialize a workspace object from persisted configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"ws"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup target and baseline datasets\n",
|
||||
"\n",
|
||||
"Setup the baseline and target datasets. The baseline will be used to compare each time slice of the target dataset, which is sampled by a given frequency. For further details, see [our documentation](http://aka.ms/datadrift). \n",
|
||||
"\n",
|
||||
"The next few cells will:\n",
|
||||
" * get the default datastore\n",
|
||||
" * upload the `weather-data` to the datastore\n",
|
||||
" * create the Tabular dataset from the data\n",
|
||||
" * add the timeseries trait by specifying the timestamp column `datetime`\n",
|
||||
" * register the dataset\n",
|
||||
" * create the baseline as a time slice of the target dataset\n",
|
||||
" * optionally, register the baseline dataset\n",
|
||||
" \n",
|
||||
"The folder `weather-data` contains weather data from the [NOAA Integrated Surface Data](https://azure.microsoft.com/services/open-datasets/catalog/noaa-integrated-surface-data/) filtered down to station names containing the string 'FLORIDA' to reduce the size of data. See `get_data.py` to see how this data is curated and modify as desired. This script may take a long time to run, hence the data is provided in the `weather-data` folder for this demo."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# use default datastore\n",
|
||||
"dstore = ws.get_default_datastore()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# upload weather data\n",
|
||||
"dstore.upload('weather-data', 'datadrift-data', overwrite=True, show_progress=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import Dataset class\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"# create target dataset \n",
|
||||
"target = Dataset.Tabular.from_parquet_files(dstore.path('datadrift-data/**/data.parquet'))\n",
|
||||
"# set the timestamp column\n",
|
||||
"target = target.with_timestamp_columns('datetime')\n",
|
||||
"# register the target dataset\n",
|
||||
"target = target.register(ws, 'target')\n",
|
||||
"# retrieve the dataset from the workspace by name\n",
|
||||
"target = Dataset.get_by_name(ws, 'target')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import datetime \n",
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"# set baseline dataset as January 2019 weather data\n",
|
||||
"baseline = Dataset.Tabular.from_parquet_files(dstore.path('datadrift-data/2019/01/data.parquet'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# optionally, register the baseline dataset. if skipped, an unregistered dataset will be used\n",
|
||||
"#baseline = baseline.register(ws, 'baseline')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create compute target\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"Create an Azure Machine Learning compute cluster to run the data drift monitor and associated runs. The below cell will create a compute cluster named `'cpu-cluster'`. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"compute_name = 'cpu-cluster'\n",
|
||||
"\n",
|
||||
"if compute_name in ws.compute_targets:\n",
|
||||
" compute_target = ws.compute_targets[compute_name]\n",
|
||||
" if compute_target and type(compute_target) is AmlCompute:\n",
|
||||
" print('found compute target. just use it. ' + compute_name)\n",
|
||||
"else:\n",
|
||||
" print('creating a new compute target...')\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D3_V2', min_nodes=0, max_nodes=2)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)\n",
|
||||
"\n",
|
||||
" # can poll for a minimum number of nodes and for a specific timeout.\n",
|
||||
" # if no min node count is provided it will use the scale settings for the cluster\n",
|
||||
" compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
|
||||
"\n",
|
||||
" # For a more detailed view of current AmlCompute status, use get_status()\n",
|
||||
" print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create data drift monitor\n",
|
||||
"\n",
|
||||
"See [our documentation](http://aka.ms/datadrift) for a complete description for all of the parameters. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"datadrift-remarks-sample"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.datadrift import DataDriftDetector, AlertConfiguration\n",
|
||||
"\n",
|
||||
"alert_config = AlertConfiguration(['user@contoso.com']) # replace with your email to receive alerts from the scheduled pipeline after enabling\n",
|
||||
"\n",
|
||||
"monitor = DataDriftDetector.create_from_datasets(ws, 'weather-monitor', baseline, target, \n",
|
||||
" compute_target='cpu-cluster', # compute target for scheduled pipeline and backfills \n",
|
||||
" frequency='Week', # how often to analyze target data\n",
|
||||
" feature_list=None, # list of features to detect drift on\n",
|
||||
" drift_threshold=None, # threshold from 0 to 1 for email alerting\n",
|
||||
" latency=0, # SLA in hours for target data to arrive in the dataset\n",
|
||||
" alert_config=alert_config) # email addresses to send alert"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Update data drift monitor\n",
|
||||
"\n",
|
||||
"Many settings of the data drift monitor can be updated after creation. In this demo, we will update the `drift_threshold` and `feature_list`. See [our documentation](http://aka.ms/datadrift) for details on which settings can be changed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get monitor by name\n",
|
||||
"monitor = DataDriftDetector.get_by_name(ws, 'weather-monitor')\n",
|
||||
"\n",
|
||||
"# create feature list - need to exclude columns that naturally drift or increment over time, such as year, day, index\n",
|
||||
"columns = list(baseline.take(1).to_pandas_dataframe())\n",
|
||||
"exclude = ['year', 'day', 'version', '__index_level_0__']\n",
|
||||
"features = [col for col in columns if col not in exclude]\n",
|
||||
"\n",
|
||||
"# update the feature list\n",
|
||||
"monitor = monitor.update(feature_list=features)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Analyze historical data and backfill\n",
|
||||
"\n",
|
||||
"You can use the `backfill` method to:\n",
|
||||
" * analyze historical data\n",
|
||||
" * backfill metrics after updating the settings (mainly the feature list)\n",
|
||||
" * backfill metrics for failed runs\n",
|
||||
" \n",
|
||||
"The below cells will run two backfills that will produce data drift results for 2019 weather data, with January used as the baseline in the monitor. The output can be seen from the `show` method after the runs have completed, or viewed from the Azure Machine Learning studio for Enterprise workspaces.\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": true
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
">**Tip!** When starting with the data drift capability, start by backfilling on a small section of data to get initial results. Update the feature list as needed by removing columns that are causing drift, but can be ignored, and backfill this section of data until satisfied with the results. Then, backfill on a larger slice of data and/or set the alert configuration, threshold, and enable the schedule to receive alerts to drift on your dataset. All of this can be done through the UI (Enterprise) or Python SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Although it depends on many factors, the below backfill should typically take less than 20 minutes to run. Results will show as soon as they become available, not when the backfill is completed, so you may begin to see some metrics in a few minutes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# backfill for one month\n",
|
||||
"backfill_start_date = datetime(2019, 9, 1)\n",
|
||||
"backfill_end_date = datetime(2019, 10, 1)\n",
|
||||
"backfill = monitor.backfill(backfill_start_date, backfill_end_date)\n",
|
||||
"backfill"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Query metrics and show results in Python\n",
|
||||
"\n",
|
||||
"The below cell will plot some key data drift metrics, and can be used to query the results. Run `help(monitor.get_output)` for specifics on the object returned."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# make sure the backfill has completed\n",
|
||||
"backfill.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get results from Python SDK (wait for backfills or monitor runs to finish)\n",
|
||||
"results, metrics = monitor.get_output(start_time=datetime(year=2019, month=9, day=1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# plot the results from Python SDK \n",
|
||||
"monitor.show(backfill_start_date, backfill_end_date)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Enable the monitor's pipeline schedule\n",
|
||||
"\n",
|
||||
"Turn on a scheduled pipeline which will analyze the target dataset for drift every `frequency`. Use the latency parameter to adjust the start time of the pipeline. For instance, if it takes 24 hours for my data processing pipelines for data to arrive in the target dataset, set latency to 24. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# enable the pipeline schedule and receive email alerts\n",
|
||||
"monitor.enable_schedule()\n",
|
||||
"\n",
|
||||
"# disable the pipeline schedule \n",
|
||||
"#monitor.disable_schedule()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Delete compute target\n",
|
||||
"\n",
|
||||
"Do not delete the compute target if you intend to keep using it for the data drift monitor scheduled runs or otherwise. If the minimum nodes are set to 0, it will scale down soon after jobs are completed, and scale up the next time the cluster is needed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# optionally delete the compute target\n",
|
||||
"#compute_target.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Delete the DataDriftDetector\n",
|
||||
"\n",
|
||||
"Invoking the `delete()` method on the object deletes the drift monitor permanently and cannot be undone. You will no longer be able to find it in the UI and the `list()` or `get()` methods. The object on which delete() was called will have its state set to deleted and name suffixed with deleted. The baseline and target datasets and model data that was collected, if any, are not deleted. The compute is not deleted. The DataDrift schedule pipeline is disabled and archived."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"monitor.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next steps\n",
|
||||
"\n",
|
||||
" * See [our documentation](https://aka.ms/datadrift) or [Python SDK reference](https://docs.microsoft.com/python/api/overview/azure/ml/intro)\n",
|
||||
" * [Send requests or feedback](mailto:driftfeedback@microsoft.com) on data drift directly to the team\n",
|
||||
" * Please open issues with data drift here on GitHub or on StackOverflow if others are likely to run into the same issue"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "jamgan"
|
||||
}
|
||||
],
|
||||
"category": "tutorial",
|
||||
"compute": [
|
||||
"Remote"
|
||||
],
|
||||
"datasets": [
|
||||
"NOAA"
|
||||
],
|
||||
"deployment": [
|
||||
"None"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"Azure ML"
|
||||
],
|
||||
"friendly_name": "Data drift quickdemo",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"star_tag": [
|
||||
"featured"
|
||||
],
|
||||
"tags": [
|
||||
"Dataset",
|
||||
"Timeseries",
|
||||
"Drift"
|
||||
],
|
||||
"task": "Filtering"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
# import packages
|
||||
import os
|
||||
import pandas as pd
|
||||
from calendar import monthrange
|
||||
from datetime import datetime, timedelta
|
||||
from azureml.core import Dataset, Datastore, Workspace
|
||||
from azureml.opendatasets import NoaaIsdWeather
|
||||
|
||||
# Connect to the workspace described by the local configuration and fetch its
# default datastore.
ws = Workspace.from_config()
dstore = ws.get_default_datastore()

# Tunable download window: the calendar years to fetch and the first month
# fetched within each year.
target_years = list(range(2010, 2020))
start_month = 1

# Fetch one calendar month at a time, keep only rows whose station name
# contains 'FLORIDA', and write each month to
# weather-data/<year>/<month>/data.parquet.
for year in target_years:
    for month in range(start_month, 13):
        path = 'weather-data/{}/{:02d}/'.format(year, month)
        try:
            start = datetime(year, month, 1)
            days_in_month = monthrange(year, month)[1]
            # The requested window ends one day past the last day of the month.
            end = datetime(year, month, days_in_month) + timedelta(days=1)

            isd = NoaaIsdWeather(start, end).to_pandas_dataframe()
            florida_rows = isd['stationName'].str.contains('FLORIDA', regex=True, na=False)
            isd = isd[florida_rows]

            os.makedirs(path, exist_ok=True)
            isd.to_parquet(path + 'data.parquet')
        except Exception as e:
            # Best effort: report the failed month and continue with the next one.
            print('Month {} in year {} likely has no data.\n'.format(month, year))
            print('Exception: {}'.format(e))
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 56 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 8.7 MiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user