mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-20 01:27:06 -05:00
474 lines
18 KiB
Plaintext
474 lines
18 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
|
"\n",
|
|
"Licensed under the MIT License."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# 51. Distributed TensorFlow using Parameter Server\n",
|
"In this tutorial we demonstrate how to use the Azure ML Training SDK to train a TensorFlow model in a distributed manner using a parameter server.\n",
|
|
"\n",
|
|
"# Prerequisites\n",
|
|
"\n",
|
|
"Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Check core SDK version number\n",
|
|
"import azureml.core\n",
|
|
"\n",
|
|
"print(\"SDK version:\", azureml.core.VERSION)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"from azureml.core.workspace import Workspace\n",
"\n",
"# Obtain the workspace via Workspace.from_config() and print a short summary of it\n",
"ws = Workspace.from_config()\n",
"workspace_summary = [\n",
"    'Workspace name: ' + ws.name,\n",
"    'Azure region: ' + ws.location,\n",
"    'Subscription id: ' + ws.subscription_id,\n",
"    'Resource group: ' + ws.resource_group,\n",
"]\n",
"print('\\n'.join(workspace_summary))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"import getpass\n",
"import os\n",
"from azureml.core.experiment import Experiment\n",
"\n",
"# Dashes are removed from the login name before it is used as a prefix\n",
"username = getpass.getuser().replace('-', '')\n",
"\n",
"# Name of the run history container in the workspace\n",
"run_history_name = '{}-tf_ps'.format(username)\n",
"experiment = Experiment(ws, run_history_name)\n",
"\n",
"# The local project folder is named after the run history container\n",
"project_folder = './{}'.format(run_history_name)\n",
"\n",
"print(project_folder)\n",
"os.makedirs(project_folder, exist_ok=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"This recipe uses an MLC-managed Batch AI cluster. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"from azureml.core.compute import BatchAiCompute\n",
"from azureml.core.compute import ComputeTarget\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"batchai_cluster_name = 'gpucluster'\n",
"\n",
"try:\n",
"    # Reuse the cluster when one with this name is already attached to the workspace\n",
"    compute_target = ComputeTarget(ws, batchai_cluster_name)\n",
"    print('Found existing compute target')\n",
"except ComputeTargetException:\n",
"    # Only a failed lookup triggers provisioning; any other error\n",
"    # (authentication, interruption, ...) is allowed to propagate.\n",
"    print('Creating a new compute target...')\n",
"    provisioning_config = BatchAiCompute.provisioning_configuration(vm_size=\"STANDARD_NC6\",  # NC6 is GPU-enabled\n",
"                                                                    #vm_priority = 'lowpriority', # optional\n",
"                                                                    autoscale_enabled=True,\n",
"                                                                    cluster_min_nodes=0,\n",
"                                                                    cluster_max_nodes=4)\n",
"    compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n",
"    # can poll for a minimum number of nodes and for a specific timeout.\n",
"    # if no min node count is provided it will use the scale settings for the cluster\n",
"    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
"# For a more detailed view of current BatchAI cluster status, use the 'status' property\n",
"print(compute_target.status.serialize())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%%writefile {project_folder}/mnist_replica.py\n",
|
|
"\n",
|
|
"# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n",
|
|
"#\n",
|
|
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
|
|
"# you may not use this file except in compliance with the License.\n",
|
|
"# You may obtain a copy of the License at\n",
|
|
"#\n",
|
|
"# http://www.apache.org/licenses/LICENSE-2.0\n",
|
|
"#\n",
|
|
"# Unless required by applicable law or agreed to in writing, software\n",
|
|
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
|
|
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
|
|
"# See the License for the specific language governing permissions and\n",
|
|
"# limitations under the License.\n",
|
|
"# ==============================================================================\n",
|
|
"\"\"\"Distributed MNIST training and validation, with model replicas.\n",
|
|
"A simple softmax model with one hidden layer is defined. The parameters\n",
|
|
"(weights and biases) are located on one parameter server (ps), while the ops\n",
|
|
"are executed on two worker nodes by default. The TF sessions also run on the\n",
|
|
"worker node.\n",
|
|
"Multiple invocations of this script can be done in parallel, with different\n",
|
|
"values for --task_index. There should be exactly one invocation with\n",
|
|
"--task_index, which will create a master session that carries out variable\n",
|
|
"initialization. The other, non-master, sessions will wait for the master\n",
|
|
"session to finish the initialization before proceeding to the training stage.\n",
|
|
"The coordination between the multiple worker invocations occurs due to\n",
|
|
"the definition of the parameters on the same ps devices. The parameter updates\n",
|
|
"from one worker is visible to all other workers. As such, the workers can\n",
|
|
"perform forward computation and gradient calculation in parallel, which\n",
|
|
"should lead to increased training speed for the simple model.\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"from __future__ import absolute_import\n",
|
|
"from __future__ import division\n",
|
|
"from __future__ import print_function\n",
|
|
"\n",
|
|
"import os\n",
|
|
"import math\n",
|
|
"import sys\n",
|
|
"import tempfile\n",
|
|
"import time\n",
|
|
"import json\n",
|
|
"\n",
|
|
"import tensorflow as tf\n",
|
|
"from tensorflow.examples.tutorials.mnist import input_data\n",
|
|
"from azureml.core.run import Run\n",
|
|
"\n",
|
"# Command-line flags for the training script.\n",
"# NOTE: main() below stores MNIST under outputs/MNIST and does not read data_dir.\n",
"flags = tf.app.flags\n",
"flags.DEFINE_string(\"data_dir\", \"/tmp/mnist-data\",\n",
"                    \"Directory for storing mnist data\")\n",
"flags.DEFINE_boolean(\"download_only\", False,\n",
"                     \"Only perform downloading of data; Do not proceed to \"\n",
"                     \"session preparation, model definition or training\")\n",
"# The two help fragments are concatenated, so the first one needs a trailing space.\n",
"flags.DEFINE_integer(\"num_gpus\", 0,\n",
"                     \"Total number of gpus for each machine. \"\n",
"                     \"If you don't use GPU, please set it to '0'\")\n",
"flags.DEFINE_integer(\"replicas_to_aggregate\", None,\n",
"                     \"Number of replicas to aggregate before parameter update \"\n",
"                     \"is applied (For sync_replicas mode only; default: \"\n",
"                     \"num_workers)\")\n",
"flags.DEFINE_integer(\"hidden_units\", 100,\n",
"                     \"Number of units in the hidden layer of the NN\")\n",
"flags.DEFINE_integer(\"train_steps\", 200,\n",
"                     \"Number of (global) training steps to perform\")\n",
"flags.DEFINE_integer(\"batch_size\", 100, \"Training batch size\")\n",
"flags.DEFINE_float(\"learning_rate\", 0.01, \"Learning rate\")\n",
"flags.DEFINE_boolean(\n",
"    \"sync_replicas\", False,\n",
"    \"Use the sync_replicas (synchronized replicas) mode, \"\n",
"    \"wherein the parameter updates from workers are aggregated \"\n",
"    \"before applied to avoid stale gradients\")\n",
"flags.DEFINE_boolean(\n",
"    \"existing_servers\", False, \"Whether servers already exist. If True, \"\n",
"    \"will use the worker hosts via their GRPC URLs (one client process \"\n",
"    \"per worker host). Otherwise, will create an in-process TensorFlow \"\n",
"    \"server.\")\n",
"\n",
"FLAGS = flags.FLAGS\n",
"\n",
"# Side length of the square input images; inputs are flattened to\n",
"# IMAGE_PIXELS * IMAGE_PIXELS values.\n",
"IMAGE_PIXELS = 28\n",
|
|
"\n",
|
|
"\n",
|
"def main(unused_argv):\n",
"    \"\"\"Run one task of the distributed MNIST job described by TF_CONFIG.\n",
"\n",
"    The cluster layout, the job type and the task index are read from the\n",
"    TF_CONFIG environment variable; a \"master\" task is treated as a worker.\n",
"    Worker 0 downloads MNIST, writes a sentinel file and acts as chief; every\n",
"    other task polls for the sentinel before loading the data. When an\n",
"    in-process server is created, a \"ps\" task blocks in server.join().\n",
"\n",
"    Args:\n",
"        unused_argv: arguments passed through by tf.app.run(); ignored.\n",
"\n",
"    Raises:\n",
"        ValueError: if TF_CONFIG is missing or lacks the cluster, task type\n",
"            or task index, or if --num_gpus is negative.\n",
"    \"\"\"\n",
"    data_root = os.path.join(\"outputs\", \"MNIST\")\n",
"\n",
"    tf_config = os.environ.get(\"TF_CONFIG\")\n",
"    if not tf_config:\n",
"        raise ValueError(\"TF_CONFIG not found.\")\n",
"    tf_config_json = json.loads(tf_config)\n",
"    cluster = tf_config_json.get('cluster')\n",
"    task = tf_config_json.get('task', {})\n",
"    job_name = task.get('type')\n",
"    task_index = task.get('index')\n",
"    if cluster is None or job_name is None or task_index is None:\n",
"        # Fail here with a clear message instead of a TypeError further down.\n",
"        raise ValueError(\n",
"            \"TF_CONFIG must define 'cluster', 'task.type' and 'task.index'.\")\n",
"    # A \"master\" task takes the worker role in this script.\n",
"    job_name = \"worker\" if job_name == \"master\" else job_name\n",
"\n",
"    # Worker 0 downloads the data and then writes the sentinel file; all other\n",
"    # tasks wait for the sentinel before reading the data set.\n",
"    sentinel_path = os.path.join(data_root, \"complete.txt\")\n",
"    if job_name == \"worker\" and task_index == 0:\n",
"        mnist = input_data.read_data_sets(data_root, one_hot=True)\n",
"        with open(sentinel_path, 'w+') as f:\n",
"            f.write(\"download complete\")\n",
"    else:\n",
"        while not os.path.exists(sentinel_path):\n",
"            time.sleep(0.01)\n",
"        mnist = input_data.read_data_sets(data_root, one_hot=True)\n",
"\n",
"    if FLAGS.download_only:\n",
"        sys.exit(0)\n",
"\n",
"    print(\"job name = %s\" % job_name)\n",
"    print(\"task index = %d\" % task_index)\n",
"    print(\"number of GPUs = %d\" % FLAGS.num_gpus)\n",
"\n",
"    # Construct the cluster and start the server\n",
"    cluster_spec = tf.train.ClusterSpec(cluster)\n",
"\n",
"    # Get the number of workers.\n",
"    num_workers = len(cluster_spec.task_indices(\"worker\"))\n",
"\n",
"    if not FLAGS.existing_servers:\n",
"        # Not using existing servers. Create an in-process server.\n",
"        server = tf.train.Server(\n",
"            cluster_spec, job_name=job_name, task_index=task_index)\n",
"        if job_name == \"ps\":\n",
"            server.join()\n",
"\n",
"    is_chief = (task_index == 0)\n",
"    if FLAGS.num_gpus < 0:\n",
"        # Without this check worker_device would be left unassigned.\n",
"        raise ValueError(\"num_gpus must be >= 0, got %d\" % FLAGS.num_gpus)\n",
"    if FLAGS.num_gpus > 0:\n",
"        # Avoid gpu allocation conflict: now allocate task_num -> #gpu\n",
"        # for each worker in the corresponding machine\n",
"        gpu = (task_index % FLAGS.num_gpus)\n",
"        worker_device = \"/job:worker/task:%d/gpu:%d\" % (task_index, gpu)\n",
"    else:\n",
"        # Just allocate the CPU to worker server\n",
"        cpu = 0\n",
"        worker_device = \"/job:worker/task:%d/cpu:%d\" % (task_index, cpu)\n",
"    # The device setter will automatically place Variables ops on separate\n",
"    # parameter servers (ps). The non-Variable ops will be placed on the workers.\n",
"    # The ps use CPU and workers use corresponding GPU\n",
"    with tf.device(\n",
"            tf.train.replica_device_setter(\n",
"                worker_device=worker_device,\n",
"                ps_device=\"/job:ps/cpu:0\",\n",
"                cluster=cluster)):\n",
"        global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n",
"\n",
"        # Variables of the hidden layer\n",
"        hid_w = tf.Variable(\n",
"            tf.truncated_normal(\n",
"                [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],\n",
"                stddev=1.0 / IMAGE_PIXELS),\n",
"            name=\"hid_w\")\n",
"        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name=\"hid_b\")\n",
"\n",
"        # Variables of the softmax layer\n",
"        sm_w = tf.Variable(\n",
"            tf.truncated_normal(\n",
"                [FLAGS.hidden_units, 10],\n",
"                stddev=1.0 / math.sqrt(FLAGS.hidden_units)),\n",
"            name=\"sm_w\")\n",
"        sm_b = tf.Variable(tf.zeros([10]), name=\"sm_b\")\n",
"\n",
"        # Ops: located on the worker specified with task_index\n",
"        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])\n",
"        y_ = tf.placeholder(tf.float32, [None, 10])\n",
"\n",
"        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)\n",
"        hid = tf.nn.relu(hid_lin)\n",
"\n",
"        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))\n",
"        # Clip the softmax output so that log() never receives 0.\n",
"        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))\n",
"\n",
"        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)\n",
"\n",
"        if FLAGS.sync_replicas:\n",
"            if FLAGS.replicas_to_aggregate is None:\n",
"                replicas_to_aggregate = num_workers\n",
"            else:\n",
"                replicas_to_aggregate = FLAGS.replicas_to_aggregate\n",
"\n",
"            opt = tf.train.SyncReplicasOptimizer(\n",
"                opt,\n",
"                replicas_to_aggregate=replicas_to_aggregate,\n",
"                total_num_replicas=num_workers,\n",
"                name=\"mnist_sync_replicas\")\n",
"\n",
"        train_step = opt.minimize(cross_entropy, global_step=global_step)\n",
"\n",
"        if FLAGS.sync_replicas:\n",
"            local_init_op = opt.local_step_init_op\n",
"            if is_chief:\n",
"                local_init_op = opt.chief_init_op\n",
"\n",
"            ready_for_local_init_op = opt.ready_for_local_init_op\n",
"\n",
"            # Initial token and chief queue runners required by the sync_replicas mode\n",
"            chief_queue_runner = opt.get_chief_queue_runner()\n",
"            sync_init_op = opt.get_init_tokens_op()\n",
"\n",
"        init_op = tf.global_variables_initializer()\n",
"        train_dir = tempfile.mkdtemp()\n",
"\n",
"        if FLAGS.sync_replicas:\n",
"            sv = tf.train.Supervisor(\n",
"                is_chief=is_chief,\n",
"                logdir=train_dir,\n",
"                init_op=init_op,\n",
"                local_init_op=local_init_op,\n",
"                ready_for_local_init_op=ready_for_local_init_op,\n",
"                recovery_wait_secs=1,\n",
"                global_step=global_step)\n",
"        else:\n",
"            sv = tf.train.Supervisor(\n",
"                is_chief=is_chief,\n",
"                logdir=train_dir,\n",
"                init_op=init_op,\n",
"                recovery_wait_secs=1,\n",
"                global_step=global_step)\n",
"\n",
"        sess_config = tf.ConfigProto(\n",
"            allow_soft_placement=True,\n",
"            log_device_placement=False,\n",
"            device_filters=[\"/job:ps\",\n",
"                            \"/job:worker/task:%d\" % task_index])\n",
"\n",
"        # The chief worker (task_index==0) session will prepare the session,\n",
"        # while the remaining workers will wait for the preparation to complete.\n",
"        if is_chief:\n",
"            print(\"Worker %d: Initializing session...\" % task_index)\n",
"        else:\n",
"            print(\"Worker %d: Waiting for session to be initialized...\" %\n",
"                  task_index)\n",
"\n",
"        if FLAGS.existing_servers:\n",
"            # Look up this worker's address in the cluster spec.\n",
"            server_grpc_url = \"grpc://\" + cluster_spec.task_address(\n",
"                \"worker\", task_index)\n",
"            print(\"Using existing server at: %s\" % server_grpc_url)\n",
"\n",
"            sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)\n",
"        else:\n",
"            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)\n",
"\n",
"        print(\"Worker %d: Session initialization complete.\" % task_index)\n",
"\n",
"        if FLAGS.sync_replicas and is_chief:\n",
"            # Chief worker will start the chief queue runner and call the init op.\n",
"            sess.run(sync_init_op)\n",
"            sv.start_queue_runners(sess, [chief_queue_runner])\n",
"\n",
"        # Perform training\n",
"        time_begin = time.time()\n",
"        print(\"Training begins @ %f\" % time_begin)\n",
"\n",
"        local_step = 0\n",
"        while True:\n",
"            # Training feed\n",
"            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)\n",
"            train_feed = {x: batch_xs, y_: batch_ys}\n",
"\n",
"            _, step = sess.run([train_step, global_step], feed_dict=train_feed)\n",
"            local_step += 1\n",
"\n",
"            now = time.time()\n",
"            print(\"%f: Worker %d: training step %d done (global step: %d)\" %\n",
"                  (now, task_index, local_step, step))\n",
"\n",
"            # The global step is shared, so training stops once all workers\n",
"            # together have reached train_steps.\n",
"            if step >= FLAGS.train_steps:\n",
"                break\n",
"\n",
"        time_end = time.time()\n",
"        print(\"Training ends @ %f\" % time_end)\n",
"        training_time = time_end - time_begin\n",
"        print(\"Training elapsed time: %f s\" % training_time)\n",
"\n",
"        # Validation feed\n",
"        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}\n",
"        val_xent = sess.run(cross_entropy, feed_dict=val_feed)\n",
"        print(\"After %d training step(s), validation cross entropy = %g\" %\n",
"              (FLAGS.train_steps, val_xent))\n",
"        # Only worker 0 logs the metric to the run history.\n",
"        if job_name == \"worker\" and task_index == 0:\n",
"            run = Run.get_submitted_run()\n",
"            run.log(\"CrossEntropy\", val_xent)\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
"    tf.app.run()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Import only the estimator that is used instead of a wildcard import\n",
"from azureml.train.dnn import TensorFlow\n",
"\n",
"# Parameter-server ('ps') job: 2 nodes, 2 workers, 1 parameter server, use_gpu=False\n",
"tf_estimator = TensorFlow(source_directory=project_folder,\n",
"                          compute_target=compute_target,\n",
"                          entry_script='mnist_replica.py',\n",
"                          node_count=2,\n",
"                          worker_count=2,\n",
"                          parameter_server_count=1,\n",
"                          distributed_backend=\"ps\",\n",
"                          use_gpu=False)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"run = experiment.submit(tf_estimator)\n",
|
|
"print(run)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from azureml.train.widgets import RunDetails\n",
|
|
"RunDetails(run).show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"run.wait_for_completion(show_output=True)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|