Mirror of https://github.com/Azure/MachineLearningNotebooks.git, synced 2025-12-20 09:37:04 -05:00
Merge pull request #1078 from Azure/release_update/Release-59
update samples from Release-59 as a part of SDK release
@@ -35,6 +35,7 @@ Using these samples, you will learn how to do the following.

| [cartpole_sc.ipynb](cartpole-on-single-compute/cartpole_sc.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Cluster (single node) |
| [pong_rllib.ipynb](atari-on-distributed-compute/pong_rllib.ipynb) | Notebook for distributed training of Pong agent using RLlib on multiple compute targets |
| [minecraft.ipynb](minecraft-on-distributed-compute/minecraft.ipynb) | Notebook to train an agent to navigate through a lava maze in the Minecraft game |
| [particle.ipynb](multiagent-particle-envs/particle.ipynb) | Notebook to train policies in a multiagent cooperative navigation scenario based on OpenAI's Particle environments |

## Prerequisites
@@ -0,0 +1,60 @@
FROM mcr.microsoft.com/azureml/base:openmpi3.1.2-ubuntu18.04

# Install some basic utilities
RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
    sudo \
    cpio \
    git \
    bzip2 \
    libx11-6 \
    tmux \
    htop \
    gcc \
    xvfb \
    python-opengl \
    x11-xserver-utils \
    ffmpeg \
    mesa-utils \
    nano \
    vim \
    rsync \
    && rm -rf /var/lib/apt/lists/*

# Install python 3.7
RUN conda install python==3.7

# Create a working directory
RUN mkdir /app
WORKDIR /app

# Install required pip packages
RUN pip install --upgrade pip setuptools && pip install --upgrade \
    pandas \
    matplotlib \
    psutil \
    numpy \
    scipy \
    gym \
    azureml-defaults \
    tensorboardX \
    tensorflow==1.15 \
    tensorflow-probability==0.8.0 \
    onnxruntime \
    tf2onnx \
    cloudpickle==1.2.0 \
    tabulate \
    dm_tree \
    lz4 \
    opencv-python \
    ray==0.8.3 \
    ray[rllib]==0.8.3 \
    ray[tune]==0.8.3

# Install particle
RUN git clone https://github.com/openai/multiagent-particle-envs.git
COPY patch_files/* multiagent-particle-envs/multiagent/
RUN cd multiagent-particle-envs && \
    pip install -e . && \
    pip install --upgrade pyglet==1.3.2
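A quick way to sanity-check an image built from the Dockerfile above is an import smoke test run inside the container. This is a minimal sketch, not part of the original change; it assumes only the packages pinned above (ray 0.8.3, TensorFlow 1.15, cloudpickle 1.2.0, and the editable `multiagent` install).

```python
# Smoke test for the training image; run inside a container built from the
# Dockerfile above. Only packages pinned there are assumed to be present.
import cloudpickle
import gym
import ray
import tensorflow as tf

assert ray.__version__ == "0.8.3", ray.__version__
assert tf.__version__.startswith("1.15"), tf.__version__
assert cloudpickle.__version__ == "1.2.0", cloudpickle.__version__

# The patched Particle environments were installed with `pip install -e .`,
# so the `multiagent` package should be importable as well.
import multiagent  # noqa: F401

print("image OK:", gym.__version__, ray.__version__, tf.__version__)
```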
@@ -0,0 +1,70 @@
# MIT License

# Copyright (c) 2018 OpenAI

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
import gym


class MultiDiscrete(gym.Space):
    """
    - The multi-discrete action space consists of a series of discrete action spaces with different
      parameters
    - It can be adapted to both a Discrete action space or a continuous (Box) action space
    - It is useful to represent game controllers or keyboards where each key can be represented as
      a discrete action space
    - It is parametrized by passing an array of arrays containing [min, max] for each discrete action
      space where the discrete action space can take any integers from `min` to `max` (both inclusive)
    Note: A value of 0 always needs to represent the NOOP action.
    e.g. Nintendo Game Controller
    - Can be conceptualized as 3 discrete action spaces:
        1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
        2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
        3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
    - Can be initialized as
        MultiDiscrete([ [0,4], [0,1], [0,1] ])
    """
    def __init__(self, array_of_param_array):
        self.low = np.array([x[0] for x in array_of_param_array])
        self.high = np.array([x[1] for x in array_of_param_array])
        self.num_discrete_space = self.low.shape[0]

    def sample(self):
        """ Returns an array with one sample from each discrete action space """
        # For each row: round(random .* (max - min) + min, 0)
        # random_array = prng.np_random.rand(self.num_discrete_space)
        random_array = np.random.RandomState().rand(self.num_discrete_space)
        return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]

    def contains(self, x):
        return len(x) == self.num_discrete_space \
            and (np.array(x) >= self.low).all() \
            and (np.array(x) <= self.high).all()

    @property
    def shape(self):
        return self.num_discrete_space

    def __repr__(self):
        return "MultiDiscrete" + str(self.num_discrete_space)

    def __eq__(self, other):
        return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
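A short usage sketch for the `MultiDiscrete` space above, following the game-controller layout from its docstring; it assumes it runs in the same module as the class and uses nothing beyond what the class defines.

```python
# Example: arrow keys (0-4), button A (0-1), button B (0-1), as in the docstring.
space = MultiDiscrete([[0, 4], [0, 1], [0, 1]])

action = space.sample()                # e.g. [3, 0, 1]
assert space.contains(action)          # each component lies in its [min, max]
assert not space.contains([5, 0, 0])   # 5 exceeds the arrow-key range

print(space, "sampled", action, "with", space.shape, "sub-spaces")
```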
@@ -0,0 +1,413 @@
# MIT License

# Copyright (c) 2018 OpenAI

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
2D rendering framework
"""
from __future__ import division
import os
import six
import sys
from gym import error
import math
import numpy as np
import pyglet

from pyglet.gl import glEnable, glHint, glLineWidth, glBlendFunc, glClearColor, glPushMatrix, \
    glTranslatef, glRotatef, glScalef, glPopMatrix, glColor4f, glBegin, glVertex3f, glEnd, glLineStipple, \
    glDisable, glVertex2f, GL_BLEND, GL_LINE_SMOOTH, GL_LINE_SMOOTH_HINT, GL_NICEST, GL_SRC_ALPHA, \
    GL_ONE_MINUS_SRC_ALPHA, GL_LINE_STIPPLE, GL_POINTS, GL_QUADS, GL_TRIANGLES, GL_POLYGON, GL_LINE_LOOP, \
    GL_LINE_STRIP, GL_LINES


if "Apple" in sys.version:
    if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ:
        os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib'
        # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite


RAD2DEG = 57.29577951308232


def get_display(spec):
    """Convert a display specification (such as :0) into an actual Display
    object.

    Pyglet only supports multiple Displays on Linux.
    """
    if spec is None:
        return None
    elif isinstance(spec, six.string_types):
        return pyglet.canvas.Display(spec)
    else:
        raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec))


class Viewer(object):
    def __init__(self, width, height, display=None):
        display = get_display(display)

        self.width = width
        self.height = height

        self.window = pyglet.window.Window(width=width, height=height, display=display)
        self.window.on_close = self.window_closed_by_user
        self.geoms = []
        self.onetime_geoms = []
        self.transform = Transform()

        glEnable(GL_BLEND)
        # glEnable(GL_MULTISAMPLE)
        glEnable(GL_LINE_SMOOTH)
        # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE)
        glHint(GL_LINE_SMOOTH_HINT, GL_NICEST)
        glLineWidth(2.0)
        glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)

    def close(self):
        self.window.close()

    def window_closed_by_user(self):
        self.close()

    def set_bounds(self, left, right, bottom, top):
        assert right > left and top > bottom
        scalex = self.width / (right - left)
        scaley = self.height / (top - bottom)
        self.transform = Transform(
            translation=(-left * scalex, -bottom * scaley),
            scale=(scalex, scaley))

    def add_geom(self, geom):
        self.geoms.append(geom)

    def add_onetime(self, geom):
        self.onetime_geoms.append(geom)

    def render(self, return_rgb_array=False):
        glClearColor(1, 1, 1, 1)
        self.window.clear()
        self.window.switch_to()
        self.window.dispatch_events()
        self.transform.enable()
        for geom in self.geoms:
            geom.render()
        for geom in self.onetime_geoms:
            geom.render()
        self.transform.disable()
        arr = None
        if return_rgb_array:
            buffer = pyglet.image.get_buffer_manager().get_color_buffer()
            image_data = buffer.get_image_data()
            arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
            # In https://github.com/openai/gym-http-api/issues/2, we
            # discovered that someone using Xmonad on Arch was having
            # a window of size 598 x 398, though a 600 x 400 window
            # was requested. (Guess Xmonad was preserving a pixel for
            # the boundary.) So we use the buffer height/width rather
            # than the requested one.
            arr = arr.reshape(buffer.height, buffer.width, 4)
            arr = arr[::-1, :, 0:3]
        self.window.flip()
        self.onetime_geoms = []
        return arr

    # Convenience
    def draw_circle(self, radius=10, res=30, filled=True, **attrs):
        geom = make_circle(radius=radius, res=res, filled=filled)
        _add_attrs(geom, attrs)
        self.add_onetime(geom)
        return geom

    def draw_polygon(self, v, filled=True, **attrs):
        geom = make_polygon(v=v, filled=filled)
        _add_attrs(geom, attrs)
        self.add_onetime(geom)
        return geom

    def draw_polyline(self, v, **attrs):
        geom = make_polyline(v=v)
        _add_attrs(geom, attrs)
        self.add_onetime(geom)
        return geom

    def draw_line(self, start, end, **attrs):
        geom = Line(start, end)
        _add_attrs(geom, attrs)
        self.add_onetime(geom)
        return geom

    def get_array(self):
        self.window.flip()
        image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
        self.window.flip()
        arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
        arr = arr.reshape(self.height, self.width, 4)
        return arr[::-1, :, 0:3]


def _add_attrs(geom, attrs):
    if "color" in attrs:
        geom.set_color(*attrs["color"])
    if "linewidth" in attrs:
        geom.set_linewidth(attrs["linewidth"])


class Geom(object):
    def __init__(self):
        self._color = Color((0, 0, 0, 1.0))
        self.attrs = [self._color]

    def render(self):
        for attr in reversed(self.attrs):
            attr.enable()
        self.render1()
        for attr in self.attrs:
            attr.disable()

    def render1(self):
        raise NotImplementedError

    def add_attr(self, attr):
        self.attrs.append(attr)

    def set_color(self, r, g, b, alpha=1):
        self._color.vec4 = (r, g, b, alpha)


class Attr(object):
    def enable(self):
        raise NotImplementedError

    def disable(self):
        pass


class Transform(Attr):
    def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1, 1)):
        self.set_translation(*translation)
        self.set_rotation(rotation)
        self.set_scale(*scale)

    def enable(self):
        glPushMatrix()
        glTranslatef(self.translation[0], self.translation[1], 0)  # translate to GL loc point
        glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
        glScalef(self.scale[0], self.scale[1], 1)

    def disable(self):
        glPopMatrix()

    def set_translation(self, newx, newy):
        self.translation = (float(newx), float(newy))

    def set_rotation(self, new):
        self.rotation = float(new)

    def set_scale(self, newx, newy):
        self.scale = (float(newx), float(newy))


class Color(Attr):
    def __init__(self, vec4):
        self.vec4 = vec4

    def enable(self):
        glColor4f(*self.vec4)


class LineStyle(Attr):
    def __init__(self, style):
        self.style = style

    def enable(self):
        glEnable(GL_LINE_STIPPLE)
        glLineStipple(1, self.style)

    def disable(self):
        glDisable(GL_LINE_STIPPLE)


class LineWidth(Attr):
    def __init__(self, stroke):
        self.stroke = stroke

    def enable(self):
        glLineWidth(self.stroke)


class Point(Geom):
    def __init__(self):
        Geom.__init__(self)

    def render1(self):
        glBegin(GL_POINTS)  # draw point
        glVertex3f(0.0, 0.0, 0.0)
        glEnd()


class FilledPolygon(Geom):
    def __init__(self, v):
        Geom.__init__(self)
        self.v = v

    def render1(self):
        if len(self.v) == 4:
            glBegin(GL_QUADS)
        elif len(self.v) > 4:
            glBegin(GL_POLYGON)
        else:
            glBegin(GL_TRIANGLES)
        for p in self.v:
            glVertex3f(p[0], p[1], 0)  # draw each vertex
        glEnd()

        color = (
            self._color.vec4[0] * 0.5,
            self._color.vec4[1] * 0.5,
            self._color.vec4[2] * 0.5,
            self._color.vec4[3] * 0.5)
        glColor4f(*color)
        glBegin(GL_LINE_LOOP)
        for p in self.v:
            glVertex3f(p[0], p[1], 0)  # draw each vertex
        glEnd()


def make_circle(radius=10, res=30, filled=True):
    points = []
    for i in range(res):
        ang = 2 * math.pi * i / res
        points.append((math.cos(ang) * radius, math.sin(ang) * radius))
    if filled:
        return FilledPolygon(points)
    else:
        return PolyLine(points, True)


def make_polygon(v, filled=True):
    if filled:
        return FilledPolygon(v)
    else:
        return PolyLine(v, True)


def make_polyline(v):
    return PolyLine(v, False)


def make_capsule(length, width):
    l, r, t, b = 0, length, width / 2, -width / 2
    box = make_polygon([(l, b), (l, t), (r, t), (r, b)])
    circ0 = make_circle(width / 2)
    circ1 = make_circle(width / 2)
    circ1.add_attr(Transform(translation=(length, 0)))
    geom = Compound([box, circ0, circ1])
    return geom


class Compound(Geom):
    def __init__(self, gs):
        Geom.__init__(self)
        self.gs = gs
        for g in self.gs:
            g.attrs = [a for a in g.attrs if not isinstance(a, Color)]

    def render1(self):
        for g in self.gs:
            g.render()


class PolyLine(Geom):
    def __init__(self, v, close):
        Geom.__init__(self)
        self.v = v
        self.close = close
        self.linewidth = LineWidth(1)
        self.add_attr(self.linewidth)

    def render1(self):
        glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
        for p in self.v:
            glVertex3f(p[0], p[1], 0)  # draw each vertex
        glEnd()

    def set_linewidth(self, x):
        self.linewidth.stroke = x


class Line(Geom):
    def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
        Geom.__init__(self)
        self.start = start
        self.end = end
        self.linewidth = LineWidth(1)
        self.add_attr(self.linewidth)

    def render1(self):
        glBegin(GL_LINES)
        glVertex2f(*self.start)
        glVertex2f(*self.end)
        glEnd()


class Image(Geom):
    def __init__(self, fname, width, height):
        Geom.__init__(self)
        self.width = width
        self.height = height
        img = pyglet.image.load(fname)
        self.img = img
        self.flip = False

    def render1(self):
        self.img.blit(-self.width / 2, -self.height / 2, width=self.width, height=self.height)


class SimpleImageViewer(object):
    def __init__(self, display=None):
        self.window = None
        self.isopen = False
        self.display = display

    def imshow(self, arr):
        if self.window is None:
            height, width, channels = arr.shape
            self.window = pyglet.window.Window(width=width, height=height, display=self.display)
            self.width = width
            self.height = height
            self.isopen = True
        assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong shape"
        image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
        self.window.clear()
        self.window.switch_to()
        self.window.dispatch_events()
        image.blit(0, 0)
        self.window.flip()

    def close(self):
        if self.isopen:
            self.window.close()
            self.isopen = False

    def __del__(self):
        self.close()
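A minimal sketch of the `Viewer` API above, assuming a display is available (for example by launching Python under `xvfb-run`, as the notebook below does) and that it runs in the same module as these classes.

```python
# Draw one persistent circle and a one-time line, then grab an RGB frame.
viewer = Viewer(width=300, height=300)
viewer.set_bounds(-1, 1, -1, 1)              # map world coords [-1, 1]^2 to the window

circle = make_circle(radius=0.1)             # persistent geometry
circle.set_color(0.35, 0.35, 0.85)
circle.add_attr(Transform(translation=(0.5, 0.0)))
viewer.add_geom(circle)

viewer.draw_line((-1, 0), (1, 0))            # one-time geometry, cleared each frame
frame = viewer.render(return_rgb_array=True)
print(frame.shape)                           # (height, width, 3) uint8 RGB array
viewer.close()
```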
@@ -0,0 +1,124 @@
import argparse
import re
import os

import ray
from ray.tune import run_experiments
from ray.tune.registry import register_trainable, register_env, get_trainable_cls
import ray.rllib.contrib.maddpg.maddpg as maddpg

from rllib_multiagent_particle_env import env_creator
from util import parse_args


def setup_ray():
    ray.init(address='auto')

    register_env('particle', env_creator)


def gen_policy(args, env, id):
    use_local_critic = [
        args.adv_policy == 'ddpg' if id < args.num_adversaries else
        args.good_policy == 'ddpg' for id in range(env.num_agents)
    ]
    return (
        None,
        env.observation_space_dict[id],
        env.action_space_dict[id],
        {
            'agent_id': id,
            'use_local_critic': use_local_critic[id],
            'obs_space_dict': env.observation_space_dict,
            'act_space_dict': env.action_space_dict,
        }
    )


def gen_policies(args, env_config):
    env = env_creator(env_config)
    return {'policy_%d' % i: gen_policy(args, env, i) for i in range(len(env.observation_space_dict))}


def to_multiagent_config(policies):
    policy_ids = list(policies.keys())
    return {
        'policies': policies,
        'policy_mapping_fn': lambda index: policy_ids[index]
    }


def train(args, env_config):
    def stop(trial_id, result):
        max_train_time = int(os.environ.get('AML_MAX_TRAIN_TIME_SECONDS', 2 * 60 * 60))

        return result['episode_reward_mean'] >= args.final_reward \
            or result['time_total_s'] >= max_train_time

    run_experiments({
        'MADDPG_RLLib': {
            'run': 'contrib/MADDPG',
            'env': 'particle',
            'stop': stop,
            # Uncomment to enable more frequent checkpoints:
            # 'checkpoint_freq': args.checkpoint_freq,
            'checkpoint_at_end': True,
            'local_dir': args.local_dir,
            'restore': args.restore,
            'config': {
                # === Log ===
                'log_level': 'ERROR',

                # === Environment ===
                'env_config': env_config,
                'num_envs_per_worker': args.num_envs_per_worker,
                'horizon': args.max_episode_len,

                # === Policy Config ===
                # --- Model ---
                'good_policy': args.good_policy,
                'adv_policy': args.adv_policy,
                'actor_hiddens': [args.num_units] * 2,
                'actor_hidden_activation': 'relu',
                'critic_hiddens': [args.num_units] * 2,
                'critic_hidden_activation': 'relu',
                'n_step': args.n_step,
                'gamma': args.gamma,

                # --- Exploration ---
                'tau': 0.01,

                # --- Replay buffer ---
                'buffer_size': int(1e6),

                # --- Optimization ---
                'actor_lr': args.lr,
                'critic_lr': args.lr,
                'learning_starts': args.train_batch_size * args.max_episode_len,
                'sample_batch_size': args.sample_batch_size,
                'train_batch_size': args.train_batch_size,
                'batch_mode': 'truncate_episodes',

                # --- Parallelism ---
                'num_workers': args.num_workers,
                'num_gpus': args.num_gpus,
                'num_gpus_per_worker': 0,

                # === Multi-agent setting ===
                'multiagent': to_multiagent_config(gen_policies(args, env_config)),
            },
        },
    }, verbose=1)


if __name__ == '__main__':
    args = parse_args()
    setup_ray()

    env_config = {
        'scenario_name': args.scenario,
        'horizon': args.max_episode_len,
        'video_frequency': args.checkpoint_freq,
    }

    train(args, env_config)
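To make the `multiagent` wiring above concrete, here is a small illustrative sketch of the structure that `gen_policies` plus `to_multiagent_config` hand to RLlib. The spaces are stand-ins (plain Gym `Box`/`Discrete` objects), not the real Particle spaces, and the 3-agent count is arbitrary.

```python
import numpy as np
from gym.spaces import Box, Discrete

# Stand-in spaces for a hypothetical 3-agent environment.
obs_space = Box(low=-np.inf, high=np.inf, shape=(18,))
act_space = Discrete(5)

policies = {
    'policy_%d' % i: (None, obs_space, act_space, {'agent_id': i})
    for i in range(3)
}

multiagent_config = {
    'policies': policies,
    # Agent index i is trained and evaluated with 'policy_i' (one policy per agent).
    'policy_mapping_fn': lambda index: list(policies.keys())[index],
}

print(multiagent_config['policy_mapping_fn'](1))  # -> 'policy_1'
```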
@@ -0,0 +1,113 @@
# Some code taken from: https://github.com/wsjeon/maddpg-rllib/

import imp
import os

import gym
from gym import wrappers
from ray import rllib

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios


CUSTOM_SCENARIOS = ['simple_switch']


class ParticleEnvRenderWrapper(gym.Wrapper):
    def __init__(self, env, horizon):
        super().__init__(env)
        self.horizon = horizon

    def reset(self):
        self._num_steps = 0

        return self.env.reset()

    def render(self, mode):
        if mode == 'human':
            self.env.render(mode=mode)
        else:
            return self.env.render(mode=mode)[0]

    def step(self, actions):
        obs_list, rew_list, done_list, info_list = self.env.step(actions)

        self._num_steps += 1
        done = (all(done_list) or self._num_steps >= self.horizon)

        # Gym monitor expects reward to be an int. This is only used for its
        # stats reporter, which we're not interested in. To make video recording
        # work, we package the rewards in the info object and extract it below.
        return obs_list, 0, done, [rew_list, done_list, info_list]


class RLlibMultiAgentParticleEnv(rllib.MultiAgentEnv):
    def __init__(self, scenario_name, horizon, monitor_enabled=False, video_frequency=500):
        self._env = _make_env(scenario_name, horizon, monitor_enabled, video_frequency)
        self.num_agents = self._env.n
        self.agent_ids = list(range(self.num_agents))

        self.observation_space_dict = self._make_dict(self._env.observation_space)
        self.action_space_dict = self._make_dict(self._env.action_space)

    def reset(self):
        obs_dict = self._make_dict(self._env.reset())
        return obs_dict

    def step(self, action_dict):
        actions = list(action_dict.values())
        obs_list, _, _, infos = self._env.step(actions)
        rew_list, done_list, _ = infos

        obs_dict = self._make_dict(obs_list)
        rew_dict = self._make_dict(rew_list)
        done_dict = self._make_dict(done_list)
        done_dict['__all__'] = all(done_list)
        info_dict = self._make_dict([{'done': done} for done in done_list])

        return obs_dict, rew_dict, done_dict, info_dict

    def render(self, mode='human'):
        self._env.render(mode=mode)

    def _make_dict(self, values):
        return dict(zip(self.agent_ids, values))


def _video_callable(video_frequency):
    def should_record_video(episode_id):
        if episode_id % video_frequency == 0:
            return True
        return False

    return should_record_video


def _make_env(scenario_name, horizon, monitor_enabled, video_frequency):
    if scenario_name in CUSTOM_SCENARIOS:
        # Scenario file must exist locally
        file_path = os.path.join(os.path.dirname(__file__), scenario_name + '.py')
        scenario = imp.load_source('', file_path).Scenario()
    else:
        scenario = scenarios.load(scenario_name + '.py').Scenario()

    world = scenario.make_world()

    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    env.metadata['video.frames_per_second'] = 8

    env = ParticleEnvRenderWrapper(env, horizon)

    if not monitor_enabled:
        return env

    return wrappers.Monitor(env, './logs/videos', resume=True, video_callable=_video_callable(video_frequency))


def env_creator(config):
    monitor_enabled = False
    if hasattr(config, 'worker_index') and hasattr(config, 'vector_index'):
        monitor_enabled = (config.worker_index == 1 and config.vector_index == 0)

    return RLlibMultiAgentParticleEnv(**config, monitor_enabled=monitor_enabled)
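A small local sketch of exercising the factory above outside RLlib. It assumes the patched `multiagent` package is installed (as in the Dockerfile) and passes a plain dict as the config, so no Gym monitor is attached (a dict has no `worker_index`/`vector_index` attributes); it only inspects the per-agent space dictionaries that `gen_policy` in the training script consumes.

```python
# Build the wrapped Particle environment directly and inspect its interfaces.
env = env_creator({
    'scenario_name': 'simple_spread',
    'horizon': 25,
    'video_frequency': 200,
})

obs = env.reset()                              # {agent_id: observation, ...}
print(env.num_agents, sorted(obs.keys()))
for agent_id in env.agent_ids:
    print(agent_id, env.observation_space_dict[agent_id], env.action_space_dict[agent_id])
```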
@@ -0,0 +1,358 @@
import numpy as np
import random

from multiagent.core import World, Agent, Landmark
from multiagent.scenario import BaseScenario


class SwitchWorld(World):
    """ Extended World with hills and switches """
    def __init__(self, hills, switches):
        super().__init__()
        # add hills and switches
        self.hills = hills
        self.switches = switches
        self.landmarks.extend(self.hills)
        self.landmarks.extend(self.switches)

    def step(self):

        super().step()

        # if all hills are activated, reset the switches and hills
        if all([hill.active for hill in self.hills]):
            self.reset_hills()
            self.reset_switches()
        else:
            # Update switches
            for switch in self.switches:
                switch.step(self)
            # Update hills
            for hill in self.hills:
                hill.step(self)

    def reset_hills(self):
        possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
        hill_positions = random.sample(possible_hill_positions, k=len(self.hills))
        for i, hill in enumerate(self.hills):
            hill.state.p_pos = hill_positions[i]
            hill.deactivate()

    def reset_switches(self):
        possible_switch_positions = [
            np.array([-0.8, -0.8]),
            np.array([-0.8, 0.8]),
            np.array([0.8, -0.8]),
            np.array([0.8, 0.8])]
        switch_positions = random.sample(possible_switch_positions, k=len(self.switches))
        for i, switch in enumerate(self.switches):
            switch.state.p_pos = switch_positions[i]
            switch.deactivate()


class Scenario(BaseScenario):
    def make_world(self):

        # main configurations
        num_agents = 2
        num_hills = 2
        num_switches = 1
        self.max_episode_length = 100

        # create hills (on edges)
        possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
        hill_positions = random.sample(possible_hill_positions, k=num_hills)
        hills = [Hill(hill_positions[i]) for i in range(num_hills)]
        # create switches (in corners)
        possible_switch_positions = [
            np.array([-0.8, -0.8]),
            np.array([-0.8, 0.8]),
            np.array([0.8, -0.8]),
            np.array([0.8, 0.8])]
        switch_positions = random.sample(possible_switch_positions, k=num_switches)
        switches = [Switch(switch_positions[i]) for i in range(num_switches)]

        # make world and set basic properties
        world = SwitchWorld(hills, switches)
        world.dim_c = 2
        world.collaborative = True

        # add agents
        world.agents = [Agent() for i in range(num_agents)]
        for i, agent in enumerate(world.agents):
            agent.name = 'agent %d' % i
            agent.collide = True
            agent.silent = True
            agent.size = 0.1
            agent.accel = 5.0
            agent.max_speed = 5.0
            if i == 0:
                agent.color = np.array([0.35, 0.35, 0.85])
            else:
                agent.color = np.array([0.35, 0.85, 0.85])

        # make initial conditions
        self.reset_world(world)

        return world

    def reset_world(self, world):
        # set random initial states
        for agent in world.agents:
            agent.state.p_pos = np.array([random.uniform(-1, +1) for _ in range(world.dim_p)])
            agent.state.p_vel = np.zeros(world.dim_p)
            agent.state.c = np.zeros(world.dim_c)
        # set hills randomly
        world.reset_hills()
        # set switches randomly
        world.reset_switches()

    def is_collision(self, agent1, agent2):
        delta_pos = agent1.state.p_pos - agent2.state.p_pos
        dist = np.sqrt(np.sum(np.square(delta_pos)))
        dist_min = agent1.size + agent2.size
        return True if dist < dist_min else False

    def reward(self, agent, world):
        # Agents are rewarded based on number of landmarks activated
        rew = 0
        if all([h.active for h in world.hills]):
            rew += 100
        else:
            # give bonus each time a hill is activated
            for hill in world.hills:
                if hill.activated_just_now:
                    rew += 50
        # penalise timesteps where nothing is happening
        if rew == 0:
            rew -= 0.1
        # add collision penalty
        if agent.collide:
            for a in world.agents:
                # note: this also counts collision with "itself", so gives -1 at every timestep
                # would be good to tune the reward function and use (not a == agent) here
                if self.is_collision(a, agent):
                    rew -= 1
        return rew

    def observation(self, agent, world):
        # get positions of all entities in this agent's reference frame
        entity_pos = []
        for entity in world.landmarks:  # world.entities:
            entity_pos.append(entity.state.p_pos - agent.state.p_pos)
        # entity colors
        entity_color = []
        for entity in world.landmarks:  # world.entities:
            entity_color.append(entity.color)
        # communication of all other agents
        comm = []
        other_pos = []
        for other in world.agents:
            if other is agent:
                continue
            comm.append(other.state.c)
            other_pos.append(other.state.p_pos - agent.state.p_pos)
        return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm)


class Hill(Landmark):
    """
    A hill that can be captured by an agent.
    To be captured, a team must occupy a hill for a fixed amount of time.
    """

    def __init__(self,
                 pos=None,
                 size=0.08,
                 capture_time=2
                 ):

        # Initialize Landmark super class
        super().__init__()
        self.movable = False
        self.collide = False
        self.state.p_pos = pos
        self.size = size

        # Set static configurations
        self.capture_time = capture_time

        # Initialize all hills to be inactive
        self.active = False
        self.color = np.array([0.5, 0.5, 0.5])
        self.capture_timer = 0

        self.activated_just_now = False

    def activate(self):
        self.active = True
        self.color = np.array([0.1, 0.1, 0.9])

    def deactivate(self):
        self.active = False
        self.color = np.array([0.5, 0.5, 0.5])

    def _is_occupied(self, agents):
        # a hill is occupied if an agent stands on it
        for agent in agents:
            dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
            if dist < agent.size + self.size:
                return True
        return False

    def step(self, world):

        self.activated_just_now = False

        # If hill isn't activated yet, check if an agent activates it
        # if (not self.active) and (world.switch.is_active()):
        if (not self.active):

            # Check if an agent is on the hill and all switches are active
            if (self._is_occupied(world.agents)) and all([switch.active for switch in world.switches]):
                self.capture_timer += 1

                # activate hill (this is irreversible)
                if self.capture_timer > self.capture_time:
                    self.activate()
                    self.activated_just_now = True

            # Reset capture timer if hill is not occupied
            else:
                self.capture_timer = 0


class Switch(Landmark):
    """
    A switch that can be activated by an agent.
    The agent has to stay on the switch for it to be active.
    """

    def __init__(self,
                 pos=None,
                 size=0.03,
                 ):

        # Initialize Landmark super class
        super().__init__()
        self.movable = False
        self.collide = False
        self.state.p_pos = pos
        self.size = size

        # Initialize all switches to be inactive
        self.active = False
        self.color = np.array([0.8, 0.05, 0.3])
        self.capture_timer = 0

    def activate(self):
        self.active = True
        self.color = np.array([0.1, 0.9, 0.4])

    def deactivate(self):
        self.active = False
        self.color = np.array([0.8, 0.05, 0.3])

    def _is_occupied(self, agents):
        # a switch is active if an agent stands on it
        for agent in agents:
            dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
            if dist < agent.size + self.size:
                return True
        return False

    def step(self, world):
        # check if an agent is on the switch and activate/deactivate accordingly
        if self._is_occupied(world.agents):
            self.activate()
        else:
            self.deactivate()


class SwitchExpertPolicy():
    """
    Hand-coded expert policy for the simple switch environment.
    Types of possible experts:
    - always go to the switch
    - always go to the hills
    """
    def __init__(self, dim_c, agent, world, expert_type=None, discrete_action_input=True):

        self.dim_c = dim_c
        self.discrete_action_input = discrete_action_input
        # the agent we control and world we're in
        self.agent = agent
        self.world = world

        if expert_type is None:
            self.expert_type = random.choice(['switch', 'hill'])
        else:
            self.expert_type = expert_type
        if self.expert_type == 'switch':
            self.target_switch = self.select_inital_target_switch()
        elif self.expert_type == 'hill':
            self.target_hill = self.select_inital_target_hill()
        else:
            raise NotImplementedError

        self.step_count = 0

    def select_inital_target_switch(self):
        return random.choice(self.world.switches)

    def select_inital_target_hill(self):
        return random.choice(self.world.hills)

    def action(self):

        # select a target!
        if self.expert_type == 'switch':
            # if agent is not already on a switch, choose target switch
            if not any([switch._is_occupied([self.agent]) for switch in self.world.switches]):
                # select a target switch if there's an inactive one
                inactive_switches = [switch for switch in self.world.switches if not switch.active]
                if len(inactive_switches) > 0 and (self.target_switch not in inactive_switches):
                    self.target_switch = random.choice(inactive_switches)
            target = self.target_switch.state.p_pos
        elif self.expert_type == 'hill':
            # select a target hill if we haven't done so yet, or the current target switch is inactive
            inactive_hills = [hill for hill in self.world.hills if not hill.active]
            if len(inactive_hills) > 0 and (self.target_hill not in inactive_hills):
                self.target_hill = random.choice(inactive_hills)
            target = self.target_hill.state.p_pos

        self.step_count += 1

        impulse = np.clip(target - self.agent.state.p_pos, -self.agent.u_range, self.agent.u_range)

        if self.discrete_action_input:
            u_idx = np.argmax(np.abs(impulse))
            if u_idx == 0 and impulse[u_idx] < 0:
                u = 1
            elif u_idx == 0 and impulse[u_idx] > 0:
                u = 2
            elif u_idx == 1 and impulse[u_idx] < 0:
                u = 3
            elif u_idx == 1 and impulse[u_idx] > 0:
                u = 4
            else:
                u = 0
        else:
            u = np.zeros(5)
            if (impulse[0] == impulse[1] == 0) \
                    or (self.step_count < self.burn_in) \
                    or (self.burn_step != 0 and self.step_count % self.burn_step != 0):
                u[0] = 0.1
            else:
                pass
                # u: noop (?), right, left, down, up
                if impulse[0] > 0:  # x-direction (- left/right + )
                    u[1] = impulse[0]  # right
                elif impulse[0] < 0:
                    u[2] = -impulse[0]
                if impulse[1] > 0:  # y-direction (- up/down + )
                    u[3] = impulse[1]
                elif impulse[1] < 0:
                    u[4] = -impulse[1]

        return u
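A brief sketch of driving the scenario above directly (no RLlib), assuming the patched `multiagent` package is installed; it builds the world, queries the reward and observation functions, and asks the hand-coded expert for one action.

```python
# Build the simple_switch world and inspect its reward/observation functions.
scenario = Scenario()
world = scenario.make_world()

for agent in world.agents:
    obs = scenario.observation(agent, world)
    rew = scenario.reward(agent, world)
    print(agent.name, obs.shape, rew)

# Hand-coded expert that always heads for a switch; with the default
# discrete_action_input=True it returns an action index (0 = noop,
# 1-4 = moves along the +/- x and y axes).
expert = SwitchExpertPolicy(world.dim_c, world.agents[0], world, expert_type='switch')
print('expert action:', expert.action())
```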
@@ -0,0 +1,82 @@
import argparse
import os
import re

from rllib_multiagent_particle_env import CUSTOM_SCENARIOS


def parse_args():
    parser = argparse.ArgumentParser('MADDPG with OpenAI MPE')

    # Environment
    parser.add_argument('--scenario', type=str, default='simple',
                        choices=['simple', 'simple_speaker_listener',
                                 'simple_crypto', 'simple_push',
                                 'simple_tag', 'simple_spread', 'simple_adversary'
                                 ] + CUSTOM_SCENARIOS,
                        help='name of the scenario script')
    parser.add_argument('--max-episode-len', type=int, default=25,
                        help='maximum episode length')
    parser.add_argument('--num-episodes', type=int, default=60000,
                        help='number of episodes')
    parser.add_argument('--num-adversaries', type=int, default=0,
                        help='number of adversaries')
    parser.add_argument('--good-policy', type=str, default='maddpg',
                        help='policy for good agents')
    parser.add_argument('--adv-policy', type=str, default='maddpg',
                        help='policy of adversaries')

    # Core training parameters
    parser.add_argument('--lr', type=float, default=1e-2,
                        help='learning rate for Adam optimizer')
    parser.add_argument('--gamma', type=float, default=0.95,
                        help='discount factor')
    # NOTE: 1 iteration = sample_batch_size * num_workers timesteps * num_envs_per_worker
    parser.add_argument('--sample-batch-size', type=int, default=25,
                        help='number of data points sampled /update /worker')
    parser.add_argument('--train-batch-size', type=int, default=1024,
                        help='number of data points /update')
    parser.add_argument('--n-step', type=int, default=1,
                        help='length of multistep value backup')
    parser.add_argument('--num-units', type=int, default=64,
                        help='number of units in the mlp')
    parser.add_argument('--final-reward', type=int, default=-400,
                        help='final reward after which to stop training')

    # Checkpoint
    parser.add_argument('--checkpoint-freq', type=int, default=200,
                        help='save model once every time this many iterations are completed')
    parser.add_argument('--local-dir', type=str, default='./logs',
                        help='path to save checkpoints')
    parser.add_argument('--restore', type=str, default=None,
                        help='directory in which training state and model are loaded')

    # Parallelism
    parser.add_argument('--num-workers', type=int, default=1)
    parser.add_argument('--num-envs-per-worker', type=int, default=4)
    parser.add_argument('--num-gpus', type=int, default=0)

    return parser.parse_args()


def find_final_checkpoint(start_dir):
    def find(pattern, path):
        result = []
        for root, _, files in os.walk(path):
            for name in files:
                if pattern.match(name):
                    result.append(os.path.join(root, name))
        return result

    cp_pattern = re.compile('.*checkpoint-\\d+$')
    checkpoint_files = find(cp_pattern, start_dir)

    checkpoint_numbers = []
    for file in checkpoint_files:
        checkpoint_numbers.append(int(file.split('-')[-1]))

    final_checkpoint_number = max(checkpoint_numbers)

    return next(
        checkpoint_file for checkpoint_file in checkpoint_files
        if checkpoint_file.endswith(str(final_checkpoint_number)))
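The helper above locates the last Ray Tune checkpoint of a finished run so it can be fed back through `--restore`. A minimal sketch, assuming checkpoints were written under the default `./logs` directory by the training script (the `train.py` name in the comment is illustrative).

```python
# Find the newest Tune checkpoint under the default --local-dir and resume from it.
latest_checkpoint = find_final_checkpoint('./logs')
print('restoring from', latest_checkpoint)

# Illustrative CLI invocation of the training script with that checkpoint:
#   python train.py --scenario simple_spread --restore "<path printed above>"
```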
Binary image file added (not shown, ~350 KiB).
@@ -0,0 +1,526 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||||
|
"\n",
|
||||||
|
"Licensed under the MIT License."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Reinforcement Learning in Azure Machine Learning - Training multiple agents on collaborative ParticleEnv tasks\n",
|
||||||
|
"\n",
|
||||||
|
"This tutorial will show you how to train policies in a multi-agent scenario.\n",
|
||||||
|
"We use OpenAI Gym's [Particle environments](https://github.com/openai/multiagent-particle-envs),\n",
|
||||||
|
"which model agents and landmarks in a two-dimensional world. Particle comes with\n",
|
||||||
|
"several predefined scenarios, both competitive and collaborative, and with or without communication.\n",
|
||||||
|
"\n",
|
||||||
|
"For this tutorial, we pick a cooperative navigation scenario where N agents are in a world with N\n",
|
||||||
|
"landmarks. The agents' goal is to cover all the landmarks without collisions,\n",
|
||||||
|
"so agents must learn to avoid each other (social distancing!). The video below shows training\n",
|
||||||
|
"results for N=3 agents/landmarks:\n",
|
||||||
|
"\n",
|
||||||
|
"<table style=\"width:50%\">\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th style=\"text-align: center;\">\n",
|
||||||
|
" <img src=\"./images/particle_simple_spread.gif\" alt=\"Particle video\" align=\"middle\" margin-left=\"auto\" margin-right=\"auto\"/>\n",
|
||||||
|
" </th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr style=\"text-align: center;\">\n",
|
||||||
|
" <th>Fig 1. Video of 3 agents covering 3 landmarks in a multiagent Particle scenario.</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"\n",
|
||||||
|
"The tutorial will cover the following steps:\n",
|
||||||
|
"- Initializing Azure Machine Learning resources for training\n",
|
||||||
|
"- Training policies in a multi-agent environment with Azure Machine Learning service\n",
|
||||||
|
"- Monitoring training progress\n",
|
||||||
|
"\n",
|
||||||
|
"## Prerequisites\n",
|
||||||
|
"\n",
|
||||||
|
"The user should have completed the Azure Machine Learning introductory tutorial. You will need to make sure that you have a valid subscription id, a resource group and a workspace. For detailed instructions see [Tutorial: Get started creating your first ML experiment](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup).\n",
|
||||||
|
"\n",
|
||||||
|
"Please ensure that you have a current version of IPython (>= 7.15) installed.\n",
|
||||||
|
"\n",
|
||||||
|
"While this is a standalone notebook, we highly recommend going over the introductory notebooks for RL first.\n",
|
||||||
|
"- Getting started:\n",
|
||||||
|
" - [RL using a compute instance with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb)\n",
|
||||||
|
" - [RL using Azure Machine Learning compute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb)\n",
|
||||||
|
"- [Scaling RL training runs with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"Advanced users might also be interested in [this notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/minecraft-on-distributed-compute/minecraft.ipynb) demonstrating how to train a Minecraft RL agent in Azure Machine Learning.\n",
|
||||||
|
"\n",
|
||||||
|
"## Initialize resources\n",
|
||||||
|
"\n",
|
||||||
|
"All required Azure Machine Learning service resources for this tutorial can be set up from Jupyter. This includes:\n",
|
||||||
|
"\n",
|
||||||
|
"- Connecting to your existing Azure Machine Learning workspace.\n",
|
||||||
|
"- Creating an experiment to track runs.\n",
|
||||||
|
"- Creating remote compute targets for [Ray](https://docs.ray.io/en/latest/index.html).\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"### Azure Machine Learning SDK\n",
|
||||||
|
"\n",
|
||||||
|
"Display the Azure Machine Learning SDK version."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import azureml.core\n",
|
||||||
|
"print('Azure Machine Learning SDK Version: ', azureml.core.VERSION)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Connect to workspace\n",
|
||||||
|
"\n",
|
||||||
|
"Get a reference to an existing Azure Machine Learning workspace."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.core import Workspace\n",
|
||||||
|
"\n",
|
||||||
|
"ws = Workspace.from_config()\n",
|
||||||
|
"print(ws.name, ws.location, ws.resource_group, sep=' | ')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Create an experiment\n",
|
||||||
|
"\n",
|
||||||
|
"Create an experiment to track the runs in your workspace. A\n",
|
||||||
|
"workspace can have multiple experiments and each experiment\n",
|
||||||
|
"can be used to track multiple runs (see [documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py)\n",
|
||||||
|
"for details)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from azureml.core import Experiment\n",
|
||||||
|
"\n",
|
||||||
|
"exp = Experiment(workspace=ws, name='particle-multiagent')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Create or attach an existing compute resource\n",
|
||||||
|
"\n",
|
||||||
|
"A compute target is a designated compute resource where you run your training script. For more information, see [What are compute targets in Azure Machine Learning service?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target).\n",
|
||||||
|
"\n",
|
||||||
|
"#### CPU target for Ray head\n",
|
||||||
|
"\n",
|
||||||
|
"In the experiment setup for this tutorial, the Ray head node will\n",
|
||||||
|
"run on a CPU node (D3 type). A maximum cluster size of 1 node is\n",
|
||||||
|
"therefore sufficient. If you wish to run multiple experiments in\n",
|
||||||
|
"parallel using the same CPU cluster, you may elect to increase this\n",
|
||||||
|
"number. The cluster will automatically scale down to 0 nodes when\n",
|
||||||
|
"no training jobs are scheduled (see min_nodes).\n",
|
||||||
|
"\n",
|
||||||
|
"The code below creates a compute cluster of D3 type nodes.\n",
|
||||||
|
"If the cluster with the specified name is already in your workspace\n",
|
||||||
|
"the code will skip the creation process.\n",
|
||||||
|
"\n",
|
||||||
|
"**Note: Creation of a compute resource can take several minutes**"
|
||||||
|
]
|
||||||
|
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
"\n",
"cpu_cluster_name = 'cpu-cl-d3'\n",
"\n",
"if cpu_cluster_name in ws.compute_targets:\n",
"    cpu_cluster = ws.compute_targets[cpu_cluster_name]\n",
"    if cpu_cluster and type(cpu_cluster) is AmlCompute:\n",
"        if cpu_cluster.provisioning_state == 'Succeeded':\n",
"            print('Found existing compute target for {}. Using it.'.format(cpu_cluster_name))\n",
"        else:\n",
"            raise Exception('Found existing compute target for {} '.format(cpu_cluster_name)\n",
"                            + 'but it is in state {}'.format(cpu_cluster.provisioning_state))\n",
"else:\n",
"    print('Creating a new compute target for {}...'.format(cpu_cluster_name))\n",
"    provisioning_config = AmlCompute.provisioning_configuration(\n",
"        vm_size='STANDARD_D3',\n",
"        min_nodes=0,\n",
"        max_nodes=1)\n",
"\n",
"    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
"    cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
"    print('Cluster created.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training the policies\n",
"\n",
"### Training environment\n",
"\n",
"This tutorial uses a custom Docker image\n",
"with the necessary software installed. The [Environment](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments)\n",
"class stores the configuration for the training environment. The\n",
"Docker image is set via `env.docker.base_image`.\n",
"`user_managed_dependencies` is set so that\n",
"the preinstalled Python packages in the image are preserved.\n",
"\n",
"Note that capturing videos of the training runs requires a display, so we set `interpreter_path` such that the Python process is started via **xvfb-run**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.core import Environment\n",
"\n",
"cpu_particle_env = Environment(name='particle-cpu')\n",
"\n",
"cpu_particle_env.docker.enabled = True\n",
"cpu_particle_env.docker.base_image = 'akdmsft/particle-cpu'\n",
"cpu_particle_env.python.interpreter_path = 'xvfb-run -s \"-screen 0 640x480x16 -ac +extension GLX +render\" python'\n",
"\n",
"max_train_time = os.environ.get('AML_MAX_TRAIN_TIME_SECONDS', 2 * 60 * 60)\n",
"cpu_particle_env.environment_variables['AML_MAX_TRAIN_TIME_SECONDS'] = str(max_train_time)\n",
"cpu_particle_env.python.user_managed_dependencies = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training script\n",
"\n",
"This tutorial uses the multiagent algorithm [Multi-Agent Deep Deterministic Policy Gradient (MADDPG)](https://docs.ray.io/en/latest/rllib-algorithms.html?highlight=maddpg#multi-agent-deep-deterministic-policy-gradient-contrib-maddpg).\n",
"For training policies in a multiagent scenario, Ray's RLlib also\n",
"requires the `multiagent` configuration section to be specified. You\n",
"can find more information in the [common parameters](https://docs.ray.io/en/latest/rllib-training.html?highlight=multiagent#common-parameters)\n",
"documentation.\n",
"\n",
"To monitor and understand the training progress, one\n",
"of the training environments is wrapped in a [Gym monitor](https://github.com/openai/gym/blob/master/gym/wrappers/monitor.py),\n",
"which periodically captures videos (by default, every 200 training\n",
"iterations).\n",
"\n",
"The stopping criteria are set such that the training run is\n",
"terminated after either a mean reward of -400 is observed or\n",
"training has run for more than 2 hours.\n",
"\n",
"### Submitting a training run\n",
"\n",
"Below, you create the training run using a `ReinforcementLearningEstimator`\n",
"object, which contains all the configuration parameters for this experiment:\n",
"\n",
"- `source_directory`: Contains the training script and helper files to be\n",
"  copied onto the node.\n",
"- `entry_script`: The training script, described in more detail above.\n",
"- `script_params`: The command-line arguments to pass to the entry script.\n",
"- `compute_target`: The compute target for training script execution.\n",
"- `environment`: The Azure Machine Learning environment definition for the node running the training.\n",
"- `max_run_duration_seconds`: The time after which to abort the run if it is still running.\n",
"\n",
"For more details, please take a look at the [online documentation](https://docs.microsoft.com/en-us/python/api/azureml-contrib-reinforcementlearning/?view=azure-ml-py)\n",
"for Azure Machine Learning service's reinforcement learning offering.\n",
"\n",
"Note that you can use the same notebook and scripts to experiment with\n",
"different Particle environments. You can find a list of supported\n",
"environments [here](https://github.com/openai/multiagent-particle-envs/tree/master#list-of-environments).\n",
"Simply change the `--scenario` parameter to a supported scenario.\n",
"\n",
"To get the best training results, you can also adjust the\n",
"`--final-reward` parameter, which determines when to stop training. A higher\n",
"target reward means a longer running time but better results. By default,\n",
"the final reward is -400, which should show good progress after\n",
"about one hour of run time.\n",
"\n",
"For this notebook, we use a single D3 node, giving us a total of 4 CPUs and\n",
"0 GPUs. One CPU is used by the MADDPG trainer, and an additional CPU is\n",
"consumed by the RLlib rollout worker. The other two CPUs are not used;\n",
"however, smaller node types run out of memory for this task.\n",
"\n",
"Lastly, the `RunDetails` widget displays information about the submitted RL\n",
"experiment, including a link to the Azure portal with more details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.contrib.train.rl import ReinforcementLearningEstimator\n",
"from azureml.widgets import RunDetails\n",
"\n",
"estimator = ReinforcementLearningEstimator(\n",
"    source_directory='files',\n",
"    entry_script='particle_train.py',\n",
"    script_params={\n",
"        '--scenario': 'simple_spread',\n",
"        '--final-reward': -400\n",
"    },\n",
"    compute_target=cpu_cluster,\n",
"    environment=cpu_particle_env,\n",
"    max_run_duration_seconds=3 * 60 * 60\n",
")\n",
"\n",
"train_run = exp.submit(config=estimator)\n",
"\n",
"RunDetails(train_run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you wish to cancel the run before it completes, uncomment and execute:\n",
"#train_run.cancel()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monitoring training progress\n",
"\n",
"### View the Tensorboard\n",
"\n",
"The Tensorboard can be displayed via the Azure Machine Learning\n",
"service's [Tensorboard API](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-monitor-tensorboard).\n",
"When running locally, please make sure to follow the instructions\n",
"in the linked article and install the required packages. Running this cell will output a URL for the Tensorboard.\n",
"\n",
"Note that the training script sets the log directory via the\n",
"`local_dir` parameter when starting RLlib. `./logs` automatically\n",
"appears in the downloadable files for a run. Since the script is\n",
"executed as part of the Ray head node's run, we need to get a reference to that child run,\n",
"as shown below.\n",
"\n",
"The Tensorboard API will continuously stream logs from the run.\n",
"\n",
"**Note: It may take a couple of minutes after the run enters the \"Running\"\n",
"state before Tensorboard files are available; the board will refresh automatically.**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from azureml.tensorboard import Tensorboard\n",
"\n",
"head_run = None\n",
"\n",
"timeout = 60\n",
"while timeout > 0 and head_run is None:\n",
"    timeout -= 1\n",
"\n",
"    try:\n",
"        head_run = next(r for r in train_run.get_children() if r.id.endswith('head'))\n",
"    except StopIteration:\n",
"        time.sleep(1)\n",
"\n",
"tb = Tensorboard([head_run])\n",
"tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View training videos\n",
"\n",
"As mentioned above, we record videos of the agents interacting with the\n",
"Particle world. These videos are often a crucial indicator of training\n",
"success. The code below downloads the latest video as it becomes available\n",
"and displays it inline.\n",
"\n",
"Over time, the agents learn to cooperate and avoid collisions while\n",
"traveling to all landmarks.\n",
"\n",
"**Note: It can take several minutes for a video to appear after the run\n",
"has started.**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tempfile\n",
"from azureml.core import Dataset\n",
"from azureml.data.dataset_error_handling import DatasetValidationError\n",
"\n",
"from IPython.display import clear_output\n",
"from IPython.core.display import display, Video\n",
"\n",
"datastore = ws.get_default_datastore()\n",
"path_prefix = './tmp_videos'\n",
"\n",
"def download_latest_training_video(run, video_checkpoint_counter):\n",
"    run_artifacts_path = os.path.join('azureml', run.id)\n",
"\n",
"    try:\n",
"        run_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))\n",
"    except DatasetValidationError as e:\n",
"        # This happens at the start of the run when there is no data available\n",
"        # in the run's artifacts\n",
"        return None, video_checkpoint_counter\n",
"\n",
"    video_files = [file for file in run_artifacts_ds.to_path() if file.endswith('.mp4')]\n",
"    if len(video_files) == video_checkpoint_counter:\n",
"        return None, video_checkpoint_counter\n",
"\n",
"    iteration_numbers = [int(vf[vf.rindex('video') + len('video') : vf.index('.mp4')]) for vf in video_files]\n",
"    latest_video = next(vf for vf in video_files if vf.endswith('{num}.mp4'.format(num=max(iteration_numbers))))\n",
"    latest_video = os.path.join(run_artifacts_path, os.path.normpath(latest_video[1:]))\n",
"\n",
"    datastore.download(\n",
"        target_path=path_prefix,\n",
"        prefix=latest_video.replace('\\\\', '/'),\n",
"        show_progress=False)\n",
"\n",
"    return os.path.join(path_prefix, latest_video), len(video_files)\n",
"\n",
"\n",
"def render_video(vf):\n",
"    clear_output(wait=True)\n",
"    display(Video(data=vf, embed=True, html_attributes='loop autoplay width=50%'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"terminal_statuses = ['Canceled', 'Completed', 'Failed']\n",
"video_checkpoint_counter = 0\n",
"\n",
"while head_run.get_status() not in terminal_statuses:\n",
"    video_file, video_checkpoint_counter = download_latest_training_video(head_run, video_checkpoint_counter)\n",
"    if video_file is not None:\n",
"        render_video(video_file)\n",
"\n",
"        print('Displaying video number {}'.format(video_checkpoint_counter))\n",
"        shutil.rmtree(path_prefix)\n",
"\n",
"    # Interrupting the kernel can take up to 15 seconds\n",
"    # depending on when time.sleep started\n",
"    time.sleep(15)\n",
"\n",
"train_run.wait_for_completion()\n",
"print('The training run has reached a terminal status.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning up\n",
"\n",
"Below are code snippets to clean up any resources created as part of this tutorial that you don't wish to retain."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# To stop the Tensorboard, uncomment and run:\n",
"#tb.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# To delete the CPU compute target, uncomment and run:\n",
"#cpu_cluster.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next steps\n",
"\n",
"We would love to hear your feedback! Please let us know what you think of Reinforcement Learning in Azure Machine Learning and what features you are looking forward to."
]
}
],
"metadata": {
"authors": [
{
"name": "andress"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License."
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -0,0 +1,9 @@
name: particle
dependencies:
  - pip:
    - azureml-sdk
    - azureml-contrib-reinforcementlearning
    - azureml-widgets
    - tensorboard
    - azureml-tensorboard
    - ipython
@@ -58,7 +58,7 @@
 "metadata": {},
 "source": [
 "### Get Azure Machine Learning workspace\n",
-"Get a reference to an existing Azure Machine Learning workspace. Please make sure to change `STANDARD_NC6` and `STANDARD_D2_V2` to [the ones available in your region](https://azure.microsoft.com/en-us/global-infrastructure/services/?products=virtual-machines).\n"
+"Get a reference to an existing Azure Machine Learning workspace.\n"
 ]
 },
 {
index.md
@@ -130,6 +130,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
 | [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | |
 | [cartpole_sc](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb) | | | | | | |
 | [minecraft](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/minecraft-on-distributed-compute/minecraft.ipynb) | | | | | | |
+| [particle](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/multiagent-particle-envs/particle.ipynb) | | | | | | |
 | [devenv_setup](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/setup/devenv_setup.ipynb) | | | | | | |
 | [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None |
 | [distributed-cntk-with-custom-docker](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/distributed-cntk-with-custom-docker/distributed-cntk-with-custom-docker.ipynb) | | | | | | |