mirror of https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-19 17:17:04 -05:00
update samples from Release-59 as a part of SDK release
@@ -35,6 +35,7 @@ Using these samples, you will learn how to do the following.

| [cartpole_sc.ipynb](cartpole-on-single-compute/cartpole_sc.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Cluster (single node) |
| [pong_rllib.ipynb](atari-on-distributed-compute/pong_rllib.ipynb) | Notebook for distributed training of Pong agent using RLlib on multiple compute targets |
| [minecraft.ipynb](minecraft-on-distributed-compute/minecraft.ipynb) | Notebook to train an agent to navigate through a lava maze in the Minecraft game |
| [particle.ipynb](multiagent-particle-envs/particle.ipynb) | Notebook to train policies in a multiagent cooperative navigation scenario based on OpenAI's Particle environments |

## Prerequisites
@@ -0,0 +1,60 @@
FROM mcr.microsoft.com/azureml/base:openmpi3.1.2-ubuntu18.04

# Install some basic utilities
RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
    sudo \
    cpio \
    git \
    bzip2 \
    libx11-6 \
    tmux \
    htop \
    gcc \
    xvfb \
    python-opengl \
    x11-xserver-utils \
    ffmpeg \
    mesa-utils \
    nano \
    vim \
    rsync \
 && rm -rf /var/lib/apt/lists/*

# Install python 3.7
RUN conda install python==3.7

# Create a working directory
RUN mkdir /app
WORKDIR /app

# Install required pip packages
RUN pip install --upgrade pip setuptools && pip install --upgrade \
    pandas \
    matplotlib \
    psutil \
    numpy \
    scipy \
    gym \
    azureml-defaults \
    tensorboardX \
    tensorflow==1.15 \
    tensorflow-probability==0.8.0 \
    onnxruntime \
    tf2onnx \
    cloudpickle==1.2.0 \
    tabulate \
    dm_tree \
    lz4 \
    opencv-python \
    ray==0.8.3 \
    ray[rllib]==0.8.3 \
    ray[tune]==0.8.3

# Install particle
RUN git clone https://github.com/openai/multiagent-particle-envs.git
COPY patch_files/* multiagent-particle-envs/multiagent/
RUN cd multiagent-particle-envs && \
    pip install -e . && \
    pip install --upgrade pyglet==1.3.2
@@ -0,0 +1,70 @@
# MIT License

# Copyright (c) 2018 OpenAI

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
import gym


class MultiDiscrete(gym.Space):
    """
    - The multi-discrete action space consists of a series of discrete action spaces with different
      parameters
    - It can be adapted to both a Discrete action space or a continuous (Box) action space
    - It is useful to represent game controllers or keyboards where each key can be represented as
      a discrete action space
    - It is parametrized by passing an array of arrays containing [min, max] for each discrete action
      space, where the discrete action space can take any integer from `min` to `max` (both inclusive)
    Note: A value of 0 always needs to represent the NOOP action.
    e.g. Nintendo Game Controller
    - Can be conceptualized as 3 discrete action spaces:
        1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
        2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
        3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
    - Can be initialized as
        MultiDiscrete([ [0,4], [0,1], [0,1] ])
    """
    def __init__(self, array_of_param_array):
        self.low = np.array([x[0] for x in array_of_param_array])
        self.high = np.array([x[1] for x in array_of_param_array])
        self.num_discrete_space = self.low.shape[0]

    def sample(self):
        """ Returns an array with one sample from each discrete action space """
        # For each row: round(random .* (max - min) + min, 0)
        # random_array = prng.np_random.rand(self.num_discrete_space)
        random_array = np.random.RandomState().rand(self.num_discrete_space)
        return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]

    def contains(self, x):
        return len(x) == self.num_discrete_space \
            and (np.array(x) >= self.low).all() \
            and (np.array(x) <= self.high).all()

    @property
    def shape(self):
        return self.num_discrete_space

    def __repr__(self):
        return "MultiDiscrete" + str(self.num_discrete_space)

    def __eq__(self, other):
        return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
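A minimal usage sketch of the space above (illustrative only; the variable names here are not part of the patched file):

# Nintendo-controller-like space: arrow keys plus two buttons, as in the docstring example
space = MultiDiscrete([[0, 4], [0, 1], [0, 1]])
action = space.sample()            # e.g. [2, 0, 1]; one integer per sub-space
assert space.contains(action)      # sampled actions always lie within [low, high]
print(space)                       # prints "MultiDiscrete3"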
@@ -0,0 +1,413 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2018 OpenAI
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
"""
|
||||
2D rendering framework
|
||||
"""
|
||||
from __future__ import division
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
from gym import error
|
||||
import math
|
||||
import numpy as np
|
||||
import pyglet
|
||||
|
||||
from pyglet.gl import glEnable, glHint, glLineWidth, glBlendFunc, glClearColor, glPushMatrix, \
|
||||
glTranslatef, glRotatef, glScalef, glPopMatrix, glColor4f, glBegin, glVertex3f, glEnd, glLineStipple, \
|
||||
glDisable, glVertex2f, GL_BLEND, GL_LINE_SMOOTH, GL_LINE_SMOOTH_HINT, GL_NICEST, GL_SRC_ALPHA, \
|
||||
GL_ONE_MINUS_SRC_ALPHA, GL_LINE_STIPPLE, GL_POINTS, GL_QUADS, GL_TRIANGLES, GL_POLYGON, GL_LINE_LOOP, \
|
||||
GL_LINE_STRIP, GL_LINES
|
||||
|
||||
|
||||
if "Apple" in sys.version:
|
||||
if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ:
|
||||
os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib'
|
||||
# (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite
|
||||
|
||||
|
||||
RAD2DEG = 57.29577951308232
|
||||
|
||||
|
||||
def get_display(spec):
|
||||
"""Convert a display specification (such as :0) into an actual Display
|
||||
object.
|
||||
|
||||
Pyglet only supports multiple Displays on Linux.
|
||||
"""
|
||||
if spec is None:
|
||||
return None
|
||||
elif isinstance(spec, six.string_types):
|
||||
return pyglet.canvas.Display(spec)
|
||||
else:
|
||||
raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec))
|
||||
|
||||
|
||||
class Viewer(object):
|
||||
def __init__(self, width, height, display=None):
|
||||
display = get_display(display)
|
||||
|
||||
self.width = width
|
||||
self.height = height
|
||||
|
||||
self.window = pyglet.window.Window(width=width, height=height, display=display)
|
||||
self.window.on_close = self.window_closed_by_user
|
||||
self.geoms = []
|
||||
self.onetime_geoms = []
|
||||
self.transform = Transform()
|
||||
|
||||
glEnable(GL_BLEND)
|
||||
# glEnable(GL_MULTISAMPLE)
|
||||
glEnable(GL_LINE_SMOOTH)
|
||||
# glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE)
|
||||
glHint(GL_LINE_SMOOTH_HINT, GL_NICEST)
|
||||
glLineWidth(2.0)
|
||||
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
|
||||
|
||||
def close(self):
|
||||
self.window.close()
|
||||
|
||||
def window_closed_by_user(self):
|
||||
self.close()
|
||||
|
||||
def set_bounds(self, left, right, bottom, top):
|
||||
assert right > left and top > bottom
|
||||
scalex = self.width / (right - left)
|
||||
scaley = self.height / (top - bottom)
|
||||
self.transform = Transform(
|
||||
translation=(-left * scalex, -bottom * scaley),
|
||||
scale=(scalex, scaley))
|
||||
|
||||
def add_geom(self, geom):
|
||||
self.geoms.append(geom)
|
||||
|
||||
def add_onetime(self, geom):
|
||||
self.onetime_geoms.append(geom)
|
||||
|
||||
def render(self, return_rgb_array=False):
|
||||
glClearColor(1, 1, 1, 1)
|
||||
self.window.clear()
|
||||
self.window.switch_to()
|
||||
self.window.dispatch_events()
|
||||
self.transform.enable()
|
||||
for geom in self.geoms:
|
||||
geom.render()
|
||||
for geom in self.onetime_geoms:
|
||||
geom.render()
|
||||
self.transform.disable()
|
||||
arr = None
|
||||
if return_rgb_array:
|
||||
buffer = pyglet.image.get_buffer_manager().get_color_buffer()
|
||||
image_data = buffer.get_image_data()
|
||||
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
|
||||
# In https://github.com/openai/gym-http-api/issues/2, we
|
||||
# discovered that someone using Xmonad on Arch was having
|
||||
# a window of size 598 x 398, though a 600 x 400 window
|
||||
# was requested. (Guess Xmonad was preserving a pixel for
|
||||
# the boundary.) So we use the buffer height/width rather
|
||||
# than the requested one.
|
||||
arr = arr.reshape(buffer.height, buffer.width, 4)
|
||||
arr = arr[::-1, :, 0:3]
|
||||
self.window.flip()
|
||||
self.onetime_geoms = []
|
||||
return arr
|
||||
|
||||
# Convenience
|
||||
def draw_circle(self, radius=10, res=30, filled=True, **attrs):
|
||||
geom = make_circle(radius=radius, res=res, filled=filled)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_polygon(self, v, filled=True, **attrs):
|
||||
geom = make_polygon(v=v, filled=filled)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_polyline(self, v, **attrs):
|
||||
geom = make_polyline(v=v)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_line(self, start, end, **attrs):
|
||||
geom = Line(start, end)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def get_array(self):
|
||||
self.window.flip()
|
||||
image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
|
||||
self.window.flip()
|
||||
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
|
||||
arr = arr.reshape(self.height, self.width, 4)
|
||||
return arr[::-1, :, 0:3]
|
||||
|
||||
|
||||
def _add_attrs(geom, attrs):
|
||||
if "color" in attrs:
|
||||
geom.set_color(*attrs["color"])
|
||||
if "linewidth" in attrs:
|
||||
geom.set_linewidth(attrs["linewidth"])
|
||||
|
||||
|
||||
class Geom(object):
|
||||
def __init__(self):
|
||||
self._color = Color((0, 0, 0, 1.0))
|
||||
self.attrs = [self._color]
|
||||
|
||||
def render(self):
|
||||
for attr in reversed(self.attrs):
|
||||
attr.enable()
|
||||
self.render1()
|
||||
for attr in self.attrs:
|
||||
attr.disable()
|
||||
|
||||
def render1(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def add_attr(self, attr):
|
||||
self.attrs.append(attr)
|
||||
|
||||
def set_color(self, r, g, b, alpha=1):
|
||||
self._color.vec4 = (r, g, b, alpha)
|
||||
|
||||
|
||||
class Attr(object):
|
||||
def enable(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def disable(self):
|
||||
pass
|
||||
|
||||
|
||||
class Transform(Attr):
|
||||
def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1, 1)):
|
||||
self.set_translation(*translation)
|
||||
self.set_rotation(rotation)
|
||||
self.set_scale(*scale)
|
||||
|
||||
def enable(self):
|
||||
glPushMatrix()
|
||||
        glTranslatef(self.translation[0], self.translation[1], 0)  # translate to GL location point
|
||||
glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
|
||||
glScalef(self.scale[0], self.scale[1], 1)
|
||||
|
||||
def disable(self):
|
||||
glPopMatrix()
|
||||
|
||||
def set_translation(self, newx, newy):
|
||||
self.translation = (float(newx), float(newy))
|
||||
|
||||
def set_rotation(self, new):
|
||||
self.rotation = float(new)
|
||||
|
||||
def set_scale(self, newx, newy):
|
||||
self.scale = (float(newx), float(newy))
|
||||
|
||||
|
||||
class Color(Attr):
|
||||
def __init__(self, vec4):
|
||||
self.vec4 = vec4
|
||||
|
||||
def enable(self):
|
||||
glColor4f(*self.vec4)
|
||||
|
||||
|
||||
class LineStyle(Attr):
|
||||
def __init__(self, style):
|
||||
self.style = style
|
||||
|
||||
def enable(self):
|
||||
glEnable(GL_LINE_STIPPLE)
|
||||
glLineStipple(1, self.style)
|
||||
|
||||
def disable(self):
|
||||
glDisable(GL_LINE_STIPPLE)
|
||||
|
||||
|
||||
class LineWidth(Attr):
|
||||
def __init__(self, stroke):
|
||||
self.stroke = stroke
|
||||
|
||||
def enable(self):
|
||||
glLineWidth(self.stroke)
|
||||
|
||||
|
||||
class Point(Geom):
|
||||
def __init__(self):
|
||||
Geom.__init__(self)
|
||||
|
||||
def render1(self):
|
||||
glBegin(GL_POINTS) # draw point
|
||||
glVertex3f(0.0, 0.0, 0.0)
|
||||
glEnd()
|
||||
|
||||
|
||||
class FilledPolygon(Geom):
|
||||
def __init__(self, v):
|
||||
Geom.__init__(self)
|
||||
self.v = v
|
||||
|
||||
def render1(self):
|
||||
if len(self.v) == 4:
|
||||
glBegin(GL_QUADS)
|
||||
elif len(self.v) > 4:
|
||||
glBegin(GL_POLYGON)
|
||||
else:
|
||||
glBegin(GL_TRIANGLES)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1], 0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
color = (
|
||||
self._color.vec4[0] * 0.5,
|
||||
self._color.vec4[1] * 0.5,
|
||||
self._color.vec4[2] * 0.5,
|
||||
self._color.vec4[3] * 0.5)
|
||||
glColor4f(*color)
|
||||
glBegin(GL_LINE_LOOP)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1], 0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
|
||||
def make_circle(radius=10, res=30, filled=True):
|
||||
points = []
|
||||
for i in range(res):
|
||||
ang = 2 * math.pi * i / res
|
||||
points.append((math.cos(ang) * radius, math.sin(ang) * radius))
|
||||
if filled:
|
||||
return FilledPolygon(points)
|
||||
else:
|
||||
return PolyLine(points, True)
|
||||
|
||||
|
||||
def make_polygon(v, filled=True):
|
||||
if filled:
|
||||
return FilledPolygon(v)
|
||||
else:
|
||||
return PolyLine(v, True)
|
||||
|
||||
|
||||
def make_polyline(v):
|
||||
return PolyLine(v, False)
|
||||
|
||||
|
||||
def make_capsule(length, width):
|
||||
l, r, t, b = 0, length, width / 2, -width / 2
|
||||
box = make_polygon([(l, b), (l, t), (r, t), (r, b)])
|
||||
circ0 = make_circle(width / 2)
|
||||
circ1 = make_circle(width / 2)
|
||||
circ1.add_attr(Transform(translation=(length, 0)))
|
||||
geom = Compound([box, circ0, circ1])
|
||||
return geom
|
||||
|
||||
|
||||
class Compound(Geom):
|
||||
def __init__(self, gs):
|
||||
Geom.__init__(self)
|
||||
self.gs = gs
|
||||
for g in self.gs:
|
||||
g.attrs = [a for a in g.attrs if not isinstance(a, Color)]
|
||||
|
||||
def render1(self):
|
||||
for g in self.gs:
|
||||
g.render()
|
||||
|
||||
|
||||
class PolyLine(Geom):
|
||||
def __init__(self, v, close):
|
||||
Geom.__init__(self)
|
||||
self.v = v
|
||||
self.close = close
|
||||
self.linewidth = LineWidth(1)
|
||||
self.add_attr(self.linewidth)
|
||||
|
||||
def render1(self):
|
||||
glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1], 0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
def set_linewidth(self, x):
|
||||
self.linewidth.stroke = x
|
||||
|
||||
|
||||
class Line(Geom):
|
||||
def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
|
||||
Geom.__init__(self)
|
||||
self.start = start
|
||||
self.end = end
|
||||
self.linewidth = LineWidth(1)
|
||||
self.add_attr(self.linewidth)
|
||||
|
||||
def render1(self):
|
||||
glBegin(GL_LINES)
|
||||
glVertex2f(*self.start)
|
||||
glVertex2f(*self.end)
|
||||
glEnd()
|
||||
|
||||
|
||||
class Image(Geom):
|
||||
def __init__(self, fname, width, height):
|
||||
Geom.__init__(self)
|
||||
self.width = width
|
||||
self.height = height
|
||||
img = pyglet.image.load(fname)
|
||||
self.img = img
|
||||
self.flip = False
|
||||
|
||||
def render1(self):
|
||||
self.img.blit(-self.width / 2, -self.height / 2, width=self.width, height=self.height)
|
||||
|
||||
|
||||
class SimpleImageViewer(object):
|
||||
def __init__(self, display=None):
|
||||
self.window = None
|
||||
self.isopen = False
|
||||
self.display = display
|
||||
|
||||
def imshow(self, arr):
|
||||
if self.window is None:
|
||||
height, width, channels = arr.shape
|
||||
self.window = pyglet.window.Window(width=width, height=height, display=self.display)
|
||||
self.width = width
|
||||
self.height = height
|
||||
self.isopen = True
|
||||
        assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong shape"
|
||||
image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
|
||||
self.window.clear()
|
||||
self.window.switch_to()
|
||||
self.window.dispatch_events()
|
||||
image.blit(0, 0)
|
||||
self.window.flip()
|
||||
|
||||
def close(self):
|
||||
if self.isopen:
|
||||
self.window.close()
|
||||
self.isopen = False
|
||||
|
||||
def __del__(self):
|
||||
self.close()
|
||||
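A minimal sketch of how this rendering module can be driven (illustrative only; it needs an X display, which the training image provides via xvfb-run):

# Draw one frame and grab it as an RGB array
viewer = Viewer(width=600, height=400)
viewer.set_bounds(-1, 1, -1, 1)                            # map world coordinates to the window
viewer.draw_circle(radius=0.1, color=(0.35, 0.35, 0.85))   # one-time geometry for this frame
frame = viewer.render(return_rgb_array=True)               # numpy array of shape (height, width, 3)
viewer.close()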
@@ -0,0 +1,124 @@
import argparse
import re
import os

import ray
from ray.tune import run_experiments
from ray.tune.registry import register_trainable, register_env, get_trainable_cls
import ray.rllib.contrib.maddpg.maddpg as maddpg

from rllib_multiagent_particle_env import env_creator
from util import parse_args


def setup_ray():
    ray.init(address='auto')

    register_env('particle', env_creator)


def gen_policy(args, env, id):
    use_local_critic = [
        args.adv_policy == 'ddpg' if id < args.num_adversaries else
        args.good_policy == 'ddpg' for id in range(env.num_agents)
    ]
    return (
        None,
        env.observation_space_dict[id],
        env.action_space_dict[id],
        {
            'agent_id': id,
            'use_local_critic': use_local_critic[id],
            'obs_space_dict': env.observation_space_dict,
            'act_space_dict': env.action_space_dict,
        }
    )


def gen_policies(args, env_config):
    env = env_creator(env_config)
    return {'policy_%d' % i: gen_policy(args, env, i) for i in range(len(env.observation_space_dict))}


def to_multiagent_config(policies):
    policy_ids = list(policies.keys())
    return {
        'policies': policies,
        'policy_mapping_fn': lambda index: policy_ids[index]
    }


def train(args, env_config):
    def stop(trial_id, result):
        max_train_time = int(os.environ.get('AML_MAX_TRAIN_TIME_SECONDS', 2 * 60 * 60))

        return result['episode_reward_mean'] >= args.final_reward \
            or result['time_total_s'] >= max_train_time

    run_experiments({
        'MADDPG_RLLib': {
            'run': 'contrib/MADDPG',
            'env': 'particle',
            'stop': stop,
            # Uncomment to enable more frequent checkpoints:
            # 'checkpoint_freq': args.checkpoint_freq,
            'checkpoint_at_end': True,
            'local_dir': args.local_dir,
            'restore': args.restore,
            'config': {
                # === Log ===
                'log_level': 'ERROR',

                # === Environment ===
                'env_config': env_config,
                'num_envs_per_worker': args.num_envs_per_worker,
                'horizon': args.max_episode_len,

                # === Policy Config ===
                # --- Model ---
                'good_policy': args.good_policy,
                'adv_policy': args.adv_policy,
                'actor_hiddens': [args.num_units] * 2,
                'actor_hidden_activation': 'relu',
                'critic_hiddens': [args.num_units] * 2,
                'critic_hidden_activation': 'relu',
                'n_step': args.n_step,
                'gamma': args.gamma,

                # --- Exploration ---
                'tau': 0.01,

                # --- Replay buffer ---
                'buffer_size': int(1e6),

                # --- Optimization ---
                'actor_lr': args.lr,
                'critic_lr': args.lr,
                'learning_starts': args.train_batch_size * args.max_episode_len,
                'sample_batch_size': args.sample_batch_size,
                'train_batch_size': args.train_batch_size,
                'batch_mode': 'truncate_episodes',

                # --- Parallelism ---
                'num_workers': args.num_workers,
                'num_gpus': args.num_gpus,
                'num_gpus_per_worker': 0,

                # === Multi-agent setting ===
                'multiagent': to_multiagent_config(gen_policies(args, env_config)),
            },
        },
    }, verbose=1)


if __name__ == '__main__':
    args = parse_args()
    setup_ray()

    env_config = {
        'scenario_name': args.scenario,
        'horizon': args.max_episode_len,
        'video_frequency': args.checkpoint_freq,
    }

    train(args, env_config)
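For reference, a sketch (assumed shapes, not produced by an actual run) of the 'multiagent' section that to_multiagent_config(gen_policies(...)) builds for a three-agent scenario:

# {
#     'policies': {
#         'policy_0': (None, obs_space_0, act_space_0, {'agent_id': 0, 'use_local_critic': False, ...}),
#         'policy_1': (None, obs_space_1, act_space_1, {'agent_id': 1, 'use_local_critic': False, ...}),
#         'policy_2': (None, obs_space_2, act_space_2, {'agent_id': 2, 'use_local_critic': False, ...}),
#     },
#     # agent index i is mapped to 'policy_%d' % i
#     'policy_mapping_fn': lambda index: policy_ids[index],
# }
# 'use_local_critic' is False for agents trained with MADDPG (the default) and True for DDPG agents.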
@@ -0,0 +1,113 @@
# Some code taken from: https://github.com/wsjeon/maddpg-rllib/

import imp
import os

import gym
from gym import wrappers
from ray import rllib

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios


CUSTOM_SCENARIOS = ['simple_switch']


class ParticleEnvRenderWrapper(gym.Wrapper):
    def __init__(self, env, horizon):
        super().__init__(env)
        self.horizon = horizon

    def reset(self):
        self._num_steps = 0

        return self.env.reset()

    def render(self, mode):
        if mode == 'human':
            self.env.render(mode=mode)
        else:
            return self.env.render(mode=mode)[0]

    def step(self, actions):
        obs_list, rew_list, done_list, info_list = self.env.step(actions)

        self._num_steps += 1
        done = (all(done_list) or self._num_steps >= self.horizon)

        # Gym monitor expects reward to be an int. This is only used for its
        # stats reporter, which we're not interested in. To make video recording
        # work, we package the rewards in the info object and extract it below.
        return obs_list, 0, done, [rew_list, done_list, info_list]


class RLlibMultiAgentParticleEnv(rllib.MultiAgentEnv):
    def __init__(self, scenario_name, horizon, monitor_enabled=False, video_frequency=500):
        self._env = _make_env(scenario_name, horizon, monitor_enabled, video_frequency)
        self.num_agents = self._env.n
        self.agent_ids = list(range(self.num_agents))

        self.observation_space_dict = self._make_dict(self._env.observation_space)
        self.action_space_dict = self._make_dict(self._env.action_space)

    def reset(self):
        obs_dict = self._make_dict(self._env.reset())
        return obs_dict

    def step(self, action_dict):
        actions = list(action_dict.values())
        obs_list, _, _, infos = self._env.step(actions)
        rew_list, done_list, _ = infos

        obs_dict = self._make_dict(obs_list)
        rew_dict = self._make_dict(rew_list)
        done_dict = self._make_dict(done_list)
        done_dict['__all__'] = all(done_list)
        info_dict = self._make_dict([{'done': done} for done in done_list])

        return obs_dict, rew_dict, done_dict, info_dict

    def render(self, mode='human'):
        self._env.render(mode=mode)

    def _make_dict(self, values):
        return dict(zip(self.agent_ids, values))


def _video_callable(video_frequency):
    def should_record_video(episode_id):
        if episode_id % video_frequency == 0:
            return True
        return False

    return should_record_video


def _make_env(scenario_name, horizon, monitor_enabled, video_frequency):
    if scenario_name in CUSTOM_SCENARIOS:
        # Scenario file must exist locally
        file_path = os.path.join(os.path.dirname(__file__), scenario_name + '.py')
        scenario = imp.load_source('', file_path).Scenario()
    else:
        scenario = scenarios.load(scenario_name + '.py').Scenario()

    world = scenario.make_world()

    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    env.metadata['video.frames_per_second'] = 8

    env = ParticleEnvRenderWrapper(env, horizon)

    if not monitor_enabled:
        return env

    return wrappers.Monitor(env, './logs/videos', resume=True, video_callable=_video_callable(video_frequency))


def env_creator(config):
    monitor_enabled = False
    if hasattr(config, 'worker_index') and hasattr(config, 'vector_index'):
        monitor_enabled = (config.worker_index == 1 and config.vector_index == 0)

    return RLlibMultiAgentParticleEnv(**config, monitor_enabled=monitor_enabled)
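For orientation, a sketch (hypothetical numbers) of the per-agent dictionaries step() returns for a two-agent scenario, following RLlib's MultiAgentEnv convention:

# obs_dict  = {0: obs_for_agent_0, 1: obs_for_agent_1}
# rew_dict  = {0: -1.3, 1: -0.7}
# done_dict = {0: False, 1: False, '__all__': False}    # '__all__' tells RLlib when the episode ends
# info_dict = {0: {'done': False}, 1: {'done': False}}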
@@ -0,0 +1,358 @@
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
from multiagent.core import World, Agent, Landmark
|
||||
from multiagent.scenario import BaseScenario
|
||||
|
||||
|
||||
class SwitchWorld(World):
|
||||
""" Extended World with hills and switches """
|
||||
def __init__(self, hills, switches):
|
||||
super().__init__()
|
||||
# add hills and switches
|
||||
self.hills = hills
|
||||
self.switches = switches
|
||||
self.landmarks.extend(self.hills)
|
||||
self.landmarks.extend(self.switches)
|
||||
|
||||
def step(self):
|
||||
|
||||
super().step()
|
||||
|
||||
# if all hills are activated, reset the switches and hills
|
||||
if all([hill.active for hill in self.hills]):
|
||||
self.reset_hills()
|
||||
self.reset_switches()
|
||||
else:
|
||||
# Update switches
|
||||
for switch in self.switches:
|
||||
switch.step(self)
|
||||
# Update hills
|
||||
for hill in self.hills:
|
||||
hill.step(self)
|
||||
|
||||
def reset_hills(self):
|
||||
possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
|
||||
hill_positions = random.sample(possible_hill_positions, k=len(self.hills))
|
||||
for i, hill in enumerate(self.hills):
|
||||
hill.state.p_pos = hill_positions[i]
|
||||
hill.deactivate()
|
||||
|
||||
def reset_switches(self):
|
||||
possible_switch_positions = [
|
||||
np.array([-0.8, -0.8]),
|
||||
np.array([-0.8, 0.8]),
|
||||
np.array([0.8, -0.8]),
|
||||
np.array([0.8, 0.8])]
|
||||
switch_positions = random.sample(possible_switch_positions, k=len(self.switches))
|
||||
for i, switch in enumerate(self.switches):
|
||||
switch.state.p_pos = switch_positions[i]
|
||||
switch.deactivate()
|
||||
|
||||
|
||||
class Scenario(BaseScenario):
|
||||
def make_world(self):
|
||||
|
||||
# main configurations
|
||||
num_agents = 2
|
||||
num_hills = 2
|
||||
num_switches = 1
|
||||
self.max_episode_length = 100
|
||||
|
||||
# create hills (on edges)
|
||||
possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
|
||||
hill_positions = random.sample(possible_hill_positions, k=num_hills)
|
||||
hills = [Hill(hill_positions[i]) for i in range(num_hills)]
|
||||
# create switches (in corners)
|
||||
possible_switch_positions = [
|
||||
np.array([-0.8, -0.8]),
|
||||
np.array([-0.8, 0.8]),
|
||||
np.array([0.8, -0.8]),
|
||||
np.array([0.8, 0.8])]
|
||||
switch_positions = random.sample(possible_switch_positions, k=num_switches)
|
||||
switches = [Switch(switch_positions[i]) for i in range(num_switches)]
|
||||
|
||||
# make world and set basic properties
|
||||
world = SwitchWorld(hills, switches)
|
||||
world.dim_c = 2
|
||||
world.collaborative = True
|
||||
|
||||
# add agents
|
||||
world.agents = [Agent() for i in range(num_agents)]
|
||||
for i, agent in enumerate(world.agents):
|
||||
agent.name = 'agent %d' % i
|
||||
agent.collide = True
|
||||
agent.silent = True
|
||||
agent.size = 0.1
|
||||
agent.accel = 5.0
|
||||
agent.max_speed = 5.0
|
||||
if i == 0:
|
||||
agent.color = np.array([0.35, 0.35, 0.85])
|
||||
else:
|
||||
agent.color = np.array([0.35, 0.85, 0.85])
|
||||
|
||||
# make initial conditions
|
||||
self.reset_world(world)
|
||||
|
||||
return world
|
||||
|
||||
def reset_world(self, world):
|
||||
# set random initial states
|
||||
for agent in world.agents:
|
||||
agent.state.p_pos = np.array([random.uniform(-1, +1) for _ in range(world.dim_p)])
|
||||
agent.state.p_vel = np.zeros(world.dim_p)
|
||||
agent.state.c = np.zeros(world.dim_c)
|
||||
# set hills randomly
|
||||
world.reset_hills()
|
||||
# set switches randomly
|
||||
world.reset_switches()
|
||||
|
||||
def is_collision(self, agent1, agent2):
|
||||
delta_pos = agent1.state.p_pos - agent2.state.p_pos
|
||||
dist = np.sqrt(np.sum(np.square(delta_pos)))
|
||||
dist_min = agent1.size + agent2.size
|
||||
return True if dist < dist_min else False
|
||||
|
||||
def reward(self, agent, world):
|
||||
# Agents are rewarded based on number of landmarks activated
|
||||
rew = 0
|
||||
if all([h.active for h in world.hills]):
|
||||
rew += 100
|
||||
else:
|
||||
# give bonus each time a hill is activated
|
||||
for hill in world.hills:
|
||||
if hill.activated_just_now:
|
||||
rew += 50
|
||||
# penalise timesteps where nothing is happening
|
||||
if rew == 0:
|
||||
rew -= 0.1
|
||||
# add collision penalty
|
||||
if agent.collide:
|
||||
for a in world.agents:
|
||||
# note: this also counts collision with "itself", so gives -1 at every timestep
|
||||
# would be good to tune the reward function and use (not a == agent) here
|
||||
if self.is_collision(a, agent):
|
||||
rew -= 1
|
||||
return rew
|
||||
|
||||
def observation(self, agent, world):
|
||||
# get positions of all entities in this agent's reference frame
|
||||
entity_pos = []
|
||||
for entity in world.landmarks: # world.entities:
|
||||
entity_pos.append(entity.state.p_pos - agent.state.p_pos)
|
||||
# entity colors
|
||||
entity_color = []
|
||||
for entity in world.landmarks: # world.entities:
|
||||
entity_color.append(entity.color)
|
||||
# communication of all other agents
|
||||
comm = []
|
||||
other_pos = []
|
||||
for other in world.agents:
|
||||
if other is agent:
|
||||
continue
|
||||
comm.append(other.state.c)
|
||||
other_pos.append(other.state.p_pos - agent.state.p_pos)
|
||||
return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm)
|
||||
|
||||
|
||||
class Hill(Landmark):
|
||||
"""
|
||||
A hill that can be captured by an agent.
|
||||
To be captured, a team must occupy a hill for a fixed amount of time.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
pos=None,
|
||||
size=0.08,
|
||||
capture_time=2
|
||||
):
|
||||
|
||||
# Initialize Landmark super class
|
||||
super().__init__()
|
||||
self.movable = False
|
||||
self.collide = False
|
||||
self.state.p_pos = pos
|
||||
self.size = size
|
||||
|
||||
# Set static configurations
|
||||
self.capture_time = capture_time
|
||||
|
||||
# Initialize all hills to be inactive
|
||||
self.active = False
|
||||
self.color = np.array([0.5, 0.5, 0.5])
|
||||
self.capture_timer = 0
|
||||
|
||||
self.activated_just_now = False
|
||||
|
||||
def activate(self):
|
||||
self.active = True
|
||||
self.color = np.array([0.1, 0.1, 0.9])
|
||||
|
||||
def deactivate(self):
|
||||
self.active = False
|
||||
self.color = np.array([0.5, 0.5, 0.5])
|
||||
|
||||
def _is_occupied(self, agents):
|
||||
# a hill is occupied if an agent stands on it
|
||||
for agent in agents:
|
||||
dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
|
||||
if dist < agent.size + self.size:
|
||||
return True
|
||||
return False
|
||||
|
||||
def step(self, world):
|
||||
|
||||
self.activated_just_now = False
|
||||
|
||||
# If hill isn't activated yet, check if an agent activates it
|
||||
# if (not self.active) and (world.switch.is_active()):
|
||||
if (not self.active):
|
||||
|
||||
# Check if an agent is on the hill and all switches are active
|
||||
if (self._is_occupied(world.agents)) and all([switch.active for switch in world.switches]):
|
||||
self.capture_timer += 1
|
||||
|
||||
# activate hill (this is irreversible)
|
||||
if self.capture_timer > self.capture_time:
|
||||
self.activate()
|
||||
self.activated_just_now = True
|
||||
|
||||
# Reset capture timer if hill is not occupied
|
||||
else:
|
||||
self.capture_timer = 0
|
||||
|
||||
|
||||
class Switch(Landmark):
|
||||
"""
|
||||
A switch that can be activated by an agent.
|
||||
The agent has to stay on the switch for it to be active.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
pos=None,
|
||||
size=0.03,
|
||||
):
|
||||
|
||||
# Initialize Landmark super class
|
||||
super().__init__()
|
||||
self.movable = False
|
||||
self.collide = False
|
||||
self.state.p_pos = pos
|
||||
self.size = size
|
||||
|
||||
# Initialize all hills to be inactive
|
||||
self.active = False
|
||||
self.color = np.array([0.8, 0.05, 0.3])
|
||||
self.capture_timer = 0
|
||||
|
||||
def activate(self):
|
||||
self.active = True
|
||||
self.color = np.array([0.1, 0.9, 0.4])
|
||||
|
||||
def deactivate(self):
|
||||
self.active = False
|
||||
self.color = np.array([0.8, 0.05, 0.3])
|
||||
|
||||
def _is_occupied(self, agents):
|
||||
# a switch is active if an agent stands on it
|
||||
for agent in agents:
|
||||
dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
|
||||
if dist < agent.size + self.size:
|
||||
return True
|
||||
return False
|
||||
|
||||
def step(self, world):
|
||||
        # check if an agent is on the switch and activate/deactivate accordingly
|
||||
if self._is_occupied(world.agents):
|
||||
self.activate()
|
||||
else:
|
||||
self.deactivate()
|
||||
|
||||
|
||||
class SwitchExpertPolicy():
|
||||
"""
|
||||
Hand-coded expert policy for the simple switch environment.
|
||||
Types of possible experts:
|
||||
- always go to the switch
|
||||
- always go to the hills
|
||||
"""
|
||||
def __init__(self, dim_c, agent, world, expert_type=None, discrete_action_input=True):
|
||||
|
||||
self.dim_c = dim_c
|
||||
self.discrete_action_input = discrete_action_input
|
||||
# the agent we control and world we're in
|
||||
self.agent = agent
|
||||
self.world = world
|
||||
|
||||
if expert_type is None:
|
||||
self.expert_type = random.choice(['switch', 'hill'])
|
||||
else:
|
||||
self.expert_type = expert_type
|
||||
if self.expert_type == 'switch':
|
||||
            self.target_switch = self.select_initial_target_switch()
|
||||
elif self.expert_type == 'hill':
|
||||
            self.target_hill = self.select_initial_target_hill()
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
        self.step_count = 0
        # Note: the continuous-action branch of action() below references burn-in settings
        # that were never initialized; default them to 0 (no burn-in) so that branch does
        # not raise AttributeError. These defaults are an assumption, not part of the
        # original expert behaviour.
        self.burn_in = 0
        self.burn_step = 0
|
||||
|
||||
    def select_initial_target_switch(self):
|
||||
return random.choice(self.world.switches)
|
||||
|
||||
    def select_initial_target_hill(self):
|
||||
return random.choice(self.world.hills)
|
||||
|
||||
def action(self):
|
||||
|
||||
# select a target!
|
||||
if self.expert_type == 'switch':
|
||||
# if agent is not already on a switch, choose target switch
|
||||
if not any([switch._is_occupied([self.agent]) for switch in self.world.switches]):
|
||||
# select a target switch if there's an inactive one
|
||||
inactive_switches = [switch for switch in self.world.switches if not switch.active]
|
||||
if len(inactive_switches) > 0 and (self.target_switch not in inactive_switches):
|
||||
self.target_switch = random.choice(inactive_switches)
|
||||
target = self.target_switch.state.p_pos
|
||||
elif self.expert_type == 'hill':
|
||||
            # select a new target hill if the current one is no longer inactive (i.e. it has been activated)
|
||||
inactive_hills = [hill for hill in self.world.hills if not hill.active]
|
||||
if len(inactive_hills) > 0 and (self.target_hill not in inactive_hills):
|
||||
self.target_hill = random.choice(inactive_hills)
|
||||
target = self.target_hill.state.p_pos
|
||||
|
||||
self.step_count += 1
|
||||
|
||||
impulse = np.clip(target - self.agent.state.p_pos, -self.agent.u_range, self.agent.u_range)
|
||||
|
||||
if self.discrete_action_input:
|
||||
u_idx = np.argmax(np.abs(impulse))
|
||||
if u_idx == 0 and impulse[u_idx] < 0:
|
||||
u = 1
|
||||
elif u_idx == 0 and impulse[u_idx] > 0:
|
||||
u = 2
|
||||
elif u_idx == 1 and impulse[u_idx] < 0:
|
||||
u = 3
|
||||
elif u_idx == 1 and impulse[u_idx] > 0:
|
||||
u = 4
|
||||
else:
|
||||
u = 0
|
||||
else:
|
||||
u = np.zeros(5)
|
||||
if (impulse[0] == impulse[1] == 0) \
|
||||
or (self.step_count < self.burn_in) \
|
||||
or (self.burn_step != 0 and self.step_count % self.burn_step != 0):
|
||||
u[0] = 0.1
|
||||
else:
|
||||
pass
|
||||
# u: noop (?), right, left, down, up
|
||||
if impulse[0] > 0: # x-direction (- left/right + )
|
||||
u[1] = impulse[0] # right
|
||||
elif impulse[0] < 0:
|
||||
u[2] = -impulse[0]
|
||||
if impulse[1] > 0: # y-direction (- up/down + )
|
||||
u[3] = impulse[1]
|
||||
elif impulse[1] < 0:
|
||||
u[4] = -impulse[1]
|
||||
|
||||
return u
|
||||
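A short sketch of how this expert policy could be exercised (illustrative only; not part of the patch):

# Build the world from the scenario above and query one expert action
scenario = Scenario()
world = scenario.make_world()
expert = SwitchExpertPolicy(dim_c=world.dim_c, agent=world.agents[0], world=world,
                            expert_type='hill')
u = expert.action()    # with discrete_action_input=True this is an integer movement action (0 = noop)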
@@ -0,0 +1,82 @@
import argparse
import os
import re

from rllib_multiagent_particle_env import CUSTOM_SCENARIOS


def parse_args():
    parser = argparse.ArgumentParser('MADDPG with OpenAI MPE')

    # Environment
    parser.add_argument('--scenario', type=str, default='simple',
                        choices=['simple', 'simple_speaker_listener',
                                 'simple_crypto', 'simple_push',
                                 'simple_tag', 'simple_spread', 'simple_adversary'
                                 ] + CUSTOM_SCENARIOS,
                        help='name of the scenario script')
    parser.add_argument('--max-episode-len', type=int, default=25,
                        help='maximum episode length')
    parser.add_argument('--num-episodes', type=int, default=60000,
                        help='number of episodes')
    parser.add_argument('--num-adversaries', type=int, default=0,
                        help='number of adversaries')
    parser.add_argument('--good-policy', type=str, default='maddpg',
                        help='policy for good agents')
    parser.add_argument('--adv-policy', type=str, default='maddpg',
                        help='policy of adversaries')

    # Core training parameters
    parser.add_argument('--lr', type=float, default=1e-2,
                        help='learning rate for Adam optimizer')
    parser.add_argument('--gamma', type=float, default=0.95,
                        help='discount factor')
    # NOTE: 1 iteration = sample_batch_size * num_workers * num_envs_per_worker timesteps
    parser.add_argument('--sample-batch-size', type=int, default=25,
                        help='number of data points sampled /update /worker')
    parser.add_argument('--train-batch-size', type=int, default=1024,
                        help='number of data points /update')
    parser.add_argument('--n-step', type=int, default=1,
                        help='length of multistep value backup')
    parser.add_argument('--num-units', type=int, default=64,
                        help='number of units in the mlp')
    parser.add_argument('--final-reward', type=int, default=-400,
                        help='final reward after which to stop training')

    # Checkpoint
    parser.add_argument('--checkpoint-freq', type=int, default=200,
                        help='save model once every time this many iterations are completed')
    parser.add_argument('--local-dir', type=str, default='./logs',
                        help='path to save checkpoints')
    parser.add_argument('--restore', type=str, default=None,
                        help='directory in which training state and model are loaded')

    # Parallelism
    parser.add_argument('--num-workers', type=int, default=1)
    parser.add_argument('--num-envs-per-worker', type=int, default=4)
    parser.add_argument('--num-gpus', type=int, default=0)

    return parser.parse_args()


def find_final_checkpoint(start_dir):
    def find(pattern, path):
        result = []
        for root, _, files in os.walk(path):
            for name in files:
                if pattern.match(name):
                    result.append(os.path.join(root, name))
        return result

    cp_pattern = re.compile('.*checkpoint-\\d+$')
    checkpoint_files = find(cp_pattern, start_dir)

    checkpoint_numbers = []
    for file in checkpoint_files:
        checkpoint_numbers.append(int(file.split('-')[-1]))

    final_checkpoint_number = max(checkpoint_numbers)

    return next(
        checkpoint_file for checkpoint_file in checkpoint_files
        if checkpoint_file.endswith(str(final_checkpoint_number)))
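To make the NOTE in parse_args concrete, a worked example using the defaults above (an estimate, following that note):

# one training iteration covers roughly
#     sample_batch_size * num_workers * num_envs_per_worker = 25 * 1 * 4 = 100 environment timesteps,
# i.e. about four episodes at the default --max-episode-len of 25.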
Binary file not shown. (image added; size: 350 KiB)
@@ -0,0 +1,526 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Reinforcement Learning in Azure Machine Learning - Training multiple agents on collaborative ParticleEnv tasks\n",
|
||||
"\n",
|
||||
"This tutorial will show you how to train policies in a multi-agent scenario.\n",
|
||||
"We use OpenAI Gym's [Particle environments](https://github.com/openai/multiagent-particle-envs),\n",
|
||||
"which model agents and landmarks in a two-dimensional world. Particle comes with\n",
|
||||
"several predefined scenarios, both competitive and collaborative, and with or without communication.\n",
|
||||
"\n",
|
||||
"For this tutorial, we pick a cooperative navigation scenario where N agents are in a world with N\n",
|
||||
"landmarks. The agents' goal is to cover all the landmarks without collisions,\n",
|
||||
"so agents must learn to avoid each other (social distancing!). The video below shows training\n",
|
||||
"results for N=3 agents/landmarks:\n",
|
||||
"\n",
|
||||
"<table style=\"width:50%\">\n",
|
||||
" <tr>\n",
|
||||
" <th style=\"text-align: center;\">\n",
|
||||
" <img src=\"./images/particle_simple_spread.gif\" alt=\"Particle video\" align=\"middle\" margin-left=\"auto\" margin-right=\"auto\"/>\n",
|
||||
" </th>\n",
|
||||
" </tr>\n",
|
||||
" <tr style=\"text-align: center;\">\n",
|
||||
" <th>Fig 1. Video of 3 agents covering 3 landmarks in a multiagent Particle scenario.</th>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"\n",
|
||||
"The tutorial will cover the following steps:\n",
|
||||
"- Initializing Azure Machine Learning resources for training\n",
|
||||
"- Training policies in a multi-agent environment with Azure Machine Learning service\n",
|
||||
"- Monitoring training progress\n",
|
||||
"\n",
|
||||
"## Prerequisites\n",
|
||||
"\n",
|
||||
"The user should have completed the Azure Machine Learning introductory tutorial. You will need to make sure that you have a valid subscription id, a resource group and a workspace. For detailed instructions see [Tutorial: Get started creating your first ML experiment](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup).\n",
|
||||
"\n",
|
||||
"Please ensure that you have a current version of IPython (>= 7.15) installed.\n",
|
||||
"\n",
|
||||
"While this is a standalone notebook, we highly recommend going over the introductory notebooks for RL first.\n",
|
||||
"- Getting started:\n",
|
||||
" - [RL using a compute instance with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb)\n",
|
||||
" - [RL using Azure Machine Learning compute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb)\n",
|
||||
"- [Scaling RL training runs with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb)\n",
|
||||
"\n",
|
||||
"Advanced users might also be interested in [this notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/minecraft-on-distributed-compute/minecraft.ipynb) demonstrating how to train a Minecraft RL agent in Azure Machine Learning.\n",
|
||||
"\n",
|
||||
"## Initialize resources\n",
|
||||
"\n",
|
||||
"All required Azure Machine Learning service resources for this tutorial can be set up from Jupyter. This includes:\n",
|
||||
"\n",
|
||||
"- Connecting to your existing Azure Machine Learning workspace.\n",
|
||||
"- Creating an experiment to track runs.\n",
|
||||
"- Creating remote compute targets for [Ray](https://docs.ray.io/en/latest/index.html).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Azure Machine Learning SDK\n",
|
||||
"\n",
|
||||
"Display the Azure Machine Learning SDK version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"print('Azure Machine Learning SDK Version: ', azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Connect to workspace\n",
|
||||
"\n",
|
||||
"Get a reference to an existing Azure Machine Learning workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.location, ws.resource_group, sep=' | ')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"\n",
|
||||
"Create an experiment to track the runs in your workspace. A\n",
|
||||
"workspace can have multiple experiments and each experiment\n",
|
||||
"can be used to track multiple runs (see [documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py)\n",
|
||||
"for details)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"exp = Experiment(workspace=ws, name='particle-multiagent')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create or attach an existing compute resource\n",
|
||||
"\n",
|
||||
"A compute target is a designated compute resource where you run your training script. For more information, see [What are compute targets in Azure Machine Learning service?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target).\n",
|
||||
"\n",
|
||||
"#### CPU target for Ray head\n",
|
||||
"\n",
|
||||
"In the experiment setup for this tutorial, the Ray head node will\n",
|
||||
"run on a CPU node (D3 type). A maximum cluster size of 1 node is\n",
|
||||
"therefore sufficient. If you wish to run multiple experiments in\n",
|
||||
"parallel using the same CPU cluster, you may elect to increase this\n",
|
||||
"number. The cluster will automatically scale down to 0 nodes when\n",
|
||||
"no training jobs are scheduled (see min_nodes).\n",
|
||||
"\n",
|
||||
"The code below creates a compute cluster of D3 type nodes.\n",
|
||||
"If the cluster with the specified name is already in your workspace\n",
|
||||
"the code will skip the creation process.\n",
|
||||
"\n",
|
||||
"**Note: Creation of a compute resource can take several minutes**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"cpu_cluster_name = 'cpu-cl-d3'\n",
|
||||
"\n",
|
||||
"if cpu_cluster_name in ws.compute_targets:\n",
|
||||
" cpu_cluster = ws.compute_targets[cpu_cluster_name]\n",
|
||||
" if cpu_cluster and type(cpu_cluster) is AmlCompute:\n",
|
||||
" if cpu_cluster.provisioning_state == 'Succeeded':\n",
|
||||
" print('Found existing compute target for {}. Using it.'.format(cpu_cluster_name))\n",
|
||||
" else: \n",
|
||||
" raise Exception('Found existing compute target for {} '.format(cpu_cluster_name)\n",
|
||||
" + 'but it is in state {}'.format(cpu_cluster.provisioning_state))\n",
|
||||
"else:\n",
|
||||
" print('Creating a new compute target for {}...'.format(cpu_cluster_name))\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size='STANDARD_D3',\n",
|
||||
" min_nodes=0, \n",
|
||||
" max_nodes=1)\n",
|
||||
"\n",
|
||||
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
|
||||
" cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
|
||||
" \n",
|
||||
" print('Cluster created.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training the policies\n",
|
||||
"\n",
|
||||
"### Training environment\n",
|
||||
"\n",
|
||||
"This tutorial uses a custom docker image\n",
|
||||
"with the necessary software installed. The [Environment](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments)\n",
|
||||
"class stores the configuration for the training environment. The\n",
|
||||
"docker image is set via `env.docker.base_image`.\n",
|
||||
"`user_managed_dependencies` is set so that\n",
|
||||
"the preinstalled Python packages in the image are preserved.\n",
|
||||
"\n",
|
||||
"Note that since we want to capture videos of the training runs requiring a display, we set the interpreter_path such that the Python process is started via **xvfb-run**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from azureml.core import Environment\n",
|
||||
" \n",
|
||||
"cpu_particle_env = Environment(name='particle-cpu')\n",
|
||||
"\n",
|
||||
"cpu_particle_env.docker.enabled = True\n",
|
||||
"cpu_particle_env.docker.base_image = 'akdmsft/particle-cpu'\n",
|
||||
"cpu_particle_env.python.interpreter_path = 'xvfb-run -s \"-screen 0 640x480x16 -ac +extension GLX +render\" python'\n",
|
||||
"\n",
|
||||
"max_train_time = os.environ.get('AML_MAX_TRAIN_TIME_SECONDS', 2 * 60 * 60)\n",
|
||||
"cpu_particle_env.environment_variables['AML_MAX_TRAIN_TIME_SECONDS'] = str(max_train_time)\n",
|
||||
"cpu_particle_env.python.user_managed_dependencies = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training script\n",
|
||||
"\n",
|
||||
"This tutorial uses the multiagent algorithm [Multi-Agent Deep Deterministic Policy Gradient (MADDPG)](https://docs.ray.io/en/latest/rllib-algorithms.html?highlight=maddpg#multi-agent-deep-deterministic-policy-gradient-contrib-maddpg).\n",
|
||||
"For training policies in a multiagent scenario, Ray's RLlib also\n",
|
||||
"requires the `multiagent` configuration section to be specified. You\n",
|
||||
"can find more information in the [common parameters](https://docs.ray.io/en/latest/rllib-training.html?highlight=multiagent#common-parameters)\n",
|
||||
"documentation.\n",
|
||||
"\n",
|
||||
"For monitoring and understanding the training progress, one\n",
|
||||
"of the training environments is wrapped in a [Gym monitor](https://github.com/openai/gym/blob/master/gym/wrappers/monitor.py)\n",
|
||||
"which periodically captures videos - by default every 200 training\n",
|
||||
"iterations.\n",
|
||||
"\n",
|
||||
"The stopping criteria are set such that the training run is\n",
|
||||
"terminated after either a mean reward of -400 is observed, or\n",
|
||||
"training has run for over 2 hours.\n",
|
||||
"\n",
|
||||
"### Submitting a training run\n",
|
||||
"\n",
|
||||
"Below, you create the training run using a `ReinforcementLearningEstimator`\n",
|
||||
"object, which contains all the configuration parameters for this experiment:\n",
|
||||
"\n",
|
||||
"- `source_directory`: Contains the training script and helper files to be\n",
|
||||
" copied onto the node.\n",
|
||||
"- `entry_script`: The training script, described in more detail above.\n",
|
||||
"- `script_params`: The command line arguments to pass to the entry script.\n",
|
||||
"- `compute_target`: The compute target for training script execution.\n",
|
||||
"- `environment`: The Azure Machine Learning environment definition for the node running the training.\n",
|
||||
"- `max_run_duration_seconds`: The time after which to abort the run if it is still running.\n",
|
||||
"\n",
|
||||
"For more details, please take a look at the [online documentation](https://docs.microsoft.com/en-us/python/api/azureml-contrib-reinforcementlearning/?view=azure-ml-py)\n",
|
||||
"for Azure Machine Learning service's reinforcement learning offering.\n",
|
||||
"\n",
|
||||
"Note that you can use the same notebook and scripts to experiment with\n",
|
||||
"different Particle environments. You can find a list of supported\n",
|
||||
"environments [here](https://github.com/openai/multiagent-particle-envs/tree/master#list-of-environments).\n",
|
||||
"Simply change the `--scenario` parameter to a supported scenario.\n",
|
||||
"\n",
|
||||
"In order to get the best training results, you can also adjust the\n",
|
||||
"`--final-reward` parameter to determine when to stop training. A greater\n",
|
||||
"reward means longer running time, but improved results. By default,\n",
|
||||
"the final reward will be -400, which should show good progress after\n",
|
||||
"about one hour of run time.\n",
|
||||
"\n",
|
||||
"For this notebook, we use a single D3 nodes, giving us a total of 4 CPUs and\n",
|
||||
"0 GPUs. One CPU is used by the MADDPG trainer, and an additional CPU is\n",
|
||||
"consumed by the RLlib rollout worker. The other 2 CPUs are not used, though\n",
|
||||
"smaller node types will run out of memory for this task.\n",
|
||||
"\n",
|
||||
"Lastly, the RunDetails widget displays information about the submitted RL\n",
|
||||
"experiment, including a link to the Azure portal with more details."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.contrib.train.rl import ReinforcementLearningEstimator\n",
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"estimator = ReinforcementLearningEstimator(\n",
|
||||
" source_directory='files',\n",
|
||||
" entry_script='particle_train.py',\n",
|
||||
" script_params={\n",
|
||||
" '--scenario': 'simple_spread',\n",
|
||||
" '--final-reward': -400\n",
|
||||
" },\n",
|
||||
" compute_target=cpu_cluster,\n",
|
||||
" environment=cpu_particle_env,\n",
|
||||
" max_run_duration_seconds=3 * 60 * 60\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"train_run = exp.submit(config=estimator)\n",
|
||||
"\n",
|
||||
"RunDetails(train_run).show()"
|
||||
]
|
||||
},
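The entry script `particle_train.py` itself is not reproduced in this notebook. For orientation, the stopping criteria described above and the `./logs` directory used later for Tensorboard typically map onto RLlib's `tune.run` arguments. The following is only a minimal sketch of that wiring, not the actual script: the `contrib/MADDPG` trainer string, the `particle_spread` environment key, and the exact stop values are assumptions made for illustration.

```python
# Illustrative sketch (not the actual particle_train.py): how the stopping
# criteria and log directory described in this notebook are commonly passed
# to RLlib on the Ray head node.
import ray
from ray import tune

ray.init(address="auto")  # attach to the Ray cluster that Azure ML brings up

tune.run(
    "contrib/MADDPG",                    # assumed trainer name; the real script may differ
    stop={
        "episode_reward_mean": -400,     # --final-reward: stop once this mean reward is reached
        "time_total_s": 2 * 60 * 60,     # ...or after roughly two hours of training
    },
    config={
        "env": "particle_spread",        # hypothetical key registered via tune.registry.register_env
        "num_workers": 1,                # one rollout worker next to the trainer process
    },
    local_dir="./logs",                  # makes logs and videos appear among the run's files
    checkpoint_at_end=True,
)
```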
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# If you wish to cancel the run before it completes, uncomment and execute:\n",
|
||||
"#train_run.cancel()"
|
||||
]
|
||||
},
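If the `RunDetails` widget does not render in your environment, the same basic information can be read straight from the run object with standard `azureml-core` calls; the optional snippet below prints the run id, its current status, and the Azure portal link.

```python
# Optional: inspect the submitted run without the RunDetails widget.
print('Run id:     ', train_run.id)
print('Status:     ', train_run.get_status())
print('Portal link:', train_run.get_portal_url())
```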
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Monitoring training progress\n",
|
||||
"\n",
|
||||
"### View the Tensorboard\n",
|
||||
"\n",
|
||||
"The Tensorboard can be displayed via the Azure Machine Learning\n",
|
||||
"service's [Tensorboard API](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-monitor-tensorboard).\n",
|
||||
"When running locally, please make sure to follow the instructions\n",
|
||||
"in the link and install required packages. Running this cell will output a URL for the Tensorboard.\n",
|
||||
"\n",
|
||||
"Note that the training script sets the log directory when\n",
|
||||
"starting RLlib via the local_dir parameter. ./logs will automatically\n",
|
||||
"appear in the downloadable files for a run. Since this script is\n",
|
||||
"executed on the Ray head node run, we need to get a reference to it\n",
|
||||
"as shown below.\n",
|
||||
"\n",
|
||||
"The Tensorboard API will continuously stream logs from the run.\n",
|
||||
"\n",
|
||||
"**Note: It may take a couple of minutes after the run is in \"Running\"\n",
|
||||
"state before Tensorboard files are available and the board will refresh automatically**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"from azureml.tensorboard import Tensorboard\n",
|
||||
"\n",
|
||||
"head_run = None\n",
|
||||
"\n",
|
||||
"timeout = 60\n",
|
||||
"while timeout > 0 and head_run is None:\n",
|
||||
" timeout -= 1\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" head_run = next(r for r in train_run.get_children() if r.id.endswith('head'))\n",
|
||||
" except StopIteration:\n",
|
||||
" time.sleep(1)\n",
|
||||
"\n",
|
||||
"tb = Tensorboard([head_run])\n",
|
||||
"tb.start()"
|
||||
]
|
||||
},
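The videos discussed in the next section are produced inside the training script, which is not shown in this notebook. A common pattern for such periodic recordings is to wrap a single environment in `gym.wrappers.Monitor`. The sketch below illustrates only that pattern: `CartPole-v0` is a stand-in for the Particle environment, and the every-200th-episode schedule is an assumption (the notebook text speaks of training iterations, so the real trigger may differ).

```python
# Illustrative only: periodic video capture by wrapping one Gym environment
# in a Monitor; the actual particle_train.py may wire this up differently.
import gym
from gym.wrappers import Monitor

def wrap_for_video(env, video_dir='./logs/videos', period=200):
    # Record every `period`-th episode and skip the rest.
    return Monitor(
        env,
        directory=video_dir,
        video_callable=lambda episode_id: episode_id % period == 0,
        force=True,
    )

env = wrap_for_video(gym.make('CartPole-v0'))  # stand-in for the Particle environment
```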
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### View training videos\n",
|
||||
"\n",
|
||||
"As mentioned above, we record videos of the agents interacting with the\n",
|
||||
"Particle world. These videos are often a crucial indicator for training\n",
|
||||
"success. The code below downloads the latest video as it becomes available\n",
|
||||
"and displays it in-line.\n",
|
||||
"\n",
|
||||
"Over time, the agents learn to cooperate and avoid collisions while\n",
|
||||
"traveling to all landmarks.\n",
|
||||
"\n",
|
||||
"**Note: It can take several minutes for a video to appear after the run\n",
|
||||
"was started.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tempfile\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data.dataset_error_handling import DatasetValidationError\n",
|
||||
"\n",
|
||||
"from IPython.display import clear_output\n",
|
||||
"from IPython.core.display import display, Video\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"path_prefix = './tmp_videos'\n",
|
||||
"\n",
|
||||
"def download_latest_training_video(run, video_checkpoint_counter):\n",
|
||||
" run_artifacts_path = os.path.join('azureml', run.id)\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" run_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))\n",
|
||||
" except DatasetValidationError as e:\n",
|
||||
" # This happens at the start of the run when there is no data available\n",
|
||||
" # in the run's artifacts\n",
|
||||
" return None, video_checkpoint_counter\n",
|
||||
" \n",
|
||||
" video_files = [file for file in run_artifacts_ds.to_path() if file.endswith('.mp4')]\n",
|
||||
" if len(video_files) == video_checkpoint_counter:\n",
|
||||
" return None, video_checkpoint_counter\n",
|
||||
" \n",
|
||||
" iteration_numbers = [int(vf[vf.rindex('video') + len('video') : vf.index('.mp4')]) for vf in video_files]\n",
|
||||
" latest_video = next(vf for vf in video_files if vf.endswith('{num}.mp4'.format(num=max(iteration_numbers))))\n",
|
||||
" latest_video = os.path.join(run_artifacts_path, os.path.normpath(latest_video[1:]))\n",
|
||||
" \n",
|
||||
" datastore.download(\n",
|
||||
" target_path=path_prefix,\n",
|
||||
" prefix=latest_video.replace('\\\\', '/'),\n",
|
||||
" show_progress=False)\n",
|
||||
" \n",
|
||||
" return os.path.join(path_prefix, latest_video), len(video_files)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def render_video(vf):\n",
|
||||
" clear_output(wait=True)\n",
|
||||
" display(Video(data=vf, embed=True, html_attributes='loop autoplay width=50%'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"terminal_statuses = ['Canceled', 'Completed', 'Failed']\n",
|
||||
"video_checkpoint_counter = 0\n",
|
||||
"\n",
|
||||
"while head_run.get_status() not in terminal_statuses:\n",
|
||||
" video_file, video_checkpoint_counter = download_latest_training_video(head_run, video_checkpoint_counter)\n",
|
||||
" if video_file is not None:\n",
|
||||
" render_video(video_file)\n",
|
||||
" \n",
|
||||
" print('Displaying video number {}'.format(video_checkpoint_counter))\n",
|
||||
" shutil.rmtree(path_prefix)\n",
|
||||
" \n",
|
||||
" # Interrupting the kernel can take up to 15 seconds\n",
|
||||
" # depending on when time.sleep started\n",
|
||||
" time.sleep(15)\n",
|
||||
" \n",
|
||||
"train_run.wait_for_completion()\n",
|
||||
"print('The training run has reached a terminal status.')"
|
||||
]
|
||||
},
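The loop above only keeps the most recent video. Once the run has reached a terminal status, you may want to retain every recorded video for later comparison. The sketch below uses the standard `Run` file APIs for that; the local `all_videos` folder name is just an example, and it assumes the `.mp4` files are listed among the head run's artifacts, as in the download helper above.

```python
# Optional: after completion, download every recorded video from the head run.
import os

output_dir = 'all_videos'  # example local folder
os.makedirs(output_dir, exist_ok=True)

for file_name in head_run.get_file_names():
    if file_name.endswith('.mp4'):
        local_path = os.path.join(output_dir, os.path.basename(file_name))
        head_run.download_file(name=file_name, output_file_path=local_path)
        print('Downloaded', local_path)
```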
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cleaning up\n",
|
||||
"\n",
|
||||
"Below, you can find code snippets for your convenience to clean up any resources created as part of this tutorial you don't wish to retain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# to stop the Tensorboard, uncomment and run\n",
|
||||
"#tb.stop()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# to delete the cpu compute target, uncomment and run\n",
|
||||
"#cpu_cluster.delete()"
|
||||
]
|
||||
},
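If the video loop above was interrupted, the local `./tmp_videos` folder may still exist, and the optional `all_videos` folder from the download sketch is also kept on disk. The snippet below removes these local directories only; it does not touch any Azure resources.

```python
# to remove local folders created while running this notebook, uncomment and run
#import shutil
#shutil.rmtree(path_prefix, ignore_errors=True)   # './tmp_videos'
#shutil.rmtree('all_videos', ignore_errors=True)  # from the optional download sketch
```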
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next steps\n",
|
||||
"\n",
|
||||
"We would love to hear your feedback! Please let us know what you think of Reinforcement Learning in Azure Machine Learning and what features you are looking forward to."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "andress"
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
},
|
||||
"notice": "Copyright (c) Microsoft Corporation. All rights reserved.\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00afLicensed under the MIT License.\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00af "
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
name: particle
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-contrib-reinforcementlearning
|
||||
- azureml-widgets
|
||||
- tensorboard
|
||||
- azureml-tensorboard
|
||||
- ipython
|
||||
@@ -58,7 +58,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Get Azure Machine Learning workspace\n",
|
||||
"Get a reference to an existing Azure Machine Learning workspace. Please make sure to change `STANDARD_NC6` and `STANDARD_D2_V2` to [the ones available in your region](https://azure.microsoft.com/en-us/global-infrastructure/services/?products=virtual-machines).\n"
|
||||
"Get a reference to an existing Azure Machine Learning workspace.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
index.md
@@ -130,6 +130,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|
||||
| [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | |
|
||||
| [cartpole_sc](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb) | | | | | | |
|
||||
| [minecraft](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/minecraft-on-distributed-compute/minecraft.ipynb) | | | | | | |
|
||||
| [particle](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/multiagent-particle-envs/particle.ipynb) | | | | | | |
|
||||
| [devenv_setup](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/setup/devenv_setup.ipynb) | | | | | | |
|
||||
| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None |
|
||||
| [distributed-cntk-with-custom-docker](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/distributed-cntk-with-custom-docker/distributed-cntk-with-custom-docker.ipynb) | | | | | | |
|
||||
|
||||