Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks//notebooks/work-with-data/datasets/datasets-tutorial/datasets-diff.png)

# <center>Detect drift using Dataset Diff API </center>

<br>
<font size=2>
    This notebook provides step by step instructions on how to compare two different datasets. It includes two parts:
    <br>&nbsp;&nbsp;&nbsp;&nbsp;&#x2611; compare two datasets using local compute;
    <br>&nbsp;&nbsp;&nbsp;&nbsp;&#x2611; compare two datasets remotely using Azure ML compute.
</font>

# Prerequisites and Setup

<font size=2>This section is shared by both local and remote execution; you may need to duplicate this section if splitting this notebook into separate local/remote notebooks.</font>


## Prerequisites

### Install Supporting Packages

&nbsp;&nbsp;&nbsp;&nbsp;pip install scipy<br>
&nbsp;&nbsp;&nbsp;&nbsp;pip install tqdm<br>
&nbsp;&nbsp;&nbsp;&nbsp;pip install pandas<br>
&nbsp;&nbsp;&nbsp;&nbsp;pip install pyarrow<br>
&nbsp;&nbsp;&nbsp;&nbsp;pip install ipywidgets<br>
&nbsp;&nbsp;&nbsp;&nbsp;pip install lightgbm<br>
&nbsp;&nbsp;&nbsp;&nbsp;pip install matplotlib<br>

### Install AzureML Packages

&nbsp;&nbsp;&nbsp;&nbsp;pip install --user azureml-core<br>

&nbsp;&nbsp;&nbsp;&nbsp;pip install --user azureml-opendatasets<br>

### Import Dependencies

In [None]:
import os
import sys
import warnings
import requests
import pandas as pd
import numpy as np
import ipywidgets as widgets

import azureml.core

from io import StringIO
from tqdm import tqdm
from IPython import display
from datetime import datetime, timedelta
from azureml.core import Datastore, Dataset
from azureml.opendatasets import NoaaIsdWeather


## Declare Variables For Demo

Feel free to customize them.

In [None]:
# Start date of the demo window; these three values are combined into a
# datetime when the datasets are prepared.
year   = 2016
month  = 1
date   = 1
b_days = 2    # number of days of data retrieved for the baseline dataset
t_days = 7    # number of daily target datasets retrieved for comparison

# Local working folder and the file name used for the baseline CSV.
local_folder = "demo"
baseline_file = 'baseline.csv'

# Columns kept in each target dataset before it is written to CSV.
feature_columns = ['usaf', 'wban', 'latitude', 'longitude', 'elevation', 'temperature', 'p_k']

## Prepare Datasets

<font size=2>The diff calculation is always between two datasets; for this demo, we use "baseline" and "target" to represent them.</font>

In [None]:
# Create the local working folder if it does not exist yet.
os.makedirs(local_folder, exist_ok=True)

# Full local path of the baseline CSV file.
local_baseline = os.path.join(local_folder, baseline_file)

# First day of the demo window, built from the year/month/date variables.
start_date = datetime(year, month, date)

### Prepare Baseline Dataset
<font size=2>Retrieve weather data from NOAA for the declared number of days (b_days, declared in the cell above). It may take 2 minutes for 2 days.</font>

In [None]:
# Request NOAA ISD weather data between start_date and start_date + b_days days.
start = start_date
isd = NoaaIsdWeather(start, start + timedelta(days=b_days))

# Materialize the requested data as a pandas DataFrame.
baseline_df = isd.to_pandas_dataframe()
# NOTE(review): this is not the last expression of the cell, so the notebook
# does not display its output here.
baseline_df.head()

# Save the baseline to the local CSV file. All retrieved columns are kept
# (the baseline is not filtered by feature_columns).
baseline_df.to_csv(local_baseline)

### Prepare Target Dataset(s)

<font size=2>Retrieve weather data from NOAA for the declared number of days (t_days, declared in the cell above). It may take 5 minutes for 7 days.</font>

In [None]:
# Retrieve one day of NOAA data per iteration, keep only the feature columns
# and store each day as its own CSV file (target_<day>.csv) in the local folder.
for day in tqdm(range(t_days)):
    start = start_date + timedelta(days=day)
    isd = NoaaIsdWeather(start, start + timedelta(days=1))

    target_df = isd.to_pandas_dataframe()[feature_columns]
    target_df.to_csv(os.path.join(local_folder, 'target_{}.csv'.format(day)))

# Predefine Methods For Result Processing

## Parse and Present Datasets' Diff Results

<font size=2>Each diff result is a list of "DiffMetric" objects. Typically each object presents a detailed measurement output for a specific column.
<br><br>Below is an example of "DiffMetric" object:</font>
<font face="monospace" size=1>
<br>&nbsp;&nbsp;&nbsp;&nbsp;{&nbsp;&nbsp;
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'name':'percentage_difference_median',&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;measurement&nbsp;name
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'value':0.01270670472603889,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;the result value a number to indicate how big the diff is for current measurement.
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'extended_properties':{&nbsp;&nbsp;
</font><font face="monospace" size=1 color=LightSteelBlue><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'action_id':'3d3da05d-0871-4cc9-93cb-f43859aae13b',&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;(remote&nbsp;calculation&nbsp;only)&nbsp;action&nbsp;id
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'from_dataset_id':'12edc566-8803-4e0f-ba91-c2ee05eeddee',&nbsp;&nbsp;-->&nbsp;(remote&nbsp;calculation&nbsp;only)&nbsp;baseline&nbsp;dataset
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'from_dataset_version':'1',&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;(remote&nbsp;calculation&nbsp;only)&nbsp;baseline&nbsp;version
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'to_dataset_id':'9b85c9ba-50c2-4227-a9bc-91dee4a18228',&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;(remote&nbsp;calculation&nbsp;only)&nbsp;target&nbsp;dataset
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'to_dataset_version':'1',&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;(remote&nbsp;calculation&nbsp;only)&nbsp;target&nbsp;version
</font><font face="monospace" size=1><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'column_name':'elevation',&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;column&nbsp;name&nbsp;in&nbsp;dataset,&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;could&nbsp;be&nbsp;['name':'datadrift_coefficient']&nbsp;for&nbsp;dataset&nbsp;level&nbsp;diff
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'metric_category':'profile_diff'&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-->&nbsp;category,&nbsp;could&nbsp;be&nbsp;:<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;dataset_drift (dataset level)<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;profile_diff (column level)<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;statistical_distance (column level)
<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}
<br>&nbsp;&nbsp;&nbsp;&nbsp;}
</font>

In [None]:
def parse_result(rst, columns, measurements):
    """Arrange one diff result into a table and extract the drift figures.

    Args:
        rst: Iterable of metric objects exposing ``name``, ``value`` and
            ``extended_properties`` (a dict that may hold ``"column_name"``).
        columns: Iterable of column names; they become the table's columns.
        measurements: Iterable of measurement names; they become the rows.

    Returns:
        A tuple ``(daily_result, drift, daily_contribution)``:

        * ``daily_result`` -- list of rows. Row 0 is the header
          (``"measurements \\ columns"`` followed by the column names); every
          other row starts with a measurement name followed by one cell per
          column (``''`` when no value was reported).
        * ``drift`` -- value of the ``"datadrift_coefficient"`` metric, or
          ``None`` when ``rst`` holds no such metric.
        * ``daily_contribution`` -- dict mapping column name to the value of
          its ``"datadrift_contribution"`` metric.

    Column-level metrics whose column or measurement is not part of
    ``columns`` / ``measurements`` are skipped, so a subset of either can be
    passed to filter the table.
    """
    # Materialize once so the inputs may be sets or one-shot iterables.
    column_names = list(columns)
    measurement_names = list(measurements)

    # The backslash is escaped explicitly; the resulting text is unchanged.
    daily_result = [["measurements \\ columns"] + column_names]
    daily_result.extend([m] + [''] * len(column_names) for m in measurement_names)

    # Position lookups (offset by 1 for the header row / label column).
    # setdefault keeps the first position when a name is repeated.
    col_pos = {}
    for pos, col in enumerate(column_names, start=1):
        col_pos.setdefault(col, pos)
    row_pos = {}
    for pos, msm in enumerate(measurement_names, start=1):
        row_pos.setdefault(msm, pos)

    drift = None
    daily_contribution = {}

    for r in rst:
        # dataset level diff (drift)
        if r.name == "datadrift_coefficient":
            drift = r.value
        # diff (drift) contribution of a single column
        elif r.name == "datadrift_contribution":
            daily_contribution[r.extended_properties["column_name"]] = r.value
        # column level diff measurements
        elif "column_name" in r.extended_properties:
            row = row_pos.get(r.name)
            col = col_pos.get(r.extended_properties["column_name"])
            if row is None or col is None:
                # Metric was not requested for the table; leave it out.
                continue
            daily_result[row][col] = r.value

    return daily_result, drift, daily_contribution

## Present Dataset Level Diff (aka drift)

<font size=2>This method will generate two graphs, the left graph presents dataset level difference for all compared baseline-target pairs, the right graph presents dataset level difference contribution for each column so that we know which column impacts more.</font>

In [None]:
%matplotlib inline

import matplotlib.dates as mdates
import matplotlib.pyplot as plt 
import matplotlib as mpl

def show_diff(drift_metrics, dates, columns, drift_contributions, summary_contribute, bottoms_contribute):
    """Plot the dataset-level drift trend and the per-column drift contribution.

    Draws one figure with two graphs side by side: on the left a line chart of
    the drift coefficient per date, on the right a stacked bar chart showing
    each column's share of the day's total drift contribution.

    Args:
        drift_metrics: Dict whose values are the drift coefficients, in the
            same order as ``dates``. ``None`` values are plotted as gaps.
        dates: Sequence of dates accepted by ``pandas.to_datetime``
            (e.g. ``"2016-01-01"`` strings).
        columns: Iterable of column names to stack in the right graph.
        drift_contributions: Dict whose values are dicts mapping a column
            name to that column's contribution, in the same order as ``dates``.
            Columns without an entry count as 0.
        summary_contribute: Dict whose values are the per-day sums of all
            contributions, in the same order as ``dates``.
        bottoms_contribute: Starting offsets of the stacked bars, one per
            date. The list is not modified.
    """
    # Local import: only this function needs the tick formatter.
    from matplotlib.ticker import FuncFormatter

    if len(dates) == 0:
        print("No diff results to plot.")
        return

    # Freeze the iteration order so bars and legend entries stay aligned.
    columns = list(columns)
    # Real datetime values let both graphs share a proper date axis.
    plot_dates = list(pd.to_datetime(list(dates)).to_pydatetime())
    day_ticks = list(pd.date_range(dates[0], dates[-1], freq='D').to_pydatetime())
    drifts = [float('nan') if v is None else v for v in drift_metrics.values()]
    daily_totals = list(summary_contribute.values())
    bottoms = list(bottoms_contribute)

    figure, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))

    # left graph
    plt.sca(ax1)
    ax1.grid()
    plt.title("Diff(Drift) Trend\n", fontsize=20)
    plt.xlabel("Date", fontsize=16)
    plt.ylabel("Drift Coefficient", fontsize=16)
    plt.plot(plot_dates, drifts, '-r', marker='.', linewidth=0.5, markersize=5)
    plt.xticks(rotation=30)

    # right graph
    plt.sca(ax2)
    plt.title("Drift Contribution of columns\n", fontsize=20)
    plt.xlabel("Date", fontsize=16)
    plt.ylabel("Drift Contribution", fontsize=16)

    for c in columns:
        contribution = [drift_contributions[dt].get(c, 0) for dt in drift_contributions]
        # Share of the day's total; a zero total yields a zero-height bar.
        bar_ratio = [x / y if y else 0 for x, y in zip(contribution, daily_totals)]

        ax2.bar(plot_dates, height=bar_ratio, bottom=bottoms, label=str(c))
        bottoms = [b + r for b, r in zip(bottoms, bar_ratio)]

    # Ticks and formatters are set after plotting so they apply to the final
    # axis; a formatter (not fixed labels) keeps percentages tied to the ticks.
    plt.xticks(day_ticks, rotation=30)
    ax2.yaxis.set_major_formatter(FuncFormatter(lambda v, _pos: '{:,.2%}'.format(v)))
    ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    if columns:
        plt.legend()

    # Layout is adjusted once both axes exist.
    figure.tight_layout()
    plt.show()

# Execute Datasets' Diff Calculation Locally

<font size=2>Local execution lets you run the comparison in a Jupyter Notebook or code editor on a local computer.</font>

## Calculate Dataset Diff At Local

### Create Baseline Dataset

<font size=2>Create baseline dataset object from the retrieved baseline data.</font>

In [None]:
from azureml.core import Dataset

# Build the baseline dataset object from the local baseline CSV file.
# NOTE(review): include_path=True is passed for the baseline only, not for the
# target datasets created in the next cell -- confirm this is intended.
baseline = Dataset.auto_read_files(local_baseline, include_path=True)

# The baseline data is not filtered by feature columns list, thus all retrieved data columns will be listed below.
# You'll see "Column1" in the output, which is a default name added when the original column is not available.
baseline.get_profile()

### Create Target Datasets

<font size=2>Create target dataset objects from retrieved target data.</font>

In [None]:
# One dataset object per daily target CSV, keyed by the day offset.
targets = {}

for day in tqdm(range(t_days)):
    targets[day] = Dataset.auto_read_files(
        os.path.join(local_folder, 'target_{}.csv'.format(day))
    )

### Calculate Diff Between Each Target Dataset And Baseline Dataset

<font size=2>Compare each target dataset with baseline dataset to calculate diff between them.</font>

In [None]:
# Diff results per date, plus every measurement and column name seen in them.
buf = {}

columns = set()
measurements = set()

for day in tqdm(range(t_days)):
    # Compare the baseline with this day's target and wait for the result.
    diff_action = baseline.diff(rhs_dataset=targets[day])
    diff_action.wait_for_completion()

    # Results are keyed by the calendar date the target represents.
    dt = (start_date + timedelta(days=day)).strftime("%Y-%m-%d")
    buf[dt] = diff_action._result

    # set.add() already ignores duplicates, so no membership test is needed.
    for r in diff_action._result:
        measurements.add(r.name)
        if "column_name" in r.extended_properties:
            columns.add(r.extended_properties["column_name"])

## Parse And Present Local Execution Results

<font size=2>
<br>The diff output usually contains information at two different levels:
<br>&nbsp;&nbsp;&nbsp;&nbsp;1. General diff, aka dataset level diff. The output is a number between 0 and 1 that indicates how large the diff is. This dataset level diff is also called the drift between two datasets.
<br>&nbsp;&nbsp;&nbsp;&nbsp;2. Detailed diff, aka column level diff. The output is a set of metrics organized like a 2-D array. One dimension is the column names, which is why it is called column level. The other dimension is the measurements. The diff calculation includes various measurements from different perspectives; each measurement generates a value for each column that shows how much that column contributes.
</font>




### Parse and List Column Level Diff Results

<font size=2>This cell iterates over the results and lists every measurement calculated for each column.</font>

In [None]:
from pandas import DataFrame

# Inputs for the drift graphs, filled while the daily tables are listed.
dates = []
drift_metrics = {}
drift_contributions = {}
summary_contribute = {}
bottoms_contribute = []

for dt, rst in buf.items():
    dates.append(dt)
    print("\n---------------------------------------- Result of {} ----------------------------------------".format(dt))

    daily_result, drift, daily_contribution = parse_result(rst, columns, measurements)
    drift_metrics[dt] = drift
    drift_contributions[dt] = daily_contribution

    # Every stacked bar starts at 0; the day's total is the sum of all
    # column contributions.
    bottoms_contribute.append(0)
    sum_contribution = sum(daily_contribution.values())
    summary_contribute[dt] = sum_contribution

    # Show the measurement-by-column table for this date.
    display.display(pd.DataFrame(daily_result))

### Present Dataset Level Diff (aka drift) In Graphs

<font size=2>The left graph presents dataset level difference for all compared baseline-target pairs, the right graph presents dataset level difference contribution for each column so that we know which column impacts more.</font>

In [None]:
# Plot the drift trend and the per-column drift contribution for the local results.
show_diff(drift_metrics, dates, columns, drift_contributions, summary_contribute, bottoms_contribute)

# Execute Datasets' Diff Calculation Remotely

<font size=2>Remote execution lets you compare data on more powerful compute - Machine Learning Compute clusters.</font>

## Prepare Remote Environment
### Get Workspace
<font size=2>
<br>If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, check the <a href="../../../configuration.ipynb" title="Create an Azure Machine Learning service workspace">configuration notebook</a> first if you haven't already to establish your connection to the AzureML Workspace.
</font>

In [None]:
from azureml.core.workspace import Workspace
# NOTE(review): InteractiveLoginAuthentication is imported but not used in this cell.
from azureml.core.authentication import InteractiveLoginAuthentication

# Load the workspace from the saved configuration.
ws = Workspace.from_config()

# Print the workspace name, resource group, location and subscription id, one per line.
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep="\n")

### Create Compute Resource For Calculation
<font size=2>Check if the compute resource exists and create a new one if not.</font>

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget

cts = ws.compute_targets

# The default CPU cluster is reused when it already exists as an AmlCompute target.
existing = (ws.DEFAULT_CPU_CLUSTER_NAME in cts
            and cts[ws.DEFAULT_CPU_CLUSTER_NAME].type == 'AmlCompute')

# del_cmpt records whether this notebook created the cluster, so that the
# cleanup cell only deletes a cluster created here.
del_cmpt = False

if existing:
    aml_compute = cts[ws.DEFAULT_CPU_CLUSTER_NAME]
else:
    aml_compute = AmlCompute.create(ws, ws.DEFAULT_CPU_CLUSTER_NAME, ws.DEFAULT_CPU_CLUSTER_CONFIGURATION)
    aml_compute.wait_for_completion(show_output=True)
    del_cmpt = True

### Upload Sample Data To Datastore

<font size=2>Upload data files to the blob storage in Azure ML workspace.</font>

In [None]:
from azureml.core import Datastore, Dataset
import azureml.data
from azureml.data.azure_storage_datastore import AzureFileDatastore, AzureBlobDatastore

# Folder on the datastore that receives the uploaded CSV files.
remote_data_path ='demo'

# Upload the baseline CSV to the workspace's default datastore,
# overwriting any existing copy.
dstore = ws.get_default_datastore()
dstore.upload_files([local_baseline],
      target_path=remote_data_path,
      overwrite=True,
      show_progress=True)

# Upload each daily target CSV to the same remote folder.
for day in tqdm(range(0, t_days)):
    target_file = os.path.join(local_folder, 'target_{}.csv'.format(day))
    dstore.upload_files([target_file],
          target_path=remote_data_path,
          overwrite=True,
          show_progress=True)

### Register DataSets

<font size=2>Create and Register Datasets.</font>

In [None]:
from azureml.core import Datastore, Dataset
dstore = ws.get_default_datastore()

# Create a dataset from the uploaded baseline file and register it in the workspace.
xpath = remote_data_path + '/' + baseline_file
toregister_baseline = Dataset.from_delimited_files(dstore.path(xpath))
registered_baseline = toregister_baseline.register(workspace = ws,
                                                   name = 'dataset baseline for diff demo',
                                                   description = 'dataset baseline for diff comparison',
                                                   exist_ok = True,
                                                   update_if_exist = True
                                                  )

# Create and register one dataset per uploaded target file, keyed by day offset.
registered_targets = {}
for day in tqdm(range(0, t_days)):
    target_file = 'target_{}.csv'.format(day)
    toregister_target = Dataset.from_delimited_files(dstore.path(remote_data_path + '/' + target_file))
    registered_target = toregister_target.register(workspace = ws,
                                                   name = 'dataset target-{} for diff demo'.format(day),
                                                   description = 'target target-{} for diff comparison'.format(day),
                                                   exist_ok = True,
                                                   update_if_exist = True
                                                  )
    registered_targets[day] = registered_target

## Calculate Dataset Diff Remotely

<font size=2>Perform the calculation remotely. This may take 20 minutes.</font>


In [None]:
# Remote diff results per date, plus every measurement and column name seen in them.
remote_diffs = {}

r_columns = set()
r_measurements = set()

for day, registered_target in registered_targets.items():
    # Results are keyed by the calendar date the target represents.
    dt = (start_date + timedelta(days=day)).strftime("%Y-%m-%d")

    # Run the comparison on the default CPU cluster and wait for it to finish.
    remote_diff = registered_baseline.diff(registered_target, compute_target=ws.DEFAULT_CPU_CLUSTER_NAME)
    remote_diff.wait_for_completion()

    remote_diffs[dt] = remote_diff.get_result()

    # set.add() already ignores duplicates, so no membership test is needed.
    for r in remote_diff.get_result():
        r_measurements.add(r.name)
        if "column_name" in r.extended_properties:
            r_columns.add(r.extended_properties["column_name"])

## Parse And Present Remote Execution Results

### Parse And List Column Level Diff Results

<font size=2>This cell iterates over the results and lists every measurement calculated for each column.</font>

In [None]:
from pandas import DataFrame

# Inputs for the drift graphs, filled while the daily tables are listed.
r_dates = []
r_drift_metrics = {}
r_drift_contributions = {}
r_summary_contribute = {}
r_bottoms_contribute = []

for dt, rst in remote_diffs.items():
    r_dates.append(dt)
    print("\n---------------------------------------- Result of {} ----------------------------------------".format(dt))

    daily_result, drift, daily_contribution = parse_result(rst, r_columns, r_measurements)
    r_drift_metrics[dt] = drift
    r_drift_contributions[dt] = daily_contribution

    # Every stacked bar starts at 0; the day's total is the sum of all
    # column contributions.
    r_bottoms_contribute.append(0)
    sum_contribution = sum(daily_contribution.values())
    r_summary_contribute[dt] = sum_contribution

    # Show the measurement-by-column table for this date.
    display.display(pd.DataFrame(daily_result))

### Present Dataset Level Diff (aka drift) In Graphs

<font size=2>The left graph presents dataset level difference for all compared baseline-target pairs, the right graph presents dataset level difference contribution for each column so that we know which column impacts more.</font>

In [None]:
# Plot the drift trend and the per-column drift contribution for the remote results.
show_diff(r_drift_metrics, r_dates, r_columns, r_drift_contributions, r_summary_contribute, r_bottoms_contribute)

## Clean Resources Created

In [None]:
# Delete the compute cluster only when this notebook created it
# (del_cmpt is set to True by the compute-creation cell in that case).
if del_cmpt:
    try:
        aml_compute.delete()
        # Waiting on a deleted target is expected to end with a
        # 'ComputeTargetNotFound' error, which is treated as success below.
        aml_compute.wait_for_completion()
    except Exception as e:
        # Generic Python 3 exceptions have no `.message` attribute, so fall
        # back to the exception's string form.
        error_text = str(getattr(e, 'message', None) or e)
        if 'ComputeTargetNotFound' in error_text:
            print("Compute target deleted.")
            del_cmpt = False
        else:
            # Report unexpected failures instead of hiding them; the cluster
            # may still exist and keep incurring cost.
            print("Failed to delete compute target: {}".format(error_text))

# Reference

<font size=2>Detailed description of Dataset Diff attribute can be found at</font><br>
https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.dataset(class)?view=azure-ml-py#diff-rhs-dataset--compute-target-none--columns-none-