Compare commits

...

10 Commits

Author SHA1 Message Date
vizhur
879a272a8d update samples from Release-52 as a part of SDK release 2020-05-18 19:21:05 +00:00
Harneet Virk
bc65bde097 Merge pull request #971 from Azure/release_update/Release-51
update samples from Release-51 as a part of  SDK release
2020-05-13 22:17:45 -07:00
vizhur
690bdfbdbe update samples from Release-51 as a part of SDK release 2020-05-14 05:03:47 +00:00
Harneet Virk
3c02bd8782 Merge pull request #967 from Azure/release_update/Release-50
update samples from Release-50 as a part of  SDK release
2020-05-12 19:57:40 -07:00
vizhur
5c14610a1c update samples from Release-50 as a part of SDK release 2020-05-13 02:45:40 +00:00
Harneet Virk
4e3afae6fb Merge pull request #965 from Azure/release_update/Release-49
update samples from Release-49 as a part of  SDK release
2020-05-11 19:25:28 -07:00
vizhur
a2144aa083 update samples from Release-49 as a part of SDK release 2020-05-12 02:24:34 +00:00
Harneet Virk
0e6334178f Merge pull request #963 from Azure/release_update/Release-46
update samples from Release-46 as a part of  SDK release
2020-05-11 14:49:34 -07:00
vizhur
4ec9178d22 update samples from Release-46 as a part of SDK release 2020-05-11 21:48:31 +00:00
Harneet Virk
2aa7c53b0c Merge pull request #962 from Azure/release_update_stablev2/Release-11
update samples from Release-11 as a part of 1.5.0 SDK stable release
2020-05-11 12:42:32 -07:00
63 changed files with 9381 additions and 5788 deletions

View File

@@ -1,4 +1,4 @@
name: automl_env_master
name: azure_automl
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
@@ -12,7 +12,6 @@ dependencies:
- scipy==1.4.1
- scikit-learn>=0.19.0,<=0.20.3
- pandas>=0.22.0,<=0.23.4
- testpath=0.3.1
- py-xgboost<=0.90
- conda-forge::fbprophet==0.5
- pytorch::pytorch=1.4.0
@@ -20,14 +19,13 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- --extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/master/588E708E0DF342C4A80BD954289657CF
- --extra-index-url https://dataprepdownloads.azureedge.net/pypi/weekly-rc-932B96D048E011E8B56608/latest/
- azureml-defaults<0.1.50
- azureml-dataprep[pandas]
- azureml-train-automl<0.1.50
- azureml-train<0.1.50
- azureml-widgets<0.1.50
- azureml-pipeline<0.1.50
- azureml-defaults
- azureml-train-automl
- azureml-train
- azureml-widgets
- azureml-pipeline
- pytorch-transformers==1.0.0
- spacy==2.1.8
- pyarrow==0.17.0
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz

View File

@@ -12,20 +12,20 @@ dependencies:
- urllib3<1.24
- scipy==1.4.1
- scikit-learn>=0.19.0,<=0.20.3
- pandas>=0.22.0,<0.23.0
- py-xgboost<=0.80
- pandas>=0.22.0,<=0.23.4
- py-xgboost<=0.90
- conda-forge::fbprophet==0.5
- pytorch::pytorch=1.4.0
- cudatoolkit=10.1.243
- cudatoolkit=9.0
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- azureml-dataprep[pandas]
- azureml-train-automl
- azureml-train
- azureml-widgets
- azureml-pipeline
- pytorch-transformers==1.0.0
- spacy==2.1.8
- pyarrow==0.17.0
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz

View File

@@ -1,23 +0,0 @@
-- This example uses the AutoMLForecast stored procedure to make predictions with a forecasting model for the nyc_energy dataset.
DECLARE @Model NVARCHAR(MAX) = (SELECT TOP 1 Model FROM dbo.aml_model
WHERE ExperimentName = 'automl-sql-forecast'
ORDER BY CreatedDate DESC)
DECLARE @max_horizon INT = 48
DECLARE @split_time NVARCHAR(22) = (SELECT DATEADD(hour, -@max_horizon, MAX(timeStamp)) FROM nyc_energy WHERE demand IS NOT NULL)
DECLARE @TestDataQuery NVARCHAR(MAX) = '
SELECT CAST(timeStamp AS NVARCHAR(30)) AS timeStamp,
demand,
precip,
temp
FROM nyc_energy
WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL
AND timeStamp > ''' + @split_time + ''''
EXEC dbo.AutoMLForecast @input_query=@TestDataQuery,
@label_column='demand',
@time_column_name='timeStamp',
@model=@model
WITH RESULT SETS ((timeStamp DATETIME, grain NVARCHAR(255), predicted_demand FLOAT, precip FLOAT, temp FLOAT, actual_demand FLOAT))

View File

@@ -1,10 +0,0 @@
-- This lists all the metrics for all iterations of the most recent run.
DECLARE @RunId NVARCHAR(43)
DECLARE @ExperimentName NVARCHAR(255)
SELECT TOP 1 @ExperimentName=ExperimentName, @RunId=SUBSTRING(RunId, 1, 43)
FROM aml_model
ORDER BY CreatedDate DESC
EXEC dbo.AutoMLGetMetrics @RunId, @ExperimentName
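AutoMLGetMetrics returns its results in long format, one row per iteration/metric pair. As a minimal sketch (not part of the samples), the result set could be captured into a temporary table on the caller's side and filtered to a single metric; the #metrics table and the chosen metric name below are illustrative assumptions.

-- Sketch only: #metrics and the metric name are illustrative, not part of the shipped samples.
DECLARE @RunId NVARCHAR(43)
DECLARE @ExperimentName NVARCHAR(255)
SELECT TOP 1 @ExperimentName=ExperimentName, @RunId=SUBSTRING(RunId, 1, 43)
FROM aml_model
ORDER BY CreatedDate DESC
CREATE TABLE #metrics (iteration INT, metric_name NVARCHAR(100), metric_value FLOAT)
INSERT INTO #metrics (iteration, metric_name, metric_value)
EXEC dbo.AutoMLGetMetrics @RunId, @ExperimentName
-- Keep only the primary metric used by the forecasting samples and rank the iterations by it.
SELECT iteration, metric_value
FROM #metrics
WHERE metric_name = 'normalized_root_mean_squared_error'
ORDER BY metric_value ASC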

View File

@@ -1,25 +0,0 @@
-- This example uses the AutoMLTrain stored procedure to create a forecasting model for the nyc_energy dataset.
DECLARE @max_horizon INT = 48
DECLARE @split_time NVARCHAR(22) = (SELECT DATEADD(hour, -@max_horizon, MAX(timeStamp)) FROM nyc_energy WHERE demand IS NOT NULL)
DECLARE @TrainDataQuery NVARCHAR(MAX) = '
SELECT CAST(timeStamp as NVARCHAR(30)) as timeStamp,
demand,
precip,
temp
FROM nyc_energy
WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL
and timeStamp < ''' + @split_time + ''''
INSERT INTO dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
EXEC dbo.AutoMLTrain @input_query= @TrainDataQuery,
@label_column='demand',
@task='forecasting',
@iterations=10,
@iteration_timeout_minutes=5,
@time_column_name='timeStamp',
@max_horizon=@max_horizon,
@experiment_name='automl-sql-forecast',
@primary_metric='normalized_root_mean_squared_error'

View File

@@ -1,161 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train a model and use it for prediction\r\n",
"\r\n",
"Before running this notebook, run the auto-ml-sql-setup.ipynb notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/sql-server/energy-demand/auto-ml-sql-energy-demand.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set the default database"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"USE [automl]\r\n",
"GO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use the AutoMLTrain stored procedure to create a forecasting model for the nyc_energy dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"INSERT INTO dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
"EXEC dbo.AutoMLTrain @input_query='\r\n",
"SELECT CAST(timeStamp as NVARCHAR(30)) as timeStamp,\r\n",
" demand,\r\n",
"\t precip,\r\n",
"\t temp,\r\n",
"\t CASE WHEN timeStamp < ''2017-01-01'' THEN 0 ELSE 1 END AS is_validate_column\r\n",
"FROM nyc_energy\r\n",
"WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL\r\n",
"and timeStamp < ''2017-02-01''',\r\n",
"@label_column='demand',\r\n",
"@task='forecasting',\r\n",
"@iterations=10,\r\n",
"@iteration_timeout_minutes=5,\r\n",
"@time_column_name='timeStamp',\r\n",
"@is_validate_column='is_validate_column',\r\n",
"@experiment_name='automl-sql-forecast',\r\n",
"@primary_metric='normalized_root_mean_squared_error'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use the AutoMLPredict stored procedure to predict using the forecasting model for the nyc_energy dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DECLARE @Model NVARCHAR(MAX) = (SELECT TOP 1 Model FROM dbo.aml_model\r\n",
" WHERE ExperimentName = 'automl-sql-forecast'\r\n",
"\t\t\t\t\t\t\t\tORDER BY CreatedDate DESC)\r\n",
"\r\n",
"EXEC dbo.AutoMLPredict @input_query='\r\n",
"SELECT CAST(timeStamp AS NVARCHAR(30)) AS timeStamp,\r\n",
" demand,\r\n",
"\t precip,\r\n",
"\t temp\r\n",
"FROM nyc_energy\r\n",
"WHERE demand IS NOT NULL AND precip IS NOT NULL AND temp IS NOT NULL\r\n",
"AND timeStamp >= ''2017-02-01''',\r\n",
"@label_column='demand',\r\n",
"@model=@model\r\n",
"WITH RESULT SETS ((timeStamp NVARCHAR(30), actual_demand FLOAT, precip FLOAT, temp FLOAT, predicted_demand FLOAT))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## List all the metrics for all iterations for the most recent training run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DECLARE @RunId NVARCHAR(43)\r\n",
"DECLARE @ExperimentName NVARCHAR(255)\r\n",
"\r\n",
"SELECT TOP 1 @ExperimentName=ExperimentName, @RunId=SUBSTRING(RunId, 1, 43)\r\n",
"FROM aml_model\r\n",
"ORDER BY CreatedDate DESC\r\n",
"\r\n",
"EXEC dbo.AutoMLGetMetrics @RunId, @ExperimentName"
]
}
],
"metadata": {
"authors": [
{
"name": "jeffshep"
}
],
"category": "tutorial",
"compute": [
"Local"
],
"datasets": [
"NYC Energy"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"Azure ML AutoML"
],
"tags": [
""
],
"friendly_name": "Forecasting with automated ML SQL integration",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.6",
"language": "sql",
"name": "python36"
},
"language_info": {
"name": "sql",
"version": ""
},
"task": "Forecasting"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,92 +0,0 @@
-- This procedure forecasts values based on a forecasting model returned by AutoMLTrain.
-- It returns a dataset with the forecasted values.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLForecast]
(
@input_query NVARCHAR(MAX), -- A SQL query returning data to predict on.
@model NVARCHAR(MAX), -- A model returned from AutoMLTrain.
@time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.
@label_column NVARCHAR(255)='', -- Optional name of the column from input_query, which should be ignored when predicting
@y_query_column NVARCHAR(255)='', -- Optional value column that can be used for predicting.
-- If specified, this can contain values for past times (after the model was trained)
-- and contain NaN for future times.
@forecast_column_name NVARCHAR(255) = 'predicted'
-- The name of the output column containing the forecast value.
) AS
BEGIN
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import azureml.core
import numpy as np
from azureml.train.automl import AutoMLConfig
import pickle
import codecs
model_obj = pickle.loads(codecs.decode(model.encode(), "base64"))
test_data = input_data.copy()
if label_column != "" and label_column is not None:
y_test = test_data.pop(label_column).values
else:
y_test = None
if y_query_column != "" and y_query_column is not None:
y_query = test_data.pop(y_query_column).values
else:
y_query = np.repeat(np.nan, len(test_data))
X_test = test_data
if time_column_name != "" and time_column_name is not None:
X_test[time_column_name] = pd.to_datetime(X_test[time_column_name])
y_fcst, X_trans = model_obj.forecast(X_test, y_query)
def align_outputs(y_forecast, X_trans, X_test, y_test, forecast_column_name):
# Demonstrates how to get the output aligned to the inputs
# using pandas indexes. Helps understand what happened if
# the output shape differs from the input shape, or if
# the data got re-sorted by time and grain during forecasting.
# Typical causes of misalignment are:
# * we predicted some periods that were missing in actuals -> drop from eval
# * model was asked to predict past max_horizon -> increase max horizon
# * data at start of X_test was needed for lags -> provide previous periods
df_fcst = pd.DataFrame({forecast_column_name : y_forecast})
# y and X outputs are aligned by forecast() function contract
df_fcst.index = X_trans.index
# align original X_test to y_test
X_test_full = X_test.copy()
if y_test is not None:
X_test_full[label_column] = y_test
# X_test_full does not include origin, so reset for merge
df_fcst.reset_index(inplace=True)
X_test_full = X_test_full.reset_index().drop(columns=''index'')
together = df_fcst.merge(X_test_full, how=''right'')
# drop rows where prediction or actuals are nan
# happens because of missing actuals
# or at edges of time due to lags/rolling windows
clean = together[together[[label_column, forecast_column_name]].notnull().all(axis=1)]
return(clean)
combined_output = align_outputs(y_fcst, X_trans, X_test, y_test, forecast_column_name)
'
, @input_data_1 = @input_query
, @input_data_1_name = N'input_data'
, @output_data_1_name = N'combined_output'
, @params = N'@model NVARCHAR(MAX), @time_column_name NVARCHAR(255), @label_column NVARCHAR(255), @y_query_column NVARCHAR(255), @forecast_column_name NVARCHAR(255)'
, @model = @model
, @time_column_name = @time_column_name
, @label_column = @label_column
, @y_query_column = @y_query_column
, @forecast_column_name = @forecast_column_name
END

View File

@@ -1,70 +0,0 @@
-- This procedure returns a list of metrics for each iteration of a run.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLGetMetrics]
(
@run_id NVARCHAR(250), -- The RunId
@experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.
@connection_name NVARCHAR(255)='default' -- The AML connection to use.
) AS
BEGIN
DECLARE @tenantid NVARCHAR(255)
DECLARE @appid NVARCHAR(255)
DECLARE @password NVARCHAR(255)
DECLARE @config_file NVARCHAR(255)
SELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile
FROM aml_connection
WHERE ConnectionName = @connection_name;
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import logging
import azureml.core
import numpy as np
from azureml.core.experiment import Experiment
from azureml.train.automl.run import AutoMLRun
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.workspace import Workspace
auth = ServicePrincipalAuthentication(tenantid, appid, password)
ws = Workspace.from_config(path=config_file, auth=auth)
experiment = Experiment(ws, experiment_name)
ml_run = AutoMLRun(experiment = experiment, run_id = run_id)
children = list(ml_run.get_children())
iterationlist = []
metricnamelist = []
metricvaluelist = []
for run in children:
properties = run.get_properties()
if "iteration" in properties:
iteration = int(properties["iteration"])
for metric_name, metric_value in run.get_metrics().items():
if isinstance(metric_value, float):
iterationlist.append(iteration)
metricnamelist.append(metric_name)
metricvaluelist.append(metric_value)
metrics = pd.DataFrame({"iteration": iterationlist, "metric_name": metricnamelist, "metric_value": metricvaluelist})
'
, @output_data_1_name = N'metrics'
, @params = N'@run_id NVARCHAR(250),
@experiment_name NVARCHAR(32),
@tenantid NVARCHAR(255),
@appid NVARCHAR(255),
@password NVARCHAR(255),
@config_file NVARCHAR(255)'
, @run_id = @run_id
, @experiment_name = @experiment_name
, @tenantid = @tenantid
, @appid = @appid
, @password = @password
, @config_file = @config_file
WITH RESULT SETS ((iteration INT, metric_name NVARCHAR(100), metric_value FLOAT))
END

View File

@@ -1,41 +0,0 @@
-- This procedure predicts values based on a model returned by AutoMLTrain and a dataset.
-- It returns the dataset with a new column added, which is the predicted value.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLPredict]
(
@input_query NVARCHAR(MAX), -- A SQL query returning data to predict on.
@model NVARCHAR(MAX), -- A model returned from AutoMLTrain.
@label_column NVARCHAR(255)='' -- Optional name of the column from input_query, which should be ignored when predicting
) AS
BEGIN
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import azureml.core
import numpy as np
from azureml.train.automl import AutoMLConfig
import pickle
import codecs
model_obj = pickle.loads(codecs.decode(model.encode(), "base64"))
test_data = input_data.copy()
if label_column != "" and label_column is not None:
y_test = test_data.pop(label_column).values
X_test = test_data
predicted = model_obj.predict(X_test)
combined_output = input_data.assign(predicted=predicted)
'
, @input_data_1 = @input_query
, @input_data_1_name = N'input_data'
, @output_data_1_name = N'combined_output'
, @params = N'@model NVARCHAR(MAX), @label_column NVARCHAR(255)'
, @model = @model
, @label_column = @label_column
END

View File

@@ -1,240 +0,0 @@
-- This stored procedure uses automated machine learning to train several models
-- and returns the best model.
--
-- The result set has several columns:
-- best_run - iteration ID for the best model
-- experiment_name - experiment name passed in with the @experiment_name parameter
-- fitted_model - best model found
-- log_file_text - AutoML debug_log contents
-- workspace - name of the Azure ML workspace where run history is stored
--
-- An example call for a classification problem is:
-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
-- exec dbo.AutoMLTrain @input_query='
-- SELECT top 100000
-- CAST([pickup_datetime] AS NVARCHAR(30)) AS pickup_datetime
-- ,CAST([dropoff_datetime] AS NVARCHAR(30)) AS dropoff_datetime
-- ,[passenger_count]
-- ,[trip_time_in_secs]
-- ,[trip_distance]
-- ,[payment_type]
-- ,[tip_class]
-- FROM [dbo].[nyctaxi_sample] order by [hack_license] ',
-- @label_column = 'tip_class',
-- @iterations=10
--
-- An example call for forecasting is:
-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
-- exec dbo.AutoMLTrain @input_query='
-- select cast(timeStamp as nvarchar(30)) as timeStamp,
-- demand,
-- precip,
-- temp,
-- case when timeStamp < ''2017-01-01'' then 0 else 1 end as is_validate_column
-- from nyc_energy
-- where demand is not null and precip is not null and temp is not null
-- and timeStamp < ''2017-02-01''',
-- @label_column='demand',
-- @task='forecasting',
-- @iterations=10,
-- @iteration_timeout_minutes=5,
-- @time_column_name='timeStamp',
-- @is_validate_column='is_validate_column',
-- @experiment_name='automl-sql-forecast',
-- @primary_metric='normalized_root_mean_squared_error'
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLTrain]
(
@input_query NVARCHAR(MAX), -- The SQL Query that will return the data to train and validate the model.
@label_column NVARCHAR(255)='Label', -- The name of the column in the result of @input_query that is the label.
@primary_metric NVARCHAR(40)='AUC_weighted', -- The metric to optimize.
@iterations INT=100, -- The maximum number of pipelines to train.
@task NVARCHAR(40)='classification', -- The type of task. Can be classification, regression or forecasting.
@experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.
@iteration_timeout_minutes INT = 15, -- The maximum time in minutes for training a single pipeline.
@experiment_timeout_hours FLOAT = 1, -- The maximum time in hours for training all pipelines.
@n_cross_validations INT = 3, -- The number of cross validations.
@blacklist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that will not be used.
-- The list of possible models can be found at:
-- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
@whitelist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that can be used.
-- The list of possible models can be found at:
-- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
@experiment_exit_score FLOAT = 0, -- Stop the experiment if this score is achieved.
@sample_weight_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that gives a sample weight.
@is_validate_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that indicates if the row is for training or validation.
-- In the values of the column, 0 means for training and 1 means for validation.
@time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.
@connection_name NVARCHAR(255)='default', -- The AML connection to use.
@max_horizon INT = 0 -- A forecast horizon is a time span into the future (or just beyond the latest date in the training data)
-- where forecasts of the target quantity are needed.
-- For example, if data is recorded daily and max_horizon is 5, we will predict 5 days ahead.
) AS
BEGIN
DECLARE @tenantid NVARCHAR(255)
DECLARE @appid NVARCHAR(255)
DECLARE @password NVARCHAR(255)
DECLARE @config_file NVARCHAR(255)
SELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile
FROM aml_connection
WHERE ConnectionName = @connection_name;
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import logging
import azureml.core
import pandas as pd
import numpy as np
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
from sklearn import datasets
import pickle
import codecs
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.workspace import Workspace
if __name__.startswith("sqlindb"):
auth = ServicePrincipalAuthentication(tenantid, appid, password)
ws = Workspace.from_config(path=config_file, auth=auth)
project_folder = "./sample_projects/" + experiment_name
experiment = Experiment(ws, experiment_name)
data_train = input_data
X_valid = None
y_valid = None
sample_weight_valid = None
if is_validate_column != "" and is_validate_column is not None:
data_train = input_data[input_data[is_validate_column] <= 0]
data_valid = input_data[input_data[is_validate_column] > 0]
data_train.pop(is_validate_column)
data_valid.pop(is_validate_column)
y_valid = data_valid.pop(label_column).values
if sample_weight_column != "" and sample_weight_column is not None:
sample_weight_valid = data_valid.pop(sample_weight_column).values
X_valid = data_valid
n_cross_validations = None
y_train = data_train.pop(label_column).values
sample_weight = None
if sample_weight_column != "" and sample_weight_column is not None:
sample_weight = data_train.pop(sample_weight_column).values
X_train = data_train
if experiment_timeout_hours == 0:
experiment_timeout_hours = None
if experiment_exit_score == 0:
experiment_exit_score = None
if blacklist_models == "":
blacklist_models = None
if blacklist_models is not None:
blacklist_models = blacklist_models.replace(" ", "").split(",")
if whitelist_models == "":
whitelist_models = None
if whitelist_models is not None:
whitelist_models = whitelist_models.replace(" ", "").split(",")
automl_settings = {}
preprocess = True
if time_column_name != "" and time_column_name is not None:
automl_settings = { "time_column_name": time_column_name }
preprocess = False
if max_horizon > 0:
automl_settings["max_horizon"] = max_horizon
log_file_name = "automl_sqlindb_errors.log"
automl_config = AutoMLConfig(task = task,
debug_log = log_file_name,
primary_metric = primary_metric,
iteration_timeout_minutes = iteration_timeout_minutes,
experiment_timeout_hours = experiment_timeout_hours,
iterations = iterations,
n_cross_validations = n_cross_validations,
preprocess = preprocess,
verbosity = logging.INFO,
X = X_train,
y = y_train,
path = project_folder,
blacklist_models = blacklist_models,
whitelist_models = whitelist_models,
experiment_exit_score = experiment_exit_score,
sample_weight = sample_weight,
X_valid = X_valid,
y_valid = y_valid,
sample_weight_valid = sample_weight_valid,
**automl_settings)
local_run = experiment.submit(automl_config, show_output = True)
best_run, fitted_model = local_run.get_output()
pickled_model = codecs.encode(pickle.dumps(fitted_model), "base64").decode()
log_file_text = ""
try:
with open(log_file_name, "r") as log_file:
log_file_text = log_file.read()
except:
log_file_text = "Log file not found"
returned_model = pd.DataFrame({"best_run": [best_run.id], "experiment_name": [experiment_name], "fitted_model": [pickled_model], "log_file_text": [log_file_text], "workspace": [ws.name]}, dtype=np.dtype(np.str))
'
, @input_data_1 = @input_query
, @input_data_1_name = N'input_data'
, @output_data_1_name = N'returned_model'
, @params = N'@label_column NVARCHAR(255),
@primary_metric NVARCHAR(40),
@iterations INT, @task NVARCHAR(40),
@experiment_name NVARCHAR(32),
@iteration_timeout_minutes INT,
@experiment_timeout_hours FLOAT,
@n_cross_validations INT,
@blacklist_models NVARCHAR(MAX),
@whitelist_models NVARCHAR(MAX),
@experiment_exit_score FLOAT,
@sample_weight_column NVARCHAR(255),
@is_validate_column NVARCHAR(255),
@time_column_name NVARCHAR(255),
@tenantid NVARCHAR(255),
@appid NVARCHAR(255),
@password NVARCHAR(255),
@config_file NVARCHAR(255),
@max_horizon INT'
, @label_column = @label_column
, @primary_metric = @primary_metric
, @iterations = @iterations
, @task = @task
, @experiment_name = @experiment_name
, @iteration_timeout_minutes = @iteration_timeout_minutes
, @experiment_timeout_hours = @experiment_timeout_hours
, @n_cross_validations = @n_cross_validations
, @blacklist_models = @blacklist_models
, @whitelist_models = @whitelist_models
, @experiment_exit_score = @experiment_exit_score
, @sample_weight_column = @sample_weight_column
, @is_validate_column = @is_validate_column
, @time_column_name = @time_column_name
, @tenantid = @tenantid
, @appid = @appid
, @password = @password
, @config_file = @config_file
, @max_horizon = @max_horizon
WITH RESULT SETS ((best_run NVARCHAR(250), experiment_name NVARCHAR(100), fitted_model VARCHAR(MAX), log_file_text NVARCHAR(MAX), workspace NVARCHAR(100)))
END

View File

@@ -1,18 +0,0 @@
-- This is a table to store the Azure ML connection information.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[aml_connection](
[Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,
[ConnectionName] [nvarchar](255) NULL,
[TenantId] [nvarchar](255) NULL,
[AppId] [nvarchar](255) NULL,
[Password] [nvarchar](255) NULL,
[ConfigFile] [nvarchar](255) NULL
) ON [PRIMARY]
GO
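The AutoMLTrain and AutoMLGetMetrics procedures look up a row in this table by ConnectionName (their @connection_name parameter defaults to 'default'). A minimal sketch of populating the table, reusing the placeholder values from the setup notebook later in this changeset:

-- Placeholder values; substitute the tenant id, app id, password, and config.json path for your workspace.
INSERT INTO [dbo].[aml_connection] (ConnectionName, TenantId, AppId, Password, ConfigFile)
VALUES (N'default',
        N'11111111-2222-3333-4444-555555555555', -- Tenant
        N'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee', -- AppId
        N'insertpasswordhere',                   -- Password
        N'/tmp/aml/config.json');                -- Path to the workspace config file
GO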

View File

@@ -1,22 +0,0 @@
-- This is a table to hold the results from the AutoMLTrain procedure.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[aml_model](
[Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,
[Model] [varchar](max) NOT NULL, -- The model, which can be passed to AutoMLPredict for testing or prediction.
[RunId] [nvarchar](250) NULL, -- The RunId, which can be used to view the model in the Azure Portal.
[CreatedDate] [datetime] NULL,
[ExperimentName] [nvarchar](100) NULL, -- Azure ML Experiment Name
[WorkspaceName] [nvarchar](100) NULL, -- Azure ML Workspace Name
[LogFileText] [nvarchar](max) NULL
)
GO
ALTER TABLE [dbo].[aml_model] ADD DEFAULT (getutcdate()) FOR [CreatedDate]
GO

View File

@@ -1,581 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Set up Azure ML Automated Machine Learning on SQL Server 2019 CTP 2.4 big data cluster\r\n",
"\r\n",
"\\# Prerequisites: \r\n",
"\\# - An Azure subscription and resource group \r\n",
"\\# - An Azure Machine Learning workspace \r\n",
"\\# - A SQL Server 2019 CTP 2.4 big data cluster with Internet access and a database named 'automl' \r\n",
"\\# - Azure CLI \r\n",
"\\# - kubectl command \r\n",
"\\# - The https://github.com/Azure/MachineLearningNotebooks repository downloaded (cloned) to your local machine\r\n",
"\r\n",
"\\# In the 'automl' database, create a table named 'dbo.nyc_energy' as follows: \r\n",
"\\# - In SQL Server Management Studio, right-click the 'automl' database, select Tasks, then Import Flat File. \r\n",
"\\# - Select the file AzureMlCli\\notebooks\\how-to-use-azureml\\automated-machine-learning\\forecasting-energy-demand\\nyc_energy.csv. \r\n",
"\\# - Using the \"Modify Columns\" page, allow nulls for all columns. \r\n",
"\r\n",
"\\# Create an Azure Machine Learning Workspace using the instructions at https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace \r\n",
"\r\n",
"\\# Create an Azure service principal. You can do this with the following commands: \r\n",
"\r\n",
"az login \r\n",
"az account set --subscription *subscriptionid* \r\n",
"\r\n",
"\\# The following command prints out the **appId** and **tenant**, \r\n",
"\\# which you insert into the indicated cell later in this notebook \r\n",
"\\# to allow AutoML to authenticate with Azure: \r\n",
"\r\n",
"az ad sp create-for-rbac --name *principlename* --password *password*\r\n",
"\r\n",
"\\# Log into the master instance of SQL Server 2019 CTP 2.4: \r\n",
"kubectl exec -it mssql-master-pool-0 -n *clustername* -c mssql-server -- /bin/bash\r\n",
"\r\n",
"mkdir /tmp/aml\r\n",
"\r\n",
"cd /tmp/aml\r\n",
"\r\n",
"\\# **Modify** the following with your subscription_id, resource_group, and workspace_name: \r\n",
"cat > config.json << EOF \r\n",
"{ \r\n",
" \"subscription_id\": \"123456ab-78cd-0123-45ef-abcd12345678\", \r\n",
" \"resource_group\": \"myrg1\", \r\n",
" \"workspace_name\": \"myws1\" \r\n",
"} \r\n",
"EOF\r\n",
"\r\n",
"\\# The directory referenced below is appropriate for the master instance of SQL Server 2019 CTP 2.4.\r\n",
"\r\n",
"cd /opt/mssql/mlservices/runtime/python/bin\r\n",
"\r\n",
"./python -m pip install azureml-sdk[automl]\r\n",
"\r\n",
"./python -m pip install --upgrade numpy \r\n",
"\r\n",
"./python -m pip install --upgrade sklearn\r\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/sql-server/setup/auto-ml-sql-setup.png)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- Enable external scripts to allow invoking Python\r\n",
"sp_configure 'external scripts enabled',1 \r\n",
"reconfigure with override \r\n",
"GO\r\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- Use database 'automl'\r\n",
"USE [automl]\r\n",
"GO"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This is a table to hold the Azure ML connection information.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"\r\n",
"CREATE TABLE [dbo].[aml_connection](\r\n",
" [Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,\r\n",
"\t[ConnectionName] [nvarchar](255) NULL,\r\n",
"\t[TenantId] [nvarchar](255) NULL,\r\n",
"\t[AppId] [nvarchar](255) NULL,\r\n",
"\t[Password] [nvarchar](255) NULL,\r\n",
"\t[ConfigFile] [nvarchar](255) NULL\r\n",
") ON [PRIMARY]\r\n",
"GO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Copy the values from create-for-rbac above into the cell below"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- Use the following values:\r\n",
"-- Leave the name as 'Default'\r\n",
"-- Insert <tenant> returned by create-for-rbac above\r\n",
"-- Insert <AppId> returned by create-for-rbac above\r\n",
"-- Insert <password> used in create-for-rbac above\r\n",
"-- Leave <path> as '/tmp/aml/config.json'\r\n",
"INSERT INTO [dbo].[aml_connection] \r\n",
"VALUES (\r\n",
" N'Default', -- Name\r\n",
" N'11111111-2222-3333-4444-555555555555', -- Tenant\r\n",
" N'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee', -- AppId\r\n",
" N'insertpasswordhere', -- Password\r\n",
" N'/tmp/aml/config.json' -- Path\r\n",
" );\r\n",
"GO"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This is a table to hold the results from the AutoMLTrain procedure.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"\r\n",
"CREATE TABLE [dbo].[aml_model](\r\n",
" [Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,\r\n",
" [Model] [varchar](max) NOT NULL, -- The model, which can be passed to AutoMLPredict for testing or prediction.\r\n",
" [RunId] [nvarchar](250) NULL, -- The RunId, which can be used to view the model in the Azure Portal.\r\n",
" [CreatedDate] [datetime] NULL,\r\n",
" [ExperimentName] [nvarchar](100) NULL, -- Azure ML Experiment Name\r\n",
" [WorkspaceName] [nvarchar](100) NULL, -- Azure ML Workspace Name\r\n",
"\t[LogFileText] [nvarchar](max) NULL\r\n",
") \r\n",
"GO\r\n",
"\r\n",
"ALTER TABLE [dbo].[aml_model] ADD DEFAULT (getutcdate()) FOR [CreatedDate]\r\n",
"GO\r\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This stored procedure uses automated machine learning to train several models\r\n",
"-- and return the best model.\r\n",
"--\r\n",
"-- The result set has several columns:\r\n",
"-- best_run - ID of the best model found\r\n",
"-- experiment_name - training run name\r\n",
"-- fitted_model - best model found\r\n",
"-- log_file_text - console output\r\n",
"-- workspace - name of the Azure ML workspace where run history is stored\r\n",
"--\r\n",
"-- An example call for a classification problem is:\r\n",
"-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
"-- exec dbo.AutoMLTrain @input_query='\r\n",
"-- SELECT top 100000 \r\n",
"-- CAST([pickup_datetime] AS NVARCHAR(30)) AS pickup_datetime\r\n",
"-- ,CAST([dropoff_datetime] AS NVARCHAR(30)) AS dropoff_datetime\r\n",
"-- ,[passenger_count]\r\n",
"-- ,[trip_time_in_secs]\r\n",
"-- ,[trip_distance]\r\n",
"-- ,[payment_type]\r\n",
"-- ,[tip_class]\r\n",
"-- FROM [dbo].[nyctaxi_sample] order by [hack_license] ',\r\n",
"-- @label_column = 'tip_class',\r\n",
"-- @iterations=10\r\n",
"-- \r\n",
"-- An example call for forecasting is:\r\n",
"-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
"-- exec dbo.AutoMLTrain @input_query='\r\n",
"-- select cast(timeStamp as nvarchar(30)) as timeStamp,\r\n",
"-- demand,\r\n",
"-- \t precip,\r\n",
"-- \t temp,\r\n",
"-- case when timeStamp < ''2017-01-01'' then 0 else 1 end as is_validate_column\r\n",
"-- from nyc_energy\r\n",
"-- where demand is not null and precip is not null and temp is not null\r\n",
"-- and timeStamp < ''2017-02-01''',\r\n",
"-- @label_column='demand',\r\n",
"-- @task='forecasting',\r\n",
"-- @iterations=10,\r\n",
"-- @iteration_timeout_minutes=5,\r\n",
"-- @time_column_name='timeStamp',\r\n",
"-- @is_validate_column='is_validate_column',\r\n",
"-- @experiment_name='automl-sql-forecast',\r\n",
"-- @primary_metric='normalized_root_mean_squared_error'\r\n",
"\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLTrain]\r\n",
" (\r\n",
" @input_query NVARCHAR(MAX), -- The SQL Query that will return the data to train and validate the model.\r\n",
" @label_column NVARCHAR(255)='Label', -- The name of the column in the result of @input_query that is the label.\r\n",
" @primary_metric NVARCHAR(40)='AUC_weighted', -- The metric to optimize.\r\n",
" @iterations INT=100, -- The maximum number of pipelines to train.\r\n",
" @task NVARCHAR(40)='classification', -- The type of task. Can be classification, regression or forecasting.\r\n",
" @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.\r\n",
" @iteration_timeout_minutes INT = 15, -- The maximum time in minutes for training a single pipeline. \r\n",
" @experiment_timeout_hours FLOAT = 1, -- The maximum time in hours for training all pipelines.\r\n",
" @n_cross_validations INT = 3, -- The number of cross validations.\r\n",
" @blacklist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that will not be used.\r\n",
" -- The list of possible models can be found at:\r\n",
" -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings\r\n",
" @whitelist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that can be used.\r\n",
" -- The list of possible models can be found at:\r\n",
" -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings\r\n",
" @experiment_exit_score FLOAT = 0, -- Stop the experiment if this score is acheived.\r\n",
" @sample_weight_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that gives a sample weight.\r\n",
" @is_validate_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that indicates if the row is for training or validation.\r\n",
"\t -- In the values of the column, 0 means for training and 1 means for validation.\r\n",
" @time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.\r\n",
"\t@connection_name NVARCHAR(255)='default' -- The AML connection to use.\r\n",
" ) AS\r\n",
"BEGIN\r\n",
"\r\n",
" DECLARE @tenantid NVARCHAR(255)\r\n",
" DECLARE @appid NVARCHAR(255)\r\n",
" DECLARE @password NVARCHAR(255)\r\n",
" DECLARE @config_file NVARCHAR(255)\r\n",
"\r\n",
"\tSELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile\r\n",
"\tFROM aml_connection\r\n",
"\tWHERE ConnectionName = @connection_name;\r\n",
"\r\n",
"\tEXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd\r\n",
"import logging \r\n",
"import azureml.core \r\n",
"import pandas as pd\r\n",
"import numpy as np\r\n",
"from azureml.core.experiment import Experiment \r\n",
"from azureml.train.automl import AutoMLConfig \r\n",
"from sklearn import datasets \r\n",
"import pickle\r\n",
"import codecs\r\n",
"from azureml.core.authentication import ServicePrincipalAuthentication \r\n",
"from azureml.core.workspace import Workspace \r\n",
"\r\n",
"if __name__.startswith(\"sqlindb\"):\r\n",
" auth = ServicePrincipalAuthentication(tenantid, appid, password) \r\n",
" \r\n",
" ws = Workspace.from_config(path=config_file, auth=auth) \r\n",
" \r\n",
" project_folder = \"./sample_projects/\" + experiment_name\r\n",
" \r\n",
" experiment = Experiment(ws, experiment_name) \r\n",
"\r\n",
" data_train = input_data\r\n",
" X_valid = None\r\n",
" y_valid = None\r\n",
" sample_weight_valid = None\r\n",
"\r\n",
" if is_validate_column != \"\" and is_validate_column is not None:\r\n",
" data_train = input_data[input_data[is_validate_column] <= 0]\r\n",
" data_valid = input_data[input_data[is_validate_column] > 0]\r\n",
" data_train.pop(is_validate_column)\r\n",
" data_valid.pop(is_validate_column)\r\n",
" y_valid = data_valid.pop(label_column).values\r\n",
" if sample_weight_column != \"\" and sample_weight_column is not None:\r\n",
" sample_weight_valid = data_valid.pop(sample_weight_column).values\r\n",
" X_valid = data_valid\r\n",
" n_cross_validations = None\r\n",
"\r\n",
" y_train = data_train.pop(label_column).values\r\n",
"\r\n",
" sample_weight = None\r\n",
" if sample_weight_column != \"\" and sample_weight_column is not None:\r\n",
" sample_weight = data_train.pop(sample_weight_column).values\r\n",
"\r\n",
" X_train = data_train\r\n",
"\r\n",
" if experiment_timeout_hours == 0:\r\n",
" experiment_timeout_hours = None\r\n",
"\r\n",
" if experiment_exit_score == 0:\r\n",
" experiment_exit_score = None\r\n",
"\r\n",
" if blacklist_models == \"\":\r\n",
" blacklist_models = None\r\n",
"\r\n",
" if blacklist_models is not None:\r\n",
" blacklist_models = blacklist_models.replace(\" \", \"\").split(\",\")\r\n",
"\r\n",
" if whitelist_models == \"\":\r\n",
" whitelist_models = None\r\n",
"\r\n",
" if whitelist_models is not None:\r\n",
" whitelist_models = whitelist_models.replace(\" \", \"\").split(\",\")\r\n",
"\r\n",
" automl_settings = {}\r\n",
" preprocess = True\r\n",
" if time_column_name != \"\" and time_column_name is not None:\r\n",
" automl_settings = { \"time_column_name\": time_column_name }\r\n",
" preprocess = False\r\n",
"\r\n",
" log_file_name = \"automl_errors.log\"\r\n",
"\t \r\n",
" automl_config = AutoMLConfig(task = task, \r\n",
" debug_log = log_file_name, \r\n",
" primary_metric = primary_metric, \r\n",
" iteration_timeout_minutes = iteration_timeout_minutes, \r\n",
" experiment_timeout_hours = experiment_timeout_hours,\r\n",
" iterations = iterations, \r\n",
" n_cross_validations = n_cross_validations, \r\n",
" preprocess = preprocess,\r\n",
" verbosity = logging.INFO, \r\n",
" X = X_train, \r\n",
" y = y_train, \r\n",
" path = project_folder,\r\n",
" blacklist_models = blacklist_models,\r\n",
" whitelist_models = whitelist_models,\r\n",
" experiment_exit_score = experiment_exit_score,\r\n",
" sample_weight = sample_weight,\r\n",
" X_valid = X_valid,\r\n",
" y_valid = y_valid,\r\n",
" sample_weight_valid = sample_weight_valid,\r\n",
" **automl_settings) \r\n",
" \r\n",
" local_run = experiment.submit(automl_config, show_output = True) \r\n",
"\r\n",
" best_run, fitted_model = local_run.get_output()\r\n",
"\r\n",
" pickled_model = codecs.encode(pickle.dumps(fitted_model), \"base64\").decode()\r\n",
"\r\n",
" log_file_text = \"\"\r\n",
"\r\n",
" try:\r\n",
" with open(log_file_name, \"r\") as log_file:\r\n",
" log_file_text = log_file.read()\r\n",
" except:\r\n",
" log_file_text = \"Log file not found\"\r\n",
"\r\n",
" returned_model = pd.DataFrame({\"best_run\": [best_run.id], \"experiment_name\": [experiment_name], \"fitted_model\": [pickled_model], \"log_file_text\": [log_file_text], \"workspace\": [ws.name]}, dtype=np.dtype(np.str))\r\n",
"'\r\n",
"\t, @input_data_1 = @input_query\r\n",
"\t, @input_data_1_name = N'input_data'\r\n",
"\t, @output_data_1_name = N'returned_model'\r\n",
"\t, @params = N'@label_column NVARCHAR(255), \r\n",
"\t @primary_metric NVARCHAR(40),\r\n",
"\t\t\t\t @iterations INT, @task NVARCHAR(40),\r\n",
"\t\t\t\t @experiment_name NVARCHAR(32),\r\n",
"\t\t\t\t @iteration_timeout_minutes INT,\r\n",
"\t\t\t\t @experiment_timeout_hours FLOAT,\r\n",
"\t\t\t\t @n_cross_validations INT,\r\n",
"\t\t\t\t @blacklist_models NVARCHAR(MAX),\r\n",
"\t\t\t\t @whitelist_models NVARCHAR(MAX),\r\n",
"\t\t\t\t @experiment_exit_score FLOAT,\r\n",
"\t\t\t\t @sample_weight_column NVARCHAR(255),\r\n",
"\t\t\t\t @is_validate_column NVARCHAR(255),\r\n",
"\t\t\t\t @time_column_name NVARCHAR(255),\r\n",
"\t\t\t\t @tenantid NVARCHAR(255),\r\n",
"\t\t\t\t @appid NVARCHAR(255),\r\n",
"\t\t\t\t @password NVARCHAR(255),\r\n",
"\t\t\t\t @config_file NVARCHAR(255)'\r\n",
"\t, @label_column = @label_column\r\n",
"\t, @primary_metric = @primary_metric\r\n",
"\t, @iterations = @iterations\r\n",
"\t, @task = @task\r\n",
"\t, @experiment_name = @experiment_name\r\n",
"\t, @iteration_timeout_minutes = @iteration_timeout_minutes\r\n",
"\t, @experiment_timeout_hours = @experiment_timeout_hours\r\n",
"\t, @n_cross_validations = @n_cross_validations\r\n",
"\t, @blacklist_models = @blacklist_models\r\n",
"\t, @whitelist_models = @whitelist_models\r\n",
"\t, @experiment_exit_score = @experiment_exit_score\r\n",
"\t, @sample_weight_column = @sample_weight_column\r\n",
"\t, @is_validate_column = @is_validate_column\r\n",
"\t, @time_column_name = @time_column_name\r\n",
"\t, @tenantid = @tenantid\r\n",
"\t, @appid = @appid\r\n",
"\t, @password = @password\r\n",
"\t, @config_file = @config_file\r\n",
"WITH RESULT SETS ((best_run NVARCHAR(250), experiment_name NVARCHAR(100), fitted_model VARCHAR(MAX), log_file_text NVARCHAR(MAX), workspace NVARCHAR(100)))\r\n",
"END"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This procedure returns a list of metrics for each iteration of a training run.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLGetMetrics]\r\n",
" (\r\n",
"\t@run_id NVARCHAR(250), -- The RunId\r\n",
" @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.\r\n",
" @connection_name NVARCHAR(255)='default' -- The AML connection to use.\r\n",
" ) AS\r\n",
"BEGIN\r\n",
" DECLARE @tenantid NVARCHAR(255)\r\n",
" DECLARE @appid NVARCHAR(255)\r\n",
" DECLARE @password NVARCHAR(255)\r\n",
" DECLARE @config_file NVARCHAR(255)\r\n",
"\r\n",
"\tSELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile\r\n",
"\tFROM aml_connection\r\n",
"\tWHERE ConnectionName = @connection_name;\r\n",
"\r\n",
" EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd\r\n",
"import logging \r\n",
"import azureml.core \r\n",
"import numpy as np\r\n",
"from azureml.core.experiment import Experiment \r\n",
"from azureml.train.automl.run import AutoMLRun\r\n",
"from azureml.core.authentication import ServicePrincipalAuthentication \r\n",
"from azureml.core.workspace import Workspace \r\n",
"\r\n",
"auth = ServicePrincipalAuthentication(tenantid, appid, password) \r\n",
" \r\n",
"ws = Workspace.from_config(path=config_file, auth=auth) \r\n",
" \r\n",
"experiment = Experiment(ws, experiment_name) \r\n",
"\r\n",
"ml_run = AutoMLRun(experiment = experiment, run_id = run_id)\r\n",
"\r\n",
"children = list(ml_run.get_children())\r\n",
"iterationlist = []\r\n",
"metricnamelist = []\r\n",
"metricvaluelist = []\r\n",
"\r\n",
"for run in children:\r\n",
" properties = run.get_properties()\r\n",
" if \"iteration\" in properties:\r\n",
" iteration = int(properties[\"iteration\"])\r\n",
" for metric_name, metric_value in run.get_metrics().items():\r\n",
" if isinstance(metric_value, float):\r\n",
" iterationlist.append(iteration)\r\n",
" metricnamelist.append(metric_name)\r\n",
" metricvaluelist.append(metric_value)\r\n",
" \r\n",
"metrics = pd.DataFrame({\"iteration\": iterationlist, \"metric_name\": metricnamelist, \"metric_value\": metricvaluelist})\r\n",
"'\r\n",
" , @output_data_1_name = N'metrics'\r\n",
"\t, @params = N'@run_id NVARCHAR(250), \r\n",
"\t\t\t\t @experiment_name NVARCHAR(32),\r\n",
" \t\t\t\t @tenantid NVARCHAR(255),\r\n",
"\t\t\t\t @appid NVARCHAR(255),\r\n",
"\t\t\t\t @password NVARCHAR(255),\r\n",
"\t\t\t\t @config_file NVARCHAR(255)'\r\n",
" , @run_id = @run_id\r\n",
"\t, @experiment_name = @experiment_name\r\n",
"\t, @tenantid = @tenantid\r\n",
"\t, @appid = @appid\r\n",
"\t, @password = @password\r\n",
"\t, @config_file = @config_file\r\n",
"WITH RESULT SETS ((iteration INT, metric_name NVARCHAR(100), metric_value FLOAT))\r\n",
"END"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"-- This procedure predicts values based on a model returned by AutoMLTrain and a dataset.\r\n",
"-- It returns the dataset with a new column added, which is the predicted value.\r\n",
"SET ANSI_NULLS ON\r\n",
"GO\r\n",
"SET QUOTED_IDENTIFIER ON\r\n",
"GO\r\n",
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLPredict]\r\n",
" (\r\n",
" @input_query NVARCHAR(MAX), -- A SQL query returning data to predict on.\r\n",
" @model NVARCHAR(MAX), -- A model returned from AutoMLTrain.\r\n",
" @label_column NVARCHAR(255)='' -- Optional name of the column from input_query, which should be ignored when predicting\r\n",
" ) AS \r\n",
"BEGIN \r\n",
" \r\n",
" EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd \r\n",
"import azureml.core \r\n",
"import numpy as np \r\n",
"from azureml.train.automl import AutoMLConfig \r\n",
"import pickle \r\n",
"import codecs \r\n",
" \r\n",
"model_obj = pickle.loads(codecs.decode(model.encode(), \"base64\")) \r\n",
" \r\n",
"test_data = input_data.copy() \r\n",
"\r\n",
"if label_column != \"\" and label_column is not None:\r\n",
" y_test = test_data.pop(label_column).values \r\n",
"X_test = test_data \r\n",
" \r\n",
"predicted = model_obj.predict(X_test) \r\n",
" \r\n",
"combined_output = input_data.assign(predicted=predicted)\r\n",
" \r\n",
"' \r\n",
" , @input_data_1 = @input_query \r\n",
" , @input_data_1_name = N'input_data' \r\n",
" , @output_data_1_name = N'combined_output' \r\n",
" , @params = N'@model NVARCHAR(MAX), @label_column NVARCHAR(255)' \r\n",
" , @model = @model \r\n",
"\t, @label_column = @label_column\r\n",
"END"
]
}
],
"metadata": {
"authors": [
{
"name": "jeffshep"
}
],
"category": "tutorial",
"compute": [
"None"
],
"datasets": [
"None"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"Azure ML AutoML"
],
"friendly_name": "Setup automated ML SQL integration",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.6",
"language": "sql",
"name": "python36"
},
"language_info": {
"name": "sql",
"version": ""
},
"tags": [
""
],
"task": "None"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,497 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/deployment/accelerated-models/accelerated-models-object-detection.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Azure ML Hardware Accelerated Object Detection"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This tutorial will show you how to deploy an object detection service based on the SSD-VGG model in just a few minutes using the Azure Machine Learning Accelerated AI service.\n",
"\n",
"We will use the SSD-VGG model accelerated on an FPGA. Our Accelerated Models Service handles translating deep neural networks (DNN) into an FPGA program.\n",
"\n",
"The steps in this notebook are: \n",
"1. [Setup Environment](#set-up-environment)\n",
"* [Construct Model](#construct-model)\n",
" * Image Preprocessing\n",
" * Featurizer\n",
" * Save Model\n",
" * Save input and output tensor names\n",
"* [Create Image](#create-image)\n",
"* [Deploy Image](#deploy-image)\n",
"* [Test the Service](#test-service)\n",
" * Create Client\n",
" * Serve the model\n",
"* [Cleanup](#cleanup)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"set-up-environment\"></a>\n",
"## 1. Set up Environment\n",
"### 1.a. Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import tensorflow as tf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.b. Retrieve Workspace\n",
"If you haven't created a Workspace, please follow [this notebook](\"../../../configuration.ipynb\") to do so. If you have, run the codeblock below to retrieve it. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"construct-model\"></a>\n",
"## 2. Construct model\n",
"### 2.a. Image preprocessing\n",
"We'd like our service to accept JPEG images as input. However the input to SSD-VGG is a float tensor of shape \\[1, 300, 300, 3\\]. The first dimension is batch, then height, width, and channels (i.e. NHWC). To bridge this gap, we need code that decodes JPEG images and resizes them appropriately for input to SSD-VGG. The Accelerated AI service can execute TensorFlow graphs as part of the service and we'll use that ability to do the image preprocessing. This code defines a TensorFlow graph that preprocesses an array of JPEG images (as TensorFlow strings) and produces a tensor that is ready to be featurized by SSD-VGG.\n",
"\n",
"**Note:** Expect to see TF deprecation warnings until we port our SDK over to use Tensorflow 2.0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Input images as a two-dimensional tensor containing an arbitrary number of images represented a strings\n",
"import azureml.accel.models.utils as utils\n",
"tf.reset_default_graph()\n",
"\n",
"in_images = tf.placeholder(tf.string)\n",
"image_tensors = utils.preprocess_array(in_images, output_width=300, output_height=300, preserve_aspect_ratio=False)\n",
"print(image_tensors.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.b. Featurizer\n",
"The SSD-VGG model is different from our other models in that it generates 12 tensor outputs. These corresponds to x,y displacements of the anchor boxes and the detection confidence (for 21 classes). Because these outputs are not convenient to work with, we will later use a pre-defined post-processing utility to transform the outputs into a simplified list of bounding boxes with their respective class and confidence.\n",
"\n",
"For more information about the output tensors, take this example: the output tensor 'ssd_300_vgg/block4_box/Reshape_1:0' has a shape of [None, 37, 37, 4, 21]. This gives the pre-softmax confidence for 4 anchor boxes situated at each site of a 37 x 37 grid imposed on the image, one confidence score for each of the 21 classes. The first dimension is the batch dimension. Likewise, 'ssd_300_vgg/block4_box/Reshape:0' has shape [None, 37, 37, 4, 4] and encodes the (cx, cy) center shift and rescaling (sw, sh) relative to each anchor box. Refer to the [SSD-VGG paper](https://arxiv.org/abs/1512.02325) to understand how these are computed. The other 10 tensors are defined similarly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.accel.models import SsdVgg\n",
"\n",
"saved_model_dir = os.path.join(os.path.expanduser('~'), 'models')\n",
"model_graph = SsdVgg(saved_model_dir, is_frozen = True)\n",
"\n",
"print('SSD-VGG Input Tensors:')\n",
"for idx, input_name in enumerate(model_graph.input_tensor_list):\n",
" print('{}, {}'.format(input_name, model_graph.get_input_dims(idx)))\n",
" \n",
"print('SSD-VGG Output Tensors:')\n",
"for idx, output_name in enumerate(model_graph.output_tensor_list):\n",
" print('{}, {}'.format(output_name, model_graph.get_output_dims(idx)))\n",
"\n",
"ssd_outputs = model_graph.import_graph_def(image_tensors, is_training=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.c. Save Model\n",
"Now that we loaded both parts of the tensorflow graph (preprocessor and SSD-VGG featurizer), we can save the graph and associated variables to a directory which we can register as an Azure ML Model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = \"ssdvgg\"\n",
"model_save_path = os.path.join(saved_model_dir, model_name, \"saved_model\")\n",
"print(\"Saving model in {}\".format(model_save_path))\n",
"\n",
"output_map = {}\n",
"for i, output in enumerate(ssd_outputs):\n",
" output_map['out_{}'.format(i)] = output\n",
"\n",
"with tf.Session() as sess:\n",
" model_graph.restore_weights(sess)\n",
" tf.saved_model.simple_save(sess, \n",
" model_save_path, \n",
" inputs={'images': in_images}, \n",
" outputs=output_map)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.d. Important! Save names of input and output tensors\n",
"\n",
"These input and output tensors that were created during the preprocessing and classifier steps are also going to be used when **converting the model** to an Accelerated Model that can run on FPGA's and for **making an inferencing request**. It is very important to save this information!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"input_tensors = in_images.name\n",
"# We will use the list of output tensors during inferencing\n",
"output_tensors = [output.name for output in ssd_outputs]\n",
"# However, for multiple output tensors, our AccelOnnxConverter will \n",
"# accept comma-delimited strings (lists will cause error)\n",
"output_tensors_str = \",\".join(output_tensors)\n",
"\n",
"print(input_tensors)\n",
"print(output_tensors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"create-image\"></a>\n",
"## 3. Create AccelContainerImage\n",
"Below we will execute all the same steps as in the [Quickstart](./accelerated-models-quickstart.ipynb#create-image) to package the model we have saved locally into an accelerated Docker image saved in our workspace. To complete all the steps, it may take a few minutes. For more details on each step, check out the [Quickstart section on model registration](./accelerated-models-quickstart.ipynb#register-model)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"from azureml.core.model import Model\n",
"from azureml.core.image import Image\n",
"from azureml.accel import AccelOnnxConverter\n",
"from azureml.accel import AccelContainerImage\n",
"\n",
"# Retrieve workspace\n",
"ws = Workspace.from_config()\n",
"print(\"Successfully retrieved workspace:\", ws.name, ws.resource_group, ws.location, ws.subscription_id, '\\n')\n",
"\n",
"# Register model\n",
"registered_model = Model.register(workspace = ws,\n",
" model_path = model_save_path,\n",
" model_name = model_name)\n",
"print(\"Successfully registered: \", registered_model.name, registered_model.description, registered_model.version, '\\n', sep = '\\t')\n",
"\n",
"# Convert model\n",
"convert_request = AccelOnnxConverter.convert_tf_model(ws, registered_model, input_tensors, output_tensors_str)\n",
"if convert_request.wait_for_completion(show_output = False):\n",
" # If the above call succeeded, get the converted model\n",
" converted_model = convert_request.result\n",
" print(\"\\nSuccessfully converted: \", converted_model.name, converted_model.url, converted_model.version, \n",
" converted_model.id, converted_model.created_time, '\\n')\n",
"else:\n",
" print(\"Model conversion failed. Showing output.\")\n",
" convert_request.wait_for_completion(show_output = True)\n",
"\n",
"# Package into AccelContainerImage\n",
"image_config = AccelContainerImage.image_configuration()\n",
"# Image name must be lowercase\n",
"image_name = \"{}-image\".format(model_name)\n",
"image = Image.create(name = image_name,\n",
" models = [converted_model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"image.wait_for_creation()\n",
"print(\"Created AccelContainerImage: {} {} {}\\n\".format(image.name, image.creation_state, image.image_location))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"deploy-image\"></a>\n",
"## 4. Deploy image\n",
"Once you have an Azure ML Accelerated Image in your Workspace, you can deploy it to two destinations, to a Databox Edge machine or to an AKS cluster. \n",
"\n",
"### 4.a. Deploy to Databox Edge Machine using IoT Hub\n",
"See the sample [here](https://github.com/Azure-Samples/aml-real-time-ai/) for using the Azure IoT CLI extension for deploying your Docker image to your Databox Edge Machine.\n",
"\n",
"### 4.b. Deploy to AKS Cluster\n",
"Same as in the [Quickstart section on image deployment](./accelerated-models-quickstart.ipynb#deploy-image), we are going to create an AKS cluster with FPGA-enabled machines, then deploy our service to it.\n",
"#### Create AKS ComputeTarget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"\n",
"# Uses the specific FPGA enabled VM (sku: Standard_PB6s)\n",
"# Standard_PB6s are available in: eastus, westus2, westeurope, southeastasia\n",
"prov_config = AksCompute.provisioning_configuration(vm_size = \"Standard_PB6s\",\n",
" agent_count = 1, \n",
" location = \"eastus\")\n",
"\n",
"aks_name = 'aks-pb6-obj'\n",
"# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace = ws, \n",
" name = aks_name, \n",
" provisioning_configuration = prov_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Provisioning an AKS cluster might take awhile (15 or so minutes), and we want to wait until it's successfully provisioned before we can deploy a service to it. If you interrupt this cell, provisioning of the cluster will continue. You can re-run it or check the status in your Workspace under Compute."
]
},
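{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the cluster was already provisioned by an earlier run of this notebook, you don't need to create it again. The optional sketch below retrieves an existing AKS compute target by name; it assumes the `ws` and `aks_name` values defined in the cells above and that a cluster with that name already exists in the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: attach to an already-provisioned AKS cluster instead of creating a new one.\n",
"# Assumes a compute target named aks_name already exists in the workspace ws.\n",
"from azureml.core.compute import AksCompute\n",
"\n",
"aks_target = AksCompute(workspace=ws, name=aks_name)\n",
"print(aks_target.provisioning_state)"
]
},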
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.wait_for_completion(show_output = True)\n",
"print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Deploy AccelContainerImage to AKS ComputeTarget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"\n",
"# Set the web service configuration (for creating a test service, we don't want autoscale enabled)\n",
"# Authentication is enabled by default, but for testing we specify False\n",
"aks_config = AksWebservice.deploy_configuration(autoscale_enabled=False,\n",
" num_replicas=1,\n",
" auth_enabled = False)\n",
"\n",
"aks_service_name ='my-aks-service-3'\n",
"\n",
"aks_service = Webservice.deploy_from_image(workspace = ws,\n",
" name = aks_service_name,\n",
" image = image,\n",
" deployment_config = aks_config,\n",
" deployment_target = aks_target)\n",
"aks_service.wait_for_deployment(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"test-service\"></a>\n",
"## 5. Test the service\n",
"<a id=\"create-client\"></a>\n",
"### 5.a. Create Client\n",
"The image supports gRPC and the TensorFlow Serving \"predict\" API. We will create a PredictionClient from the Webservice object that can call into the docker image to get predictions. If you do not have the Webservice object, you can also create [PredictionClient](https://docs.microsoft.com/en-us/python/api/azureml-accel-models/azureml.accel.predictionclient?view=azure-ml-py) directly.\n",
"\n",
"**Note:** If you chose to use auth_enabled=True when creating your AksWebservice.deploy_configuration(), see documentation [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice(class)?view=azure-ml-py#get-keys--) on how to retrieve your keys and use either key as an argument to PredictionClient(...,access_token=key).\n",
"**WARNING:** If you are running on Azure Notebooks free compute, you will not be able to make outgoing calls to your service. Try locating your client on a different machine to consume it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Using the grpc client in AzureML Accelerated Models SDK\n",
"from azureml.accel import client_from_service\n",
"\n",
"# Initialize AzureML Accelerated Models client\n",
"client = client_from_service(aks_service)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can adapt the client [code](https://github.com/Azure/aml-real-time-ai/blob/master/pythonlib/amlrealtimeai/client.py) to meet your needs. There is also an example C# [client](https://github.com/Azure/aml-real-time-ai/blob/master/sample-clients/csharp).\n",
"\n",
"The service provides an API that is compatible with TensorFlow Serving. There are instructions to download a sample client [here](https://www.tensorflow.org/serving/setup)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"serve-model\"></a>\n",
"### 5.b. Serve the model\n",
"The SSD-VGG model returns the confidence and bounding boxes for all possible anchor boxes. As mentioned earlier, we will use a post-processing routine to transform this into a list of bounding boxes (y1, x1, y2, x2) where x, y are fractional coordinates measured from left and top respectively. A respective list of classes and scores is also returned to tag each bounding box. Below we make use of this information to draw the bounding boxes on top the original image. Note that in the post-processing routine we select a confidence threshold of 0.5."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"from matplotlib import pyplot as plt\n",
"\n",
"colors_tableau = [(255, 255, 255), (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),\n",
" (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),\n",
" (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),\n",
" (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),\n",
" (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]\n",
"\n",
"\n",
"def draw_boxes_on_img(img, classes, scores, bboxes, thickness=2):\n",
" shape = img.shape\n",
" for i in range(bboxes.shape[0]):\n",
" bbox = bboxes[i]\n",
" color = colors_tableau[classes[i]]\n",
" # Draw bounding box...\n",
" p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))\n",
" p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))\n",
" cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness)\n",
" # Draw text...\n",
" s = '%s/%.3f' % (classes[i], scores[i])\n",
" p1 = (p1[0]-5, p1[1])\n",
" cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.accel._external.ssdvgg_utils as ssdvgg_utils\n",
"\n",
"result = client.score_file(path=\"meeting.jpg\", input_name=input_tensors, outputs=output_tensors)\n",
"classes, scores, bboxes = ssdvgg_utils.postprocess(result, select_threshold=0.5)\n",
"\n",
"img = cv2.imread('meeting.jpg', 1)\n",
"img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
"draw_boxes_on_img(img, classes, scores, bboxes)\n",
"plt.imshow(img)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"cleanup\"></a>\n",
"## 6. Cleanup\n",
"It's important to clean up your resources, so that you won't incur unnecessary costs. In the [next notebook](./accelerated-models-training.ipynb) you will learn how to train a classfier on a new dataset using transfer learning."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service.delete()\n",
"aks_target.delete()\n",
"image.delete()\n",
"registered_model.delete()\n",
"converted_model.delete()"
]
}
],
"metadata": {
"authors": [
{
"name": "coverste"
},
{
"name": "paledger"
},
{
"name": "sukha"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,7 +0,0 @@
name: accelerated-models-object-detection
dependencies:
- pip:
- azureml-sdk
- azureml-accel-models[cpu]
- opencv-python
- matplotlib

View File

@@ -1,555 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/deployment/accelerated-models/accelerated-models-quickstart.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Azure ML Hardware Accelerated Models Quickstart"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This tutorial will show you how to deploy an image recognition service based on the ResNet 50 classifier using the Azure Machine Learning Accelerated Models service. Get more information about our service from our [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-accelerate-with-fpgas), [API reference](https://docs.microsoft.com/en-us/python/api/azureml-accel-models/azureml.accel?view=azure-ml-py), or [forum](https://aka.ms/aml-forum).\n",
"\n",
"We will use an accelerated ResNet50 featurizer running on an FPGA. Our Accelerated Models Service handles translating deep neural networks (DNN) into an FPGA program.\n",
"\n",
"For more information about using other models besides Resnet50, see the [README](./README.md).\n",
"\n",
"The steps covered in this notebook are: \n",
"1. [Set up environment](#set-up-environment)\n",
"* [Construct model](#construct-model)\n",
" * Image Preprocessing\n",
" * Featurizer (Resnet50)\n",
" * Classifier\n",
" * Save Model\n",
"* [Register Model](#register-model)\n",
"* [Convert into Accelerated Model](#convert-model)\n",
"* [Create Image](#create-image)\n",
"* [Deploy](#deploy-image)\n",
"* [Test service](#test-service)\n",
"* [Clean-up](#clean-up)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"set-up-environment\"></a>\n",
"## 1. Set up environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import tensorflow as tf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrieve Workspace\n",
"If you haven't created a Workspace, please follow [this notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) to do so. If you have, run the codeblock below to retrieve it. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"construct-model\"></a>\n",
"## 2. Construct model\n",
"\n",
"There are three parts to the model we are deploying: pre-processing, featurizer with ResNet50, and classifier with ImageNet dataset. Then we will save this complete Tensorflow model graph locally before registering it to your Azure ML Workspace.\n",
"\n",
"### 2.a. Image preprocessing\n",
"We'd like our service to accept JPEG images as input. However the input to ResNet50 is a tensor. So we need code that decodes JPEG images and does the preprocessing required by ResNet50. The Accelerated AI service can execute TensorFlow graphs as part of the service and we'll use that ability to do the image preprocessing. This code defines a TensorFlow graph that preprocesses an array of JPEG images (as strings) and produces a tensor that is ready to be featurized by ResNet50.\n",
"\n",
"**Note:** Expect to see TF deprecation warnings until we port our SDK over to use Tensorflow 2.0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Input images as a two-dimensional tensor containing an arbitrary number of images represented a strings\n",
"import azureml.accel.models.utils as utils\n",
"tf.reset_default_graph()\n",
"\n",
"in_images = tf.placeholder(tf.string)\n",
"image_tensors = utils.preprocess_array(in_images)\n",
"print(image_tensors.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.b. Featurizer\n",
"We use ResNet50 as a featurizer. In this step we initialize the model. This downloads a TensorFlow checkpoint of the quantized ResNet50."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.accel.models import QuantizedResnet50\n",
"save_path = os.path.expanduser('~/models')\n",
"model_graph = QuantizedResnet50(save_path, is_frozen = True)\n",
"feature_tensor = model_graph.import_graph_def(image_tensors)\n",
"print(model_graph.version)\n",
"print(feature_tensor.name)\n",
"print(feature_tensor.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.c. Classifier\n",
"The model we downloaded includes a classifier which takes the output of the ResNet50 and identifies an image. This classifier is trained on the ImageNet dataset. We are going to use this classifier for our service. The next [notebook](./accelerated-models-training.ipynb) shows how to train a classifier for a different data set. The input to the classifier is a tensor matching the output of our ResNet50 featurizer."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"classifier_output = model_graph.get_default_classifier(feature_tensor)\n",
"print(classifier_output)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.d. Save Model\n",
"Now that we loaded all three parts of the tensorflow graph (preprocessor, resnet50 featurizer, and the classifier), we can save the graph and associated variables to a directory which we can register as an Azure ML Model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# model_name must be lowercase\n",
"model_name = \"resnet50\"\n",
"model_save_path = os.path.join(save_path, model_name)\n",
"print(\"Saving model in {}\".format(model_save_path))\n",
"\n",
"with tf.Session() as sess:\n",
" model_graph.restore_weights(sess)\n",
" tf.saved_model.simple_save(sess, model_save_path,\n",
" inputs={'images': in_images},\n",
" outputs={'output_alias': classifier_output})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.e. Important! Save names of input and output tensors\n",
"\n",
"These input and output tensors that were created during the preprocessing and classifier steps are also going to be used when **converting the model** to an Accelerated Model that can run on FPGA's and for **making an inferencing request**. It is very important to save this information! You can see our defaults for all the models in the [README](./README.md).\n",
"\n",
"By default for Resnet50, these are the values you should see when running the cell below: \n",
"* input_tensors = \"Placeholder:0\"\n",
"* output_tensors = \"classifier/resnet_v1_50/predictions/Softmax:0\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"input_tensors = in_images.name\n",
"output_tensors = classifier_output.name\n",
"\n",
"print(input_tensors)\n",
"print(output_tensors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"register-model\"></a>\n",
"## 3. Register Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can add tags and descriptions to your models. Using tags, you can track useful information such as the name and version of the machine learning library used to train the model. Note that tags must be alphanumeric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"from azureml.core.model import Model\n",
"\n",
"registered_model = Model.register(workspace = ws,\n",
" model_path = model_save_path,\n",
" model_name = model_name)\n",
"\n",
"print(\"Successfully registered: \", registered_model.name, registered_model.description, registered_model.version, sep = '\\t')"
]
},
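{
"cell_type": "markdown",
"metadata": {},
"source": [
"The call above uses only the required arguments. As noted earlier, `Model.register` also accepts optional `tags` and `description` arguments. The sketch below is purely illustrative - the tag keys and values shown are made-up examples, not defaults from this tutorial."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: the same registration with optional tags and a description.\n",
"# The tag names and values here are hypothetical examples.\n",
"registered_model = Model.register(workspace = ws,\n",
"                                  model_path = model_save_path,\n",
"                                  model_name = model_name,\n",
"                                  tags = {'framework': 'tensorflow', 'featurizer': 'resnet50'},\n",
"                                  description = 'Quantized ResNet50 featurizer with ImageNet classifier')\n",
"\n",
"print(registered_model.name, registered_model.tags, registered_model.description, sep = '\\t')"
]
},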
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"convert-model\"></a>\n",
"## 4. Convert Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For conversion you need to provide names of input and output tensors. This information can be found from the model_graph you saved in step 2.e. above.\n",
"\n",
"**Note**: Conversion may take a while and on average for FPGA model it is about 1-3 minutes and it depends on model type."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"register model from file"
]
},
"outputs": [],
"source": [
"from azureml.accel import AccelOnnxConverter\n",
"\n",
"convert_request = AccelOnnxConverter.convert_tf_model(ws, registered_model, input_tensors, output_tensors)\n",
"\n",
"if convert_request.wait_for_completion(show_output = False):\n",
" # If the above call succeeded, get the converted model\n",
" converted_model = convert_request.result\n",
" print(\"\\nSuccessfully converted: \", converted_model.name, converted_model.url, converted_model.version, \n",
" converted_model.id, converted_model.created_time, '\\n')\n",
"else:\n",
" print(\"Model conversion failed. Showing output.\")\n",
" convert_request.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"create-image\"></a>\n",
"## 5. Package the model into an Image"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can add tags and descriptions to image. Also, for FPGA model an image can only contain **single** model.\n",
"\n",
"**Note**: The following command can take few minutes. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import Image\n",
"from azureml.accel import AccelContainerImage\n",
"\n",
"image_config = AccelContainerImage.image_configuration()\n",
"# Image name must be lowercase\n",
"image_name = \"{}-image\".format(model_name)\n",
"\n",
"image = Image.create(name = image_name,\n",
" models = [converted_model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"image.wait_for_creation(show_output = False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"deploy-image\"></a>\n",
"## 6. Deploy\n",
"Once you have an Azure ML Accelerated Image in your Workspace, you can deploy it to two destinations, to a Databox Edge machine or to an AKS cluster. \n",
"\n",
"### 6.a. Databox Edge Machine using IoT Hub\n",
"See the sample [here](https://github.com/Azure-Samples/aml-real-time-ai/) for using the Azure IoT CLI extension for deploying your Docker image to your Databox Edge Machine.\n",
"\n",
"### 6.b. Azure Kubernetes Service (AKS) using Azure ML Service\n",
"We are going to create an AKS cluster with FPGA-enabled machines, then deploy our service to it. For more information, see [AKS official docs](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aks).\n",
"\n",
"#### Create AKS ComputeTarget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"sample-akscompute-provision"
]
},
"outputs": [],
"source": [
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"\n",
"# Uses the specific FPGA enabled VM (sku: Standard_PB6s)\n",
"# Standard_PB6s are available in: eastus, westus2, westeurope, southeastasia\n",
"prov_config = AksCompute.provisioning_configuration(vm_size = \"Standard_PB6s\",\n",
" agent_count = 1, \n",
" location = \"eastus\")\n",
"\n",
"aks_name = 'my-aks-pb6'\n",
"# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace = ws, \n",
" name = aks_name, \n",
" provisioning_configuration = prov_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Provisioning an AKS cluster might take awhile (15 or so minutes), and we want to wait until it's successfully provisioned before we can deploy a service to it. If you interrupt this cell, provisioning of the cluster will continue. You can also check the status in your Workspace under Compute."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.wait_for_completion(show_output = True)\n",
"print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Deploy AccelContainerImage to AKS ComputeTarget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"\n",
"# Set the web service configuration (for creating a test service, we don't want autoscale enabled)\n",
"# Authentication is enabled by default, but for testing we specify False\n",
"aks_config = AksWebservice.deploy_configuration(autoscale_enabled=False,\n",
" num_replicas=1,\n",
" auth_enabled = False)\n",
"\n",
"aks_service_name ='my-aks-service-1'\n",
"\n",
"aks_service = Webservice.deploy_from_image(workspace = ws,\n",
" name = aks_service_name,\n",
" image = image,\n",
" deployment_config = aks_config,\n",
" deployment_target = aks_target)\n",
"aks_service.wait_for_deployment(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"test-service\"></a>\n",
"## 7. Test the service"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.a. Create Client\n",
"The image supports gRPC and the TensorFlow Serving \"predict\" API. We will create a PredictionClient from the Webservice object that can call into the docker image to get predictions. If you do not have the Webservice object, you can also create [PredictionClient](https://docs.microsoft.com/en-us/python/api/azureml-accel-models/azureml.accel.predictionclient?view=azure-ml-py) directly.\n",
"\n",
"**Note:** If you chose to use auth_enabled=True when creating your AksWebservice, see documentation [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice(class)?view=azure-ml-py#get-keys--) on how to retrieve your keys and use either key as an argument to PredictionClient(...,access_token=key).\n",
"**WARNING:** If you are running on Azure Notebooks free compute, you will not be able to make outgoing calls to your service. Try locating your client on a different machine to consume it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Using the grpc client in AzureML Accelerated Models SDK\n",
"from azureml.accel import client_from_service\n",
"\n",
"# Initialize AzureML Accelerated Models client\n",
"client = client_from_service(aks_service)"
]
},
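{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you don't have the `aks_service` object available (for example, when calling the service from another machine), you can construct the client directly from the scoring address. The sketch below is a rough illustration based on the parameters mentioned above - the exact constructor arguments (address, port, use_ssl, access_token) should be checked against the PredictionClient reference linked earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: build a PredictionClient without going through client_from_service.\n",
"# The parameter names below are assumptions - verify them against the PredictionClient API reference.\n",
"from urllib.parse import urlparse\n",
"from azureml.accel import PredictionClient\n",
"\n",
"parsed = urlparse(aks_service.scoring_uri)\n",
"# If you created the service with auth_enabled=True, retrieve a key and pass it as access_token:\n",
"# primary_key, secondary_key = aks_service.get_keys()\n",
"client = PredictionClient(address=parsed.hostname,\n",
"                          port=parsed.port or 80,\n",
"                          use_ssl=(parsed.scheme == 'https'))"
]
},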
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can adapt the client [code](https://github.com/Azure/aml-real-time-ai/blob/master/pythonlib/amlrealtimeai/client.py) to meet your needs. There is also an example C# [client](https://github.com/Azure/aml-real-time-ai/blob/master/sample-clients/csharp).\n",
"\n",
"The service provides an API that is compatible with TensorFlow Serving. There are instructions to download a sample client [here](https://www.tensorflow.org/serving/setup)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.b. Serve the model\n",
"To understand the results we need a mapping to the human readable imagenet classes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"classes_entries = requests.get(\"https://raw.githubusercontent.com/Lasagne/Recipes/master/examples/resnet50/imagenet_classes.txt\").text.splitlines()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Score image with input and output tensor names\n",
"results = client.score_file(path=\"./snowleopardgaze.jpg\", \n",
" input_name=input_tensors, \n",
" outputs=output_tensors)\n",
"\n",
"# map results [class_id] => [confidence]\n",
"results = enumerate(results)\n",
"# sort results by confidence\n",
"sorted_results = sorted(results, key=lambda x: x[1], reverse=True)\n",
"# print top 5 results\n",
"for top in sorted_results[:5]:\n",
" print(classes_entries[top[0]], 'confidence:', top[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"clean-up\"></a>\n",
"## 8. Clean-up\n",
"Run the cell below to delete your webservice, image, and model (must be done in that order). In the [next notebook](./accelerated-models-training.ipynb) you will learn how to train a classfier on a new dataset using transfer learning and finetune the weights."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service.delete()\n",
"aks_target.delete()\n",
"image.delete()\n",
"registered_model.delete()\n",
"converted_model.delete()"
]
}
],
"metadata": {
"authors": [
{
"name": "coverste"
},
{
"name": "paledger"
},
{
"name": "aibhalla"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,5 +0,0 @@
name: accelerated-models-quickstart
dependencies:
- pip:
- azureml-sdk
- azureml-accel-models[cpu]

View File

@@ -1,870 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/deployment/accelerated-models/accelerated-models-training.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training with the Azure Machine Learning Accelerated Models Service"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook will introduce how to apply common machine learning techniques, like transfer learning, custom weights, and unquantized vs. quantized models, when working with our Azure Machine Learning Accelerated Models Service (Azure ML Accel Models).\n",
"\n",
"We will use Tensorflow for the preprocessing steps, ResNet50 for the featurizer, and the Keras API (built on Tensorflow backend) to build the classifier layers instead of the default ImageNet classifier used in Quickstart. Then we will train the model, evaluate it, and deploy it to run on an FPGA.\n",
"\n",
"#### Transfer Learning and Custom weights\n",
"We will walk you through two ways to build and train a ResNet50 model on the Kaggle Cats and Dogs dataset: transfer learning only and then transfer learning with custom weights.\n",
"\n",
"In using transfer learning, our goal is to re-purpose the ResNet50 model already trained on the [ImageNet image dataset](http://www.image-net.org/) as a basis for our training of the Kaggle Cats and Dogs dataset. The ResNet50 featurizer will be imported as frozen, so only the Keras classifier will be trained.\n",
"\n",
"With the addition of custom weights, we will build the model so that the ResNet50 featurizer weights as not frozen. This will let us retrain starting with custom weights trained with ImageNet on ResNet50 and then use the Kaggle Cats and Dogs dataset to retrain and fine-tune the quantized version of the model.\n",
"\n",
"#### Unquantized vs. Quantized models\n",
"The unquantized version of our models (ie. Resnet50, Resnet152, Densenet121, Vgg16, SsdVgg) uses native float precision (32-bit floats), which will be faster at training. We will use this for our first run through, then fine-tune the weights with the quantized version. The quantized version of our models (i.e. QuantizedResnet50, QuantizedResnet152, QuantizedDensenet121, QuantizedVgg16, QuantizedSsdVgg) will have the same node names as the unquantized version, but use quantized operations and will match the performance of the model when running on an FPGA.\n",
"\n",
"#### Contents\n",
"1. [Setup Environment](#setup)\n",
"* [Prepare Data](#prepare-data)\n",
"* [Construct Model](#construct-model)\n",
" * Preprocessor\n",
" * Classifier\n",
" * Model construction\n",
"* [Train Model](#train-model)\n",
"* [Test Model](#test-model)\n",
"* [Execution](#execution)\n",
" * [Transfer Learning](#transfer-learning)\n",
" * [Transfer Learning with Custom Weights](#custom-weights)\n",
"* [Create Image](#create-image)\n",
"* [Deploy Image](#deploy-image)\n",
"* [Test the service](#test-service)\n",
"* [Clean-up](#cleanup)\n",
"* [Appendix](#appendix)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"setup\"></a>\n",
"## 1. Setup Environment\n",
"#### 1.a. Please set up your environment as described in the [Quickstart](./accelerated-models-quickstart.ipynb), meaning:\n",
"* Make sure your Workspace config.json exists and has the correct info\n",
"* Install Tensorflow\n",
"\n",
"#### 1.b. Download dataset into ~/catsanddogs \n",
"The dataset we will be using for training can be downloaded [here](https://www.microsoft.com/en-us/download/details.aspx?id=54765). Download the zip and extract to a directory named 'catsanddogs' under your user directory (\"~/catsanddogs\"). \n",
"\n"
]
},
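{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you have downloaded the zip but not extracted it yet, the optional sketch below extracts it into `~/catsanddogs`. The location and file name of the downloaded archive are assumptions - adjust `zip_path` to wherever you saved the download."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import zipfile\n",
"\n",
"# Assumed download location - change this to match where you saved the zip\n",
"zip_path = os.path.expanduser(\"~/Downloads/kagglecatsanddogs.zip\")\n",
"datadir = os.path.expanduser(\"~/catsanddogs\")\n",
"\n",
"if os.path.exists(zip_path) and not os.path.isdir(os.path.join(datadir, \"PetImages\")):\n",
"    os.makedirs(datadir, exist_ok=True)\n",
"    with zipfile.ZipFile(zip_path) as zf:\n",
"        zf.extractall(datadir)\n",
"    print(\"Extracted dataset to\", datadir)\n",
"else:\n",
"    print(\"Nothing to extract - check that the zip exists or that the dataset is already in place\")"
]
},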
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.c. Import packages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import tensorflow as tf\n",
"import numpy as np\n",
"from keras import backend as K\n",
"import sklearn\n",
"import tqdm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.d. Create directories for later use\n",
"After you train your model in float32, you'll write the weights to a place on disk. We also need a location to store the models that get downloaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"custom_weights_dir = os.path.expanduser(\"~/custom-weights\")\n",
"saved_model_dir = os.path.expanduser(\"~/models\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"prepare-data\"></a>\n",
"## 2. Prepare Data\n",
"Load the files we are going to use for training and testing. By default this notebook uses only a very small subset of the Cats and Dogs dataset. That makes it run relatively quickly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import imghdr\n",
"datadir = os.path.expanduser(\"~/catsanddogs\")\n",
"\n",
"cat_files = glob.glob(os.path.join(datadir, 'PetImages', 'Cat', '*.jpg'))\n",
"dog_files = glob.glob(os.path.join(datadir, 'PetImages', 'Dog', '*.jpg'))\n",
"\n",
"# Limit the data set to make the notebook execute quickly.\n",
"cat_files = cat_files[:64]\n",
"dog_files = dog_files[:64]\n",
"\n",
"# The data set has a few images that are not jpeg. Remove them.\n",
"cat_files = [f for f in cat_files if imghdr.what(f) == 'jpeg']\n",
"dog_files = [f for f in dog_files if imghdr.what(f) == 'jpeg']\n",
"\n",
"if(not len(cat_files) or not len(dog_files)):\n",
" print(\"Please download the Kaggle Cats and Dogs dataset form https://www.microsoft.com/en-us/download/details.aspx?id=54765 and extract the zip to \" + datadir) \n",
" raise ValueError(\"Data not found\")\n",
"else:\n",
" print(cat_files[0])\n",
" print(dog_files[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Construct a numpy array as labels\n",
"image_paths = cat_files + dog_files\n",
"total_files = len(cat_files) + len(dog_files)\n",
"labels = np.zeros(total_files)\n",
"labels[len(cat_files):] = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split images data as training data and test data\n",
"from sklearn.model_selection import train_test_split\n",
"onehot_labels = np.array([[0,1] if i else [1,0] for i in labels])\n",
"img_train, img_test, label_train, label_test = train_test_split(image_paths, onehot_labels, random_state=42, shuffle=True)\n",
"\n",
"print(len(img_train), len(img_test), label_train.shape, label_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"construct-model\"></a>\n",
"## 3. Construct Model\n",
"We will define the functions to handle creating the preprocessor and the classifier first, and then run them together to actually construct the model with the Resnet50 featurizer in a single Tensorflow session in a separate cell.\n",
"\n",
"We use ResNet50 for the featurizer and build our own classifier using Keras layers. We train the featurizer and the classifier as one model. We will provide parameters to determine whether we are using the quantized version and whether we are using custom weights in training or not."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.a. Define image preprocessing step\n",
"Same as in the Quickstart, before passing image dataset to the ResNet50 featurizer, we need to preprocess the input file to get it into the form expected by ResNet50. ResNet50 expects float tensors representing the images in BGR, channel last order. We've provided a default implementation of the preprocessing that you can use.\n",
"\n",
"**Note:** Expect to see TF deprecation warnings until we port our SDK over to use Tensorflow 2.0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.accel.models.utils as utils\n",
"\n",
"def preprocess_images(scaling_factor=1.0):\n",
" # Convert images to 3D tensors [width,height,channel] - channels are in BGR order.\n",
" in_images = tf.placeholder(tf.string)\n",
" image_tensors = utils.preprocess_array(in_images, 'RGB', scaling_factor)\n",
" return in_images, image_tensors"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.b. Define classifier\n",
"We use Keras layer APIs to construct the classifier. Because we're using the tensorflow backend, we can train this classifier in one session with our Resnet50 model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def construct_classifier(in_tensor, seed=None):\n",
" from keras.layers import Dropout, Dense, Flatten\n",
" from keras.initializers import glorot_uniform\n",
" K.set_session(tf.get_default_session())\n",
"\n",
" FC_SIZE = 1024\n",
" NUM_CLASSES = 2\n",
"\n",
" x = Dropout(0.2, input_shape=(1, 1, int(in_tensor.shape[3]),), seed=seed)(in_tensor)\n",
" x = Dense(FC_SIZE, activation='relu', input_dim=(1, 1, int(in_tensor.shape[3]),),\n",
" kernel_initializer=glorot_uniform(seed=seed), bias_initializer='zeros')(x)\n",
" x = Flatten()(x)\n",
" preds = Dense(NUM_CLASSES, activation='softmax', input_dim=FC_SIZE, name='classifier_output',\n",
" kernel_initializer=glorot_uniform(seed=seed), bias_initializer='zeros')(x)\n",
" return preds"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.c. Define model construction\n",
"Now that the preprocessor and classifier for the model are defined, we can define how we want to construct the model. \n",
"\n",
"Constructing the model has these steps: \n",
"1. Get preprocessing steps\n",
"* Get featurizer using the Azure ML Accel Models SDK:\n",
" * import the graph definition\n",
" * restore the weights of the model into a Tensorflow session\n",
"* Get classifier\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def construct_model(quantized, starting_weights_directory = None):\n",
" from azureml.accel.models import Resnet50, QuantizedResnet50\n",
" \n",
" # Convert images to 3D tensors [width,height,channel]\n",
" in_images, image_tensors = preprocess_images(1.0)\n",
"\n",
" # Construct featurizer using quantized or unquantized ResNet50 model\n",
" if not quantized:\n",
" featurizer = Resnet50(saved_model_dir)\n",
" else:\n",
" featurizer = QuantizedResnet50(saved_model_dir, custom_weights_directory = starting_weights_directory)\n",
"\n",
" features = featurizer.import_graph_def(input_tensor=image_tensors)\n",
" \n",
" # Construct classifier\n",
" preds = construct_classifier(features)\n",
" \n",
" # Initialize weights\n",
" sess = tf.get_default_session()\n",
" tf.global_variables_initializer().run()\n",
"\n",
" featurizer.restore_weights(sess)\n",
"\n",
" return in_images, image_tensors, features, preds, featurizer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"train-model\"></a>\n",
"## 4. Train Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def read_files(files):\n",
" \"\"\" Read files to array\"\"\"\n",
" contents = []\n",
" for path in files:\n",
" with open(path, 'rb') as f:\n",
" contents.append(f.read())\n",
" return contents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_model(preds, in_images, img_train, label_train, is_retrain = False, train_epoch = 10, learning_rate=None):\n",
" \"\"\" training model \"\"\"\n",
" from keras.objectives import binary_crossentropy\n",
" from tqdm import tqdm\n",
" \n",
" learning_rate = learning_rate if learning_rate else 0.001 if is_retrain else 0.01\n",
" \n",
" # Specify the loss function\n",
" in_labels = tf.placeholder(tf.float32, shape=(None, 2)) \n",
" cross_entropy = tf.reduce_mean(binary_crossentropy(in_labels, preds))\n",
" optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)\n",
"\n",
" def chunks(a, b, n):\n",
" \"\"\"Yield successive n-sized chunks from a and b.\"\"\"\n",
" if (len(a) != len(b)):\n",
" print(\"a and b are not equal in chunks(a,b,n)\")\n",
" raise ValueError(\"Parameter error\")\n",
"\n",
" for i in range(0, len(a), n):\n",
" yield a[i:i + n], b[i:i + n]\n",
"\n",
" chunk_size = 16\n",
" chunk_num = len(label_train) / chunk_size\n",
"\n",
" sess = tf.get_default_session()\n",
" for epoch in range(train_epoch):\n",
" avg_loss = 0\n",
" for img_chunk, label_chunk in tqdm(chunks(img_train, label_train, chunk_size)):\n",
" contents = read_files(img_chunk)\n",
" _, loss = sess.run([optimizer, cross_entropy],\n",
" feed_dict={in_images: contents,\n",
" in_labels: label_chunk,\n",
" K.learning_phase(): 1})\n",
" avg_loss += loss / chunk_num\n",
" print(\"Epoch:\", (epoch + 1), \"loss = \", \"{:.3f}\".format(avg_loss))\n",
" \n",
" # Reach desired performance\n",
" if (avg_loss < 0.001):\n",
" break"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"test-model\"></a>\n",
"## 5. Test Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def test_model(preds, in_images, img_test, label_test):\n",
" \"\"\"Test the model\"\"\"\n",
" from keras.metrics import categorical_accuracy\n",
"\n",
" in_labels = tf.placeholder(tf.float32, shape=(None, 2))\n",
" accuracy = tf.reduce_mean(categorical_accuracy(in_labels, preds))\n",
" contents = read_files(img_test)\n",
"\n",
" accuracy = accuracy.eval(feed_dict={in_images: contents,\n",
" in_labels: label_test,\n",
" K.learning_phase(): 0})\n",
" return accuracy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"execution\"></a>\n",
"## 6. Execute steps\n",
"You can run through the Transfer Learning section, then skip to Create AccelContainerImage. By default, because the custom weights section takes much longer for training twice, it is not saved as executable cells. You can copy the code or change cell type to 'Code'.\n",
"\n",
"<a id=\"transfer-learning\"></a>\n",
"### 6.a. Training using Transfer Learning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# Launch the training\n",
"tf.reset_default_graph()\n",
"sess = tf.Session(graph=tf.get_default_graph())\n",
"\n",
"with sess.as_default():\n",
" in_images, image_tensors, features, preds, featurizer = construct_model(quantized=True)\n",
" train_model(preds, in_images, img_train, label_train, is_retrain=False, train_epoch=10, learning_rate=0.01) \n",
" accuracy = test_model(preds, in_images, img_test, label_test) \n",
" print(\"Accuracy:\", accuracy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Save Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = 'resnet50-catsanddogs-tl'\n",
"model_save_path = os.path.join(saved_model_dir, model_name)\n",
"\n",
"tf.saved_model.simple_save(sess, model_save_path,\n",
" inputs={'images': in_images},\n",
" outputs={'output_alias': preds})\n",
"\n",
"input_tensors = in_images.name\n",
"output_tensors = preds.name\n",
"\n",
"print(input_tensors)\n",
"print(output_tensors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"custom-weights\"></a>\n",
"### 6.b. Traning using Custom Weights\n",
"\n",
"Because the quantized graph defintion and the float32 graph defintion share the same node names in the graph definitions, we can initally train the weights in float32, and then reload them with the quantized operations (which take longer) to fine-tune the model.\n",
"\n",
"First we train the model with custom weights but without quantization. Training is done with native float precision (32-bit floats). We load the training data set and batch the training with 10 epochs. When the performance reaches desired level or starts decredation, we stop the training iteration and save the weights as tensorflow checkpoint files. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Launch the training\n",
"```\n",
"tf.reset_default_graph()\n",
"sess = tf.Session(graph=tf.get_default_graph())\n",
"\n",
"with sess.as_default():\n",
" in_images, image_tensors, features, preds, featurizer = construct_model(quantized=False)\n",
" train_model(preds, in_images, img_train, label_train, is_retrain=False, train_epoch=10) \n",
" accuracy = test_model(preds, in_images, img_test, label_test) \n",
" print(\"Accuracy:\", accuracy)\n",
" featurizer.save_weights(custom_weights_dir + \"/rn50\", tf.get_default_session())\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Test Model\n",
"After training, we evaluate the trained model's accuracy on test dataset with quantization. So that we know the model's performance if it is deployed on the FPGA."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"tf.reset_default_graph()\n",
"sess = tf.Session(graph=tf.get_default_graph())\n",
"\n",
"with sess.as_default():\n",
" print(\"Testing trained model with quantization\")\n",
" in_images, image_tensors, features, preds, quantized_featurizer = construct_model(quantized=True, starting_weights_directory=custom_weights_dir)\n",
" accuracy = test_model(preds, in_images, img_test, label_test) \n",
" print(\"Accuracy:\", accuracy)\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Fine-Tune Model\n",
"Sometimes, the model's accuracy can drop significantly after quantization. In those cases, we need to retrain the model enabled with quantization to get better model accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"if (accuracy < 0.93):\n",
" with sess.as_default():\n",
" print(\"Fine-tuning model with quantization\")\n",
" train_model(preds, in_images, img_train, label_train, is_retrain=True, train_epoch=10)\n",
" accuracy = test_model(preds, in_images, img_test, label_test) \n",
" print(\"Accuracy:\", accuracy)\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Save Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"model_name = 'resnet50-catsanddogs-cw'\n",
"model_save_path = os.path.join(saved_model_dir, model_name)\n",
"\n",
"tf.saved_model.simple_save(sess, model_save_path,\n",
" inputs={'images': in_images},\n",
" outputs={'output_alias': preds})\n",
"\n",
"input_tensors = in_images.name\n",
"output_tensors = preds.name\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"create-image\"></a>\n",
"## 7. Create AccelContainerImage\n",
"\n",
"Below we will execute all the same steps as in the [Quickstart](./accelerated-models-quickstart.ipynb#create-image) to package the model we have saved locally into an accelerated Docker image saved in our workspace. To complete all the steps, it may take a few minutes. For more details on each step, check out the [Quickstart section on model registration](./accelerated-models-quickstart.ipynb#register-model)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"from azureml.core.model import Model\n",
"from azureml.core.image import Image\n",
"from azureml.accel import AccelOnnxConverter\n",
"from azureml.accel import AccelContainerImage\n",
"\n",
"# Retrieve workspace\n",
"ws = Workspace.from_config()\n",
"print(\"Successfully retrieved workspace:\", ws.name, ws.resource_group, ws.location, ws.subscription_id, '\\n')\n",
"\n",
"# Register model\n",
"registered_model = Model.register(workspace = ws,\n",
" model_path = model_save_path,\n",
" model_name = model_name)\n",
"print(\"Successfully registered: \", registered_model.name, registered_model.description, registered_model.version, '\\n', sep = '\\t')\n",
"\n",
"# Convert model\n",
"convert_request = AccelOnnxConverter.convert_tf_model(ws, registered_model, input_tensors, output_tensors)\n",
"if convert_request.wait_for_completion(show_output = False):\n",
" # If the above call succeeded, get the converted model\n",
" converted_model = convert_request.result\n",
" print(\"\\nSuccessfully converted: \", converted_model.name, converted_model.url, converted_model.version, \n",
" converted_model.id, converted_model.created_time, '\\n')\n",
"else:\n",
" print(\"Model conversion failed. Showing output.\")\n",
" convert_request.wait_for_completion(show_output = True)\n",
"\n",
"# Package into AccelContainerImage\n",
"image_config = AccelContainerImage.image_configuration()\n",
"# Image name must be lowercase\n",
"image_name = \"{}-image\".format(model_name)\n",
"image = Image.create(name = image_name,\n",
" models = [converted_model],\n",
" image_config = image_config, \n",
" workspace = ws)\n",
"image.wait_for_creation()\n",
"print(\"Created AccelContainerImage: {} {} {}\\n\".format(image.name, image.creation_state, image.image_location))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"deploy-image\"></a>\n",
"## 8. Deploy image\n",
"Once you have an Azure ML Accelerated Image in your Workspace, you can deploy it to two destinations, to a Databox Edge machine or to an AKS cluster. \n",
"\n",
"### 8.a. Deploy to Databox Edge Machine using IoT Hub\n",
"See the sample [here](https://github.com/Azure-Samples/aml-real-time-ai/) for using the Azure IoT CLI extension for deploying your Docker image to your Databox Edge Machine.\n",
"\n",
"### 8.b. Deploy to AKS Cluster"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create AKS ComputeTarget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"\n",
"# Uses the specific FPGA enabled VM (sku: Standard_PB6s)\n",
"# Standard_PB6s are available in: eastus, westus2, westeurope, southeastasia\n",
"prov_config = AksCompute.provisioning_configuration(vm_size = \"Standard_PB6s\",\n",
" agent_count = 1,\n",
" location = \"eastus\")\n",
"\n",
"aks_name = 'aks-pb6-tl'\n",
"# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace = ws, \n",
" name = aks_name, \n",
" provisioning_configuration = prov_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Provisioning an AKS cluster might take awhile (15 or so minutes), and we want to wait until it's successfully provisioned before we can deploy a service to it. If you interrupt this cell, provisioning of the cluster will continue. You can re-run it or check the status in your Workspace under Compute."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.wait_for_completion(show_output = True)\n",
"print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Deploy AccelContainerImage to AKS ComputeTarget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"sample-akswebservice-deploy-from-image"
]
},
"outputs": [],
"source": [
"%%time\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"\n",
"# Set the web service configuration (for creating a test service, we don't want autoscale enabled)\n",
"# Authentication is enabled by default, but for testing we specify False\n",
"aks_config = AksWebservice.deploy_configuration(autoscale_enabled=False,\n",
" num_replicas=1,\n",
" auth_enabled = False)\n",
"\n",
"aks_service_name ='my-aks-service-2'\n",
"\n",
"aks_service = Webservice.deploy_from_image(workspace = ws,\n",
" name = aks_service_name,\n",
" image = image,\n",
" deployment_config = aks_config,\n",
" deployment_target = aks_target)\n",
"aks_service.wait_for_deployment(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"test-service\"></a>\n",
"## 9. Test the service\n",
"\n",
"<a id=\"create-client\"></a>\n",
"### 9.a. Create Client\n",
"The image supports gRPC and the TensorFlow Serving \"predict\" API. We will create a PredictionClient from the Webservice object that can call into the docker image to get predictions. If you do not have the Webservice object, you can also create [PredictionClient](https://docs.microsoft.com/en-us/python/api/azureml-accel-models/azureml.accel.predictionclient?view=azure-ml-py) directly.\n",
"\n",
"**Note:** If you chose to use auth_enabled=True when creating your AksWebservice.deploy_configuration(), see documentation [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice(class)?view=azure-ml-py#get-keys--) on how to retrieve your keys and use either key as an argument to PredictionClient(...,access_token=key).\n",
"**WARNING:** If you are running on Azure Notebooks free compute, you will not be able to make outgoing calls to your service. Try locating your client on a different machine to consume it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Using the grpc client in AzureML Accelerated Models SDK\n",
"from azureml.accel import client_from_service\n",
"\n",
"# Initialize AzureML Accelerated Models client\n",
"client = client_from_service(aks_service)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"serve-model\"></a>\n",
"### 9.b. Serve the model\n",
"Let's see how our service does on a few images. It may get a few wrong."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify an image to classify\n",
"print('CATS')\n",
"for image_file in cat_files[:8]:\n",
" results = client.score_file(path=image_file, \n",
" input_name=input_tensors, \n",
" outputs=output_tensors)\n",
" result = 'CORRECT ' if results[0] > results[1] else 'WRONG '\n",
" print(result + str(results))\n",
"print('DOGS')\n",
"for image_file in dog_files[:8]:\n",
" results = client.score_file(path=image_file, \n",
" input_name=input_tensors, \n",
" outputs=output_tensors)\n",
" result = 'CORRECT ' if results[1] > results[0] else 'WRONG '\n",
" print(result + str(results))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"cleanup\"></a>\n",
"## 10. Cleanup\n",
"It's important to clean up your resources, so that you won't incur unnecessary costs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service.delete()\n",
"aks_target.delete()\n",
"image.delete()\n",
"registered_model.delete()\n",
"converted_model.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"appendix\"></a>\n",
"## 11. Appendix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"License for plot_confusion_matrix:\n",
"\n",
"New BSD License\n",
"\n",
"Copyright (c) 2007-2018 The scikit-learn developers.\n",
"All rights reserved.\n",
"\n",
"\n",
"Redistribution and use in source and binary forms, with or without\n",
"modification, are permitted provided that the following conditions are met:\n",
"\n",
" a. Redistributions of source code must retain the above copyright notice,\n",
" this list of conditions and the following disclaimer.\n",
" b. Redistributions in binary form must reproduce the above copyright\n",
" notice, this list of conditions and the following disclaimer in the\n",
" documentation and/or other materials provided with the distribution.\n",
" c. Neither the name of the Scikit-learn Developers nor the names of\n",
" its contributors may be used to endorse or promote products\n",
" derived from this software without specific prior written\n",
" permission. \n",
"\n",
"\n",
"THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n",
"AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n",
"IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n",
"ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR\n",
"ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n",
"DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n",
"SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n",
"CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\n",
"LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY\n",
"OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH\n",
"DAMAGE.\n"
]
}
],
"metadata": {
"authors": [
{
"name": "coverste"
},
{
"name": "paledger"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,8 +0,0 @@
name: accelerated-models-training
dependencies:
- pip:
- azureml-sdk
- azureml-accel-models[cpu]
- keras
- tqdm
- sklearn

Binary file not shown. (image, 74 KiB)

Binary file not shown. (image, 79 KiB)

View File

@@ -1,33 +0,0 @@
import json
import pandas as pd
from sklearn.externals import joblib
from azureml.core.model import Model
import tensorflow as tf


def init():
    global preprocess
    global network
    global scoring_explainer
    # Retrieve the path to the model file using the model name
    # Assume original model is named original_prediction_model
    featurize_path = Model.get_model_path('featurize')
    keras_model_path = Model.get_model_path('keras_model')
    scoring_explainer_path = Model.get_model_path('IBM_attrition_explainer')
    preprocess = joblib.load(featurize_path)
    network = tf.keras.models.load_model(keras_model_path)
    scoring_explainer = joblib.load(scoring_explainer_path)


def run(raw_data):
    # Get predictions and explanations for each data point
    data = pd.read_json(raw_data)
    preprocessed_data = preprocess.transform(data)
    # Make prediction
    predictions = network.predict(preprocessed_data)
    # Retrieve model explanations
    local_importance_values = scoring_explainer.explain(data)
    # You can return any data type as long as it is JSON-serializable
    return {'predictions': predictions.tolist(), 'local_importance_values': local_importance_values}

View File

@@ -1,612 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train and explain keras model locally and deploy model with scoring explainer\n",
"\n",
"\n",
"_**This notebook illustrates how to use the Azure Machine Learning Interpretability SDK to deploy a locally-trained keras model and its corresponding deep scoring explainer to Azure Container Instances (ACI) as a web service.**_\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"Problem: IBM employee attrition classification with keras (train and explain a model locally and use Azure Container Instances (ACI) for deploying your model and its corresponding deep scoring explainer as a web service.)\n",
"\n",
"---\n",
"\n",
"## Table of Contents\n",
"\n",
"1. [Introduction](#Introduction)\n",
"1. [Setup](#Setup)\n",
"1. [Run model explainer locally at training time](#Explain)\n",
" 1. Apply feature transformations\n",
" 1. Train a binary classification keras model\n",
" 1. Explain the model on raw features\n",
" 1. Generate global explanations\n",
" 1. Generate local explanations\n",
"1. [Visualize explanations](#Visualize)\n",
"1. [Deploy keras model and scoring explainer](#Deploy)\n",
"1. [Next steps](#Next)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction\n",
"\n",
"\n",
"This notebook showcases how to train and explain a keras classification model locally, and deploy the trained model and its corresponding DeepExplainer to Azure Container Instances (ACI).\n",
"It demonstrates the API calls that you need to make to submit a run for training and explaining a keras model to AMLCompute, download the compute explanations remotely, and visualizing the global and local explanations via a visualization dashboard that provides an interactive way of discovering patterns in model predictions and downloaded explanations. It also demonstrates how to use Azure Machine Learning MLOps capabilities to deploy your keras model and its corresponding DeepExplainer.\n",
"\n",
"We will showcase one of the tabular data explainers, DeepExplainer (SHAP), following these steps:\n",
"1.\tDevelop a machine learning script in Python which involves the training script and the explanation script.\n",
"2.\tRun the script locally.\n",
"3.\tUse the interpretability toolkit\u00e2\u20ac\u2122s visualization dashboard to visualize predictions and their explanation. If the metrics and explanations don't indicate a desired outcome, loop back to step 1 and iterate on your scripts.\n",
"5.\tAfter a satisfactory run is found, create a Deep Scoring Explainer and register the persisted model and its corresponding DeepExplainer in the model registry.\n",
"6.\tDevelop a scoring script.\n",
"7.\tCreate an image and register it in the image registry.\n",
"8.\tDeploy the image as a web service in Azure.\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"Make sure you go through the [configuration notebook](../../../../configuration.ipynb) first if you haven't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize a Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create workspace"
]
},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explain\n",
"Create An Experiment: **Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"experiment_name = 'explain_model_at_scoring_time'\n",
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
"run = experiment.start_logging()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get IBM attrition data\n",
"import os\n",
"import pandas as pd\n",
"\n",
"outdirname = 'dataset.6.21.19'\n",
"try:\n",
" from urllib import urlretrieve\n",
"except ImportError:\n",
" from urllib.request import urlretrieve\n",
"import zipfile\n",
"zipfilename = outdirname + '.zip'\n",
"urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)\n",
"with zipfile.ZipFile(zipfilename, 'r') as unzip:\n",
" unzip.extractall('.')\n",
"attritionData = pd.read_csv('./WA_Fn-UseC_-HR-Employee-Attrition.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.externals import joblib\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn_pandas import DataFrameMapper\n",
"\n",
"os.makedirs('./outputs', exist_ok=True)\n",
"\n",
"# Dropping Employee count as all values are 1 and hence attrition is independent of this feature\n",
"attritionData = attritionData.drop(['EmployeeCount'], axis=1)\n",
"# Dropping Employee Number since it is merely an identifier\n",
"attritionData = attritionData.drop(['EmployeeNumber'], axis=1)\n",
"attritionData = attritionData.drop(['Over18'], axis=1)\n",
"# Since all values are 80\n",
"attritionData = attritionData.drop(['StandardHours'], axis=1)\n",
"\n",
"# Converting target variables from string to numerical values\n",
"target_map = {'Yes': 1, 'No': 0}\n",
"attritionData[\"Attrition_numerical\"] = attritionData[\"Attrition\"].apply(lambda x: target_map[x])\n",
"target = attritionData[\"Attrition_numerical\"]\n",
"\n",
"attritionXData = attritionData.drop(['Attrition_numerical', 'Attrition'], axis=1)\n",
"\n",
"# Creating dummy columns for each categorical feature\n",
"categorical = []\n",
"for col, value in attritionXData.iteritems():\n",
" if value.dtype == 'object':\n",
" categorical.append(col)\n",
"\n",
"# Store the numerical columns in a list numerical\n",
"numerical = attritionXData.columns.difference(categorical)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"\n",
"# We create the preprocessing pipelines for both numeric and categorical data.\n",
"numeric_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')),\n",
" ('scaler', StandardScaler())])\n",
"\n",
"categorical_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
"\n",
"preprocess = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, numerical),\n",
" ('cat', categorical_transformer, categorical)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import make_pipeline\n",
"pipeline = make_pipeline(preprocess)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(attritionXData, \n",
" target, \n",
" test_size=0.2,\n",
" random_state=0,\n",
" stratify=target)\n",
"\n",
"X_train_t = pipeline.fit_transform(X_train)\n",
"X_test_t = pipeline.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check tensorflow version\n",
"import tensorflow as tf\n",
"from distutils.version import StrictVersion\n",
"\n",
"print(tf.__version__)\n",
"# Append classifier to preprocessing pipeline.\n",
"# Now we have a full prediction pipeline.\n",
"\n",
"\n",
"network = tf.keras.models.Sequential()\n",
"network.add(tf.keras.layers.Dense(units=16, activation='relu', input_shape=(X_train_t.shape[1],)))\n",
"network.add(tf.keras.layers.Dense(units=16, activation='relu'))\n",
"network.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))\n",
"\n",
"# Compile neural network\n",
"network.compile(loss='binary_crossentropy', # Cross-entropy\n",
" optimizer='rmsprop', # Root Mean Square Propagation\n",
" metrics=['accuracy']) # Accuracy performance metric\n",
"\n",
"# Train neural network\n",
"history = network.fit(X_train_t, # Features\n",
" y_train, # Target vector\n",
" epochs=20, # Number of epochs\n",
" verbose=1, # Print description after each epoch\n",
" batch_size=100, # Number of observations per batch\n",
" validation_data=(X_test_t, y_test)) # Data for evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# You can run the DeepExplainer directly, or run the TabularExplainer which will choose the most appropriate explainer\n",
"from interpret.ext.greybox import DeepExplainer\n",
"explainer = DeepExplainer(network,\n",
" X_train,\n",
" features=X_train.columns,\n",
" classes=[\"STAYING\", \"LEAVING\"], \n",
" transformations=preprocess,\n",
" model_task=\"classification\",\n",
" is_classifier=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save featurization prior to keras model in the outputs folder so it automatically get uploaded\n",
"# We cannot save Keras with the pipeline due to known issues with pickling Keras models\n",
"featurize_file_name = 'featurize.pkl'\n",
"\n",
"with open(featurize_file_name, 'wb') as file:\n",
" joblib.dump(value=preprocess, filename=os.path.join('./outputs/', featurize_file_name))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save keras model to disk\n",
"keras_model_file_name = 'keras_model.pkl'\n",
"network.save(os.path.join('./outputs/', keras_model_file_name))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Explain overall model predictions (global explanation)\n",
"# Passing in test dataset for evaluation examples - note it must be a representative sample of the original data\n",
"# x_train can be passed as well, but with more examples explanations it will\n",
"# take longer although they may be more accurate\n",
"global_explanation = explainer.explain_global(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.interpret.scoring.scoring_explainer import DeepScoringExplainer, save\n",
"from azureml.interpret.model.serialize import KerasSerializer\n",
"# ScoringExplainer with custom keras serializer\n",
"scoring_explainer = DeepScoringExplainer(explainer, serializer=KerasSerializer())\n",
"# Pickle scoring explainer locally\n",
"save(scoring_explainer, exist_ok=True)\n",
"\n",
"# Register featurization\n",
"run.upload_file(featurize_file_name, os.path.join('./outputs/', featurize_file_name))\n",
"featurize_model = run.register_model(model_name='featurize',\n",
" model_path=featurize_file_name)\n",
"\n",
"# Register keras model\n",
"run.upload_file(keras_model_file_name, os.path.join('./outputs/', keras_model_file_name))\n",
"keras_model = run.register_model(model_name='keras_model',\n",
" model_path=keras_model_file_name)\n",
"\n",
"# Register scoring explainer\n",
"run.upload_file('IBM_attrition_explainer.pkl', 'scoring_explainer.pkl')\n",
"scoring_explainer_model = run.register_model(model_name='IBM_attrition_explainer', model_path='IBM_attrition_explainer.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use helper utility to wrap keras model in scikit-learn style API for visualization dashboard\n",
"from interpret_community.common.model_wrapper import wrap_model\n",
"from interpret_community.dataset.dataset_wrapper import DatasetWrapper\n",
"wrapped_model, ml_domain = wrap_model(network, DatasetWrapper(X_test_t), \"classification\")\n",
"wrapped_model.fit = network.fit\n",
"from sklearn.pipeline import Pipeline\n",
"dashboard_pipeline = Pipeline(steps=[('preprocess', preprocess), ('network', wrapped_model)])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualize\n",
"Visualize the explanations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from interpret_community.widget import ExplanationDashboard"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ExplanationDashboard(global_explanation, dashboard_pipeline, datasetX=X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy \n",
"\n",
"Deploy Model and ScoringExplainer.\n",
"\n",
"Please note that you must indicate azureml-defaults with verion >= 1.0.45 as a pip dependency, because it contains the functionality needed to host the model as a web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.conda_dependencies import CondaDependencies \n",
"\n",
"# azureml-defaults is required to host the model as a web service.\n",
"azureml_pip_packages = [\n",
" 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n",
" 'azureml-interpret'\n",
"]\n",
"# Note: this is to pin the scikit-learn and pandas versions to be same as notebook.\n",
"# In production scenario user would choose their dependencies\n",
"import pkg_resources\n",
"available_packages = pkg_resources.working_set\n",
"sklearn_ver = None\n",
"pandas_ver = None\n",
"for dist in available_packages:\n",
" if dist.key == 'scikit-learn':\n",
" sklearn_ver = dist.version\n",
" elif dist.key == 'pandas':\n",
" pandas_ver = dist.version\n",
"sklearn_dep = 'scikit-learn'\n",
"pandas_dep = 'pandas'\n",
"if sklearn_ver:\n",
" sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n",
"if pandas_ver:\n",
" pandas_dep = 'pandas=={}'.format(pandas_ver)\n",
"# specify CondaDependencies obj\n",
"myenv = CondaDependencies.create(conda_packages=[sklearn_dep, pandas_dep],\n",
" pip_packages=['sklearn-pandas', 'pyyaml', 'tensorflow<2.0', 'keras==2.3.1'] + azureml_pip_packages)\n",
"\n",
"with open(\"myenv.yml\",\"w\") as f:\n",
" f.write(myenv.serialize_to_string())\n",
"\n",
"with open(\"myenv.yml\",\"r\") as f:\n",
" print(f.read())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.model import Model\n",
"# retrieve scoring explainer for deployment\n",
"scoring_explainer_model = Model(ws, 'IBM_attrition_explainer')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import Webservice\n",
"from azureml.core.model import InferenceConfig\n",
"from azureml.core.webservice import AciWebservice\n",
"from azureml.core.model import Model\n",
"from azureml.core.environment import Environment\n",
"\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1,\n",
" memory_gb=1,\n",
" tags={\"data\": \"IBM_Attrition\",\n",
" \"method\" : \"local_explanation\"},\n",
" description='Get local explanations for IBM Employee Attrition data')\n",
"\n",
"myenv = Environment.from_conda_specification(name=\"myenv\", file_path=\"myenv.yml\")\n",
"inference_config = InferenceConfig(entry_script=\"score_local_explain_keras.py\", environment=myenv)\n",
"\n",
"# Use configs and models generated above\n",
"service = Model.deploy(ws, 'model-scoring-keras-deploy-local', [scoring_explainer_model, featurize_model, keras_model], inference_config, aciconfig)\n",
"service.wait_for_deployment(show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(service.get_logs())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"\n",
"# Create data to test service with\n",
"sample_data = '{\"Age\":{\"899\":49},\"BusinessTravel\":{\"899\":\"Travel_Rarely\"},\"DailyRate\":{\"899\":1098},\"Department\":{\"899\":\"Research & Development\"},\"DistanceFromHome\":{\"899\":4},\"Education\":{\"899\":2},\"EducationField\":{\"899\":\"Medical\"},\"EnvironmentSatisfaction\":{\"899\":1},\"Gender\":{\"899\":\"Male\"},\"HourlyRate\":{\"899\":85},\"JobInvolvement\":{\"899\":2},\"JobLevel\":{\"899\":5},\"JobRole\":{\"899\":\"Manager\"},\"JobSatisfaction\":{\"899\":3},\"MaritalStatus\":{\"899\":\"Married\"},\"MonthlyIncome\":{\"899\":18711},\"MonthlyRate\":{\"899\":12124},\"NumCompaniesWorked\":{\"899\":2},\"OverTime\":{\"899\":\"No\"},\"PercentSalaryHike\":{\"899\":13},\"PerformanceRating\":{\"899\":3},\"RelationshipSatisfaction\":{\"899\":3},\"StockOptionLevel\":{\"899\":1},\"TotalWorkingYears\":{\"899\":23},\"TrainingTimesLastYear\":{\"899\":2},\"WorkLifeBalance\":{\"899\":4},\"YearsAtCompany\":{\"899\":1},\"YearsInCurrentRole\":{\"899\":0},\"YearsSinceLastPromotion\":{\"899\":0},\"YearsWithCurrManager\":{\"899\":0}}'\n",
"\n",
"headers = {'Content-Type':'application/json'}\n",
"\n",
"# send request to service\n",
"resp = requests.post(service.scoring_uri, sample_data, headers=headers)\n",
"\n",
"print(\"POST to url\", service.scoring_uri)\n",
"# can covert back to Python objects from json string if desired\n",
"print(\"prediction:\", resp.text)\n",
"result = json.loads(resp.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#plot the feature importance for the prediction\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt; plt.rcdefaults()\n",
"\n",
"labels = json.loads(sample_data)\n",
"labels = labels.keys()\n",
"objects = labels\n",
"y_pos = np.arange(len(objects))\n",
"performance = result[\"local_importance_values\"][0][0]\n",
"\n",
"plt.bar(y_pos, performance, align='center', alpha=0.5)\n",
"plt.xticks(y_pos, objects)\n",
"locs, labels = plt.xticks()\n",
"plt.setp(labels, rotation=90)\n",
"plt.ylabel('Feature impact - leaving vs not leaving')\n",
"plt.title('Local feature importance for prediction')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next\n",
"Learn about other use cases of the explain package on a:\n",
"1. [Training time: regression problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-regression-local.ipynb) \n",
"1. [Training time: binary classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-binary-classification-local.ipynb)\n",
"1. [Training time: multiclass classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-multiclass-classification-local.ipynb)\n",
"1. Explain models with engineered features:\n",
" 1. [Simple feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/simple-feature-transformations-explain-local.ipynb)\n",
" 1. [Advanced feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/advanced-feature-transformations-explain-local.ipynb)\n",
"1. [Save model explanations via Azure Machine Learning Run History](../run-history/save-retrieve-explanations-run-history.ipynb)\n",
"1. [Run explainers remotely on Azure Machine Learning Compute (AMLCompute)](../remote-explanation/explain-model-on-amlcompute.ipynb)\n",
"1. [Inferencing time: deploy a remotely-trained model and explainer](./train-explain-model-on-amlcompute-and-deploy.ipynb)\n",
"1. [Inferencing time: deploy a locally-trained model and explainer](./train-explain-model-locally-and-deploy.ipynb)"
]
}
],
"metadata": {
"authors": [
{
"name": "mesameki"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,12 +0,0 @@
name: train-explain-model-keras-locally-and-deploy
dependencies:
- pip:
- azureml-sdk
- azureml-interpret
- interpret-community[visualization]
- matplotlib
- azureml-contrib-interpret
- sklearn-pandas
- ipywidgets
- tensorflow<2.0
- keras

View File

@@ -1,350 +0,0 @@
import json
import tempfile
import numpy as np
import copy
import time
import torch
import torch._six
from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util
from collections import defaultdict
import utils
class CocoEvaluator(object):
def __init__(self, coco_gt, iou_types):
assert isinstance(iou_types, (list, tuple))
coco_gt = copy.deepcopy(coco_gt)
self.coco_gt = coco_gt
self.iou_types = iou_types
self.coco_eval = {}
for iou_type in iou_types:
self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
self.img_ids = []
self.eval_imgs = {k: [] for k in iou_types}
def update(self, predictions):
img_ids = list(np.unique(list(predictions.keys())))
self.img_ids.extend(img_ids)
for iou_type in self.iou_types:
results = self.prepare(predictions, iou_type)
coco_dt = loadRes(self.coco_gt, results) if results else COCO()
coco_eval = self.coco_eval[iou_type]
coco_eval.cocoDt = coco_dt
coco_eval.params.imgIds = list(img_ids)
img_ids, eval_imgs = evaluate(coco_eval)
self.eval_imgs[iou_type].append(eval_imgs)
def synchronize_between_processes(self):
for iou_type in self.iou_types:
self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
def accumulate(self):
for coco_eval in self.coco_eval.values():
coco_eval.accumulate()
def summarize(self):
for iou_type, coco_eval in self.coco_eval.items():
print("IoU metric: {}".format(iou_type))
coco_eval.summarize()
def prepare(self, predictions, iou_type):
if iou_type == "bbox":
return self.prepare_for_coco_detection(predictions)
elif iou_type == "segm":
return self.prepare_for_coco_segmentation(predictions)
elif iou_type == "keypoints":
return self.prepare_for_coco_keypoint(predictions)
else:
raise ValueError("Unknown iou type {}".format(iou_type))
def prepare_for_coco_detection(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"bbox": box,
"score": scores[k],
}
for k, box in enumerate(boxes)
]
)
return coco_results
def prepare_for_coco_segmentation(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
scores = prediction["scores"]
labels = prediction["labels"]
masks = prediction["masks"]
masks = masks > 0.5
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
rles = [
mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
for mask in masks
]
for rle in rles:
rle["counts"] = rle["counts"].decode("utf-8")
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"segmentation": rle,
"score": scores[k],
}
for k, rle in enumerate(rles)
]
)
return coco_results
def prepare_for_coco_keypoint(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
keypoints = prediction["keypoints"]
keypoints = keypoints.flatten(start_dim=1).tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
'keypoints': keypoint,
"score": scores[k],
}
for k, keypoint in enumerate(keypoints)
]
)
return coco_results
def convert_to_xywh(boxes):
xmin, ymin, xmax, ymax = boxes.unbind(1)
return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
def merge(img_ids, eval_imgs):
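# Gather the image ids and per-image evaluation results from all distributed
# processes, then merge them, keeping only unique image ids in sorted order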
all_img_ids = utils.all_gather(img_ids)
all_eval_imgs = utils.all_gather(eval_imgs)
merged_img_ids = []
for p in all_img_ids:
merged_img_ids.extend(p)
merged_eval_imgs = []
for p in all_eval_imgs:
merged_eval_imgs.append(p)
merged_img_ids = np.array(merged_img_ids)
merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
# keep only unique (and in sorted order) images
merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
merged_eval_imgs = merged_eval_imgs[..., idx]
return merged_img_ids, merged_eval_imgs
def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
img_ids, eval_imgs = merge(img_ids, eval_imgs)
img_ids = list(img_ids)
eval_imgs = list(eval_imgs.flatten())
coco_eval.evalImgs = eval_imgs
coco_eval.params.imgIds = img_ids
coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################
# Ideally, pycocotools wouldn't have hard-coded prints
# so that we could avoid copy-pasting those two functions
def createIndex(self):
# create index
# print('creating index...')
anns, cats, imgs = {}, {}, {}
imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
if 'annotations' in self.dataset:
for ann in self.dataset['annotations']:
imgToAnns[ann['image_id']].append(ann)
anns[ann['id']] = ann
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
catToImgs[ann['category_id']].append(ann['image_id'])
# print('index created!')
# create class members
self.anns = anns
self.imgToAnns = imgToAnns
self.catToImgs = catToImgs
self.imgs = imgs
self.cats = cats
maskUtils = mask_util
def loadRes(self, resFile):
"""
Load result file and return a result api object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = COCO()
res.dataset['images'] = [img for img in self.dataset['images']]
# print('Loading and preparing results...')
# tic = time.time()
if isinstance(resFile, torch._six.string_classes):
anns = json.load(open(resFile))
elif type(resFile) == np.ndarray:
anns = self.loadNumpyAnnotations(resFile)
else:
anns = resFile
assert type(anns) == list, 'results in not an array of objects'
annsImgIds = [ann['image_id'] for ann in anns]
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if 'caption' in anns[0]:
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
for id, ann in enumerate(anns):
ann['id'] = id + 1
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
bb = ann['bbox']
x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
if 'segmentation' not in ann:
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
ann['area'] = bb[2] * bb[3]
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'segmentation' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
# now only support compressed RLE format as segmentation results
ann['area'] = maskUtils.area(ann['segmentation'])
if 'bbox' not in ann:
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'keypoints' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
s = ann['keypoints']
x = s[0::3]
y = s[1::3]
x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
ann['area'] = (x2 - x1) * (y2 - y1)
ann['id'] = id + 1
ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]
# print('DONE (t={:0.2f}s)'.format(time.time()- tic))
res.dataset['annotations'] = anns
createIndex(res)
return res
def evaluate(self):
'''
Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
:return: None
'''
# tic = time.time()
# print('Running per image evaluation...')
p = self.params
# add backward compatibility if useSegm is specified in params
if p.useSegm is not None:
p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
# print('Evaluate annotation type *{}*'.format(p.iouType))
p.imgIds = list(np.unique(p.imgIds))
if p.useCats:
p.catIds = list(np.unique(p.catIds))
p.maxDets = sorted(p.maxDets)
self.params = p
self._prepare()
# loop through images, area range, max detection number
catIds = p.catIds if p.useCats else [-1]
if p.iouType == 'segm' or p.iouType == 'bbox':
computeIoU = self.computeIoU
elif p.iouType == 'keypoints':
computeIoU = self.computeOks
self.ious = {
(imgId, catId): computeIoU(imgId, catId)
for imgId in p.imgIds
for catId in catIds}
evaluateImg = self.evaluateImg
maxDet = p.maxDets[-1]
evalImgs = [
evaluateImg(imgId, catId, areaRng, maxDet)
for catId in catIds
for areaRng in p.areaRng
for imgId in p.imgIds
]
# this is NOT in the pycocotools code, but could be done outside
evalImgs = np.asarray(evalImgs).reshape(
len(catIds), len(p.areaRng), len(p.imgIds))
self._paramsEval = copy.deepcopy(self.params)
# toc = time.time()
# print('DONE (t={:0.2f}s).'.format(toc-tic))
return p.imgIds, evalImgs
#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################

View File

@@ -1,252 +0,0 @@
import copy
import os
from PIL import Image
import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask
from pycocotools.coco import COCO
import transforms as T
class FilterAndRemapCocoCategories(object):
def __init__(self, categories, remap=True):
self.categories = categories
self.remap = remap
def __call__(self, image, target):
anno = target["annotations"]
anno = [obj for obj in anno if obj["category_id"] in self.categories]
if not self.remap:
target["annotations"] = anno
return image, target
anno = copy.deepcopy(anno)
for obj in anno:
obj["category_id"] = self.categories.index(obj["category_id"])
target["annotations"] = anno
return image, target
def convert_coco_poly_to_mask(segmentations, height, width):
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = torch.as_tensor(mask, dtype=torch.uint8)
mask = mask.any(dim=2)
masks.append(mask)
if masks:
masks = torch.stack(masks, dim=0)
else:
masks = torch.zeros((0, height, width), dtype=torch.uint8)
return masks
class ConvertCocoPolysToMask(object):
def __call__(self, image, target):
w, h = image.size
image_id = target["image_id"]
image_id = torch.tensor([image_id])
anno = target["annotations"]
anno = [obj for obj in anno if obj['iscrowd'] == 0]
boxes = [obj["bbox"] for obj in anno]
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
classes = [obj["category_id"] for obj in anno]
classes = torch.tensor(classes, dtype=torch.int64)
segmentations = [obj["segmentation"] for obj in anno]
masks = convert_coco_poly_to_mask(segmentations, h, w)
keypoints = None
if anno and "keypoints" in anno[0]:
keypoints = [obj["keypoints"] for obj in anno]
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
num_keypoints = keypoints.shape[0]
if num_keypoints:
keypoints = keypoints.view(num_keypoints, -1, 3)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
classes = classes[keep]
masks = masks[keep]
if keypoints is not None:
keypoints = keypoints[keep]
target = {}
target["boxes"] = boxes
target["labels"] = classes
target["masks"] = masks
target["image_id"] = image_id
if keypoints is not None:
target["keypoints"] = keypoints
# for conversion to coco api
area = torch.tensor([obj["area"] for obj in anno])
iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
target["area"] = area
target["iscrowd"] = iscrowd
return image, target
def _coco_remove_images_without_annotations(dataset, cat_list=None):
def _has_only_empty_bbox(anno):
return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
def _count_visible_keypoints(anno):
return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
min_keypoints_per_image = 10
def _has_valid_annotation(anno):
# if it's empty, there is no annotation
if len(anno) == 0:
return False
# if all boxes have close to zero area, there is no annotation
if _has_only_empty_bbox(anno):
return False
# keypoint tasks have slightly different criteria for considering
# whether an annotation is valid
if "keypoints" not in anno[0]:
return True
# for keypoint detection tasks, only consider valid images those
# containing at least min_keypoints_per_image
if _count_visible_keypoints(anno) >= min_keypoints_per_image:
return True
return False
assert isinstance(dataset, torchvision.datasets.CocoDetection)
ids = []
for ds_idx, img_id in enumerate(dataset.ids):
ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = dataset.coco.loadAnns(ann_ids)
if cat_list:
anno = [obj for obj in anno if obj["category_id"] in cat_list]
if _has_valid_annotation(anno):
ids.append(ds_idx)
dataset = torch.utils.data.Subset(dataset, ids)
return dataset
def convert_to_coco_api(ds):
coco_ds = COCO()
# annotation IDs need to start at 1, not 0, see torchvision issue #1530
ann_id = 1
dataset = {'images': [], 'categories': [], 'annotations': []}
categories = set()
for img_idx in range(len(ds)):
# find better way to get target
# targets = ds.get_annotations(img_idx)
img, targets = ds[img_idx]
image_id = targets["image_id"].item()
img_dict = {}
img_dict['id'] = image_id
img_dict['height'] = img.shape[-2]
img_dict['width'] = img.shape[-1]
dataset['images'].append(img_dict)
bboxes = targets["boxes"]
bboxes[:, 2:] -= bboxes[:, :2]
bboxes = bboxes.tolist()
labels = targets['labels'].tolist()
areas = targets['area'].tolist()
iscrowd = targets['iscrowd'].tolist()
if 'masks' in targets:
masks = targets['masks']
# make masks Fortran contiguous for coco_mask
masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
if 'keypoints' in targets:
keypoints = targets['keypoints']
keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
num_objs = len(bboxes)
for i in range(num_objs):
ann = {}
ann['image_id'] = image_id
ann['bbox'] = bboxes[i]
ann['category_id'] = labels[i]
categories.add(labels[i])
ann['area'] = areas[i]
ann['iscrowd'] = iscrowd[i]
ann['id'] = ann_id
if 'masks' in targets:
ann["segmentation"] = coco_mask.encode(masks[i].numpy())
if 'keypoints' in targets:
ann['keypoints'] = keypoints[i]
ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3])
dataset['annotations'].append(ann)
ann_id += 1
dataset['categories'] = [{'id': i} for i in sorted(categories)]
coco_ds.dataset = dataset
coco_ds.createIndex()
return coco_ds
def get_coco_api_from_dataset(dataset):
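# Unwrap up to 10 levels of torch.utils.data.Subset nesting; if the underlying
# dataset is a CocoDetection, reuse its COCO object, otherwise build one from scratch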
for _ in range(10):
if isinstance(dataset, torchvision.datasets.CocoDetection):
break
if isinstance(dataset, torch.utils.data.Subset):
dataset = dataset.dataset
if isinstance(dataset, torchvision.datasets.CocoDetection):
return dataset.coco
return convert_to_coco_api(dataset)
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super(CocoDetection, self).__init__(img_folder, ann_file)
self._transforms = transforms
def __getitem__(self, idx):
img, target = super(CocoDetection, self).__getitem__(idx)
image_id = self.ids[idx]
target = dict(image_id=image_id, annotations=target)
if self._transforms is not None:
img, target = self._transforms(img, target)
return img, target
def get_coco(root, image_set, transforms, mode='instances'):
anno_file_template = "{}_{}2017.json"
PATHS = {
"train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))),
"val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))),
# "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val")))
}
t = [ConvertCocoPolysToMask()]
if transforms is not None:
t.append(transforms)
transforms = T.Compose(t)
img_folder, ann_file = PATHS[image_set]
img_folder = os.path.join(root, img_folder)
ann_file = os.path.join(root, ann_file)
dataset = CocoDetection(img_folder, ann_file, transforms=transforms)
if image_set == "train":
dataset = _coco_remove_images_without_annotations(dataset)
# dataset = torch.utils.data.Subset(dataset, [i for i in range(500)])
return dataset
def get_coco_kp(root, image_set, transforms):
return get_coco(root, image_set, transforms, mode="person_keypoints")

View File

@@ -1,77 +0,0 @@
import numpy as np
import os
import torch.utils.data
from azureml.core import Run
from PIL import Image
class PennFudanDataset(torch.utils.data.Dataset):
def __init__(self, root, transforms=None):
self.root = root
self.transforms = transforms
# load all image files, sorting them to ensure that they are aligned
self.img_dir = os.path.join(root, "PNGImages")
self.mask_dir = os.path.join(root, "PedMasks")
self.imgs = list(sorted(os.listdir(self.img_dir)))
self.masks = list(sorted(os.listdir(self.mask_dir)))
def __getitem__(self, idx):
# load images and masks
img_path = os.path.join(self.img_dir, self.imgs[idx])
mask_path = os.path.join(self.mask_dir, self.masks[idx])
img = Image.open(img_path).convert("RGB")
# note that we haven't converted the mask to RGB,
# because each color corresponds to a different instance
# with 0 being background
mask = Image.open(mask_path)
mask = np.array(mask)
# instances are encoded as different colors
obj_ids = np.unique(mask)
# first id is the background, so remove it
obj_ids = obj_ids[1:]
# split the color-encoded mask into a set
# of binary masks
masks = mask == obj_ids[:, None, None]
# get bounding box coordinates for each mask
num_objs = len(obj_ids)
boxes = []
for i in range(num_objs):
pos = np.where(masks[i])
xmin = np.min(pos[1])
xmax = np.max(pos[1])
ymin = np.min(pos[0])
ymax = np.max(pos[0])
boxes.append([xmin, ymin, xmax, ymax])
boxes = torch.as_tensor(boxes, dtype=torch.float32)
# there is only one class
labels = torch.ones((num_objs,), dtype=torch.int64)
masks = torch.as_tensor(masks, dtype=torch.uint8)
image_id = torch.tensor([idx])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
# suppose all instances are not crowd
iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["masks"] = masks
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
if self.transforms is not None:
img, target = self.transforms(img, target)
return img, target
def __len__(self):
return len(self.imgs)
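# Example usage (a sketch; get_transform comes from the accompanying script.py, and
# collate_fn is assumed to be the helper defined in the accompanying utils.py):
# dataset = PennFudanDataset('PennFudanPed', transforms=get_transform(train=True))
# data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True,
#                                           collate_fn=utils.collate_fn)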

View File

@@ -1,16 +0,0 @@
# From https://github.com/microsoft/AzureML-BERT/blob/master/finetune/PyTorch/dockerfile
FROM mcr.microsoft.com/azureml/base-gpu:openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04
RUN apt update && apt install git -y && rm -rf /var/lib/apt/lists/*
RUN /opt/miniconda/bin/conda update -n base -c defaults conda
RUN /opt/miniconda/bin/conda install -y cython=0.29.15 numpy=1.18.1
RUN /opt/miniconda/bin/conda install -y pytorch=1.4 torchvision=0.5.0 -c pytorch
# Install cocoapi, required for drawing bounding boxes
RUN git clone https://github.com/cocodataset/cocoapi.git && cd cocoapi/PythonAPI && python setup.py build_ext install
RUN pip install azureml-defaults
RUN pip install "azureml-dataprep[fuse]"
RUN pip install pandas pyarrow

View File

@@ -1,108 +0,0 @@
import math
import sys
import time
import torch
import torchvision.models.detection.mask_rcnn
from coco_utils import get_coco_api_from_dataset
from coco_eval import CocoEvaluator
import utils
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
header = 'Epoch: [{}]'.format(epoch)
lr_scheduler = None
if epoch == 0:
warmup_factor = 1. / 1000
warmup_iters = min(1000, len(data_loader) - 1)
lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
for images, targets in metric_logger.log_every(data_loader, print_freq, header):
images = list(image.to(device) for image in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
loss_dict = model(images, targets)
losses = sum(loss for loss in loss_dict.values())
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
if not math.isfinite(loss_value):
print("Loss is {}, stopping training".format(loss_value))
print(loss_dict_reduced)
sys.exit(1)
optimizer.zero_grad()
losses.backward()
optimizer.step()
if lr_scheduler is not None:
lr_scheduler.step()
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def _get_iou_types(model):
model_without_ddp = model
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
model_without_ddp = model.module
iou_types = ["bbox"]
if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
iou_types.append("segm")
if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN):
iou_types.append("keypoints")
return iou_types
@torch.no_grad()
def evaluate(model, data_loader, device):
n_threads = torch.get_num_threads()
# FIXME remove this and make paste_masks_in_image run on the GPU
torch.set_num_threads(1)
cpu_device = torch.device("cpu")
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
coco = get_coco_api_from_dataset(data_loader.dataset)
iou_types = _get_iou_types(model)
coco_evaluator = CocoEvaluator(coco, iou_types)
for image, targets in metric_logger.log_every(data_loader, 100, header):
image = list(img.to(device) for img in image)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
torch.cuda.synchronize()
model_time = time.time()
outputs = model(image)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
evaluator_time = time.time()
coco_evaluator.update(res)
evaluator_time = time.time() - evaluator_time
metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print("Averaged stats:", metric_logger)
coco_evaluator.synchronize_between_processes()
# accumulate predictions from all images
coco_evaluator.accumulate()
coco_evaluator.summarize()
torch.set_num_threads(n_threads)
return coco_evaluator

View File

@@ -1,23 +0,0 @@
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
def get_instance_segmentation_model(num_classes):
# load an instance segmentation model pre-trained on COCO
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
# get the number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
# now get the number of input features for the mask classifier
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
hidden_layer = 256
# and replace the mask predictor with a new one
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
hidden_layer,
num_classes)
return model
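# Example usage (a sketch): the Penn-Fudan pedestrian task has two classes
# (background and pedestrian), so the model would typically be built as:
# model = get_instance_segmentation_model(num_classes=2)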

View File

@@ -1,544 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Object detection with PyTorch, Mask R-CNN, and a custom Dockerfile\n",
"\n",
"In this tutorial, you will finetune a pre-trained [Mask R-CNN](https://arxiv.org/abs/1703.06870) model on images from the [Penn-Fudan Database for Pedestrian Detection and Segmentation](https://www.cis.upenn.edu/~jshi/ped_html/). The dataset has 170 images with 345 instances of pedestrians. After running this tutorial, you will have a model that can outline the silhouettes of all pedestrians within an image.\n",
"\n",
"You\u00e2\u20ac\u2122ll use Azure Machine Learning to: \n",
"\n",
"- Initialize a workspace \n",
"- Create a compute cluster\n",
"- Define a training environment\n",
"- Train a model remotely\n",
"- Register your model\n",
"- Generate predictions locally\n",
"\n",
"## Prerequisities\n",
"\n",
"- If you are using an Azure Machine Learning Notebook VM, your environment already meets these prerequisites. Otherwise, go through the [configuration notebook](../../../../../configuration.ipynb) to install the Azure Machine Learning Python SDK and [create an Azure ML Workspace](https://docs.microsoft.com/azure/machine-learning/how-to-manage-workspace#create-a-workspace). You also need matplotlib 3.2, pycocotools-2.0.0, torchvision >= 0.5.0 and torch >= 1.4.0.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number, check other dependencies\n",
"import azureml.core\n",
"import matplotlib\n",
"import pycocotools\n",
"import torch\n",
"import torchvision\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"\n",
"Opt-in diagnostics for better experience, quality, and security in future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"\n",
"set_diagnostics_collection(send_diagnostics=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize a workspace\n",
"\n",
"Initialize a [workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`, using the [from_config()](https://docs.microsoft.com/python/api/azureml-core/azureml.core.workspace(class)?view=azure-ml-py#from-config-path-none--auth-none---logger-none---file-name-none-) method."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or attach existing Azure ML Managed Compute\n",
"\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/concept-compute-target) for training your model. In this tutorial, we use [Azure ML managed compute](https://docs.microsoft.com/azure/machine-learning/how-to-set-up-training-targets#amlcompute) for our remote training compute resource. Specifically, the below code creates a `STANDARD_NC6` GPU cluster that autoscales from 0 to 4 nodes.\n",
"\n",
"**Creation of Compute takes approximately 5 minutes.** If the Aauzre ML Compute with that name is already in your workspace, this code will skip the creation process. \n",
"\n",
"As with other Azure servies, there are limits on certain resources associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/azure/machine-learning/how-to-manage-quotas) on the default limits and how to request more quota.\n",
"\n",
"> Note that the below code creates GPU compute. If you instead want to create CPU compute, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = 'gpu-cluster'\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current cluster. \n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define a training environment\n",
"\n",
"### Create a project directory\n",
"Create a directory that will contain all the code from your local machine that you will need access to on the remote resource. This includes the training script an any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"project_folder = './pytorch-peds'\n",
"\n",
"try:\n",
" os.makedirs(project_folder, exist_ok=False)\n",
"except FileExistsError:\n",
" print('project folder {} exists, moving on...'.format(project_folder))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Copy training script and dependencies into project directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"files_to_copy = ['data', 'model', 'script', 'utils', 'transforms', 'coco_eval', 'engine', 'coco_utils']\n",
"for file in files_to_copy:\n",
" shutil.copy(os.path.join(os.getcwd(), (file + '.py')), project_folder)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = 'pytorch-peds'\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify dependencies with a custom Dockerfile\n",
"\n",
"There are a number of ways to [use environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments) for specifying dependencies during model training. In this case, we use a custom Dockerfile."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"my_env = Environment(name='maskr-docker')\n",
"my_env.docker.enabled = True\n",
"with open(\"dockerfiles/Dockerfile\", \"r\") as f:\n",
" dockerfile_contents=f.read()\n",
"my_env.docker.base_dockerfile=dockerfile_contents\n",
"my_env.docker.base_image = None\n",
"my_env.python.interpreter_path = '/opt/miniconda/bin/python'\n",
"my_env.python.user_managed_dependencies = True\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a ScriptRunConfig\n",
"\n",
"Use the [ScriptRunConfig](https://docs.microsoft.com/python/api/azureml-core/azureml.core.scriptrunconfig?view=azure-ml-py) class to define your run. Specify the source directory, compute target, and environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.dnn import PyTorch\n",
"from azureml.core import ScriptRunConfig\n",
"\n",
"model_name = 'pytorch-peds'\n",
"output_dir = './outputs/'\n",
"n_epochs = 2\n",
"\n",
"script_args = [\n",
" '--model_name', model_name,\n",
" '--output_dir', output_dir,\n",
" '--n_epochs', n_epochs,\n",
"]\n",
"# Add training script to run config\n",
"runconfig = ScriptRunConfig(\n",
" source_directory=project_folder,\n",
" script=\"script.py\",\n",
" arguments=script_args)\n",
"\n",
"# Attach compute target to run config\n",
"runconfig.run_config.target = cluster_name\n",
"\n",
"# Uncomment the line below if you want to try this locally first\n",
"#runconfig.run_config.target = \"local\"\n",
"\n",
"# Attach environment to run config\n",
"runconfig.run_config.environment = my_env"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train remotely\n",
"\n",
"### Submit your run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Submit run \n",
"run = experiment.submit(runconfig)\n",
"\n",
"# to get more details of your run\n",
"print(run.get_details())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor your run\n",
"\n",
"Use a widget to keep track of your run. You can also view the status of the run within the [Azure Machine Learning service portal](https://ml.azure.com)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"\n",
"RunDetails(run).show()\n",
"run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test your model\n",
"\n",
"Now that we are done training, let's see how well this model actually performs.\n",
"\n",
"### Get your latest run\n",
"First, pull the latest run using `experiment.get_runs()`, which lists runs from `experiment` in reverse chronological order."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Run\n",
"\n",
"last_run = next(experiment.get_runs())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register your model\n",
"Next, [register the model](https://docs.microsoft.com/azure/machine-learning/concept-model-management-and-deployment#register-package-and-deploy-models-from-anywhere) from your run. Registering your model assigns it a version and helps you with auditability."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"last_run.register_model(model_name=model_name, model_path=os.path.join(output_dir, model_name))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download your model\n",
"Next, download this registered model. Notice how we can initialize the `Model` object with the name of the registered model, rather than a path to the file itself."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Model\n",
"\n",
"model = Model(workspace=ws, name=model_name)\n",
"path = model.download(target_dir='model', exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use your model to make a prediction\n",
"\n",
"Run inferencing on a single test image and display the results."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from azureml.core import Dataset\n",
"from data import PennFudanDataset\n",
"from script import get_transform, download_data, NUM_CLASSES\n",
"from model import get_instance_segmentation_model\n",
"\n",
"if torch.cuda.is_available():\n",
" device = torch.device('cuda')\n",
"else:\n",
" device = torch.device('cpu')\n",
"\n",
"# Instantiate model with correct weights, cast to correct device, place in evaluation mode\n",
"predict_model = get_instance_segmentation_model(NUM_CLASSES)\n",
"predict_model.to(device)\n",
"predict_model.load_state_dict(torch.load(path, map_location=device))\n",
"predict_model.eval()\n",
"\n",
"# Load dataset\n",
"root_dir=download_data()\n",
"dataset_test = PennFudanDataset(root=root_dir, transforms=get_transform(train=False))\n",
"\n",
"# pick one image from the test set\n",
"img, _ = dataset_test[0]\n",
"\n",
"with torch.no_grad():\n",
" prediction = predict_model([img.to(device)])\n",
"\n",
"# model = torch.load(path)\n",
"#torch.load(model.get_model_path(model_name='outputs/model.pt'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Display the input image\n",
"\n",
"While tensors are great for computers, a tensor of RGB values doesn't mean much to a human. Let's display the input image in a way that a human could understand."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"\n",
"\n",
"Image.fromarray(img.mul(255).permute(1, 2, 0).byte().numpy())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Display the predicted masks\n",
"\n",
"The prediction consists of masks, displaying the outline of pedestrians in the image. Let's take a look at the first two masks, below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Image.fromarray(prediction[0]['masks'][0, 0].mul(255).byte().cpu().numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Image.fromarray(prediction[0]['masks'][1, 0].mul(255).byte().cpu().numpy())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next steps\n",
"\n",
"Congratulations! You just trained a Mask R-CNN model with PyTorch in Azure Machine Learning. As next steps, consider:\n",
"1. Learn more about using PyTorch in Azure Machine Learning service by checking out the [README](./README.md]\n",
"2. Try exporting your model to [ONNX](https://docs.microsoft.com/azure/machine-learning/concept-onnx) for accelerated inferencing."
]
}
],
"metadata": {
"authors": [
{
"name": "gopalv"
}
],
"category": "training",
"compute": [
"AML Compute"
],
"datasets": [
"Custom"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"PyTorch"
],
"friendly_name": "PyTorch object detection",
"index_order": 1,
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5-final"
},
"nteract": {
"version": "nteract-front-end@1.0.0"
},
"tags": [
"remote run",
"docker"
],
"task": "Fine-tune PyTorch object detection model with a custom dockerfile"
},
"nbformat": 4,
"nbformat_minor": 2
}
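The notebook's "Next steps" above suggests exporting the trained model to ONNX. The following is a minimal, hedged sketch of what that export might look like, not part of the original sample: it reuses the `predict_model`, `device`, and `img` objects from the prediction cell and assumes a torchvision version whose detection models support ONNX export at opset 11.

import torch

# Hedged ONNX export sketch (assumes torchvision's Mask R-CNN supports ONNX export at opset 11).
# `predict_model`, `device`, and `img` come from the prediction cell in the notebook above.
predict_model.eval()
example_input = [img.to(device)]  # detection models take a list of 3xHxW tensors
torch.onnx.export(predict_model,
                  example_input,
                  "mask_rcnn.onnx",
                  opset_version=11,
                  do_constant_folding=True)

The exported file could then be served with ONNX Runtime for accelerated inferencing, as the linked concept page describes.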

View File

@@ -1,14 +0,0 @@
name: pytorch-mask-rcnn
dependencies:
- cython
- pytorch==1.4.0 -c pytorch
- torchvision -c pytorch
- pip:
- azureml-sdk
- azureml-widgets
- azureml-dataprep
- fuse
- pandas
- matplotlib
- pillow==7.0.0
- git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI

View File

@@ -1,117 +0,0 @@
import argparse
import os
import torch
import torchvision
import transforms as T
import urllib.request
import utils
from azureml.core import Dataset, Run
from data import PennFudanDataset
from engine import train_one_epoch, evaluate
from model import get_instance_segmentation_model
from zipfile import ZipFile
NUM_CLASSES = 2
def download_data():
data_file = 'PennFudanPed.zip'
ds_path = 'PennFudanPed/'
urllib.request.urlretrieve('https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip', data_file)
zip = ZipFile(file=data_file)
zip.extractall(path=ds_path)
return os.path.join(ds_path, zip.namelist()[0])
def get_transform(train):
transforms = []
# converts the image, a PIL image, into a PyTorch Tensor
transforms.append(T.ToTensor())
if train:
# during training, randomly flip the training images
# and ground-truth for data augmentation
transforms.append(T.RandomHorizontalFlip(0.5))
return T.Compose(transforms)
def main():
print("Torch version:", torch.__version__)
# get command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default="pytorch-peds.pt",
help='name with which to register your model')
parser.add_argument('--output_dir', default="local-outputs",
type=str, help='output directory')
parser.add_argument('--n_epochs', type=int,
default=10, help='number of epochs')
args = parser.parse_args()
# In case user inputs a nested output directory
os.makedirs(name=args.output_dir, exist_ok=True)
# Get a dataset by name
root_dir = download_data()
# use our dataset and defined transformations
dataset = PennFudanDataset(root=root_dir, transforms=get_transform(train=True))
dataset_test = PennFudanDataset(root=root_dir, transforms=get_transform(train=False))
# split the dataset in train and test set
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])
# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
dataset, batch_size=2, shuffle=True, num_workers=4,
collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1, shuffle=False, num_workers=4,
collate_fn=utils.collate_fn)
if torch.cuda.is_available():
print('Using GPU')
device = torch.device('cuda')
else:
print('Using CPU')
device = torch.device('cpu')
# our dataset has two classes only - background and person
num_classes = NUM_CLASSES
# get the model using our helper function
model = get_instance_segmentation_model(num_classes)
# move model to the right device
model.to(device)
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=3,
gamma=0.1)
for epoch in range(args.n_epochs):
# train for one epoch, printing every 10 iterations
train_one_epoch(
model, optimizer, data_loader, device, epoch, print_freq=10)
# update the learning rate
lr_scheduler.step()
# evaluate on the test dataset
evaluate(model, data_loader_test, device=device)
# Saving the state dict is recommended method, per
# https://pytorch.org/tutorials/beginner/saving_loading_models.html
torch.save(model.state_dict(), os.path.join(args.output_dir, args.model_name))
if __name__ == '__main__':
main()
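Because the script above exposes its settings through argparse, it can be smoke-tested locally before submitting a remote run. A hedged sketch, assuming `script.py` and its sibling modules (`data.py`, `model.py`, `engine.py`, `utils.py`, `transforms.py`) are in the working directory:

# Hedged local smoke-test sketch for the training script above (not part of the original sample).
import subprocess

subprocess.run([
    "python", "script.py",
    "--model_name", "pytorch-peds.pt",   # matches the argparse defaults shown above
    "--output_dir", "local-outputs",
    "--n_epochs", "1",                   # one epoch is enough to verify the pipeline end to end
], check=True)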

View File

@@ -1,50 +0,0 @@
import random
import torch
from torchvision.transforms import functional as F
def _flip_coco_person_keypoints(kps, width):
flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
flipped_data = kps[:, flip_inds]
flipped_data[..., 0] = width - flipped_data[..., 0]
# Maintain COCO convention that if visibility == 0, then x, y = 0
inds = flipped_data[..., 2] == 0
flipped_data[inds] = 0
return flipped_data
class Compose(object):
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, image, target):
for t in self.transforms:
image, target = t(image, target)
return image, target
class RandomHorizontalFlip(object):
def __init__(self, prob):
self.prob = prob
def __call__(self, image, target):
if random.random() < self.prob:
height, width = image.shape[-2:]
image = image.flip(-1)
bbox = target["boxes"]
bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
target["boxes"] = bbox
if "masks" in target:
target["masks"] = target["masks"].flip(-1)
if "keypoints" in target:
keypoints = target["keypoints"]
keypoints = _flip_coco_person_keypoints(keypoints, width)
target["keypoints"] = keypoints
return image, target
class ToTensor(object):
def __call__(self, image, target):
image = F.to_tensor(image)
return image, target

View File

@@ -1,326 +0,0 @@
from __future__ import print_function
from collections import defaultdict, deque
import datetime
import pickle
import time
import torch
import torch.distributed as dist
import errno
import os
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def __init__(self, window_size=20, fmt=None):
if fmt is None:
fmt = "{median:.4f} ({global_avg:.4f})"
self.deque = deque(maxlen=window_size)
self.total = 0.0
self.count = 0
self.fmt = fmt
def update(self, value, n=1):
self.deque.append(value)
self.count += n
self.total += value * n
def synchronize_between_processes(self):
"""
Warning: does not synchronize the deque!
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
@property
def median(self):
d = torch.tensor(list(self.deque))
return d.median().item()
@property
def avg(self):
d = torch.tensor(list(self.deque), dtype=torch.float32)
return d.mean().item()
@property
def global_avg(self):
return self.total / self.count
@property
def max(self):
return max(self.deque)
@property
def value(self):
return self.deque[-1]
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
def all_gather(data):
"""
Run all_gather on arbitrary picklable data (not necessarily tensors)
Args:
data: any picklable object
Returns:
list[data]: list of data gathered from each rank
"""
world_size = get_world_size()
if world_size == 1:
return [data]
# serialized to a Tensor
buffer = pickle.dumps(data)
storage = torch.ByteStorage.from_buffer(buffer)
tensor = torch.ByteTensor(storage).to("cuda")
# obtain Tensor size of each rank
local_size = torch.tensor([tensor.numel()], device="cuda")
size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
dist.all_gather(size_list, local_size)
size_list = [int(size.item()) for size in size_list]
max_size = max(size_list)
# receiving Tensor from all ranks
# we pad the tensor because torch all_gather does not support
# gathering tensors of different shapes
tensor_list = []
for _ in size_list:
tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
if local_size != max_size:
padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
tensor = torch.cat((tensor, padding), dim=0)
dist.all_gather(tensor_list, tensor)
data_list = []
for size, tensor in zip(size_list, tensor_list):
buffer = tensor.cpu().numpy().tobytes()[:size]
data_list.append(pickle.loads(buffer))
return data_list
def reduce_dict(input_dict, average=True):
"""
Args:
input_dict (dict): all the values will be reduced
average (bool): whether to do average or sum
Reduce the values in the dictionary from all processes so that all processes
have the averaged results. Returns a dict with the same fields as
input_dict, after reduction.
"""
world_size = get_world_size()
if world_size < 2:
return input_dict
with torch.no_grad():
names = []
values = []
# sort the keys so that they are consistent across processes
for k in sorted(input_dict.keys()):
names.append(k)
values.append(input_dict[k])
values = torch.stack(values, dim=0)
dist.all_reduce(values)
if average:
values /= world_size
reduced_dict = {k: v for k, v in zip(names, values)}
return reduced_dict
class MetricLogger(object):
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
def update(self, **kwargs):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
assert isinstance(v, (float, int))
self.meters[k].update(v)
def __getattr__(self, attr):
if attr in self.meters:
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
for meter in self.meters.values():
meter.synchronize_between_processes()
def add_meter(self, name, meter):
self.meters[name] = meter
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ''
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}')
data_time = SmoothedValue(fmt='{avg:.4f}')
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
if torch.cuda.is_available():
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}',
'max mem: {memory:.0f}'
])
else:
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
yield obj
iter_time.update(time.time() - end)
if i % print_freq == 0 or i == len(iterable) - 1:
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB))
else:
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time)))
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {} ({:.4f} s / it)'.format(
header, total_time_str, total_time / len(iterable)))
def collate_fn(batch):
return tuple(zip(*batch))
def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
def f(x):
if x >= warmup_iters:
return 1
alpha = float(x) / warmup_iters
return warmup_factor * (1 - alpha) + alpha
return torch.optim.lr_scheduler.LambdaLR(optimizer, f)
def mkdir(path):
try:
os.makedirs(path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
def get_rank():
if not is_dist_avail_and_initialized():
return 0
return dist.get_rank()
def is_main_process():
return get_rank() == 0
def save_on_master(*args, **kwargs):
if is_main_process():
torch.save(*args, **kwargs)
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.gpu = args.rank % torch.cuda.device_count()
else:
print('Not using distributed mode')
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
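The helpers above are consumed by the training loop in `engine.py`. As a hedged orientation sketch (not the actual engine code), `collate_fn` keeps variable-sized detection targets grouped as tuples, and `MetricLogger.log_every` wraps the data loader to print smoothed timing statistics:

import torch

# Hedged usage sketch for collate_fn and MetricLogger defined above; `dataset` is assumed to be
# the PennFudanDataset built elsewhere in this sample.
loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True,
                                     collate_fn=collate_fn)
logger = MetricLogger(delimiter="  ")
for images, targets in logger.log_every(loader, print_freq=10, header="Epoch [0]"):
    pass  # the forward/backward pass over (images, targets) would go here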

View File

@@ -9,5 +9,4 @@ dependencies:
- keras
- tensorflow==2.0.0
- matplotlib
- azureml-dataprep
- fuse

View File

@@ -7,6 +7,5 @@ dependencies:
- tensorflow-gpu==1.13.2
- horovod==0.16.1
- matplotlib
- azureml-dataprep
- pandas
- fuse

View File

@@ -7,5 +7,4 @@ dependencies:
- keras
- tensorflow==1.14.0
- matplotlib
- azureml-dataprep
- fuse

View File

@@ -20,11 +20,11 @@ Using these samples, you will be able to do the following.
| File/folder | Description |
|-------------------|--------------------------------------------|
| [README.md](README.md) | This README file. |
| [devenv_setup.ipynb](setup/devenv_setup.ipynb) | Notebook to setup development environment for Azure ML RL |
| [cartpole_ci.ipynb](cartpole-on-compute-instance/cartpole_ci.ipynb) | Notebook to train a Cartpole playing agent on an Azure ML Compute Instance |
| [cartpole_cc.ipynb](cartpole-on-single-compute/cartpole_cc.ipynb) | Notebook to train a Cartpole playing agent on an Azure ML Compute Cluster (single node) |
| [pong_rllib.ipynb](atari-on-distributed-compute/pong_rllib.ipynb) | Notebook to train Pong agent using RLlib on multiple compute targets |
| [minecraft.ipynb](minecraft-on-distributed-compute/minecraft.ipynb) | Notebook to train an agent to navigate through a lava maze in the Minecraft game |
## Prerequisites
@@ -111,7 +111,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
For more on SDK concepts, please refer to [notebooks](https://github.com/Azure/MachineLearningNotebooks).
**Please let us know your feedback.**
**Please let us know your [feedback](https://github.com/Azure/MachineLearningNotebooks/labels/Reinforcement%20Learning).**

View File

@@ -0,0 +1,70 @@
FROM mcr.microsoft.com/azureml/base:openmpi3.1.2-ubuntu18.04
# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
cpio \
git \
bzip2 \
libx11-6 \
tmux \
htop \
gcc \
xvfb \
python-opengl \
x11-xserver-utils \
ffmpeg \
mesa-utils \
nano \
vim \
rsync \
&& rm -rf /var/lib/apt/lists/*
# Create a working directory
RUN mkdir /app
WORKDIR /app
# Install Minecraft needed libraries
RUN mkdir -p /usr/share/man/man1 && \
sudo apt-get update && \
sudo apt-get install -y \
openjdk-8-jre-headless=8u162-b12-1 \
openjdk-8-jdk-headless=8u162-b12-1 \
openjdk-8-jre=8u162-b12-1 \
openjdk-8-jdk=8u162-b12-1
# Create a Python 3.7 environment
RUN conda install conda-build \
&& conda create -y --name py37 python=3.7.3 \
&& conda clean -ya
ENV CONDA_DEFAULT_ENV=py37
# Install minerl
RUN pip install --upgrade --user minerl
RUN pip install \
pandas \
matplotlib \
numpy \
scipy \
azureml-defaults \
tensorboardX \
tensorflow==1.15rc2 \
tabulate \
dm_tree \
lz4 \
ray==0.8.3 \
ray[rllib]==0.8.3 \
ray[tune]==0.8.3
COPY patch_files/* /root/.local/lib/python3.7/site-packages/minerl/env/Malmo/Minecraft/src/main/java/com/microsoft/Malmo/Client/
# Start minerl to pre-fetch minerl files (saves time when starting minerl during training)
RUN xvfb-run -a -s "-screen 0 1400x900x24" python -c "import gym; import minerl; env = gym.make('MineRLTreechop-v0'); env.close();"
RUN pip install --index-url https://test.pypi.org/simple/ malmo && \
python -c "import malmo.minecraftbootstrap; malmo.minecraftbootstrap.download();"
ENV MALMO_XSD_PATH="/app/MalmoPlatform/Schemas"
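A Dockerfile like the one above is typically consumed from the Python SDK the same way the Mask R-CNN notebook earlier in this compare does: read it into an `Environment` with user-managed dependencies. A minimal sketch; the file path and interpreter path below are assumptions, not taken from the repository:

from azureml.core import Environment

# Hedged sketch: wrap the MineRL CPU Dockerfile above in an Azure ML Environment.
minerl_env = Environment(name="minerl-cpu")
minerl_env.docker.enabled = True
with open("docker/Dockerfile.cpu", "r") as f:   # assumed local path to the Dockerfile above
    minerl_env.docker.base_dockerfile = f.read()
minerl_env.docker.base_image = None                  # build from the Dockerfile instead of a base image
minerl_env.python.user_managed_dependencies = True   # the image already installs every Python package
minerl_env.python.interpreter_path = "/opt/miniconda/envs/py37/bin/python"  # assumes the 'py37' env created in the Dockerfile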

View File

@@ -0,0 +1,939 @@
// --------------------------------------------------------------------------------------------------
// Copyright (c) 2016 Microsoft Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// --------------------------------------------------------------------------------------------------
package com.microsoft.Malmo.Client;
import com.microsoft.Malmo.MalmoMod;
import com.microsoft.Malmo.MissionHandlerInterfaces.IWantToQuit;
import com.microsoft.Malmo.Schemas.MissionInit;
import com.microsoft.Malmo.Utils.TCPUtils;
import net.minecraft.profiler.Profiler;
import com.microsoft.Malmo.Utils.TimeHelper;
import net.minecraftforge.common.config.Configuration;
import java.io.*;
import java.net.ServerSocket;
import java.net.Socket;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.Hashtable;
import com.microsoft.Malmo.Utils.TCPInputPoller;
import java.util.logging.Level;
import java.util.LinkedList;
import java.util.List;
/**
* MalmoEnvServer - service supporting OpenAI gym "environment" for multi-agent Malmo missions.
*/
public class MalmoEnvServer implements IWantToQuit {
private static Profiler profiler = new Profiler();
private static int nsteps = 0;
private static boolean debug = false;
private static String hello = "<MalmoEnv" ;
private class EnvState {
// Mission parameters:
String missionInit = null;
String token = null;
String experimentId = null;
int agentCount = 0;
int reset = 0;
boolean quit = false;
boolean synchronous = false;
Long seed = null;
// OpenAI gym state:
boolean done = false;
double reward = 0.0;
byte[] obs = null;
String info = "";
LinkedList<String> commands = new LinkedList<String>();
}
private static boolean envPolicy = false; // Are we configured by config policy?
// Synchronize on EnvState
private Lock lock = new ReentrantLock();
private Condition cond = lock.newCondition();
private EnvState envState = new EnvState();
private Hashtable<String, Integer> initTokens = new Hashtable<String, Integer>();
static final long COND_WAIT_SECONDS = 3; // Max wait in seconds before timing out (and replying to RPC).
static final int BYTES_INT = 4;
static final int BYTES_DOUBLE = 8;
private static final Charset utf8 = Charset.forName("UTF-8");
// Service uses a single per-environment client connection - initiated by the remote environment.
private int port;
private TCPInputPoller missionPoller; // Used for command parsing and not actual communication.
private String version;
// AOG: From running experiments, I've found that MineRL can get stuck resetting the
// environment which causes huge delays while we wait for the Python side to time
// out and restart the Minecraft instance. Minecraft itself is normally in a recoverable
// state, but the MalmoEnvServer instance will be blocked in a tight spin loop trying to
// handle a Peek request from the Python client. To unstick things, I've added this
// flag that can be set when we know things are in a bad state to abort the peek request.
// WARNING: THIS IS ONLY TREATING THE SYMPTOM AND NOT THE ROOT CAUSE
// The reason things are getting stuck is because the player is either dying or we're
// receiving a quit request while an episode reset is in progress.
private boolean abortRequest;
public void abort() {
System.out.println("AOG: MalmoEnvServer.abort");
abortRequest = true;
}
/***
* Malmo "Env" service.
* @param port the port the service listens on.
* @param missionPoller for plugging into existing comms handling.
*/
public MalmoEnvServer(String version, int port, TCPInputPoller missionPoller) {
this.version = version;
this.missionPoller = missionPoller;
this.port = port;
// AOG - Assume we don't want to be aborting in the first place
this.abortRequest = false;
}
/** Initialize malmo env configuration. For now either on or "legacy" AgentHost protocol.*/
static public void update(Configuration configs) {
envPolicy = configs.get(MalmoMod.ENV_CONFIGS, "env", "false").getBoolean();
}
public static boolean isEnv() {
return envPolicy;
}
/**
* Start servicing the MalmoEnv protocol.
* @throws IOException
*/
public void serve() throws IOException {
ServerSocket serverSocket = new ServerSocket(port);
serverSocket.setPerformancePreferences(0,2,1);
while (true) {
try {
final Socket socket = serverSocket.accept();
socket.setTcpNoDelay(true);
Thread thread = new Thread("EnvServerSocketHandler") {
public void run() {
boolean running = false;
try {
checkHello(socket);
while (true) {
DataInputStream din = new DataInputStream(socket.getInputStream());
int hdr = din.readInt();
byte[] data = new byte[hdr];
din.readFully(data);
String command = new String(data, utf8);
if (command.startsWith("<Step")) {
profiler.startSection("root");
long start = System.nanoTime();
step(command, socket, din);
profiler.endSection();
if (nsteps % 100 == 0 && debug){
List<Profiler.Result> dat = profiler.getProfilingData("root");
for(int qq = 0; qq < dat.size(); qq++){
Profiler.Result res = dat.get(qq);
System.out.println(res.profilerName + " " + res.totalUsePercentage + " "+ res.usePercentage);
}
}
} else if (command.startsWith("<Peek")) {
peek(command, socket, din);
} else if (command.startsWith("<Init")) {
init(command, socket);
} else if (command.startsWith("<Find")) {
find(command, socket);
} else if (command.startsWith("<MissionInit")) {
if (missionInit(din, command, socket))
{
running = true;
}
} else if (command.startsWith("<Quit")) {
quit(command, socket);
profiler.profilingEnabled = false;
} else if (command.startsWith("<Exit")) {
exit(command, socket);
profiler.profilingEnabled = false;
} else if (command.startsWith("<Close")) {
close(command, socket);
profiler.profilingEnabled = false;
} else if (command.startsWith("<Status")) {
status(command, socket);
} else if (command.startsWith("<Echo")) {
command = "<Echo>" + command + "</Echo>";
data = command.getBytes(utf8);
hdr = data.length;
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(hdr);
dout.write(data, 0, hdr);
dout.flush();
} else {
throw new IOException("Unknown env service command");
}
}
} catch (IOException ioe) {
// ioe.printStackTrace();
TCPUtils.Log(Level.SEVERE, "MalmoEnv socket error: " + ioe + " (can be on disconnect)");
// System.out.println("[ERROR] " + "MalmoEnv socket error: " + ioe + " (can be on disconnect)");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] MalmoEnv socket error");
try {
if (running) {
TCPUtils.Log(Level.INFO,"Want to quit on disconnect.");
System.out.println("[LOGTOPY] " + "Want to quit on disconnect.");
setWantToQuit();
}
socket.close();
} catch (IOException ioe2) {
}
}
}
};
thread.start();
} catch (IOException ioe) {
TCPUtils.Log(Level.SEVERE, "MalmoEnv service exits on " + ioe);
}
}
}
private void checkHello(Socket socket) throws IOException {
DataInputStream din = new DataInputStream(socket.getInputStream());
int hdr = din.readInt();
if (hdr <= 0 || hdr > hello.length() + 8) // Version number may be somewhat longer in future.
throw new IOException("Invalid MalmoEnv hello header length");
byte[] data = new byte[hdr];
din.readFully(data);
if (!new String(data).startsWith(hello + version))
throw new IOException("MalmoEnv invalid protocol or version - expected " + hello + version);
}
// Handler for <MissionInit> messages.
private boolean missionInit(DataInputStream din, String command, Socket socket) throws IOException {
String ipOriginator = socket.getInetAddress().getHostName();
int hdr;
byte[] data;
hdr = din.readInt();
data = new byte[hdr];
din.readFully(data);
String id = new String(data, utf8);
TCPUtils.Log(Level.INFO,"Mission Init" + id);
String[] token = id.split(":");
String experimentId = token[0];
int role = Integer.parseInt(token[1]);
int reset = Integer.parseInt(token[2]);
int agentCount = Integer.parseInt(token[3]);
Boolean isSynchronous = Boolean.parseBoolean(token[4]);
Long seed = null;
if(token.length > 5)
seed = Long.parseLong(token[5]);
if(isSynchronous && agentCount > 1){
throw new IOException("Synchronous mode currently does not support multiple agents.");
}
port = -1;
boolean allTokensConsumed = true;
boolean started = false;
lock.lock();
try {
if (role == 0) {
String previousToken = experimentId + ":0:" + (reset - 1);
initTokens.remove(previousToken);
String myToken = experimentId + ":0:" + reset;
if (!initTokens.containsKey(myToken)) {
TCPUtils.Log(Level.INFO,"(Pre)Start " + role + " reset " + reset);
started = startUp(command, ipOriginator, experimentId, reset, agentCount, myToken, seed, isSynchronous);
if (started)
initTokens.put(myToken, 0);
} else {
started = true; // Pre-started previously.
}
// Check that all previous tokens have been consumed. If not don't proceed to mission.
allTokensConsumed = areAllTokensConsumed(experimentId, reset, agentCount);
if (!allTokensConsumed) {
try {
cond.await(COND_WAIT_SECONDS, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
}
allTokensConsumed = areAllTokensConsumed(experimentId, reset, agentCount);
}
} else {
TCPUtils.Log(Level.INFO, "Start " + role + " reset " + reset);
started = startUp(command, ipOriginator, experimentId, reset, agentCount, experimentId + ":" + role + ":" + reset, seed, isSynchronous);
}
} finally {
lock.unlock();
}
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(allTokensConsumed && started ? 1 : 0);
dout.flush();
dout.flush();
return allTokensConsumed && started;
}
private boolean areAllTokensConsumed(String experimentId, int reset, int agentCount) {
boolean allTokensConsumed = true;
for (int i = 1; i < agentCount; i++) {
String tokenForAgent = experimentId + ":" + i + ":" + (reset - 1);
if (initTokens.containsKey(tokenForAgent)) {
TCPUtils.Log(Level.FINE,"Mission init - unconsumed " + tokenForAgent);
allTokensConsumed = false;
}
}
return allTokensConsumed;
}
private boolean startUp(String command, String ipOriginator, String experimentId, int reset, int agentCount, String myToken, Long seed, Boolean isSynchronous) throws IOException {
// Clear out mission state
envState.reward = 0.0;
envState.commands.clear();
envState.obs = null;
envState.info = "";
envState.missionInit = command;
envState.done = false;
envState.quit = false;
envState.token = myToken;
envState.experimentId = experimentId;
envState.agentCount = agentCount;
envState.reset = reset;
envState.synchronous = isSynchronous;
envState.seed = seed;
return startUpMission(command, ipOriginator);
}
private boolean startUpMission(String command, String ipOriginator) throws IOException {
if (missionPoller == null)
return false;
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(baos);
missionPoller.commandReceived(command, ipOriginator, dos);
dos.flush();
byte[] reply = baos.toByteArray();
ByteArrayInputStream bais = new ByteArrayInputStream(reply);
DataInputStream dis = new DataInputStream(bais);
int hdr = dis.readInt();
byte[] replyBytes = new byte[hdr];
dis.readFully(replyBytes);
String replyStr = new String(replyBytes);
if (replyStr.equals("MALMOOK")) {
TCPUtils.Log(Level.INFO, "MalmoEnvServer Mission starting ...");
return true;
} else if (replyStr.equals("MALMOBUSY")) {
TCPUtils.Log(Level.INFO, "MalmoEnvServer Busy - I want to quit");
this.envState.quit = true;
}
return false;
}
private static final int stepTagLength = "<Step_>".length(); // Step with option code.
private synchronized void stepSync(String command, Socket socket, DataInputStream din) throws IOException
{
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Entering synchronous step.");
nsteps += 1;
profiler.startSection("commandProcessing");
String actions = command.substring(stepTagLength, command.length() - (stepTagLength + 2));
int options = Character.getNumericValue(command.charAt(stepTagLength - 2));
boolean withInfo = options == 0 || options == 2;
// Prepare to write data to the client.
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
double reward = 0.0;
boolean done;
byte[] obs;
String info = "";
boolean sent = false;
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Acquiring lock for synchronous step.");
lock.lock();
try {
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Lock is acquired.");
done = envState.done;
// TODO Handle when the environment is done.
// Process the actions.
if (actions.contains("\n")) {
String[] cmds = actions.split("\\n");
for(String cmd : cmds) {
envState.commands.add(cmd);
}
} else {
if (!actions.isEmpty())
envState.commands.add(actions);
}
sent = true;
profiler.endSection(); //cmd
profiler.startSection("requestTick");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Received: " + actions);
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Requesting tick.");
// Now wait to run a tick
// If synchronous mode is off then we should see if want to quit is true.
while(!TimeHelper.SyncManager.requestTick() && !done ){Thread.yield();}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Tick request granted.");
profiler.endSection();
profiler.startSection("waitForTick");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Waiting for tick.");
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted() && !done ){ Thread.yield();}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> TICK DONE. Getting observation.");
profiler.endSection();
profiler.startSection("getObservation");
// After which, get the observations.
obs = getObservation(done);
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Observation received. Getting info.");
profiler.endSection();
profiler.startSection("getInfo");
// Pick up rewards.
reward = envState.reward;
if (withInfo) {
info = envState.info;
// if(info == null)
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> FILLING INFO: NULL");
// else
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> FILLING " + info.toString());
}
done = envState.done;
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> STATUS " + Boolean.toString(done));
envState.info = null;
envState.obs = null;
envState.reward = 0.0;
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Info received..");
profiler.endSection();
} finally {
lock.unlock();
}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Lock released. Writing observation, info, done.");
profiler.startSection("writeObs");
dout.writeInt(obs.length);
dout.write(obs);
dout.writeInt(BYTES_DOUBLE + 2);
dout.writeDouble(reward);
dout.writeByte(done ? 1 : 0);
dout.writeByte(sent ? 1 : 0);
if (withInfo) {
byte[] infoBytes = info.getBytes(utf8);
dout.writeInt(infoBytes.length);
dout.write(infoBytes);
}
profiler.endSection(); //write obs
profiler.startSection("flush");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Packets written. Flushing.");
dout.flush();
profiler.endSection(); // flush
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Done with step.");
}
// Handler for <Step_> messages. Single digit option code after _ specifies if turnkey and info are included in message.
private void step(String command, Socket socket, DataInputStream din) throws IOException {
if(envState.synchronous){
stepSync(command, socket, din);
}
else{
System.out.println("[ERROR] Asynchronous stepping is not supported in MineRL.");
}
}
// Handler for <Peek> messages.
private void peek(String command, Socket socket, DataInputStream din) throws IOException {
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
byte[] obs;
boolean done;
String info = "";
// AOG - As we've only seen issues with the peek request, I've focused my changes to just
// this function. Initially we want to be optimistic and assume we're not going to abort
// the request and my observations of event timings indicate that there is plenty of time
// between the peek request being received and the reset failing, so a race condition is
// unlikely.
abortRequest = false;
lock.lock();
try {
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Waiting for pistol to fire.");
while(!TimeHelper.SyncManager.hasServerFiredPistol() && !abortRequest){
// Now wait to run a tick
while(!TimeHelper.SyncManager.requestTick() && !abortRequest){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted() && !abortRequest){ Thread.yield();}
Thread.yield();
}
if (abortRequest) {
System.out.println("AOG: Aborting peek request");
// AOG - We detect the lack of observation within our Python wrapper and throw a slightly
// different exception that bypasses MineRL's automatic clean-up code. If we were to report
// 'done', MineRL would detect this as a runtime error and kill the Minecraft process,
// triggering a lengthy restart. So far, from testing, Minecraft itself is fine and we can
// retry the reset; it's only the tight loops above that were causing things to stall and
// timeout.
// No observation
dout.writeInt(0);
// No info
dout.writeInt(0);
// Done
dout.writeInt(1);
dout.writeByte(0);
dout.flush();
return;
}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Pistol fired!.");
// Wait two ticks for the first observation from server to be propagated.
while(!TimeHelper.SyncManager.requestTick() ){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted()){ Thread.yield();}
while(!TimeHelper.SyncManager.requestTick() ){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted()){ Thread.yield();}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Getting observation.");
obs = getObservation(false);
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Observation acquired.");
done = envState.done;
info = envState.info;
} finally {
lock.unlock();
}
dout.writeInt(obs.length);
dout.write(obs);
byte[] infoBytes = info.getBytes(utf8);
dout.writeInt(infoBytes.length);
dout.write(infoBytes);
dout.writeInt(1);
dout.writeByte(done ? 1 : 0);
dout.flush();
}
// Get the current observation. If none and not done wait for a short time.
public byte[] getObservation(boolean done) {
byte[] obs = envState.obs;
if (obs == null){
System.out.println("[ERROR] Video observation is null; please notify the developer.");
}
return obs;
}
// Handler for <Find> messages - used by non-zero roles to discover integrated server port from primary (role 0) service.
private final static int findTagLength = "<Find>".length();
private void find(String command, Socket socket) throws IOException {
Integer port;
lock.lock();
try {
String token = command.substring(findTagLength, command.length() - (findTagLength + 1));
TCPUtils.Log(Level.INFO, "Find token? " + token);
// Purge previous token.
String[] tokenSplits = token.split(":");
String experimentId = tokenSplits[0];
int role = Integer.parseInt(tokenSplits[1]);
int reset = Integer.parseInt(tokenSplits[2]);
String previousToken = experimentId + ":" + role + ":" + (reset - 1);
initTokens.remove(previousToken);
cond.signalAll();
// Check for next token. Wait for a short time if not already produced.
port = initTokens.get(token);
if (port == null) {
try {
cond.await(COND_WAIT_SECONDS, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
}
port = initTokens.get(token);
if (port == null) {
port = 0;
TCPUtils.Log(Level.INFO,"Role " + role + " reset " + reset + " waiting for token.");
}
}
} finally {
lock.unlock();
}
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(port);
dout.flush();
}
public boolean isSynchronous(){
return envState.synchronous;
}
// Handler for <Init> messages. These reset the service so use with care!
private void init(String command, Socket socket) throws IOException {
lock.lock();
try {
initTokens = new Hashtable<String, Integer>();
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(1);
dout.flush();
} finally {
lock.unlock();
}
}
// Handler for <Quit> (quit mission) messages.
private void quit(String command, Socket socket) throws IOException {
lock.lock();
try {
if (!envState.done){
envState.quit = true;
}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Pistol fired!.");
// Wait two ticks for the first observation from server to be propagated.
while(!TimeHelper.SyncManager.requestTick() ){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted()){ Thread.yield();}
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(envState.done ? 1 : 0);
dout.flush();
} finally {
lock.unlock();
}
}
private final static int closeTagLength = "<Close>".length();
// Handler for <Close> messages.
private void close(String command, Socket socket) throws IOException {
lock.lock();
try {
String token = command.substring(closeTagLength, command.length() - (closeTagLength + 1));
initTokens.remove(token);
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(1);
dout.flush();
} finally {
lock.unlock();
}
}
// Handler for <Status> messages.
private void status(String command, Socket socket) throws IOException {
lock.lock();
try {
String status = "{}"; // TODO Possibly have something more interesting to report.
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
byte[] statusBytes = status.getBytes(utf8);
dout.writeInt(statusBytes.length);
dout.write(statusBytes);
dout.flush();
} finally {
lock.unlock();
}
}
// Handler for <Exit> messages. These "kill the service" temporarily so use with care!
private void exit(String command, Socket socket) throws IOException {
// lock.lock();
try {
// We may exit before we get a chance to reply.
TimeHelper.SyncManager.setSynchronous(false);
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(1);
dout.flush();
ClientStateMachine.exitJava();
} finally {
// lock.unlock();
}
}
// Malmo client state machine interface methods:
public String getCommand() {
try {
String command = envState.commands.poll();
if (command == null)
return "";
else
return command;
} finally {
}
}
public void endMission() {
// lock.lock();
try {
// AOG - If the mission is ending, we always want to abort requests and they won't
// be able to progress to completion and will stall.
System.out.println("AOG: MalmoEnvServer.endMission");
abort();
envState.done = true;
envState.quit = false;
envState.missionInit = null;
if (envState.token != null) {
initTokens.remove(envState.token);
envState.token = null;
envState.experimentId = null;
envState.agentCount = 0;
envState.reset = 0;
// cond.signalAll();
}
// lock.unlock();
} finally {
}
}
// Record a Malmo "observation" json - as the env info since an environment "obs" is a video frame.
public void observation(String info) {
// Parsing obs as JSON would be slower but less fragile than extracting the turn_key using string search.
// lock.lock();
try {
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <OBSERVATION> Inserting: " + info);
envState.info = info;
// cond.signalAll();
} finally {
// lock.unlock();
}
}
public void addRewards(double rewards) {
// lock.lock();
try {
envState.reward += rewards;
} finally {
// lock.unlock();
}
}
public void addFrame(byte[] frame) {
// lock.lock();
try {
envState.obs = frame; // Replaces current.
// cond.signalAll();
} finally {
// lock.unlock();
}
}
public void notifyIntegrationServerStarted(int integrationServerPort) {
lock.lock();
try {
if (envState.token != null) {
TCPUtils.Log(Level.INFO,"Integration server start up - token: " + envState.token);
addTokens(integrationServerPort, envState.token, envState.experimentId, envState.agentCount, envState.reset);
cond.signalAll();
} else {
TCPUtils.Log(Level.WARNING,"No mission token on integration server start up!");
}
} finally {
lock.unlock();
}
}
private void addTokens(int integratedServerPort, String myToken, String experimentId, int agentCount, int reset) {
initTokens.put(myToken, integratedServerPort);
// Place tokens for other agents to find.
for (int i = 1; i < agentCount; i++) {
String tokenForAgent = experimentId + ":" + i + ":" + reset;
initTokens.put(tokenForAgent, integratedServerPort);
}
}
// IWantToQuit implementation.
@Override
public boolean doIWantToQuit(MissionInit missionInit) {
// lock.lock();
try {
return envState.quit;
} finally {
// lock.unlock();
}
}
public Long getSeed(){
return envState.seed;
}
private void setWantToQuit() {
// lock.lock();
try {
envState.quit = true;
} finally {
if(TimeHelper.SyncManager.isSynchronous()){
// We want to desynchronize everything.
TimeHelper.SyncManager.setSynchronous(false);
}
// lock.unlock();
}
}
@Override
public void prepare(MissionInit missionInit) {
}
@Override
public void cleanup() {
}
@Override
public String getOutcome() {
return "Env quit";
}
}

View File

@@ -0,0 +1,78 @@
FROM mcr.microsoft.com/azureml/base-gpu:openmpi3.1.2-cuda10.0-cudnn7-ubuntu18.04
# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
cpio \
git \
bzip2 \
libx11-6 \
tmux \
htop \
gcc \
xvfb \
python-opengl \
x11-xserver-utils \
ffmpeg \
mesa-utils \
nano \
vim \
rsync \
&& rm -rf /var/lib/apt/lists/*
# Create a working directory
RUN mkdir /app
WORKDIR /app
# Create a Python 3.7 environment
RUN conda install conda-build \
&& conda create -y --name py37 python=3.7.3 \
&& conda clean -ya
ENV CONDA_DEFAULT_ENV=py37
# Install Minecraft needed libraries
RUN mkdir -p /usr/share/man/man1 && \
sudo apt-get update && \
sudo apt-get install -y \
openjdk-8-jre-headless=8u162-b12-1 \
openjdk-8-jdk-headless=8u162-b12-1 \
openjdk-8-jre=8u162-b12-1 \
openjdk-8-jdk=8u162-b12-1
RUN pip install --upgrade --user minerl
# PyTorch with CUDA 10 installation
RUN conda install -y -c pytorch \
cuda100=1.0 \
magma-cuda100=2.4.0 \
"pytorch=1.1.0=py3.7_cuda10.0.130_cudnn7.5.1_0" \
torchvision=0.3.0 \
&& conda clean -ya
RUN pip install \
pandas \
matplotlib \
numpy \
scipy \
azureml-defaults \
tensorboardX \
tensorflow-gpu==1.15rc2 \
GPUtil \
tabulate \
dm_tree \
lz4 \
ray==0.8.3 \
ray[rllib]==0.8.3 \
ray[tune]==0.8.3
COPY patch_files/* /root/.local/lib/python3.7/site-packages/minerl/env/Malmo/Minecraft/src/main/java/com/microsoft/Malmo/Client/
# Start minerl to pre-fetch minerl files (saves time when starting minerl during training)
RUN xvfb-run -a -s "-screen 0 1400x900x24" python -c "import gym; import minerl; env = gym.make('MineRLTreechop-v0'); env.close();"
RUN pip install --index-url https://test.pypi.org/simple/ malmo && \
python -c "import malmo.minecraftbootstrap; malmo.minecraftbootstrap.download();"
ENV MALMO_XSD_PATH="/app/MalmoPlatform/Schemas"

View File

@@ -0,0 +1,939 @@
// --------------------------------------------------------------------------------------------------
// Copyright (c) 2016 Microsoft Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// --------------------------------------------------------------------------------------------------
package com.microsoft.Malmo.Client;
import com.microsoft.Malmo.MalmoMod;
import com.microsoft.Malmo.MissionHandlerInterfaces.IWantToQuit;
import com.microsoft.Malmo.Schemas.MissionInit;
import com.microsoft.Malmo.Utils.TCPUtils;
import net.minecraft.profiler.Profiler;
import com.microsoft.Malmo.Utils.TimeHelper;
import net.minecraftforge.common.config.Configuration;
import java.io.*;
import java.net.ServerSocket;
import java.net.Socket;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.Hashtable;
import com.microsoft.Malmo.Utils.TCPInputPoller;
import java.util.logging.Level;
import java.util.LinkedList;
import java.util.List;
/**
* MalmoEnvServer - service supporting OpenAI gym "environment" for multi-agent Malmo missions.
*/
public class MalmoEnvServer implements IWantToQuit {
private static Profiler profiler = new Profiler();
private static int nsteps = 0;
private static boolean debug = false;
private static String hello = "<MalmoEnv" ;
private class EnvState {
// Mission parameters:
String missionInit = null;
String token = null;
String experimentId = null;
int agentCount = 0;
int reset = 0;
boolean quit = false;
boolean synchronous = false;
Long seed = null;
// OpenAI gym state:
boolean done = false;
double reward = 0.0;
byte[] obs = null;
String info = "";
LinkedList<String> commands = new LinkedList<String>();
}
private static boolean envPolicy = false; // Are we configured by config policy?
// Synchronize on EnvState
private Lock lock = new ReentrantLock();
private Condition cond = lock.newCondition();
private EnvState envState = new EnvState();
private Hashtable<String, Integer> initTokens = new Hashtable<String, Integer>();
static final long COND_WAIT_SECONDS = 3; // Max wait in seconds before timing out (and replying to RPC).
static final int BYTES_INT = 4;
static final int BYTES_DOUBLE = 8;
private static final Charset utf8 = Charset.forName("UTF-8");
// Service uses a single per-environment client connection - initiated by the remote environment.
private int port;
private TCPInputPoller missionPoller; // Used for command parsing and not actual communication.
private String version;
// AOG: From running experiments, I've found that MineRL can get stuck resetting the
// environment which causes huge delays while we wait for the Python side to time
// out and restart the Minecraft instance. Minecraft itself is normally in a recoverable
// state, but the MalmoEnvServer instance will be blocked in a tight spin loop trying to
// handle a Peek request from the Python client. To unstick things, I've added this
// flag that can be set when we know things are in a bad state to abort the peek request.
// WARNING: THIS IS ONLY TREATING THE SYMPTOM AND NOT THE ROOT CAUSE
// The reason things are getting stuck is because the player is either dying or we're
// receiving a quit request while an episode reset is in progress.
private boolean abortRequest;
public void abort() {
System.out.println("AOG: MalmoEnvServer.abort");
abortRequest = true;
}
/***
* Malmo "Env" service.
* @param port the port the service listens on.
* @param missionPoller for plugging into existing comms handling.
*/
public MalmoEnvServer(String version, int port, TCPInputPoller missionPoller) {
this.version = version;
this.missionPoller = missionPoller;
this.port = port;
// AOG - Assume we don't want to be aborting in the first place
this.abortRequest = false;
}
/** Initialize malmo env configuration. For now either on or "legacy" AgentHost protocol.*/
static public void update(Configuration configs) {
envPolicy = configs.get(MalmoMod.ENV_CONFIGS, "env", "false").getBoolean();
}
public static boolean isEnv() {
return envPolicy;
}
/**
* Start servicing the MalmoEnv protocol.
* @throws IOException
*/
public void serve() throws IOException {
ServerSocket serverSocket = new ServerSocket(port);
serverSocket.setPerformancePreferences(0,2,1);
while (true) {
try {
final Socket socket = serverSocket.accept();
socket.setTcpNoDelay(true);
Thread thread = new Thread("EnvServerSocketHandler") {
public void run() {
boolean running = false;
try {
checkHello(socket);
while (true) {
DataInputStream din = new DataInputStream(socket.getInputStream());
int hdr = din.readInt();
byte[] data = new byte[hdr];
din.readFully(data);
String command = new String(data, utf8);
if (command.startsWith("<Step")) {
profiler.startSection("root");
long start = System.nanoTime();
step(command, socket, din);
profiler.endSection();
if (nsteps % 100 == 0 && debug){
List<Profiler.Result> dat = profiler.getProfilingData("root");
for(int qq = 0; qq < dat.size(); qq++){
Profiler.Result res = dat.get(qq);
System.out.println(res.profilerName + " " + res.totalUsePercentage + " "+ res.usePercentage);
}
}
} else if (command.startsWith("<Peek")) {
peek(command, socket, din);
} else if (command.startsWith("<Init")) {
init(command, socket);
} else if (command.startsWith("<Find")) {
find(command, socket);
} else if (command.startsWith("<MissionInit")) {
if (missionInit(din, command, socket))
{
running = true;
}
} else if (command.startsWith("<Quit")) {
quit(command, socket);
profiler.profilingEnabled = false;
} else if (command.startsWith("<Exit")) {
exit(command, socket);
profiler.profilingEnabled = false;
} else if (command.startsWith("<Close")) {
close(command, socket);
profiler.profilingEnabled = false;
} else if (command.startsWith("<Status")) {
status(command, socket);
} else if (command.startsWith("<Echo")) {
command = "<Echo>" + command + "</Echo>";
data = command.getBytes(utf8);
hdr = data.length;
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(hdr);
dout.write(data, 0, hdr);
dout.flush();
} else {
throw new IOException("Unknown env service command");
}
}
} catch (IOException ioe) {
// ioe.printStackTrace();
TCPUtils.Log(Level.SEVERE, "MalmoEnv socket error: " + ioe + " (can be on disconnect)");
// System.out.println("[ERROR] " + "MalmoEnv socket error: " + ioe + " (can be on disconnect)");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] MalmoEnv socket error");
try {
if (running) {
TCPUtils.Log(Level.INFO,"Want to quit on disconnect.");
System.out.println("[LOGTOPY] " + "Want to quit on disconnect.");
setWantToQuit();
}
socket.close();
} catch (IOException ioe2) {
}
}
}
};
thread.start();
} catch (IOException ioe) {
TCPUtils.Log(Level.SEVERE, "MalmoEnv service exits on " + ioe);
}
}
}
private void checkHello(Socket socket) throws IOException {
DataInputStream din = new DataInputStream(socket.getInputStream());
int hdr = din.readInt();
if (hdr <= 0 || hdr > hello.length() + 8) // Version number may be somewhat longer in future.
throw new IOException("Invalid MalmoEnv hello header length");
byte[] data = new byte[hdr];
din.readFully(data);
if (!new String(data).startsWith(hello + version))
throw new IOException("MalmoEnv invalid protocol or version - expected " + hello + version);
}
// Handler for <MissionInit> messages.
private boolean missionInit(DataInputStream din, String command, Socket socket) throws IOException {
String ipOriginator = socket.getInetAddress().getHostName();
int hdr;
byte[] data;
hdr = din.readInt();
data = new byte[hdr];
din.readFully(data);
String id = new String(data, utf8);
TCPUtils.Log(Level.INFO,"Mission Init" + id);
String[] token = id.split(":");
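// Init token format: experimentId:role:reset:agentCount:synchronous[:seed]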
String experimentId = token[0];
int role = Integer.parseInt(token[1]);
int reset = Integer.parseInt(token[2]);
int agentCount = Integer.parseInt(token[3]);
Boolean isSynchronous = Boolean.parseBoolean(token[4]);
Long seed = null;
if(token.length > 5)
seed = Long.parseLong(token[5]);
if(isSynchronous && agentCount > 1){
throw new IOException("Synchronous mode currently does not support multiple agents.");
}
port = -1;
boolean allTokensConsumed = true;
boolean started = false;
lock.lock();
try {
if (role == 0) {
String previousToken = experimentId + ":0:" + (reset - 1);
initTokens.remove(previousToken);
String myToken = experimentId + ":0:" + reset;
if (!initTokens.containsKey(myToken)) {
TCPUtils.Log(Level.INFO,"(Pre)Start " + role + " reset " + reset);
started = startUp(command, ipOriginator, experimentId, reset, agentCount, myToken, seed, isSynchronous);
if (started)
initTokens.put(myToken, 0);
} else {
started = true; // Pre-started previously.
}
// Check that all previous tokens have been consumed. If not, don't proceed to the mission.
allTokensConsumed = areAllTokensConsumed(experimentId, reset, agentCount);
if (!allTokensConsumed) {
try {
cond.await(COND_WAIT_SECONDS, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
}
allTokensConsumed = areAllTokensConsumed(experimentId, reset, agentCount);
}
} else {
TCPUtils.Log(Level.INFO, "Start " + role + " reset " + reset);
started = startUp(command, ipOriginator, experimentId, reset, agentCount, experimentId + ":" + role + ":" + reset, seed, isSynchronous);
}
} finally {
lock.unlock();
}
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(allTokensConsumed && started ? 1 : 0);
dout.flush();
dout.flush();
return allTokensConsumed && started;
}
private boolean areAllTokensConsumed(String experimentId, int reset, int agentCount) {
boolean allTokensConsumed = true;
for (int i = 1; i < agentCount; i++) {
String tokenForAgent = experimentId + ":" + i + ":" + (reset - 1);
if (initTokens.containsKey(tokenForAgent)) {
TCPUtils.Log(Level.FINE,"Mission init - unconsumed " + tokenForAgent);
allTokensConsumed = false;
}
}
return allTokensConsumed;
}
private boolean startUp(String command, String ipOriginator, String experimentId, int reset, int agentCount, String myToken, Long seed, Boolean isSynchronous) throws IOException {
// Clear out mission state
envState.reward = 0.0;
envState.commands.clear();
envState.obs = null;
envState.info = "";
envState.missionInit = command;
envState.done = false;
envState.quit = false;
envState.token = myToken;
envState.experimentId = experimentId;
envState.agentCount = agentCount;
envState.reset = reset;
envState.synchronous = isSynchronous;
envState.seed = seed;
return startUpMission(command, ipOriginator);
}
private boolean startUpMission(String command, String ipOriginator) throws IOException {
if (missionPoller == null)
return false;
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(baos);
missionPoller.commandReceived(command, ipOriginator, dos);
dos.flush();
byte[] reply = baos.toByteArray();
ByteArrayInputStream bais = new ByteArrayInputStream(reply);
DataInputStream dis = new DataInputStream(bais);
int hdr = dis.readInt();
byte[] replyBytes = new byte[hdr];
dis.readFully(replyBytes);
String replyStr = new String(replyBytes);
if (replyStr.equals("MALMOOK")) {
TCPUtils.Log(Level.INFO, "MalmoEnvServer Mission starting ...");
return true;
} else if (replyStr.equals("MALMOBUSY")) {
TCPUtils.Log(Level.INFO, "MalmoEnvServer Busy - I want to quit");
this.envState.quit = true;
}
return false;
}
private static final int stepTagLength = "<Step_>".length(); // Step with option code.
private synchronized void stepSync(String command, Socket socket, DataInputStream din) throws IOException
{
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Entering synchronous step.");
nsteps += 1;
profiler.startSection("commandProcessing");
String actions = command.substring(stepTagLength, command.length() - (stepTagLength + 2));
int options = Character.getNumericValue(command.charAt(stepTagLength - 2));
boolean withInfo = options == 0 || options == 2;
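// Option codes 0 and 2 request that the info JSON be included in the reply.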
// Prepare to write data to the client.
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
double reward = 0.0;
boolean done;
byte[] obs;
String info = "";
boolean sent = false;
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Acquiring lock for synchronous step.");
lock.lock();
try {
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Lock is acquired.");
done = envState.done;
// TODO Handle when the environment is done.
// Process the actions.
if (actions.contains("\n")) {
String[] cmds = actions.split("\\n");
for(String cmd : cmds) {
envState.commands.add(cmd);
}
} else {
if (!actions.isEmpty())
envState.commands.add(actions);
}
sent = true;
profiler.endSection(); //cmd
profiler.startSection("requestTick");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Received: " + actions);
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Requesting tick.");
// Now wait to run a tick
// If synchronous mode is off then we should check whether the want-to-quit flag is set.
while(!TimeHelper.SyncManager.requestTick() && !done ){Thread.yield();}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Tick request granted.");
profiler.endSection();
profiler.startSection("waitForTick");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Waiting for tick.");
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted() && !done ){ Thread.yield();}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> TICK DONE. Getting observation.");
profiler.endSection();
profiler.startSection("getObservation");
// After which, get the observations.
obs = getObservation(done);
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Observation received. Getting info.");
profiler.endSection();
profiler.startSection("getInfo");
// Pick up rewards.
reward = envState.reward;
if (withInfo) {
info = envState.info;
// if(info == null)
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> FILLING INFO: NULL");
// else
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> FILLING " + info.toString());
}
done = envState.done;
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> STATUS " + Boolean.toString(done));
envState.info = null;
envState.obs = null;
envState.reward = 0.0;
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Info received..");
profiler.endSection();
} finally {
lock.unlock();
}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Lock released. Writing observation, info, done.");
profiler.startSection("writeObs");
dout.writeInt(obs.length);
dout.write(obs);
dout.writeInt(BYTES_DOUBLE + 2);
dout.writeDouble(reward);
dout.writeByte(done ? 1 : 0);
dout.writeByte(sent ? 1 : 0);
if (withInfo) {
byte[] infoBytes = info.getBytes(utf8);
dout.writeInt(infoBytes.length);
dout.write(infoBytes);
}
profiler.endSection(); //write obs
profiler.startSection("flush");
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Packets written. Flushing.");
dout.flush();
profiler.endSection(); // flush
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <STEP> Done with step.");
}
// Handler for <Step_> messages. Single digit option code after _ specifies if turnkey and info are included in message.
private void step(String command, Socket socket, DataInputStream din) throws IOException {
if(envState.synchronous){
stepSync(command, socket, din);
}
else{
System.out.println("[ERROR] Asynchronous stepping is not supported in MineRL.");
}
}
// Handler for <Peek> messages.
private void peek(String command, Socket socket, DataInputStream din) throws IOException {
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
byte[] obs;
boolean done;
String info = "";
// AOG - As we've only seen issues with the peek request, I've focused my changes on just
// this function. Initially we want to be optimistic and assume we're not going to abort
// the request; my observations of event timings indicate that there is plenty of time
// between the peek request being received and the reset failing, so a race condition is
// unlikely.
abortRequest = false;
lock.lock();
try {
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Waiting for pistol to fire.");
while(!TimeHelper.SyncManager.hasServerFiredPistol() && !abortRequest){
// Now wait to run a tick
while(!TimeHelper.SyncManager.requestTick() && !abortRequest){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted() && !abortRequest){ Thread.yield();}
Thread.yield();
}
if (abortRequest) {
System.out.println("AOG: Aborting peek request");
// AOG - We detect the lack of observation within our Python wrapper and throw a slightly
// different exception that bypasses MineRL's automatic clean-up code. If we were to report
// 'done', MineRL would detect this as a runtime error and kill the Minecraft process,
// triggering a lengthy restart. So far, testing shows Minecraft itself is fine and we can
// retry the reset; it's only the tight loops above that were causing things to stall and
// time out.
// No observation
dout.writeInt(0);
// No info
dout.writeInt(0);
// Done
dout.writeInt(1);
dout.writeByte(0);
dout.flush();
return;
}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Pistol fired!.");
// Wait two ticks for the first observation from server to be propagated.
while(!TimeHelper.SyncManager.requestTick() ){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted()){ Thread.yield();}
while(!TimeHelper.SyncManager.requestTick() ){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted()){ Thread.yield();}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Getting observation.");
obs = getObservation(false);
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Observation acquired.");
done = envState.done;
info = envState.info;
} finally {
lock.unlock();
}
dout.writeInt(obs.length);
dout.write(obs);
byte[] infoBytes = info.getBytes(utf8);
dout.writeInt(infoBytes.length);
dout.write(infoBytes);
dout.writeInt(1);
dout.writeByte(done ? 1 : 0);
dout.flush();
}
// Get the current observation. Logs an error if no frame has been set.
public byte[] getObservation(boolean done) {
byte[] obs = envState.obs;
if (obs == null){
System.out.println("[ERROR] Video observation is null; please notify the developer.");
}
return obs;
}
// Handler for <Find> messages - used by non-zero roles to discover integrated server port from primary (role 0) service.
private final static int findTagLength = "<Find>".length();
private void find(String command, Socket socket) throws IOException {
Integer port;
lock.lock();
try {
String token = command.substring(findTagLength, command.length() - (findTagLength + 1));
TCPUtils.Log(Level.INFO, "Find token? " + token);
// Purge previous token.
String[] tokenSplits = token.split(":");
String experimentId = tokenSplits[0];
int role = Integer.parseInt(tokenSplits[1]);
int reset = Integer.parseInt(tokenSplits[2]);
String previousToken = experimentId + ":" + role + ":" + (reset - 1);
initTokens.remove(previousToken);
cond.signalAll();
// Check for next token. Wait for a short time if not already produced.
port = initTokens.get(token);
if (port == null) {
try {
cond.await(COND_WAIT_SECONDS, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
}
port = initTokens.get(token);
if (port == null) {
port = 0;
TCPUtils.Log(Level.INFO,"Role " + role + " reset " + reset + " waiting for token.");
}
}
} finally {
lock.unlock();
}
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(port);
dout.flush();
}
public boolean isSynchronous(){
return envState.synchronous;
}
// Handler for <Init> messages. These reset the service so use with care!
private void init(String command, Socket socket) throws IOException {
lock.lock();
try {
initTokens = new Hashtable<String, Integer>();
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(1);
dout.flush();
} finally {
lock.unlock();
}
}
// Handler for <Quit> (quit mission) messages.
private void quit(String command, Socket socket) throws IOException {
lock.lock();
try {
if (!envState.done){
envState.quit = true;
}
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <PEEK> Pistol fired!.");
// Run one tick so the mission loop can pick up the quit flag.
while(!TimeHelper.SyncManager.requestTick() ){Thread.yield();}
// Then wait until the tick is finished
while(!TimeHelper.SyncManager.isTickCompleted()){ Thread.yield();}
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(envState.done ? 1 : 0);
dout.flush();
} finally {
lock.unlock();
}
}
private final static int closeTagLength = "<Close>".length();
// Handler for <Close> messages.
private void close(String command, Socket socket) throws IOException {
lock.lock();
try {
String token = command.substring(closeTagLength, command.length() - (closeTagLength + 1));
initTokens.remove(token);
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(1);
dout.flush();
} finally {
lock.unlock();
}
}
// Handler for <Status> messages.
private void status(String command, Socket socket) throws IOException {
lock.lock();
try {
String status = "{}"; // TODO Possibly have something more interesting to report.
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
byte[] statusBytes = status.getBytes(utf8);
dout.writeInt(statusBytes.length);
dout.write(statusBytes);
dout.flush();
} finally {
lock.unlock();
}
}
// Handler for <Exit> messages. These "kill the service" temporarily, so use with care!
private void exit(String command, Socket socket) throws IOException {
// lock.lock();
try {
// We may exit before we get a chance to reply.
TimeHelper.SyncManager.setSynchronous(false);
DataOutputStream dout = new DataOutputStream(socket.getOutputStream());
dout.writeInt(BYTES_INT);
dout.writeInt(1);
dout.flush();
ClientStateMachine.exitJava();
} finally {
// lock.unlock();
}
}
// Malmo client state machine interface methods:
public String getCommand() {
try {
String command = envState.commands.poll();
if (command == null)
return "";
else
return command;
} finally {
}
}
public void endMission() {
// lock.lock();
try {
// AOG - If the mission is ending, we always want to abort requests, as they won't
// be able to progress to completion and would otherwise stall.
System.out.println("AOG: MalmoEnvServer.endMission");
abort();
envState.done = true;
envState.quit = false;
envState.missionInit = null;
if (envState.token != null) {
initTokens.remove(envState.token);
envState.token = null;
envState.experimentId = null;
envState.agentCount = 0;
envState.reset = 0;
// cond.signalAll();
}
// lock.unlock();
} finally {
}
}
// Record a Malmo "observation" JSON as the env info, since an environment "obs" is a video frame.
public void observation(String info) {
// Parsing obs as JSON would be slower but less fragile than extracting the turn_key using string search.
// lock.lock();
try {
// TimeHelper.SyncManager.debugLog("[MALMO_ENV_SERVER] <OBSERVATION> Inserting: " + info);
envState.info = info;
// cond.signalAll();
} finally {
// lock.unlock();
}
}
public void addRewards(double rewards) {
// lock.lock();
try {
envState.reward += rewards;
} finally {
// lock.unlock();
}
}
public void addFrame(byte[] frame) {
// lock.lock();
try {
envState.obs = frame; // Replaces current.
// cond.signalAll();
} finally {
// lock.unlock();
}
}
public void notifyIntegrationServerStarted(int integrationServerPort) {
lock.lock();
try {
if (envState.token != null) {
TCPUtils.Log(Level.INFO,"Integration server start up - token: " + envState.token);
addTokens(integrationServerPort, envState.token, envState.experimentId, envState.agentCount, envState.reset);
cond.signalAll();
} else {
TCPUtils.Log(Level.WARNING,"No mission token on integration server start up!");
}
} finally {
lock.unlock();
}
}
private void addTokens(int integratedServerPort, String myToken, String experimentId, int agentCount, int reset) {
initTokens.put(myToken, integratedServerPort);
// Place tokens for other agents to find.
for (int i = 1; i < agentCount; i++) {
String tokenForAgent = experimentId + ":" + i + ":" + reset;
initTokens.put(tokenForAgent, integratedServerPort);
}
}
// IWantToQuit implementation.
@Override
public boolean doIWantToQuit(MissionInit missionInit) {
// lock.lock();
try {
return envState.quit;
} finally {
// lock.unlock();
}
}
public Long getSeed(){
return envState.seed;
}
private void setWantToQuit() {
// lock.lock();
try {
envState.quit = true;
} finally {
if(TimeHelper.SyncManager.isSynchronous()){
// We want to desynchronize everything.
TimeHelper.SyncManager.setSynchronous(false);
}
// lock.unlock();
}
}
@Override
public void prepare(MissionInit missionInit) {
}
@Override
public void cleanup() {
}
@Override
public String getOutcome() {
return "Env quit";
}
}

View File

@@ -0,0 +1,173 @@
import time
import glob
import pathlib
from malmo import MalmoPython, malmoutils
from malmo.launch_minecraft_in_background import launch_minecraft_in_background
class MalmoVideoRecorder:
DEFAULT_RECORDINGS_DIR = './logs/videos'
def __init__(self):
self.agent_host_bot = None
self.agent_host_camera = None
self.client_pool = None
self.is_malmo_initialized = False
def init_malmo(self, recordings_directory=DEFAULT_RECORDINGS_DIR):
if self.is_malmo_initialized:
return
launch_minecraft_in_background(
'/app/MalmoPlatform/Minecraft',
ports=[10000, 10001])
# Set up two agent hosts
self.agent_host_bot = MalmoPython.AgentHost()
self.agent_host_camera = MalmoPython.AgentHost()
# Create the list of Minecraft clients to attach to. The Minecraft
# instances must already have been launched via init_malmo() before
# record_malmo_video() is called.
self.client_pool = MalmoPython.ClientPool()
self.client_pool.add(MalmoPython.ClientInfo('127.0.0.1', 10000))
self.client_pool.add(MalmoPython.ClientInfo('127.0.0.1', 10001))
# Use the bot's agent host to hold the command-line options
malmoutils.parse_command_line(
self.agent_host_bot,
['--record_video', '--recording_dir', recordings_directory])
self.is_malmo_initialized = True
def _start_mission(self, agent_host, mission, recording_spec, role):
used_attempts = 0
max_attempts = 5
while True:
try:
agent_host.startMission(
mission,
self.client_pool,
recording_spec,
role,
'')
break
except MalmoPython.MissionException as e:
errorCode = e.details.errorCode
if errorCode == (MalmoPython.MissionErrorCode
.MISSION_SERVER_WARMING_UP):
time.sleep(2)
elif errorCode == (MalmoPython.MissionErrorCode
.MISSION_INSUFFICIENT_CLIENTS_AVAILABLE):
print('Not enough Minecraft instances running.')
used_attempts += 1
if used_attempts < max_attempts:
print('Will wait in case they are starting up.')
time.sleep(300)
elif errorCode == (MalmoPython.MissionErrorCode
.MISSION_SERVER_NOT_FOUND):
print('Server not found.')
used_attempts += 1
if used_attempts < max_attempts:
print('Will wait and retry.')
time.sleep(2)
else:
used_attempts = max_attempts
if used_attempts >= max_attempts:
raise e
def _wait_for_start(self, agent_hosts):
start_flags = [False for a in agent_hosts]
start_time = time.time()
time_out = 120
while not all(start_flags) and time.time() - start_time < time_out:
states = [a.peekWorldState() for a in agent_hosts]
start_flags = [w.has_mission_begun for w in states]
errors = [e for w in states for e in w.errors]
if len(errors) > 0:
print("Errors waiting for mission start:")
for e in errors:
print(e.text)
raise Exception("Encountered errors while starting mission.")
if time.time() - start_time >= time_out:
raise Exception("Timed out while waiting for mission to start.")
def _get_xml(self, xml_file, seed):
with open(xml_file, 'r') as mission_file:
return mission_file.read().format(SEED_PLACEHOLDER=seed)
def _is_mission_running(self):
return self.agent_host_bot.peekWorldState().is_mission_running or \
self.agent_host_camera.peekWorldState().is_mission_running
def record_malmo_video(self, instructions, xml_file, seed):
'''
Replays a set of instructions through Malmo using two players. The
first player will navigate the specified mission based on the given
instructions. The second player observes the first player's moves,
which are captured in a video.
'''
if not self.is_malmo_initialized:
raise Exception('Malmo not initialized. Call init_malmo() first.')
# Set up the mission
my_mission = MalmoPython.MissionSpec(
self._get_xml(xml_file, seed),
True)
bot_recording_spec = MalmoPython.MissionRecordSpec()
camera_recording_spec = MalmoPython.MissionRecordSpec()
recordingsDirectory = \
malmoutils.get_recordings_directory(self.agent_host_bot)
if recordingsDirectory:
camera_recording_spec.setDestination(
recordingsDirectory + "//rollout_" + str(seed) + ".tgz")
camera_recording_spec.recordMP4(
MalmoPython.FrameType.VIDEO,
36,
2000000,
False)
# Start the agents
self._start_mission(
self.agent_host_bot,
my_mission,
bot_recording_spec,
0)
self._start_mission(
self.agent_host_camera,
my_mission,
camera_recording_spec,
1)
self._wait_for_start([self.agent_host_camera, self.agent_host_bot])
# Teleport the camera agent to the required position
self.agent_host_camera.sendCommand('tp -29 72 -6.7')
instruction_index = 0
while self._is_mission_running():
command = instructions[instruction_index]
instruction_index += 1
self.agent_host_bot.sendCommand(command)
# Pause for half a second - change this for faster/slower videos
time.sleep(0.5)
if instruction_index == len(instructions):
self.agent_host_bot.sendCommand("jump 1")
time.sleep(2)
self.agent_host_bot.sendCommand("quit")
# Wait a little for Malmo to reset before the
# next mission is started
time.sleep(2)
print("Video recorded.")

View File

@@ -0,0 +1,180 @@
import json
import logging
import gym
import minerl.env.core
import minerl.env.comms
import numpy as np
from ray.rllib.env.atari_wrappers import FrameStack
from minerl.env.malmo import InstanceManager
# Modify the MineRL timeouts to detect common errors
# quicker and speed up recovery
minerl.env.core.SOCKTIME = 60.0
minerl.env.comms.retry_timeout = 1
class EnvWrapper(minerl.env.core.MineRLEnv):
def __init__(self, xml, port):
InstanceManager.configure_malmo_base_port(port)
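# Discrete actions map to Malmo commands: 0 = move forward, 1 = turn left (camera yaw 270), 2 = turn right (camera yaw 90).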
self.action_to_command_array = [
'move 1',
'camera 0 270',
'camera 0 90']
super().__init__(
xml,
gym.spaces.Box(low=0, high=255, shape=(84, 84, 3), dtype=np.uint8),
gym.spaces.Discrete(3)
)
self.metadata['video.frames_per_second'] = 2
def _setup_spaces(self, observation_space, action_space):
self.observation_space = observation_space
self.action_space = action_space
def _process_action(self, action_in) -> str:
assert self.action_space.contains(action_in)
assert action_in <= len(
self.action_to_command_array) - 1, 'action index out of bounds.'
return self.action_to_command_array[action_in]
def _process_observation(self, pov, info):
'''
Overridden to simplify: returns only `pov` rather than
the obs_dict (observation dictionary) that MineRLEnv returns.
'''
pov = np.frombuffer(pov, dtype=np.uint8)
if pov is None or len(pov) == 0:
raise Exception('Invalid observation, probably an aborted peek')
else:
pov = pov.reshape(
(self.height, self.width, self.depth)
)[::-1, :, :]
assert self.observation_space.contains(pov)
self._last_pov = pov
return pov
class TrackingEnv(gym.Wrapper):
def __init__(self, env):
super().__init__(env)
self._actions = [
self._forward,
self._turn_left,
self._turn_right
]
def _reset_state(self):
self._facing = (1, 0)
self._position = (0, 0)
self._visited = {}
self._update_visited()
def _forward(self):
self._position = (
self._position[0] + self._facing[0],
self._position[1] + self._facing[1]
)
def _turn_left(self):
self._facing = (self._facing[1], -self._facing[0])
def _turn_right(self):
self._facing = (-self._facing[1], self._facing[0])
def _encode_state(self):
return self._position
def _update_visited(self):
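# Record a visit to the current tile and return how many times it had been visited before.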
state = self._encode_state()
value = self._visited.get(state, 0)
self._visited[state] = value + 1
return value
def reset(self):
self._reset_state()
return super().reset()
def step(self, action):
o, r, d, i = super().step(action)
self._actions[action]()
revisit_count = self._update_visited()
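# Exploration bonus: +0.1 reward the first time a tile is visited.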
if revisit_count == 0:
r += 0.1
return o, r, d, i
class TrajectoryWrapper(gym.Wrapper):
def __init__(self, env):
super().__init__(env)
self._trajectory = []
self._action_to_malmo_command_array = ['move 1', 'turn -1', 'turn 1']
def get_trajectory(self):
return self._trajectory
def _to_malmo_action(self, action_index):
return self._action_to_malmo_command_array[action_index]
def step(self, action):
self._trajectory.append(self._to_malmo_action(action))
o, r, d, i = super().step(action)
return o, r, d, i
class DummyEnv(gym.Env):
def __init__(self):
self.observation_space = gym.spaces.Box(
low=0,
high=255,
shape=(84, 84, 6),
dtype=np.uint8)
self.action_space = gym.spaces.Discrete(3)
# Define a function to create a MineRL environment
def create_env(config):
mission = config["mission"]
port = 1000 * config.worker_index + config.vector_index
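# Give each rollout worker its own Malmo port range so parallel Minecraft instances don't collide.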
print('*********************************************')
print(f'* Worker {config.worker_index} creating from \
mission: {mission}, port {port}')
print('*********************************************')
if config.worker_index == 0:
# The first environment is only used for checking the action
# and observation space. By using a dummy environment, there's
# no need to spin up a Minecraft instance behind it, saving some
# CPU resources on the head node.
return DummyEnv()
env = EnvWrapper(mission, port)
env = TrackingEnv(env)
env = FrameStack(env, 2)
return env
def create_env_for_rollout(config):
mission = config['mission']
port = 1000 * config.worker_index + config.vector_index
print('*********************************************')
print(f'* Worker {config.worker_index} creating from \
mission: {mission}, port {port}')
print('*********************************************')
env = EnvWrapper(mission, port)
env = TrackingEnv(env)
env = FrameStack(env, 2)
env = TrajectoryWrapper(env)
return env

View File

@@ -0,0 +1,95 @@
<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<About>
<Summary>$(ENV_NAME)</Summary>
</About>
<ModSettings>
<MsPerTick>50</MsPerTick>
</ModSettings>
<ServerSection>
<ServerInitialConditions>
<Time>
<StartTime>6000</StartTime>
<AllowPassageOfTime>false</AllowPassageOfTime>
</Time>
<Weather>clear</Weather>
<AllowSpawning>false</AllowSpawning>
</ServerInitialConditions>
<ServerHandlers>
<FlatWorldGenerator generatorString="3;7,220*1,5*3,2;3;,biome_1"/>
<DrawingDecorator>
<DrawSphere x="-29" y="70" z="-2" radius="100" type="air"/>
<DrawCuboid x1="-34" y1="70" z1="-7" x2="-24" y2="70" z2="3" type="lava" />
</DrawingDecorator>
<MazeDecorator>
<Seed>random</Seed>
<SizeAndPosition width="5" length="5" height="10" xOrigin="-32" yOrigin="69" zOrigin="-5"/>
<StartBlock type="emerald_block" fixedToEdge="false"/>
<EndBlock type="lapis_block" fixedToEdge="false"/>
<PathBlock type="grass"/>
<FloorBlock type="air"/>
<GapBlock type="lava"/>
<GapProbability>0.6</GapProbability>
<AllowDiagonalMovement>false</AllowDiagonalMovement>
</MazeDecorator>
<ServerQuitFromTimeUp timeLimitMs="300000" description="out_of_time"/>
<ServerQuitWhenAnyAgentFinishes/>
</ServerHandlers>
</ServerSection>
<AgentSection mode="Survival">
<Name>AML_Bot</Name>
<AgentStart>
<Placement x="-28.5" y="71.0" z="-1.5" pitch="70" yaw="0"/>
</AgentStart>
<AgentHandlers>
<VideoProducer want_depth="false">
<Width>84</Width>
<Height>84</Height>
</VideoProducer>
<FileBasedPerformanceProducer/>
<ObservationFromFullInventory flat="false"/>
<ObservationFromFullStats/>
<HumanLevelCommands>
<ModifierList type="deny-list">
<command>moveMouse</command>
<command>inventory</command>
</ModifierList>
</HumanLevelCommands>
<CameraCommands/>
<ObservationFromCompass/>
<DiscreteMovementCommands/>
<RewardForMissionEnd>
<Reward description="out_of_time" reward="-1" />
</RewardForMissionEnd>
<RewardForTouchingBlockType>
<Block reward="-1.0" type="lava" behaviour="onceOnly"/>
<Block reward="1.0" type="lapis_block" behaviour="onceOnly"/>
</RewardForTouchingBlockType>
<RewardForSendingCommand reward="-0.02"/>
<AgentQuitFromTouchingBlockType>
<Block type="lava" />
<Block type="lapis_block" />
</AgentQuitFromTouchingBlockType>
<PauseCommand/>
<AgentQuitFromReachingCommandQuota total="50"/>
</AgentHandlers>
</AgentSection>
</Mission>

View File

@@ -0,0 +1,95 @@
<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<About>
<Summary>$(ENV_NAME)</Summary>
</About>
<ModSettings>
<MsPerTick>50</MsPerTick>
</ModSettings>
<ServerSection>
<ServerInitialConditions>
<Time>
<StartTime>6000</StartTime>
<AllowPassageOfTime>false</AllowPassageOfTime>
</Time>
<Weather>clear</Weather>
<AllowSpawning>false</AllowSpawning>
</ServerInitialConditions>
<ServerHandlers>
<FlatWorldGenerator generatorString="3;7,220*1,5*3,2;3;,biome_1"/>
<DrawingDecorator>
<DrawSphere x="-29" y="70" z="-2" radius="100" type="air"/>
<DrawCuboid x1="-34" y1="70" z1="-7" x2="-24" y2="70" z2="3" type="lava" />
</DrawingDecorator>
<MazeDecorator>
<Seed>{SEED_PLACEHOLDER}</Seed>
<SizeAndPosition width="6" length="6" height="10" xOrigin="-32" yOrigin="69" zOrigin="-5"/>
<StartBlock type="emerald_block" fixedToEdge="false"/>
<EndBlock type="lapis_block" fixedToEdge="false"/>
<PathBlock type="grass"/>
<FloorBlock type="air"/>
<GapBlock type="lava"/>
<GapProbability>0.6</GapProbability>
<AllowDiagonalMovement>false</AllowDiagonalMovement>
</MazeDecorator>
<ServerQuitFromTimeUp timeLimitMs="300000" description="out_of_time"/>
<ServerQuitWhenAnyAgentFinishes/>
</ServerHandlers>
</ServerSection>
<AgentSection mode="Survival">
<Name>AML_Bot</Name>
<AgentStart>
<Placement x="-28.5" y="71.0" z="-1.5" pitch="70" yaw="0"/>
</AgentStart>
<AgentHandlers>
<VideoProducer want_depth="false">
<Width>84</Width>
<Height>84</Height>
</VideoProducer>
<FileBasedPerformanceProducer/>
<ObservationFromFullInventory flat="false"/>
<ObservationFromFullStats/>
<HumanLevelCommands>
<ModifierList type="deny-list">
<command>moveMouse</command>
<command>inventory</command>
</ModifierList>
</HumanLevelCommands>
<CameraCommands/>
<ObservationFromCompass/>
<DiscreteMovementCommands/>
<RewardForMissionEnd>
<Reward description="out_of_time" reward="-1" />
</RewardForMissionEnd>
<RewardForTouchingBlockType>
<Block reward="-1.0" type="lava" behaviour="onceOnly"/>
<Block reward="1.0" type="lapis_block" behaviour="onceOnly"/>
</RewardForTouchingBlockType>
<RewardForSendingCommand reward="-0.02"/>
<AgentQuitFromTouchingBlockType>
<Block type="lava" />
<Block type="lapis_block" />
</AgentQuitFromTouchingBlockType>
<PauseCommand/>
<AgentQuitFromReachingCommandQuota total="50"/>
</AgentHandlers>
</AgentSection>
</Mission>

View File

@@ -0,0 +1,74 @@
<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<About>
<Summary>AML-Video-Gatherer</Summary>
</About>
<ModSettings>
<MsPerTick>50</MsPerTick>
</ModSettings>
<ServerSection>
<ServerInitialConditions>
<Time>
<StartTime>6000</StartTime>
<AllowPassageOfTime>false</AllowPassageOfTime>
</Time>
<Weather>clear</Weather>
<AllowSpawning>false</AllowSpawning>
</ServerInitialConditions>
<ServerHandlers>
<FlatWorldGenerator generatorString="3;7,220*1,5*3,2;3;,biome_1"/>
<MazeDecorator>
<Seed>{SEED_PLACEHOLDER}</Seed>
<SizeAndPosition width="6" length="6" height="10" xOrigin="-32" yOrigin="69" zOrigin="-5"/>
<StartBlock type="emerald_block" fixedToEdge="false"/>
<EndBlock type="lapis_block" fixedToEdge="false"/>
<PathBlock type="grass"/>
<FloorBlock type="air"/>
<GapBlock type="lava"/>
<GapProbability>0.6</GapProbability>
<AllowDiagonalMovement>false</AllowDiagonalMovement>
</MazeDecorator>
<ServerQuitFromTimeUp timeLimitMs="300000" description="out_of_time"/>
<ServerQuitWhenAnyAgentFinishes/>
</ServerHandlers>
</ServerSection>
<AgentSection mode="Survival">
<Name>Agent</Name>
<AgentStart>
<Placement x="-28.5" y="71.0" z="-1.5" yaw="0"/>
</AgentStart>
<AgentHandlers>
<HumanLevelCommands>
<ModifierList type="deny-list">
<command>moveMouse</command>
<command>inventory</command>
</ModifierList>
</HumanLevelCommands>
<DiscreteMovementCommands/>
<MissionQuitCommands/>
<AgentQuitFromReachingCommandQuota total="50"/>
</AgentHandlers>
</AgentSection>
<AgentSection mode="Spectator">
<Name>Camera_Bot</Name>
<AgentStart>
<Placement x="-29" y="72" z="-6.7" pitch="16" yaw="0"/>
</AgentStart>
<AgentHandlers>
<VideoProducer want_depth="false">
<Width>860</Width>
<Height>480</Height>
</VideoProducer>
<AbsoluteMovementCommands/>
</AgentHandlers>
</AgentSection>
</Mission>

View File

@@ -0,0 +1,130 @@
import argparse
import os
import re
from azureml.core import Run
from azureml.core.model import Model
from minecraft_environment import create_env_for_rollout
from malmo_video_recorder import MalmoVideoRecorder
from gym import wrappers
import ray
import ray.tune as tune
from ray.rllib import rollout
from ray.tune.registry import get_trainable_cls
def write_mission_file_for_seed(mission_file, seed):
with open(mission_file, 'r') as base_file:
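# Derive a per-seed mission file name, e.g. lava_maze_rollout-v0.xml -> lava_maze_rollout-1234.xml.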
mission_file_path = mission_file.replace('v0', seed)
content = base_file.read().format(SEED_PLACEHOLDER=seed)
mission_file = open(mission_file_path, 'w')
mission_file.writelines(content)
mission_file.close()
return mission_file_path
def run_rollout(trainable_type, mission_file, seed):
# Write the mission file that MineRL will load for this seed
mission_file_path = write_mission_file_for_seed(mission_file, seed)
# Instantiate the agent. Note: the IMPALA trainer implementation in
# Ray uses an AsyncSamplesOptimizer. Under the hood, this starts a
# LearnerThread which will wait for training samples. This will fail
# after a timeout, but has no influence on the rollout. See
# https://github.com/ray-project/ray/blob/708dff6d8f7dd6f7919e06c1845f1fea0cca5b89/rllib/optimizers/aso_learner.py#L66
config = {
"env_config": {
"mission": mission_file_path,
"is_rollout": True,
"seed": seed
},
"num_workers": 0
}
cls = get_trainable_cls(trainable_type)
agent = cls(env="Minecraft", config=config)
# The optimizer is not needed during a rollout
agent.optimizer.stop()
# Load state from checkpoint
agent.restore(f'{checkpoint_path}/{checkpoint_file}')
# Get a reference to the environment
env = agent.workers.local_worker().env
# Let the agent choose actions until the game is over
obs = env.reset()
done = False
total_reward = 0
while not done:
action = agent.compute_action(obs)
obs, reward, done, info = env.step(action)
total_reward += reward
print(f'Total reward using seed {seed}: {total_reward}')
# This avoids a sigterm trace in the logs, see minerl.env.malmo.Instance
env.instance.watcher_process.kill()
env.close()
agent.stop()
return env.get_trajectory()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', required=True)
parser.add_argument('--run', required=False, default="IMPALA")
args = parser.parse_args()
# Register custom Minecraft environment
tune.register_env("Minecraft", create_env_for_rollout)
ray.init(address='auto')
# Download the model files (contains a checkpoint)
ws = Run.get_context().experiment.workspace
model = Model(ws, args.model_name)
checkpoint_path = model.download(exist_ok=True)
files_ = os.listdir(checkpoint_path)
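# RLlib names its final checkpoint like 'checkpoint-123'; find it among the downloaded model files.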
cp_pattern = re.compile('^checkpoint-\\d+$')
checkpoint_file = None
for f_ in files_:
if cp_pattern.match(f_):
checkpoint_file = f_
if checkpoint_file is None:
raise Exception("No checkpoint file found.")
# These are the Minecraft mission seeds for the rollouts
rollout_seeds = ['1234', '43289', '65224', '983341']
# Initialize the Malmo video recorder
video_recorder = MalmoVideoRecorder()
video_recorder.init_malmo()
# Path references to the mission files
base_training_mission_file = \
'minecraft_missions/lava_maze_rollout-v0.xml'
base_video_recording_mission_file = \
'minecraft_missions/lava_maze_rollout_video.xml'
for seed in rollout_seeds:
trajectory = run_rollout(
args.run,
base_training_mission_file,
seed)
video_recorder.record_malmo_video(
trajectory,
base_video_recording_mission_file,
seed)

View File

@@ -0,0 +1,45 @@
import ray
import ray.tune as tune
from utils import callbacks
from minecraft_environment import create_env
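# Stop training once the agent reliably solves the maze (mean episode reward >= 1) or after 5 hours.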
def stop(trial_id, result):
return result["episode_reward_mean"] >= 1 \
or result["time_total_s"] > 5 * 60 * 60
if __name__ == '__main__':
tune.register_env("Minecraft", create_env)
ray.init(address='auto')
tune.run(
run_or_experiment="IMPALA",
config={
"env": "Minecraft",
"env_config": {
"mission": "minecraft_missions/lava_maze-v0.xml"
},
"num_workers": 10,
"num_cpus_per_worker": 2,
"rollout_fragment_length": 50,
"train_batch_size": 1024,
"replay_buffer_num_slots": 4000,
"replay_proportion": 10,
"learner_queue_timeout": 900,
"num_sgd_iter": 2,
"num_data_loader_buffers": 2,
"exploration_config": {
"type": "EpsilonGreedy",
"initial_epsilon": 1.0,
"final_epsilon": 0.02,
"epsilon_timesteps": 500000
},
"callbacks": {"on_train_result": callbacks.on_train_result},
},
stop=stop,
checkpoint_at_end=True,
local_dir='./logs'
)

View File

@@ -0,0 +1,18 @@
'''RLlib callbacks module:
Common callback methods to be passed to RLlib trainer.
'''
from azureml.core import Run
def on_train_result(info):
'''Callback on train result to record metrics returned by trainer.
'''
run = Run.get_context()
run.log(
name='episode_reward_mean',
value=info["result"]["episode_reward_mean"])
run.log(
name='episodes_total',
value=info["result"]["episodes_total"])

Binary file not shown.


View File

@@ -0,0 +1,925 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/reinforcement-learning/minecraft-on-distributed-compute/minecraft.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reinforcement Learning in Azure Machine Learning - Training a Minecraft agent using custom environments\n",
"\n",
"This tutorial will show how to set up a more complex reinforcement\n",
"learning (RL) training scenario. It demonstrates how to train an agent to\n",
"navigate through a lava maze in the Minecraft game using Azure Machine\n",
"Learning.\n",
"\n",
"**Please note:** This notebook trains an agent on a randomly generated\n",
"Minecraft level. As a result, on rare occasions, a training run may fail\n",
"to produce a model that can solve the maze. If this happens, you can\n",
"re-run the training step as indicated below.\n",
"\n",
"**Please note:** This notebook uses 1 NC6 type node and 8 D2 type nodes\n",
"for up to 5 hours of training, which corresponds to approximately $9.06 (USD)\n",
"as of May 2020.\n",
"\n",
"Minecraft is currently one of the most popular video\n",
"games and as such has been a study object for RL. [Project \n",
"Malmo](https://www.microsoft.com/en-us/research/project/project-malmo/) is\n",
"a platform for artificial intelligence experimentation and research built on\n",
"top of Minecraft. We will use Minecraft [gym](https://gym.openai.com) environments from Project\n",
"Malmo's 2019 MineRL competition, which are part of the \n",
"[MineRL](http://minerl.io/docs/index.html) Python package.\n",
"\n",
"Minecraft environments require a display to run, so we will demonstrate\n",
"how to set up a virtual display within the docker container used for training.\n",
"Learning will be based on the agent's visual observations. To\n",
"generate the necessary amount of sample data, we will run several\n",
"instances of the Minecraft game in parallel. Below, you can see a video of\n",
"a trained agent navigating a lava maze. Starting from the green position,\n",
"it moves to the blue position by moving forward, turning left or turning right:\n",
"\n",
"<table style=\"width:50%\">\n",
" <tr>\n",
" <th style=\"text-align: center;\">\n",
" <img src=\"./images/lava_maze_minecraft.gif\" alt=\"Minecraft lava maze\" align=\"middle\" margin-left=\"auto\" margin-right=\"auto\"/>\n",
" </th>\n",
" </tr>\n",
" <tr style=\"text-align: center;\">\n",
" <th>Fig 1. Video of a trained Minecraft agent navigating a lava maze.</th>\n",
" </tr>\n",
"</table>\n",
"\n",
"The tutorial will cover the following steps:\n",
"- Initializing Azure Machine Learning resources for training\n",
"- Training the RL agent with Azure Machine Learning service\n",
"- Monitoring training progress\n",
"- Reviewing training results\n",
"\n",
"\n",
"## Prerequisites\n",
"\n",
"The user should have completed the Azure Machine Learning introductory tutorial.\n",
"You will need to make sure that you have a valid subscription id, a resource group and a\n",
"workspace. For detailed instructions see [Tutorial: Get started creating\n",
"your first ML experiment.](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup)\n",
"\n",
"In addition, please follow the instructions in the [Reinforcement Learning in\n",
"Azure Machine Learning - Setting Up Development Environment](../setup/devenv_setup.ipynb)\n",
"notebook to correctly set up a Virtual Network which is required for completing \n",
"this tutorial.\n",
"\n",
"While this is a standalone notebook, we highly recommend going over the\n",
"introductory notebooks for RL first.\n",
"- Getting started:\n",
" - [RL using a compute instance with Azure Machine Learning service](../cartpole-on-compute-instance/cartpole_ci.ipynb)\n",
" - [Using Azure Machine Learning compute](../cartpole-on-single-compute/cartpole_sc.ipynb)\n",
"- [Scaling RL training runs with Azure Machine Learning service](../atari-on-distributed-compute/pong_rllib.ipynb)\n",
"\n",
"\n",
"## Initialize resources\n",
"\n",
"All required Azure Machine Learning service resources for this tutorial can be set up from Jupyter.\n",
"This includes:\n",
"- Connecting to your existing Azure Machine Learning workspace.\n",
"- Creating an experiment to track runs.\n",
"- Creating remote compute targets for [Ray](https://docs.ray.io/en/latest/index.html).\n",
"\n",
"### Azure Machine Learning SDK\n",
"\n",
"Display the Azure Machine Learning SDK version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"print(\"Azure Machine Learning SDK Version: \", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Connect to workspace\n",
"\n",
"Get a reference to an existing Azure Machine Learning workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.location, ws.resource_group, sep=' | ')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"\n",
"Create an experiment to track the runs in your workspace. A\n",
"workspace can have multiple experiments and each experiment\n",
"can be used to track multiple runs (see [documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py)\n",
"for details)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"nbpresent": {
"id": "bc70f780-c240-4779-96f3-bc5ef9a37d59"
}
},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"exp = Experiment(workspace=ws, name='minecraft-maze')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create or attach an existing compute resource\n",
"\n",
"A compute target is a designated compute resource where you\n",
"run your training script. For more information, see [What\n",
"are compute targets in Azure Machine Learning service?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target).\n",
"\n",
"#### GPU target for Ray head\n",
"\n",
"In the experiment setup for this tutorial, the Ray head node\n",
"will run on a GPU-enabled node. A maximum cluster size\n",
"of 1 node is therefore sufficient. If you wish to run\n",
"multiple experiments in parallel using the same GPU\n",
"cluster, you may elect to increase this number. The cluster\n",
"will automatically scale down to 0 nodes when no training jobs\n",
"are scheduled (see `min_nodes`).\n",
"\n",
"The code below creates a compute cluster of GPU-enabled NC6\n",
"nodes. If the cluster with the specified name is already in\n",
"your workspace the code will skip the creation process.\n",
"\n",
"Note that we must specify a Virtual Network during compute\n",
"creation to allow communication between the cluster running\n",
"the Ray head node and the additional Ray compute nodes. For\n",
"details on how to setup the Virtual Network, please follow the\n",
"instructions in the \"Prerequisites\" section above.\n",
"\n",
"**Note: Creation of a compute resource can take several minutes**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# please enter the name of your Virtual Network (see Prerequisites -> Workspace setup)\n",
"vnet_name = 'your_vnet'\n",
"\n",
"# name of the Virtual Network subnet ('default' the default name)\n",
"subnet_name = 'default'\n",
"\n",
"gpu_cluster_name = 'gpu-cluster-nc6'\n",
"\n",
"try:\n",
" gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size='Standard_NC6',\n",
" min_nodes=0,\n",
" max_nodes=1,\n",
" vnet_resourcegroup_name=ws.resource_group,\n",
" vnet_name=vnet_name,\n",
" subnet_name=subnet_name)\n",
"\n",
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
" gpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
" print('Cluster created.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### CPU target for additional Ray nodes\n",
"\n",
"The code below creates a compute cluster of D2 nodes. If the cluster with the specified name is already in your workspace the code will skip the creation process.\n",
"\n",
"This cluster will be used to start additional Ray nodes\n",
"increasing the clusters CPU resources.\n",
"\n",
"**Note: Creation of a compute resource can take several minutes**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cpu_cluster_name = 'cpu-cluster-d2'\n",
"\n",
"try:\n",
" cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
" print('Found existing compute target')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size='STANDARD_D2',\n",
" min_nodes=0,\n",
" max_nodes=10,\n",
" vnet_resourcegroup_name=ws.resource_group,\n",
" vnet_name=vnet_name,\n",
" subnet_name=subnet_name)\n",
"\n",
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
" cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
"\n",
" print('Cluster created.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training the agent\n",
"\n",
"### Training environments\n",
"\n",
"This tutorial uses custom docker images (CPU and GPU respectively)\n",
"with the necessary software installed. The\n",
"[Environment](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments)\n",
"class stores the configuration for the training environment. The docker\n",
"image is set via `env.docker.base_image` which can point to any\n",
"publicly available docker image. `user_managed_dependencies`\n",
"is set so that the preinstalled Python packages in the image are preserved.\n",
"\n",
"Note that since Minecraft requires a display to start, we set the `interpreter_path`\n",
"such that the Python process is started via **xvfb-run**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"def create_env(env_type):\n",
" env = Environment(name='minecraft-{env_type}'.format(env_type=env_type))\n",
"\n",
" env.docker.enabled = True\n",
" env.docker.base_image = 'akdmsft/minecraft-{env_type}'.format(env_type=env_type)\n",
"\n",
" env.python.interpreter_path = \"xvfb-run -s '-screen 0 640x480x16 -ac +extension GLX +render' python\"\n",
" env.python.user_managed_dependencies = True\n",
" \n",
" return env\n",
" \n",
"cpu_minecraft_env = create_env('cpu')\n",
"gpu_minecraft_env = create_env('gpu')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training script\n",
"\n",
"As described above, we use the MineRL Python package to launch\n",
"Minecraft game instances. MineRL provides several OpenAI gym\n",
"environments for different scenarios, such as chopping wood.\n",
"Besides predefined environments, MineRL lets its users create\n",
"custom Minecraft environments through\n",
"[minerl.env](http://minerl.io/docs/api/env.html). In the helper\n",
"file **minecraft_environment.py** provided with this tutorial, we use the\n",
"latter option to customize a Minecraft level with a lava maze\n",
"that the agent has to navigate. The agent receives a negative\n",
"reward of -1 for falling into the lava, a negative reward of\n",
"-0.02 for sending a command (i.e. navigating through the maze\n",
"with fewer actions yields a higher total reward) and a positive reward\n",
"of 1 for reaching the goal. To encourage the agent to explore\n",
"the maze, it also receives a positive reward of 0.1 for visiting\n",
"a tile for the first time.\n",
"\n",
"The agent learns purely from visual observations and the image\n",
"is scaled to an 84x84 format, stacking four frames. For the\n",
"purposes of this example, we use a small action space of size\n",
"three: move forward, turn 90 degrees to the left, and turn 90\n",
"degrees to the right.\n",
"\n",
"The training script itself registers the function to create training\n",
"environments with the `tune.register_env` function and connects to\n",
"the Ray cluster Azure Machine Learning service started on the GPU \n",
"and CPU nodes. Lastly, it starts a RL training run with `tune.run()`.\n",
"\n",
"We recommend setting the `local_dir` parameter to `./logs` as this\n",
"directory will automatically become available as part of the training\n",
"run's files in the Azure Portal. The Tensorboard integration\n",
"(see \"View the Tensorboard\" section below) also depends on the files'\n",
"availability. For a list of common parameter options, please refer\n",
"to the [Ray documentation](https://docs.ray.io/en/latest/rllib-training.html#common-parameters).\n",
"\n",
"\n",
"```python\n",
"# Taken from minecraft_environment.py and minecraft_train.py\n",
"\n",
"# Define a function to create a MineRL environment\n",
"def create_env(config):\n",
" mission = config['mission']\n",
" port = 1000 * config.worker_index + config.vector_index\n",
" print('*********************************************')\n",
" print(f'* Worker {config.worker_index} creating from mission: {mission}, port {port}')\n",
" print('*********************************************')\n",
"\n",
" if config.worker_index == 0:\n",
" # The first environment is only used for checking the action and observation space.\n",
" # By using a dummy environment, there's no need to spin up a Minecraft instance behind it\n",
" # saving some CPU resources on the head node.\n",
" return DummyEnv()\n",
"\n",
" env = EnvWrapper(mission, port)\n",
" env = TrackingEnv(env)\n",
" env = FrameStack(env, 2)\n",
" \n",
" return env\n",
"\n",
"\n",
"def stop(trial_id, result):\n",
" return result[\"episode_reward_mean\"] >= 1 \\\n",
" or result[\"time_total_s\"] > 5 * 60 * 60\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
" tune.register_env(\"Minecraft\", create_env)\n",
"\n",
" ray.init(address='auto')\n",
"\n",
" tune.run(\n",
" run_or_experiment=\"IMPALA\",\n",
" config={\n",
" \"env\": \"Minecraft\",\n",
" \"env_config\": {\n",
" \"mission\": \"minecraft_missions/lava_maze-v0.xml\"\n",
" },\n",
" \"num_workers\": 10,\n",
" \"num_cpus_per_worker\": 2,\n",
" \"rollout_fragment_length\": 50,\n",
" \"train_batch_size\": 1024,\n",
" \"replay_buffer_num_slots\": 4000,\n",
" \"replay_proportion\": 10,\n",
" \"learner_queue_timeout\": 900,\n",
" \"num_sgd_iter\": 2,\n",
" \"num_data_loader_buffers\": 2,\n",
" \"exploration_config\": {\n",
" \"type\": \"EpsilonGreedy\",\n",
" \"initial_epsilon\": 1.0,\n",
" \"final_epsilon\": 0.02,\n",
" \"epsilon_timesteps\": 500000\n",
" },\n",
" \"callbacks\": {\"on_train_result\": callbacks.on_train_result},\n",
" },\n",
" stop=stop,\n",
" checkpoint_at_end=True,\n",
" local_dir='./logs'\n",
" )\n",
"```\n",
"\n",
"### Submitting a training run\n",
"\n",
"Below, you create the training run using a `ReinforcementLearningEstimator`\n",
"object, which contains all the configuration parameters for this experiment:\n",
"- `source_directory`: Contains the training script and helper files to be\n",
"copied onto the node running the Ray head.\n",
"- `entry_script`: The training script, described in more detail above..\n",
"- `compute_target`: The compute target for the Ray head and training\n",
"script execution.\n",
"- `environment`: The Azure machine learning environment definition for\n",
"the node running the Ray head.\n",
"- `worker_configuration`: The configuration object for the additional\n",
"Ray nodes to be attached to the Ray cluster:\n",
" - `compute_target`: The compute target for the additional Ray nodes.\n",
" - `node_count`: The number of nodes to attach to the Ray cluster.\n",
" - `environment`: The environment definition for the additional Ray nodes.\n",
"- `max_run_duration_seconds`: The time after which to abort the run if it\n",
"is still running.\n",
"- `shm_size`: The size of docker container's shared memory block. \n",
"\n",
"For more details, please take a look at the [online documentation](https://docs.microsoft.com/en-us/python/api/azureml-contrib-reinforcementlearning/?view=azure-ml-py)\n",
"for Azure Machine Learning service's reinforcement learning offering.\n",
"\n",
"We configure 8 extra D2 (worker) nodes for the Ray cluster, giving us a total of\n",
"22 CPUs and 1 GPU. The GPU and one CPU are used by the IMPALA learner,\n",
"and each MineRL environment receives 2 CPUs allowing us to spawn a total\n",
"of 10 rollout workers (see `num_workers` parameter in the training script).\n",
"\n",
"\n",
"Lastly, the `RunDetails` widget displays information about the submitted\n",
"RL experiment, including a link to the Azure portal with more details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.contrib.train.rl import ReinforcementLearningEstimator, WorkerConfiguration\n",
"from azureml.widgets import RunDetails\n",
"\n",
"worker_config = WorkerConfiguration(\n",
" compute_target=cpu_cluster, \n",
" node_count=8,\n",
" environment=cpu_minecraft_env)\n",
"\n",
"rl_est = ReinforcementLearningEstimator(\n",
" source_directory='files',\n",
" entry_script='minecraft_train.py',\n",
" compute_target=gpu_cluster,\n",
" environment=gpu_minecraft_env,\n",
" worker_configuration=worker_config,\n",
" max_run_duration_seconds=6 * 60 * 60,\n",
" shm_size=1024 * 1024 * 1024 * 30)\n",
"\n",
"train_run = exp.submit(rl_est)\n",
"\n",
"RunDetails(train_run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you wish to cancel the run before it completes, uncomment and execute:\n",
"#train_run.cancel()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monitoring training progress\n",
"\n",
"### View the Tensorboard\n",
"\n",
"The Tensorboard can be displayed via the Azure Machine Learning service's\n",
"[Tensorboard API](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-monitor-tensorboard).\n",
"When running locally, please make sure to follow the instructions in the\n",
"link and install required packages. Running this cell will output a URL\n",
"for the Tensorboard.\n",
"\n",
"Note that the training script sets the log directory when starting RLlib\n",
"via the `local_dir` parameter. `./logs` will automatically appear in\n",
"the downloadable files for a run. Since this script is executed on the\n",
"Ray head node run, we need to get a reference to it as shown below.\n",
"\n",
"The Tensorboard API will continuously stream logs from the run.\n",
"\n",
"**Note: It may take a couple of minutes after the run is in \"Running\" state\n",
"before Tensorboard files are available and the board will refresh automatically**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from azureml.tensorboard import Tensorboard\n",
"\n",
"head_run = None\n",
"\n",
"timeout = 60\n",
"while timeout > 0 and head_run is None:\n",
" timeout -= 1\n",
" \n",
" try:\n",
" head_run = next(r for r in train_run.get_children() if r.id.endswith('head'))\n",
" except StopIteration:\n",
" time.sleep(1)\n",
"\n",
"tb = Tensorboard([head_run])\n",
"tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Review results\n",
"\n",
"Please ensure that the training run has completed before continuing with this section."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_run.wait_for_completion()\n",
"\n",
"print('Training run completed.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Please note:** If the final \"episode_reward_mean\" metric from the training run is negative,\n",
"the produced model does not solve the problem of navigating the maze well. You can view\n",
"the metric on the Tensorboard or in \"Metrics\" section of the head run in the Azure Machine Learning\n",
"portal. We recommend training a new model by rerunning the notebook starting from \"Submitting a training run\".\n",
"\n",
"\n",
"### Export final model\n",
"\n",
"The key result from the training run is the final checkpoint\n",
"containing the state of the IMPALA trainer (model) upon meeting the\n",
"stopping criteria specified in `minecraft_train.py`.\n",
"\n",
"Azure Machine Learning service offers the [Model.register()](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py)\n",
"API which allows you to persist the model files from the\n",
"training run. We identify the directory containing the\n",
"final model written during the training run and register\n",
"it with Azure Machine Learning service. We use a Dataset\n",
"object to filter out the correct files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import os\n",
"import tempfile\n",
"\n",
"from azureml.core import Dataset\n",
"\n",
"path_prefix = os.path.join(tempfile.gettempdir(), 'tmp_training_artifacts')\n",
"\n",
"run_artifacts_path = os.path.join('azureml', head_run.id)\n",
"datastore = ws.get_default_datastore()\n",
"\n",
"run_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))\n",
"\n",
"cp_pattern = re.compile('.*checkpoint-\\\\d+$')\n",
"\n",
"checkpoint_files = [file for file in run_artifacts_ds.to_path() if cp_pattern.match(file)]\n",
"\n",
"# There should only be one checkpoint with our training settings...\n",
"final_checkpoint = os.path.dirname(os.path.join(run_artifacts_path, os.path.normpath(checkpoint_files[-1][1:])))\n",
"datastore.download(target_path=path_prefix, prefix=final_checkpoint.replace('\\\\', '/'), show_progress=True)\n",
"\n",
"print('Download complete.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.model import Model\n",
"\n",
"model_name = 'final_model_minecraft_maze'\n",
"\n",
"model = Model.register(\n",
" workspace=ws,\n",
" model_path=os.path.join(path_prefix, final_checkpoint),\n",
" model_name=model_name,\n",
" description='Model of an agent trained to navigate a lava maze in Minecraft.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Models can be used through a varity of APIs. Please see the\n",
"[documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where)\n",
"for more details.\n",
"\n",
"### Test agent performance in a rollout\n",
"\n",
"To observe the trained agent's behavior, it is a common practice to\n",
"view its behavior in a rollout. The previous reinforcement learning\n",
"tutorials explain rollouts in more detail.\n",
"\n",
"The provided `minecraft_rollout.py` script loads the final checkpoint\n",
"of the trained agent from the model registered with Azure Machine Learning\n",
"service. It then starts a rollout on 4 different lava maze layouts, that\n",
"are all larger and thus more difficult than the maze the agent was trained\n",
"on. The script further records videos by replaying the agent's decisions\n",
"in [Malmo](https://github.com/microsoft/malmo). Malmo supports multiple\n",
"agents in the same environment, thus allowing us to capture videos that\n",
"depict the agent from another agent's perspective. The provided\n",
"`malmo_video_recorder.py` file and the Malmo Github repository have more\n",
"details on the video recording setup.\n",
"\n",
"You can view the rewards for each rollout episode in the logs for the 'head'\n",
"run submitted below. In some episodes, the agent may fail to reach the goal\n",
"due to the higher level of difficulty - in practice, we could continue\n",
"training the agent on harder tasks starting with the final checkpoint."
]
},
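  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a minimal, optional sketch (not required for the rollout below), the files of the\n",
    "registered model could be fetched locally with the `Model` API; the target directory\n",
    "name here is only an example:\n",
    "\n",
    "```python\n",
    "from azureml.core.model import Model\n",
    "\n",
    "# Retrieve the model registered above and download its checkpoint files locally\n",
    "registered_model = Model(ws, name=model_name)\n",
    "local_path = registered_model.download(target_dir='./downloaded_model', exist_ok=True)\n",
    "print(local_path)\n",
    "```"
   ]
  },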
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_params = {\n",
" '--model_name': model_name\n",
"}\n",
"\n",
"rollout_est = ReinforcementLearningEstimator(\n",
" source_directory='files',\n",
" entry_script='minecraft_rollout.py',\n",
" script_params=script_params,\n",
" compute_target=gpu_cluster,\n",
" environment=gpu_minecraft_env,\n",
" shm_size=1024 * 1024 * 1024 * 30)\n",
"\n",
"rollout_run = exp.submit(rollout_est)\n",
"\n",
"RunDetails(rollout_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View videos captured during rollout\n",
"\n",
"To inspect the agent's training progress you can view the videos captured\n",
"during the rollout episodes. First, ensure that the training run has\n",
"completed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rollout_run.wait_for_completion()\n",
"\n",
"head_run_rollout = next(r for r in rollout_run.get_children() if r.id.endswith('head'))\n",
"\n",
"print('Rollout completed.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, you need to download the video files from the training run. We use a\n",
"Dataset to filter out the video files which are in tgz archives."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rollout_run_artifacts_path = os.path.join('azureml', head_run_rollout.id)\n",
"datastore = ws.get_default_datastore()\n",
"\n",
"rollout_run_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(rollout_run_artifacts_path, '**')))\n",
"\n",
"video_archives = [file for file in rollout_run_artifacts_ds.to_path() if file.endswith('.tgz')]\n",
"video_archives = [os.path.join(rollout_run_artifacts_path, os.path.normpath(file[1:])) for file in video_archives]\n",
"\n",
"datastore.download(\n",
" target_path=path_prefix,\n",
" prefix=os.path.dirname(video_archives[0]).replace('\\\\', '/'),\n",
" show_progress=True)\n",
"\n",
"print('Download complete.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, unzip the video files and rename them by the Minecraft mission seed used\n",
"(see `minecraft_rollout.py` for more details on how the seed is used)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tarfile\n",
"import shutil\n",
"\n",
"training_artifacts_dir = './training_artifacts'\n",
"video_dir = os.path.join(training_artifacts_dir, 'videos')\n",
"video_files = []\n",
"\n",
"for tar_file_path in video_archives:\n",
" seed = tar_file_path[tar_file_path.index('rollout_') + len('rollout_'): tar_file_path.index('.tgz')]\n",
" \n",
" tar = tarfile.open(os.path.join(path_prefix, tar_file_path).replace('\\\\', '/'), 'r')\n",
" tar_info = next(t_info for t_info in tar.getmembers() if t_info.name.endswith('mp4'))\n",
" tar.extract(tar_info, video_dir)\n",
" tar.close()\n",
" \n",
" unzipped_folder = os.path.join(video_dir, next(f_ for f_ in os.listdir(video_dir) if not f_.endswith('mp4'))) \n",
" video_file = os.path.join(unzipped_folder,'video.mp4')\n",
" final_video_path = os.path.join(video_dir, '{seed}.mp4'.format(seed=seed))\n",
" \n",
" shutil.move(video_file, final_video_path) \n",
" video_files.append(final_video_path)\n",
" \n",
" shutil.rmtree(unzipped_folder)\n",
"\n",
"# Clean up any downloaded 'tmp' files\n",
"shutil.rmtree(path_prefix)\n",
"\n",
"print('Local video files:\\n', video_files)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, run the cell below to display the videos in-line. In some cases,\n",
"the agent may struggle to find the goal since the maze size was increased\n",
"compared to training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
"index = 0\n",
"while index < len(video_files) - 1:\n",
" display(\n",
" HTML('\\\n",
" <video controls alt=\"cannot display video\" autoplay loop width=49%> \\\n",
" <source src=\"{f1}\" type=\"video/mp4\"> \\\n",
" </video> \\\n",
" <video controls alt=\"cannot display video\" autoplay loop width=49%> \\\n",
" <source src=\"{f2}\" type=\"video/mp4\"> \\\n",
" </video>'.format(f1=video_files[index], f2=video_files[index + 1]))\n",
" )\n",
" \n",
" index += 2\n",
"\n",
"if index < len(video_files):\n",
" display(\n",
" HTML('\\\n",
" <video controls alt=\"cannot display video\" autoplay loop width=49%> \\\n",
" <source src=\"{f1}\" type=\"video/mp4\"> \\\n",
" </video>'.format(f1=video_files[index]))\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning up\n",
"\n",
"Below, you can find code snippets for your convenience to clean up any resources created as part of this tutorial you don't wish to retain."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to stop the Tensorboard, uncomment and run\n",
"#tb.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to delete the gpu compute target, uncomment and run\n",
"#gpu_cluster.delete()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to delete the cpu compute target, uncomment and run\n",
"#cpu_cluster.delete()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to delete the registered model, uncomment and run\n",
"#model.delete()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to delete the local video files, uncomment and run\n",
"#shutil.rmtree(training_artifacts_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next steps\n",
"\n",
"This is currently the last introductory tutorial for Azure Machine Learning\n",
"service's Reinforcement\n",
"Learning offering. We would love to hear your feedback to build the features\n",
"you need!\n",
"\n"
]
}
],
"metadata": {
"authors": [
{
"name": "andress"
}
],
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"notice": "Copyright (c) Microsoft Corporation. All rights reserved.\u00e2\u20ac\u00afLicensed under the MIT License.\u00e2\u20ac\u00af "
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,8 @@
name: minecraft
dependencies:
- pip:
- azureml-sdk
- azureml-contrib-reinforcementlearning
- azureml-widgets
- tensorboard
- azureml-tensorboard

View File

@@ -72,7 +72,7 @@
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.location, ws.resource_group, sep = ' | ')"
"print(ws.name, ws.location, ws.resource_group, sep = ' | ') "
]
},
{

View File

@@ -335,7 +335,7 @@
" admin_user_ssh_key='<my-sshkey>',\n",
" remote_login_port_public_access='enabled',\n",
" identity_type='UserAssigned',\n",
" identity_id='<user-assigned-identity-id>')\n",
" identity_id=['<resource-id1>'])\n",
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
"\n",
"cpu_cluster.wait_for_completion(show_output=True)"

View File

@@ -18,8 +18,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:|
| [Forecasting BikeShare Demand](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb) | Forecasting | BikeShare | Remote | None | Azure ML AutoML | Forecasting |
| [Forecasting orange juice sales with deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb) | Forecasting | Orange Juice Sales | Remote | Azure Container Instance | Azure ML AutoML | None |
| [Forecasting with automated ML SQL integration](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/sql-server/energy-demand/auto-ml-sql-energy-demand.ipynb) | Forecasting | NYC Energy | Local | None | Azure ML AutoML | |
| [Setup automated ML SQL integration](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/sql-server/setup/auto-ml-sql-setup.ipynb) | None | None | None | None | Azure ML AutoML | |
| [Register a model and deploy locally](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb) | Deployment | None | Local | Local | None | None |
| :star:[Data drift on aks](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/monitor-models/data-drift/drift-on-aks.ipynb) | Filtering | NOAA | Remote | AKS | Azure ML | Dataset, Timeseries, Drift |
| [Train and deploy a model using Python SDK](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb) | Training and deploying a model from a notebook | Diabetes | Local | Azure Container Instance | None | None |
@@ -58,7 +56,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/deployment/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None |
| [Distributed PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.ipynb) | Train a model using the distributed training via Horovod | MNIST | AML Compute | None | PyTorch | None |
| [Distributed training with PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-nccl-gloo/distributed-pytorch-with-nccl-gloo.ipynb) | Train a model using distributed training via Nccl/Gloo | MNIST | AML Compute | None | PyTorch | None |
| [PyTorch object detection](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.ipynb) | Fine-tune PyTorch object detection model with a custom dockerfile | Custom | AML Compute | None | PyTorch | remote run, docker |
| [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/scikit-learn/training/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None |
| [Training and hyperparameter tuning using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
| [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/training/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None |
@@ -110,9 +107,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [automl-databricks-local-01](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb) | | | | | | |
| [automl-databricks-local-with-deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb) | | | | | | |
| [aml-pipelines-use-databricks-as-compute-target](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/databricks-as-remote-compute-target/aml-pipelines-use-databricks-as-compute-target.ipynb) | | | | | | |
| [accelerated-models-object-detection](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/accelerated-models/accelerated-models-object-detection.ipynb) | | | | | | |
| [accelerated-models-quickstart](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/accelerated-models/accelerated-models-quickstart.ipynb) | | | | | | |
| [accelerated-models-training](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/accelerated-models/accelerated-models-training.ipynb) | | | | | | |
| [multi-model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb) | | | | | | |
| [register-model-deploy-local-advanced](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb) | | | | | | |
| [enable-app-insights-in-production-service](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/enable-app-insights-in-production-service/enable-app-insights-in-production-service.ipynb) | | | | | | |
@@ -122,7 +116,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [tensorflow-model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/tensorflow/tensorflow-model-register-and-deploy.ipynb) | | | | | | |
| [explain-model-on-amlcompute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb) | | | | | | |
| [save-retrieve-explanations-run-history](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb) | | | | | | |
| [train-explain-model-keras-locally-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-keras-locally-and-deploy.ipynb) | | | | | | |
| [train-explain-model-locally-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb) | | | | | | |
| [train-explain-model-on-amlcompute-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb) | | | | | | |
| [training_notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/notebook_runner/training_notebook.ipynb) | | | | | | |
@@ -131,6 +124,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [pong_rllib](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb) | | | | | | |
| [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | |
| [cartpole_cc](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_cc.ipynb) | | | | | | |
| [minecraft](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/minecraft-on-distributed-compute/minecraft.ipynb) | | | | | | |
| [devenv_setup](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/setup/devenv_setup.ipynb) | | | | | | |
| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None |
| [distributed-cntk-with-custom-docker](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/distributed-cntk-with-custom-docker/distributed-cntk-with-custom-docker.ipynb) | | | | | | |
@@ -139,5 +133,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [tutorial-1st-experiment-sdk-train](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb) | | | | | | |
| [img-classification-part1-training](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/image-classification-mnist-data/img-classification-part1-training.ipynb) | | | | | | |
| [img-classification-part2-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/image-classification-mnist-data/img-classification-part2-deploy.ipynb) | | | | | | |
| [img-classification-part3-deploy-encrypted](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/image-classification-mnist-data/img-classification-part3-deploy-encrypted.ipynb) | | | | | | |
| [tutorial-pipeline-batch-scoring-classification](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb) | | | | | | |
| [regression-automated-ml](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/regression-automl-nyc-taxi-data/regression-automated-ml.ipynb) | | | | | | |

View File

@@ -19,6 +19,7 @@ The following tutorials are intended to provide an introductory overview of Azur
| [Train your first ML Model](https://docs.microsoft.com/azure/machine-learning/tutorial-1st-experiment-sdk-train) | Learn the foundational design patterns in Azure Machine Learning and train a scikit-learn model based on a diabetes data set. | [tutorial-quickstart-train-model.ipynb](create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb) | Regression | Scikit-Learn
| [Train an image classification model](https://docs.microsoft.com/azure/machine-learning/tutorial-train-models-with-aml) | Train a scikit-learn image classification model. | [img-classification-part1-training.ipynb](image-classification-mnist-data/img-classification-part1-training.ipynb) | Image Classification | Scikit-Learn
| [Deploy an image classification model](https://docs.microsoft.com/azure/machine-learning/tutorial-deploy-models-with-aml) | Deploy a scikit-learn image classification model to Azure Container Instances. | [img-classification-part2-deploy.ipynb](image-classification-mnist-data/img-classification-part2-deploy.ipynb) | Image Classification | Scikit-Learn
| [Deploy an encrypted inferencing service](https://docs.microsoft.com/azure/machine-learning/tutorial-deploy-models-with-aml) |Deploy an image classification model for encrypted inferencing in Azure Container Instances | [img-classification-part3-deploy-encrypted.ipynb](image-classification-mnist-data/img-classification-part3-deploy-encrypted.ipynb) | Image Classification | Scikit-Learn
| [Use automated machine learning to predict taxi fares](https://docs.microsoft.com/azure/machine-learning/tutorial-auto-train-models) | Train a regression model to predict taxi fares using Automated Machine Learning. | [regression-part2-automated-ml.ipynb](regression-automl-nyc-taxi-data/regression-automated-ml.ipynb) | Regression | Automated ML
## Advanced Samples

View File

@@ -0,0 +1,615 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tutorial #3: Deploy an image classification model for encrypted inferencing in Azure Container Instance (ACI)\n",
"\n",
"This tutorial is **a new addition to the two-part series**. In the [previous tutorial](img-classification-part1-training.ipynb), you trained machine learning models and then registered a model in your workspace on the cloud. \n",
"\n",
"Now, you're ready to deploy the model as a encrypted inferencing web service in [Azure Container Instances](https://docs.microsoft.com/azure/container-instances/) (ACI). A web service is an image, in this case a Docker image, that encapsulates the scoring logic and the model itself. \n",
"\n",
"In this part of the tutorial, you use Azure Machine Learning service (Preview) to:\n",
"\n",
"> * Set up your testing environment\n",
"> * Retrieve the model from your workspace\n",
"> * Test the model locally\n",
"> * Deploy the model to ACI\n",
"> * Test the deployed model\n",
"\n",
"ACI is a great solution for testing and understanding the workflow. For scalable production deployments, consider using Azure Kubernetes Service. For more information, see [how to deploy and where](https://docs.microsoft.com/azure/machine-learning/service/how-to-deploy-and-where).\n",
"\n",
"\n",
"## Prerequisites\n",
"\n",
"Complete the model training in the [Tutorial #1: Train an image classification model with Azure Machine Learning](train-models.ipynb) notebook. \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you did NOT complete the tutorial, you can instead run this cell \n",
"# This will register a model and download the data needed for this tutorial\n",
"# These prerequisites are created in the training tutorial\n",
"# Feel free to skip this cell if you completed the training tutorial \n",
"\n",
"# register a model\n",
"from azureml.core import Workspace\n",
"ws = Workspace.from_config()\n",
"\n",
"from azureml.core.model import Model\n",
"\n",
"model_name = \"sklearn_mnist\"\n",
"model = Model.register(model_path=\"sklearn_mnist_model.pkl\",\n",
" model_name=model_name,\n",
" tags={\"data\": \"mnist\", \"model\": \"classification\"},\n",
" description=\"Mnist handwriting recognition\",\n",
" workspace=ws)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup the Environment \n",
"\n",
"Add `encrypted-inference` package as a conda dependency "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.environment import Environment\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# to install required packages\n",
"env = Environment('tutorial-env')\n",
"cd = CondaDependencies.create(pip_packages=['azureml-dataprep[pandas,fuse]>=1.1.14', 'azureml-defaults', 'azure-storage-blob', 'encrypted-inference==0.9'], conda_packages = ['scikit-learn==0.22.1'])\n",
"\n",
"env.python.conda_dependencies = cd\n",
"\n",
"# Register environment to re-use later\n",
"env.register(workspace = ws)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up the environment\n",
"\n",
"Start by setting up a testing environment.\n",
"\n",
"### Import packages\n",
"\n",
"Import the Python packages needed for this tutorial."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"check version"
]
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
" \n",
"import azureml.core\n",
"\n",
"# display the core SDK version number\n",
"print(\"Azure ML SDK Version: \", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Install Homomorphic Encryption based library for Secure Inferencing\n",
"\n",
"Our library is based on [Microsoft SEAL](https://github.com/Microsoft/SEAL) and pubished to [PyPi.org](https://pypi.org/project/encrypted-inference) as an easy to use package "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install encrypted-inference==0.9"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy as web service\n",
"\n",
"Deploy the model as a web service hosted in ACI. \n",
"\n",
"To build the correct environment for ACI, provide the following:\n",
"* A scoring script to show how to use the model\n",
"* A configuration file to build the ACI\n",
"* The model you trained before\n",
"\n",
"### Create scoring script\n",
"\n",
"Create the scoring script, called score.py, used by the web service call to show how to use the model.\n",
"\n",
"You must include two required functions into the scoring script:\n",
"* The `init()` function, which typically loads the model into a global object. This function is run only once when the Docker container is started. \n",
"\n",
"* The `run(input_data)` function uses the model to predict a value based on the input data. Inputs and outputs to the run typically use JSON for serialization and de-serialization, but other formats are supported. The function fetches homomorphic encryption based public keys that are uploaded by the service caller. \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import json\n",
"import os\n",
"import pickle\n",
"import joblib\n",
"from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, PublicAccess\n",
"from encrypted.inference.eiserver import EIServer\n",
"\n",
"def init():\n",
" global model\n",
" # AZUREML_MODEL_DIR is an environment variable created during deployment.\n",
" # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)\n",
" # For multiple models, it points to the folder containing all deployed models (./azureml-models)\n",
" model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'sklearn_mnist_model.pkl')\n",
" model = joblib.load(model_path)\n",
"\n",
" global server\n",
" server = EIServer(model.coef_, model.intercept_, verbose=True)\n",
"\n",
"def run(raw_data):\n",
"\n",
" json_properties = json.loads(raw_data)\n",
"\n",
" key_id = json_properties['key_id']\n",
" conn_str = json_properties['conn_str']\n",
" container = json_properties['container']\n",
" data = json_properties['data']\n",
"\n",
" # download the Galois keys from blob storage\n",
" #TODO optimize by caching the keys locally \n",
" blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)\n",
" blob_client = blob_service_client.get_blob_client(container=container, blob=key_id)\n",
" public_keys = blob_client.download_blob().readall()\n",
" \n",
" result = {}\n",
" # make prediction\n",
" result = server.predict(data, public_keys)\n",
"\n",
" # you can return any data type as long as it is JSON-serializable\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create configuration file\n",
"\n",
"Create a deployment configuration file and specify the number of CPUs and gigabyte of RAM needed for your ACI container. While it depends on your model, the default of 1 core and 1 gigabyte of RAM is usually sufficient for many models. If you feel you need more later, you would have to recreate the image and redeploy the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"configure web service",
"aci"
]
},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n",
" memory_gb=1, \n",
" tags={\"data\": \"MNIST\", \"method\" : \"sklearn\"}, \n",
" description='Encrypted Predict MNIST with sklearn + SEAL')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy in ACI\n",
"Estimated time to complete: **about 2-5 minutes**\n",
"\n",
"Configure the image and deploy. The following code goes through these steps:\n",
"\n",
"1. Create environment object containing dependencies needed by the model using the environment file (`myenv.yml`)\n",
"1. Create inference configuration necessary to deploy the model as a web service using:\n",
" * The scoring file (`score.py`)\n",
" * envrionment object created in previous step\n",
"1. Deploy the model to the ACI container.\n",
"1. Get the web service HTTP endpoint."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"configure image",
"create image",
"deploy web service",
"aci"
]
},
"outputs": [],
"source": [
"%%time\n",
"from azureml.core.webservice import Webservice\n",
"from azureml.core.model import InferenceConfig\n",
"from azureml.core.environment import Environment\n",
"from azureml.core import Workspace\n",
"from azureml.core.model import Model\n",
"\n",
"ws = Workspace.from_config()\n",
"model = Model(ws, 'sklearn_mnist')\n",
"\n",
"myenv = Environment.get(workspace=ws, name=\"tutorial-env\")\n",
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=myenv)\n",
"\n",
"service = Model.deploy(workspace=ws, \n",
" name='sklearn-mnist-svc', \n",
" models=[model], \n",
" inference_config=inference_config, \n",
" deployment_config=aciconfig)\n",
"\n",
"service.wait_for_deployment(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the scoring web service's HTTP endpoint, which accepts REST client calls. This endpoint can be shared with anyone who wants to test the web service or integrate it into an application."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"get scoring uri"
]
},
"outputs": [],
"source": [
"print(service.scoring_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test the model\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download test data\n",
"Download the test data to the **./data/** directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.core import Dataset\n",
"from azureml.opendatasets import MNIST\n",
"\n",
"data_folder = os.path.join(os.getcwd(), 'data')\n",
"os.makedirs(data_folder, exist_ok=True)\n",
"\n",
"mnist_file_dataset = MNIST.get_file_dataset()\n",
"mnist_file_dataset.download(data_folder, overwrite=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load test data\n",
"\n",
"Load the test data from the **./data/** directory created during the training tutorial."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from utils import load_data\n",
"import os\n",
"import glob\n",
"\n",
"data_folder = os.path.join(os.getcwd(), 'data')\n",
"# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster\n",
"X_test = load_data(glob.glob(os.path.join(data_folder,\"**/t10k-images-idx3-ubyte.gz\"), recursive=True)[0], False) / 255.0\n",
"y_test = load_data(glob.glob(os.path.join(data_folder,\"**/t10k-labels-idx1-ubyte.gz\"), recursive=True)[0], True).reshape(-1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict test data\n",
"\n",
"Feed the test dataset to the model to get predictions.\n",
"\n",
"\n",
"The following code goes through these steps:\n",
"\n",
"1. Create our Homomorphic Encryption based client \n",
"\n",
"1. Upload HE generated public keys \n",
"\n",
"1. Encrypt the data\n",
"\n",
"1. Send the data as JSON to the web service hosted in ACI. \n",
"\n",
"1. Use the SDK's `run` API to invoke the service. You can also make raw calls using any HTTP tool such as curl."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create our Homomorphic Encryption based client \n",
"\n",
"Create a new EILinearRegressionClient and setup the public keys "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from encrypted.inference.eiclient import EILinearRegressionClient\n",
"\n",
"# Create a new Encrypted inference client and a new secret key.\n",
"edp = EILinearRegressionClient(verbose=True)\n",
"\n",
"public_keys_blob, public_keys_data = edp.get_public_keys()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Upload HE generated public keys\n",
"\n",
"Upload the public keys to the workspace default blob store. This will allow us to share the keys with the inference server"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"from azureml.core import Workspace, Datastore\n",
"import os\n",
"\n",
"ws = Workspace.from_config()\n",
"\n",
"datastore = ws.get_default_datastore()\n",
"container_name=datastore.container_name\n",
"\n",
"# Create a local file and write the keys to it\n",
"public_keys = open(public_keys_blob, \"wb\")\n",
"public_keys.write(public_keys_data)\n",
"public_keys.close()\n",
"\n",
"# Upload the file to blob store\n",
"datastore.upload_files([public_keys_blob])\n",
"\n",
"# Delete the local file\n",
"os.remove(public_keys_blob)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Encrypt the data "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#choose any one sample from the test data \n",
"sample_index = 1\n",
"\n",
"#encrypt the data\n",
"raw_data = edp.encrypt(X_test[sample_index])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Send the test data to the webservice hosted in ACI\n",
"\n",
"Feed the test dataset to the model to get predictions. We will need to send the connection string to the blob storage where the public keys were uploaded \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from azureml.core import Webservice\n",
"\n",
"service = Webservice(ws, 'sklearn-mnist-svc')\n",
"\n",
"#pass the connection string for blob storage to give the server access to the uploaded public keys \n",
"conn_str_template = 'DefaultEndpointsProtocol={};AccountName={};AccountKey={};EndpointSuffix=core.windows.net'\n",
"conn_str = conn_str_template.format(datastore.protocol, datastore.account_name, datastore.account_key)\n",
"\n",
"#build the json \n",
"data = json.dumps({\"data\": raw_data, \"key_id\" : public_keys_blob, \"conn_str\" : conn_str, \"container\" : container_name })\n",
"data = bytes(data, encoding='ASCII')\n",
"\n",
"print ('Making an encrypted inference web service call ')\n",
"eresult = service.run(input_data=data)\n",
"\n",
"print ('Received encrypted inference results')"
]
},
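  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As mentioned above, the service can also be invoked with a raw HTTP call instead of the\n",
    "SDK's `run` API. A minimal sketch using the `requests` package, reusing the `data` payload\n",
    "built above, might look like this:\n",
    "\n",
    "```python\n",
    "import requests\n",
    "\n",
    "headers = {'Content-Type': 'application/json'}\n",
    "response = requests.post(service.scoring_uri, data=data, headers=headers)\n",
    "print(response.json())\n",
    "```"
   ]
  },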
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Decrypt the data\n",
"\n",
"Use the client to decrypt the results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"\n",
"results = edp.decrypt(eresult)\n",
"\n",
"print ('Decrypted the results ', results)\n",
"\n",
"#Apply argmax to identify the prediction result\n",
"prediction = np.argmax(results)\n",
"\n",
"print ( ' Prediction : ', prediction)\n",
"print ( ' Actual Label : ', y_test[sample_index])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean up resources\n",
"\n",
"To keep the resource group and workspace for other tutorials and exploration, you can delete only the ACI deployment using this API call:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"delete web service"
]
},
"outputs": [],
"source": [
"service.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"If you're not going to use what you've created here, delete the resources you just created with this quickstart so you don't incur any charges. In the Azure portal, select and delete your resource group. You can also keep the resource group, but delete a single workspace by displaying the workspace properties and selecting the Delete button.\n",
"\n",
"\n",
"## Next steps\n",
"\n",
"In this Azure Machine Learning tutorial, you used Python to:\n",
"\n",
"> * Set up your testing environment\n",
"> * Retrieve the model from your workspace\n",
"> * Test the model locally\n",
"> * Deploy the model to ACI\n",
"> * Test the deployed model\n",
" \n",
"You can also try out the [regression tutorial](regression-part1-data-prep.ipynb)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/tutorials/img-classification-part2-deploy.png)"
]
}
],
"metadata": {
"authors": [
{
"name": "vkanne"
}
],
"celltoolbar": "Edit Metadata",
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6"
},
"msauthor": "vkanne"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,10 @@
name: img-classification-part3-deploy-encrypted
dependencies:
- pip:
- azureml-sdk
- matplotlib
- sklearn
- pandas
- azureml-opendatasets
- encrypted-inference==0.9
- azure-storage-blob