# Set up Azure ML Automated Machine Learning on SQL Server 2019 CTP 2.4 big data cluster

\# Prerequisites:  
\# - An Azure subscription and resource group  
\# - An Azure Machine Learning workspace  
\# - A SQL Server 2019 CTP 2.4 big data cluster with Internet access and a database named 'automl'  
\# - Azure CLI  
\# - kubectl command  
\# - The https://github.com/Azure/MachineLearningNotebooks repository downloaded (cloned) to your local machine

\# In the 'automl' database, create a table named 'dbo.nyc_energy' as follows:  
\# - In SQL Server Management Studio, right-click the 'automl' database, select Tasks, then Import Flat File.  
\# - Select the file AzureMlCli\notebooks\how-to-use-azureml\automated-machine-learning\forecasting-energy-demand\nyc_energy.csv.  
\# - Using the "Modify Columns" page, allow nulls for all columns. 

\# Create an Azure Machine Learning Workspace using the instructions at https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace 

\# Create an Azure service principal.  You can do this with the following commands: 

az login  
az account set --subscription *subscriptionid*  

\# The following command prints out the **appId** and **tenant**,  
\# which you insert into the indicated cell later in this notebook  
\# to allow AutoML to authenticate with Azure:  

az ad sp create-for-rbac --name *principalname* --password *password*

\# Log into the master instance of SQL Server 2019 CTP 2.4:  
kubectl exec -it mssql-master-pool-0 -n *clustername* -c mssql-server -- /bin/bash

mkdir /tmp/aml

cd /tmp/aml

\# **Modify** the following with your subscription_id, resource_group, and workspace_name:  
cat > config.json << EOF  
{  
    "subscription_id": "123456ab-78cd-0123-45ef-abcd12345678",  
    "resource_group": "myrg1",  
    "workspace_name": "myws1"  
}  
EOF

\# The directory referenced below is appropriate for the master instance of SQL Server 2019 CTP 2.4.

cd /opt/mssql/mlservices/runtime/python/bin

./python -m pip install azureml-sdk[automl]

./python -m pip install --upgrade numpy 

./python -m pip install --upgrade scikit-learn


![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/sql-server/setup/auto-ml-sql-setup.png)

In [None]:
-- Turn on the 'external scripts enabled' server option so that Python can be
-- invoked from T-SQL, then apply the new configuration value.
EXEC sp_configure 'external scripts enabled', 1;
RECONFIGURE WITH OVERRIDE;
GO


In [None]:
-- Make 'automl' the current database for the statements that follow.
USE automl;
GO

In [None]:
-- Azure ML connection registry: one row per named connection, holding the
-- service principal details and the path of the workspace config file.
SET ANSI_NULLS ON;
GO

SET QUOTED_IDENTIFIER ON;
GO

CREATE TABLE dbo.aml_connection
(
    Id             INT IDENTITY(1, 1) NOT NULL PRIMARY KEY,
    ConnectionName NVARCHAR(255)      NULL,  -- Name used to look the connection up
    TenantId       NVARCHAR(255)      NULL,  -- Tenant of the service principal
    AppId          NVARCHAR(255)      NULL,  -- Application id of the service principal
    [Password]     NVARCHAR(255)      NULL,  -- Password of the service principal
    ConfigFile     NVARCHAR(255)      NULL   -- Path of the Azure ML config.json file
)
ON [PRIMARY];
GO

# Copy the values from create-for-rbac above into the cell below

In [None]:
-- Registers the connection row that the AutoML procedures look up by name.
-- Use the following values:
-- Leave the name as 'Default'
-- Insert <tenant> returned by create-for-rbac above
-- Insert <AppId> returned by create-for-rbac above
-- Insert <password> used in create-for-rbac above
-- Leave <path> as '/tmp/aml/config.json'
--
-- The target columns are listed explicitly so the statement does not depend on
-- the physical column order of the table or on the IDENTITY column being
-- skipped implicitly.
INSERT INTO [dbo].[aml_connection]
    ([ConnectionName], [TenantId], [AppId], [Password], [ConfigFile])
VALUES (
    N'Default', -- Name
    N'11111111-2222-3333-4444-555555555555', -- Tenant
    N'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee', -- AppId
    N'insertpasswordhere', -- Password
    N'/tmp/aml/config.json' -- Path
    );
GO

In [None]:
-- Storage for the rows returned by the AutoMLTrain procedure.
SET ANSI_NULLS ON;
GO

SET QUOTED_IDENTIFIER ON;
GO

CREATE TABLE dbo.aml_model
(
    Id             INT IDENTITY(1, 1) NOT NULL PRIMARY KEY,
    Model          VARCHAR(MAX)       NOT NULL,  -- The model, which can be passed to AutoMLPredict for testing or prediction.
    RunId          NVARCHAR(250)      NULL,      -- The RunId, which can be used to view the model in the Azure Portal.
    CreatedDate    DATETIME           NULL,      -- Filled in by the default added below when not supplied
    ExperimentName NVARCHAR(100)      NULL,      -- Azure ML Experiment Name
    WorkspaceName  NVARCHAR(100)      NULL,      -- Azure ML Workspace Name
    LogFileText    NVARCHAR(MAX)      NULL       -- Text of the training log
);
GO

-- Default CreatedDate to the current UTC time for rows that do not set it.
ALTER TABLE dbo.aml_model
    ADD DEFAULT (GETUTCDATE()) FOR CreatedDate;
GO


In [None]:
-- This stored procedure uses automated machine learning to train several models
-- and return the best model.
--
-- The result set has a single row with these columns (in this order):
--   best_run - ID of the best model found
--   experiment_name - training run name
--   fitted_model - best model found (pickled and base64 encoded)
--   log_file_text - console output
--   workspace - name of the Azure ML workspace where run history is stored
--
-- The Azure ML credentials and config file path are read from dbo.aml_connection
-- using @connection_name. An error is raised if no such connection exists.
--
-- An example call for a classification problem is:
--    insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
--    exec dbo.AutoMLTrain @input_query='
--    SELECT top 100000 
--          CAST([pickup_datetime] AS NVARCHAR(30)) AS pickup_datetime
--          ,CAST([dropoff_datetime] AS NVARCHAR(30)) AS dropoff_datetime
--          ,[passenger_count]
--          ,[trip_time_in_secs]
--          ,[trip_distance]
--          ,[payment_type]
--          ,[tip_class]
--      FROM [dbo].[nyctaxi_sample] order by [hack_license] ',
--      @label_column = 'tip_class',
--      @iterations=10
-- 
-- An example call for forecasting is:
--      insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
--      exec dbo.AutoMLTrain @input_query='
--      select cast(timeStamp as nvarchar(30)) as timeStamp,
--             demand,
--             precip,
--             temp,
--             case when timeStamp < ''2017-01-01'' then 0 else 1 end as is_validate_column
--      from nyc_energy
--      where demand is not null and precip is not null and temp is not null
--      and timeStamp < ''2017-02-01''',
--      @label_column='demand',
--      @task='forecasting',
--      @iterations=10,
--      @iteration_timeout_minutes=5,
--      @time_column_name='timeStamp',
--      @is_validate_column='is_validate_column',
--      @experiment_name='automl-sql-forecast',
--      @primary_metric='normalized_root_mean_squared_error'

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLTrain]
 (
    @input_query NVARCHAR(MAX),                      -- The SQL Query that will return the data to train and validate the model.
    @label_column NVARCHAR(255)='Label',             -- The name of the column in the result of @input_query that is the label.
    @primary_metric NVARCHAR(40)='AUC_weighted',     -- The metric to optimize.
    @iterations INT=100,                             -- The maximum number of pipelines to train.
    @task NVARCHAR(40)='classification',             -- The type of task.  Can be classification, regression or forecasting.
    @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.
    @iteration_timeout_minutes INT = 15,             -- The maximum time in minutes for training a single pipeline. 
    @experiment_timeout_hours FLOAT = 1,             -- The maximum time in hours for training all pipelines.  0 means no limit is passed.
    @n_cross_validations INT = 3,                    -- The number of cross validations.  Ignored when @is_validate_column is given.
    @blacklist_models NVARCHAR(MAX) = '',            -- A comma separated list of algos that will not be used.
                                                     -- The list of possible models can be found at:
                                                     -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
    @whitelist_models NVARCHAR(MAX) = '',            -- A comma separated list of algos that can be used.
                                                     -- The list of possible models can be found at:
                                                     -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
    @experiment_exit_score FLOAT = 0,                -- Stop the experiment if this score is achieved.  0 means no exit score is passed.
    @sample_weight_column NVARCHAR(255)='',          -- The name of the column in the result of  @input_query that gives a sample weight.
    @is_validate_column NVARCHAR(255)='',            -- The name of the column in the result of  @input_query that indicates if the row is for training or validation.
                                                     -- In the values of the column, 0 means for training and 1 means for validation.
    @time_column_name  NVARCHAR(255)='',             -- The name of the timestamp column for forecasting.
    @connection_name NVARCHAR(255)='default'         -- The AML connection to use.
                                                     -- NOTE(review): the setup INSERT stores the name as 'Default'; the lookup below
                                                     -- presumably relies on a case-insensitive collation - confirm.
 ) AS
BEGIN

    DECLARE @tenantid NVARCHAR(255)
    DECLARE @appid NVARCHAR(255)
    DECLARE @password NVARCHAR(255)
    DECLARE @config_file NVARCHAR(255)

    -- Load the service principal credentials and config file path for the connection.
    SELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile
    FROM dbo.aml_connection
    WHERE ConnectionName = @connection_name;

    -- Fail early with a clear message instead of handing NULL credentials to Python.
    IF @@ROWCOUNT = 0
    BEGIN
        RAISERROR(N'AML connection ''%s'' was not found in dbo.aml_connection.', 16, 1, @connection_name);
        RETURN;
    END;

    EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import logging
import azureml.core
import numpy as np
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
from sklearn import datasets
import pickle
import codecs
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.workspace import Workspace

if __name__.startswith("sqlindb"):
    auth = ServicePrincipalAuthentication(tenantid, appid, password)

    ws = Workspace.from_config(path=config_file, auth=auth)

    project_folder = "./sample_projects/" + experiment_name

    experiment = Experiment(ws, experiment_name)

    data_train = input_data
    X_valid = None
    y_valid = None
    sample_weight_valid = None

    if is_validate_column != "" and is_validate_column is not None:
        # Rows flagged > 0 form the validation set; all other rows are used for training.
        # Copy the filtered frames so the pops below act on independent data.
        data_train = input_data[input_data[is_validate_column] <= 0].copy()
        data_valid = input_data[input_data[is_validate_column] > 0].copy()
        data_train.pop(is_validate_column)
        data_valid.pop(is_validate_column)
        y_valid = data_valid.pop(label_column).values
        if sample_weight_column != "" and sample_weight_column is not None:
            sample_weight_valid = data_valid.pop(sample_weight_column).values
        X_valid = data_valid
        # An explicit validation set is supplied, so cross validation is not requested.
        n_cross_validations = None

    y_train = data_train.pop(label_column).values

    sample_weight = None
    if sample_weight_column != "" and sample_weight_column is not None:
        sample_weight = data_train.pop(sample_weight_column).values

    X_train = data_train

    # Zero and empty-string parameter values mean "not set" and are passed as None.
    if experiment_timeout_hours == 0:
        experiment_timeout_hours = None

    if experiment_exit_score == 0:
        experiment_exit_score = None

    if blacklist_models == "":
        blacklist_models = None

    if blacklist_models is not None:
        blacklist_models = blacklist_models.replace(" ", "").split(",")

    if whitelist_models == "":
        whitelist_models = None

    if whitelist_models is not None:
        whitelist_models = whitelist_models.replace(" ", "").split(",")

    automl_settings = {}
    preprocess = True
    if time_column_name != "" and time_column_name is not None:
        automl_settings = { "time_column_name": time_column_name }
        preprocess = False

    log_file_name = "automl_errors.log"

    automl_config = AutoMLConfig(task = task,
                                 debug_log = log_file_name,
                                 primary_metric = primary_metric,
                                 iteration_timeout_minutes = iteration_timeout_minutes,
                                 experiment_timeout_hours = experiment_timeout_hours,
                                 iterations = iterations,
                                 n_cross_validations = n_cross_validations,
                                 preprocess = preprocess,
                                 verbosity = logging.INFO,
                                 X = X_train,
                                 y = y_train,
                                 path = project_folder,
                                 blacklist_models = blacklist_models,
                                 whitelist_models = whitelist_models,
                                 experiment_exit_score = experiment_exit_score,
                                 sample_weight = sample_weight,
                                 X_valid = X_valid,
                                 y_valid = y_valid,
                                 sample_weight_valid = sample_weight_valid,
                                 **automl_settings)

    local_run = experiment.submit(automl_config, show_output = True)

    best_run, fitted_model = local_run.get_output()

    # The model is returned to SQL as base64 text of its pickle.
    pickled_model = codecs.encode(pickle.dumps(fitted_model), "base64").decode()

    log_file_text = ""

    # Reading the log is best effort; a missing or unreadable log must not fail the run.
    try:
        with open(log_file_name, "r") as log_file:
            log_file_text = log_file.read()
    except Exception:
        log_file_text = "Log file not found"

    # np.dtype(str) replaces the removed np.str alias; every column is returned as text.
    returned_model = pd.DataFrame({"best_run": [best_run.id], "experiment_name": [experiment_name], "fitted_model": [pickled_model], "log_file_text": [log_file_text], "workspace": [ws.name]}, dtype=np.dtype(str))
'
    , @input_data_1 = @input_query
    , @input_data_1_name = N'input_data'
    , @output_data_1_name = N'returned_model'
    , @params = N'@label_column NVARCHAR(255), 
                  @primary_metric NVARCHAR(40),
                  @iterations INT, @task NVARCHAR(40),
                  @experiment_name NVARCHAR(32),
                  @iteration_timeout_minutes INT,
                  @experiment_timeout_hours FLOAT,
                  @n_cross_validations INT,
                  @blacklist_models NVARCHAR(MAX),
                  @whitelist_models NVARCHAR(MAX),
                  @experiment_exit_score FLOAT,
                  @sample_weight_column NVARCHAR(255),
                  @is_validate_column NVARCHAR(255),
                  @time_column_name  NVARCHAR(255),
                  @tenantid NVARCHAR(255),
                  @appid NVARCHAR(255),
                  @password NVARCHAR(255),
                  @config_file NVARCHAR(255)'
    , @label_column = @label_column
    , @primary_metric = @primary_metric
    , @iterations = @iterations
    , @task = @task
    , @experiment_name = @experiment_name
    , @iteration_timeout_minutes = @iteration_timeout_minutes
    , @experiment_timeout_hours = @experiment_timeout_hours
    , @n_cross_validations = @n_cross_validations
    , @blacklist_models = @blacklist_models
    , @whitelist_models = @whitelist_models
    , @experiment_exit_score = @experiment_exit_score
    , @sample_weight_column = @sample_weight_column
    , @is_validate_column = @is_validate_column
    , @time_column_name = @time_column_name
    , @tenantid = @tenantid
    , @appid = @appid
    , @password = @password
    , @config_file = @config_file
WITH RESULT SETS ((best_run NVARCHAR(250), experiment_name NVARCHAR(100), fitted_model VARCHAR(MAX), log_file_text NVARCHAR(MAX), workspace NVARCHAR(100)));
END

In [None]:
-- This procedure returns a list of metrics for each iteration of a training run.
--
-- The result set has one row per (iteration, metric) pair:
--   iteration    - iteration number taken from the child run properties
--   metric_name  - name of the metric
--   metric_value - value of the metric (only float-valued metrics are returned)
--
-- The Azure ML credentials and config file path are read from dbo.aml_connection
-- using @connection_name. An error is raised if no such connection exists.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLGetMetrics]
 (
    @run_id NVARCHAR(250),                           -- The RunId
    @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.
    @connection_name NVARCHAR(255)='default'         -- The AML connection to use.
 ) AS
BEGIN
    DECLARE @tenantid NVARCHAR(255)
    DECLARE @appid NVARCHAR(255)
    DECLARE @password NVARCHAR(255)
    DECLARE @config_file NVARCHAR(255)

    -- Load the service principal credentials and config file path for the connection.
    SELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile
    FROM dbo.aml_connection
    WHERE ConnectionName = @connection_name;

    -- Fail early with a clear message instead of handing NULL credentials to Python.
    IF @@ROWCOUNT = 0
    BEGIN
        RAISERROR(N'AML connection ''%s'' was not found in dbo.aml_connection.', 16, 1, @connection_name);
        RETURN;
    END;

    EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import logging
import azureml.core
import numpy as np
from azureml.core.experiment import Experiment
from azureml.train.automl.run import AutoMLRun
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.workspace import Workspace

auth = ServicePrincipalAuthentication(tenantid, appid, password)

ws = Workspace.from_config(path=config_file, auth=auth)

experiment = Experiment(ws, experiment_name)

ml_run = AutoMLRun(experiment = experiment, run_id = run_id)

children = list(ml_run.get_children())
iterationlist = []
metricnamelist = []
metricvaluelist = []

# Collect one row per float-valued metric of every child run that has an iteration number.
for run in children:
    properties = run.get_properties()
    if "iteration" in properties:
        iteration = int(properties["iteration"])
        for metric_name, metric_value in run.get_metrics().items():
            if isinstance(metric_value, float):
                iterationlist.append(iteration)
                metricnamelist.append(metric_name)
                metricvaluelist.append(metric_value)

metrics = pd.DataFrame({"iteration": iterationlist, "metric_name": metricnamelist, "metric_value": metricvaluelist})
'
    , @output_data_1_name = N'metrics'
    , @params = N'@run_id NVARCHAR(250), 
                  @experiment_name NVARCHAR(32),
                  @tenantid NVARCHAR(255),
                  @appid NVARCHAR(255),
                  @password NVARCHAR(255),
                  @config_file NVARCHAR(255)'
    , @run_id = @run_id
    , @experiment_name = @experiment_name
    , @tenantid = @tenantid
    , @appid = @appid
    , @password = @password
    , @config_file = @config_file
WITH RESULT SETS ((iteration INT, metric_name NVARCHAR(100), metric_value FLOAT));
END

In [None]:
-- This procedure predicts values based on a model returned by AutoMLTrain and a dataset.
-- It returns the dataset with a new column added, which is the predicted value.
--
-- SECURITY NOTE: @model is base64-decoded and deserialized with pickle.loads.
-- Unpickling can execute arbitrary code, so only pass model strings that come
-- from a trusted source (for example, rows written by AutoMLTrain).
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLPredict]
 (
   @input_query NVARCHAR(MAX),      -- A SQL query returning data to predict on.
   @model NVARCHAR(MAX),            -- A model returned from AutoMLTrain.
   @label_column  NVARCHAR(255)=''  -- Optional name of the column from input_query, which should be ignored when predicting
 ) AS 
BEGIN 

    -- Reject a missing model up front; otherwise the failure surfaces as an
    -- obscure Python error while decoding or unpickling.
    IF @model IS NULL OR @model = N''
    BEGIN
        RAISERROR(N'AutoMLPredict: @model must be a non-empty model string returned by AutoMLTrain.', 16, 1);
        RETURN;
    END;

    EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import azureml.core
import numpy as np
from azureml.train.automl import AutoMLConfig
import pickle
import codecs

# SECURITY: pickle.loads can run arbitrary code; the model text must come from a trusted source.
model_obj = pickle.loads(codecs.decode(model.encode(), "base64"))

# Work on a copy so input_data keeps every column for the combined output.
X_test = input_data.copy()

# The label column, when named, is dropped from the features before predicting.
if label_column != "" and label_column is not None:
    X_test.pop(label_column)

predicted = model_obj.predict(X_test)

combined_output = input_data.assign(predicted=predicted)

'
    , @input_data_1 = @input_query
    , @input_data_1_name = N'input_data'
    , @output_data_1_name = N'combined_output'
    , @params = N'@model NVARCHAR(MAX), @label_column  NVARCHAR(255)'
    , @model = @model
    , @label_column = @label_column;
END