mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-19 17:17:04 -05:00
240 lines
11 KiB
Transact-SQL
240 lines
11 KiB
Transact-SQL
-- This stored procedure uses automated machine learning to train several models
|
|
-- and returns the best model.
|
|
--
|
|
-- The result set has several columns:
|
|
-- best_run - iteration ID for the best model
|
|
-- experiment_name - experiment name pass in with the @experiment_name parameter
|
|
-- fitted_model - best model found
|
|
-- log_file_text - AutoML debug_log contents
|
|
-- workspace - name of the Azure ML workspace where run history is stored
|
|
--
|
|
-- An example call for a classification problem is:
|
|
-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
|
|
-- exec dbo.AutoMLTrain @input_query='
|
|
-- SELECT top 100000
|
|
-- CAST([pickup_datetime] AS NVARCHAR(30)) AS pickup_datetime
|
|
-- ,CAST([dropoff_datetime] AS NVARCHAR(30)) AS dropoff_datetime
|
|
-- ,[passenger_count]
|
|
-- ,[trip_time_in_secs]
|
|
-- ,[trip_distance]
|
|
-- ,[payment_type]
|
|
-- ,[tip_class]
|
|
-- FROM [dbo].[nyctaxi_sample] order by [hack_license] ',
|
|
-- @label_column = 'tip_class',
|
|
-- @iterations=10
|
|
--
|
|
-- An example call for forecasting is:
|
|
-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)
|
|
-- exec dbo.AutoMLTrain @input_query='
|
|
-- select cast(timeStamp as nvarchar(30)) as timeStamp,
|
|
-- demand,
|
|
-- precip,
|
|
-- temp,
|
|
-- case when timeStamp < ''2017-01-01'' then 0 else 1 end as is_validate_column
|
|
-- from nyc_energy
|
|
-- where demand is not null and precip is not null and temp is not null
|
|
-- and timeStamp < ''2017-02-01''',
|
|
-- @label_column='demand',
|
|
-- @task='forecasting',
|
|
-- @iterations=10,
|
|
-- @iteration_timeout_minutes=5,
|
|
-- @time_column_name='timeStamp',
|
|
-- @is_validate_column='is_validate_column',
|
|
-- @experiment_name='automl-sql-forecast',
|
|
-- @primary_metric='normalized_root_mean_squared_error'
|
|
|
|
SET ANSI_NULLS ON
|
|
GO
|
|
SET QUOTED_IDENTIFIER ON
|
|
GO
|
|
CREATE OR ALTER PROCEDURE [dbo].[AutoMLTrain]
|
|
(
|
|
@input_query NVARCHAR(MAX), -- The SQL Query that will return the data to train and validate the model.
|
|
@label_column NVARCHAR(255)='Label', -- The name of the column in the result of @input_query that is the label.
|
|
@primary_metric NVARCHAR(40)='AUC_weighted', -- The metric to optimize.
|
|
@iterations INT=100, -- The maximum number of pipelines to train.
|
|
@task NVARCHAR(40)='classification', -- The type of task. Can be classification, regression or forecasting.
|
|
@experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.
|
|
@iteration_timeout_minutes INT = 15, -- The maximum time in minutes for training a single pipeline.
|
|
@experiment_timeout_minutes INT = 60, -- The maximum time in minutes for training all pipelines.
|
|
@n_cross_validations INT = 3, -- The number of cross validations.
|
|
@blacklist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that will not be used.
|
|
-- The list of possible models can be found at:
|
|
-- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
|
|
@whitelist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that can be used.
|
|
-- The list of possible models can be found at:
|
|
-- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings
|
|
@experiment_exit_score FLOAT = 0, -- Stop the experiment if this score is acheived.
|
|
@sample_weight_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that gives a sample weight.
|
|
@is_validate_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that indicates if the row is for training or validation.
|
|
-- In the values of the column, 0 means for training and 1 means for validation.
|
|
@time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.
|
|
@connection_name NVARCHAR(255)='default', -- The AML connection to use.
|
|
@max_horizon INT = 0 -- A forecast horizon is a time span into the future (or just beyond the latest date in the training data)
|
|
-- where forecasts of the target quantity are needed.
|
|
-- For example, if data is recorded daily and max_horizon is 5, we will predict 5 days ahead.
|
|
) AS
|
|
BEGIN
|
|
|
|
DECLARE @tenantid NVARCHAR(255)
|
|
DECLARE @appid NVARCHAR(255)
|
|
DECLARE @password NVARCHAR(255)
|
|
DECLARE @config_file NVARCHAR(255)
|
|
|
|
SELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile
|
|
FROM aml_connection
|
|
WHERE ConnectionName = @connection_name;
|
|
|
|
EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
|
|
import logging
|
|
import azureml.core
|
|
import pandas as pd
|
|
import numpy as np
|
|
from azureml.core.experiment import Experiment
|
|
from azureml.train.automl import AutoMLConfig
|
|
from sklearn import datasets
|
|
import pickle
|
|
import codecs
|
|
from azureml.core.authentication import ServicePrincipalAuthentication
|
|
from azureml.core.workspace import Workspace
|
|
|
|
if __name__.startswith("sqlindb"):
|
|
auth = ServicePrincipalAuthentication(tenantid, appid, password)
|
|
|
|
ws = Workspace.from_config(path=config_file, auth=auth)
|
|
|
|
project_folder = "./sample_projects/" + experiment_name
|
|
|
|
experiment = Experiment(ws, experiment_name)
|
|
|
|
data_train = input_data
|
|
X_valid = None
|
|
y_valid = None
|
|
sample_weight_valid = None
|
|
|
|
if is_validate_column != "" and is_validate_column is not None:
|
|
data_train = input_data[input_data[is_validate_column] <= 0]
|
|
data_valid = input_data[input_data[is_validate_column] > 0]
|
|
data_train.pop(is_validate_column)
|
|
data_valid.pop(is_validate_column)
|
|
y_valid = data_valid.pop(label_column).values
|
|
if sample_weight_column != "" and sample_weight_column is not None:
|
|
sample_weight_valid = data_valid.pop(sample_weight_column).values
|
|
X_valid = data_valid
|
|
n_cross_validations = None
|
|
|
|
y_train = data_train.pop(label_column).values
|
|
|
|
sample_weight = None
|
|
if sample_weight_column != "" and sample_weight_column is not None:
|
|
sample_weight = data_train.pop(sample_weight_column).values
|
|
|
|
X_train = data_train
|
|
|
|
if experiment_timeout_minutes == 0:
|
|
experiment_timeout_minutes = None
|
|
|
|
if experiment_exit_score == 0:
|
|
experiment_exit_score = None
|
|
|
|
if blacklist_models == "":
|
|
blacklist_models = None
|
|
|
|
if blacklist_models is not None:
|
|
blacklist_models = blacklist_models.replace(" ", "").split(",")
|
|
|
|
if whitelist_models == "":
|
|
whitelist_models = None
|
|
|
|
if whitelist_models is not None:
|
|
whitelist_models = whitelist_models.replace(" ", "").split(",")
|
|
|
|
automl_settings = {}
|
|
preprocess = True
|
|
if time_column_name != "" and time_column_name is not None:
|
|
automl_settings = { "time_column_name": time_column_name }
|
|
preprocess = False
|
|
if max_horizon > 0:
|
|
automl_settings["max_horizon"] = max_horizon
|
|
|
|
log_file_name = "automl_sqlindb_errors.log"
|
|
|
|
automl_config = AutoMLConfig(task = task,
|
|
debug_log = log_file_name,
|
|
primary_metric = primary_metric,
|
|
iteration_timeout_minutes = iteration_timeout_minutes,
|
|
experiment_timeout_minutes = experiment_timeout_minutes,
|
|
iterations = iterations,
|
|
n_cross_validations = n_cross_validations,
|
|
preprocess = preprocess,
|
|
verbosity = logging.INFO,
|
|
X = X_train,
|
|
y = y_train,
|
|
path = project_folder,
|
|
blacklist_models = blacklist_models,
|
|
whitelist_models = whitelist_models,
|
|
experiment_exit_score = experiment_exit_score,
|
|
sample_weight = sample_weight,
|
|
X_valid = X_valid,
|
|
y_valid = y_valid,
|
|
sample_weight_valid = sample_weight_valid,
|
|
**automl_settings)
|
|
|
|
local_run = experiment.submit(automl_config, show_output = True)
|
|
|
|
best_run, fitted_model = local_run.get_output()
|
|
|
|
pickled_model = codecs.encode(pickle.dumps(fitted_model), "base64").decode()
|
|
|
|
log_file_text = ""
|
|
|
|
try:
|
|
with open(log_file_name, "r") as log_file:
|
|
log_file_text = log_file.read()
|
|
except:
|
|
log_file_text = "Log file not found"
|
|
|
|
returned_model = pd.DataFrame({"best_run": [best_run.id], "experiment_name": [experiment_name], "fitted_model": [pickled_model], "log_file_text": [log_file_text], "workspace": [ws.name]}, dtype=np.dtype(np.str))
|
|
'
|
|
, @input_data_1 = @input_query
|
|
, @input_data_1_name = N'input_data'
|
|
, @output_data_1_name = N'returned_model'
|
|
, @params = N'@label_column NVARCHAR(255),
|
|
@primary_metric NVARCHAR(40),
|
|
@iterations INT, @task NVARCHAR(40),
|
|
@experiment_name NVARCHAR(32),
|
|
@iteration_timeout_minutes INT,
|
|
@experiment_timeout_minutes INT,
|
|
@n_cross_validations INT,
|
|
@blacklist_models NVARCHAR(MAX),
|
|
@whitelist_models NVARCHAR(MAX),
|
|
@experiment_exit_score FLOAT,
|
|
@sample_weight_column NVARCHAR(255),
|
|
@is_validate_column NVARCHAR(255),
|
|
@time_column_name NVARCHAR(255),
|
|
@tenantid NVARCHAR(255),
|
|
@appid NVARCHAR(255),
|
|
@password NVARCHAR(255),
|
|
@config_file NVARCHAR(255),
|
|
@max_horizon INT'
|
|
, @label_column = @label_column
|
|
, @primary_metric = @primary_metric
|
|
, @iterations = @iterations
|
|
, @task = @task
|
|
, @experiment_name = @experiment_name
|
|
, @iteration_timeout_minutes = @iteration_timeout_minutes
|
|
, @experiment_timeout_minutes = @experiment_timeout_minutes
|
|
, @n_cross_validations = @n_cross_validations
|
|
, @blacklist_models = @blacklist_models
|
|
, @whitelist_models = @whitelist_models
|
|
, @experiment_exit_score = @experiment_exit_score
|
|
, @sample_weight_column = @sample_weight_column
|
|
, @is_validate_column = @is_validate_column
|
|
, @time_column_name = @time_column_name
|
|
, @tenantid = @tenantid
|
|
, @appid = @appid
|
|
, @password = @password
|
|
, @config_file = @config_file
|
|
, @max_horizon = @max_horizon
|
|
WITH RESULT SETS ((best_run NVARCHAR(250), experiment_name NVARCHAR(100), fitted_model VARCHAR(MAX), log_file_text NVARCHAR(MAX), workspace NVARCHAR(100)))
|
|
END |