mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-19 17:17:04 -05:00
562 lines
27 KiB
Plaintext
562 lines
27 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Set up Azure ML Automated Machine Learning on SQL Server 2019 CTP 2.4 big data cluster\r\n",
|
|
"\r\n",
|
|
"\\# Prerequisites: \r\n",
|
|
"\\# - An Azure subscription and resource group \r\n",
|
|
"\\# - An Azure Machine Learning workspace \r\n",
|
|
"\\# - A SQL Server 2019 CTP 2.4 big data cluster with Internet access and a database named 'automl' \r\n",
|
|
"\\# - Azure CLI \r\n",
|
|
"\\# - kubectl command \r\n",
|
|
"\\# - The https://github.com/Azure/MachineLearningNotebooks repository downloaded (cloned) to your local machine\r\n",
|
|
"\r\n",
|
|
"\\# In the 'automl' database, create a table named 'dbo.nyc_energy' as follows: \r\n",
|
|
"\\# - In SQL Server Management Studio, right-click the 'automl' database, select Tasks, then Import Flat File. \r\n",
|
|
"\\# - Select the file AzureMlCli\\notebooks\\how-to-use-azureml\\automated-machine-learning\\forecasting-energy-demand\\nyc_energy.csv. \r\n",
|
|
"\\# - Using the \"Modify Columns\" page, allow nulls for all columns. \r\n",
|
|
"\r\n",
|
|
"\\# Create an Azure Machine Learning Workspace using the instructions at https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace \r\n",
|
|
"\r\n",
|
|
"\\# Create an Azure service principal. You can do this with the following commands: \r\n",
|
|
"\r\n",
|
|
"az login \r\n",
|
|
"az account set --subscription *subscriptionid* \r\n",
|
|
"\r\n",
|
|
"\\# The following command prints out the **appId** and **tenant**, \r\n",
|
|
"\\# which you insert into the indicated cell later in this notebook \r\n",
|
|
"\\# to allow AutoML to authenticate with Azure: \r\n",
|
|
"\r\n",
|
|
"az ad sp create-for-rbac --name *principalname* --password *password*\r\n",
|
|
"\r\n",
|
|
"\\# Log into the master instance of SQL Server 2019 CTP 2.4: \r\n",
|
|
"kubectl exec -it mssql-master-pool-0 -n *clustername* -c mssql-server -- /bin/bash\r\n",
|
|
"\r\n",
|
|
"mkdir /tmp/aml\r\n",
|
|
"\r\n",
|
|
"cd /tmp/aml\r\n",
|
|
"\r\n",
|
|
"\\# **Modify** the following with your subscription_id, resource_group, and workspace_name: \r\n",
|
|
"cat > config.json << EOF \r\n",
|
|
"{ \r\n",
|
|
" \"subscription_id\": \"123456ab-78cd-0123-45ef-abcd12345678\", \r\n",
|
|
" \"resource_group\": \"myrg1\", \r\n",
|
|
" \"workspace_name\": \"myws1\" \r\n",
|
|
"} \r\n",
|
|
"EOF\r\n",
|
|
"\r\n",
|
|
"\\# The directory referenced below is appropriate for the master instance of SQL Server 2019 CTP 2.4.\r\n",
|
|
"\r\n",
|
|
"cd /opt/mssql/mlservices/runtime/python/bin\r\n",
|
|
"\r\n",
|
|
"./python -m pip install azureml-sdk[automl]\r\n",
|
|
"\r\n",
|
|
"./python -m pip install --upgrade numpy \r\n",
|
|
"\r\n",
|
|
"./python -m pip install --upgrade sklearn\r\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- Enable external scripts to allow invoking Python\r\n",
|
|
"sp_configure 'external scripts enabled',1 \r\n",
|
|
"reconfigure with override \r\n",
|
|
"GO\r\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- Use database 'automl'\r\n",
|
|
"USE [automl]\r\n",
|
|
"GO"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- This is a table to hold the Azure ML connection information.\r\n",
|
|
"SET ANSI_NULLS ON\r\n",
|
|
"GO\r\n",
|
|
"\r\n",
|
|
"SET QUOTED_IDENTIFIER ON\r\n",
|
|
"GO\r\n",
|
|
"\r\n",
|
|
"CREATE TABLE [dbo].[aml_connection](\r\n",
|
|
" [Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,\r\n",
|
|
"\t[ConnectionName] [nvarchar](255) NULL,\r\n",
|
|
"\t[TenantId] [nvarchar](255) NULL,\r\n",
|
|
"\t[AppId] [nvarchar](255) NULL,\r\n",
|
|
"\t[Password] [nvarchar](255) NULL,\r\n",
|
|
"\t[ConfigFile] [nvarchar](255) NULL\r\n",
|
|
") ON [PRIMARY]\r\n",
|
|
"GO"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Copy the values from create-for-rbac above into the cell below"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- Use the following values:\r\n",
|
|
"-- Leave the name as 'Default'\r\n",
|
|
"-- Insert <tenant> returned by create-for-rbac above\r\n",
|
|
"-- Insert <AppId> returned by create-for-rbac above\r\n",
|
|
"-- Insert <password> used in create-for-rbac above\r\n",
|
|
"-- Leave <path> as '/tmp/aml/config.json'\r\n",
|
|
"INSERT INTO [dbo].[aml_connection] \r\n",
|
|
"VALUES (\r\n",
|
|
" N'Default', -- Name\r\n",
|
|
" N'11111111-2222-3333-4444-555555555555', -- Tenant\r\n",
|
|
" N'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee', -- AppId\r\n",
|
|
" N'insertpasswordhere', -- Password\r\n",
|
|
" N'/tmp/aml/config.json' -- Path\r\n",
|
|
" );\r\n",
|
|
"GO"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- This is a table to hold the results from the AutoMLTrain procedure.\r\n",
|
|
"SET ANSI_NULLS ON\r\n",
|
|
"GO\r\n",
|
|
"\r\n",
|
|
"SET QUOTED_IDENTIFIER ON\r\n",
|
|
"GO\r\n",
|
|
"\r\n",
|
|
"CREATE TABLE [dbo].[aml_model](\r\n",
|
|
" [Id] [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,\r\n",
|
|
" [Model] [varchar](max) NOT NULL, -- The model, which can be passed to AutoMLPredict for testing or prediction.\r\n",
|
|
" [RunId] [nvarchar](250) NULL, -- The RunId, which can be used to view the model in the Azure Portal.\r\n",
|
|
" [CreatedDate] [datetime] NULL,\r\n",
|
|
" [ExperimentName] [nvarchar](100) NULL, -- Azure ML Experiment Name\r\n",
|
|
" [WorkspaceName] [nvarchar](100) NULL, -- Azure ML Workspace Name\r\n",
|
|
"\t[LogFileText] [nvarchar](max) NULL\r\n",
|
|
") \r\n",
|
|
"GO\r\n",
|
|
"\r\n",
|
|
"ALTER TABLE [dbo].[aml_model] ADD DEFAULT (getutcdate()) FOR [CreatedDate]\r\n",
|
|
"GO\r\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- This stored procedure uses automated machine learning to train several models\r\n",
|
|
"-- and return the best model.\r\n",
|
|
"--\r\n",
|
|
"-- The result set has several columns:\r\n",
|
|
"-- best_run - ID of the best model found\r\n",
|
|
"-- experiment_name - training run name\r\n",
|
|
"-- fitted_model - best model found\r\n",
|
|
"-- log_file_text - console output\r\n",
|
|
"-- workspace - name of the Azure ML workspace where run history is stored\r\n",
|
|
"--\r\n",
|
|
"-- An example call for a classification problem is:\r\n",
|
|
"-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
|
|
"-- exec dbo.AutoMLTrain @input_query='\r\n",
|
|
"-- SELECT top 100000 \r\n",
|
|
"-- CAST([pickup_datetime] AS NVARCHAR(30)) AS pickup_datetime\r\n",
|
|
"-- ,CAST([dropoff_datetime] AS NVARCHAR(30)) AS dropoff_datetime\r\n",
|
|
"-- ,[passenger_count]\r\n",
|
|
"-- ,[trip_time_in_secs]\r\n",
|
|
"-- ,[trip_distance]\r\n",
|
|
"-- ,[payment_type]\r\n",
|
|
"-- ,[tip_class]\r\n",
|
|
"-- FROM [dbo].[nyctaxi_sample] order by [hack_license] ',\r\n",
|
|
"-- @label_column = 'tip_class',\r\n",
|
|
"-- @iterations=10\r\n",
|
|
"-- \r\n",
|
|
"-- An example call for forecasting is:\r\n",
|
|
"-- insert into dbo.aml_model(RunId, ExperimentName, Model, LogFileText, WorkspaceName)\r\n",
|
|
"-- exec dbo.AutoMLTrain @input_query='\r\n",
|
|
"-- select cast(timeStamp as nvarchar(30)) as timeStamp,\r\n",
|
|
"-- demand,\r\n",
|
|
"-- \t precip,\r\n",
|
|
"-- \t temp,\r\n",
|
|
"-- case when timeStamp < ''2017-01-01'' then 0 else 1 end as is_validate_column\r\n",
|
|
"-- from nyc_energy\r\n",
|
|
"-- where demand is not null and precip is not null and temp is not null\r\n",
|
|
"-- and timeStamp < ''2017-02-01''',\r\n",
|
|
"-- @label_column='demand',\r\n",
|
|
"-- @task='forecasting',\r\n",
|
|
"-- @iterations=10,\r\n",
|
|
"-- @iteration_timeout_minutes=5,\r\n",
|
|
"-- @time_column_name='timeStamp',\r\n",
|
|
"-- @is_validate_column='is_validate_column',\r\n",
|
|
"-- @experiment_name='automl-sql-forecast',\r\n",
|
|
"-- @primary_metric='normalized_root_mean_squared_error'\r\n",
|
|
"\r\n",
|
|
"SET ANSI_NULLS ON\r\n",
|
|
"GO\r\n",
|
|
"SET QUOTED_IDENTIFIER ON\r\n",
|
|
"GO\r\n",
|
|
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLTrain]\r\n",
|
|
" (\r\n",
|
|
" @input_query NVARCHAR(MAX), -- The SQL Query that will return the data to train and validate the model.\r\n",
|
|
" @label_column NVARCHAR(255)='Label', -- The name of the column in the result of @input_query that is the label.\r\n",
|
|
" @primary_metric NVARCHAR(40)='AUC_weighted', -- The metric to optimize.\r\n",
|
|
" @iterations INT=100, -- The maximum number of pipelines to train.\r\n",
|
|
" @task NVARCHAR(40)='classification', -- The type of task. Can be classification, regression or forecasting.\r\n",
|
|
" @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.\r\n",
|
|
" @iteration_timeout_minutes INT = 15, -- The maximum time in minutes for training a single pipeline. \r\n",
|
|
" @experiment_timeout_minutes INT = 60, -- The maximum time in minutes for training all pipelines.\r\n",
|
|
" @n_cross_validations INT = 3, -- The number of cross validations.\r\n",
|
|
" @blacklist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that will not be used.\r\n",
|
|
" -- The list of possible models can be found at:\r\n",
|
|
" -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings\r\n",
|
|
" @whitelist_models NVARCHAR(MAX) = '', -- A comma separated list of algos that can be used.\r\n",
|
|
" -- The list of possible models can be found at:\r\n",
|
|
" -- https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#configure-your-experiment-settings\r\n",
|
|
"    @experiment_exit_score FLOAT = 0,                -- Stop the experiment if this score is achieved.\r\n",
|
|
" @sample_weight_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that gives a sample weight.\r\n",
|
|
" @is_validate_column NVARCHAR(255)='', -- The name of the column in the result of @input_query that indicates if the row is for training or validation.\r\n",
|
|
"\t -- In the values of the column, 0 means for training and 1 means for validation.\r\n",
|
|
" @time_column_name NVARCHAR(255)='', -- The name of the timestamp column for forecasting.\r\n",
|
|
"\t@connection_name NVARCHAR(255)='default' -- The AML connection to use.\r\n",
|
|
" ) AS\r\n",
|
|
"BEGIN\r\n",
|
|
"\r\n",
|
|
" DECLARE @tenantid NVARCHAR(255)\r\n",
|
|
" DECLARE @appid NVARCHAR(255)\r\n",
|
|
" DECLARE @password NVARCHAR(255)\r\n",
|
|
" DECLARE @config_file NVARCHAR(255)\r\n",
|
|
"\r\n",
|
|
"\tSELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile\r\n",
|
|
"\tFROM aml_connection\r\n",
|
|
"\tWHERE ConnectionName = @connection_name;\r\n",
|
|
"\r\n",
|
|
"\tEXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd\r\n",
|
|
"import logging \r\n",
|
|
"import azureml.core \r\n",
|
|
"import pandas as pd\r\n",
|
|
"import numpy as np\r\n",
|
|
"from azureml.core.experiment import Experiment \r\n",
|
|
"from azureml.train.automl import AutoMLConfig \r\n",
|
|
"from sklearn import datasets \r\n",
|
|
"import pickle\r\n",
|
|
"import codecs\r\n",
|
|
"from azureml.core.authentication import ServicePrincipalAuthentication \r\n",
|
|
"from azureml.core.workspace import Workspace \r\n",
|
|
"\r\n",
|
|
"if __name__.startswith(\"sqlindb\"):\r\n",
|
|
" auth = ServicePrincipalAuthentication(tenantid, appid, password) \r\n",
|
|
" \r\n",
|
|
" ws = Workspace.from_config(path=config_file, auth=auth) \r\n",
|
|
" \r\n",
|
|
" project_folder = \"./sample_projects/\" + experiment_name\r\n",
|
|
" \r\n",
|
|
" experiment = Experiment(ws, experiment_name) \r\n",
|
|
"\r\n",
|
|
" data_train = input_data\r\n",
|
|
" X_valid = None\r\n",
|
|
" y_valid = None\r\n",
|
|
" sample_weight_valid = None\r\n",
|
|
"\r\n",
|
|
" if is_validate_column != \"\" and is_validate_column is not None:\r\n",
|
|
" data_train = input_data[input_data[is_validate_column] <= 0]\r\n",
|
|
" data_valid = input_data[input_data[is_validate_column] > 0]\r\n",
|
|
" data_train.pop(is_validate_column)\r\n",
|
|
" data_valid.pop(is_validate_column)\r\n",
|
|
" y_valid = data_valid.pop(label_column).values\r\n",
|
|
" if sample_weight_column != \"\" and sample_weight_column is not None:\r\n",
|
|
" sample_weight_valid = data_valid.pop(sample_weight_column).values\r\n",
|
|
" X_valid = data_valid\r\n",
|
|
" n_cross_validations = None\r\n",
|
|
"\r\n",
|
|
" y_train = data_train.pop(label_column).values\r\n",
|
|
"\r\n",
|
|
" sample_weight = None\r\n",
|
|
" if sample_weight_column != \"\" and sample_weight_column is not None:\r\n",
|
|
" sample_weight = data_train.pop(sample_weight_column).values\r\n",
|
|
"\r\n",
|
|
" X_train = data_train\r\n",
|
|
"\r\n",
|
|
" if experiment_timeout_minutes == 0:\r\n",
|
|
" experiment_timeout_minutes = None\r\n",
|
|
"\r\n",
|
|
" if experiment_exit_score == 0:\r\n",
|
|
" experiment_exit_score = None\r\n",
|
|
"\r\n",
|
|
" if blacklist_models == \"\":\r\n",
|
|
" blacklist_models = None\r\n",
|
|
"\r\n",
|
|
" if blacklist_models is not None:\r\n",
|
|
" blacklist_models = blacklist_models.replace(\" \", \"\").split(\",\")\r\n",
|
|
"\r\n",
|
|
" if whitelist_models == \"\":\r\n",
|
|
" whitelist_models = None\r\n",
|
|
"\r\n",
|
|
" if whitelist_models is not None:\r\n",
|
|
" whitelist_models = whitelist_models.replace(\" \", \"\").split(\",\")\r\n",
|
|
"\r\n",
|
|
" automl_settings = {}\r\n",
|
|
" preprocess = True\r\n",
|
|
" if time_column_name != \"\" and time_column_name is not None:\r\n",
|
|
" automl_settings = { \"time_column_name\": time_column_name }\r\n",
|
|
" preprocess = False\r\n",
|
|
"\r\n",
|
|
" log_file_name = \"automl_errors.log\"\r\n",
|
|
"\t \r\n",
|
|
" automl_config = AutoMLConfig(task = task, \r\n",
|
|
" debug_log = log_file_name, \r\n",
|
|
" primary_metric = primary_metric, \r\n",
|
|
" iteration_timeout_minutes = iteration_timeout_minutes, \r\n",
|
|
" experiment_timeout_minutes = experiment_timeout_minutes,\r\n",
|
|
" iterations = iterations, \r\n",
|
|
" n_cross_validations = n_cross_validations, \r\n",
|
|
" preprocess = preprocess,\r\n",
|
|
" verbosity = logging.INFO, \r\n",
|
|
" enable_ensembling = False,\r\n",
|
|
" X = X_train, \r\n",
|
|
" y = y_train, \r\n",
|
|
" path = project_folder,\r\n",
|
|
" blacklist_models = blacklist_models,\r\n",
|
|
" whitelist_models = whitelist_models,\r\n",
|
|
" experiment_exit_score = experiment_exit_score,\r\n",
|
|
" sample_weight = sample_weight,\r\n",
|
|
" X_valid = X_valid,\r\n",
|
|
" y_valid = y_valid,\r\n",
|
|
" sample_weight_valid = sample_weight_valid,\r\n",
|
|
" **automl_settings) \r\n",
|
|
" \r\n",
|
|
" local_run = experiment.submit(automl_config, show_output = True) \r\n",
|
|
"\r\n",
|
|
" best_run, fitted_model = local_run.get_output()\r\n",
|
|
"\r\n",
|
|
" pickled_model = codecs.encode(pickle.dumps(fitted_model), \"base64\").decode()\r\n",
|
|
"\r\n",
|
|
" log_file_text = \"\"\r\n",
|
|
"\r\n",
|
|
" try:\r\n",
|
|
" with open(log_file_name, \"r\") as log_file:\r\n",
|
|
" log_file_text = log_file.read()\r\n",
|
|
" except:\r\n",
|
|
" log_file_text = \"Log file not found\"\r\n",
|
|
"\r\n",
|
|
" returned_model = pd.DataFrame({\"best_run\": [best_run.id], \"experiment_name\": [experiment_name], \"fitted_model\": [pickled_model], \"log_file_text\": [log_file_text], \"workspace\": [ws.name]}, dtype=np.dtype(np.str))\r\n",
|
|
"'\r\n",
|
|
"\t, @input_data_1 = @input_query\r\n",
|
|
"\t, @input_data_1_name = N'input_data'\r\n",
|
|
"\t, @output_data_1_name = N'returned_model'\r\n",
|
|
"\t, @params = N'@label_column NVARCHAR(255), \r\n",
|
|
"\t @primary_metric NVARCHAR(40),\r\n",
|
|
"\t\t\t\t @iterations INT, @task NVARCHAR(40),\r\n",
|
|
"\t\t\t\t @experiment_name NVARCHAR(32),\r\n",
|
|
"\t\t\t\t @iteration_timeout_minutes INT,\r\n",
|
|
"\t\t\t\t @experiment_timeout_minutes INT,\r\n",
|
|
"\t\t\t\t @n_cross_validations INT,\r\n",
|
|
"\t\t\t\t @blacklist_models NVARCHAR(MAX),\r\n",
|
|
"\t\t\t\t @whitelist_models NVARCHAR(MAX),\r\n",
|
|
"\t\t\t\t @experiment_exit_score FLOAT,\r\n",
|
|
"\t\t\t\t @sample_weight_column NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @is_validate_column NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @time_column_name NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @tenantid NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @appid NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @password NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @config_file NVARCHAR(255)'\r\n",
|
|
"\t, @label_column = @label_column\r\n",
|
|
"\t, @primary_metric = @primary_metric\r\n",
|
|
"\t, @iterations = @iterations\r\n",
|
|
"\t, @task = @task\r\n",
|
|
"\t, @experiment_name = @experiment_name\r\n",
|
|
"\t, @iteration_timeout_minutes = @iteration_timeout_minutes\r\n",
|
|
"\t, @experiment_timeout_minutes = @experiment_timeout_minutes\r\n",
|
|
"\t, @n_cross_validations = @n_cross_validations\r\n",
|
|
"\t, @blacklist_models = @blacklist_models\r\n",
|
|
"\t, @whitelist_models = @whitelist_models\r\n",
|
|
"\t, @experiment_exit_score = @experiment_exit_score\r\n",
|
|
"\t, @sample_weight_column = @sample_weight_column\r\n",
|
|
"\t, @is_validate_column = @is_validate_column\r\n",
|
|
"\t, @time_column_name = @time_column_name\r\n",
|
|
"\t, @tenantid = @tenantid\r\n",
|
|
"\t, @appid = @appid\r\n",
|
|
"\t, @password = @password\r\n",
|
|
"\t, @config_file = @config_file\r\n",
|
|
"WITH RESULT SETS ((best_run NVARCHAR(250), experiment_name NVARCHAR(100), fitted_model VARCHAR(MAX), log_file_text NVARCHAR(MAX), workspace NVARCHAR(100)))\r\n",
|
|
"END"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- This procedure returns a list of metrics for each iteration of a training run.\r\n",
|
|
"SET ANSI_NULLS ON\r\n",
|
|
"GO\r\n",
|
|
"SET QUOTED_IDENTIFIER ON\r\n",
|
|
"GO\r\n",
|
|
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLGetMetrics]\r\n",
|
|
" (\r\n",
|
|
"\t@run_id NVARCHAR(250), -- The RunId\r\n",
|
|
" @experiment_name NVARCHAR(32)='automl-sql-test', -- This can be used to find the experiment in the Azure Portal.\r\n",
|
|
" @connection_name NVARCHAR(255)='default' -- The AML connection to use.\r\n",
|
|
" ) AS\r\n",
|
|
"BEGIN\r\n",
|
|
" DECLARE @tenantid NVARCHAR(255)\r\n",
|
|
" DECLARE @appid NVARCHAR(255)\r\n",
|
|
" DECLARE @password NVARCHAR(255)\r\n",
|
|
" DECLARE @config_file NVARCHAR(255)\r\n",
|
|
"\r\n",
|
|
"\tSELECT @tenantid=TenantId, @appid=AppId, @password=Password, @config_file=ConfigFile\r\n",
|
|
"\tFROM aml_connection\r\n",
|
|
"\tWHERE ConnectionName = @connection_name;\r\n",
|
|
"\r\n",
|
|
" EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd\r\n",
|
|
"import logging \r\n",
|
|
"import azureml.core \r\n",
|
|
"import numpy as np\r\n",
|
|
"from azureml.core.experiment import Experiment \r\n",
|
|
"from azureml.train.automl.run import AutoMLRun\r\n",
|
|
"from azureml.core.authentication import ServicePrincipalAuthentication \r\n",
|
|
"from azureml.core.workspace import Workspace \r\n",
|
|
"\r\n",
|
|
"auth = ServicePrincipalAuthentication(tenantid, appid, password) \r\n",
|
|
" \r\n",
|
|
"ws = Workspace.from_config(path=config_file, auth=auth) \r\n",
|
|
" \r\n",
|
|
"experiment = Experiment(ws, experiment_name) \r\n",
|
|
"\r\n",
|
|
"ml_run = AutoMLRun(experiment = experiment, run_id = run_id)\r\n",
|
|
"\r\n",
|
|
"children = list(ml_run.get_children())\r\n",
|
|
"iterationlist = []\r\n",
|
|
"metricnamelist = []\r\n",
|
|
"metricvaluelist = []\r\n",
|
|
"\r\n",
|
|
"for run in children:\r\n",
|
|
" properties = run.get_properties()\r\n",
|
|
" if \"iteration\" in properties:\r\n",
|
|
" iteration = int(properties[\"iteration\"])\r\n",
|
|
" for metric_name, metric_value in run.get_metrics().items():\r\n",
|
|
" if isinstance(metric_value, float):\r\n",
|
|
" iterationlist.append(iteration)\r\n",
|
|
" metricnamelist.append(metric_name)\r\n",
|
|
" metricvaluelist.append(metric_value)\r\n",
|
|
" \r\n",
|
|
"metrics = pd.DataFrame({\"iteration\": iterationlist, \"metric_name\": metricnamelist, \"metric_value\": metricvaluelist})\r\n",
|
|
"'\r\n",
|
|
" , @output_data_1_name = N'metrics'\r\n",
|
|
"\t, @params = N'@run_id NVARCHAR(250), \r\n",
|
|
"\t\t\t\t @experiment_name NVARCHAR(32),\r\n",
|
|
" \t\t\t\t @tenantid NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @appid NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @password NVARCHAR(255),\r\n",
|
|
"\t\t\t\t @config_file NVARCHAR(255)'\r\n",
|
|
" , @run_id = @run_id\r\n",
|
|
"\t, @experiment_name = @experiment_name\r\n",
|
|
"\t, @tenantid = @tenantid\r\n",
|
|
"\t, @appid = @appid\r\n",
|
|
"\t, @password = @password\r\n",
|
|
"\t, @config_file = @config_file\r\n",
|
|
"WITH RESULT SETS ((iteration INT, metric_name NVARCHAR(100), metric_value FLOAT))\r\n",
|
|
"END"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"-- This procedure predicts values based on a model returned by AutoMLTrain and a dataset.\r\n",
|
|
"-- It returns the dataset with a new column added, which is the predicted value.\r\n",
|
|
"SET ANSI_NULLS ON\r\n",
|
|
"GO\r\n",
|
|
"SET QUOTED_IDENTIFIER ON\r\n",
|
|
"GO\r\n",
|
|
"CREATE OR ALTER PROCEDURE [dbo].[AutoMLPredict]\r\n",
|
|
" (\r\n",
|
|
" @input_query NVARCHAR(MAX), -- A SQL query returning data to predict on.\r\n",
|
|
" @model NVARCHAR(MAX), -- A model returned from AutoMLTrain.\r\n",
|
|
" @label_column NVARCHAR(255)='' -- Optional name of the column from input_query, which should be ignored when predicting\r\n",
|
|
" ) AS \r\n",
|
|
"BEGIN \r\n",
|
|
" \r\n",
|
|
" EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd \r\n",
|
|
"import azureml.core \r\n",
|
|
"import numpy as np \r\n",
|
|
"from azureml.train.automl import AutoMLConfig \r\n",
|
|
"import pickle \r\n",
|
|
"import codecs \r\n",
|
|
" \r\n",
|
|
"model_obj = pickle.loads(codecs.decode(model.encode(), \"base64\")) \r\n",
|
|
" \r\n",
|
|
"test_data = input_data.copy() \r\n",
|
|
"\r\n",
|
|
"if label_column != \"\" and label_column is not None:\r\n",
|
|
" y_test = test_data.pop(label_column).values \r\n",
|
|
"X_test = test_data \r\n",
|
|
" \r\n",
|
|
"predicted = model_obj.predict(X_test) \r\n",
|
|
" \r\n",
|
|
"combined_output = input_data.assign(predicted=predicted)\r\n",
|
|
" \r\n",
|
|
"' \r\n",
|
|
" , @input_data_1 = @input_query \r\n",
|
|
" , @input_data_1_name = N'input_data' \r\n",
|
|
" , @output_data_1_name = N'combined_output' \r\n",
|
|
" , @params = N'@model NVARCHAR(MAX), @label_column NVARCHAR(255)' \r\n",
|
|
" , @model = @model \r\n",
|
|
"\t, @label_column = @label_column\r\n",
|
|
"END"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"authors": [
|
|
{
|
|
"name": "jeffshep"
|
|
}
|
|
],
|
|
"kernelspec": {
|
|
"display_name": "Python 3.6",
|
|
"language": "sql",
|
|
"name": "python36"
|
|
},
|
|
"language_info": {
|
|
"name": "sql",
|
|
"version": ""
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
} |