mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-20 01:27:06 -05:00
93 lines
3.8 KiB
Transact-SQL
93 lines
3.8 KiB
Transact-SQL
-- This procedure forecasts values based on a forecasting model returned by AutoMLTrain.
|
|
-- It returns a dataset with the forecasted values.
|
|
-- Session options set before the procedure is created. SQL Server saves the
-- ANSI_NULLS and QUOTED_IDENTIFIER values in effect at creation time with the
-- procedure, so they are pinned here instead of relying on client defaults.
SET ANSI_NULLS ON;
GO
SET QUOTED_IDENTIFIER ON;
GO
|
|
CREATE OR ALTER PROCEDURE [dbo].[AutoMLForecast]
 (
 @input_query NVARCHAR(MAX),           -- A SQL query returning data to predict on.
 @model NVARCHAR(MAX),                 -- A model returned from AutoMLTrain (base64-encoded pickle).
                                       -- The model is unpickled inside the Python runtime, so it must
                                       -- come from a trusted source.
 @time_column_name NVARCHAR(255)='',   -- The name of the timestamp column for forecasting.
                                       -- When set, that column is converted with pandas.to_datetime.
 @label_column NVARCHAR(255)='',       -- Optional name of the column from input_query holding actuals.
                                       -- It is removed from the features before predicting and added
                                       -- back to the output; rows with a null actual are then dropped.
 @y_query_column NVARCHAR(255)='',     -- Optional value column that can be used for predicting.
                                       -- If specified, this can contain values for past times (after the model was trained)
                                       -- and contain NaN for future times.
                                       -- If omitted, an all-NaN vector of the same length as the input is used.
 @forecast_column_name NVARCHAR(255) = 'predicted'
                                       -- The name of the output column containing the forecast value.
 ) AS
-- Runs the supplied model over the rows returned by @input_query and returns
-- the input rows joined with their forecast values as a single result set.
-- Rows whose forecast is null are dropped. When @label_column is supplied,
-- rows whose actual value is null are dropped as well.
-- NOTE: no WITH RESULT SETS clause is used because the output schema depends on
-- @input_query; per the sp_execute_external_script documentation the result set
-- columns are therefore returned unnamed unless the caller names them.
BEGIN

    EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import azureml.core
import numpy as np
from azureml.train.automl import AutoMLConfig
import pickle
import codecs

# SECURITY: pickle.loads can execute arbitrary code while deserializing.
# Only pass a model string produced by a trusted AutoMLTrain run.
model_obj = pickle.loads(codecs.decode(model.encode(), "base64"))

test_data = input_data.copy()

# Actuals are optional; when present they are removed from the feature frame.
if label_column != "" and label_column is not None:
    y_test = test_data.pop(label_column).values
else:
    y_test = None

# y_query defaults to all-NaN when no query column is supplied.
if y_query_column != "" and y_query_column is not None:
    y_query = test_data.pop(y_query_column).values
else:
    y_query = np.repeat(np.nan, len(test_data))

X_test = test_data

if time_column_name != "" and time_column_name is not None:
    X_test[time_column_name] = pd.to_datetime(X_test[time_column_name])

y_fcst, X_trans = model_obj.forecast(X_test, y_query)

def align_outputs(y_forecast, X_trans, X_test, y_test, forecast_column_name, label_column):
    # Demonstrates how to get the output aligned to the inputs
    # using pandas indexes. Helps understand what happened if
    # the output shape differs from the input shape, or if
    # the data got re-sorted by time and grain during forecasting.

    # Typical causes of misalignment are:
    # * we predicted some periods that were missing in actuals -> drop from eval
    # * model was asked to predict past max_horizon -> increase max horizon
    # * data at start of X_test was needed for lags -> provide previous periods

    df_fcst = pd.DataFrame({forecast_column_name : y_forecast})
    # y and X outputs are aligned by forecast() function contract
    df_fcst.index = X_trans.index

    # align original X_test to y_test
    X_test_full = X_test.copy()
    if y_test is not None:
        X_test_full[label_column] = y_test

    # X_test_full does not include origin, so reset for merge
    df_fcst.reset_index(inplace=True)
    X_test_full = X_test_full.reset_index().drop(columns=''index'')
    together = df_fcst.merge(X_test_full, how=''right'')

    # drop rows where prediction or actuals are nan
    # happens because of missing actuals
    # or at edges of time due to lags/rolling windows
    # The label column only exists when actuals were supplied; filtering on it
    # unconditionally raised KeyError whenever label_column was left empty.
    required_columns = [forecast_column_name]
    if y_test is not None:
        required_columns.append(label_column)
    clean = together[together[required_columns].notnull().all(axis=1)]
    return clean

combined_output = align_outputs(y_fcst, X_trans, X_test, y_test, forecast_column_name, label_column)

'
    , @input_data_1 = @input_query
    , @input_data_1_name = N'input_data'
    , @output_data_1_name = N'combined_output'
    , @params = N'@model NVARCHAR(MAX), @time_column_name NVARCHAR(255), @label_column NVARCHAR(255), @y_query_column NVARCHAR(255), @forecast_column_name NVARCHAR(255)'
    , @model = @model
    , @time_column_name = @time_column_name
    , @label_column = @label_column
    , @y_query_column = @y_query_column
    , @forecast_column_name = @forecast_column_name
END
|