-- This procedure forecasts values using a forecasting model returned by AutoMLTrain.
-- It returns a dataset with the forecasted values.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE OR ALTER PROCEDURE [dbo].[AutoMLForecast]
 (
   @input_query NVARCHAR(MAX),          -- A SQL query returning data to predict on.
   @model NVARCHAR(MAX),                -- A model returned from AutoMLTrain (base64-encoded pickle; must come from a trusted source).
   @time_column_name  NVARCHAR(255)='', -- The name of the timestamp column for forecasting.
   @label_column  NVARCHAR(255)='',     -- Optional name of the column from input_query, which should be ignored when predicting.
   @y_query_column NVARCHAR(255)='',    -- Optional value column that can be used for predicting.
                                        -- If specified, this can contain values for past times (after the model was trained)
                                        -- and contain NaN for future times.
   @forecast_column_name NVARCHAR(255) = 'predicted'
                                        -- The name of the output column containing the forecast value.
 ) AS
-- Runs the embedded Python script through sp_execute_external_script:
--   1. Deserializes @model (base64 text -> pickle -> model object).
--   2. Executes @input_query and removes the label / y_query columns from the features.
--   3. Converts the time column to datetime when @time_column_name is supplied.
--   4. Calls model_obj.forecast(X_test, y_query) and merges the forecast back onto the input rows.
-- Result set: the input rows joined with the forecast column. Rows whose forecast is null are
-- dropped; when @label_column is supplied, rows whose label is null are dropped as well.
-- The result set schema depends on @input_query, so no WITH RESULT SETS clause is declared.
BEGIN

    EXEC sp_execute_external_script @language = N'Python', @script = N'import pandas as pd
import azureml.core
import numpy as np
from azureml.train.automl import AutoMLConfig
import pickle
import codecs

# SECURITY: pickle.loads can execute arbitrary code while deserializing.
# Only pass a model string that was produced by a trusted AutoMLTrain run.
model_obj = pickle.loads(codecs.decode(model.encode(), "base64"))

# A NULL output column name arrives here as None; fall back to the declared default.
if forecast_column_name is None:
    forecast_column_name = "predicted"

test_data = input_data.copy()

# For the optional column parameters, both "" and NULL (None) mean: not supplied.
if label_column != "" and label_column is not None:
    y_test = test_data.pop(label_column).values
else:
    y_test = None

if y_query_column != "" and y_query_column is not None:
    y_query = test_data.pop(y_query_column).values
else:
    # No known values supplied: ask the model to forecast every row.
    y_query = np.repeat(np.nan, len(test_data))

X_test = test_data

if time_column_name != "" and time_column_name is not None:
    X_test[time_column_name] = pd.to_datetime(X_test[time_column_name])

y_fcst, X_trans = model_obj.forecast(X_test, y_query)

def align_outputs(y_forecast, X_trans, X_test, y_test, label_column, forecast_column_name):
    # Demonstrates how to get the output aligned to the inputs
    # using pandas indexes. Helps understand what happened if
    # the output shape differs from the input shape, or if
    # the data got re-sorted by time and grain during forecasting.
    #
    # Typical causes of misalignment are:
    # * we predicted some periods that were missing in actuals -> drop from eval
    # * model was asked to predict past max_horizon -> increase max horizon
    # * data at start of X_test was needed for lags -> provide previous periods

    df_fcst = pd.DataFrame({forecast_column_name: y_forecast})
    # y and X outputs are aligned by forecast() function contract
    df_fcst.index = X_trans.index

    # Re-attach the actuals (when provided) to the original inputs and track
    # which columns must be non-null for a row to be returned.
    X_test_full = X_test.copy()
    required_columns = [forecast_column_name]
    if y_test is not None:
        X_test_full[label_column] = y_test
        required_columns.insert(0, label_column)

    # X_test_full does not include origin, so reset for merge.
    # drop=True discards the positional index without creating a temporary
    # "index" column, so an input column that is itself named "index" is safe.
    df_fcst.reset_index(inplace=True)
    X_test_full = X_test_full.reset_index(drop=True)
    together = df_fcst.merge(X_test_full, how=''right'')

    # Drop rows where the prediction (or the actual, when a label column was
    # supplied) is null. This happens because of missing actuals
    # or at edges of time due to lags/rolling windows.
    clean = together[together[required_columns].notnull().all(axis=1)]
    return clean

combined_output = align_outputs(y_fcst, X_trans, X_test, y_test, label_column, forecast_column_name)

'
    , @input_data_1 = @input_query
    , @input_data_1_name = N'input_data'
    , @output_data_1_name = N'combined_output'
    , @params = N'@model NVARCHAR(MAX), @time_column_name  NVARCHAR(255), @label_column NVARCHAR(255), @y_query_column NVARCHAR(255), @forecast_column_name NVARCHAR(255)'
    , @model = @model
    , @time_column_name = @time_column_name
    , @label_column = @label_column
    , @y_query_column = @y_query_column
    , @forecast_column_name = @forecast_column_name
END