# Summary
Starting from raw data that mixes categorical and numeric columns, featurize the categorical columns using one-hot encoding. Then use the tabular explainer to build an explanation object and retrieve feature importances expressed in terms of the original raw features.

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/explain-model/explain-tabular-data-raw-features/explain-sklearn-raw-features.png)

Explain a model with the AML explain-model package on raw features

1. Train a Logistic Regression model using Scikit-learn
2. Run 'explain_model' with full dataset in local mode, which doesn't contact any Azure services.
3. Run 'explain_model' with summarized dataset in local mode, which doesn't contact any Azure services.
4. Visualize the global and local explanations with the visualization dashboard.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from azureml.explain.model.tabular_explainer import TabularExplainer
import pandas as pd
import numpy as np

In [None]:
# Titanic passenger data, pinned to a specific commit so the file cannot change.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
# Fill missing values: forward-fill first, then back-fill whatever is still
# missing (e.g. gaps in the first rows, which have no earlier value to copy).
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
data = data.ffill().bfill()

# 1. Run model explainer locally with full data

Similar to the example [here](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py), we use only a subset of the columns.

In [None]:
from sklearn.model_selection import train_test_split

# Column groups used as raw model inputs.
categorical_features = ['embarked', 'sex', 'pclass']
numeric_features = ['age', 'fare']

# Raw feature frame (categoricals first, then numerics) and the target array.
X = data[categorical_features + numeric_features]
y = data['survived'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

sklearn imports

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

We can explain raw features by using either a `sklearn.compose.ColumnTransformer` or a list of fitted transformer tuples. The cell below uses `sklearn.compose.ColumnTransformer`. If you want to run the example with the list of fitted transformer tuples instead, comment out the cell below and uncomment the cell that follows it.

In [None]:
from sklearn.compose import ColumnTransformer

# Numeric columns: impute with the median, then standardize.
age_fare_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# 'embarked': fill gaps with the constant 'missing', then one-hot encode.
embarked_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
    ("encoder", OneHotEncoder(sparse=False)),
])

# Each entry is (name, transformer, raw columns the transformer consumes).
transformations = ColumnTransformer([
    ("age_fare", age_fare_pipeline, ["age", "fare"]),
    ("embarked", embarked_pipeline, ["embarked"]),
    ("sex_pclass", OneHotEncoder(sparse=False), ["sex", "pclass"]),
])

# Append the classifier to the preprocessing step so that a single object
# forms the full prediction pipeline.
pipeline_steps = [
    ('preprocessor', transformations),
    ('classifier', LogisticRegression(solver='lbfgs')),
]
clf = Pipeline(steps=pipeline_steps)


In [None]:
# Alternative to the ColumnTransformer cell: the same preprocessing expressed as
# a list of (columns, transformer) tuples wrapped in sklearn-pandas'
# DataFrameMapper. The whole cell is disabled by the surrounding triple-quoted
# string; remove the two ''' lines to enable it.
'''
# Uncomment below if sklearn-pandas is not installed
#!pip install sklearn-pandas
from sklearn_pandas import DataFrameMapper

# Impute, standardize the numeric features and one-hot encode the categorical features. 

transformations = [
 (["age", "fare"], Pipeline(steps=[
 ('imputer', SimpleImputer(strategy='median')),
 ('scaler', StandardScaler())
 ])),
 (["embarked"], Pipeline(steps=[
 ("imputer", SimpleImputer(strategy='constant', fill_value='missing')), 
 ("encoder", OneHotEncoder(sparse=False))])),
 (["sex", "pclass"], OneHotEncoder(sparse=False)) 
]


# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),
 ('classifier', LogisticRegression(solver='lbfgs'))])
'''

## Train a Logistic Regression model, which you want to explain

In [None]:
# Fit the full pipeline (preprocessing + logistic regression) on the training
# split; the value returned by fit() is kept as `model` for the dashboard below.
model = clf.fit(x_train, y_train)

## Explain predictions on your local machine

In [None]:
# Explain the final pipeline step (the fitted classifier) and hand the
# preprocessing transformations to the explainer separately, so that
# importances can be reported for the raw input columns.
tabular_explainer = TabularExplainer(
    clf.steps[-1][1],
    initialization_examples=x_train,
    features=x_train.columns,
    transformations=transformations,
)

In [None]:
# Use the test split as the evaluation examples; it must be a representative
# sample of the original data.
# x_train could be passed instead: more examples may give more accurate
# explanations, but computing them takes longer.
global_explanation = tabular_explainer.explain_global(x_test)

In [None]:
# Ranked feature names and their matching global importance values.
sorted_global_importance_names = global_explanation.get_ranked_global_names()
sorted_global_importance_values = global_explanation.get_ranked_global_values()

# Display a name -> importance mapping as the cell output.
{
    feature_name: importance
    for feature_name, importance in zip(sorted_global_importance_names,
                                        sorted_global_importance_values)
}

## Explain overall model predictions as a collection of local (instance-level) explanations

In [None]:
# Explain a single instance: the first row of the test set, selected by
# position and kept as a one-row DataFrame.
local_explanation = tabular_explainer.explain_local(x_test.iloc[:1])

In [None]:
# Get the prediction for the first member of the test set. Only that one row is
# scored, rather than predicting the whole test set to read a single element.
prediction_value = clf.predict(x_test.iloc[:1])[0]

# Select the ranked importances for the predicted class, i.e. why the model
# made that prediction.
# NOTE(review): indexing by `prediction_value` assumes the class labels (0/1
# for 'survived') coincide with the class positions in the explanation — confirm.
sorted_local_importance_values = local_explanation.get_ranked_local_values()[prediction_value]
sorted_local_importance_names = local_explanation.get_ranked_local_names()[prediction_value]

# Sorted local SHAP values
print('ranked local importance values: {}'.format(sorted_local_importance_values))
# Corresponding feature names
print('ranked local importance names: {}'.format(sorted_local_importance_names))

# 2. Load visualization dashboard

In [None]:
# Install and enable the notebook extension that renders the explanation dashboard.
# NOTE: the extensions must already be enabled before the Jupyter kernel starts
# (presumably a kernel restart is needed after running this cell for the first
# time — confirm).
!jupyter nbextension install --py --sys-prefix azureml.contrib.explain.model.visualize
!jupyter nbextension enable --py --sys-prefix azureml.contrib.explain.model.visualize
# Or, in JupyterLab, uncomment the two lines below instead
# (prefix each with `!` to run it as a shell command from the notebook).
# jupyter labextension install @jupyter-widgets/jupyterlab-manager
# jupyter labextension install microsoft-mli-widget

In [None]:
from azureml.contrib.explain.model.visualize import ExplanationDashboard

In [None]:
# Render the interactive dashboard from the global explanation, the object
# returned by clf.fit() (`model`), and the raw test features.
ExplanationDashboard(global_explanation, model, x_test)