EBM benchmark with skorecard
This benchmark was adapted from this notebook.
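It compares an Explainable Boosting Machine (EBM), logistic regression on one-hot and ordinal encodings, a default Skorecard model, a random forest, and XGBoost on five binary classification datasets, scoring each model by cross-validated ROC AUC over stratified shuffle splits.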
# To run this benchmark script, you will need to install XGBoost
# (pip install xgboost)
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
import warnings
warnings.filterwarnings("ignore")
def load_breast_data():
    breast = load_breast_cancer()
    feature_names = list(breast.feature_names)
    X, y = pd.DataFrame(breast.data, columns=feature_names), breast.target
    dataset = {
        "problem": "classification",
        "full": {
            "X": X,
            "y": y,
        },
    }
    return dataset
def load_adult_data():
    df = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None,
    )
    df.columns = [
        "Age",
        "WorkClass",
        "fnlwgt",
        "Education",
        "EducationNum",
        "MaritalStatus",
        "Occupation",
        "Relationship",
        "Race",
        "Gender",
        "CapitalGain",
        "CapitalLoss",
        "HoursPerWeek",
        "NativeCountry",
        "Income",
    ]
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        "problem": "classification",
        "full": {
            "X": X_df,
            "y": y_df,
        },
    }
    return dataset
def load_heart_data():
    # https://www.kaggle.com/ronitf/heart-disease-uci
    df = pd.read_csv(r"heart.csv")
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        "problem": "classification",
        "full": {
            "X": X_df,
            "y": y_df,
        },
    }
    return dataset
def load_credit_data():
    # https://www.kaggle.com/mlg-ulb/creditcardfraud
    df = pd.read_csv(r"creditcard.csv")
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        "problem": "classification",
        "full": {
            "X": X_df,
            "y": y_df,
        },
    }
    return dataset
def load_telco_churn_data():
    # https://www.kaggle.com/blastchar/telco-customer-churn
    df = pd.read_csv(r"WA_Fn-UseC_-Telco-Customer-Churn.csv")
    train_cols = df.columns[1:-1]  # First column is an ID
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]  # 'Yes' / 'No'
    dataset = {
        "problem": "classification",
        "full": {
            "X": X_df,
            "y": y_df,
        },
    }
    return dataset
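Note on data availability: the breast-cancer data ships with scikit-learn and the adult data is downloaded directly from the UCI repository, while the heart, credit card fraud, and telco churn loaders expect the corresponding Kaggle CSVs (linked in the comments above) to be present in the working directory. The UCI credit card dataset used further down is bundled with skorecard itself.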
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from skorecard import Skorecard
from optbinning import BinningProcess, Scorecard
def format_n(x):
    return f"{x:.3f}"


def process_model(clf, name, X, y, n_splits=3):
    # Evaluate the model: cross-validated ROC AUC on stratified shuffle splits
    ss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=1337)
    scores = cross_validate(clf, X, y, scoring="roc_auc", cv=ss, n_jobs=-1, return_estimator=True)
    record = dict()
    record["model_name"] = name
    record["fit_time_mean"] = format_n(np.mean(scores["fit_time"]))
    record["fit_time_std"] = format_n(np.std(scores["fit_time"]))
    record["test_score_mean"] = format_n(np.mean(scores["test_score"]))
    record["test_score_std"] = format_n(np.std(scores["test_score"]))
    return record
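As a quick sanity check, process_model can also be run on a single estimator outside benchmark_models. A minimal sketch, reusing the loader defined above (the label "ebm_smoke_test" is illustrative, not part of the benchmark):

# Standalone use of process_model: score one model on one dataset.
dataset = load_breast_data()
record = process_model(
    ExplainableBoostingClassifier(n_jobs=-1, random_state=1337),
    "ebm_smoke_test",
    dataset["full"]["X"],
    dataset["full"]["y"],
    n_splits=3,
)
print(record)  # model_name plus fit-time and ROC AUC mean/std, formatted to 3 decimals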
def benchmark_models(dataset_name, X, y, ct=None, n_splits=3, random_state=1337):
    # Split columns into categorical (object dtype) and numeric, so we can
    # build both a one-hot-encoded and an ordinal-encoded view of the data.
    is_cat = np.array([dt.kind == "O" for dt in X.dtypes])
    cat_cols = X.columns.values[is_cat]
    num_cols = X.columns.values[~is_cat]
    num_pipe = Pipeline([("identity", FunctionTransformer())])

    # One-hot-encoded view (used by lr_ohe, rf-100 and xgb),
    # built only when the caller does not supply a transformer.
    if ct is None:
        # "sparse_output" replaces the deprecated "sparse" kwarg (scikit-learn >= 1.2)
        cat_ohe_step = ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
        cat_pipe = Pipeline([cat_ohe_step])
        transformers = [("cat", cat_pipe, cat_cols), ("num", num_pipe, num_cols)]
        ct = ColumnTransformer(transformers=transformers)

    # Ordinal-encoded view (used by lr_ordinal), built unconditionally so it
    # is also available when a custom ct is passed in.
    cat_ord_step = ("ord_enc", OrdinalEncoder())
    cat_pipe = Pipeline([cat_ord_step])
    transformers = [("cat", cat_pipe, cat_cols), ("num", num_pipe, num_cols)]
    ot = ColumnTransformer(transformers=transformers)

    records = []
    summary_record = {"dataset_name": dataset_name}
    print()
    print("-" * 78)
    print(dataset_name)
    print("-" * 78)
    print(summary_record)
    print()

    # Logistic regression on one-hot-encoded, standardized features
    pipe = Pipeline(
        [
            ("ct", ct),
            ("std", StandardScaler()),
            ("lr", LogisticRegression(random_state=random_state)),
        ]
    )
    record = process_model(pipe, "lr_ohe", X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    # Logistic regression on ordinal-encoded, standardized features
    pipe = Pipeline(
        [
            ("ot", ot),
            ("std", StandardScaler()),
            ("lr", LogisticRegression(max_iter=7000, random_state=random_state)),
        ]
    )
    record = process_model(pipe, "lr_ordinal", X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    # Skorecard with default settings; it buckets the raw features itself
    skorecard = Skorecard()
    record = process_model(skorecard, "skorecard", X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    # Random forest on one-hot-encoded features
    pipe = Pipeline(
        [
            ("ct", ct),
            # n_estimators set explicitly to 100, the scikit-learn default since 0.22
            ("rf-100", RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)),
        ]
    )
    record = process_model(pipe, "rf-100", X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    # XGBoost on one-hot-encoded features
    pipe = Pipeline(
        [
            ("ct", ct),
            ("xgb", XGBClassifier(random_state=random_state, eval_metric="logloss")),
        ]
    )
    record = process_model(pipe, "xgb", X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    # No preprocessing pipeline needed: EBM handles string dtypes natively
    ebm_inter = ExplainableBoostingClassifier(n_jobs=-1, random_state=random_state)
    record = process_model(ebm_inter, "ebm", X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    return records
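Because benchmark_models accepts a prebuilt ColumnTransformer, the default one-hot preprocessing can be overridden per dataset. A minimal sketch, assuming the breast-cancer data from above (the scaler choice is purely illustrative, and this relies on the ordinal view being built even when ct is supplied, as in the function above):

# Hypothetical example: pass an explicit ColumnTransformer instead of the
# default one-hot encoder. All breast-cancer features are numeric, so a
# single standardizing transformer covers every column.
dataset = load_breast_data()
X_bc, y_bc = dataset["full"]["X"], dataset["full"]["y"]
custom_ct = ColumnTransformer(
    transformers=[("num", Pipeline([("scale", StandardScaler())]), X_bc.columns.values)]
)
custom_results = benchmark_models("breast-cancer-custom-ct", X_bc, y_bc, ct=custom_ct, n_splits=3)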
results = []
n_splits = 3
from skorecard.datasets import load_uci_credit_card
X, y = load_uci_credit_card(return_X_y=True)
result = benchmark_models("UCI-creditcard", X, y, n_splits=n_splits)
results.append(result)
dataset = load_breast_data()
result = benchmark_models("breast-cancer", dataset["full"]["X"], dataset["full"]["y"], n_splits=n_splits)
results.append(result)
dataset = load_adult_data()
result = benchmark_models("adult", dataset["full"]["X"], dataset["full"]["y"], n_splits=n_splits)
results.append(result)
# 0.888
dataset = load_telco_churn_data()
result = benchmark_models("telco_churn", dataset["full"]["X"], dataset["full"]["y"], n_splits=n_splits)
results.append(result)
dataset = load_heart_data()
result = benchmark_models("heart", dataset["full"]["X"], dataset["full"]["y"], n_splits=n_splits)
results.append(result)
records = [item for result in results for item in result]
record_df = pd.DataFrame.from_records(records)[
    ["dataset_name", "model_name", "test_score_mean", "test_score_std"]
]
record_df = record_df.sort_values(["dataset_name", "test_score_mean"], ascending=False)
print(
    record_df[record_df["model_name"].isin(["ebm", "lr_ohe", "lr_ordinal", "rf-100", "skorecard", "xgb"])]
    .drop(["test_score_std"], axis=1)
    # "index=False" replaces the tabulate-style "showindex" kwarg, which newer pandas rejects
    .to_markdown(tablefmt="github", index=False)
)