Skip to content

Benchmarks

This page benchmarks skorecard against several alternative models on the same dataset.

Data

UCI Credit card dataset with 30k rows and 23 features.

import pandas as pd
from skorecard.datasets import load_credit_card
from sklearn.model_selection import train_test_split

# Load the credit card data as one DataFrame; the target lives in column "y".
data = load_credit_card(as_frame=True)
print(f"data shape: {data.shape}")

# Select the target as a 1-d Series (data["y"]) instead of a one-column
# DataFrame (data[["y"]]): scikit-learn estimators expect y of shape
# (n_samples,) and emit a DataConversionWarning for a column vector.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["y"], axis=1), data["y"], test_size=0.25, random_state=42
)

# Second split of the full frame (features and target together), used by the
# estimator that is fitted on a single DataFrame. It uses the same test_size
# and random_state as the split above.
data_train_opt, data_test_opt = train_test_split(data, test_size=0.25, random_state=42)
data shape: (30000, 24)

Experiment setup

from sklearn.metrics import roc_auc_score


def report_auc(clf, X_train, y_train, X_test, y_test):
    """Score an already-fitted classifier on the train and test sets.

    Args:
        clf: fitted estimator exposing ``predict_proba``.
        X_train: training features.
        y_train: training labels.
        X_test: test features.
        y_test: test labels.

    Returns:
        Tuple ``(auc_train, auc_test)``, each ROC AUC rounded to 4 decimals.
    """

    def _rounded_auc(features, labels):
        # The second column of predict_proba is used as the score.
        scores = clf.predict_proba(features)[:, 1]
        return round(roc_auc_score(labels, scores), 4)

    return _rounded_auc(X_train, y_train), _rounded_auc(X_test, y_test)
from memo import memlist, time_taken

# Collects one result record per fit_eval_record call (filled by @memlist).
# NOTE(review): this rebinds `data`, which earlier held the loaded DataFrame;
# the train/test splits were already derived from it, so they are unaffected.
data = []


@memlist(data=data)
@time_taken()
def fit_eval_record(clf, name, opt=False):
    """Fit ``clf`` on the training data and return its train/test ROC AUC.

    Args:
        clf: unfitted estimator exposing ``fit`` and ``predict_proba``.
        name: label for this run. It is not used in the body; presumably the
            ``memlist`` decorator records it (the results table has a
            ``name`` column), so pass it by keyword.
        opt: when True, ``clf`` is fitted on a single DataFrame that contains
            the target column "y" (``data_train_opt``) instead of on separate
            ``X_train`` / ``y_train``.

    Returns:
        dict with keys ``"auc_train"`` and ``"auc_test"``. The recorded
        outputs also contain ``"time_taken"``, presumably added by the
        ``time_taken`` decorator.
    """
    if opt:
        train, test = data_train_opt, data_test_opt
        clf.fit(train)
        # Take the labels from the same frames that are scored, so this
        # branch does not depend on a separately created split lining up
        # row-for-row, and reuse the shared AUC helper.
        auc_train, auc_test = report_auc(clf, train, train["y"], test, test["y"])
    else:
        clf.fit(X_train, y_train)
        auc_train, auc_test = report_auc(clf, X_train, y_train, X_test, y_test)

    return {"auc_train": auc_train, "auc_test": auc_test}

Baseline

from skorecard import Skorecard

# Baseline: a Skorecard model constructed with its default arguments.
scorecard = Skorecard()
# Fit, score and record the run. `name` is passed by keyword; presumably that
# is what makes it appear as the label in the recorded results - TODO confirm
# against the memo documentation.
fit_eval_record(scorecard, name="skorecard.Scorecard")
{'auc_train': 0.7727, 'auc_test': 0.766, 'time_taken': 16.73}
# from sklearn.pipeline import make_pipeline
# from sklearn.linear_model import LogisticRegression
# from skorecard.preprocessing import WoeEncoder
# from skorecard.bucketers import DecisionTreeBucketer, OptimalBucketer
# from category_encoders.woe import WOEEncoder

# pipe = make_pipeline(
#     DecisionTreeBucketer(),
#     OptimalBucketer(),
#     #WoeEncoder(),
#     WOEEncoder(cols=X_train.columns),
#     LogisticRegression(solver="lbfgs", max_iter=400)
# )

# fit_eval_record(pipe, name="pipeline")

# # .7166 with skorecard woe in 3.7s
# # 0.758 with no WOE in 3.9s
# # 0.7661 with WOE on all cols.

Optbinning

See the excellent package Optbinning.

from optbinning import BinningProcess
from optbinning import Scorecard
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Variable-selection thresholds handed to BinningProcess: bounds on "iv" and a
# minimum "quality_score".
selection_criteria = {"iv": {"min": 0.02, "max": 1}, "quality_score": {"min": 0.01}}
binning_process = BinningProcess(variable_names=list(X_train.columns), selection_criteria=selection_criteria)

estimator = LogisticRegression(solver="lbfgs")

# The scorecard is told the target column name ("y") because it is fitted on
# a single DataFrame holding both features and target.
opt_scorecard = Scorecard(
    target="y",
    binning_process=binning_process,
    estimator=estimator,
    scaling_method="min_max",
    scaling_method_params={"min": 300, "max": 850},
)

# No separate opt_scorecard.fit(...) here: fit_eval_record fits the model
# itself, so a pre-fit would be redundant work outside the timed call and
# would treat this model differently from the other benchmarked models.
fit_eval_record(opt_scorecard, name="optbinning.Scorecard", opt=True)
{'auc_train': 0.7719, 'auc_test': 0.7628, 'time_taken': 1.88}

Basic LR

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Plain logistic regression on standardized features, with no bucketing step.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver="lbfgs", random_state=42),
)

# Fit, score and record the run under the given label.
fit_eval_record(pipe, name="sklearn.LogisticRegression")
/Users/iv58uq/miniconda3/envs/dancard_py37/lib/python3.7/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  return f(*args, **kwargs)

{'auc_train': 0.724, 'auc_test': 0.7232, 'time_taken': 0.11}

LightGBM model

The LightGBM Classifier documentation can be found here

from lightgbm import LGBMClassifier

# Gradient-boosted trees with a fixed seed; all other settings are defaults.
clf = LGBMClassifier(
    learning_rate=0.01,
    max_depth=10,
    random_state=42,
)

# Fit, score and record the run under the given label.
fit_eval_record(clf, name="LightGBM")
/Users/iv58uq/miniconda3/envs/dancard_py37/lib/python3.7/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  return f(*args, **kwargs)

{'auc_train': 0.8038, 'auc_test': 0.7778, 'time_taken': 0.33}

Results

# Build the results table from the recorded runs: remove the helper-only
# "opt" column and list the best test AUC first.
(
    pd.DataFrame(data)
    .drop(columns="opt")
    .sort_values(by="auc_test", ascending=False)
)
name auc_train auc_test time_taken
3 LightGBM 0.8038 0.7778 0.33
0 skorecard.Scorecard 0.7727 0.7660 16.73
1 optbinning.Scorecard 0.7719 0.7628 1.88
2 sklearn.LogisticRegression 0.7240 0.7232 0.11