import pandas as pd
from skorecard.datasets import load_credit_card
from sklearn.model_selection import train_test_split
# Load the credit-card dataset as one frame; the target lives in column "y".
data = load_credit_card(as_frame=True)
print(f"data shape: {data.shape}")

# Split the full frame ONCE and derive the feature/target frames from that
# single split. The previous version called train_test_split twice and relied
# on both calls using the same random_state/test_size to keep X_*/y_* aligned
# row-for-row with data_*_opt; deriving them here makes that alignment hold by
# construction while producing the same partitions.
data_train_opt, data_test_opt = train_test_split(data, test_size=0.25, random_state=42)

X_train = data_train_opt.drop(columns=["y"])
X_test = data_test_opt.drop(columns=["y"])
# Targets are kept as one-column DataFrames (not Series), as before.
y_train = data_train_opt[["y"]]
y_test = data_test_opt[["y"]]
# Experiment setup
from sklearn.metrics import roc_auc_score
def report_auc(clf, X_train, y_train, X_test, y_test):
    """Score an already-fitted classifier on the train and test sets.

    For each set, column 1 of ``clf.predict_proba`` is used as the score
    (the positive class for a binary sklearn-style classifier) and the
    ROC AUC against the matching target is rounded to 4 decimals.

    Returns:
        A ``(auc_train, auc_test)`` tuple.
    """
    rounded_aucs = []
    for features, target in ((X_train, y_train), (X_test, y_test)):
        scores = clf.predict_proba(features)[:, 1]
        rounded_aucs.append(round(roc_auc_score(target, scores), 4))
    auc_train, auc_test = rounded_aucs
    return auc_train, auc_test
from memo import memlist, time_taken
# List handed to @memlist below; the results table at the end of the file is
# built from it.
# NOTE(review): this rebinds `data`, which until here held the loaded
# credit-card DataFrame. The frame itself is still reachable through
# data_train_opt / data_test_opt.
data = []


@memlist(data=data)
@time_taken()
def fit_eval_record(clf, name, opt=False):
    """Fit ``clf`` on the training split and return its train/test AUC.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``.
        name: Label for the run. It is not used in the body; presumably the
            memo decorators record it alongside the result (confirm against
            the memo docs).
        opt: When true, ``clf`` is fitted and scored on the full frames
            ``data_train_opt`` / ``data_test_opt`` (target column included,
            ``fit`` called with a single argument). Otherwise it is fitted
            on ``X_train, y_train`` and scored on ``X_train`` / ``X_test``.

    Returns:
        ``{"auc_train": ..., "auc_test": ...}``, each rounded to 4 decimals.

    Reads the module-level splits (``X_*``, ``y_*``, ``data_*_opt``).
    """
    if opt:
        train_inputs, test_inputs = data_train_opt, data_test_opt
        clf.fit(data_train_opt)
    else:
        train_inputs, test_inputs = X_train, X_test
        clf.fit(X_train, y_train)

    # Both modes are scored against the same y_train / y_test targets.
    auc_train, auc_test = report_auc(clf, train_inputs, y_train, test_inputs, y_test)
    return {"auc_train": auc_train, "auc_test": auc_test}
# Baseline
from skorecard import Skorecard
# Baseline: a Skorecard built with no arguments, i.e. its default settings.
scorecard = Skorecard()
# The estimator is passed positionally and the label by keyword; opt keeps its
# default (False), so the model is fitted on X_train / y_train.
fit_eval_record(scorecard, name="skorecard.Scorecard")
# from sklearn.pipeline import make_pipeline
# from sklearn.linear_model import LogisticRegression
# from skorecard.preprocessing import WoeEncoder
# from skorecard.bucketers import DecisionTreeBucketer, OptimalBucketer
# from category_encoders.woe import WOEEncoder
# pipe = make_pipeline(
# DecisionTreeBucketer(),
# OptimalBucketer(),
# #WoeEncoder(),
# WOEEncoder(cols=X_train.columns),
# LogisticRegression(solver="lbfgs", max_iter=400)
# )
# fit_eval_record(pipe, name="pipeline")
# # .7166 with skorecard woe in 3.7s
# # 0.758 with no WOE in 3.9s
# # 0.7661 with WOE on all cols.
# Optbinning
# See the excellent OptBinning package.
from optbinning import BinningProcess
from optbinning import Scorecard
from sklearn.linear_model import LogisticRegression
import pandas as pd
# Variable-selection bounds handed to the binning process: "iv" (presumably
# information value) must lie in [0.02, 1] and "quality_score" must be >= 0.01.
selection_criteria = {"iv": {"min": 0.02, "max": 1}, "quality_score": {"min": 0.01}}
binning_process = BinningProcess(
    variable_names=list(X_train.columns),
    selection_criteria=selection_criteria,
)
estimator = LogisticRegression(solver="lbfgs")

# The scorecard is told the target column name ("y") because it is fitted on
# the full frame rather than on separate X / y objects. Scores are scaled with
# the "min_max" method using min=300 and max=850.
opt_scorecard = Scorecard(
    target="y",
    binning_process=binning_process,
    estimator=estimator,
    scaling_method="min_max",
    scaling_method_params={"min": 300, "max": 850},
)

# No explicit opt_scorecard.fit(...) here: fit_eval_record(opt=True) fits the
# estimator on data_train_opt itself. A separate fit beforehand only repeated
# the work outside the recorded run and treated this model differently from
# the other benchmarked models, which are fitted only inside fit_eval_record.
fit_eval_record(opt_scorecard, name="optbinning.Scorecard", opt=True)
# Basic LR
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Plain logistic regression: features go through StandardScaler first, then an
# lbfgs-solved LogisticRegression with a fixed seed (random_state=42).
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=42, solver="lbfgs"))
# Fitted on X_train / y_train (opt defaults to False) under the label below.
fit_eval_record(pipe, name="sklearn.LogisticRegression")
from lightgbm import LGBMClassifier
# Gradient-boosted comparison model: fixed seed, max_depth=10, learning_rate=0.01;
# every other LGBMClassifier setting is left at its default.
clf = LGBMClassifier(random_state=42, max_depth=10, learning_rate=0.01)
# Fitted on X_train / y_train (opt defaults to False) under the label below.
fit_eval_record(clf, name="LightGBM")
# Results
# Tabulate the collected run records: drop the "opt" column and list the runs
# from highest to lowest test AUC.
(
    pd.DataFrame(data)
    .drop(columns="opt")
    .sort_values(by="auc_test", ascending=False)
)