import pandas as pd
from skorecard.datasets import load_credit_card
from sklearn.model_selection import train_test_split
# Load the credit-card data as one DataFrame; the target lives in column "y".
data = load_credit_card(as_frame=True)
print(f"data shape: {data.shape}")

# Split the full frame ONCE. The "*_opt" frames keep the target column because
# the optbinning scorecard is fitted on a frame that still contains "y".
data_train_opt, data_test_opt = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

# Derive the feature/target views from that same split instead of running a
# second train_test_split with duplicated parameters: the rows of X_*/y_* are
# then aligned with data_*_opt by construction, which matters because
# predictions made on data_*_opt are later scored against y_train / y_test.
X_train = data_train_opt.drop(['y'], axis=1)
X_test = data_test_opt.drop(['y'], axis=1)
# Targets are kept as single-column DataFrames, as before.
y_train = data_train_opt[['y']]
y_test = data_test_opt[['y']]
# Experiment setup
from sklearn.metrics import roc_auc_score
def report_auc(clf, X_train, y_train, X_test, y_test):
    """Return the ROC AUC of a fitted classifier on the train and test sets.

    Args:
        clf: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the score.
        X_train: Inputs passed to ``clf.predict_proba`` for the train score.
        y_train: True labels matching ``X_train``.
        X_test: Inputs passed to ``clf.predict_proba`` for the test score.
        y_test: True labels matching ``X_test``.

    Returns:
        A tuple ``(auc_train, auc_test)``, each rounded to 4 decimals.
    """
    scores = []
    for features, target in ((X_train, y_train), (X_test, y_test)):
        positive_proba = clf.predict_proba(features)[:, 1]
        scores.append(round(roc_auc_score(target, positive_proba), 4))
    auc_train, auc_test = scores
    return auc_train, auc_test
from memo import memlist, time_taken
# Accumulator handed to the memlist decorator below.
# NOTE(review): this rebinds `data`, which earlier held the loaded DataFrame.
data = []


@memlist(data=data)
@time_taken()
def fit_eval_record(clf, name, opt=False):
    """Fit ``clf`` on the training split and return its train/test AUC.

    Args:
        clf: Estimator to fit and score.
        name: Label for the run. It is not read in the body; presumably the
            ``memlist`` decorator records it with the result -- confirm
            against the memo documentation.
        opt: When truthy, fit with a single argument (``data_train_opt``,
            which still contains the target column) and predict on the
            ``*_opt`` frames. Otherwise fit on ``X_train, y_train`` and
            predict on ``X_train`` / ``X_test``.

    Returns:
        ``{'auc_train': ..., 'auc_test': ...}`` with both values rounded to
        4 decimals. Labels are always ``y_train`` / ``y_test``.
    """
    if not opt:
        clf.fit(X_train, y_train)
        train_features, test_features = X_train, X_test
    else:
        clf.fit(data_train_opt)
        train_features, test_features = data_train_opt, data_test_opt

    # Both modes are scored the same way, so share the helper.
    train_score, test_score = report_auc(
        clf, train_features, y_train, test_features, y_test
    )
    return {'auc_train': train_score, 'auc_test': test_score}
# Baseline
from skorecard import Skorecard
# Baseline model: a Skorecard estimator constructed with no arguments.
scorecard = Skorecard()
# Fit it on X_train / y_train and compute train/test AUC for the run labelled
# "skorecard.Scorecard" (opt is left at its default of False).
fit_eval_record(scorecard, name="skorecard.Scorecard")
# from sklearn.pipeline import make_pipeline
# from sklearn.linear_model import LogisticRegression
# from skorecard.preprocessing import WoeEncoder
# from skorecard.bucketers import DecisionTreeBucketer, OptimalBucketer
# from category_encoders.woe import WOEEncoder
# pipe = make_pipeline(
# DecisionTreeBucketer(),
# OptimalBucketer(),
# #WoeEncoder(),
# WOEEncoder(cols=X_train.columns),
# LogisticRegression(solver="lbfgs", max_iter=400)
# )
# fit_eval_record(pipe, name="pipeline")
# # .7166 with skorecard woe in 3.7s
# # 0.758 with no WOE in 3.9s
# # 0.7661 with WOE on all cols.
# Optbinning
# See the excellent `optbinning` package.
from optbinning import BinningProcess
from optbinning import Scorecard
from sklearn.linear_model import LogisticRegression
import pandas as pd
# Variable-selection thresholds handed to BinningProcess: bounds on each
# variable's "iv" and "quality_score" metrics.
selection_criteria = {
    "iv": {"min": 0.02, "max": 1},
    "quality_score": {"min": 0.01},
}
binning_process = BinningProcess(
    variable_names=list(X_train.columns),
    selection_criteria=selection_criteria,
)
estimator = LogisticRegression(solver="lbfgs")
opt_scorecard = Scorecard(
    target="y",
    binning_process=binning_process,
    estimator=estimator,
    scaling_method="min_max",
    scaling_method_params={"min": 300, "max": 850},
)
# No separate opt_scorecard.fit(...) call here: with opt=True,
# fit_eval_record fits the scorecard on data_train_opt itself, so fitting
# beforehand only repeated the training outside the recorded run.
fit_eval_record(opt_scorecard, name="optbinning.Scorecard", opt=True)
# Basic LR
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Logistic regression on standardised features: scale first, then classify.
scaler = StandardScaler()
logreg = LogisticRegression(random_state=42, solver="lbfgs")
pipe = make_pipeline(scaler, logreg)

# Fit on X_train / y_train and compute train/test AUC for this run.
fit_eval_record(pipe, name="sklearn.LogisticRegression")
from lightgbm import LGBMClassifier
# Gradient-boosted trees with a fixed seed; hyper-parameters are gathered in
# one place and unpacked into the constructor.
lgbm_params = {
    "random_state": 42,
    "max_depth": 10,
    "learning_rate": 0.01,
}
clf = LGBMClassifier(**lgbm_params)

# Fit on X_train / y_train and compute train/test AUC for this run.
fit_eval_record(clf, name="LightGBM")
# Results
# Tabulate the recorded runs, best test AUC first. The "opt" column exists
# only if some run passed opt=...; errors="ignore" keeps this line from
# raising KeyError when no such run was recorded (e.g. the optbinning
# section was skipped).
(
    pd.DataFrame(data)
    .sort_values("auc_test", ascending=False)
    .drop(columns="opt", errors="ignore")
)
# Last update: 2021-11-24