import pandas as pd
from skorecard.datasets import load_credit_card
from sklearn.model_selection import train_test_split
from skorecard import Skorecard
from skorecard.pipeline.bucketing_process import BucketingProcess
from sklearn.pipeline import make_pipeline
from skorecard.bucketers.bucketers import DecisionTreeBucketer, OptimalBucketer
from time import time
# Load the credit-card data set from skorecard as a DataFrame; the target
# column is "y" (it is dropped from the features and selected as the label below).
data = load_credit_card(as_frame=True)

# Alternative data source, kept for reference:
# data = pd.read_csv('UCI_Credit_Card.csv')
# cols = ["EDUCATION", "MARRIAGE", "LIMIT_BAL", "BILL_AMT1", "default"]
# data = data[cols]
# data.rename(columns={'default': 'y'}, inplace=True)
print(f"data shape: {data.shape}")

# Split into separate feature / target frames for estimators that take (X, y).
features = data.drop(columns=["y"])
target = data[["y"]]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

# Split the full frame (target column included) for estimators that are fitted
# on a single DataFrame. The same test_size and random_state are used so that
# the rows are intended to line up with X_train / X_test.
data_train_opt, data_test_opt = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)
data_train_opt.head()  # notebook preview; the result is discarded in a script

# Turn the single-column target frames into flat 1-D arrays.
y_train, y_test = (part.to_numpy().flatten() for part in (y_train, y_test))
# Experiment setup
from sklearn.metrics import roc_auc_score
def report_auc(clf, X_train, y_train, X_test, y_test):
    """Return the ROC AUC of a fitted classifier on the train and test splits.

    Args:
        clf: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the score.
        X_train: Features of the training split.
        y_train: True labels of the training split.
        X_test: Features of the test split.
        y_test: True labels of the test split.

    Returns:
        A tuple ``(auc_train, auc_test)``, each rounded to 4 decimals.
    """
    splits = ((X_train, y_train), (X_test, y_test))

    # Score both splits first, then evaluate them, train before test.
    scores = [clf.predict_proba(features)[:, 1] for features, _ in splits]
    auc_train, auc_test = [
        round(roc_auc_score(labels, split_scores), 4)
        for (_, labels), split_scores in zip(splits, scores)
    ]
    return auc_train, auc_test
from memo import memlist, time_taken
# List handed to the memlist decorator below as its storage.
# NOTE: this rebinds ``data``, which until here referred to the loaded DataFrame.
data = []


@memlist(data=data)
@time_taken()
def fit_eval_record(clf, name, opt=False):
    """Fit ``clf`` on the training split and report its train/test ROC AUC.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``.
        name: Label for the run. Not used in the body; presumably recorded by
            the ``memlist`` decorator alongside the result (verify against the
            memo documentation).
        opt: If True, fit and score on the full frames ``data_train_opt`` /
            ``data_test_opt`` (which still contain the ``"y"`` column), passing
            ``data_train_opt["y"]`` as the fit target. Otherwise use
            ``X_train`` / ``X_test`` with ``y_train``.

    Returns:
        Dict with keys ``"auc_train"`` and ``"auc_test"``. Callers of the
        decorated function also read a ``"time_taken"`` key from the result.
    """
    if opt:
        train_frame, test_frame = data_train_opt, data_test_opt
        fit_target = data_train_opt["y"]
    else:
        train_frame, test_frame = X_train, X_test
        fit_target = y_train

    clf.fit(train_frame, fit_target)

    # Both variants are evaluated against the same label arrays.
    auc_train, auc_test = report_auc(clf, train_frame, y_train, test_frame, y_test)
    return {"auc_train": auc_train, "auc_test": auc_test}
# Skorecard is currently rather slow. A minor speed-up is possible because a
# BucketingProcess and the pre-bucketers and bucketers it wraps each compute
# identical bucket tables and summaries, which is redundant when they are used
# inside a BucketingProcess. A boolean parameter `get_statistics` has been
# added to the bucketers to skip the calculation of these statistics. The
# comparison below shows the difference in speed this makes at the level of:
#   1) A single bucketer
#   2) A BucketingProcess
#   3) A full Scorecard pipeline
# Benchmark 1: a single DecisionTreeBucketer, with and without statistics.
#
# perf_counter is used instead of time(): it is a monotonic, high-resolution
# clock intended for measuring elapsed intervals, whereas the wall clock can
# jump when the system time is adjusted. Imported here, cell-local, in the
# same style as the other mid-file imports.
from time import perf_counter

# 1a) get_statistics=True: ten fit_transform runs, timed together.
start_slow = perf_counter()
for _run in range(10):
    bucketer_slow = DecisionTreeBucketer(max_n_bins=100, min_bin_size=0.05, get_statistics=True)
    X_train_b1 = bucketer_slow.fit_transform(X_train, y_train)
end_slow = perf_counter()
print("Time for a single bucket when summary is computed:", end_slow - start_slow)

# 1b) get_statistics=False: the same ten runs without the statistics.
start = perf_counter()
for _run in range(10):
    bucketer = DecisionTreeBucketer(max_n_bins=100, min_bin_size=0.05, get_statistics=False)
    X_train_b2 = bucketer.fit_transform(X_train, y_train)
end = perf_counter()
print("Time for a single bucket when summary is not computed:", end - start)
# Benchmark 2: a BucketingProcess (DecisionTreeBucketer pre-bucketing followed
# by an OptimalBucketer), with and without statistics in the wrapped bucketers.
#
# perf_counter is used instead of time(): it is a monotonic, high-resolution
# clock intended for measuring elapsed intervals, whereas the wall clock can
# jump when the system time is adjusted. Imported here, cell-local, in the
# same style as the other mid-file imports.
from time import perf_counter

# 2a) get_statistics=True: five construct-and-fit runs, timed together.
start_slow = perf_counter()
for _run in range(5):
    clf_slow = BucketingProcess(
        prebucketing_pipeline=make_pipeline(
            DecisionTreeBucketer(max_n_bins=100, min_bin_size=0.05, get_statistics=True)
        ),
        bucketing_pipeline=make_pipeline(OptimalBucketer(max_n_bins=10, min_bin_size=0.05, get_statistics=True)),
    )
    clf_slow.fit(X_train, y_train)
end_slow = perf_counter()
print("Time for a bucketing process when redundant summary is computed:", end_slow - start_slow)

# 2b) get_statistics=False: the same five runs without the statistics.
start = perf_counter()
for _run in range(5):
    clf = BucketingProcess(
        prebucketing_pipeline=make_pipeline(
            DecisionTreeBucketer(max_n_bins=100, min_bin_size=0.05, get_statistics=False)
        ),
        bucketing_pipeline=make_pipeline(OptimalBucketer(max_n_bins=10, min_bin_size=0.05, get_statistics=False)),
    )
    clf.fit(X_train, y_train)
end = perf_counter()
print("Time for a bucketing process when redundant summary is not computed:", end - start)
# Benchmark 3: a full Skorecard model, with and without statistics in the
# bucketers. Timing comes from the "time_taken" entry of the dict returned by
# fit_eval_record.
def _make_bucketing_process(get_statistics):
    """Build the DecisionTreeBucketer -> OptimalBucketer process for one run."""
    prebucketer = DecisionTreeBucketer(max_n_bins=100, min_bin_size=0.05, get_statistics=get_statistics)
    bucketer = OptimalBucketer(max_n_bins=10, min_bin_size=0.05, get_statistics=get_statistics)
    return BucketingProcess(
        prebucketing_pipeline=make_pipeline(prebucketer),
        bucketing_pipeline=make_pipeline(bucketer),
    )


# 3a) get_statistics=True
bucketing_process_slow = _make_bucketing_process(True)
scorecard_slow = Skorecard(bucketing=bucketing_process_slow)
d_slow = fit_eval_record(scorecard_slow, name="skorecard.Scorecard")
print("Time for a scorecard model when redundant summary is computed:", d_slow["time_taken"])

# 3b) get_statistics=False
bucketing_process = _make_bucketing_process(False)
scorecard = Skorecard(bucketing=bucketing_process)
d = fit_eval_record(scorecard, name="skorecard.Scorecard")
print("Time for a scorecard model when redundant summary is not computed:", d["time_taken"])