SHAP variance penalty
When ShapRFECV computes feature importance before eliminating features, it averages each feature's SHAP values to estimate that feature's overall importance. In some situations the variance of these SHAP values can be high, which suggests a lack of agreement about the feature's importance. For such cases, probatus lets you penalize features whose SHAP values have higher variance.
By setting the shap_variance_penalty_factor parameter of the fit_compute() method, the averaged SHAP importance is penalized as follows:

penalized_shap_mean = mean_shap - (shap_variance_penalty_factor * shap_std)

where mean_shap and shap_std are the mean and standard deviation of a feature's SHAP values across samples.
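To make the aggregation concrete, here is a minimal NumPy sketch of the penalized average. The (n_samples, n_features) array layout and the use of absolute SHAP values are illustrative assumptions, not probatus internals:

import numpy as np

# Hypothetical SHAP matrix: one value per sample (rows) and feature (columns).
rng = np.random.default_rng(0)
shap_values = rng.normal(size=(1000, 3))
shap_variance_penalty_factor = 1.0

abs_shap = np.abs(shap_values)
mean_shap = abs_shap.mean(axis=0)  # plain importance: mean(|shap|) per feature
shap_std = abs_shap.std(axis=0)    # spread of |shap| per feature

# Noisier features are pushed down the ranking by the penalty.
penalized_shap_mean = mean_shap - shap_variance_penalty_factor * shap_std

Under this scheme, a feature whose SHAP values are large on average but inconsistent across samples can rank below one with a smaller but more stable contribution.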
See example below:
In [1]:
%%capture
!pip install probatus
!pip install catboost
In [2]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
In [3]:
X, y = make_classification(n_samples=500, n_informative=20, n_features=100)
model = CatBoostClassifier(n_estimators=100, verbose=0)
shap_elimination = ShapRFECV(model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1)
report_with_penalty = shap_elimination.fit_compute(X, y, shap_variance_penalty_factor=1.0)
report_without_penalty = shap_elimination.fit_compute(X, y, shap_variance_penalty_factor=0)
In [4]:
report_with_penalty
Out[4]:
|    | num_features | features_set | eliminated_features | train_metric_mean | train_metric_std | val_metric_mean | val_metric_std |
|---|---|---|---|---|---|---|---|
| 1 | 100 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [48, 23, 25, 17, 41, 79, 70, 67, 96, 95, 52, 5... | 1.000000 | 0.000000 | 0.783734 | 0.036136 |
| 2 | 80 | [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14... | [29, 44, 31, 80, 34, 42, 60, 87, 77, 75, 64, 7... | 1.000000 | 0.000000 | 0.818636 | 0.027409 |
| 3 | 64 | [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 1... | [63, 59, 83, 12, 38, 90, 93, 16, 49, 94, 8, 1] | 1.000000 | 0.000000 | 0.809475 | 0.040263 |
| 4 | 52 | [0, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 18, 19,... | [62, 14, 36, 18, 3, 4, 24, 74, 82, 89] | 1.000000 | 0.000000 | 0.825634 | 0.027513 |
| 5 | 42 | [0, 2, 5, 9, 10, 11, 13, 15, 19, 20, 21, 22, 3... | [2, 66, 68, 39, 71, 72, 22, 99] | 1.000000 | 0.000000 | 0.858765 | 0.031187 |
| 6 | 34 | [0, 5, 9, 10, 11, 13, 15, 19, 20, 21, 30, 35, ... | [19, 69, 53, 30, 37, 15] | 1.000000 | 0.000000 | 0.845318 | 0.034718 |
| 7 | 28 | [0, 5, 9, 10, 11, 13, 20, 21, 35, 40, 43, 45, ... | [21, 43, 98, 57, 0] | 1.000000 | 0.000000 | 0.847304 | 0.029020 |
| 8 | 23 | [5, 9, 10, 11, 13, 20, 35, 40, 45, 46, 47, 51,... | [45, 20, 84, 88] | 1.000000 | 0.000000 | 0.863716 | 0.027382 |
| 9 | 19 | [5, 9, 10, 11, 13, 35, 40, 46, 47, 51, 54, 58,... | [13, 76, 92] | 0.972956 | 0.005839 | 0.815100 | 0.035161 |
| 10 | 16 | [5, 9, 10, 11, 35, 40, 46, 47, 51, 54, 58, 65,... | [47, 51, 35] | 0.969608 | 0.003283 | 0.823234 | 0.055277 |
| 11 | 13 | [5, 9, 10, 11, 40, 46, 54, 58, 65, 73, 81, 85,... | [46, 58] | 0.962777 | 0.011050 | 0.800052 | 0.048493 |
| 12 | 11 | [5, 9, 10, 11, 40, 54, 65, 73, 81, 85, 91] | [9, 91] | 0.956023 | 0.008971 | 0.814270 | 0.051047 |
| 13 | 9 | [5, 10, 11, 40, 54, 65, 73, 81, 85] | [10] | 0.951823 | 0.009062 | 0.804158 | 0.079721 |
| 14 | 8 | [5, 11, 40, 54, 65, 73, 81, 85] | [5] | 0.930154 | 0.008260 | 0.770472 | 0.048131 |
| 15 | 7 | [11, 40, 54, 65, 73, 81, 85] | [81] | 0.906913 | 0.008914 | 0.762450 | 0.029873 |
| 16 | 6 | [11, 40, 54, 65, 73, 85] | [54] | 0.894709 | 0.009730 | 0.743937 | 0.029733 |
| 17 | 5 | [11, 40, 65, 73, 85] | [] | 0.875344 | 0.012642 | 0.725548 | 0.026652 |
In [5]:
report_without_penalty
Out[5]:
|    | num_features | features_set | eliminated_features | train_metric_mean | train_metric_std | val_metric_mean | val_metric_std |
|---|---|---|---|---|---|---|---|
| 1 | 100 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [48, 23, 25, 17, 41, 79, 70, 67, 96, 95, 52, 5... | 1.000000 | 0.000000 | 0.783734 | 0.036136 |
| 2 | 80 | [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14... | [29, 44, 31, 80, 34, 42, 60, 87, 77, 75, 64, 7... | 1.000000 | 0.000000 | 0.818636 | 0.027409 |
| 3 | 64 | [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 1... | [63, 59, 83, 12, 38, 90, 93, 16, 49, 94, 8, 1] | 1.000000 | 0.000000 | 0.809475 | 0.040263 |
| 4 | 52 | [0, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 18, 19,... | [62, 14, 36, 18, 3, 4, 24, 74, 82, 89] | 1.000000 | 0.000000 | 0.825634 | 0.027513 |
| 5 | 42 | [0, 2, 5, 9, 10, 11, 13, 15, 19, 20, 21, 22, 3... | [2, 66, 68, 39, 71, 72, 22, 99] | 1.000000 | 0.000000 | 0.858765 | 0.031187 |
| 6 | 34 | [0, 5, 9, 10, 11, 13, 15, 19, 20, 21, 30, 35, ... | [19, 69, 53, 30, 37, 15] | 1.000000 | 0.000000 | 0.845318 | 0.034718 |
| 7 | 28 | [0, 5, 9, 10, 11, 13, 20, 21, 35, 40, 43, 45, ... | [21, 43, 98, 57, 0] | 1.000000 | 0.000000 | 0.847304 | 0.029020 |
| 8 | 23 | [5, 9, 10, 11, 13, 20, 35, 40, 45, 46, 47, 51,... | [45, 20, 84, 88] | 1.000000 | 0.000000 | 0.863716 | 0.027382 |
| 9 | 19 | [5, 9, 10, 11, 13, 35, 40, 46, 47, 51, 54, 58,... | [13, 76, 92] | 0.972956 | 0.005839 | 0.815100 | 0.035161 |
| 10 | 16 | [5, 9, 10, 11, 35, 40, 46, 47, 51, 54, 58, 65,... | [47, 51, 35] | 0.969608 | 0.003283 | 0.823234 | 0.055277 |
| 11 | 13 | [5, 9, 10, 11, 40, 46, 54, 58, 65, 73, 81, 85,... | [46, 58] | 0.962777 | 0.011050 | 0.800052 | 0.048493 |
| 12 | 11 | [5, 9, 10, 11, 40, 54, 65, 73, 81, 85, 91] | [9, 91] | 0.956023 | 0.008971 | 0.814270 | 0.051047 |
| 13 | 9 | [5, 10, 11, 40, 54, 65, 73, 81, 85] | [10] | 0.951823 | 0.009062 | 0.804158 | 0.079721 |
| 14 | 8 | [5, 11, 40, 54, 65, 73, 81, 85] | [5] | 0.930154 | 0.008260 | 0.770472 | 0.048131 |
| 15 | 7 | [11, 40, 54, 65, 73, 81, 85] | [81] | 0.906913 | 0.008914 | 0.762450 | 0.029873 |
| 16 | 6 | [11, 40, 54, 65, 73, 85] | [54] | 0.894709 | 0.009730 | 0.743937 | 0.029733 |
| 17 | 5 | [11, 40, 65, 73, 85] | [] | 0.875344 | 0.012642 | 0.725548 | 0.026652 |
| 1 | 100 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [80, 4, 64, 33, 14, 87, 48, 36, 56, 6, 18, 29,... | 1.000000 | 0.000000 | 0.783734 | 0.036136 |
| 2 | 80 | [0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 12, 13, 15, 1... | [77, 19, 2, 31, 57, 86, 26, 37, 59, 68, 72, 63... | 1.000000 | 0.000000 | 0.826010 | 0.033883 |
| 3 | 64 | [0, 1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, ... | [34, 15, 12, 83, 55, 8, 38, 75, 44, 53, 78, 69] | 1.000000 | 0.000000 | 0.840406 | 0.004867 |
| 4 | 52 | [0, 1, 3, 5, 7, 9, 10, 11, 13, 16, 17, 20, 21,... | [17, 89, 93, 3, 23, 62, 60, 1, 49, 96] | 1.000000 | 0.000000 | 0.834353 | 0.023681 |
| 5 | 42 | [0, 5, 7, 9, 10, 11, 13, 16, 20, 21, 22, 27, 2... | [43, 71, 0, 84, 7, 97, 98, 88] | 1.000000 | 0.000000 | 0.845624 | 0.017201 |
| 6 | 34 | [5, 9, 10, 11, 13, 16, 20, 21, 22, 27, 28, 32,... | [90, 21, 16, 70, 27, 95] | 1.000000 | 0.000000 | 0.863096 | 0.030650 |
| 7 | 28 | [5, 9, 10, 11, 13, 20, 22, 28, 32, 35, 40, 45,... | [28, 46, 61, 20, 94] | 1.000000 | 0.000000 | 0.856743 | 0.036105 |
| 8 | 23 | [5, 9, 10, 11, 13, 22, 32, 35, 40, 45, 47, 51,... | [82, 32, 74, 13] | 1.000000 | 0.000000 | 0.858418 | 0.031434 |
| 9 | 19 | [5, 9, 10, 11, 22, 35, 40, 45, 47, 51, 54, 58,... | [92, 58, 35] | 0.978937 | 0.004357 | 0.857832 | 0.040339 |
| 10 | 16 | [5, 9, 10, 11, 22, 40, 45, 47, 51, 54, 65, 73,... | [47, 76, 91] | 0.973436 | 0.004632 | 0.856597 | 0.044714 |
| 11 | 13 | [5, 9, 10, 11, 22, 40, 45, 51, 54, 65, 73, 81,... | [9, 45] | 0.966317 | 0.006283 | 0.836464 | 0.069308 |
| 12 | 11 | [5, 10, 11, 22, 40, 51, 54, 65, 73, 81, 85] | [11, 81] | 0.958512 | 0.007243 | 0.826117 | 0.055609 |
| 13 | 9 | [5, 10, 22, 40, 51, 54, 65, 73, 85] | [51] | 0.946004 | 0.011688 | 0.807256 | 0.066020 |
| 14 | 8 | [5, 10, 22, 40, 54, 65, 73, 85] | [22] | 0.932403 | 0.010371 | 0.796027 | 0.049649 |
| 15 | 7 | [5, 10, 40, 54, 65, 73, 85] | [5] | 0.918967 | 0.005895 | 0.797665 | 0.054531 |
| 16 | 6 | [10, 40, 54, 65, 73, 85] | [54] | 0.905771 | 0.006757 | 0.785010 | 0.050255 |
| 17 | 5 | [10, 40, 65, 73, 85] | [] | 0.876856 | 0.006382 | 0.725792 | 0.058888 |

Note that this report contains 34 rows: the first 17 are identical to the penalized report above, which suggests the report accumulates across the two fit_compute calls on the same ShapRFECV object. The final 17 rows correspond to the run with the penalty disabled.
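To turn a report like this into an actual feature subset, pick the iteration with the best mean validation score and ask the fitted object for that feature set. The sketch below assumes one fit_compute call per ShapRFECV instance (the object only keeps the state of its most recent fit) and that your probatus version provides get_reduced_features_set, as recent releases do:

# Sketch assuming a freshly fitted ShapRFECV instance.
report = shap_elimination.fit_compute(X, y, shap_variance_penalty_factor=1.0)

# Row with the best mean validation F1 (argmax gives the positional index).
best_idx = report["val_metric_mean"].argmax()
best_num_features = report["num_features"].iloc[best_idx]

# Feature names kept at that elimination step.
selected_features = shap_elimination.get_reduced_features_set(num_features=best_num_features)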
Which approach is better?
Let's compare a few different configurations of ShapRFECV.
In [6]:
# Compare A: shap_variance_penalty_factor=0 (disabled) & approximate=True
# vs B: shap_variance_penalty_factor=0.5 & approximate=True
num_simulations = 5
results = []


def get_best_idx(shap_report):
    shap_report["eval_metric"] = shap_report["val_metric_mean"]
    best_iteration_idx = shap_report["eval_metric"].argmax()
    return best_iteration_idx


for i in range(num_simulations):
    # Params
    n_samples = np.random.randint(100, 500)
    n_features = 200
    n_informative = np.random.randint(10, 200)
    test_size = np.random.uniform(0.05, 0.5)

    # Create data
    X, y = make_classification(n_samples=n_samples, n_informative=n_informative, n_features=n_features)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Model
    model = CatBoostClassifier(n_estimators=100, verbose=0)

    # A: best score from ShapRFECV WITHOUT penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_a = shap_elimination.fit_compute(
        X_train, y_train, shap_variance_penalty_factor=0, approximate=True, check_additivity=False
    )
    best_idx_a = get_best_idx(report_a)
    best_score_a = report_a["val_metric_mean"].iloc[best_idx_a]
    std_a = report_a["val_metric_std"].iloc[best_idx_a]
    num_features_a = report_a["num_features"].iloc[best_idx_a]

    # B: best score from ShapRFECV WITH penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_b = shap_elimination.fit_compute(
        X_train, y_train, shap_variance_penalty_factor=0.5, approximate=True, check_additivity=False
    )
    best_idx_b = get_best_idx(report_b)
    best_score_b = report_b["val_metric_mean"].iloc[best_idx_b]
    std_b = report_b["val_metric_std"].iloc[best_idx_b]
    num_features_b = report_b["num_features"].iloc[best_idx_b]

    results.append(
        [best_score_a, std_a, num_features_a, best_score_b, std_b, num_features_b, n_samples, n_features, n_informative]
    )

results_df = pd.DataFrame(
    results,
    columns=[
        "best_score_a",
        "std_a",
        "num_features_a",
        "best_score_b",
        "std_b",
        "num_features_b",
        "n_samples",
        "n_features",
        "n_informative",
    ],
)

# Show results
results_df
Out[6]:
|    | best_score_a | std_a | num_features_a | best_score_b | std_b | num_features_b | n_samples | n_features | n_informative |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.892217 | 0.028543 | 8 | 0.894579 | 0.013364 | 20 | 188 | 200 | 13 |
| 1 | 0.797291 | 0.036200 | 24 | 0.801302 | 0.040278 | 36 | 404 | 200 | 141 |
| 2 | 0.741265 | 0.021340 | 29 | 0.701914 | 0.028287 | 54 | 499 | 200 | 180 |
| 3 | 0.786747 | 0.087275 | 8 | 0.787802 | 0.065435 | 24 | 179 | 200 | 183 |
| 4 | 0.834444 | 0.067287 | 7 | 0.770303 | 0.091079 | 5 | 176 | 200 | 198 |
In [7]:
# Compare A: shap_variance_penalty_factor=0 (disabled) & approximate=False
# vs B: shap_variance_penalty_factor=0.5 & approximate=False
num_simulations = 5
results = []

for i in range(num_simulations):
    # Params
    n_samples = np.random.randint(100, 500)
    n_features = 200
    n_informative = np.random.randint(10, 200)
    test_size = np.random.uniform(0.05, 0.5)

    # Create data
    X, y = make_classification(n_samples=n_samples, n_informative=n_informative, n_features=n_features)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Model
    model = CatBoostClassifier(n_estimators=100, verbose=0)

    # A: best score from ShapRFECV WITHOUT penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_a = shap_elimination.fit_compute(X_train, y_train, shap_variance_penalty_factor=0, approximate=False)
    best_idx_a = get_best_idx(report_a)
    best_score_a = report_a["val_metric_mean"].iloc[best_idx_a]
    std_a = report_a["val_metric_std"].iloc[best_idx_a]
    num_features_a = report_a["num_features"].iloc[best_idx_a]

    # B: best score from ShapRFECV WITH penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_b = shap_elimination.fit_compute(X_train, y_train, shap_variance_penalty_factor=0.5, approximate=False)
    best_idx_b = get_best_idx(report_b)
    best_score_b = report_b["val_metric_mean"].iloc[best_idx_b]
    std_b = report_b["val_metric_std"].iloc[best_idx_b]
    num_features_b = report_b["num_features"].iloc[best_idx_b]

    results.append(
        [best_score_a, std_a, num_features_a, best_score_b, std_b, num_features_b, n_samples, n_features, n_informative]
    )

results_df = pd.DataFrame(
    results,
    columns=[
        "best_score_a",
        "std_a",
        "num_features_a",
        "best_score_b",
        "std_b",
        "num_features_b",
        "n_samples",
        "n_features",
        "n_informative",
    ],
)

# Show results
results_df
Out[7]:
|    | best_score_a | std_a | num_features_a | best_score_b | std_b | num_features_b | n_samples | n_features | n_informative |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.742664 | 0.091943 | 11 | 0.773073 | 0.095611 | 13 | 250 | 200 | 43 |
| 1 | 0.829656 | 0.052365 | 20 | 0.798127 | 0.053808 | 29 | 327 | 200 | 24 |
| 2 | 0.724558 | 0.043146 | 83 | 0.746103 | 0.022388 | 20 | 394 | 200 | 179 |
| 3 | 0.822537 | 0.044845 | 36 | 0.825153 | 0.038366 | 29 | 479 | 200 | 60 |
| 4 | 0.729214 | 0.038897 | 83 | 0.731563 | 0.024997 | 54 | 485 | 200 | 176 |
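A quick way to read a simulation table like the ones above is to count how often each configuration wins on the best validation score. This works directly on the results_df produced in the previous cell:

# How often did the penalized run (B) beat the unpenalized run (A)?
wins_b = (results_df["best_score_b"] > results_df["best_score_a"]).sum()
print(f"B won {wins_b} of {len(results_df)} simulations")

In the two small experiments shown here the results are mixed, so treat shap_variance_penalty_factor as a hyperparameter to validate on your own data rather than a guaranteed improvement.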