SHAP variance penalty
When ShapRFECV computes feature importance before eliminating features, it averages each feature's SHAP values to estimate that feature's overall importance. In some situations the variance of these SHAP values can be high, which suggests a lack of agreement about the feature's importance. For such cases, probatus lets you penalize features whose SHAP values have higher variance.
By setting the shap_variance_penalty_factor parameter of the fit_compute() method, the averaged SHAP importance is penalized as follows:

penalized_shap_mean = mean_shap - (shap_variance_penalty_factor * shap_std)

where mean_shap and shap_std are the mean and standard deviation of a feature's SHAP values across samples.
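To make the aggregation concrete, here is a minimal NumPy sketch of the penalized average. The (n_samples, n_features) array layout and the use of absolute SHAP values are illustrative assumptions, not probatus internals:

import numpy as np

# Hypothetical SHAP matrix: one value per sample (rows) and feature (columns).
rng = np.random.default_rng(0)
shap_values = rng.normal(size=(1000, 3))
shap_variance_penalty_factor = 1.0

abs_shap = np.abs(shap_values)
mean_shap = abs_shap.mean(axis=0)  # plain importance: mean(|shap|) per feature
shap_std = abs_shap.std(axis=0)    # spread of |shap| per feature

# Noisier features are pushed down the ranking by the penalty.
penalized_shap_mean = mean_shap - shap_variance_penalty_factor * shap_std

Under this scheme, a feature whose SHAP values are large on average but inconsistent across samples can rank below one with a smaller but more stable contribution.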
See example below:
In [1]:
%%capture
!pip install probatus
!pip install catboost
In [2]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
In [3]:
X, y = make_classification(n_samples=500, n_informative=20, n_features=100)
model = CatBoostClassifier(n_estimators=100, verbose=0)
shap_elimination = ShapRFECV(model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1)
report_with_penalty = shap_elimination.fit_compute(X, y, shap_variance_penalty_factor=1.0)
report_without_penalty = shap_elimination.fit_compute(X, y, shap_variance_penalty_factor=0)
In [4]:
report_with_penalty
Out[4]:
|    | num_features | features_set | eliminated_features | train_metric_mean | train_metric_std | val_metric_mean | val_metric_std |
|---|---|---|---|---|---|---|---|
| 1 | 100 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [48, 23, 25, 17, 41, 79, 70, 67, 96, 95, 52, 5... | 1.000000 | 0.000000 | 0.783734 | 0.036136 |
| 2 | 80 | [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14... | [29, 44, 31, 80, 34, 42, 60, 87, 77, 75, 64, 7... | 1.000000 | 0.000000 | 0.818636 | 0.027409 |
| 3 | 64 | [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 1... | [63, 59, 83, 12, 38, 90, 93, 16, 49, 94, 8, 1] | 1.000000 | 0.000000 | 0.809475 | 0.040263 |
| 4 | 52 | [0, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 18, 19,... | [62, 14, 36, 18, 3, 4, 24, 74, 82, 89] | 1.000000 | 0.000000 | 0.825634 | 0.027513 |
| 5 | 42 | [0, 2, 5, 9, 10, 11, 13, 15, 19, 20, 21, 22, 3... | [2, 66, 68, 39, 71, 72, 22, 99] | 1.000000 | 0.000000 | 0.858765 | 0.031187 |
| 6 | 34 | [0, 5, 9, 10, 11, 13, 15, 19, 20, 21, 30, 35, ... | [19, 69, 53, 30, 37, 15] | 1.000000 | 0.000000 | 0.845318 | 0.034718 |
| 7 | 28 | [0, 5, 9, 10, 11, 13, 20, 21, 35, 40, 43, 45, ... | [21, 43, 98, 57, 0] | 1.000000 | 0.000000 | 0.847304 | 0.029020 |
| 8 | 23 | [5, 9, 10, 11, 13, 20, 35, 40, 45, 46, 47, 51,... | [45, 20, 84, 88] | 1.000000 | 0.000000 | 0.863716 | 0.027382 |
| 9 | 19 | [5, 9, 10, 11, 13, 35, 40, 46, 47, 51, 54, 58,... | [13, 76, 92] | 0.972956 | 0.005839 | 0.815100 | 0.035161 |
| 10 | 16 | [5, 9, 10, 11, 35, 40, 46, 47, 51, 54, 58, 65,... | [47, 51, 35] | 0.969608 | 0.003283 | 0.823234 | 0.055277 |
| 11 | 13 | [5, 9, 10, 11, 40, 46, 54, 58, 65, 73, 81, 85,... | [46, 58] | 0.962777 | 0.011050 | 0.800052 | 0.048493 |
| 12 | 11 | [5, 9, 10, 11, 40, 54, 65, 73, 81, 85, 91] | [9, 91] | 0.956023 | 0.008971 | 0.814270 | 0.051047 |
| 13 | 9 | [5, 10, 11, 40, 54, 65, 73, 81, 85] | [10] | 0.951823 | 0.009062 | 0.804158 | 0.079721 |
| 14 | 8 | [5, 11, 40, 54, 65, 73, 81, 85] | [5] | 0.930154 | 0.008260 | 0.770472 | 0.048131 |
| 15 | 7 | [11, 40, 54, 65, 73, 81, 85] | [81] | 0.906913 | 0.008914 | 0.762450 | 0.029873 |
| 16 | 6 | [11, 40, 54, 65, 73, 85] | [54] | 0.894709 | 0.009730 | 0.743937 | 0.029733 |
| 17 | 5 | [11, 40, 65, 73, 85] | [] | 0.875344 | 0.012642 | 0.725548 | 0.026652 |
In [5]:
report_without_penalty
Out[5]:
|    | num_features | features_set | eliminated_features | train_metric_mean | train_metric_std | val_metric_mean | val_metric_std |
|---|---|---|---|---|---|---|---|
| 1 | 100 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [48, 23, 25, 17, 41, 79, 70, 67, 96, 95, 52, 5... | 1.000000 | 0.000000 | 0.783734 | 0.036136 |
| 2 | 80 | [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14... | [29, 44, 31, 80, 34, 42, 60, 87, 77, 75, 64, 7... | 1.000000 | 0.000000 | 0.818636 | 0.027409 |
| 3 | 64 | [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 1... | [63, 59, 83, 12, 38, 90, 93, 16, 49, 94, 8, 1] | 1.000000 | 0.000000 | 0.809475 | 0.040263 |
| 4 | 52 | [0, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 18, 19,... | [62, 14, 36, 18, 3, 4, 24, 74, 82, 89] | 1.000000 | 0.000000 | 0.825634 | 0.027513 |
| 5 | 42 | [0, 2, 5, 9, 10, 11, 13, 15, 19, 20, 21, 22, 3... | [2, 66, 68, 39, 71, 72, 22, 99] | 1.000000 | 0.000000 | 0.858765 | 0.031187 |
| 6 | 34 | [0, 5, 9, 10, 11, 13, 15, 19, 20, 21, 30, 35, ... | [19, 69, 53, 30, 37, 15] | 1.000000 | 0.000000 | 0.845318 | 0.034718 |
| 7 | 28 | [0, 5, 9, 10, 11, 13, 20, 21, 35, 40, 43, 45, ... | [21, 43, 98, 57, 0] | 1.000000 | 0.000000 | 0.847304 | 0.029020 |
| 8 | 23 | [5, 9, 10, 11, 13, 20, 35, 40, 45, 46, 47, 51,... | [45, 20, 84, 88] | 1.000000 | 0.000000 | 0.863716 | 0.027382 |
| 9 | 19 | [5, 9, 10, 11, 13, 35, 40, 46, 47, 51, 54, 58,... | [13, 76, 92] | 0.972956 | 0.005839 | 0.815100 | 0.035161 |
| 10 | 16 | [5, 9, 10, 11, 35, 40, 46, 47, 51, 54, 58, 65,... | [47, 51, 35] | 0.969608 | 0.003283 | 0.823234 | 0.055277 |
| 11 | 13 | [5, 9, 10, 11, 40, 46, 54, 58, 65, 73, 81, 85,... | [46, 58] | 0.962777 | 0.011050 | 0.800052 | 0.048493 |
| 12 | 11 | [5, 9, 10, 11, 40, 54, 65, 73, 81, 85, 91] | [9, 91] | 0.956023 | 0.008971 | 0.814270 | 0.051047 |
| 13 | 9 | [5, 10, 11, 40, 54, 65, 73, 81, 85] | [10] | 0.951823 | 0.009062 | 0.804158 | 0.079721 |
| 14 | 8 | [5, 11, 40, 54, 65, 73, 81, 85] | [5] | 0.930154 | 0.008260 | 0.770472 | 0.048131 |
| 15 | 7 | [11, 40, 54, 65, 73, 81, 85] | [81] | 0.906913 | 0.008914 | 0.762450 | 0.029873 |
| 16 | 6 | [11, 40, 54, 65, 73, 85] | [54] | 0.894709 | 0.009730 | 0.743937 | 0.029733 |
| 17 | 5 | [11, 40, 65, 73, 85] | [] | 0.875344 | 0.012642 | 0.725548 | 0.026652 |
| 1 | 100 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [80, 4, 64, 33, 14, 87, 48, 36, 56, 6, 18, 29,... | 1.000000 | 0.000000 | 0.783734 | 0.036136 |
| 2 | 80 | [0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 12, 13, 15, 1... | [77, 19, 2, 31, 57, 86, 26, 37, 59, 68, 72, 63... | 1.000000 | 0.000000 | 0.826010 | 0.033883 |
| 3 | 64 | [0, 1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, ... | [34, 15, 12, 83, 55, 8, 38, 75, 44, 53, 78, 69] | 1.000000 | 0.000000 | 0.840406 | 0.004867 |
| 4 | 52 | [0, 1, 3, 5, 7, 9, 10, 11, 13, 16, 17, 20, 21,... | [17, 89, 93, 3, 23, 62, 60, 1, 49, 96] | 1.000000 | 0.000000 | 0.834353 | 0.023681 |
| 5 | 42 | [0, 5, 7, 9, 10, 11, 13, 16, 20, 21, 22, 27, 2... | [43, 71, 0, 84, 7, 97, 98, 88] | 1.000000 | 0.000000 | 0.845624 | 0.017201 |
| 6 | 34 | [5, 9, 10, 11, 13, 16, 20, 21, 22, 27, 28, 32,... | [90, 21, 16, 70, 27, 95] | 1.000000 | 0.000000 | 0.863096 | 0.030650 |
| 7 | 28 | [5, 9, 10, 11, 13, 20, 22, 28, 32, 35, 40, 45,... | [28, 46, 61, 20, 94] | 1.000000 | 0.000000 | 0.856743 | 0.036105 |
| 8 | 23 | [5, 9, 10, 11, 13, 22, 32, 35, 40, 45, 47, 51,... | [82, 32, 74, 13] | 1.000000 | 0.000000 | 0.858418 | 0.031434 |
| 9 | 19 | [5, 9, 10, 11, 22, 35, 40, 45, 47, 51, 54, 58,... | [92, 58, 35] | 0.978937 | 0.004357 | 0.857832 | 0.040339 |
| 10 | 16 | [5, 9, 10, 11, 22, 40, 45, 47, 51, 54, 65, 73,... | [47, 76, 91] | 0.973436 | 0.004632 | 0.856597 | 0.044714 |
| 11 | 13 | [5, 9, 10, 11, 22, 40, 45, 51, 54, 65, 73, 81,... | [9, 45] | 0.966317 | 0.006283 | 0.836464 | 0.069308 |
| 12 | 11 | [5, 10, 11, 22, 40, 51, 54, 65, 73, 81, 85] | [11, 81] | 0.958512 | 0.007243 | 0.826117 | 0.055609 |
| 13 | 9 | [5, 10, 22, 40, 51, 54, 65, 73, 85] | [51] | 0.946004 | 0.011688 | 0.807256 | 0.066020 |
| 14 | 8 | [5, 10, 22, 40, 54, 65, 73, 85] | [22] | 0.932403 | 0.010371 | 0.796027 | 0.049649 |
| 15 | 7 | [5, 10, 40, 54, 65, 73, 85] | [5] | 0.918967 | 0.005895 | 0.797665 | 0.054531 |
| 16 | 6 | [10, 40, 54, 65, 73, 85] | [54] | 0.905771 | 0.006757 | 0.785010 | 0.050255 |
| 17 | 5 | [10, 40, 65, 73, 85] | [] | 0.876856 | 0.006382 | 0.725792 | 0.058888 |

Note that this report contains 34 rows: the first 17 are identical to the penalized report above, which suggests the report accumulates across the two fit_compute calls on the same ShapRFECV object. The final 17 rows correspond to the run with the penalty disabled.
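To turn a report like this into an actual feature subset, pick the iteration with the best mean validation score and ask the fitted object for that feature set. The sketch below assumes one fit_compute call per ShapRFECV instance (the object only keeps the state of its most recent fit) and that your probatus version provides get_reduced_features_set, as recent releases do:

# Sketch assuming a freshly fitted ShapRFECV instance.
report = shap_elimination.fit_compute(X, y, shap_variance_penalty_factor=1.0)

# Row with the best mean validation F1 (argmax gives the positional index).
best_idx = report["val_metric_mean"].argmax()
best_num_features = report["num_features"].iloc[best_idx]

# Feature names kept at that elimination step.
selected_features = shap_elimination.get_reduced_features_set(num_features=best_num_features)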
Which approach is better?
Let's compare a few different configurations of ShapRFECV.
In [6]:
# Compare A: shap_variance_penalty_factor=0 (disabled) & approximate=True
# vs B: shap_variance_penalty_factor=0.5 & approximate=True
num_simulations = 5
results = []


def get_best_idx(shap_report):
    shap_report["eval_metric"] = shap_report["val_metric_mean"]
    best_iteration_idx = shap_report["eval_metric"].argmax()
    return best_iteration_idx


for i in range(num_simulations):
    # Params
    n_samples = np.random.randint(100, 500)
    n_features = 200
    n_informative = np.random.randint(10, 200)
    test_size = np.random.uniform(0.05, 0.5)

    # Create data
    X, y = make_classification(n_samples=n_samples, n_informative=n_informative, n_features=n_features)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Model
    model = CatBoostClassifier(n_estimators=100, verbose=0)

    # A: best score from ShapRFECV WITHOUT penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_a = shap_elimination.fit_compute(
        X_train, y_train, shap_variance_penalty_factor=0, approximate=True, check_additivity=False
    )
    best_idx_a = get_best_idx(report_a)
    best_score_a = report_a["val_metric_mean"].iloc[best_idx_a]
    std_a = report_a["val_metric_std"].iloc[best_idx_a]
    num_features_a = report_a["num_features"].iloc[best_idx_a]

    # B: best score from ShapRFECV WITH penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_b = shap_elimination.fit_compute(
        X_train, y_train, shap_variance_penalty_factor=0.5, approximate=True, check_additivity=False
    )
    best_idx_b = get_best_idx(report_b)
    best_score_b = report_b["val_metric_mean"].iloc[best_idx_b]
    std_b = report_b["val_metric_std"].iloc[best_idx_b]
    num_features_b = report_b["num_features"].iloc[best_idx_b]

    results.append(
        [best_score_a, std_a, num_features_a, best_score_b, std_b, num_features_b, n_samples, n_features, n_informative]
    )

results_df = pd.DataFrame(
    results,
    columns=[
        "best_score_a",
        "std_a",
        "num_features_a",
        "best_score_b",
        "std_b",
        "num_features_b",
        "n_samples",
        "n_features",
        "n_informative",
    ],
)

# Show results
results_df
Out[6]:
|    | best_score_a | std_a | num_features_a | best_score_b | std_b | num_features_b | n_samples | n_features | n_informative |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.892217 | 0.028543 | 8 | 0.894579 | 0.013364 | 20 | 188 | 200 | 13 |
| 1 | 0.797291 | 0.036200 | 24 | 0.801302 | 0.040278 | 36 | 404 | 200 | 141 |
| 2 | 0.741265 | 0.021340 | 29 | 0.701914 | 0.028287 | 54 | 499 | 200 | 180 |
| 3 | 0.786747 | 0.087275 | 8 | 0.787802 | 0.065435 | 24 | 179 | 200 | 183 |
| 4 | 0.834444 | 0.067287 | 7 | 0.770303 | 0.091079 | 5 | 176 | 200 | 198 |
In [7]:
# Compare A: shap_variance_penalty_factor=0 (disabled) & approximate=False
# vs B: shap_variance_penalty_factor=0.5 & approximate=False
num_simulations = 5
results = []

for i in range(num_simulations):
    # Params
    n_samples = np.random.randint(100, 500)
    n_features = 200
    n_informative = np.random.randint(10, 200)
    test_size = np.random.uniform(0.05, 0.5)

    # Create data
    X, y = make_classification(n_samples=n_samples, n_informative=n_informative, n_features=n_features)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Model
    model = CatBoostClassifier(n_estimators=100, verbose=0)

    # A: best score from ShapRFECV WITHOUT penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_a = shap_elimination.fit_compute(X_train, y_train, shap_variance_penalty_factor=0, approximate=False)
    best_idx_a = get_best_idx(report_a)
    best_score_a = report_a["val_metric_mean"].iloc[best_idx_a]
    std_a = report_a["val_metric_std"].iloc[best_idx_a]
    num_features_a = report_a["num_features"].iloc[best_idx_a]

    # B: best score from ShapRFECV WITH penalization
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_b = shap_elimination.fit_compute(X_train, y_train, shap_variance_penalty_factor=0.5, approximate=False)
    best_idx_b = get_best_idx(report_b)
    best_score_b = report_b["val_metric_mean"].iloc[best_idx_b]
    std_b = report_b["val_metric_std"].iloc[best_idx_b]
    num_features_b = report_b["num_features"].iloc[best_idx_b]

    results.append(
        [best_score_a, std_a, num_features_a, best_score_b, std_b, num_features_b, n_samples, n_features, n_informative]
    )

results_df = pd.DataFrame(
    results,
    columns=[
        "best_score_a",
        "std_a",
        "num_features_a",
        "best_score_b",
        "std_b",
        "num_features_b",
        "n_samples",
        "n_features",
        "n_informative",
    ],
)

# Show results
results_df
Out[7]:
|    | best_score_a | std_a | num_features_a | best_score_b | std_b | num_features_b | n_samples | n_features | n_informative |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.742664 | 0.091943 | 11 | 0.773073 | 0.095611 | 13 | 250 | 200 | 43 |
| 1 | 0.829656 | 0.052365 | 20 | 0.798127 | 0.053808 | 29 | 327 | 200 | 24 |
| 2 | 0.724558 | 0.043146 | 83 | 0.746103 | 0.022388 | 20 | 394 | 200 | 179 |
| 3 | 0.822537 | 0.044845 | 36 | 0.825153 | 0.038366 | 29 | 479 | 200 | 60 |
| 4 | 0.729214 | 0.038897 | 83 | 0.731563 | 0.024997 | 54 | 485 | 200 | 176 |
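A quick way to read a simulation table like the ones above is to count how often each configuration wins on the best validation score. This works directly on the results_df produced in the previous cell:

# How often did the penalized run (B) beat the unpenalized run (A)?
wins_b = (results_df["best_score_b"] > results_df["best_score_a"]).sum()
print(f"B won {wins_b} of {len(results_df)} simulations")

In the two small experiments shown here the results are mixed, so treat shap_variance_penalty_factor as a hyperparameter to validate on your own data rather than a guaranteed improvement.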