Skip to content

LogisticRegression

Bases: lm.LogisticRegression

Extended Logistic Regression.

Extends sklearn.linear_model.LogisticRegression.

This class provides the following extra statistics, calculated on .fit() and accessible via .get_stats():

  • cov_matrix_: covariance matrix for the estimated parameters.
  • std_err_intercept_: estimated uncertainty for the intercept
  • std_err_coef_: estimated uncertainty for the coefficients
  • z_intercept_: estimated z-statistic for the intercept
  • z_coef_: estimated z-statistic for the coefficients
  • p_val_intercept_: estimated p-value for the intercept
  • p_val_coef_: estimated p-value for the coefficients

Example:

from skorecard.datasets import load_uci_credit_card
from skorecard.bucketers import EqualFrequencyBucketer
from skorecard.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

X, y = load_uci_credit_card(return_X_y=True)

pipeline = Pipeline([
    ('bucketer', EqualFrequencyBucketer(n_bins=10)),
    ('clf', LogisticRegression(calculate_stats=True))
])
pipeline.fit(X, y)
assert pipeline.named_steps['clf'].p_val_coef_[0][0] > 0

pipeline.named_steps['clf'].get_stats()

An example output of .get_stats():

Index Coef. Std.Err z P>|z|
const -0.537571 0.096108 -5.593394 2.226735e-08
EDUCATION 0.010091 0.044874 0.224876 8.220757e-01
Source code in skorecard/linear_model/linear_model.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
class LogisticRegression(lm.LogisticRegression):
    """Extended Logistic Regression.

    Extends [sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

    This class provides the following extra statistics, calculated on `.fit()` and accessible via `.get_stats()`:

    - `cov_matrix_`: covariance matrix for the estimated parameters.
    - `std_err_intercept_`: estimated uncertainty for the intercept
    - `std_err_coef_`: estimated uncertainty for the coefficients
    - `z_intercept_`: estimated z-statistic for the intercept
    - `z_coef_`: estimated z-statistic for the coefficients
    - `p_val_intercept_`: estimated p-value for the intercept
    - `p_val_coef_`: estimated p-value for the coefficients

    The statistics are only computed when `calculate_stats=True` is passed
    either to the constructor or to `.fit()`.

    Example:

    ```python
    from skorecard.datasets import load_uci_credit_card
    from skorecard.bucketers import EqualFrequencyBucketer
    from skorecard.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    X, y = load_uci_credit_card(return_X_y=True)

    pipeline = Pipeline([
        ('bucketer', EqualFrequencyBucketer(n_bins=10)),
        ('clf', LogisticRegression(calculate_stats=True))
    ])
    pipeline.fit(X, y)
    assert pipeline.named_steps['clf'].p_val_coef_[0][0] > 0

    pipeline.named_steps['clf'].get_stats()
    ```

    An example output of `.get_stats()`:

    Index     | Coef.     | Std.Err  |   z       | Pz
    --------- | ----------| ---------| ----------| ------------
    const     | -0.537571 | 0.096108 | -5.593394 | 2.226735e-08
    EDUCATION | 0.010091  | 0.044874 | 0.224876  | 8.220757e-01

    (In the returned DataFrame the last column is labelled `P>|z|`.)

    """  # noqa

    def __init__(
        self,
        penalty="l2",
        calculate_stats=False,
        dual=False,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver="lbfgs",
        max_iter=100,
        multi_class="auto",
        verbose=0,
        warm_start=False,
        n_jobs=None,
        l1_ratio=None,
    ):
        """
        Extends the constructor of [sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

        Every argument except `calculate_stats` is forwarded unchanged to the parent constructor.

        Args:
            calculate_stats (bool): If true, calculate statistics like standard error during fit, accessible with .get_stats()
        """  # noqa
        super().__init__(
            penalty=penalty,
            dual=dual,
            tol=tol,
            C=C,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            random_state=random_state,
            solver=solver,
            max_iter=max_iter,
            multi_class=multi_class,
            verbose=verbose,
            warm_start=warm_start,
            n_jobs=n_jobs,
            l1_ratio=l1_ratio,
        )
        self.calculate_stats = calculate_stats

    def fit(self, X, y, sample_weight=None, calculate_stats=False, **kwargs):
        """
        Fit the model.

        Overwrites [sklearn.linear_model.LogisticRegression.fit()](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

        In addition to the standard fit by sklearn, this function will compute the covariance of the coefficients.
        The statistics are computed when either the constructor flag or the `calculate_stats` argument is true;
        otherwise the call is passed straight through to the parent `fit()`.

        Args:
            X (array-like, sparse matrix): Matrix of shape (n_samples, n_features)
                Training vector, where n_samples is the number of samples and
                n_features is the number of features.
            y (array-like): of shape (n_samples,)
                Target vector relative to X.
            sample_weight (array-like): of shape (n_samples,) default=None
                Array of weights that are assigned to individual samples.
                If not provided, then each sample is given unit weight.
                It is forwarded to the parent fit; the covariance computation itself does not use it.
            calculate_stats (bool): If true, calculate statistics like standard error during fit, accessible with .get_stats()

        Returns:
            self (LogisticRegression): Fitted estimator.
        """  # noqa

        if not self.calculate_stats and not calculate_stats:
            return super().fit(X, y, sample_weight=sample_weight, **kwargs)

        # NOTE(review): convert_sparse_matrix is defined outside this block;
        # presumably it densifies sparse input -- confirm against the helper.
        X = convert_sparse_matrix(X)

        # Row labels used by get_stats(): "const" first, then one per feature.
        if isinstance(X, pd.DataFrame):
            self.names_ = ["const"] + list(X.columns)
        else:
            self.names_ = ["const"] + [f"x{i}" for i in range(X.shape[1])]

        lr = super().fit(X, y, sample_weight=sample_weight, **kwargs)

        predProbs = self.predict_proba(X)

        # Design matrix -- add column of 1's at the beginning of your X matrix
        if lr.fit_intercept:
            X_design = np.hstack([np.ones((X.shape[0], 1)), X])
        else:
            X_design = X

        # Per-sample product of the predicted class probabilities (p * (1 - p)
        # when there are two classes). np.prod is used because the np.product
        # alias no longer exists in NumPy 2.0.
        p = np.prod(predProbs, axis=1)

        # Covariance of the estimates: inverse of X_design^T . diag(p) . X_design
        self.cov_matrix_ = np.linalg.inv((X_design * p[..., np.newaxis]).T @ X_design)
        std_err = np.sqrt(np.diag(self.cov_matrix_)).reshape(1, -1)

        # In case fit_intercept is set to True, then in the std_error array
        # Index 0 corresponds to the intercept, from index 1 onwards it relates to the coefficients
        # If fit intercept is False, then all the values are related to the coefficients
        if lr.fit_intercept:
            self.std_err_intercept_ = std_err[:, 0]
            self.std_err_coef_ = std_err[:, 1:][0]

            self.z_intercept_ = self.intercept_ / self.std_err_intercept_

            # Get p-values under the gaussian assumption
            self.p_val_intercept_ = scipy.stats.norm.sf(abs(self.z_intercept_)) * 2

        else:
            # No intercept was estimated, so its statistics are reported as NaN.
            self.std_err_intercept_ = np.array([np.nan])
            self.std_err_coef_ = std_err[0]

            self.z_intercept_ = np.array([np.nan])

            self.p_val_intercept_ = np.array([np.nan])

        # Two-sided p-values for the coefficients under the gaussian assumption
        self.z_coef_ = self.coef_ / self.std_err_coef_
        self.p_val_coef_ = scipy.stats.norm.sf(abs(self.z_coef_)) * 2

        return self

    def get_stats(self) -> pd.DataFrame:
        """
        Puts the summary statistics of the fit() function into a pandas DataFrame.

        The frame has the columns `Coef.`, `Std.Err`, `z` and `P>|z|`. Its first
        row is the intercept (labelled `const`), followed by one row per feature.
        Only the first row of `coef_`, `z_coef_` and `p_val_coef_` is reported.

        Returns:
            data (pandas DataFrame): The statistics dataframe, indexed by
                the column name

        Raises:
            AssertionError: If the model was fitted without calculating the statistics.
        """
        # The fitted-state check is delegated to check_is_fitted (defined outside this class).
        check_is_fitted(self)

        if not hasattr(self, "std_err_coef_"):
            msg = "Summary statistics were not calculated on .fit(). Options to fix:\n"
            msg += "\t- Re-fit using .fit(X, y, calculate_stats=True)\n"
            msg += "\t- Re-inititialize using LogisticRegression(calculate_stats=True)"
            raise AssertionError(msg)

        # Each column is the intercept value followed by the per-feature values.
        data = {
            "Coef.": (self.intercept_.tolist() + self.coef_.tolist()[0]),
            "Std.Err": (self.std_err_intercept_.tolist() + self.std_err_coef_.tolist()),
            "z": (self.z_intercept_.tolist() + self.z_coef_.tolist()[0]),
            "P>|z|": (self.p_val_intercept_.tolist() + self.p_val_coef_.tolist()[0]),
        }

        return pd.DataFrame(data, index=self.names_)

    def plot_weights(self):
        """
        Plots the relative importance of coefficients of the model.

        The statistics returned by `.get_stats()` are passed to `weight_plot`,
        and whatever `weight_plot` returns is handed back to the caller.

        Example:

        ```python
        from skorecard.datasets import load_uci_credit_card
        from skorecard.bucketers import EqualFrequencyBucketer
        from skorecard.linear_model import LogisticRegression
        from sklearn.pipeline import Pipeline

        X, y = load_uci_credit_card(return_X_y=True)
        pipeline = Pipeline([
            ('bucketer', EqualFrequencyBucketer(n_bins=10)),
            ('clf', LogisticRegression(calculate_stats=True))
        ])
        pipeline.fit(X, y)
        assert pipeline.named_steps['clf'].p_val_coef_[0][0] > 0
        stats = pipeline.named_steps['clf'].get_stats()
        pipeline.named_steps['clf'].plot_weights()
        ```
        """
        stats = self.get_stats()
        return weight_plot(stats)

__init__(penalty='l2', calculate_stats=False, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)

Extends the constructor of sklearn.linear_model.LogisticRegression.

Parameters:

Name Type Description Default
calculate_stats bool

If true, calculate statistics like standard error during fit, accessible with .get_stats()

False
Source code in skorecard/linear_model/linear_model.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def __init__(
    self,
    penalty="l2",
    calculate_stats=False,
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver="lbfgs",
    max_iter=100,
    multi_class="auto",
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
):
    """
    Extends the constructor of [sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

    Every argument except `calculate_stats` is handed to the parent constructor unchanged.

    Args:
        calculate_stats (bool): If true, calculate statistics like standard error during fit, accessible with .get_stats()
    """  # noqa
    # Settings that belong to the parent estimator.
    parent_settings = {
        "penalty": penalty,
        "dual": dual,
        "tol": tol,
        "C": C,
        "fit_intercept": fit_intercept,
        "intercept_scaling": intercept_scaling,
        "class_weight": class_weight,
        "random_state": random_state,
        "solver": solver,
        "max_iter": max_iter,
        "multi_class": multi_class,
        "verbose": verbose,
        "warm_start": warm_start,
        "n_jobs": n_jobs,
        "l1_ratio": l1_ratio,
    }
    super().__init__(**parent_settings)

    # The only setting added by this subclass; fit() reads it.
    self.calculate_stats = calculate_stats

fit(X, y, sample_weight=None, calculate_stats=False, **kwargs)

Fit the model.

Overwrites sklearn.linear_model.LogisticRegression.fit().

In addition to the standard fit by sklearn, this function will compute the covariance of the coefficients.

Parameters:

Name Type Description Default
X array-like, sparse matrix

Matrix of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features.

required
y array-like

of shape (n_samples,) Target vector relative to X.

required
sample_weight array-like

of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

None
calculate_stats bool

If true, calculate statistics like standard error during fit, accessible with .get_stats()

False

Returns:

Name Type Description
self LogisticRegression

Fitted estimator.

Source code in skorecard/linear_model/linear_model.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def fit(self, X, y, sample_weight=None, calculate_stats=False, **kwargs):
    """
    Fit the model.

    Overwrites [sklearn.linear_model.LogisticRegression.fit()](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

    In addition to the standard fit by sklearn, this function will compute the covariance of the coefficients.
    The statistics are computed when either the constructor flag or the `calculate_stats` argument is true;
    otherwise the call is passed straight through to the parent `fit()`.

    Args:
        X (array-like, sparse matrix): Matrix of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y (array-like): of shape (n_samples,)
            Target vector relative to X.
        sample_weight (array-like): of shape (n_samples,) default=None
            Array of weights that are assigned to individual samples.
            If not provided, then each sample is given unit weight.
            It is forwarded to the parent fit; the covariance computation itself does not use it.
        calculate_stats (bool): If true, calculate statistics like standard error during fit, accessible with .get_stats()

    Returns:
        self (LogisticRegression): Fitted estimator.
    """  # noqa

    if not self.calculate_stats and not calculate_stats:
        return super().fit(X, y, sample_weight=sample_weight, **kwargs)

    # NOTE(review): convert_sparse_matrix is defined outside this block;
    # presumably it densifies sparse input -- confirm against the helper.
    X = convert_sparse_matrix(X)

    # Row labels used by get_stats(): "const" first, then one per feature.
    if isinstance(X, pd.DataFrame):
        self.names_ = ["const"] + list(X.columns)
    else:
        self.names_ = ["const"] + [f"x{i}" for i in range(X.shape[1])]

    lr = super().fit(X, y, sample_weight=sample_weight, **kwargs)

    predProbs = self.predict_proba(X)

    # Design matrix -- add column of 1's at the beginning of your X matrix
    if lr.fit_intercept:
        X_design = np.hstack([np.ones((X.shape[0], 1)), X])
    else:
        X_design = X

    # Per-sample product of the predicted class probabilities (p * (1 - p)
    # when there are two classes). np.prod is used because the np.product
    # alias no longer exists in NumPy 2.0.
    p = np.prod(predProbs, axis=1)

    # Covariance of the estimates: inverse of X_design^T . diag(p) . X_design
    self.cov_matrix_ = np.linalg.inv((X_design * p[..., np.newaxis]).T @ X_design)
    std_err = np.sqrt(np.diag(self.cov_matrix_)).reshape(1, -1)

    # In case fit_intercept is set to True, then in the std_error array
    # Index 0 corresponds to the intercept, from index 1 onwards it relates to the coefficients
    # If fit intercept is False, then all the values are related to the coefficients
    if lr.fit_intercept:
        self.std_err_intercept_ = std_err[:, 0]
        self.std_err_coef_ = std_err[:, 1:][0]

        self.z_intercept_ = self.intercept_ / self.std_err_intercept_

        # Get p-values under the gaussian assumption
        self.p_val_intercept_ = scipy.stats.norm.sf(abs(self.z_intercept_)) * 2

    else:
        # No intercept was estimated, so its statistics are reported as NaN.
        self.std_err_intercept_ = np.array([np.nan])
        self.std_err_coef_ = std_err[0]

        self.z_intercept_ = np.array([np.nan])

        self.p_val_intercept_ = np.array([np.nan])

    # Two-sided p-values for the coefficients under the gaussian assumption
    self.z_coef_ = self.coef_ / self.std_err_coef_
    self.p_val_coef_ = scipy.stats.norm.sf(abs(self.z_coef_)) * 2

    return self

get_stats()

Puts the summary statistics of the fit() function into a pandas DataFrame.

Returns:

Name Type Description
data pandas DataFrame

The statistics dataframe, indexed by the column name

Source code in skorecard/linear_model/linear_model.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
def get_stats(self) -> pd.DataFrame:
    """
    Collect the statistics computed by fit() into a pandas DataFrame.

    The frame has the columns `Coef.`, `Std.Err`, `z` and `P>|z|`. Its first
    row is the intercept, followed by one row per feature. Only the first row
    of `coef_`, `z_coef_` and `p_val_coef_` is reported.

    Returns:
        data (pandas DataFrame): The statistics dataframe, indexed by
            the names stored in `names_`

    Raises:
        AssertionError: If the model was fitted without calculating the statistics.
    """
    # The fitted-state check is delegated to check_is_fitted (defined elsewhere).
    check_is_fitted(self)

    stats_available = hasattr(self, "std_err_coef_")
    if not stats_available:
        raise AssertionError(
            "Summary statistics were not calculated on .fit(). Options to fix:\n"
            "\t- Re-fit using .fit(X, y, calculate_stats=True)\n"
            "\t- Re-inititialize using LogisticRegression(calculate_stats=True)"
        )

    # Each column lists the intercept value first, then the per-feature values.
    coefficients = [*self.intercept_.tolist(), *self.coef_[0].tolist()]
    std_errors = [*self.std_err_intercept_.tolist(), *self.std_err_coef_.tolist()]
    z_scores = [*self.z_intercept_.tolist(), *self.z_coef_[0].tolist()]
    p_values = [*self.p_val_intercept_.tolist(), *self.p_val_coef_[0].tolist()]

    summary = pd.DataFrame(
        {
            "Coef.": coefficients,
            "Std.Err": std_errors,
            "z": z_scores,
            "P>|z|": p_values,
        },
        index=self.names_,
    )
    return summary

plot_weights()

Plots the relative importance of coefficients of the model.

Example:

from skorecard.datasets import load_uci_credit_card
from skorecard.bucketers import EqualFrequencyBucketer
from skorecard.linear_model import LogisticRegression
from skorecard.reporting.plotting import weight_plot
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

X, y = load_uci_credit_card(return_X_y=True)
pipeline = Pipeline([
    ('bucketer', EqualFrequencyBucketer(n_bins=10)),
    ('clf', LogisticRegression(calculate_stats=True))
])
pipeline.fit(X, y)
assert pipeline.named_steps['clf'].p_val_coef_[0][0] > 0
stats = pipeline.named_steps['clf'].get_stats()
pipeline.named_steps['clf'].plot_weights()

Source code in skorecard/linear_model/linear_model.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def plot_weights(self):
    """
    Plot the relative importance of the model's coefficients.

    The statistics returned by `.get_stats()` are passed to `weight_plot`,
    and whatever `weight_plot` returns is handed back to the caller.

    Example:

    ```python
    from skorecard.datasets import load_uci_credit_card
    from skorecard.bucketers import EqualFrequencyBucketer
    from skorecard.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    X, y = load_uci_credit_card(return_X_y=True)
    pipeline = Pipeline([
        ('bucketer', EqualFrequencyBucketer(n_bins=10)),
        ('clf', LogisticRegression(calculate_stats=True))
    ])
    pipeline.fit(X, y)
    assert pipeline.named_steps['clf'].p_val_coef_[0][0] > 0
    stats = pipeline.named_steps['clf'].get_stats()
    pipeline.named_steps['clf'].plot_weights()
    ```
    """
    return weight_plot(self.get_stats())

Last update: 2023-08-08