Skip to content

ColumnSelector

Bases: BaseEstimator, TransformerMixin

Transformer that performs selection of variables from a pandas dataframe.

Useful in pipelines, where we require a step that selects features.

Example:

# Import the example datasets module and the transformer being demonstrated.
from skorecard import datasets
from skorecard.preprocessing import ColumnSelector

# Load the example data as features X and target y.
X, y = datasets.load_uci_credit_card(return_X_y=True)
# Configure the selector to keep only the 'EDUCATION' column.
cs = ColumnSelector(variables=['EDUCATION'])
# The transformed output should expose exactly the requested column.
assert cs.fit_transform(X, y).columns == ['EDUCATION']
Source code in skorecard/preprocessing/preprocessing.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that performs selection of variables from a pandas dataframe.

    Useful in pipelines, where we require a step that selects features.

    Example:

    ```python
    from skorecard import datasets
    from skorecard.preprocessing import ColumnSelector

    X, y = datasets.load_uci_credit_card(return_X_y=True)
    cs = ColumnSelector(variables=['EDUCATION'])
    assert cs.fit_transform(X, y).columns == ['EDUCATION']
    ```
    """

    def __init__(self, variables: List = []):
        """Transformer constructor.

        Args:
            variables: list of columns to select. Defaults to an empty list. An empty list (or None) means that
                there is no selection of columns and the input is returned with all its columns.
        """
        # Kept exactly as passed (no copy, no validation): scikit-learn's estimator
        # conventions expect constructor parameters to be stored unmodified.
        # NOTE: the default list object is shared by every instance created without
        # `variables`; this class only reads it, so do not mutate it in place.
        self.variables = variables

    def fit(self, X, y=None):
        """
        Fit the transformer.

        Nothing is learned from the data; only the number of columns of X is recorded so that
        `transform` can verify that it receives an input of the same width.

        Args:
            X: Dataset; must expose a two-dimensional `shape`.
            y: Ignored. Present for compatibility with the sklearn API.

        Returns:
            The transformer itself.
        """
        # scikit-learn requires checking that X has same shape on transform
        # this is because scikit-learn is still positional based (no column names used)
        self.n_train_features_ = X.shape[1]

        return self

    def transform(self, X):
        """
        Selects the columns.

        Args:
            X (pd.DataFrame): Dataset

        Returns:
            The columns of X listed in `variables`, or X with all its columns when `variables` is empty or None.

        Raises:
            ValueError: if the transformer was fitted and X has a different number of columns than the data
                seen during fit.
        """
        X = ensure_dataframe(X)

        # The width check only applies once `fit` has run; unfitted use is allowed.
        if hasattr(self, "n_train_features_"):
            if X.shape[1] != self.n_train_features_:
                msg = f"Number of features in X ({X.shape[1]}) is different "
                msg += f"from the number of features in X during fit ({self.n_train_features_})"
                raise ValueError(msg)

        # None is treated like an empty selection instead of failing on len(None).
        if self.variables is None or len(self.variables) == 0:
            return X

        return X[self.variables]

    def _more_tags(self):
        """
        Estimator tags are annotations of estimators that allow programmatic inspection of their capabilities.

        See https://scikit-learn.org/stable/developers/develop.html#estimator-tags
        """  # noqa
        return {"requires_fit": False}

__init__(variables=[])

Transformer constructor.

Parameters:

Name Type Description Default
variables List

List of columns to select. Defaults to an empty list, in which case no column selection is performed.

[]
Source code in skorecard/preprocessing/preprocessing.py
26
27
28
29
30
31
32
33
def __init__(self, variables: List = []):
    """Transformer constructor.

    Args:
        variables: list of columns to select. Defaults to an empty list - in this case, there is no selection of
            columns.
    """
    # Stored exactly as passed, without copying or validation.
    # NOTE: the default list object is shared by every instance created without
    # `variables`; avoid mutating it in place.
    self.variables = variables

fit(X, y=None)

Fit the transformer.

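Present for compliance with the sklearn API. It learns nothing from the data; it only records the number of columns in X so that transform can check that its input has the same width.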

Source code in skorecard/preprocessing/preprocessing.py
35
36
37
38
39
40
41
42
43
44
45
def fit(self, X, y=None):
    """
    Record the width of the training data and return the transformer.

    No parameters are estimated; the method exists for sklearn API compliance.

    Args:
        X: Dataset; must expose a `shape` with at least two entries.
        y: Ignored.

    Returns:
        The transformer itself.
    """
    # Remember the column count seen at fit time so it can be compared
    # against later inputs (scikit-learn works positionally, not by name).
    n_columns = X.shape[1]
    self.n_train_features_ = n_columns
    return self

transform(X)

Selects the columns.

Parameters:

Name Type Description Default
X DataFrame

Dataset

required
Source code in skorecard/preprocessing/preprocessing.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def transform(self, X):
    """
    Selects the columns.

    Args:
        X (pd.DataFrame): Dataset

    Returns:
        The columns of X listed in `variables`, or X with all its columns when `variables` is empty or None.

    Raises:
        ValueError: if the transformer was fitted and X has a different number of columns than the data
            seen during fit.
    """
    X = ensure_dataframe(X)

    # The width check only applies once `fit` has run; unfitted use is allowed.
    if hasattr(self, "n_train_features_"):
        if X.shape[1] != self.n_train_features_:
            msg = f"Number of features in X ({X.shape[1]}) is different "
            msg += f"from the number of features in X during fit ({self.n_train_features_})"
            raise ValueError(msg)

    # None is treated like an empty selection instead of failing on len(None).
    if self.variables is None or len(self.variables) == 0:
        return X

    return X[self.variables]