ColumnSelector

Bases: BaseEstimator, TransformerMixin

Transformer that performs selection of variables from a pandas dataframe.

Useful in pipelines, where we require a step that selects features.

Example:

from skorecard import datasets
from skorecard.preprocessing import ColumnSelector

X, y = datasets.load_uci_credit_card(return_X_y=True)
cs = ColumnSelector(variables=['EDUCATION'])
assert cs.fit_transform(X, y).columns == ['EDUCATION']

Source code in skorecard/preprocessing/preprocessing.py

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that performs selection of variables from a pandas dataframe.

    Useful in pipelines, where we require a step that selects features.

    Example:

    ```python
    from skorecard import datasets
    from skorecard.preprocessing import ColumnSelector

    X, y = datasets.load_uci_credit_card(return_X_y=True)
    cs = ColumnSelector(variables=['EDUCATION'])
    assert cs.fit_transform(X, y).columns == ['EDUCATION']
    ```
    """

    def __init__(self, variables: List = []):
        """Transformer constructor.

        Args:
            variables: list of columns to select. Defaults to an empty list. An empty list (or None) means
                that there is no selection of columns.
        """
        # Stored as-is (no copy, no validation). The default list object is shared
        # between instances, so it must never be mutated in place.
        self.variables = variables

    def fit(self, X, y=None):
        """
        Fit the transformer.

        Here to be compliant with the sklearn API. No selection logic is learned;
        only the number of columns of X is recorded (as `n_train_features_`).

        Args:
            X: Dataset; must expose a `shape` attribute with at least two dimensions.
            y: Ignored.

        Returns:
            The transformer itself.
        """
        # scikit-learn requires checking that X has same shape on transform
        # this is because scikit-learn is still positional based (no column names used)
        n_columns = X.shape[1]
        self.n_train_features_ = n_columns

        return self

    def transform(self, X):
        """
        Selects the columns.

        Args:
            X (pd.DataFrame): Dataset

        Returns:
            The columns listed in `variables`, or the full dataset (as returned by
            `ensure_dataframe`) when `variables` is None or empty.

        Raises:
            ValueError: if the transformer was fitted and X has a different number
                of columns than the dataset seen during fit.
        """
        X = ensure_dataframe(X)

        # The shape check only applies once fit() has recorded n_train_features_;
        # an unfitted transformer skips it.
        if hasattr(self, "n_train_features_") and X.shape[1] != self.n_train_features_:
            msg = (
                f"Number of features in X ({X.shape[1]}) is different "
                f"from the number of features in X during fit ({self.n_train_features_})"
            )
            raise ValueError(msg)

        # None is treated like an empty list: nothing to select, pass everything through.
        # len() is used rather than truthiness so array-like selections stay unambiguous.
        if self.variables is None or len(self.variables) == 0:
            return X
        return X[self.variables]

    def _more_tags(self):
        """
        Estimator tags are annotations of estimators that allow programmatic inspection of their capabilities.

        See https://scikit-learn.org/stable/developers/develop.html#estimator-tags
        """  # noqa
        # transform() works on an unfitted instance because its shape check is
        # guarded by hasattr(self, "n_train_features_").
        return {"requires_fit": False}

`__init__(variables=[])` ¶

Transformer constructor.

Parameters:

Name	Type	Description	Default
`variables`	`List`	list of columns to select. Defaults to an empty list - in this case, there is no selection of columns.	`[]`

Source code in skorecard/preprocessing/preprocessing.py

def __init__(self, variables: List = []):
    """Transformer constructor.

    Args:
        variables: list of columns to select. Defaults to an empty list - in this case, there is no selection of
            columns.
    """
    # Stored as-is (no copy, no validation). The default list object is shared
    # between instances, so it must never be mutated in place.
    self.variables = variables

`fit(X, y=None)` ¶

Fit the transformer.

Here to be compliant with the sklearn API. No selection logic is learned; only the number of columns of X is recorded.

Source code in skorecard/preprocessing/preprocessing.py

def fit(self, X, y=None):
    """
    Fit the transformer.

    Here to be compliant with the sklearn API. No selection logic is learned;
    only the number of columns of X is recorded (as `n_train_features_`).

    Args:
        X: Dataset; must expose a `shape` attribute with at least two dimensions.
        y: Ignored.

    Returns:
        The transformer itself.
    """
    # scikit-learn requires checking that X has same shape on transform
    # this is because scikit-learn is still positional based (no column names used)
    n_columns = X.shape[1]
    self.n_train_features_ = n_columns

    return self

`transform(X)` ¶

Selects the columns.

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Dataset	required

Source code in skorecard/preprocessing/preprocessing.py

def transform(self, X):
    """
    Selects the columns.

    Args:
        X (pd.DataFrame): Dataset

    Returns:
        The columns listed in `variables`, or the full dataset (as returned by
        `ensure_dataframe`) when `variables` is None or empty.

    Raises:
        ValueError: if the transformer was fitted and X has a different number
            of columns than the dataset seen during fit.
    """
    X = ensure_dataframe(X)

    # The shape check only applies once fit() has recorded n_train_features_;
    # an unfitted transformer skips it.
    if hasattr(self, "n_train_features_") and X.shape[1] != self.n_train_features_:
        msg = (
            f"Number of features in X ({X.shape[1]}) is different "
            f"from the number of features in X during fit ({self.n_train_features_})"
        )
        raise ValueError(msg)

    # None is treated like an empty list: nothing to select, pass everything through.
    # len() is used rather than truthiness so array-like selections stay unambiguous.
    if self.variables is None or len(self.variables) == 0:
        return X
    return X[self.variables]

ColumnSelector

__init__(variables=[]) ¶

fit(X, y=None) ¶

transform(X) ¶

`__init__(variables=[])` ¶

`fit(X, y=None)` ¶

`transform(X)` ¶