UserInputBucketer

Bases: BaseBucketer

The UserInputBucketer transformer creates buckets by implementing user-defined boundaries.

Support: numerical features (yes), categorical features (yes), supervised (no).

This is a special bucketer that is not fitted but rather relies on pre-defined user input. The most common use-case is loading bucket mapping information previously fitted by other bucketers.

Example:

from skorecard import datasets
from skorecard.bucketers import AgglomerativeClusteringBucketer, UserInputBucketer

X, y = datasets.load_uci_credit_card(return_X_y=True)

ac_bucketer = AgglomerativeClusteringBucketer(n_bins=3, variables=['LIMIT_BAL'])
ac_bucketer.fit(X)
mapping = ac_bucketer.features_bucket_mapping_

ui_bucketer = UserInputBucketer(mapping)
new_X = ui_bucketer.fit_transform(X)
assert len(new_X['LIMIT_BAL'].unique()) == 3

#Map some values to the special buckets
specials = {
    "LIMIT_BAL":{
        "=50000":[50000],
        "in [20000,30000]":[20000,30000],
        }
}

ac_bucketer = AgglomerativeClusteringBucketer(n_bins=3, variables=['LIMIT_BAL'], specials = specials)
ac_bucketer.fit(X)
mapping = ac_bucketer.features_bucket_mapping_

ui_bucketer = UserInputBucketer(mapping)
new_X = ui_bucketer.fit_transform(X)
assert len(new_X['LIMIT_BAL'].unique()) == 5

Source code in skorecard/bucketers/bucketers.py

class UserInputBucketer(BaseBucketer):
    """
    The `UserInputBucketer` transformer creates buckets by implementing user-defined boundaries.

    Support: ![badge](https://img.shields.io/badge/numerical-true-green) ![badge](https://img.shields.io/badge/categorical-true-green) ![badge](https://img.shields.io/badge/supervised-false-blue)

    This is a special bucketer that is not fitted but rather relies
    on pre-defined user input. The most common use-case is loading
    bucket mapping information previously fitted by other bucketers.

    Example:

    ```python
    from skorecard import datasets
    from skorecard.bucketers import AgglomerativeClusteringBucketer, UserInputBucketer

    X, y = datasets.load_uci_credit_card(return_X_y=True)

    ac_bucketer = AgglomerativeClusteringBucketer(n_bins=3, variables=['LIMIT_BAL'])
    ac_bucketer.fit(X)
    mapping = ac_bucketer.features_bucket_mapping_

    ui_bucketer = UserInputBucketer(mapping)
    new_X = ui_bucketer.fit_transform(X)
    assert len(new_X['LIMIT_BAL'].unique()) == 3

    #Map some values to the special buckets
    specials = {
        "LIMIT_BAL":{
            "=50000":[50000],
            "in [20000,30000]":[20000,30000],
            }
    }

    ac_bucketer = AgglomerativeClusteringBucketer(n_bins=3, variables=['LIMIT_BAL'], specials = specials)
    ac_bucketer.fit(X)
    mapping = ac_bucketer.features_bucket_mapping_

    ui_bucketer = UserInputBucketer(mapping)
    new_X = ui_bucketer.fit_transform(X)
    assert len(new_X['LIMIT_BAL'].unique()) == 5
    ```

    """  # noqa

    def __init__(
        self, features_bucket_mapping=None, variables: List = [], remainder="passthrough", get_statistics=True
    ) -> None:
        """
        Initialise the user-defined boundaries with a dictionary.

        Notes:
        - features_bucket_mapping is stored without the trailing underscore (_) because it is not fitted.
        - The resolved mapping object is stored separately as `features_bucket_mapping_`.

        Args:
            features_bucket_mapping (None, Dict, FeaturesBucketMapping, str or Path): Contains the feature name and boundaries
                defined for this feature.
                If None, an empty FeaturesBucketMapping is used.
                If a dict, it will be converted to an internal FeaturesBucketMapping object.
                If a string or path, the file at that location is loaded as yaml and converted to a
                FeaturesBucketMapping object.
                Any other object is handed to `yaml.safe_load` directly (e.g. an already opened file).
            variables (list): The features to bucket. Uses all features in features_bucket_mapping if not defined.
            remainder (str): How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"].
                passthrough (Default): all columns that were not specified in "variables" will be passed through.
                drop: all remaining columns that were not specified in "variables" will be dropped.
            get_statistics (bool): Stored on the instance unchanged; it is not used in this class directly.
                NOTE(review): presumably consumed by BaseBucketer when building summaries - confirm.

        Raises:
            TypeError: If features_bucket_mapping is of an unsupported type or cannot be parsed.
        """  # noqa
        # Function-scope import so this block does not depend on the module's import header.
        import os

        # Assigning the variable in the init to the attribute with the same name is a requirement of
        # sklearn.base.BaseEstimator. See the notes in
        # https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator
        self.features_bucket_mapping = features_bucket_mapping
        self.remainder = remainder
        self.get_statistics = get_statistics

        # NOTE: the default list is shared between instances; it is only read, never mutated, in this class.
        self.variables = variables

        if features_bucket_mapping is None:
            self.features_bucket_mapping_ = FeaturesBucketMapping()
        elif isinstance(features_bucket_mapping, (str, os.PathLike)):
            # Both plain strings and path objects denote a yaml file on disk.
            # The context manager guarantees the file handle is closed again.
            with open(features_bucket_mapping) as yaml_file:
                buckets_yaml = yaml.safe_load(yaml_file)
            self.features_bucket_mapping_ = FeaturesBucketMapping(buckets_yaml)
        elif isinstance(features_bucket_mapping, dict):
            self.features_bucket_mapping_ = FeaturesBucketMapping(features_bucket_mapping)
        elif isinstance(features_bucket_mapping, FeaturesBucketMapping):
            self.features_bucket_mapping_ = features_bucket_mapping
        else:
            # Last resort: let yaml try to read whatever was passed (for instance an open stream).
            try:
                buckets_yaml = yaml.safe_load(features_bucket_mapping)
                self.features_bucket_mapping_ = FeaturesBucketMapping(buckets_yaml)
            except Exception as err:
                raise TypeError(
                    "'features_bucket_mapping' must be a None, dict, str, path, or FeaturesBucketMapping instance"
                ) from err

    def fit(self, X, y=None):
        """
        Compute the bucket tables for the pre-defined bucket mapping.

        No boundaries are learned: the mapping resolved in `__init__` is used as-is.
        Fitting is only needed to populate `bucket_tables_` and the summary.

        Args:
            X: The data to build the bucket tables on. It is passed through `ensure_dataframe`.
            y: Optional target. When given, it must have as many entries as X has rows.

        Returns:
            UserInputBucketer: the instance itself.
        """
        X = ensure_dataframe(X)
        if y is not None:
            assert len(y) == X.shape[0], "y and X not same length"
            # Store the classes seen during fit
            self.classes_ = unique_labels(y)

        # scikit-learn requires checking that X has same shape on transform
        # this is because scikit-learn is still positional based (no column names used)
        self.n_train_features_ = X.shape[1]

        # bucket tables can only be computed on fit().
        # so a user will have to .fit() if she/he wants .plot_buckets() and .bucket_table()
        self.bucket_tables_ = {}

        # Always set variables_, so that explicitly passed variables are honoured as well.
        # If the user did not specify any variables, use all the variables defined
        # in the features_bucket_mapping.
        if self.variables:
            # Copy, so the fitted attribute never aliases the (possibly shared) constructor argument.
            self.variables_ = list(self.variables)
        else:
            self.variables_ = list(self.features_bucket_mapping_.maps.keys())

        for feature in self.variables_:
            # Calculate the bucket table
            self.bucket_tables_[feature] = build_bucket_table(
                X,
                y,
                column=feature,
                bucket_mapping=self.features_bucket_mapping_.get(feature),
            )

        self._generate_summary(X, y)

        return self

    def _more_tags(self):
        """
        Estimator tags are annotations of estimators that allow programmatic inspection of their capabilities.

        See https://scikit-learn.org/stable/developers/develop.html#estimator-tags
        """  # noqa
        return {"binary_only": True, "allow_nan": True, "requires_fit": False}

`__init__(features_bucket_mapping=None, variables=[], remainder='passthrough', get_statistics=True)` ¶

Initialise the user-defined boundaries with a dictionary.

Notes: - features_bucket_mapping is stored without the trailing underscore (_) because it is not fitted.

Parameters:

Name	Type	Description	Default
`features_bucket_mapping`	`(None, Dict, FeaturesBucketMapping, str or Path)`	Contains the feature name and boundaries defined for this feature. If a dict, it will be converted to an internal FeaturesBucketMapping object. If a string or path, the file at that location is loaded as yaml and converted to a FeaturesBucketMapping object.	`None`
`variables`	`list`	The features to bucket. Uses all features in features_bucket_mapping if not defined.	`[]`
`remainder`	`str`	How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"]. passthrough (Default): all columns that were not specified in "variables" will be passed through. drop: all remaining columns that were not specified in "variables" will be dropped.	`'passthrough'`

Source code in skorecard/bucketers/bucketers.py

def __init__(
    self, features_bucket_mapping=None, variables: List = [], remainder="passthrough", get_statistics=True
) -> None:
    """
    Initialise the user-defined boundaries with a dictionary.

    Notes:
    - features_bucket_mapping is stored without the trailing underscore (_) because it is not fitted.
    - The resolved mapping object is stored separately as `features_bucket_mapping_`.

    Args:
        features_bucket_mapping (None, Dict, FeaturesBucketMapping, str or Path): Contains the feature name and boundaries
            defined for this feature.
            If None, an empty FeaturesBucketMapping is used.
            If a dict, it will be converted to an internal FeaturesBucketMapping object.
            If a string or path, the file at that location is loaded as yaml and converted to a
            FeaturesBucketMapping object.
            Any other object is handed to `yaml.safe_load` directly (e.g. an already opened file).
        variables (list): The features to bucket. Uses all features in features_bucket_mapping if not defined.
        remainder (str): How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"].
            passthrough (Default): all columns that were not specified in "variables" will be passed through.
            drop: all remaining columns that were not specified in "variables" will be dropped.
        get_statistics (bool): Stored on the instance unchanged; it is not used in this method.
            NOTE(review): presumably consumed elsewhere when building summaries - confirm.

    Raises:
        TypeError: If features_bucket_mapping is of an unsupported type or cannot be parsed.
    """  # noqa
    # Function-scope import so this block does not depend on the module's import header.
    import os

    # Assigning the variable in the init to the attribute with the same name is a requirement of
    # sklearn.base.BaseEstimator. See the notes in
    # https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator
    self.features_bucket_mapping = features_bucket_mapping
    self.remainder = remainder
    self.get_statistics = get_statistics

    # NOTE: the default list is shared between instances; it is only read, never mutated, here.
    self.variables = variables

    if features_bucket_mapping is None:
        self.features_bucket_mapping_ = FeaturesBucketMapping()
    elif isinstance(features_bucket_mapping, (str, os.PathLike)):
        # Both plain strings and path objects denote a yaml file on disk.
        # The context manager guarantees the file handle is closed again.
        with open(features_bucket_mapping) as yaml_file:
            buckets_yaml = yaml.safe_load(yaml_file)
        self.features_bucket_mapping_ = FeaturesBucketMapping(buckets_yaml)
    elif isinstance(features_bucket_mapping, dict):
        self.features_bucket_mapping_ = FeaturesBucketMapping(features_bucket_mapping)
    elif isinstance(features_bucket_mapping, FeaturesBucketMapping):
        self.features_bucket_mapping_ = features_bucket_mapping
    else:
        # Last resort: let yaml try to read whatever was passed (for instance an open stream).
        try:
            buckets_yaml = yaml.safe_load(features_bucket_mapping)
            self.features_bucket_mapping_ = FeaturesBucketMapping(buckets_yaml)
        except Exception as err:
            raise TypeError(
                "'features_bucket_mapping' must be a None, dict, str, path, or FeaturesBucketMapping instance"
            ) from err

`fit(X, y=None)` ¶

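Compute the bucket tables for the pre-defined bucket mapping. No boundaries are learned; fitting is only needed to populate the bucket tables and the summary.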

Source code in skorecard/bucketers/bucketers.py

def fit(self, X, y=None):
    """
    Compute the bucket tables for the pre-defined bucket mapping.

    No boundaries are learned: the mapping stored in `features_bucket_mapping_` is used as-is.
    Fitting is only needed to populate `bucket_tables_` and the summary.

    Args:
        X: The data to build the bucket tables on. It is passed through `ensure_dataframe`.
        y: Optional target. When given, it must have as many entries as X has rows.

    Returns:
        The instance itself.
    """
    X = ensure_dataframe(X)
    if y is not None:
        assert len(y) == X.shape[0], "y and X not same length"
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

    # scikit-learn requires checking that X has same shape on transform
    # this is because scikit-learn is still positional based (no column names used)
    self.n_train_features_ = X.shape[1]

    # bucket tables can only be computed on fit().
    # so a user will have to .fit() if she/he wants .plot_buckets() and .bucket_table()
    self.bucket_tables_ = {}

    # Always set variables_, so that explicitly passed variables are honoured as well.
    # If the user did not specify any variables, use all the variables defined
    # in the features_bucket_mapping.
    if self.variables:
        # Copy, so the fitted attribute never aliases the (possibly shared) constructor argument.
        self.variables_ = list(self.variables)
    else:
        self.variables_ = list(self.features_bucket_mapping_.maps.keys())

    for feature in self.variables_:
        # Calculate the bucket table
        self.bucket_tables_[feature] = build_bucket_table(
            X,
            y,
            column=feature,
            bucket_mapping=self.features_bucket_mapping_.get(feature),
        )

    self._generate_summary(X, y)

    return self

UserInputBucketer

__init__(features_bucket_mapping=None, variables=[], remainder='passthrough', get_statistics=True) ¶

fit(X, y=None) ¶

`__init__(features_bucket_mapping=None, variables=[], remainder='passthrough', get_statistics=True)` ¶

`fit(X, y=None)` ¶