AgglomerativeClusteringBucketer

Bases: BaseBucketer

The AgglomerativeClusteringBucketer transformer creates buckets using sklearn.AgglomerativeClustering.

Support

Example:

from skorecard import datasets
from skorecard.bucketers import AgglomerativeClusteringBucketer

specials = {"LIMIT_BAL": {"=50000": [50000], "in [20001,30000]": [20000, 30000]}}

X, y = datasets.load_uci_credit_card(return_X_y=True)
bucketer = AgglomerativeClusteringBucketer(n_bins = 10, variables=['LIMIT_BAL'], specials=specials)
bucketer.fit_transform(X)
bucketer.fit_transform(X)['LIMIT_BAL'].value_counts()

Source code in skorecard/bucketers/bucketers.py

class AgglomerativeClusteringBucketer(BaseBucketer):
    """
    The `AgglomerativeClusteringBucketer` transformer creates buckets using [sklearn.AgglomerativeClustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html).

    Support ![badge](https://img.shields.io/badge/numerical-true-green) ![badge](https://img.shields.io/badge/categorical-false-red) ![badge](https://img.shields.io/badge/supervised-false-red)

    Example:

    ```python
    from skorecard import datasets
    from skorecard.bucketers import AgglomerativeClusteringBucketer

    specials = {"LIMIT_BAL": {"=50000": [50000], "in [20001,30000]": [20000, 30000]}}

    X, y = datasets.load_uci_credit_card(return_X_y=True)
    bucketer = AgglomerativeClusteringBucketer(n_bins = 10, variables=['LIMIT_BAL'], specials=specials)
    bucketer.fit_transform(X)
    bucketer.fit_transform(X)['LIMIT_BAL'].value_counts()
    ```
    """  # noqa

    def __init__(
        self,
        n_bins=5,
        variables=[],
        specials={},
        missing_treatment="separate",
        remainder="passthrough",
        get_statistics=True,
        **kwargs,
    ):
        """Init the class.

        Args:
            n_bins (int): Number of bins to create.
            variables (list): The features to bucket. Uses all features if not defined.
            specials: (dict) of special values that require their own binning.
                The dictionary has the following format:
                 {"<column name>" : {"name of special bucket" : <list with 1 or more values>}}
                For every feature that needs a special value, a dictionary must be passed as value.
                This dictionary contains a name of a bucket (key) and an array of unique values that should be put
                in that bucket.
                When special values are defined, they are not considered in the fitting procedure.
            missing_treatment: Defines how we treat the missing values present in the data.
                If a string, it must be one of the following options:
                    separate: Missing values get put in a separate 'Other' bucket: `-1`
                    most_risky: Missing values are put into the bucket containing the largest percentage of Class 1.
                    least_risky: Missing values are put into the bucket containing the largest percentage of Class 0.
                    most_frequent: Missing values are put into the most common bucket.
                    neutral: Missing values are put into the bucket with WoE closest to 0.
                    similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values.
                    passthrough: Leaves missing values untouched.
                If a dict, it must be of the following format:
                    {"<column name>": <bucket_number>}
                    This bucket number is where we will put the missing values.
            remainder: How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"].
                passthrough (Default): all columns that were not specified in "variables" will be passed through.
                drop: all remaining columns that were not specified in "variables" will be dropped.
            kwargs: Other parameters passed to AgglomerativeBucketer
        """  # noqa
        self.variables = variables
        self.n_bins = n_bins
        self.specials = specials
        self.missing_treatment = missing_treatment
        self.remainder = remainder
        self.get_statistics = get_statistics
        self.kwargs = kwargs

    @property
    def variables_type(self):
        """
        Signals variables type supported by this bucketer.
        """
        return "numerical"

    def _get_feature_splits(self, feature, X, y, X_unfiltered=None):
        """
        Finds the splits for a single feature.

        X and y have already been preprocessed, and have specials removed.

        Args:
            feature (str): Name of the feature.
            X (pd.Series): df with single column of feature to bucket
            y (np.ndarray): array with target
            X_unfiltered (pd.Series): df with single column of feature to bucket before any filtering was applied

        Returns:
            splits, right (tuple): The splits (dict or array), and whether right=True or False.
        """
        # Fit the estimator
        ab = AgglomerativeClustering(n_clusters=self.n_bins, **self.kwargs)
        ab.fit(X.values.reshape(-1, 1), y=None)

        # Find the boundaries
        df = pd.DataFrame({"x": X.values, "label": ab.labels_}).sort_values(by="x")
        cluster_minimum_values = df.groupby("label")["x"].min().sort_values().tolist()
        cluster_maximum_values = df.groupby("label")["x"].max().sort_values().tolist()
        # take the mean of the upper boundary of a cluster and the lower boundary of the next cluster
        boundaries = [
            # Assures numbers are float and not np.float - necessary for serialization
            float(np.mean([cluster_minimum_values[i + 1], cluster_maximum_values[i]]))
            for i in range(len(cluster_minimum_values) - 1)
        ]

        if isinstance(boundaries, np.ndarray):
            boundaries = boundaries.tolist()

        return (boundaries, True)

`variables_type` `property` ¶

Signals variables type supported by this bucketer.

`init(n_bins=5, variables=[], specials={}, missing_treatment='separate', remainder='passthrough', get_statistics=True, **kwargs)` ¶

Init the class.

Parameters:

Name	Type	Description	Default
`n_bins`	`int`	Number of bins to create.	`5`
`variables`	`list`	The features to bucket. Uses all features if not defined.	`[]`
`specials`		(dict) of special values that require their own binning. The dictionary has the following format: {"" : {"name of special bucket" : }} For every feature that needs a special value, a dictionary must be passed as value. This dictionary contains a name of a bucket (key) and an array of unique values that should be put in that bucket. When special values are defined, they are not considered in the fitting procedure.	`{}`
`missing_treatment`		Defines how we treat the missing values present in the data. If a string, it must be one of the following options: separate: Missing values get put in a separate 'Other' bucket: `-1` most_risky: Missing values are put into the bucket containing the largest percentage of Class 1. least_risky: Missing values are put into the bucket containing the largest percentage of Class 0. most_frequent: Missing values are put into the most common bucket. neutral: Missing values are put into the bucket with WoE closest to 0. similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values. passthrough: Leaves missing values untouched. If a dict, it must be of the following format: {"": } This bucket number is where we will put the missing values.	`'separate'`
`remainder`		How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"]. passthrough (Default): all columns that were not specified in "variables" will be passed through. drop: all remaining columns that were not specified in "variables" will be dropped.	`'passthrough'`
`kwargs`		Other parameters passed to AgglomerativeBucketer	`{}`

Source code in skorecard/bucketers/bucketers.py

def __init__(
    self,
    n_bins=5,
    variables=[],
    specials={},
    missing_treatment="separate",
    remainder="passthrough",
    get_statistics=True,
    **kwargs,
):
    """Init the class.

    Args:
        n_bins (int): Number of bins to create.
        variables (list): The features to bucket. Uses all features if not defined.
        specials: (dict) of special values that require their own binning.
            The dictionary has the following format:
             {"<column name>" : {"name of special bucket" : <list with 1 or more values>}}
            For every feature that needs a special value, a dictionary must be passed as value.
            This dictionary contains a name of a bucket (key) and an array of unique values that should be put
            in that bucket.
            When special values are defined, they are not considered in the fitting procedure.
        missing_treatment: Defines how we treat the missing values present in the data.
            If a string, it must be one of the following options:
                separate: Missing values get put in a separate 'Other' bucket: `-1`
                most_risky: Missing values are put into the bucket containing the largest percentage of Class 1.
                least_risky: Missing values are put into the bucket containing the largest percentage of Class 0.
                most_frequent: Missing values are put into the most common bucket.
                neutral: Missing values are put into the bucket with WoE closest to 0.
                similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values.
                passthrough: Leaves missing values untouched.
            If a dict, it must be of the following format:
                {"<column name>": <bucket_number>}
                This bucket number is where we will put the missing values.
        remainder: How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"].
            passthrough (Default): all columns that were not specified in "variables" will be passed through.
            drop: all remaining columns that were not specified in "variables" will be dropped.
        kwargs: Other parameters passed to AgglomerativeBucketer
    """  # noqa
    self.variables = variables
    self.n_bins = n_bins
    self.specials = specials
    self.missing_treatment = missing_treatment
    self.remainder = remainder
    self.get_statistics = get_statistics
    self.kwargs = kwargs

AgglomerativeClusteringBucketer

variables_type property ¶

__init__(n_bins=5, variables=[], specials={}, missing_treatment='separate', remainder='passthrough', get_statistics=True, **kwargs) ¶

`variables_type` `property` ¶

`init(n_bins=5, variables=[], specials={}, missing_treatment='separate', remainder='passthrough', get_statistics=True, **kwargs)` ¶