EqualFrequencyBucketer

Bases: BaseBucketer

The EqualFrequencyBucketer transformer creates buckets with equal number of elements.

Support:

Example:

from skorecard import datasets
from skorecard.bucketers import EqualFrequencyBucketer

X, y = datasets.load_uci_credit_card(return_X_y=True)
bucketer = EqualFrequencyBucketer(n_bins = 10, variables=['LIMIT_BAL'])
bucketer.fit_transform(X)
bucketer.fit_transform(X)['LIMIT_BAL'].value_counts()

Source code in skorecard/bucketers/bucketers.py

class EqualFrequencyBucketer(BaseBucketer):
    """
    The `EqualFrequencyBucketer` transformer creates buckets with equal number of elements.

    Support: ![badge](https://img.shields.io/badge/numerical-true-green) ![badge](https://img.shields.io/badge/categorical-false-red) ![badge](https://img.shields.io/badge/supervised-false-red)

    Example:

    ```python
    from skorecard import datasets
    from skorecard.bucketers import EqualFrequencyBucketer

    X, y = datasets.load_uci_credit_card(return_X_y=True)
    bucketer = EqualFrequencyBucketer(n_bins = 10, variables=['LIMIT_BAL'])
    bucketer.fit_transform(X)
    bucketer.fit_transform(X)['LIMIT_BAL'].value_counts()
    ```
    """  # noqa

    def __init__(
        self,
        n_bins=5,
        variables=[],
        specials={},
        missing_treatment="separate",
        remainder="passthrough",
        get_statistics=True,
    ):
        """Init the class.

        Args:
            n_bins (int): Number of bins to create.
            variables (list): The features to bucket. Uses all features if not defined.
            specials: (nested) dictionary of special values that require their own binning.
                The dictionary has the following format:
                 {"<column name>" : {"name of special bucket" : <list with 1 or more values>}}
                For every feature that needs a special value, a dictionary must be passed as value.
                This dictionary contains a name of a bucket (key) and an array of unique values that should be put
                in that bucket.
                When special values are defined, they are not considered in the fitting procedure.
            missing_treatment: Defines how we treat the missing values present in the data.
                If a string, it must be one of the following options:
                    separate: Missing values get put in a separate 'Other' bucket: `-1`
                    most_risky: Missing values are put into the bucket containing the largest percentage of Class 1.
                    least_risky: Missing values are put into the bucket containing the largest percentage of Class 0.
                    most_frequent: Missing values are put into the most common bucket.
                    neutral: Missing values are put into the bucket with WoE closest to 0.
                    similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values.
                    passthrough: Leaves missing values untouched.
                If a dict, it must be of the following format:
                    {"<column name>": <bucket_number>}
                    This bucket number is where we will put the missing values..
            remainder: How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"].
                passthrough (Default): all columns that were not specified in "variables" will be passed through.
                drop: all remaining columns that were not specified in "variables" will be dropped.
        """  # noqa
        self.variables = variables
        self.n_bins = n_bins
        self.specials = specials
        self.missing_treatment = missing_treatment
        self.remainder = remainder
        self.get_statistics = get_statistics

    @property
    def variables_type(self):
        """
        Signals variables type supported by this bucketer.
        """
        return "numerical"

    def _get_feature_splits(self, feature, X, y, X_unfiltered=None):
        """
        Finds the splits for a single feature.

        X and y have already been preprocessed, and have specials removed.

        Args:
            feature (str): Name of the feature.
            X (pd.Series): df with single column of feature to bucket
            y (np.ndarray): array with target
            X_unfiltered (pd.Series): df with single column of feature to bucket before any filtering was applied

        Returns:
            splits, right (tuple): The splits (dict or array), and whether right=True or False.
        """
        # Fit the estimator
        # Uses pd.qcut()
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html
        try:
            _, boundaries = pd.qcut(X, q=self.n_bins, retbins=True, duplicates="raise")
        except ValueError:
            # If there are too many duplicate values (assume a lot of filled missings)
            # this crashes - the exception drops them.
            # This means that it will return approximate quantile bins
            _, boundaries = pd.qcut(X, q=self.n_bins, retbins=True, duplicates="drop")
            warnings.warn(ApproximationWarning("Approximated quantiles - too many unique values"))

        # pd.qcut returns the min & max values of the fits
        # On transform, we use np.digitize, which means new data that is outside of this range
        # will be assigned to their own buckets.
        # To solve, we simply remove the min and max boundaries
        boundaries = boundaries[1:-1]

        if isinstance(boundaries, np.ndarray):
            boundaries = boundaries.tolist()

        # pd.qcut returns bins including right edge: (edge, edge]
        return (boundaries, True)

`variables_type` `property` ¶

Signals variables type supported by this bucketer.

`init(n_bins=5, variables=[], specials={}, missing_treatment='separate', remainder='passthrough', get_statistics=True)` ¶

Init the class.

Parameters:

Name	Type	Description	Default
`n_bins`	`int`	Number of bins to create.	`5`
`variables`	`list`	The features to bucket. Uses all features if not defined.	`[]`
`specials`		(nested) dictionary of special values that require their own binning. The dictionary has the following format: {"" : {"name of special bucket" : }} For every feature that needs a special value, a dictionary must be passed as value. This dictionary contains a name of a bucket (key) and an array of unique values that should be put in that bucket. When special values are defined, they are not considered in the fitting procedure.	`{}`
`missing_treatment`		Defines how we treat the missing values present in the data. If a string, it must be one of the following options: separate: Missing values get put in a separate 'Other' bucket: `-1` most_risky: Missing values are put into the bucket containing the largest percentage of Class 1. least_risky: Missing values are put into the bucket containing the largest percentage of Class 0. most_frequent: Missing values are put into the most common bucket. neutral: Missing values are put into the bucket with WoE closest to 0. similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values. passthrough: Leaves missing values untouched. If a dict, it must be of the following format: {"": } This bucket number is where we will put the missing values..	`'separate'`
`remainder`		How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"]. passthrough (Default): all columns that were not specified in "variables" will be passed through. drop: all remaining columns that were not specified in "variables" will be dropped.	`'passthrough'`

Source code in skorecard/bucketers/bucketers.py

def __init__(
    self,
    n_bins=5,
    variables=[],
    specials={},
    missing_treatment="separate",
    remainder="passthrough",
    get_statistics=True,
):
    """Init the class.

    Args:
        n_bins (int): Number of bins to create.
        variables (list): The features to bucket. Uses all features if not defined.
        specials: (nested) dictionary of special values that require their own binning.
            The dictionary has the following format:
             {"<column name>" : {"name of special bucket" : <list with 1 or more values>}}
            For every feature that needs a special value, a dictionary must be passed as value.
            This dictionary contains a name of a bucket (key) and an array of unique values that should be put
            in that bucket.
            When special values are defined, they are not considered in the fitting procedure.
        missing_treatment: Defines how we treat the missing values present in the data.
            If a string, it must be one of the following options:
                separate: Missing values get put in a separate 'Other' bucket: `-1`
                most_risky: Missing values are put into the bucket containing the largest percentage of Class 1.
                least_risky: Missing values are put into the bucket containing the largest percentage of Class 0.
                most_frequent: Missing values are put into the most common bucket.
                neutral: Missing values are put into the bucket with WoE closest to 0.
                similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values.
                passthrough: Leaves missing values untouched.
            If a dict, it must be of the following format:
                {"<column name>": <bucket_number>}
                This bucket number is where we will put the missing values..
        remainder: How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"].
            passthrough (Default): all columns that were not specified in "variables" will be passed through.
            drop: all remaining columns that were not specified in "variables" will be dropped.
    """  # noqa
    self.variables = variables
    self.n_bins = n_bins
    self.specials = specials
    self.missing_treatment = missing_treatment
    self.remainder = remainder
    self.get_statistics = get_statistics

EqualFrequencyBucketer

variables_type property ¶

__init__(n_bins=5, variables=[], specials={}, missing_treatment='separate', remainder='passthrough', get_statistics=True) ¶

`variables_type` `property` ¶

`init(n_bins=5, variables=[], specials={}, missing_treatment='separate', remainder='passthrough', get_statistics=True)` ¶