DecisionTreeBucketer

Bases: BaseBucketer

The DecisionTreeBucketer transformer creates buckets by training a decision tree.

Support: numerical features (yes), categorical features (no), supervised (yes).

It uses sklearn.tree.DecisionTreeClassifier to find the splits.

Example:

from skorecard import datasets
from skorecard.bucketers import DecisionTreeBucketer
X, y = datasets.load_uci_credit_card(return_X_y=True)

# Define the special values that must be placed in their own dedicated buckets
specials = {
    "LIMIT_BAL":{
        "=50000":[50000],
        "in [20001,30000]":[20000,30000],
        }
}

dt_bucketer = DecisionTreeBucketer(variables=['LIMIT_BAL'], specials = specials)
dt_bucketer.fit(X, y)

dt_bucketer.fit_transform(X, y)['LIMIT_BAL'].value_counts()

Source code in skorecard/bucketers/bucketers.py

class DecisionTreeBucketer(BaseBucketer):
    """
    The `DecisionTreeBucketer` transformer creates buckets by training a decision tree.

    Support: ![badge](https://img.shields.io/badge/numerical-true-green) ![badge](https://img.shields.io/badge/categorical-false-red) ![badge](https://img.shields.io/badge/supervised-true-green)

    It uses [sklearn.tree.DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)
    to find the splits.

    Example:

    ```python
    from skorecard import datasets
    from skorecard.bucketers import DecisionTreeBucketer
    X, y = datasets.load_uci_credit_card(return_X_y=True)

    # Define the special values that must be placed in their own dedicated buckets
    specials = {
        "LIMIT_BAL":{
            "=50000":[50000],
            "in [20001,30000]":[20000,30000],
            }
    }

    dt_bucketer = DecisionTreeBucketer(variables=['LIMIT_BAL'], specials = specials)
    dt_bucketer.fit(X, y)

    dt_bucketer.fit_transform(X, y)['LIMIT_BAL'].value_counts()
    ```
    """  # noqa

    def __init__(
        self,
        variables=[],
        specials={},
        max_n_bins=100,
        missing_treatment="separate",
        min_bin_size=0.05,
        random_state=None,
        remainder="passthrough",
        get_statistics=True,
        dt_kwargs={},
    ) -> None:
        """Init the class.

        Args:
            variables (list): The features to bucket. Uses all features if not defined.
            specials (dict):  dictionary of special values that require their own binning.
                The dictionary has the following format:
                 {"<column name>" : {"name of special bucket" : <list with 1 or more values>}}
                For every feature that needs a special value, a dictionary must be passed as value.
                This dictionary contains a name of a bucket (key) and an array of unique values that should be put
                in that bucket.
                When special values are defined, they are not considered in the fitting procedure.
            min_bin_size (float): Minimum fraction of observations in a bucket. Passed to min_samples_leaf of the
                DecisionTreeClassifier. When special values are excluded from the fit, the fraction is rescaled
                to the remaining rows and capped at 0.5.
            max_n_bins (int): Maximum number of bins after the bucketing. Passed directly to max_leaf_nodes of the
                DecisionTreeClassifier.
                If specials are defined, max_leaf_nodes will be redefined to max_n_bins - (number of special bins).
                The DecisionTreeClassifier requires max_leaf_nodes>=2:
                therefore, max_n_bins  must always be >= (number of special bins + 2) if specials are defined,
                otherwise must be >=2.
            missing_treatment (str or dict): Defines how we treat the missing values present in the data.
                If a string, it must be one of the following options:
                    separate: Missing values get put in a separate 'Other' bucket: `-1`
                    most_risky: Missing values are put into the bucket containing the largest percentage of Class 1.
                    least_risky: Missing values are put into the bucket containing the largest percentage of Class 0.
                    most_frequent: Missing values are put into the most common bucket.
                    neutral: Missing values are put into the bucket with WoE closest to 0.
                    similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values.
                    passthrough: Leaves missing values untouched.
                If a dict, it must be of the following format:
                    {"<column name>": <bucket_number>}
                    This bucket number is where we will put the missing values.
            random_state (int): The random state. Passed to the DecisionTreeClassifier at fit time; it takes
                precedence over any "random_state" entry in dt_kwargs.
            remainder (str): How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"].
                passthrough (Default): all columns that were not specified in "variables" will be passed through.
                drop: all remaining columns that were not specified in "variables" will be dropped.
            get_statistics (bool): Stored on the instance; presumably controls whether bucket statistics are
                computed by the base class (TODO confirm, not used within this class).
            dt_kwargs (dict): Other parameters passed to DecisionTreeClassifier. The dictionary is stored as-is
                and is never modified by this class.
        """  # noqa
        # NOTE: the mutable defaults in the signature are kept for interface
        # compatibility. They are shared between instances, so they must never
        # be mutated in place; parameters are stored untouched (which is also
        # what sklearn's clone()/get_params() machinery expects).
        self.variables = variables
        self.specials = specials
        self.dt_kwargs = dt_kwargs
        self.max_n_bins = max_n_bins
        self.missing_treatment = missing_treatment
        self.min_bin_size = min_bin_size
        self.random_state = random_state
        self.remainder = remainder
        self.get_statistics = get_statistics

        check_args(dt_kwargs, DecisionTreeClassifier)

    @property
    def variables_type(self):
        """
        Signals variables type supported by this bucketer.
        """
        return "numerical"

    def _get_feature_splits(self, feature, X, y, X_unfiltered=None):
        """
        Finds the splits for a single feature.

        X and y have already been preprocessed, and have specials removed.

        Args:
            feature (str): Name of the feature.
            X (pd.Series): df with single column of feature to bucket
            y (np.ndarray): array with target
            X_unfiltered (pd.Series): df with single column of feature to bucket before any filtering was applied.
                When omitted, X is assumed to be unfiltered.

        Returns:
            splits, right (tuple): The splits (dict or array), and whether right=True or False.

        Raises:
            ValueError: If max_n_bins leaves fewer than 2 leaf nodes once the special bins are subtracted.
        """
        # Make sure max_n_bins settings is correct
        n_special_bins = 0
        if feature in self.specials:
            n_special_bins = len(self.specials[feature])
            if (self.max_n_bins - n_special_bins) <= 1:
                raise ValueError(
                    f"max_n_bins must be at least = the number of special bins + 2: set a value "
                    f"max_n_bins>= {n_special_bins+2} (currently max_n_bins={self.max_n_bins})"
                )

        # If the data contains only specials,
        # Then don't use any splits
        if X.shape[0] == 0:
            splits = []
        else:
            # Without a reference to the unfiltered data nothing was removed
            # from our point of view, so no rescaling is needed.
            if X_unfiltered is None:
                X_unfiltered = X

            # If the specials are excluded, make sure that the bin size is rescaled
            # so it stays a fraction of the full data; a leaf cannot be required
            # to hold more than half of the rows, hence the 0.5 cap.
            frac_left = X.shape[0] / X_unfiltered.shape[0]
            min_bin_size = min(self.min_bin_size / frac_left, 0.5)

            # Build the tree arguments in a fresh dict: random_state is merged
            # here (overriding any value in dt_kwargs) instead of mutating
            # self.dt_kwargs, which may be a dict shared with other instances
            # or owned by the caller. Merging at fit time also honours a
            # random_state changed after construction.
            tree_kwargs = {**self.dt_kwargs, "random_state": self.random_state}

            binner = DecisionTreeClassifier(
                max_leaf_nodes=(self.max_n_bins - n_special_bins),
                min_samples_leaf=min_bin_size,
                **tree_kwargs,
            )
            binner.fit(X.values.reshape(-1, 1), y)

            # Extract fitted boundaries: only internal nodes carry a real
            # feature index, leaves are marked with TREE_UNDEFINED.
            splits = np.unique(binner.tree_.threshold[binner.tree_.feature != _tree.TREE_UNDEFINED])

        # Note for trees we use right=False
        return (splits, False)

`variables_type` `property` ¶

Signals variables type supported by this bucketer.

`__init__(variables=[], specials={}, max_n_bins=100, missing_treatment='separate', min_bin_size=0.05, random_state=None, remainder='passthrough', get_statistics=True, dt_kwargs={})` ¶

Init the class.

Parameters:

Name	Type	Description	Default
`variables`	`list`	The features to bucket. Uses all features if not defined.	`[]`
`specials`	`dict`	dictionary of special values that require their own binning. The dictionary has the following format: `{"<column name>" : {"name of special bucket" : <list with 1 or more values>}}` For every feature that needs a special value, a dictionary must be passed as value. This dictionary contains a name of a bucket (key) and an array of unique values that should be put in that bucket. When special values are defined, they are not considered in the fitting procedure.	`{}`
`min_bin_size`	`float`	Minimum fraction of observations in a bucket. Passed directly to min_samples_leaf.	`0.05`
`max_n_bins`	`int`	Maximum number of bins after the bucketing. Passed directly to max_leaf_nodes of the DecisionTreeClassifier. If specials are defined, max_leaf_nodes will be redefined to max_n_bins - (number of special bins). The DecisionTreeClassifier requires max_leaf_nodes>=2: therefore, max_n_bins must always be >= (number of special bins + 2) if specials are defined, otherwise must be >=2.	`100`
`missing_treatment`	`str or dict`	Defines how we treat the missing values present in the data. If a string, it must be one of the following options: separate: Missing values get put in a separate 'Other' bucket: `-1` most_risky: Missing values are put into the bucket containing the largest percentage of Class 1. least_risky: Missing values are put into the bucket containing the largest percentage of Class 0. most_frequent: Missing values are put into the most common bucket. neutral: Missing values are put into the bucket with WoE closest to 0. similar: Missing values are put into the bucket with WoE closest to the bucket with only missing values. passthrough: Leaves missing values untouched. If a dict, it must be of the following format: `{"<column name>": <bucket_number>}` This bucket number is where we will put the missing values.	`'separate'`
`random_state`	`int`	The random state, Passed directly to DecisionTreeClassifier	`None`
`remainder`	`str`	How we want the non-specified columns to be transformed. It must be in ["passthrough", "drop"]. passthrough (Default): all columns that were not specified in "variables" will be passed through. drop: all remaining columns that were not specified in "variables" will be dropped.	`'passthrough'`
`dt_kwargs`		Other parameters passed to DecisionTreeClassifier	`{}`

Source code in skorecard/bucketers/bucketers.py

def __init__(
    self,
    variables=[],
    specials={},
    max_n_bins=100,
    missing_treatment="separate",
    min_bin_size=0.05,
    random_state=None,
    remainder="passthrough",
    get_statistics=True,
    dt_kwargs={},
) -> None:
    """Store the bucketer configuration and validate the tree arguments.

    Args:
        variables (list): Names of the features to bucket. All features are used when left empty.
        specials (dict): Special values that must get their own bucket, in the format
             {"<column name>" : {"name of special bucket" : <list with 1 or more values>}}
            Each feature with special values maps to a dictionary whose keys are bucket names and whose
            values are the unique values belonging to that bucket.
            Special values are left out of the fitting procedure.
        max_n_bins (int): Maximum number of bins after the bucketing. Used as max_leaf_nodes of the
            DecisionTreeClassifier, reduced by the number of special bins when specials are defined.
            Because the DecisionTreeClassifier requires max_leaf_nodes>=2, max_n_bins must be
            >= (number of special bins + 2) when specials are defined, and >=2 otherwise.
        missing_treatment (str or dict): How missing values are handled.
            As a string, one of:
                separate: Missing values get put in a separate 'Other' bucket: `-1`
                most_risky: Missing values go to the bucket with the largest percentage of Class 1.
                least_risky: Missing values go to the bucket with the largest percentage of Class 0.
                most_frequent: Missing values go to the most common bucket.
                neutral: Missing values go to the bucket with WoE closest to 0.
                similar: Missing values go to the bucket with WoE closest to the bucket with only missing values.
                passthrough: Leaves missing values untouched.
            As a dict, it must have the format:
                {"<column name>": <bucket_number>}
                where the bucket number is the bucket that receives the missing values.
        min_bin_size (float): Minimum fraction of observations in a bucket. Used for min_samples_leaf.
        random_state (int): The random state; written into dt_kwargs under the "random_state" key.
        remainder (str): What happens to columns not listed in "variables". Must be in ["passthrough", "drop"].
            passthrough (Default): those columns are passed through.
            drop: those columns are dropped.
        get_statistics (bool): Stored on the instance as-is.
        dt_kwargs (dict): Other parameters passed to DecisionTreeClassifier.
    """  # noqa
    # Which columns to bucket, and what to do with the rest.
    self.variables = variables
    self.remainder = remainder

    # Bucketing behaviour.
    self.specials = specials
    self.missing_treatment = missing_treatment
    self.max_n_bins = max_n_bins
    self.min_bin_size = min_bin_size
    self.get_statistics = get_statistics

    # Decision tree configuration.
    self.random_state = random_state
    self.dt_kwargs = dt_kwargs
    # NOTE(review): this writes into the dict object that was passed in. With
    # the default `dt_kwargs={}` that object is shared by every instance that
    # relies on the default, so the last constructed instance's random_state
    # is visible to all of them.
    self.dt_kwargs.update(random_state=self.random_state)

    check_args(dt_kwargs, DecisionTreeClassifier)

DecisionTreeBucketer

variables_type property ¶

__init__(variables=[], specials={}, max_n_bins=100, missing_treatment='separate', min_bin_size=0.05, random_state=None, remainder='passthrough', get_statistics=True, dt_kwargs={}) ¶

`variables_type` `property` ¶

`__init__(variables=[], specials={}, max_n_bins=100, missing_treatment='separate', min_bin_size=0.05, random_state=None, remainder='passthrough', get_statistics=True, dt_kwargs={})` ¶