SkorecardPipeline

Bases: Pipeline, PlotBucketMethod, BucketTableMethod, SummaryMethod

A sklearn Pipeline with several attributes and methods added.

This Pipeline of bucketers behaves more like a bucketer and adds:

.summary(): See which columns are bucketed
.plot_bucket(): Plot buckets of a column
.bucket_table(): Table with buckets of a column
.save_yml(): Save information necessary for bucketing to a YAML file
.features_bucket_mapping_: Access bucketing information
.fit_interactive(): Edit fitted buckets interactively in a dash app

from skorecard.pipeline.pipeline import SkorecardPipeline
from skorecard.bucketers import DecisionTreeBucketer, OrdinalCategoricalBucketer
from skorecard import datasets

# One bucketer per group of columns; no column is listed twice.
tree_bucketer = DecisionTreeBucketer(variables=["LIMIT_BAL", "BILL_AMT1"], max_n_bins=5)
ordinal_bucketer = OrdinalCategoricalBucketer(variables=["EDUCATION", "MARRIAGE"], tol=0.05)

pipe = SkorecardPipeline(
    [
        ("decisiontreebucketer", tree_bucketer),
        ("ordinalcategoricalbucketer", ordinal_bucketer),
    ]
)

# Load the example data set and keep only the columns that get bucketed.
df = datasets.load_uci_credit_card(as_frame=True)
features = ["LIMIT_BAL", "BILL_AMT1", "EDUCATION", "MARRIAGE"]
X = df[features]
y = df["default"].values

# Fit all bucketers, then inspect the buckets found for one column.
pipe.fit(X, y)
pipe.bucket_table("LIMIT_BAL")

Source code in skorecard/pipeline/pipeline.py

class SkorecardPipeline(Pipeline, PlotBucketMethod, BucketTableMethod, SummaryMethod):
    """
    A sklearn Pipeline with several attributes and methods added.

    This Pipeline of bucketers behaves more like a bucketer and adds:

    - `.summary()`: See which columns are bucketed
    - `.plot_bucket()`: Plot buckets of a column
    - `.bucket_table()`: Table with buckets of a column
    - `.save_yml()`: Save information necessary for bucketing to a YAML file
    - `.features_bucket_mapping_`: Access bucketing information
    - `.fit_interactive()`: Edit fitted buckets interactively in a dash app

    ```python
    from skorecard.pipeline.pipeline import SkorecardPipeline
    from skorecard.bucketers import DecisionTreeBucketer, OrdinalCategoricalBucketer
    from skorecard import datasets

    pipe = SkorecardPipeline([
        ('decisiontreebucketer', DecisionTreeBucketer(variables=["LIMIT_BAL", "BILL_AMT1"], max_n_bins=5)),
        ('ordinalcategoricalbucketer', OrdinalCategoricalBucketer(variables=["EDUCATION", "MARRIAGE"], tol=0.05)),
    ])

    df = datasets.load_uci_credit_card(as_frame=True)
    features = ["LIMIT_BAL", "BILL_AMT1", "EDUCATION", "MARRIAGE"]
    X = df[features]
    y = df["default"].values

    pipe.fit(X, y)
    pipe.bucket_table('LIMIT_BAL')
    ```
    """

    def __init__(self, steps, *, memory=None, verbose=False):
        """
        Wraps sklearn Pipeline.

        The arguments are forwarded unchanged to the parent constructor. After
        that, the pipeline is validated: every step must be a skorecard
        bucketer and no feature may be bucketed by more than one step.

        Args:
            steps: forwarded to the parent constructor as `steps`.
            memory: forwarded to the parent constructor as `memory`.
            verbose: forwarded to the parent constructor as `verbose`.
        """
        super().__init__(steps=steps, memory=memory, verbose=verbose)
        self._check_pipeline_all_bucketers(self)
        self._check_pipeline_duplicated_columns(self)

    @property
    def features_bucket_mapping_(self):
        """
        Retrieve features bucket mapping.

        Requires the last step of the pipeline to be fitted.
        """
        check_is_fitted(self.steps[-1][1])
        return get_features_bucket_mapping(Pipeline(self.steps))

    @property
    def bucket_tables_(self):
        """
        Retrieve bucket tables.

        Merges the `bucket_tables_` of every step into one dict; on a key
        clash the later step wins. Requires the last step to be fitted.

        Used by .bucket_table()
        """
        check_is_fitted(self.steps[-1][1])
        bucket_tables = {}
        for _, bucketer in self.steps:
            bucket_tables.update(bucketer.bucket_tables_)
        return bucket_tables

    @property
    def summary_dict_(self) -> Dict:
        """
        Retrieve summary_dicts and combine.

        Merges the `summary_dict_` of every step into one dict; on a key
        clash the later step wins.

        Used by .summary()
        """
        summary_dict = {}
        for _, bucketer in self.steps:
            summary_dict.update(bucketer.summary_dict_)
        return summary_dict

    def save_yml(self, fout):
        """
        Save the features bucket to a yaml file.

        Requires the last step of the pipeline to be fitted.

        Args:
            fout: file output
        """
        check_is_fitted(self.steps[-1][1])
        self.features_bucket_mapping_.save_yml(fout)

    @staticmethod
    def _check_pipeline_duplicated_columns(pipeline: Pipeline) -> None:
        """
        Check that no feature is bucketed by more than one step.

        Works on fitted and unfitted pipelines alike: a fitted step is
        inspected through `variables_`, an unfitted one through `variables`.
        Steps lacking the relevant attribute are ignored. An empty variable
        list is interpreted as "this bucketer buckets all columns".

        Args:
            pipeline: scikit-learn pipeline.

        Raises:
            BucketingPipelineError: if a bucket-everything step is combined
                with steps bucketing specific features, or if a feature is
                listed by more than one step.
        """
        assert isinstance(pipeline, Pipeline)

        bucketers_vars = []
        bucketers_on_all = []
        bucketers_with_vars = []

        for step in _get_all_steps(pipeline):
            # Fitted bucketers expose the resolved columns as `variables_`;
            # before fitting only the constructor argument is available.
            attr = "variables_" if is_fitted(step) else "variables"
            if not hasattr(step, attr):
                continue

            step_vars = getattr(step, attr)
            if len(step_vars) == 0:
                bucketers_vars.append("**all**")
                bucketers_on_all.append(step)
            else:
                bucketers_vars += step_vars
                bucketers_with_vars.append(step)

        unique_vars = set(bucketers_vars)

        if len(unique_vars) > 1 and "**all**" in unique_vars:
            msg = (
                "A SkorecardPipeline should bucket each feature only once.\n"
                f"These bucketers bucket all features: {bucketers_on_all}\n"
                f"While these bucket specific ones: {bucketers_with_vars}\n"
                "This means some features would have been bucketed sequentially. "
                "To solve this, either use a BucketingProcess, or remove the duplicates from one of the bucketers. "
                "Remember that if you don't specify 'variables', a bucketer will bucket all columns."
            )
            raise BucketingPipelineError(msg)

        if len(unique_vars) != len(bucketers_vars):
            values, counts = np.unique(bucketers_vars, return_counts=True)
            # .tolist() yields plain Python values in sorted order, so the
            # error message is deterministic and free of NumPy scalar reprs.
            duplicates = values[counts > 1].tolist()

            msg = (
                "A SkorecardPipeline should bucket each feature only once. "
                f"The features {duplicates} appear in multiple bucketers, "
                "meaning they would have been bucketed sequentially. "
                "To solve this, either use a BucketingProcess, or remove the duplicates from one of the bucketers. "
                "Remember that if you don't specify 'variables', a bucketer will bucket all columns."
            )
            raise BucketingPipelineError(msg)

    @staticmethod
    def _check_pipeline_all_bucketers(pipeline: Pipeline) -> None:
        """
        Ensure all specified bucketing steps are skorecard bucketers.

        A step is accepted when the string form of its type contains
        "bucketing_process" or "skorecard.bucketers".

        Args:
            pipeline: scikit-learn pipeline.

        Raises:
            NotBucketObjectError: if any step is not recognised as a bucketer.
        """
        assert isinstance(pipeline, Pipeline)

        accepted_markers = ["bucketing_process", "skorecard.bucketers"]
        for step in _get_all_steps(pipeline):
            type_repr = str(type(step))
            if all(marker not in type_repr for marker in accepted_markers):
                msg = "All bucketing steps must be skorecard bucketers. "
                msg += f"Remove {step} from the pipeline."
                raise NotBucketObjectError(msg)

    def fit_interactive(self, X, y=None, mode="external"):
        """
        Fit a bucketer and then interactively edit the fit using a dash app.

        Note we are using a [jupyterdash](https://medium.com/plotly/introducing-jupyterdash-811f1f57c02e) app,
        which supports 3 different modes:

        - 'external' (default): Start dash server and print URL
        - 'inline': Start dash app inside an Iframe in the jupyter notebook
        - 'jupyterlab': Start dash app as a new tab inside jupyterlab

        Args:
            X: data passed to `.fit()` (when not yet fitted) and to the app callbacks.
            y: target passed to `.fit()` (when not yet fitted) and to the app callbacks.
            mode: passed on to the app's `run_server`; see the list above.
        """
        # We need to make sure we only fit if not already fitted
        # This prevents a user losing manually defined boundaries
        # when re-running .fit_interactive()
        if not is_fitted(self):
            self.fit(X, y)

        self.app = JupyterDash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
        add_basic_layout(self)
        add_bucketing_callbacks(self, X, y)
        self.app.run_server(mode=mode)

    def _update_column_fit(self, X, y, feature, special, splits, right, generate_summary=False):
        """
        Extract out part of the fit for a column.

        Useful when we want to interactively update the fit. The call is
        delegated, with all arguments unchanged, to every step whose
        `variables` contains `feature`.
        """
        # NOTE(review): matching uses `variables` (not `variables_`), so a
        # step with an empty `variables` list never matches here - confirm
        # that this is intended.
        for _, bucketer in self.steps:
            if feature in bucketer.variables:
                bucketer._update_column_fit(
                    X=X,
                    y=y,
                    feature=feature,
                    special=special,
                    splits=splits,
                    right=right,
                    generate_summary=generate_summary,
                )

`bucket_tables_` `property` ¶

Retrieve bucket tables.

Used by .bucket_table()

`features_bucket_mapping_` `property` ¶

Retrieve features bucket mapping.

`summary_dict_: Dict` `property` ¶

Retrieve summary_dicts and combine.

Used by .summary()

`__init__(steps, *, memory=None, verbose=False)` ¶

Wraps sklearn Pipeline.

Source code in skorecard/pipeline/pipeline.py

def __init__(self, steps, *, memory=None, verbose=False):
    """
    Wraps sklearn Pipeline.

    Forwards `steps`, `memory` and `verbose` to the parent constructor and
    then runs the two pipeline checks (all steps are bucketers, no duplicated
    columns) on the newly built pipeline.
    """
    super().__init__(steps=steps, memory=memory, verbose=verbose)
    self._check_pipeline_all_bucketers(self)
    self._check_pipeline_duplicated_columns(self)

`fit_interactive(X, y=None, mode='external')` ¶

Fit a bucketer and then interactively edit the fit using a dash app.

Note we are using a jupyterdash app, which supports 3 different modes:

'external' (default): Start dash server and print URL
'inline': Start dash app inside an Iframe in the jupyter notebook
'jupyterlab': Start dash app as a new tab inside jupyterlab

Source code in skorecard/pipeline/pipeline.py

def fit_interactive(self, X, y=None, mode="external"):
    """
    Fit a bucketer and then interactively edit the fit using a dash app.

    Note we are using a [jupyterdash](https://medium.com/plotly/introducing-jupyterdash-811f1f57c02e) app,
    which supports 3 different modes:

    - 'external' (default): Start dash server and print URL
    - 'inline': Start dash app inside an Iframe in the jupyter notebook
    - 'jupyterlab': Start dash app as a new tab inside jupyterlab

    Args:
        X: data passed to `.fit()` (when not yet fitted) and to the app callbacks.
        y: target passed to `.fit()` (when not yet fitted) and to the app callbacks.
        mode: passed on to the app's `run_server`; see the list above.
    """
    # We need to make sure we only fit if not already fitted
    # This prevents a user losing manually defined boundaries
    # when re-running .fit_interactive()
    if not is_fitted(self):
        self.fit(X, y)

    # The app is stored on the instance; the layout and callback helpers
    # receive `self`.
    self.app = JupyterDash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
    add_basic_layout(self)
    add_bucketing_callbacks(self, X, y)
    self.app.run_server(mode=mode)

`save_yml(fout)` ¶

Save the features bucket to a yaml file.

Parameters:

Name	Type	Description	Default
`fout`		file output	required

Source code in skorecard/pipeline/pipeline.py

def save_yml(self, fout):
    """
    Save the features bucket to a yaml file.

    The last step of the pipeline is passed to `check_is_fitted` first; the
    actual writing is delegated to `features_bucket_mapping_.save_yml`.

    Args:
        fout: file output
    """
    check_is_fitted(self.steps[-1][1])
    self.features_bucket_mapping_.save_yml(fout)

SkorecardPipeline

bucket_tables_ property ¶

features_bucket_mapping_ property ¶

summary_dict_: Dict property ¶

__init__(steps, *, memory=None, verbose=False) ¶

fit_interactive(X, y=None, mode='external') ¶

save_yml(fout) ¶

`bucket_tables_` `property` ¶

`features_bucket_mapping_` `property` ¶

`summary_dict_: Dict` `property` ¶

`__init__(steps, *, memory=None, verbose=False)` ¶

`fit_interactive(X, y=None, mode='external')` ¶

`save_yml(fout)` ¶