Source code for paralytics.preprocessing.transformation

import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

from ..utils import is_numeric

__all__ = [
    'CategoricalBinarizer',
    'CategoricalGrouper',
    'ColumnProjector',
    'ColumnSelector',
    'TypeSelector'
]


class CategoricalBinarizer(BaseEstimator, TransformerMixin):
    """Finds categorical columns with binary-like response and converts them.

    Searches throughout the categorical columns in the DataFrame and finds
    those which contain categories corresponding to the passed boolean
    values only.

    Parameters
    ----------
    keywords_{true, false}: list, optional (default=None)
        List of categories' names corresponding to {True, False} logical
        values.

    Attributes
    ----------
    columns_binarylike_: list
        List of column names that should be mapped to boolean.

    """
    def __init__(self, keywords_true=None, keywords_false=None):
        self.keywords_true = keywords_true
        self.keywords_false = keywords_false

    def fit(self, X, y=None):
        """Fits selection of binary-like columns.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Data with n_samples as its number of samples and n_features as
            its number of features.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        self.columns_binarylike_ = []
        if self.keywords_true is None:
            self.keywords_true = ['yes', 'YES', 'Yes']
        if self.keywords_false is None:
            self.keywords_false = ['no', 'NO', 'No']
        keywords_binarylike = set(self.keywords_true + self.keywords_false)

        for col in X.columns:
            try:
                binarylike_only = \
                    set(X[col].cat.categories) <= keywords_binarylike
            except AttributeError:
                continue
            if binarylike_only:
                self.columns_binarylike_.append(col)

        return self

    def transform(self, X):
        """Applies boolean conversion to binary-like category columns.

        X columns that match the condition of containing only binary-like
        string values are mapped to the boolean values corresponding to the
        passed strings expected to be interpreted as a binary response.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Data with n_samples as its number of samples and n_features as
            its number of features.

        Returns
        -------
        X_new: pd.DataFrame, shape = (n_samples, n_features)
            X data with binary-like category columns substituted with their
            corresponding boolean values.

        """
        try:
            getattr(self, 'columns_binarylike_')
        except AttributeError:
            raise RuntimeError(
                'Could not find the attribute.\nFitting is necessary before '
                'you do the transformation!'
            )
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()!'

        X_new = X.copy()
        dict_true = dict.fromkeys(self.keywords_true, True)
        dict_false = dict.fromkeys(self.keywords_false, False)
        translator_binarylike = {**dict_true, **dict_false}

        for col in self.columns_binarylike_:
            X_new[col] = X[col].map(translator_binarylike)

        return X_new

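# A minimal usage sketch for CategoricalBinarizer (illustrative, not part of
# the original module); the DataFrame below is hypothetical. With the default
# keyword lists, a categorical column containing only 'Yes'/'no'-style
# answers is detected in fit() and mapped to booleans in transform():
#
#   >>> df = pd.DataFrame({'smoker': pd.Categorical(['Yes', 'no', 'YES'])})
#   >>> binarizer = CategoricalBinarizer()
#   >>> binarizer.fit_transform(df)['smoker'].tolist()
#   [True, False, True]
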

class CategoricalGrouper(BaseEstimator, TransformerMixin):
    """Groups sparse observations in categorical columns into one category.

    Parameters
    ----------
    method: string {'freq'}, optional (default='freq')
        The sparse categories grouping method:

        - `freq`:
            Counts the frequency against each category. Retains categories
            whose cumulative share (with respect to descending sort) in the
            total dataset is equal or higher than the percentile threshold.

    percentile_thresh: float, optional (default=.05)
        Defines the percentile threshold for the 'freq' method.

    new_cat: string or int, optional (default='Other')
        Specifies the category name that will be imputed to the chosen
        sparse observations.

    include_cols: list, optional (default=None)
        Specifies column names that should be treated like categorical
        features. If None then the estimator is executed only on the
        automatically selected categorical columns.

    exclude_cols: list, optional (default=None)
        Specifies categorical column names that should not be treated like
        categorical features. If None then no column is excluded from the
        transformation.

    Attributes
    ----------
    cat_cols_: list
        List of categorical columns in a given dataset.

    imp_cats_: dict
        Dictionary that keeps track of replaced category names with the new
        category for every feature in the dataset.

    """
    def __init__(self, method='freq', percentile_thresh=.05, new_cat='Other',
                 include_cols=None, exclude_cols=None):
        self.method = method
        self.percentile_thresh = percentile_thresh
        self.new_cat = new_cat
        self.include_cols = include_cols
        self.exclude_cols = exclude_cols

    def fit(self, X, y=None):
        """Fits grouping to X using the given method.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()!'
        assert len(X) > 0, 'Input data can not be empty!'

        self.cat_cols_ = self._cat_cols_selection(
            X, self.include_cols, self.exclude_cols
        )
        self.imp_cats_ = {}
        if self.method == 'freq':
            for col in self.cat_cols_:
                tracker, i = 0, 0
                sorted_series = X[col].value_counts(normalize=True)
                while tracker < 1 - self.percentile_thresh:
                    tracker += sorted_series.iloc[i]
                    i += 1
                sparse_cats = sorted_series.index[i:].tolist()
                if len(sparse_cats) > 1:
                    self.imp_cats_[col] = sparse_cats
                else:
                    self.imp_cats_[col] = []

        return self

    def transform(self, X):
        """Applies grouping of sparse categories to X.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Data with n_samples as its number of samples.

        Returns
        -------
        X_new: pd.DataFrame, shape = (n_samples, n_features)
            X data with sparse categories substituted with new_cat.

        """
        try:
            getattr(self, 'imp_cats_')
            getattr(self, 'cat_cols_')
        except AttributeError:
            raise RuntimeError('Could not find the attribute.\n'
                               'Fitting is necessary before you do '
                               'the transformation.')
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()'

        X_new = X.copy()
        for col in self.cat_cols_:
            row_indices = X_new[col].isin(self.imp_cats_[col])
            if X_new[col].dtype.name == 'category':
                try:
                    X_new[col].cat.add_categories(self.new_cat, inplace=True)
                except ValueError as e:
                    raise ValueError(
                        'You need to specify different "new_cat" value, '
                        'because the current one is already included in the '
                        'category names.'
                    ).with_traceback(e.__traceback__)
                cat_removals = list(
                    set(self.imp_cats_[col]).intersection(
                        X_new[col].cat.categories
                    )
                )
                X_new[col].cat.remove_categories(
                    cat_removals, inplace=True
                )
            X_new.loc[row_indices, col] = self.new_cat

        return X_new

    @staticmethod
    def _cat_cols_selection(X, include, exclude):
        """Returns categorical columns including the user's corrections."""
        cat_cols = X.select_dtypes('category').columns.tolist()

        if include is not None:
            assert isinstance(include, list), \
                'Columns to include must be given as an instance of a list!'
            cat_cols = [
                col for col in X.columns
                if col in cat_cols or col in include
            ]

        if exclude is not None:
            assert isinstance(exclude, list), \
                'Columns to exclude must be given as an instance of a list!'
            cat_cols = [col for col in cat_cols if col not in exclude]

        return cat_cols

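# A minimal usage sketch for CategoricalGrouper (illustrative, not part of
# the original module); the 'city' column below is hypothetical. With the
# default 'freq' method and percentile_thresh=.05, categories outside the
# top 95% cumulative share are merged into 'Other'. Note that transform()
# relies on the `inplace=True` flag of the pandas categorical accessor,
# which has been removed in recent pandas, so this sketch assumes an older
# pandas release:
#
#   >>> df = pd.DataFrame({'city': pd.Categorical(
#   ...     ['NY'] * 90 + ['LA'] * 6 + ['SF'] * 2 + ['DC'] * 2)})
#   >>> grouper = CategoricalGrouper()
#   >>> grouper.fit_transform(df)['city'].value_counts().to_dict()
#   {'NY': 90, 'LA': 6, 'Other': 4}
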

class ColumnProjector(BaseEstimator, TransformerMixin):
    """Projects variable types onto basic dtypes.

    If not specified, projects numeric features onto float, boolean onto
    bool and categorical onto 'category' dtypes.

    Parameters
    ----------
    manual_projection: dictionary, optional (default=None)
        Dictionary where keys are dtype names onto which specified columns
        will be projected and values are lists containing names of variables
        to be projected onto the given dtype. Example usage:

        >>> manual_projection = {
        ...     float: ['foo', 'bar'],
        ...     'category': ['baz'],
        ...     int: ['qux'],
        ...     bool: ['quux']
        ... }

    num_to_float: boolean, optional (default=True)
        Specifies whether numerical variables should be projected onto float
        (if True) or onto int (if False).

    Attributes
    ----------
    automatic_projection_: dict
        Dictionary where keys are the dtype names onto which the
        automatically chosen columns will be projected (when
        manual_projection is specified, the manual assignment is decisive).

    """
    def __init__(self, manual_projection=None, num_to_float=True):
        self.manual_projection = manual_projection
        self.num_to_float = num_to_float

    def fit(self, X, y=None):
        """Fits corresponding dtypes to X.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        self.automatic_projection_ = {'category': [], bool: []}

        if self.num_to_float:
            self.automatic_projection_[float] = []
        else:
            self.automatic_projection_[int] = []

        for col in X.columns:
            if self.num_to_float and is_numeric(X[col]):
                self.automatic_projection_[float].append(col)
            elif is_numeric(X[col]):
                self.automatic_projection_[int].append(col)
            elif set(X[col]) <= {0, 1}:
                self.automatic_projection_[bool].append(col)
            else:
                self.automatic_projection_['category'].append(col)

        return self

    def transform(self, X):
        """Applies variable projection to X.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            New data with n_samples as its number of samples.

        Returns
        -------
        X_new: pd.DataFrame, shape = (n_samples, n_features)
            X data with values projected onto the specified dtypes.

        """
        try:
            getattr(self, 'automatic_projection_')
        except AttributeError:
            raise RuntimeError(
                'Could not find the attribute.\nFitting is necessary before '
                'you do the transformation!'
            )
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()'

        X_new, columns_projected = self._project(X, self.manual_projection)
        X_new, _ = self._project(
            X_new, self.automatic_projection_, skip_columns=columns_projected
        )

        return X_new

    @staticmethod
    def _project(X, projection_dict, skip_columns=None):
        """Projects X in accordance with the guidelines provided."""
        X_new = X.copy()
        columns_projected = []
        if skip_columns is None:
            skip_columns = []

        if projection_dict is not None:
            assert isinstance(projection_dict, dict), \
                'projection_dict must be an instance of the dictionary!'
            for col_type, col_names in projection_dict.items():
                assert isinstance(col_names, list), (
                    'Values of projection_dict must be an instance '
                    'of the list!'
                )
                cols_to_project = [
                    col for col in col_names if col not in skip_columns
                ]
                if cols_to_project:
                    try:
                        X_new[cols_to_project] = (
                            X_new[cols_to_project].astype(col_type)
                        )
                    except KeyError:
                        cols_error = list(
                            set(cols_to_project) - set(X_new.columns)
                        )
                        raise KeyError(
                            'Columns not found in the DataFrame: %s'
                            % cols_error
                        )
                columns_projected.extend(cols_to_project)

        return X_new, columns_projected

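# A minimal usage sketch for ColumnProjector (illustrative, not part of the
# original module); the DataFrame and the behaviour of the is_numeric helper
# are assumptions here (a numeric column is expected to be reported as
# numeric, a string column is not). Numeric columns are cast to float and
# the remaining ones to 'category':
#
#   >>> df = pd.DataFrame({'age': [21, 35, 48], 'city': ['NY', 'LA', 'NY']})
#   >>> projector = ColumnProjector()
#   >>> projector.fit_transform(df).dtypes.astype(str).to_dict()
#   {'age': 'float64', 'city': 'category'}
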

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Limits the X to selected columns.

    Parameters
    ----------
    columns: list
        List of column names selected to be left.

    References
    ----------
    [1] J. Ramey, `Building Scikit-Learn Pipelines With Pandas DataFrame
    <https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/>`_,
    April 16, 2018

    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Fits columns selection to X.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        return self

    def transform(self, X):
        """Applies columns selection to X.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            New data with n_samples as its number of samples.

        Returns
        -------
        X_new: pd.DataFrame, shape = (n_samples, n_features)
            X data limited to the selected columns only.

        """
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame!'
        try:
            X_new = X[self.columns]
            return X_new
        except KeyError as e:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError(
                'Selected columns not found in the DataFrame: %s'
                % cols_error
            ).with_traceback(e.__traceback__)

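# A minimal usage sketch for ColumnSelector (illustrative, not part of the
# original module); the column names are hypothetical. Only the listed
# columns are kept, which makes the transformer handy as the first step of a
# scikit-learn Pipeline:
#
#   >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
#   >>> ColumnSelector(columns=['a', 'c']).fit_transform(df).columns.tolist()
#   ['a', 'c']
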

class TypeSelector(BaseEstimator, TransformerMixin):
    """Limits the X to selected types.

    Parameters
    ----------
    col_type: string or list-like
        Names of types to be selected.

    References
    ----------
    [1] J. Ramey, `Building Scikit-Learn Pipelines With Pandas DataFrame
    <https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/>`_,
    April 16, 2018

    """
    def __init__(self, col_type):
        self.col_type = col_type

    def fit(self, X, y=None):
        """Fits types selection to X.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        return self

    def transform(self, X):
        """Applies types selection to X.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            New data with n_samples as its number of samples.

        Returns
        -------
        X_new: pd.DataFrame, shape = (n_samples, n_features)
            X data limited to the selected types only.

        """
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()'
        X_new = X.select_dtypes(include=[self.col_type])

        return X_new

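# A minimal usage sketch for TypeSelector (illustrative, not part of the
# original module); the DataFrame is hypothetical. Passing a dtype name
# accepted by pandas.DataFrame.select_dtypes, e.g. 'number', keeps only the
# matching columns:
#
#   >>> df = pd.DataFrame({'age': [21, 35], 'city': ['NY', 'LA']})
#   >>> TypeSelector('number').fit_transform(df).columns.tolist()
#   ['age']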