import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from ..utils import is_numeric
# Public API of this module: the names exported by a star-import.
# Each entry is a transformer class defined below.
__all__ = [
    'CategoricalBinarizer',
    'CategoricalGrouper',
    'ColumnProjector',
    'ColumnSelector',
    'TypeSelector'
]
class CategoricalBinarizer(BaseEstimator, TransformerMixin):
    """Finds categorical columns with binary-like response and converts them.

    Searches throughout the categorical columns in the DataFrame and finds
    those which contain categories corresponding to the passed boolean values
    only.

    Parameters
    ----------
    keywords_{true, false}: list, optional (default=None)
        List of categories' names corresponding to {True, False} logical
        values. When left as None, ``fit`` substitutes
        ``['yes', 'YES', 'Yes']`` and ``['no', 'NO', 'No']`` respectively.

    Attributes
    ----------
    columns_binarylike_: list
        List of column names that should be mapped to boolean.
    """

    def __init__(self, keywords_true=None, keywords_false=None):
        self.keywords_true = keywords_true
        self.keywords_false = keywords_false

    def fit(self, X, y=None):
        """Fits selection of binary-like columns.

        A column is selected when it exposes the pandas ``.cat`` accessor and
        every one of its categories is listed in ``keywords_true`` or
        ``keywords_false``.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Data with n_samples as its number of samples and n_features as its
            number of features.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.
        """
        self.columns_binarylike_ = []

        # NOTE(review): the defaults are written back onto the constructor
        # parameters. This is kept on purpose because code outside this view
        # (e.g. a transform step) may read them after fitting -- confirm
        # before switching to fit-local variables.
        if self.keywords_true is None:
            self.keywords_true = ['yes', 'YES', 'Yes']
        if self.keywords_false is None:
            self.keywords_false = ['no', 'NO', 'No']

        # Set union accepts any pair of iterables (list, tuple, set, ...),
        # whereas ``list + list`` fails on mixed container types.
        keywords_binarylike = (
            set(self.keywords_true) | set(self.keywords_false)
        )

        for col in X.columns:
            try:
                categories = X[col].cat.categories
            except AttributeError:
                # No ``.cat`` accessor: the column is not categorical.
                continue
            if set(categories) <= keywords_binarylike:
                self.columns_binarylike_.append(col)

        return self
 
class CategoricalGrouper(BaseEstimator, TransformerMixin):
    """Groups sparse observations in a categorical columns into one category.

    Parameters
    ----------
    method: string {'freq'}, optional (default='freq')
        The sparse categories grouping method:

        - `freq`:
          Counts the frequency against each category. Retains categories
          whose cumulative share (with respect to descending sort) in the
          total dataset is equal or higher than the percentile threshold.

        For any other value ``fit`` records no categories to replace.

    percentile_thresh: float, optional (default=.05)
        Defines the percentile threshold for 'freq' method.

    new_cat: string or int, optional (default='Other')
        Specifies the category name that will be imputed to the chosen sparse
        observations.

    include_cols: list, optional (default=None)
        Specifies column names that should be treated like categorical
        features. If None then estimator is executed only on the automatically
        selected categorical columns.

    exclude_cols: list, optional (default=None)
        Specifies categorical column names that should not be treated like
        categorical features. If None then no column is excluded from
        transformation.

    Attributes
    ----------
    cat_cols_: list
        List of categorical columns in a given dataset.

    imp_cats_: dict
        Dictionary that keeps track of replaced category names with the new
        category for every feature in the dataset.
    """

    def __init__(self, method='freq', percentile_thresh=.05, new_cat='Other',
                 include_cols=None, exclude_cols=None):
        self.method = method
        self.percentile_thresh = percentile_thresh
        self.new_cat = new_cat
        self.include_cols = include_cols
        self.exclude_cols = exclude_cols

    def fit(self, X, y=None):
        """Fits grouping with X by using given method.

        For the 'freq' method the normalized category frequencies of every
        selected column are accumulated, most frequent first, until the
        running share reaches ``1 - percentile_thresh``. The categories left
        over are stored as sparse, but only when there is more than one of
        them; otherwise an empty list is stored for the column.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        Raises
        ------
        AssertionError
            If X is not a pandas DataFrame, if X is empty, or if
            ``include_cols`` / ``exclude_cols`` is given but is not a list.
        """
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()!'
        assert len(X) > 0, 'Input data can not be empty!'

        self.cat_cols_ = self._cat_cols_selection(
            X, self.include_cols, self.exclude_cols
        )
        self.imp_cats_ = {}

        if self.method == 'freq':
            share_to_keep = 1 - self.percentile_thresh
            for col in self.cat_cols_:
                # value_counts sorts descending by default.
                sorted_series = X[col].value_counts(normalize=True)
                n_cats = len(sorted_series)
                tracker, i = 0, 0
                # The ``i < n_cats`` bound prevents an IndexError when the
                # column has no counted values, or when floating point
                # rounding keeps the running sum just below the target.
                while i < n_cats and tracker < share_to_keep:
                    tracker += sorted_series.iloc[i]
                    i += 1
                sparse_cats = sorted_series.index[i:].tolist()
                # A single leftover category is not worth renaming.
                if len(sparse_cats) > 1:
                    self.imp_cats_[col] = sparse_cats
                else:
                    self.imp_cats_[col] = []

        return self

    @staticmethod
    def _cat_cols_selection(X, include, exclude):
        """Returns categorical columns including the user's corrections.

        Starts from the columns of 'category' dtype, adds the columns of X
        named in ``include`` (result follows the column order of X) and then
        drops the ones named in ``exclude``. Names in ``include`` that are
        not columns of X are ignored.
        """
        cat_cols = X.select_dtypes('category').columns.tolist()
        if include is not None:
            assert isinstance(include, list), \
                'Columns to include must be given as an instance of a list!'
            cat_cols = [
                col for col in X.columns
                if col in cat_cols or col in include
            ]
        if exclude is not None:
            assert isinstance(exclude, list), \
                'Columns to exclude must be given as an instance of a list!'
            cat_cols = [col for col in cat_cols if col not in exclude]
        return cat_cols
class ColumnProjector(BaseEstimator, TransformerMixin):
    """Projects variable types onto basic dtypes.

    If not specified projects numeric features onto float, boolean onto bool
    and categorical onto 'category' dtypes.

    Parameters
    ----------
    manual_projection: dictionary, optional (default=None)
        Dictionary where keys are dtype names onto which specified columns
        will be projected and values are lists containing names of variables to
        be projected onto given dtype. Example usage:

        >>> manual_projection = {
        >>>    float: ['foo', 'bar'],
        >>>    'category': ['baz'],
        >>>    int: ['qux'],
        >>>    bool: ['quux']
        >>> }

    num_to_float: boolean, optional (default=True)
        Specifies whether numerical variables should be projected onto float
        (if True) or onto int (if False).

    Attributes
    ----------
    automatic_projection_: dict
        Dictionary where key is the dtype name onto which specified columns
        will be projected chosen automatically (when manual_projection is
        specified then this manual assignment is decisive).
    """

    def __init__(self, manual_projection=None, num_to_float=True):
        self.manual_projection = manual_projection
        self.num_to_float = num_to_float

    def fit(self, X, y=None):
        """Fits corresponding dtypes to X.

        Each column is assigned to exactly one target: the numeric dtype
        (float or int, depending on ``num_to_float``) when ``is_numeric``
        accepts it, bool when its set of values is a subset of {0, 1}, and
        'category' otherwise.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.
        """
        numeric_type = float if self.num_to_float else int
        self.automatic_projection_ = {
            'category': [],
            bool: [],
            numeric_type: [],
        }

        for col in X.columns:
            # is_numeric is evaluated once per column; presumably it is a
            # pure predicate on the Series -- verify against ..utils.
            if is_numeric(X[col]):
                target = numeric_type
            elif set(X[col]) <= {0, 1}:
                target = bool
            else:
                target = 'category'
            self.automatic_projection_[target].append(col)

        return self

    @staticmethod
    def _project(X, projection_dict, skip_columns=None):
        """Projects X in accordance with the guidelines provided.

        Parameters
        ----------
        X: pd.DataFrame
            Data to cast; it is copied, never modified in place.

        projection_dict: dict or None
            Maps a target dtype to the list of column names to cast to it.
            None means nothing is cast.

        skip_columns: list, optional (default=None)
            Column names that must be left untouched even when listed in
            ``projection_dict``.

        Returns
        -------
        X_new: pd.DataFrame
            Copy of X with the requested columns cast.

        columns_projected: list
            Names of the columns that were cast.

        Raises
        ------
        AssertionError
            If ``projection_dict`` is not a dict or one of its values is not
            a list.

        KeyError
            If the cast fails with a KeyError; the message lists the
            requested columns that are absent from X.
        """
        X_new = X.copy()
        columns_projected = []
        if skip_columns is None:
            skip_columns = []
        if projection_dict is not None:
            assert isinstance(projection_dict, dict), \
                'projection_dict must be an instance of the dictionary!'
            for col_type, col_names in projection_dict.items():
                assert isinstance(col_names, list), (
                    'Values of projection_dict must be an instance '
                    'of the list!'
                )
                cols_to_project = [
                    col for col in col_names if col not in skip_columns
                ]
                if cols_to_project:
                    try:
                        X_new[cols_to_project] = (
                            X_new[cols_to_project].astype(col_type)
                        )
                    except KeyError:
                        # Re-raise with only the missing column names.
                        cols_error = list(
                            set(cols_to_project) - set(X_new.columns)
                        )
                        raise KeyError("C'mon, those columns ain't in "
                                       "the DataFrame: %s" % cols_error)
                    columns_projected.extend(cols_to_project)
        return X_new, columns_projected
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Limits the X to selected columns.

    Parameters
    ----------
    columns: list
        List of column names selected to be left.

    References
    ----------
    [1] J. Ramey, `Building Scikit-Learn Pipelines With Pandas DataFrame
    <https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/>`_,
    April 16, 2018
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Fits columns selection to X.

        Nothing is computed here: the method returns the instance unchanged.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: Ignore

        Returns
        -------
        self: object
            Returns the instance itself.
        """
        return self
 
class TypeSelector(BaseEstimator, TransformerMixin):
    """Limits the X to selected types.

    Parameters
    ----------
    col_type: string or list-like
        Names of types to be selected.

    References
    ----------
    [1] J. Ramey, `Building Scikit-Learn Pipelines With Pandas DataFrame
    <https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/>`_,
    April 16, 2018
    """

    def __init__(self, col_type):
        self.col_type = col_type

    def fit(self, X, y=None):
        """Fits types selection to X.

        Nothing is computed here: the method returns the instance unchanged.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.
        """
        return self