import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from ..utils import is_numeric
# Public names of this module: the five transformer classes defined below.
__all__ = [
'CategoricalBinarizer',
'CategoricalGrouper',
'ColumnProjector',
'ColumnSelector',
'TypeSelector'
]
class CategoricalBinarizer(BaseEstimator, TransformerMixin):
    """Finds categorical columns with binary-like response and converts them.

    Searches throughout the categorical columns in the DataFrame and finds
    those whose categories all belong to the passed boolean keywords.

    Parameters
    ----------
    keywords_{true, false}: iterable, optional (default=None)
        Categories' names corresponding to {True, False} logical values.
        When None, ``fit`` substitutes ``['yes', 'YES', 'Yes']`` and
        ``['no', 'NO', 'No']`` respectively.

    Attributes
    ----------
    columns_binarylike_: list
        List of column names that should be mapped to boolean.

    """

    def __init__(self, keywords_true=None, keywords_false=None):
        self.keywords_true = keywords_true
        self.keywords_false = keywords_false

    def fit(self, X, y=None):
        """Fits selection of binary-like columns.

        A column is selected when it exposes the ``.cat`` accessor and every
        one of its categories is listed among the true/false keywords.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Data with n_samples as its number of samples and n_features as its
            number of features.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        self.columns_binarylike_ = []

        # NOTE(review): overwriting constructor parameters inside fit departs
        # from the usual scikit-learn convention; it is kept as-is because
        # code outside this block may rely on these attributes being
        # non-None after fitting -- confirm before changing.
        if self.keywords_true is None:
            self.keywords_true = ['yes', 'YES', 'Yes']
        if self.keywords_false is None:
            self.keywords_false = ['no', 'NO', 'No']

        # A set union (rather than ``+``) accepts any pair of iterables,
        # e.g. a list together with a tuple.
        keywords_binarylike = (
            set(self.keywords_true) | set(self.keywords_false)
        )

        for col in X.columns:
            # Only the accessor lookup is guarded: it raises AttributeError
            # when the column is not of categorical dtype.
            try:
                categories = X[col].cat.categories
            except AttributeError:
                continue

            if set(categories) <= keywords_binarylike:
                self.columns_binarylike_.append(col)

        return self
class CategoricalGrouper(BaseEstimator, TransformerMixin):
    """Groups sparse observations in a categorical columns into one category.

    Parameters
    ----------
    method: string {'freq'}, optional (default='freq')
        The sparse categories grouping method:

        - `freq`:
            Counts the frequency against each category. Retains categories
            whose cumulative share (with respect to descending sort) in the
            total dataset is equal or higher than the percentile threshold.

    percentile_thresh: float, optional (default=.05)
        Defines the percentile threshold for 'freq' method.

    new_cat: string or int, optional (default='Other')
        Specifies the category name that will be imputed to the chosen sparse
        observations.

    include_cols: list, optional (default=None)
        Specifies column names that should be treated like categorical
        features. If None then estimator is executed only on the automatically
        selected categorical columns.

    exclude_cols: list, optional (default=None)
        Specifies categorical column names that should not be treated like
        categorical features. If None then no column is excluded from
        transformation.

    Attributes
    ----------
    cat_cols_: list
        List of categorical columns in a given dataset.

    imp_cats_: dict
        Dictionary that keeps track of replaced category names with the new
        category for every feature in the dataset.

    """

    def __init__(self, method='freq', percentile_thresh=.05, new_cat='Other',
                 include_cols=None, exclude_cols=None):
        self.method = method
        self.percentile_thresh = percentile_thresh
        self.new_cat = new_cat
        self.include_cols = include_cols
        self.exclude_cols = exclude_cols

    def fit(self, X, y=None):
        """Fits grouping with X by using given method.

        For the 'freq' method, the categories of every selected column are
        walked in descending order of relative frequency until their
        cumulative share reaches ``1 - percentile_thresh``; the remaining
        categories are recorded as sparse. When at most one category remains,
        an empty list is recorded for that column.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        Raises
        ------
        AssertionError
            If X is not a pandas DataFrame, if X is empty, or if
            include_cols / exclude_cols are given but are not lists.

        """
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()!'
        assert len(X) > 0, 'Input data can not be empty!'

        self.cat_cols_ = self._cat_cols_selection(
            X, self.include_cols, self.exclude_cols
        )
        self.imp_cats_ = {}

        if self.method == 'freq':
            target_share = 1 - self.percentile_thresh
            for col in self.cat_cols_:
                # value_counts sorts by frequency in descending order.
                sorted_series = X[col].value_counts(normalize=True)
                n_cats = len(sorted_series)

                tracker, i = 0, 0
                # The ``i < n_cats`` bound prevents an IndexError when the
                # target share is never reached: a column without any
                # non-null value, percentile_thresh <= 0, or floating-point
                # shares that sum to marginally less than the target.
                while tracker < target_share and i < n_cats:
                    tracker += sorted_series.iloc[i]
                    i += 1

                sparse_cats = sorted_series.index[i:].tolist()
                # A single leftover category is not recorded for grouping.
                if len(sparse_cats) > 1:
                    self.imp_cats_[col] = sparse_cats
                else:
                    self.imp_cats_[col] = []

        return self

    @staticmethod
    def _cat_cols_selection(X, include, exclude):
        """Returns categorical columns including the user's corrections.

        Starts from the columns of 'category' dtype, adds the columns listed
        in ``include`` that are present in X (keeping the column order of X),
        and finally drops the columns listed in ``exclude``.
        """
        cat_cols = X.select_dtypes('category').columns.tolist()

        if include is not None:
            assert isinstance(include, list), \
                'Columns to include must be given as an instance of a list!'
            cat_cols = [
                col for col in X.columns
                if col in cat_cols or col in include
            ]

        if exclude is not None:
            assert isinstance(exclude, list), \
                'Columns to exclude must be given as an instance of a list!'
            cat_cols = [col for col in cat_cols if col not in exclude]

        return cat_cols
class ColumnProjector(BaseEstimator, TransformerMixin):
    """Projects variable types onto basic dtypes.

    Unless specified otherwise, numeric features are assigned to float,
    boolean features to bool and all remaining ones to the 'category' dtype.

    Parameters
    ----------
    manual_projection: dictionary, optional (default=None)
        Dictionary where keys are dtype names onto which specified columns
        will be projected and values are lists containing names of variables to
        be projected onto given dtype. Example usage:

        >>> manual_projection = {
        ...     float: ['foo', 'bar'],
        ...     'category': ['baz'],
        ...     int: ['qux'],
        ...     bool: ['quux']
        ... }

    num_to_float: boolean, optional (default=True)
        Specifies whether numerical variables should be projected onto float
        (if True) or onto int (if False).

    Attributes
    ----------
    automatic_projection_: dict
        Dictionary where key is the dtype name onto which specified columns
        will be projected chosen automatically (when manual_projection is
        specified then this manual assignment is decisive).

    """

    def __init__(self, manual_projection=None, num_to_float=True):
        self.manual_projection = manual_projection
        self.num_to_float = num_to_float

    def fit(self, X, y=None):
        """Fits corresponding dtypes to X.

        Every column of X is assigned to exactly one target dtype: the
        numeric target (float or int, depending on ``num_to_float``) when
        ``is_numeric`` accepts it, bool when its values form a subset of
        {0, 1}, and 'category' otherwise.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        numeric_target = float if self.num_to_float else int
        self.automatic_projection_ = {
            'category': [],
            bool: [],
            numeric_target: [],
        }

        for name in X.columns:
            values = X[name]
            if self.num_to_float and is_numeric(values):
                target = float
            elif is_numeric(values):
                target = int
            elif set(values) <= {0, 1}:
                target = bool
            else:
                target = 'category'
            self.automatic_projection_[target].append(name)

        return self

    @staticmethod
    def _project(X, projection_dict, skip_columns=None):
        """Projects X in accordance with the guidelines provided.

        Works on a copy of X. For every (dtype, column names) pair of
        ``projection_dict`` the listed columns, minus ``skip_columns``, are
        cast with ``astype``. Returns the new frame together with the list of
        columns that were cast. A KeyError naming the missing columns is
        raised when the column lookup fails.
        """
        X_new = X.copy()
        columns_projected = []
        omitted = [] if skip_columns is None else skip_columns

        # Nothing to project: hand back the untouched copy.
        if projection_dict is None:
            return X_new, columns_projected

        assert isinstance(projection_dict, dict), \
            'projection_dict must be an instance of the dictionary!'

        for col_type, col_names in projection_dict.items():
            assert isinstance(col_names, list), (
                'Values of projection_dict must be an instance '
                'of the list!'
            )
            cols_to_project = [
                name for name in col_names if name not in omitted
            ]
            if not cols_to_project:
                continue

            try:
                casted = X_new[cols_to_project].astype(col_type)
                X_new[cols_to_project] = casted
            except KeyError:
                cols_error = list(
                    set(cols_to_project) - set(X_new.columns)
                )
                raise KeyError("C'mon, those columns ain't in "
                               "the DataFrame: %s" % cols_error)
            columns_projected.extend(cols_to_project)

        return X_new, columns_projected
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Limits the X to selected columns.

    Parameters
    ----------
    columns: list
        List of column names selected to be left.

    References
    ----------
    [1] J. Ramey, `Building Scikit-Learn Pipelines With Pandas DataFrame
    <https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/>`_,
    April 16, 2018

    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Fits columns selection to X.

        Nothing is learned from the data; the estimator is returned as is.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        return self
class TypeSelector(BaseEstimator, TransformerMixin):
    """Limits the X to selected types.

    Parameters
    ----------
    col_type: string or list-like
        Names of types to be selected.

    References
    ----------
    [1] J. Ramey, `Building Scikit-Learn Pipelines With Pandas DataFrame
    <https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/>`_,
    April 16, 2018

    """

    def __init__(self, col_type):
        self.col_type = col_type

    def fit(self, X, y=None):
        """Fits types selection to X.

        Nothing is learned from the data; the estimator is returned as is.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: ignore

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        return self