Source code for paralytics.discretization

import numpy as np
import pandas as pd
import pandas.core.algorithms as algos
import scipy.stats as stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier

from .exceptions import UniqueValuesError
from .utils.validation import is_numeric


__all__ = [
    'Discretizer'
]


class Discretizer(BaseEstimator, TransformerMixin):
    """Discretizes variables in a given data set.

    Reduces continuous variables to a finite number of intervals with use
    of declared methods.

    Parameters
    ----------
    method: string: {'sapling', 'spearman'}, optional (default='sapling')
        The discretization method:

        - `sapling`: Submethod based on the DecisionTreeClassifier.

        - `spearman`: Submethod based on the Spearman's rank correlation.
          Divides the values into subsequent quartiles as long as it
          doesn't reach full monotonicity. If this doesn't happen, it
          divides the values with use of quantiles into the declared
          minimum number of buckets. Using this method with the parameter
          `formula` set to 'median' may throw a RuntimeWarning when one of
          the input vectors is constant, because there is no point in
          tracking the mutual change of two vectors when one of them
          doesn't change.

    formula: string: {'mean', 'median'}, optional (default='mean')
        The formula by which representatives are chosen for checking the
        Spearman's rank correlation:

        - `mean`: Takes the mean in every group as a representative value.

        - `median`: Takes the median in every group as a representative
          value.

    max_bins: int, optional (default=20)
        Maximum number of bins that will be created.

    min_bins: int, optional (default=3)
        Minimum number of bins that will be created.

    max_tree_depth: int, optional (default=None)
        Specifies the maximum tree depth.

    min_samples_leaf: float, optional (default=.05)
        Specifies the minimum fraction of the entire population that must
        be included in a leaf.

    random_state: int, optional (default=None)
        Random state for sklearn algorithms.

    Attributes
    ----------
    bins_: dictionary, length = n_features
        Dictionary of upper limits of successive intervals (excluding the
        maximum value), whose length equals the number of features in the
        data passed.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> import paralytics as prl
    >>> # Fix the seed for reproducibility.
    >>> SEED = 42
    >>> np.random.seed(SEED)
    >>> # Create available categories for the non-numeric variable.
    >>> sexes = ['female', 'male', 'child']
    >>> # Generate an example DataFrame.
    >>> X = pd.DataFrame({
    ...     'NormalVariable': np.random.normal(loc=0, scale=10, size=100),
    ...     'UniformVariable': np.random.uniform(low=0, high=100, size=100),
    ...     'IntVariable': np.random.randint(low=0, high=100, size=100),
    ...     'Sex': np.random.choice(sexes, 100, p=[.5, .3, .2])
    ... })
    >>> # Generate the response variable.
    >>> y = np.random.randint(low=0, high=2, size=100)
    >>> # Do the discretization.
    >>> discretizer = prl.Discretizer(max_bins=5, random_state=SEED)
    >>> X_discretized = discretizer.fit_transform(X, y)
    >>> print(X_discretized.head())
      NormalVariable UniformVariable   IntVariable     Sex
    0  (-3.886, inf]   (33.151, inf]   (63.5, inf]   child
    1  (-3.886, inf]  (-inf, 24.071]  (-inf, 28.0]  female
    2  (-3.886, inf]  (-inf, 24.071]  (28.0, 63.5]  female
    3  (-3.886, inf]   (33.151, inf]   (63.5, inf]    male
    4  (-3.886, inf]   (33.151, inf]  (-inf, 28.0]    male

    """

    def __init__(self, method='sapling', formula='mean', max_bins=20,
                 min_bins=3, max_tree_depth=None, min_samples_leaf=.05,
                 random_state=None):
        self.method = method
        self.formula = formula
        self.max_bins = max_bins
        self.min_bins = min_bins
        self.max_tree_depth = max_tree_depth
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
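
    # A minimal construction sketch (parameter values are illustrative):
    #
    #     disc = Discretizer(method='spearman', formula='median',
    #                        max_bins=10, min_bins=3)
    #
    # With method='spearman' the fit starts from max_bins quantile-based
    # buckets and shrinks towards min_bins until the group representatives
    # of X and y correlate fully monotonically.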

    def fit(self, X, y=None):
        """Fit the binning with X by extracting the upper limits of
        right-closed intervals.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            Training data of independent variable values.

        y: array-like, shape = (n_samples, )
            Vector of target variable values corresponding to the X data.

        Returns
        -------
        self: object
            Returns the instance itself.

        """
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pandas.DataFrame()'

        call_method = getattr(self, self.method)
        self.bins_ = {}
        for col in X.columns.values:
            # Check whether the column is non-binary numeric
            # (excluding NaNs).
            if is_numeric(X[col]):
                try:
                    self.bins_[col] = call_method(
                        X[col], np.asarray(y).ravel()
                    ).astype(float)
                except UniqueValuesError as e:
                    e.args += (
                        f'The problem occurred for the column: {col}.',
                    )
                    print(' '.join(e.args))
                    self.bins_[col] = np.unique(X[col]).astype(float)
        return self
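
    # After fitting, bins_ maps every numeric column to its cut-off
    # points; an inspection sketch (assumes X and y as in the class
    # docstring):
    #
    #     disc = Discretizer(max_bins=5).fit(X, y)
    #     disc.bins_['NormalVariable']  # array of floats, min to max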

    def transform(self, X):
        """Apply discretization on X.

        X is projected on the bins previously extracted from a training
        set.

        Parameters
        ----------
        X: pd.DataFrame, shape = (n_samples, n_features)
            New data with n_samples as its number of samples.

        Returns
        -------
        X_new: pd.DataFrame, shape = (n_samples, n_features)
            X data with its values substituted by their respective
            interval labels of string type.

        """
        try:
            getattr(self, 'bins_')
        except AttributeError:
            raise RuntimeError(
                'Could not find the attribute.\n'
                'Fitting is necessary before you do the transformation.'
            )
        assert isinstance(X, pd.DataFrame), \
            'Input must be an instance of pd.DataFrame()'

        X_new = X.copy()
        for col, cutoffs in self.bins_.items():
            # Use only the interior cut-off points; the outermost limits
            # are replaced with -inf and inf by finger.
            cut_points = cutoffs[1:-1]
            try:
                cut_points = cut_points.tolist()
            except AttributeError:
                cut_points = list(cut_points)
            if not cut_points:
                cut_points = cutoffs
            X_new[col] = self.finger(
                X[col], cut_points=np.array(cut_points)
            ).astype(str)
        return X_new
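
    # Transform projects new data onto the fitted bins; a usage sketch
    # (X_test is assumed to share the training frame's columns):
    #
    #     X_test_discretized = disc.transform(X_test)
    #
    # The returned labels are plain strings such as '(-inf, 24.071]', so
    # the output can be fed directly into categorical encoders.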

    def sapling(self, X, y):
        """Creates finitely many intervals for a continuous vector using
        the DecisionTreeClassifier, optimizing splits with respect to the
        Gini impurity criterion.

        Parameters
        ----------
        X: array-like, shape = (n_samples, )
            Vector passed as a one-dimensional array-like object where
            n_samples is the number of samples.

        y: array-like, shape = (n_samples, )
            Vector of values corresponding to X, passed as a
            one-dimensional array-like object where n_samples is the
            number of samples.

        Returns
        -------
        bins: array, shape = (n_bins, )
            Vector of successive cut-off points being the upper limits of
            the corresponding intervals, also containing the minimum
            value.

        """
        y = np.asarray(y)
        X = np.asarray(X)
        y = y[~np.isnan(X)]
        X = X[~np.isnan(X)]

        if len(np.unique(X)) < self.min_bins:
            raise UniqueValuesError(
                'Not enough unique values in the array. '
                'Minimum {} unique values required.'.format(self.min_bins)
            )

        clf = DecisionTreeClassifier(
            max_depth=self.max_tree_depth,
            min_samples_leaf=self.min_samples_leaf,
            max_leaf_nodes=self.max_bins,
            random_state=self.random_state
        )
        min_val, max_val = X.min(), X.max()
        X = X.reshape(-1, 1)
        clf.fit(X, y)

        # Keep only the split thresholds (feature == -2 marks leaf nodes).
        bins = clf.tree_.threshold[clf.tree_.feature != -2]
        bins = np.append(bins, [min_val, max_val])
        bins = np.unique(bins)

        # Drop thresholds falling outside the [min_val, max_val] range.
        cols_out_idx = [
            idx for idx, val in enumerate(bins)
            if not (min_val <= val <= max_val)
        ]
        bins = np.delete(bins, cols_out_idx)
        return bins
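
    # The method can also be called directly on a single column; a sketch
    # (assumes a numeric one-dimensional X_col and a target vector y):
    #
    #     disc = Discretizer(max_bins=5, random_state=42)
    #     bins = disc.sapling(X_col, y)
    #
    # bins then holds the tree's split thresholds together with the
    # minimum and maximum of X_col.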

    def spearman(self, X, y):
        """Creates finitely many intervals for a continuous vector with
        use of the Spearman's rank correlation (supervised).

        Parameters
        ----------
        X: array-like, shape = (n_samples, )
            Vector passed as a one-dimensional array-like object where
            n_samples is the number of samples.

        y: array-like, shape = (n_samples, )
            Vector of values corresponding to X, passed as a
            one-dimensional array-like object where n_samples is the
            number of samples.

        Returns
        -------
        bins: array, shape = (n_bins, )
            Vector of successive cut-off points being the upper limits of
            the corresponding intervals, also containing the minimum
            value.

        """
        y = np.asarray(y)
        X = np.asarray(X)
        y = y[~np.isnan(X)]
        X = X[~np.isnan(X)]

        if len(np.unique(X)) < self.min_bins:
            raise UniqueValuesError(
                'Not enough unique values in the array. '
                'Minimum {} unique values required.'.format(self.min_bins)
            )

        r = 0
        n = self.max_bins + 1
        # Decrease the number of quantile-based buckets until the group
        # representatives are fully monotonic (|r| == 1) or the minimum
        # number of bins is reached.
        while np.abs(r) < 1:
            bins = algos.quantile(np.unique(X), np.linspace(0, 1, n))
            df = pd.DataFrame({
                'X': X,
                'y': y,
                'Bucket': pd.cut(X, bins=bins, include_lowest=True)
            })
            df_gr = df.groupby(by='Bucket', as_index=True)
            if not (df_gr.agg('count').X == 0).any():
                r, _ = stats.spearmanr(
                    getattr(df_gr, self.formula)().X,
                    getattr(df_gr, self.formula)().y
                )
            if n == self.min_bins + 1:
                break
            n -= 1
        return bins
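
    # A direct-call sketch mirroring sapling (X_col and y as above); note
    # the class docstring's caveat about constant vectors when
    # formula='median':
    #
    #     bins = Discretizer(formula='median').spearman(X_col, y)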

    @staticmethod
    def finger(X, y=None, cut_points=None, n_quantiles=4, labels=None,
               min_val=None, max_val=None):
        """Manually bins a continuous variable into the declared
        intervals.

        If the cut-off points are not declared, the split is made using
        quantiles.

        Parameters
        ----------
        X: array-like, shape = (n_samples, )
            Vector passed as a one-dimensional array-like object where
            n_samples is the number of samples.

        y: Ignored

        cut_points: array-like, optional (default=None)
            Increasing monotonic sequence generating right-closed
            intervals. Values not allocated to any of the categories will
            be assigned to the empty set. For example, cut_points=[1, 5, 9]
            will generate the intervals:
            [X.min(), 1], (1, 5], (5, 9], (9, X.max()].
            If you want to specify lower and upper limits, set the
            parameters "min_val" and "max_val" to specific values.

        n_quantiles: int, optional (default=4)
            When cut_points are not declared, sets the number of quantiles
            into which the variable will be split. For example, setting
            n_quantiles = 4 will return the quartiles of the X values
            between min_val and max_val.

        labels: string: {'auto'} or list, optional (default=None)
            Specifies the returned bucket names; needs to be the same
            length as the number of created buckets:

            - `auto`: Assigns default values to group names by numbering
              them.

        min_val: float, optional (default=None)
            Determines the lower limit value. If not specified, takes
            -np.inf.

        max_val: float, optional (default=None)
            Determines the upper limit value. If not specified, takes
            np.inf.

        Returns
        -------
        X_new: array, shape = (n_samples, )
            Input data with its original values substituted with their
            respective labels.

        """
        X = np.asarray(X)
        x = X[~np.isnan(X)]

        if min_val is None:
            min_val = -np.inf
        if max_val is None:
            max_val = np.inf

        # Default break_points in case of no declaration of cut_points.
        if cut_points is None:
            x = x[(x >= min_val) & (x <= max_val)]
            break_points = algos.quantile(
                np.unique(x), np.linspace(0, 1, n_quantiles + 1)
            )
        else:
            break_points = np.insert(
                cut_points.astype(float),
                [0, len(cut_points)],
                [min_val, max_val]
            )
        break_points = np.unique(break_points)

        if labels == 'auto':
            labels = range(len(break_points) - 1)

        X_new = pd.cut(
            X, bins=break_points, labels=labels, include_lowest=True
        )
        return X_new
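

# A minimal, self-contained sketch of the finger static method with
# explicit cut-off points; the data below is made up for illustration.
if __name__ == '__main__':
    values = np.array([0.5, 2.0, 4.5, 7.0, 9.5, 12.0])
    # cut_points=[1, 5, 9] yields the right-closed intervals
    # (-inf, 1], (1, 5], (5, 9], (9, inf].
    binned = Discretizer.finger(values, cut_points=np.array([1, 5, 9]))
    print(binned)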