Source code for paralytics.utils.validation

"""Utilities for input validation."""


import numpy as np
import pandas as pd

from pandas.api.types import is_categorical_dtype, is_numeric_dtype


# Public API of this module: the names exported by a star-import.
__all__ = [
    "check_uniq",
    "check_column_existence",
    "check_is_dataframe",
    "is_numeric",
    "find_sparsity",
    "check_continuity"
]


def check_uniq(X):
    """Checks whether all input data values are unique.

    Parameters
    ----------
    X: array-like, shape = (n_samples, )
        Vector to check whether it contains unique values. Its elements
        must be hashable.

    Returns
    -------
    boolean: Whether all input data values are unique.

    """
    seen = set()
    for value in X:
        # The first repeated value settles the answer; stop scanning.
        if value in seen:
            return False
        seen.add(value)
    return True
def check_column_existence(X, columns):
    """Checks whether all listed columns are in a given DataFrame.

    Parameters
    ----------
    X: pandas.DataFrame
        Data with columns to be checked for occurrence.

    columns: single label or list-like
        Columns' labels to check. A string, or any other object that is
        not list-like (e.g. an integer label), is treated as one label.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If one of the elements of `columns` is not found in the `X`
        columns. The message lists the missing labels in the order they
        were passed, without duplicates.

    """
    # pandas does not consider strings list-like, so this wraps both string
    # and scalar (e.g. integer) labels into a one-element list.
    if not pd.api.types.is_list_like(columns):
        columns = [columns]

    # Single pass keeps the caller's order and also works for one-shot
    # iterables; dict.fromkeys drops duplicates while preserving order.
    missing = list(dict.fromkeys(
        col for col in columns if col not in X.columns
    ))
    if missing:
        # Labels are not necessarily strings, so convert before joining.
        raise ValueError(
            "Columns not found in the DataFrame: {}"
            .format(", ".join(str(col) for col in missing))
        )
def check_is_dataframe(X):
    """Checks whether object is a pandas.DataFrame.

    Parameters
    ----------
    X: object
        Object suspected of being a pandas.DataFrame.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If object is not a pandas.DataFrame.

    """
    if isinstance(X, pd.DataFrame):
        return
    raise TypeError("Input must be an instance of pandas.DataFrame.")
[docs]def is_numeric(X, project=True): """Checks whether given vector contains numeric-only values excluding boolean vectors. Parameters ---------- X: array-like, shape = (n_samples, ) Vector where n_samples is the number of samples. project: bool, optional (default=True) If True tries to project on a numeric type unless categorical dtype is passed. Returns ------- bool """ if project and not is_categorical_dtype(X): try: X = np.array(X).astype(np.number) except ValueError: return False return is_numeric_dtype(X) and not set(X) <= {0, 1}
def find_sparsity(X, thresh=.01):
    """Finds columns with highly sparse categories.

    For categorical and binary features finds columns where categories with
    relative frequencies under the threshold are present. For numerical
    features (excluding binary variables) returns columns where NaNs or 0 are
    dominating in the given dataset.

    Parameters
    ----------
    X: pandas.DataFrame
        Data to be checked for sparsity.

    thresh: float, optional (default=.01)
        Fraction of one of the categories under which the sparseness will be
        reported.

    Returns
    -------
    sparse_{num, bin, cat}: list
        List of {numerical, binary, categorical} X column names where high
        sparsity was detected.

    Raises
    ------
    AssertionError
        If `X` is not a pandas.DataFrame or if it has no rows.

    """
    # Raised explicitly instead of via `assert` so the validation is not
    # stripped under `python -O`; the exception type and messages are kept
    # for backward compatibility.
    if not isinstance(X, pd.DataFrame):
        raise AssertionError(
            'Input must be an instance of pandas.DataFrame()'
        )
    if not len(X) > 0:
        raise AssertionError('Input data can not be empty!')

    sparse_num, sparse_bin, sparse_cat = [], [], []

    for col in X.columns:
        # Relative frequencies sorted in descending order, NaN included.
        tab_counter = X[col].value_counts(normalize=True, dropna=False)
        if is_numeric(X[col]):
            most_freq = tab_counter.index[0]
            # NaN is the only value not equal to itself.
            if most_freq != most_freq or most_freq == 0:
                sparse_num.append(col)
        else:
            # Last entry is the rarest category.
            min_frac = tab_counter.iloc[-1]
            if min_frac < thresh:
                if set(X[col]) <= {0, 1}:
                    sparse_bin.append(col)
                else:
                    sparse_cat.append(col)

    return sparse_num, sparse_bin, sparse_cat
def check_continuity(X, thresh=.5):
    """Checks whether input variable is continuous.

    Parameters
    ----------
    X: array-like, shape = (n_samples, )
        Vector to check for continuity.

    thresh: float, optional (default=.5)
        Largest tolerated fraction of non-unique values; the variable is
        reported as continuous when the share of unique values is at least
        ``1 - thresh``.

    Returns
    -------
    boolean: Whether variable is continuous.

    """
    if not is_numeric(X):
        return False
    unique_ratio = len(np.unique(X)) / len(X)
    return unique_ratio >= 1 - thresh