"""Utilities for input validation."""
import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_numeric_dtype
# Public names of this module: the ones exported by a star-import.
__all__ = [
"check_uniq",
"check_column_existence",
"check_is_dataframe",
"is_numeric",
"find_sparsity",
"check_continuity"
]
def check_uniq(X):
    """Checks whether all input data values are unique.

    Parameters
    ----------
    X: array-like, shape = (n_samples, )
        Vector to check whether it contains unique values. Elements are
        stored in a ``set`` while scanning, so they must be hashable.

    Returns
    -------
    boolean: Whether all input data values are unique.
    """
    seen = set()
    for value in X:
        # Stop at the first repeated value instead of scanning the rest.
        if value in seen:
            return False
        seen.add(value)
    return True
def check_column_existence(X, columns):
    """Checks whether all listed columns are in a given DataFrame.

    Parameters
    ----------
    X: pandas.DataFrame
        Data with columns to be checked for occurrence.

    columns: single label or list-like
        Columns' labels to check. Anything that is not list-like (a string,
        an integer, ...) is treated as a single label.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If one of the elements of `columns` is not found in the `X` columns.
        The message lists the missing labels in the order they were given.
    """
    # Wrap a scalar label so that the loop below handles both input forms.
    if not pd.api.types.is_list_like(columns):
        columns = [columns]

    # Collect missing labels once, keeping the caller's order and dropping
    # repeats, so that the error message is deterministic.
    missing = []
    for col in columns:
        if col not in X.columns and col not in missing:
            missing.append(col)

    if missing:
        raise ValueError(
            "Columns not found in the DataFrame: {}"
            # Labels are not necessarily strings; str.join requires strings.
            .format(", ".join(map(str, missing)))
        )
def check_is_dataframe(X):
    """Checks whether object is a pandas.DataFrame.

    Parameters
    ----------
    X: object
        Object suspected of being a pandas.DataFrame.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If object is not a pandas.DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("Input must be an instance of pandas.DataFrame.")
def is_numeric(X, project=True):
    """Checks whether given vector contains numeric-only values excluding
    boolean vectors.

    Parameters
    ----------
    X: array-like, shape = (n_samples, )
        Vector where n_samples is the number of samples.

    project: bool, optional (default=True)
        If True tries to project on a numeric type unless categorical dtype is
        passed.

    Returns
    -------
    bool
        False when the values cannot be projected on a float array, when the
        dtype is not numeric, or when every value is 0 or 1 (binary/boolean
        vectors, including the empty vector). True otherwise.
    """
    # Categorical data is never projected: its values may look like numbers
    # but they represent categories.
    is_categorical = isinstance(
        getattr(X, "dtype", None), pd.CategoricalDtype
    )
    if project and not is_categorical:
        try:
            # Cast to a concrete dtype; casting to the abstract ``np.number``
            # is deprecated in NumPy.
            X = np.array(X).astype(float)
        except (ValueError, TypeError):
            # ValueError: non-numeric strings; TypeError: arbitrary objects.
            return False
    return is_numeric_dtype(X) and not set(X) <= {0, 1}
def find_sparsity(X, thresh=.01):
    """Finds columns with highly sparse categories.

    For categorical and binary features finds columns where categories with
    relative frequencies under the threshold are present.

    For numerical features (excluding binary variables) returns columns
    where NaNs or 0 are dominating in the given dataset, i.e. where the most
    frequent value is missing or equal to 0.

    Parameters
    ----------
    X: pandas.DataFrame
        Data to be checked for sparsity.

    thresh: float, optional (default=.01)
        Fraction of one of the categories under which the sparseness will be
        reported.

    Returns
    -------
    sparse_{num, bin, cat}: list
        List of {numerical, binary, categorical} X column names where high
        sparsity was detected. The three lists are returned as a tuple in
        that order.

    Raises
    ------
    AssertionError
        If `X` is not a pandas.DataFrame or has no rows.
    """
    # Raised explicitly (rather than via ``assert``) so that the checks are
    # not stripped when Python runs with -O; the exception type is unchanged.
    if not isinstance(X, pd.DataFrame):
        raise AssertionError(
            'Input must be an instance of pandas.DataFrame()'
        )
    if len(X) == 0:
        raise AssertionError('Input data can not be empty!')

    sparse_num, sparse_bin, sparse_cat = [], [], []

    for col in X.columns:
        column = X[col]
        # Relative frequencies sorted from most to least frequent; missing
        # values are counted as a value of their own.
        frequencies = column.value_counts(normalize=True, dropna=False)

        if is_numeric(column):
            most_freq = frequencies.index[0]
            # pd.isna also recognises None and pd.NA, not only float NaN.
            if pd.isna(most_freq) or most_freq == 0:
                sparse_num.append(col)
            continue

        # Binary or categorical column: look at the rarest category.
        min_frac = frequencies.iloc[-1]
        if min_frac < thresh:
            if set(column) <= {0, 1}:
                sparse_bin.append(col)
            else:
                sparse_cat.append(col)

    return sparse_num, sparse_bin, sparse_cat
def check_continuity(X, thresh=.5):
    """Checks whether input variable is continuous.

    A variable is reported as continuous when it is numeric (see
    `is_numeric`) and the share of distinct values among all values is at
    least ``1 - thresh``.

    Parameters
    ----------
    X: array-like, shape = (n_samples, )
        Vector to check for continuity.

    thresh: float, optional (default=.5)
        Maximum fraction of non-unique values, computed as
        ``1 - n_unique / n_samples``, that is still accepted for a
        continuous variable. Above it lack of continuity is reported.

    Returns
    -------
    boolean: Whether variable is continuous.
    """
    # Non-numeric input (including an empty vector) is never continuous;
    # returning here also keeps the division below away from len(X) == 0.
    if not is_numeric(X):
        return False
    unique_fraction = len(np.unique(X)) / len(X)
    return unique_fraction >= 1 - thresh