# Source code for paralytics.feature_union
import numpy as np
import pandas as pd
from joblib import delayed, Parallel
from scipy import sparse
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one
# Public names exported by a star-import of this module.
__all__ = ['PandasFeatureUnion']
class PandasFeatureUnion(FeatureUnion):
    """Feature union that joins dense transformer outputs with pandas.

    Runs every transformer of a scikit-learn ``FeatureUnion`` and, when
    none of the outputs is a scipy sparse matrix, places the outputs side
    by side with ``pandas.concat``. If at least one output is sparse the
    outputs are stacked into a CSR matrix instead, and if no transformer
    yields an output a NumPy array with zero columns is returned.

    References
    ----------
    [1] marrrcin, `pandas-feature-union
    <https://github.com/marrrcin/pandas-feature-union>`_, 2018

    """

    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers on X, transform it and join the outputs.

        Parameters
        ----------
        X: DataFrame, shape = (n_samples, n_features)
            Input data passed unchanged to every transformer.

        y: optional (default: None)
            Target values forwarded to every transformer.

        **fit_params: dict
            Additional keyword arguments forwarded to the fitting of
            every transformer.

        Returns
        -------
        X_new: DataFrame, scipy CSR matrix or ndarray
            Column-wise combination of the transformer outputs: the
            result of ``merge_dataframes_by_column`` when no output is
            sparse, a CSR matrix when any output is sparse, and an array
            of shape (n_samples, 0) when there are no outputs at all.

        Notes
        -----
        The transformers have to return pandas.DataFrame objects for the
        combined result to be a pandas.DataFrame.

        """
        self._validate_transformers()

        jobs = (
            delayed(_fit_transform_one)(
                transformer=step,
                X=X,
                y=y,
                weight=step_weight,
                **fit_params
            )
            for _, step, step_weight in self._iter()
        )
        fitted_pairs = Parallel(n_jobs=self.n_jobs)(jobs)

        if not fitted_pairs:
            # Nothing was produced: hand back an array without columns.
            return np.zeros((X.shape[0], 0))

        # Each job returns an (output, fitted transformer) pair.
        outputs, fitted_steps = zip(*fitted_pairs)
        self._update_transformer_list(fitted_steps)

        if any(sparse.issparse(output) for output in outputs):
            return sparse.hstack(outputs).tocsr()
        return self.merge_dataframes_by_column(outputs)

    def merge_dataframes_by_column(self, X):
        """Join the outputs of the individual transformers column-wise.

        Parameters
        ----------
        X: sequence of DataFrame
            Objects produced by the transformers; handed as-is to
            ``pandas.concat``.

        Returns
        -------
        X_new: DataFrame
            Result of concatenating the elements of X along the column
            axis.

        """
        return pd.concat(X, axis="columns", copy=False)

    def transform(self, X):
        """Transform X with every transformer and join the outputs.

        Parameters
        ----------
        X: DataFrame, shape = (n_samples, n_features)
            Input data passed unchanged to every transformer.

        Returns
        -------
        X_new: DataFrame, scipy CSR matrix or ndarray
            Column-wise combination of the transformer outputs: the
            result of ``merge_dataframes_by_column`` when no output is
            sparse, a CSR matrix when any output is sparse, and an array
            of shape (n_samples, 0) when there are no outputs at all.

        Notes
        -----
        The transformers have to return pandas.DataFrame objects for the
        combined result to be a pandas.DataFrame.

        """
        jobs = (
            delayed(_transform_one)(
                transformer=step,
                X=X,
                y=None,
                weight=step_weight
            )
            for _, step, step_weight in self._iter()
        )
        outputs = Parallel(n_jobs=self.n_jobs)(jobs)

        if not outputs:
            # Nothing was produced: hand back an array without columns.
            return np.zeros((X.shape[0], 0))

        if any(sparse.issparse(output) for output in outputs):
            return sparse.hstack(outputs).tocsr()
        return self.merge_dataframes_by_column(outputs)