"""
Perform various dimensionality reduction algorithms on data
"""
import sys

import pandas as pd
from sklearn import decomposition


class DataFrameReducerBase(object):
    """Just like scikit-learn's reducers, but with prettied-up DataFrames."""

    def __init__(self, df, n_components=None, **kwargs):
"""Initialize and fit a dataframe to a decomposition algorithm
Parameters
----------
df : pandas.DataFrame
A (samples, features) dataframe of data to fit to the reduction
algorithm
n_components : int
Number of components to calculate. If None, use as many
components as there are samples
kwargs : keyword arguments
Any other arguments to the reduction algorithm
"""
        if df.shape[1] <= 3:
            raise ValueError(
                "Too few features (n={}) to reduce".format(df.shape[1]))
        # This magically initializes the underlying reducer, e.g. PCA or NMF
        super(DataFrameReducerBase, self).__init__(n_components=n_components,
                                                   **kwargs)
        self.reduced_space = self.fit_transform(df)

    @staticmethod
    def _check_dataframe(X):
        """Check that the input is a pandas DataFrame

        Parameters
        ----------
        X : input
            Input to check whether it is a pandas DataFrame

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            sys.stdout.write("Try again as a pandas DataFrame\n")
            raise ValueError('Input X was not a pandas DataFrame, '
                             'was of type {} instead'.format(str(type(X))))

    @staticmethod
    def relabel_pcs(x):
        """Given an integer index, return a 1-based principal component
        label, e.g. 0 -> "pc_1"."""
        return "pc_" + str(int(x) + 1)

    def fit(self, X):
        """Perform a scikit-learn fit and relabel dimensions with
        informative names

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) DataFrame of data to reduce

        Returns
        -------
        self : DataFrameReducerBase
            An instance fit to the data, now with components_,
            explained_variance_, and explained_variance_ratio_ attributes
        """
self._check_dataframe(X)
self.X = X
super(DataFrameReducerBase, self).fit(X)
self.components_ = pd.DataFrame(self.components_,
columns=self.X.columns).rename_axis(
self.relabel_pcs, 0)
try:
self.explained_variance_ = pd.Series(
self.explained_variance_).rename_axis(self.relabel_pcs, 0)
self.explained_variance_ratio_ = pd.Series(
self.explained_variance_ratio_).rename_axis(self.relabel_pcs,
0)
        except AttributeError:
            # Not all reducers (e.g. NMF, ICA) expose explained variance
            pass
return self


class DataFramePCA(DataFrameReducerBase, decomposition.PCA):
    """Perform Principal Components Analysis on a DataFrame"""
    pass
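

# A minimal usage sketch, not part of the original module: the helper name
# `_example_pca_usage` and the random 20x5 DataFrame are hypothetical, chosen
# only to satisfy the "more than 3 features" check in DataFrameReducerBase.
def _example_pca_usage():
    """Illustrative only: reduce a random DataFrame with DataFramePCA."""
    import numpy as np

    df = pd.DataFrame(np.random.rand(20, 5), columns=list('abcde'),
                      index=['sample_{}'.format(i) for i in range(20)])
    pca = DataFramePCA(df, n_components=2)
    # Constructing the object already populates `reduced_space`; calling
    # fit() explicitly relabels `components_` rows as "pc_1", "pc_2", ...
    pca.fit(df)
    return pca.reduced_space, pca.components_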


class DataFrameNMF(DataFrameReducerBase, decomposition.NMF):
    """Perform Non-Negative Matrix Factorization on a DataFrame"""

    def __init__(self, df, n_components=None, **kwargs):
        kwargs.setdefault('init', 'nndsvd')
        # Call scikit-learn's NMF.__init__ directly: going through
        # DataFrameReducerBase.__init__ would require a dataframe argument
        # and trigger its feature-count check before NMF is set up.
        decomposition.NMF.__init__(self, n_components=n_components, **kwargs)
        self.reduced_space = self.fit_transform(df)

    def fit(self, X):
        """Override scikit-learn's fit() for our purposes

        Duplicated fit code for DataFrameNMF because sklearn's NMF cheats for
        efficiency and calls fit_transform. Method resolution order ("MRO")
        resolves the closest (in this package) _fit_transform first, so a
        plain fit of the form below raises a recursion error::

            def fit(self, X, y=None, **kwargs):
                self._fit_transform(X, **kwargs)
                return self
        """
        self._check_dataframe(X)
        self.X = X
        # Notice this calls NMF's fit_transform directly, not fit, to avoid
        # the recursion described above
        reduced_space = decomposition.NMF.fit_transform(self, X)
        self.components_ = pd.DataFrame(self.components_,
                                        columns=self.X.columns).rename_axis(
            self.relabel_pcs, 0)
        return reduced_space
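

# A minimal usage sketch, not part of the original module: NMF needs
# non-negative input, so the hypothetical `_example_nmf_usage` helper builds
# a random non-negative DataFrame before reducing it.
def _example_nmf_usage():
    """Illustrative only: reduce a non-negative DataFrame with DataFrameNMF."""
    import numpy as np

    df = pd.DataFrame(np.random.rand(20, 6),
                      columns=['feature_{}'.format(i) for i in range(6)])
    nmf = DataFrameNMF(df, n_components=2)
    # `reduced_space` holds per-sample component weights; `components_`
    # holds the non-negative basis vectors over the original features.
    return nmf.reduced_space, nmf.components_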


class DataFrameICA(DataFrameReducerBase, decomposition.FastICA):
    """Perform Independent Component Analysis on a DataFrame"""
    pass


class DataFrameTSNE(DataFrameReducerBase):
    """Perform t-Distributed Stochastic Neighbor Embedding on a DataFrame

    Read more: http://homepage.tudelft.nl/19j49/t-SNE.html
    """