Source code for flotilla.compute.decomposition

"""
Perform various dimensionality reduction algorithms on data
"""

import sys

from sklearn import decomposition
import pandas as pd


class DataFrameReducerBase(object):
    """Just like scikit-learn's reducers, but with prettied up DataFrames.

    This class is a mixin: it is meant to be listed *before* a reducer class
    (one providing ``__init__``, ``fit`` and ``transform``) in a subclass's
    bases, so that the ``super()`` calls below reach that reducer.

    Attributes
    ----------
    reduced_space : pandas.DataFrame
        The input data transformed into component space, computed when the
        object is constructed.
    X : pandas.DataFrame
        The data most recently passed to ``fit``.
    """

    def __init__(self, df, n_components=None, **kwargs):
        """Initialize and fit a dataframe to a decomposition algorithm

        Parameters
        ----------
        df : pandas.DataFrame
            A (samples, features) dataframe of data to fit to the reduction
            algorithm
        n_components : int
            Number of components to calculate. Passed through unchanged to
            the underlying reducer, so ``None`` means that reducer's default.
        kwargs : keyword arguments
            Any other arguments to the reduction algorithm

        Raises
        ------
        ValueError
            If ``df`` has three or fewer columns (features).
        """
        if df.shape[1] <= 3:
            raise ValueError(
                "Too few features (n={}) to reduce".format(df.shape[1]))
        # Cooperative call: the next class in the MRO is the actual reducer
        # (e.g. PCA), so this initializes it with the requested settings.
        super(DataFrameReducerBase, self).__init__(n_components=n_components,
                                                   **kwargs)
        self.reduced_space = self.fit_transform(df)

    @staticmethod
    def _check_dataframe(X):
        """Check that the input is a pandas dataframe

        Parameters
        ----------
        X : input
            Input to check if this is a pandas dataframe.

        Raises
        ------
        ValueError
            If the input is not a pandas Dataframe
        """
        # An explicit test rather than ``assert``: assertions are removed
        # when Python runs with -O, which would silently disable the check.
        if not isinstance(X, pd.DataFrame):
            sys.stdout.write("Try again as a pandas DataFrame")
            raise ValueError('Input X was not a pandas DataFrame, '
                             'was of type {} instead'.format(str(type(X))))

    @staticmethod
    def relabel_pcs(x):
        """Turn a 0-based component position into a 1-based label

        Parameters
        ----------
        x : int or int-like
            Zero-based position of a component.

        Returns
        -------
        label : str
            ``"pc_<x + 1>"``, e.g. ``0`` becomes ``"pc_1"``.
        """
        return "pc_" + str(int(x) + 1)

    def fit(self, X):
        """Perform a scikit-learn fit and relabel dimensions to be
        informative names

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) Dataframe of data to reduce

        Returns
        -------
        self : DataFrameReducerBase
            The fitted instance. ``components_`` is a DataFrame whose rows
            are labelled ``pc_1, pc_2, ...`` and whose columns are the
            columns of ``X``. ``explained_variance_`` and
            ``explained_variance_ratio_`` are relabelled Series when the
            underlying reducer provides them.

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame
        """
        self._check_dataframe(X)
        self.X = X
        super(DataFrameReducerBase, self).fit(X)

        # ``rename`` (not ``rename_axis``) is the pandas call that maps a
        # function over the labels themselves.
        self.components_ = pd.DataFrame(
            self.components_,
            columns=self.X.columns).rename(index=self.relabel_pcs)

        # Not every reducer exposes explained variance; relabel only the
        # attributes that are actually present.
        for attribute in ('explained_variance_',
                          'explained_variance_ratio_'):
            values = getattr(self, attribute, None)
            if values is not None:
                setattr(self, attribute,
                        pd.Series(values).rename(index=self.relabel_pcs))
        return self

    def transform(self, X):
        """Transform a matrix into the component space

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) sized DataFrame to transform into the
            current component space

        Returns
        -------
        component_space : pandas.DataFrame
            A (n_samples, self.n_components) sized DataFrame transformed into
            component space, indexed like ``X`` with columns labelled
            ``pc_1, pc_2, ...``

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame
        """
        # Validate before handing the data to the underlying reducer.
        self._check_dataframe(X)
        component_space = super(DataFrameReducerBase, self).transform(X)
        return pd.DataFrame(
            component_space,
            index=X.index).rename(columns=self.relabel_pcs)

    def fit_transform(self, X):
        """Perform both a fit and a transform on the input data

        Fit the data to the reduction algorithm, and transform the data to
        the reduced space.

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) dataframe to both fit and transform

        Returns
        -------
        component_space : pandas.DataFrame
            ``X`` transformed into component space, as returned by
            ``transform``

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame, will not perform the fit
            and transform
        """
        self._check_dataframe(X)
        self.fit(X)
        return self.transform(X)
class DataFramePCA(DataFrameReducerBase, decomposition.PCA):
    """Principal Components Analysis on a DataFrame

    Combines the DataFrame handling of ``DataFrameReducerBase`` with
    ``decomposition.PCA``; nothing is overridden here.
    """
class DataFrameNMF(DataFrameReducerBase, decomposition.NMF):
    """Perform Non-Negative Matrix Factorization on a DataFrame

    Unlike the other reducers, ``__init__`` and ``fit`` are overridden here
    and call ``decomposition.NMF`` directly instead of going through
    ``DataFrameReducerBase``.
    """

    def __init__(self, df, n_components=None, **kwargs):
        """Initialize the factorization and fit it to a dataframe

        Parameters
        ----------
        df : pandas.DataFrame
            A (samples, features) dataframe of data to factorize
        n_components : int
            Number of components to calculate, passed through to
            ``decomposition.NMF``
        kwargs : keyword arguments
            Any other arguments to ``decomposition.NMF``. ``init`` defaults
            to ``'nndsvd'`` unless given explicitly.
        """
        kwargs.setdefault('init', 'nndsvd')
        # Call NMF's initializer by name. ``super(decomposition.NMF, self)``
        # starts the lookup *after* NMF in the MRO, so it only finds an
        # initializer when NMF merely inherits one from a parent class.
        decomposition.NMF.__init__(self, n_components=n_components, **kwargs)
        self.reduced_space = self.fit_transform(df)

    def fit(self, X):
        """Override scikit-learn's fit() for our purposes

        Duplicated fit code for DataFrameNMF because sklearn's NMF cheats for
        efficiency and calls fit_transform. Method resolution order ("MRO")
        resolves the closest (in this package) fit_transform first, which in
        turn calls fit again, and so there's a recursion error. To avoid it,
        NMF's own fit_transform is invoked explicitly here.

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) Dataframe of data to reduce

        Returns
        -------
        reduced_space
            Whatever ``decomposition.NMF.fit_transform`` returns for ``X``.
            Note that this is *not* ``self``, unlike
            ``DataFrameReducerBase.fit``; kept as-is for backward
            compatibility.

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame
        """
        self._check_dataframe(X)
        self.X = X

        # notice this is fit_transform, not fit
        reduced_space = decomposition.NMF.fit_transform(self, X)

        # ``rename`` (not ``rename_axis``) is the pandas call that maps a
        # function over the labels themselves.
        self.components_ = pd.DataFrame(
            self.components_,
            columns=self.X.columns).rename(index=self.relabel_pcs)
        return reduced_space
class DataFrameICA(DataFrameReducerBase, decomposition.FastICA):
    """Independent Component Analysis on a DataFrame

    Combines the DataFrame handling of ``DataFrameReducerBase`` with
    ``decomposition.FastICA``; nothing is overridden here.
    """
class DataFrameTSNE(DataFrameReducerBase):
    """Perform t-Distributed Stochastic Neighbor Embedding on a DataFrame

    Read more: http://homepage.tudelft.nl/19j49/t-SNE.html

    This class has no scikit-learn reducer among its bases, so it supplies
    its own ``__init__`` and ``fit_transform`` and does not rely on the
    ``super()`` calls in ``DataFrameReducerBase``.
    """

    def __init__(self, df, n_components=None, **kwargs):
        """Embed a dataframe with t-SNE

        Parameters
        ----------
        df : pandas.DataFrame
            A (samples, features) dataframe of data to embed
        n_components : int
            Accepted so the signature matches the other reducers; stored on
            the instance but not forwarded to ``bh_sne``.
        kwargs : keyword arguments
            Any other arguments, forwarded as-is to ``tsne.bh_sne``

        Raises
        ------
        ValueError
            If ``df`` has three or fewer columns (features).
        """
        # Same guard as DataFrameReducerBase.__init__. That initializer is
        # not reused because it forwards its arguments to the next class in
        # the MRO, which here is ``object`` and accepts none.
        if df.shape[1] <= 3:
            raise ValueError(
                "Too few features (n={}) to reduce".format(df.shape[1]))
        self.n_components = n_components
        self.tsne_kwargs = kwargs
        self.reduced_space = self.fit_transform(df)

    def fit_transform(self, X):
        """Perform both a fit and a transform on the input data

        Runs ``tsne.bh_sne`` on the data and wraps the result in a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) dataframe to both fit and transform

        Returns
        -------
        embedding : pandas.DataFrame
            The output of ``bh_sne``, indexed like ``X``

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame, will not perform the fit
            and transform
        """
        # Imported lazily so the module can be used without ``tsne``
        # installed as long as this class is not exercised.
        from tsne import bh_sne

        self._check_dataframe(X)
        # Fall back to no extra options if __init__ was bypassed.
        options = getattr(self, 'tsne_kwargs', {})
        return pd.DataFrame(bh_sne(X, **options), index=X.index)
Olga B. Botvinnik is funded by the NDSEG fellowship and is a NumFOCUS John Hunter Technology Fellow.
Michael T. Lovci was partially funded by a fellowship from Genentech.
Partially funded by NIH grants NS075449 and HG004659 and CIRM grants RB4-06045 and TR3-05676 to Gene Yeo.