# Source code for flotilla.visualize.decomposition

from __future__ import division

from collections import defaultdict
from itertools import cycle
import math

from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from .color import deep
from .generic import violinplot


class DecompositionViz(object):
    """Plots the reduced space from a decomposed dataset.

    Does not perform any reductions of its own.
    """

    def __init__(self, reduced_space, components_, explained_variance_ratio_,
                 feature_renamer=None, groupby=None,
                 singles=None, pooled=None, outliers=None,
                 featurewise=False,
                 order=None, violinplot_kws=None,
                 data_type='expression', label_to_color=None,
                 label_to_marker=None,
                 scale_by_variance=True, x_pc='pc_1',
                 y_pc='pc_2', n_vectors=20, distance='L2',
                 n_top_pc_features=50, max_char_width=30):
        """Plot the results of a decomposition visualization

        Parameters
        ----------
        reduced_space : pandas.DataFrame
            A (n_samples, n_dimensions) DataFrame of the post-dimensionality
            reduction data
        components_ : pandas.DataFrame
            A DataFrame of how much each feature contributes to the
            components, indexed by component name (e.g. "pc_1"), with one
            column per feature. (trailing underscore to be consistent with
            scikit-learn)
        explained_variance_ratio_ : pandas.Series
            A (n_dimensions,) Series of how much variance each component
            explains. (trailing underscore to be consistent with
            scikit-learn)
        feature_renamer : function, optional
            A function which takes the name of the feature and renames it,
            e.g. from an ENSEMBL ID to a HUGO known gene symbol. If not
            provided, the original name is used.
        groupby : mapping function | dict, optional
            A mapping of the samples to a label, e.g. sample IDs to
            phenotype, for the violinplots. If None, all samples are treated
            the same and are colored the same.
        singles : pandas.DataFrame, optional
            For violinplots only. If provided and 'plot_violins' is True,
            will plot the raw (not reduced) measurement values as violin
            plots.
        pooled : pandas.DataFrame, optional
            For violinplots only. If provided, pooled samples are plotted as
            black dots within their label.
        outliers : pandas.DataFrame, optional
            For violinplots only. If provided, outlier samples are plotted as
            a grey shadow within their label.
        featurewise : bool, optional
            If True, then the "samples" are features, e.g. genes instead of
            samples, and the "features" are the samples, e.g. the cells
            instead of the gene ids. Essentially, the transpose of the
            original matrix. If True, then violins aren't plotted.
            (default False)
        order : list-like, optional
            The order of the labels for the violinplots, e.g. if the data is
            from a differentiation timecourse, then this would be the labels
            of the phenotypes, in the differentiation order.
        violinplot_kws : dict, optional
            Any additional parameters to violinplot
        data_type : 'expression' | 'splicing', optional
            For violinplots only. The kind of data that was originally used
            for the reduction. (default 'expression')
        label_to_color : dict, optional
            A mapping of the label, e.g. the phenotype, to the desired
            plotting color (default None, auto-assigned with the groupby)
        label_to_marker : dict, optional
            A mapping of the label, e.g. the phenotype, to the desired
            plotting symbol (default None, auto-assigned with the groupby)
        scale_by_variance : bool, optional
            If True, scale the x- and y-axes by their
            explained_variance_ratio_ (default True)
        {x,y}_pc : str, optional
            Principal component to plot on the x- and y-axis. (default
            "pc_1" and "pc_2")
        n_vectors : int, optional
            Number of vectors to plot of the principal components.
            (default 20)
        distance : 'L1' | 'L2', optional
            The distance metric to use to plot the vector lengths. L1 is
            "Cityblock", i.e. the sum of the x and y coordinates, and L2 is
            the traditional Euclidean distance. (default "L2")
        n_top_pc_features : int, optional
            The number of top features from the principal components to
            plot. (default 50)
        max_char_width : int, optional
            Maximum character width of a feature name. Useful for crazy long
            feature IDs like MISO IDs
        """
        self.reduced_space = reduced_space
        self.components_ = components_
        self.explained_variance_ratio_ = explained_variance_ratio_

        self.singles = singles
        self.pooled = pooled
        self.outliers = outliers

        self.groupby = groupby
        self.order = order
        self.violinplot_kws = violinplot_kws if violinplot_kws is not None \
            else {}
        self.data_type = data_type
        self.label_to_color = label_to_color
        self.label_to_marker = label_to_marker
        self.n_vectors = n_vectors
        self.x_pc = x_pc
        self.y_pc = y_pc
        self.pcs = (self.x_pc, self.y_pc)
        self.distance = distance
        self.n_top_pc_features = n_top_pc_features
        self.featurewise = featurewise

        # Identity renamer when the caller does not supply one, so callers
        # can always call self.feature_renamer(...) unconditionally
        def renamer(x):
            return x
        self.feature_renamer = renamer if feature_renamer is None \
            else feature_renamer

        self.max_char_width = max_char_width

        if self.groupby is None:
            # Treat every sample identically under a single "all" label
            self.groupby = dict.fromkeys(self.reduced_space.index, 'all')
        # axis=0 is the default (and the explicit keyword is deprecated in
        # modern pandas), so group rows by the sample -> label mapping
        self.grouped = self.reduced_space.groupby(self.groupby)

        if self.label_to_color is None:
            self.label_to_color = dict(zip(
                self.grouped.groups.keys(),
                sns.color_palette('husl', n_colors=len(self.grouped))))
        if self.label_to_marker is None:
            markers = cycle(['o', '^', 's', 'v', '*', 'D', 'h'])

            def marker_factory():
                # next(markers), not markers.next(): the latter is
                # Python-2-only and raises AttributeError on Python 3
                return next(markers)
            self.label_to_marker = defaultdict(marker_factory)

        if order is not None:
            self.color_ordered = [self.label_to_color[x] for x in self.order]
        else:
            self.color_ordered = [self.label_to_color[x]
                                  for x in self.grouped.groups]

        # .ix was removed from pandas; .loc is the label-based equivalent
        self.loadings = self.components_.loc[[self.x_pc, self.y_pc]]

        # Get the explained variance
        if self.explained_variance_ratio_ is not None:
            self.vars = self.explained_variance_ratio_[[self.x_pc, self.y_pc]]
        else:
            self.vars = pd.Series([1., 1.], index=[self.x_pc, self.y_pc])

        if scale_by_variance:
            self.loadings = self.loadings.multiply(self.vars, axis=0)

        # sort features by magnitude/contribution to transformation.
        # Scale the loading vectors so the longest one reaches a quarter of
        # the farthest sample's distance from the origin.
        reduced_space = self.reduced_space[[self.x_pc, self.y_pc]]
        farthest_sample = reduced_space.apply(np.linalg.norm, axis=0).max()
        whole_space = self.loadings.apply(np.linalg.norm).max()
        scale = .25 * farthest_sample / whole_space
        self.loadings *= scale

        # renamed from "ord" to avoid shadowing the builtin
        norm_ord = 2 if self.distance == 'L2' else 1
        self.magnitudes = self.loadings.apply(np.linalg.norm, ord=norm_ord)
        # Series.sort(ascending=...) was removed from pandas; sort_values
        # returns a new, sorted Series instead of mutating in place
        self.magnitudes = self.magnitudes.sort_values(ascending=False)

        self.top_features = set([])
        self.pc_loadings_labels = {}
        self.pc_loadings = {}
        for pc in self.pcs:
            # sort_values replaces the removed in-place Series.sort; no
            # .copy() needed since a new Series is returned
            x = self.components_.loc[pc].sort_values(ascending=True)
            half_features = int(self.n_top_pc_features / 2)
            if len(x) > self.n_top_pc_features:
                # Take the strongest negative and positive contributors
                a = x[:half_features]
                b = x[-half_features:]
                labels = np.r_[a.index, b.index]
                self.pc_loadings[pc] = np.r_[a, b]
            else:
                labels = x.index
                self.pc_loadings[pc] = x

            self.pc_loadings_labels[pc] = labels
            self.top_features.update(labels)
    def plot(self, ax=None, title='', plot_violins=False,
             show_point_labels=False,
             show_vectors=True,
             show_vector_labels=True,
             markersize=10, legend=True, bokeh=False, metadata=None,
             plot_loadings='heatmap', n_components=5):
        """Plot reduced space

        Figures can be saved with:

            dv.plot()
            dv.fig_reduced.savefig('decomposition.pdf')

        Parameters
        ----------
        ax : matplotlib.axes.Axes object, optional
            An axes object to plot the reduced space and the components onto
        title : str, optional
            Title to add to the plot
        plot_violins : bool, optional
            If True, also make a matplotlib.figure.Figure object of the top
            features
        show_point_labels : bool, optional
            If True, show the labels of the points plotted onto the canvas.
            If this was not featurewise (default), then the points are the
            samples. If this was featurewise, then the points are the
            features
        show_vectors : bool, optional
            If True, plot the vectors with the highest magnitude in PC1 and
            PC2 space
        show_vector_labels : bool, optional
            If True, label the vectors with their features (if not
            featurewise), else their samples
        markersize : int, optional
            Size of the plotting marker of the samples on the plot
        legend : bool, optional
            If True, plot a legend showing which celltype corresponds to
            which color
        bokeh : bool, optional
            If True, attempt to plot this using interactive BokehJS plots
            which allow for hovering tooltips
        metadata : pandas.DataFrame, optional
            If provided, all columns of this metadata will be shown when
            hovering over the samples in the bokeh version of the plot
        plot_loadings : 'heatmap' | 'scatter'
            Whether to plot the loadings of features as a heatmap or a
            scatterplot
        n_components : int, optional
            Number of components to plot on the heatmap

        Returns
        -------
        self : DecompositionViz
        """
        if bokeh:
            self._plot_bokeh(metadata, title)
        else:
            # 10x10 grid: left half for the scatter of samples, right half
            # for the loadings (heatmap or two scatter columns)
            nrows = 10
            ncols = 10
            if ax is None:
                self.fig_reduced, ax = plt.subplots(1, 1, figsize=(20, 10))
                self.gs = GridSpec(nrows, ncols)
            else:
                # Nest our grid inside the caller-supplied axes' slot
                self.gs = GridSpecFromSubplotSpec(nrows, ncols,
                                                  ax.get_subplotspec())
                self.fig_reduced = plt.gcf()

            self.ax_components = plt.subplot(self.gs[:, :5])

            self.plot_samples(show_point_labels=show_point_labels,
                              title=title, show_vectors=show_vectors,
                              show_vector_labels=show_vector_labels,
                              markersize=markersize, legend=legend,
                              ax=self.ax_components)

            if plot_loadings == 'heatmap':
                # Explained-variance strip on top, heatmap below, colorbar
                # in the rightmost column
                self.ax_explained_variance = plt.subplot(
                    self.gs[0:2, 6:ncols-1])
                self.ax_empty = plt.subplot(self.gs[0:2, ncols-1])
                self.ax_pcs_heatmap = plt.subplot(self.gs[2:nrows, 6:ncols-1])
                self.ax_pcs_colorbar = plt.subplot(self.gs[2:nrows, ncols-1])

                # Zero out the one axes in the upper right corner
                self.ax_empty.axis('off')

                self.plot_loadings_heatmap(n_components=n_components)
            else:
                # One scatter column of loadings per plotted component
                self.ax_loading1 = plt.subplot(self.gs[:, 6:8])
                self.ax_loading2 = plt.subplot(self.gs[:, 8:ncols])

                self.plot_loadings(pc=self.x_pc, ax=self.ax_loading1)
                self.plot_loadings(pc=self.y_pc, ax=self.ax_loading2)
                sns.despine(ax=self.ax_loading1)
                sns.despine(ax=self.ax_loading2)

            sns.despine(ax=self.ax_components)
            self.fig_reduced.tight_layout()

            # Violins only make sense for sample-wise reductions with raw
            # single-sample data available
            singles_none = self.singles is None
            if plot_violins and not self.featurewise and not singles_none:
                self.plot_violins()
        return self
def _plot_bokeh(self, metadata=None, title=''): metadata = metadata if metadata is not None else pd.DataFrame( index=self.reduced_space.index) # Clean alias import bokeh.plotting as bk from bokeh.plotting import ColumnDataSource, figure, show from bokeh.models import HoverTool # Plots can be displayed inline in an IPython Notebook bk.output_notebook(force=True) # Create a set of tools to use TOOLS = "pan,wheel_zoom,box_zoom,reset,hover" x = self.reduced_space[self.x_pc] y = self.reduced_space[self.y_pc] radii = np.ones(x.shape) colors = self.reduced_space.index.map( lambda x: self.label_to_color[self.groupby[x]]) sample_ids = self.reduced_space.index data = dict( (col, metadata[col][self.reduced_space.index]) for col in metadata) tooltips = [('sample_id', '@sample_ids')] tooltips.extend([(k, '@{}'.format(k)) for k in data.keys()]) data.update(dict( x=x, y=y, colors=colors, sample_ids=sample_ids )) source = ColumnDataSource( data=data ) p = figure(title=title, tools=TOOLS) p.circle(x, y, radius=radii, source=source, fill_color=colors, fill_alpha=0.6, line_color=None) hover = p.select(dict(type=HoverTool)) hover.tooltips = tooltips show(p)
[docs] def shorten(self, x): if len(x) > self.max_char_width: return '{}...'.format(x[:self.max_char_width]) else: return x
    def plot_samples(self, show_point_labels=True,
                     title='PCA', show_vectors=True,
                     show_vector_labels=True, markersize=10,
                     three_d=False, legend=True, ax=None):
        """Plot PCA scatterplot

        Draws one scatter series per group in ``self.grouped``, overlays
        pooled/outlier samples when available, and optionally draws the
        highest-magnitude loading vectors from the origin.

        Parameters
        ----------
        show_point_labels : bool
            Whether or not to label the scatter points with their index
        title : str
            title of the plot
        show_vectors : bool
            Whether or not to draw the vectors indicating the supporting
            principal components
        show_vector_labels : bool
            whether or not to draw the names of the vectors
        markersize : int
            size of the scatter markers on the plot
        three_d : bool
            if you want the plot in 3d (need to set up the axes beforehand)
            NOTE(review): this flag is currently unused in the body
        legend : bool
            If True, draw a legend of group labels
        ax : matplotlib.axes.Axes, optional
            Axes to draw on; defaults to the current axes
        """
        if ax is None:
            ax = plt.gca()

        # Plot the samples
        for name, df in self.grouped:
            color = self.label_to_color[name]
            marker = self.label_to_marker[name]
            x = df[self.x_pc]
            y = df[self.y_pc]
            ax.plot(x, y, color=color, marker=marker, linestyle='None',
                    label=name, markersize=markersize, alpha=0.75,
                    markeredgewidth=.1)
            # self.pooled may be None; the AttributeError from .empty is the
            # deliberate "no pooled data" signal
            try:
                if not self.pooled.empty:
                    pooled_ids = x.index.intersection(self.pooled.index)
                    pooled_x, pooled_y = x[pooled_ids], y[pooled_ids]
                    ax.plot(pooled_x, pooled_y, 'o', color=color,
                            marker=marker, markeredgecolor='k',
                            markeredgewidth=2,
                            label='{} pooled'.format(name),
                            markersize=markersize, alpha=0.75)
            except AttributeError:
                pass
            # Same pattern for outliers: None -> AttributeError -> skip
            try:
                if not self.outliers.empty:
                    outlier_ids = x.index.intersection(self.outliers.index)
                    outlier_x, outlier_y = x[outlier_ids], y[outlier_ids]
                    ax.plot(outlier_x, outlier_y, 'o', color=color,
                            marker=marker, markeredgecolor='lightgrey',
                            markeredgewidth=5,
                            label='{} outlier'.format(name),
                            markersize=markersize, alpha=0.75)
            except AttributeError:
                pass
            if show_point_labels:
                for args in zip(x, y, df.index):
                    ax.text(*args)

        # Plot vectors, if asked
        if show_vectors:
            # magnitudes was sorted descending in __init__, so this takes
            # the n_vectors strongest loadings
            for vector_label in self.magnitudes[:self.n_vectors].index:
                x, y = self.loadings[vector_label]
                ax.plot([0, x], [0, y], color='k', linewidth=1)
                if show_vector_labels:
                    # Offset the label 5 points away from the origin, in the
                    # direction the vector points
                    x_offset = math.copysign(5, x)
                    y_offset = math.copysign(5, y)
                    horizontalalignment = 'left' if x > 0 else 'right'
                    if self.feature_renamer is not None:
                        renamed = self.feature_renamer(vector_label)
                    else:
                        renamed = vector_label
                    ax.annotate(renamed, (x, y),
                                textcoords='offset points',
                                xytext=(x_offset, y_offset),
                                horizontalalignment=horizontalalignment)

        # Label x and y axes
        ax.set_xlabel(
            'Principal Component {} (Explains {:.2f}% Of Variance)'.format(
                str(self.x_pc), 100 * self.vars[self.x_pc]))
        ax.set_ylabel(
            'Principal Component {} (Explains {:.2f}% Of Variance)'.format(
                str(self.y_pc), 100 * self.vars[self.y_pc]))
        ax.set_title(title)

        if legend:
            ax.legend(loc="best")
        sns.despine()
[docs] def plot_loadings(self, pc='pc_1', n_features=50, ax=None): loadings = self.pc_loadings[pc] labels = self.pc_loadings_labels[pc] if ax is None: ax = plt.gca() # import pdb; pdb.set_trace() x = loadings y = np.arange(loadings.shape[0]) ax.scatter(x, y, color=deep[0]) ax.set_ylim(-.5, y.max() + .5) ax.set_yticks(np.arange(max(loadings.shape[0], n_features))) ax.set_title("Component " + pc) x_offset = max(loadings) * .05 ax.set_xlim(left=loadings.min() - x_offset, right=loadings.max() + x_offset) if self.feature_renamer is not None: labels = map(self.feature_renamer, labels) else: labels = labels labels = map(self.shorten, labels) # ax.set_yticklabels(map(shorten, labels)) ax.set_yticklabels(labels) for lab in ax.get_xticklabels(): lab.set_rotation(90) sns.despine(ax=ax)
[docs] def plot_loadings_heatmap(self, n_features=50, n_components=5): """Plot the loadings of each feature in the top principal components Creates a heatmap of the top features contributing to the first few principal components, sorted by the features' contribution to PC1. Parameters ---------- n_features : int, optional Total number of features to plot. Half of these will be the top features contributing to the positive side of PC1, the other will be the top features contributing to the negative side of PC2 n_components : int, optional Number of components to plot """ half = n_features/2 components = self.components_.T components = components.sort(columns='pc_1', ascending=False) components_subset = pd.concat( [components.iloc[:half, :n_components], components.iloc[-half:, :n_components]]) components_subset = components_subset.rename( index=self.feature_renamer) # Plot explained variance ymax = self.explained_variance_ratio_[:n_components] ymin = np.zeros(ymax.shape) x = np.arange(ymax.shape[0]) self.ax_explained_variance.plot(ymax, 'o-', color='#262626') self.ax_explained_variance.fill_between(x, ymin, ymax, color='lightgrey') self.ax_explained_variance.set_ylabel('Explained\nvariance\nratio') # ax_explained_variance.set_xlabel('Principal component') self.ax_explained_variance.set_xticks(x) self.ax_explained_variance.set_xlim(-0.5, x.max() + 0.5) self.ax_explained_variance.set_xticklabels(components_subset.columns) sns.despine(ax=self.ax_explained_variance) sns.heatmap(components_subset, ax=self.ax_pcs_heatmap, cbar_ax=self.ax_pcs_colorbar, cbar_kws=dict(label='Contribution to Principal Component')) for ytl in self.ax_pcs_heatmap.get_yticklabels(): ytl.set(size=11) self.ax_pcs_colorbar.yaxis.set_ticks_position('right')
[docs] def plot_explained_variance(self, title="PCA explained variance"): """If the reducer is a form of PCA, then plot the explained variance ratio by the components. """ # Plot the explained variance ratio assert self.explained_variance_ratio_ is not None import matplotlib.pyplot as plt import seaborn as sns fig, ax = plt.subplots() ax.plot(self.explained_variance_ratio_, 'o-') xticks = np.arange(len(self.explained_variance_ratio_)) ax.set_xticks(xticks) ax.set_xticklabels(xticks + 1) ax.set_xlabel('Principal component') ax.set_ylabel('Fraction explained variance') ax.set_title(title) sns.despine()
    def plot_violins(self):
        """Make violinplots of each feature

        Must be called after plot_samples because it depends on the existence
        of the "self.magnitudes" attribute.
        """
        # Size the grid so each group gets ~0.5in of width per violin and
        # rows are added until every top feature fits
        single_violin_width = 0.5
        ax_width = max(4, single_violin_width*self.grouped.size().shape[0])
        ncols = int(np.ceil(16/ax_width))
        nrows = 1
        # Union of the strongest vectors and the per-PC top features
        vector_labels = list(set(self.magnitudes[:self.n_vectors].index.union(
            pd.Index(self.top_features))))
        while ncols * nrows < len(vector_labels):
            nrows += 1

        self.fig_violins, axes = plt.subplots(nrows=nrows, ncols=ncols,
                                              figsize=(ax_width * ncols,
                                                       4 * nrows))
        if self.feature_renamer is not None:
            # NOTE(review): on Python 3, map() returns a one-shot iterator;
            # it is consumed exactly once by zip() below, so this still works
            renamed_vectors = map(self.feature_renamer, vector_labels)
        else:
            renamed_vectors = vector_labels
        # Sort by the renamed (display) name, keeping the original id paired
        labels = [(y, x) for (y, x) in sorted(zip(renamed_vectors,
                                                  vector_labels))]

        for (renamed, feature_id), ax in zip(labels, axes.flat):
            singles = self.singles[feature_id] if self.singles is not None \
                else None
            pooled = self.pooled[feature_id] if self.pooled is not None else \
                None
            outliers = self.outliers[feature_id] if self.outliers is not None \
                else None
            # e.g. MISO-style ids come back as tuples; use the first element
            if isinstance(feature_id, tuple):
                feature_id = feature_id[0]
            if len(feature_id) > 25:
                feature_id = feature_id[:25] + '...'
            if renamed != feature_id:
                title = '{}\n{}'.format(feature_id, renamed)
            else:
                title = feature_id
            singles.name = renamed
            # pooled/outliers may be None; AttributeError means "not present"
            try:
                pooled.name = renamed
            except AttributeError:
                pass
            try:
                outliers.name = renamed
            except AttributeError:
                pass
            violinplot(singles, pooled_data=pooled, outliers=outliers,
                       groupby=self.groupby,
                       color_ordered=self.color_ordered,
                       order=self.order, title=title,
                       ax=ax, data_type=self.data_type,
                       **self.violinplot_kws)
            if self.data_type == 'splicing':
                ax.set_ylabel('$\Psi$')
                ax.set_ylim(0, 1)
            elif self.data_type == 'expression':
                ax.set_ylabel('Expression')
            else:
                ax.set_ylabel('')

        # Clear any unused axes
        for ax in axes.flat:
            # Check if the plotting space is empty
            # NOTE(review): "or" turns off any axes missing either artists
            # or lines — "and" may have been intended; confirm before
            # changing
            if len(ax.collections) == 0 or len(ax.lines) == 0:
                ax.axis('off')
        self.fig_violins.tight_layout()
Olga B. Botvinnik is funded by the NDSEG fellowship and is a NumFOCUS John Hunter Technology Fellow.
Michael T. Lovci was partially funded by a fellowship from Genentech.
Partially funded by NIH grants NS075449 and HG004659 and CIRM grants RB4-06045 and TR3-05676 to Gene Yeo.