# Source code for flotilla.data_model.study

"""
Data models for "studies" studies include attributes about the data and are
heavier in terms of data load
"""
import inspect
import itertools
import json
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import semantic_version
import seaborn as sns

from .metadata import MetaData, PHENOTYPE_COL, POOLED_COL, OUTLIER_COL
from .expression import ExpressionData, SpikeInData
from .gene_ontology import GeneOntologyData
from .quality_control import MappingStatsData, MIN_READS
from .splicing import SplicingData, FRACTION_DIFF_THRESH
from .supplemental import SupplementalData
from ..compute.predict import PredictorConfigManager
from ..datapackage import datapackage_url_to_dict, \
    check_if_already_downloaded, make_study_datapackage
from ..visualize.color import blue
from ..visualize.ipython_interact import Interactive
from ..datapackage import FLOTILLA_DOWNLOAD_DIR
from ..util import load_csv, load_json, load_tsv, load_gzip_pickle_df, \
    load_pickle_df, timestamp, cached_property


SPECIES_DATA_PACKAGE_BASE_URL = 'https://s3-us-west-2.amazonaws.com/' \
                                'flotilla-projects'
DATAPACKAGE_RESOURCE_COMMON_KWS = ('url', 'path', 'format', 'compression',
                                   'name')


def _is_absolute_path(location):
    return location.startswith('http') or location.startswith('/')


class Study(object):
    """A biological study, with associated metadata, expression, and
    splicing data.
    """
    default_feature_set_ids = []

    # Data types with enough data that we'd probably reduce them, and even
    # then we might want to take subsets. E.g. most variant genes for
    # expression. But we don't expect to do this for spikein or
    # mapping_stats data
    _subsetable_data_types = ['expression', 'splicing']

    initializers = {'metadata_data': MetaData,
                    'expression_data': ExpressionData,
                    'splicing_data': SplicingData,
                    'mapping_stats_data': MappingStatsData,
                    'spikein_data': SpikeInData}

    readers = {'tsv': load_tsv,
               'csv': load_csv,
               'json': load_json,
               'pickle_df': load_pickle_df,
               'gzip_pickle_df': load_gzip_pickle_df}

    _default_reducer_kwargs = {'whiten': False,
                               'show_point_labels': False,
                               'show_vectors': False}

    _common_id = 'common_id'
    _sample_id = 'sample_id'
    _event_name = 'event_name'

    _default_plot_kwargs = {'marker': 'o', 'color': blue}

    def __init__(self, sample_metadata, version='0.1.0',
                 metadata_pooled_col=POOLED_COL,
                 metadata_minimum_samples=0,
                 metadata_phenotype_col=PHENOTYPE_COL,
                 metadata_phenotype_order=None,
                 metadata_phenotype_to_color=None,
                 metadata_phenotype_to_marker=None,
                 metadata_outlier_col=OUTLIER_COL,
                 metadata_ignore_subset_cols=None,
                 metadata_batch_cols='batch',
                 metadata_batch_min_samples=3,
                 expression_data=None,
                 expression_feature_data=None,
                 expression_feature_rename_col=None,
                 expression_feature_ignore_subset_cols=None,
                 expression_log_base=None,
                 expression_thresh=-np.inf,
                 expression_plus_one=False,
                 expression_correct_batch_effects=False,
                 splicing_data=None,
                 splicing_feature_data=None,
                 splicing_feature_rename_col=None,
                 splicing_feature_ignore_subset_cols=None,
                 splicing_feature_expression_id_col=None,
                 mapping_stats_data=None,
                 mapping_stats_number_mapped_col=None,
                 mapping_stats_min_reads=MIN_READS,
                 spikein_data=None,
                 spikein_feature_data=None,
                 drop_outliers=True, species=None,
                 gene_ontology_data=None,
                 predictor_config_manager=None,
                 license=None, title=None, sources=None,
                 default_sample_subset="all_samples",
                 default_feature_subset="variant",
                 supplemental_data=None):
        """Construct a biological study

        This class only accepts data, no filenames. All data must already
        have been read in and exist as Python objects.

        Parameters
        ----------
        sample_metadata : pandas.DataFrame
            The only required parameter. Samples as the index, with
            features as columns. Required column: "phenotype". If there is
            a boolean column "pooled", this will be used to separate pooled
            from single cells. Similarly, the column "outliers" will also
            be used to separate outlier cells from the rest.
        version : str
            A string describing the semantic version of the data. Must be
            in major.minor.patch format, as the "patch" number will be
            increased if you change something in the study and then
            study.save() it. (default "0.1.0")
        expression_data : pandas.DataFrame
            Samples x features dataframe of gene expression measurements,
            e.g. from an RNA-Seq or a microarray experiment. Assumed to be
            log-transformed, i.e. you took the log of it. (default None)
        expression_feature_data : pandas.DataFrame
            Features x annotations dataframe describing other parameters
            of the gene expression features, e.g. mapping Ensembl IDs to
            gene symbols or gene biotypes. (default None)
        expression_feature_rename_col : str
            A column name in the expression_feature_data dataframe that
            you'd like to rename the expression features to, in the plots.
            For example, if your gene IDs are Ensembl IDs, but you want to
            plot UCSC IDs, make sure the column you want, e.g. "ucsc_id",
            is in your dataframe and specify that. (default "gene_name")
        expression_log_base : float
            If you want to log-transform your expression data (and it's
            not already log-transformed), use this number as the base of
            the transform. E.g. expression_log_base=10 will take the log10
            of your data. (default None)
        expression_thresh : float
            Minimum (non log-transformed) expression value. (default -inf)
        expression_plus_one : bool
            Whether or not to add 1 to the expression data. (default False)
        splicing_data : pandas.DataFrame
            Samples x features dataframe of percent spliced in scores,
            e.g. as measured by the program MISO. These values are assumed
            to fall only between 0 and 1.
        splicing_feature_data : pandas.DataFrame
            Features x other_features dataframe describing other
            parameters of the splicing features, e.g. mapping MISO IDs to
            Ensembl IDs or gene symbols or transcript types
        splicing_feature_rename_col : str
            A column name in the splicing_feature_data dataframe that
            you'd like to rename the splicing features to, in the plots.
            For example, if your splicing IDs are MISO IDs, but you want
            to plot Ensembl IDs, make sure the column you want, e.g.
            "ensembl_id", is in your dataframe and specify that.
            (default "gene_name")
        splicing_feature_expression_id_col : str
            A column name in the splicing_feature_data dataframe that
            corresponds to the row names of the expression data
        mapping_stats_data : pandas.DataFrame
            Samples x features dataframe of mapping stats measurements.
        mapping_stats_number_mapped_col : str
            A column name in the mapping_stats_data which specifies the
            number of (uniquely or not) mapped reads.
            (default "Uniquely mapped reads number")
        spikein_data : pandas.DataFrame
            Samples x features dataframe of spike-in expression values
        spikein_feature_data : pandas.DataFrame
            Features x other_features dataframe, e.g. of the molecular
            concentration of particular spikein transcripts
        drop_outliers : bool
            Whether or not to drop samples indicated as outliers in the
            sample_metadata from the other data, i.e. if there is a column
            named 'outlier' in sample_metadata, then remove those samples
            from expression_data for further analysis
        species : str
            Name of the species and genome version, e.g. 'hg19' or 'mm10'.
        gene_ontology_data : pandas.DataFrame
            Gene ids x ontology categories dataframe used for GO analysis.
        metadata_pooled_col : str
            Column in metadata_data which specifies as a boolean whether
            or not this sample was pooled.
        supplemental_data : dict
            str: dataframe mapping of the attribute name, and the pandas
            dataframe

        Note
        ----
        This function explicitly specifies ALL the instance variables
        (except those that are marked by the @property decorator),
        because, as described in [1], "If you write initialization
        functions separate from __init__ then experienced developers will
        certainly see your code as a kid's playground."

        [1] http://stackoverflow.com/q/12513185/1628971
        """
        sys.stdout.write("{}\tInitializing Study\n".format(timestamp()))
        sys.stdout.write("{}\tInitializing Predictor configuration manager "
                         "for Study\n".format(timestamp()))
        self.predictor_config_manager = predictor_config_manager \
            if predictor_config_manager is not None \
            else PredictorConfigManager()

        self.species = species
        if gene_ontology_data is not None:
            self.gene_ontology = GeneOntologyData(gene_ontology_data)

        self.license = license
        self.title = title
        self.sources = sources
        self.version = version

        sys.stdout.write('{}\tLoading metadata\n'.format(timestamp()))
        self.metadata = MetaData(
            sample_metadata, metadata_phenotype_order,
            metadata_phenotype_to_color,
            metadata_phenotype_to_marker, pooled_col=metadata_pooled_col,
            ignore_subset_cols=metadata_ignore_subset_cols,
            outlier_col=metadata_outlier_col,
            phenotype_col=metadata_phenotype_col,
            predictor_config_manager=self.predictor_config_manager,
            minimum_sample_subset=metadata_minimum_samples)

        self.default_feature_subset = default_feature_subset
        self.default_sample_subset = default_sample_subset

        if self.metadata.outlier_col in self.metadata.data and drop_outliers:
            outliers = self.metadata.data.index[
                self.metadata.data[self.metadata.outlier_col].astype(bool)]
        else:
            outliers = None
            self.metadata.data[self.metadata.outlier_col] = False

        # Get pooled samples
        pooled = None
        if self.metadata.pooled_col is not None:
            if self.metadata.pooled_col in self.metadata.data:
                try:
                    pooled = self.metadata.data.index[
                        self.metadata.data[
                            self.metadata.pooled_col].astype(bool)]
                except KeyError:
                    pooled = None
        self.pooled = pooled

        if mapping_stats_data is not None:
            self.mapping_stats = MappingStatsData(
                mapping_stats_data,
                number_mapped_col=mapping_stats_number_mapped_col,
                predictor_config_manager=self.predictor_config_manager,
                min_reads=mapping_stats_min_reads)
            self.technical_outliers = self.mapping_stats.too_few_mapped
            if len(self.technical_outliers) > 0:
                outliers_ids = ', '.join(self.technical_outliers)
                sys.stderr.write(
                    'Samples had too few mapped reads (<{:.1e} reads)'
                    ':\n\t{}\n'.format(mapping_stats_min_reads,
                                       outliers_ids))
        else:
            self.technical_outliers = None
            self.mapping_stats = None

        feature_data_none = expression_feature_data is None or \
            splicing_feature_data is None

        if self.species is not None and feature_data_none:
            sys.stdout.write('{}\tLoading species metadata from '
                             '~/flotilla_packages\n'.format(timestamp()))
            species_kws = self.load_species_data(self.species, self.readers)
            expression_feature_data = species_kws.pop(
                'expression_feature_data', None)
            expression_feature_rename_col = species_kws.pop(
                'expression_feature_rename_col', None)
            splicing_feature_data = species_kws.pop('splicing_feature_data',
                                                    None)
            splicing_feature_rename_col = species_kws.pop(
                'splicing_feature_rename_col', None)

            if expression_feature_data is None:
                expression_feature_data = species_kws.pop(
                    'expression_feature_data', None)
            if expression_feature_rename_col is None:
                expression_feature_rename_col = species_kws.pop(
                    'expression_feature_rename_col', None)
            if splicing_feature_data is None:
                splicing_feature_data = species_kws.pop(
                    'splicing_feature_data', None)
            if splicing_feature_rename_col is None:
                splicing_feature_rename_col = species_kws.pop(
                    'splicing_feature_rename_col', None)

        if expression_data is not None:
            sys.stdout.write(
                "{}\tLoading expression data\n".format(timestamp()))
            feature_ignore_subset_cols = expression_feature_ignore_subset_cols
            self.expression = ExpressionData(
                expression_data,
                feature_data=expression_feature_data,
                thresh=expression_thresh,
                feature_rename_col=expression_feature_rename_col,
                outliers=outliers, plus_one=expression_plus_one,
                log_base=expression_log_base, pooled=pooled,
                predictor_config_manager=self.predictor_config_manager,
                technical_outliers=self.technical_outliers,
                minimum_samples=metadata_minimum_samples,
                feature_ignore_subset_cols=feature_ignore_subset_cols)
            self.default_feature_set_ids.extend(
                self.expression.feature_subsets.keys())
        else:
            self.expression = None

        if splicing_data is not None:
            sys.stdout.write("{}\tLoading splicing data\n".format(
                timestamp()))
            self.splicing = SplicingData(
                splicing_data, feature_data=splicing_feature_data,
                feature_rename_col=splicing_feature_rename_col,
                outliers=outliers, pooled=pooled,
                predictor_config_manager=self.predictor_config_manager,
                technical_outliers=self.technical_outliers,
                minimum_samples=metadata_minimum_samples,
                feature_ignore_subset_cols=splicing_feature_ignore_subset_cols,
                feature_expression_id_col=splicing_feature_expression_id_col)
        else:
            self.splicing = None

        if spikein_data is not None:
            self.spikein = SpikeInData(
                spikein_data, feature_data=spikein_feature_data,
                technical_outliers=self.technical_outliers,
                predictor_config_manager=self.predictor_config_manager)
        else:
            self.spikein = None

        self.supplemental = SupplementalData(supplemental_data)
        sys.stdout.write("{}\tSuccessfully initialized a Study "
                         "object!\n".format(timestamp()))

    def __setattr__(self, key, value):
        """Check if the attribute already exists and warn on overwrite"""
        if hasattr(self, key):
            warnings.warn('Over-writing attribute {}'.format(key))
        super(Study, self).__setattr__(key, value)
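    # A minimal usage sketch for the constructor. The dataframe contents
    # and phenotype labels below are made up for illustration:
    #
    #     >>> import pandas as pd
    #     >>> metadata = pd.DataFrame({'phenotype': ['iPSC', 'NPC']},
    #     ...                         index=['sample1', 'sample2'])
    #     >>> expression = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
    #     ...                           index=['sample1', 'sample2'],
    #     ...                           columns=['geneA', 'geneB'])
    #     >>> study = Study(metadata, expression_data=expression)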
    @property
    def phenotype_col(self):
        return self.metadata.phenotype_col
    @property
    def phenotype_order(self):
        return self.metadata.phenotype_order

    @property
    def phenotype_to_color(self):
        return self.metadata.phenotype_to_color

    @property
    def phenotype_to_marker(self):
        return self.metadata.phenotype_to_marker

    @property
    def sample_id_to_phenotype(self):
        return self.metadata.sample_id_to_phenotype

    @property
    def sample_id_to_color(self):
        return self.metadata.sample_id_to_color

    @property
    def phenotype_transitions(self):
        return self.metadata.phenotype_transitions

    @property
    def phenotype_color_ordered(self):
        return self.metadata.phenotype_color_order

    @property
    def default_sample_subsets(self):
        # Move default_sample_subset to the front of the list, sort the rest
        sorted_sample_subsets = list(sorted(set(
            self.metadata.sample_subsets.keys()).difference(
            {self.default_sample_subset})))
        sorted_sample_subsets.insert(0, self.default_sample_subset)
        return sorted_sample_subsets

    @property
    def default_feature_subsets(self):
        feature_subsets = {}
        for name in self._subsetable_data_types:
            try:
                data_type = getattr(self, name)
                feature_subsets[name] = data_type.feature_subsets
            except AttributeError:
                continue
        return feature_subsets
    @classmethod
    def from_datapackage_url(
            cls, datapackage_url, load_species_data=True,
            species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
        """Create a study from a url of a datapackage.json file

        Parameters
        ----------
        datapackage_url : str
            HTTP url of a datapackage.json file, following the
            specification described here:
            http://dataprotocols.org/data-packages/ and requiring the
            following data resources: metadata, expression, splicing
        species_datapackage_base_url : str
            Base URL to fetch species-specific gene and splicing event
            metadata from.
            (default 'https://s3-us-west-2.amazonaws.com/flotilla-projects')

        Returns
        -------
        study : Study
            A "study" object containing the data described in the
            datapackage_url file

        Raises
        ------
        AttributeError
            If the datapackage.json file does not contain the required
            resources of metadata, expression, and splicing.
        """
        datapackage = datapackage_url_to_dict(datapackage_url)
        datapackage_dir = '{}/{}'.format(FLOTILLA_DOWNLOAD_DIR,
                                         datapackage['name'])
        return cls.from_datapackage(
            datapackage, load_species_data=load_species_data,
            datapackage_dir=datapackage_dir,
            species_datapackage_base_url=species_datapackage_base_url)
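    # Hypothetical usage sketch (the URL below is a placeholder, not a
    # real datapackage):
    #
    #     >>> study = Study.from_datapackage_url(
    #     ...     'https://example.com/myproject/datapackage.json')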
    @classmethod
    def from_datapackage_file(
            cls, datapackage_filename, load_species_data=True,
            species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
        with open(datapackage_filename) as f:
            sys.stdout.write('{}\tReading datapackage from {}\n'.format(
                timestamp(), datapackage_filename))
            datapackage = json.load(f)
        datapackage_dir = os.path.dirname(datapackage_filename)
        return cls.from_datapackage(
            datapackage, datapackage_dir=datapackage_dir,
            load_species_data=load_species_data,
            species_datapackage_base_url=species_datapackage_base_url)
    @staticmethod
    def _filename_from_resource(resource, datapackage_dir,
                                datapackage_name):
        if 'url' in resource:
            resource_url = resource['url']
            if not _is_absolute_path(resource_url):
                resource_url = '{}/{}'.format(datapackage_dir, resource_url)
            filename = check_if_already_downloaded(resource_url,
                                                   datapackage_name)
            return filename
        elif 'path' in resource:
            if resource['path'].startswith('http'):
                filename = check_if_already_downloaded(resource['path'],
                                                       datapackage_name)
            else:
                filename = resource['path']
                if not _is_absolute_path(filename):
                    filename = '{}/{}'.format(datapackage_dir, filename)
                # Test if the file exists; if not, then add the
                # datapackage directory
                if not os.path.exists(filename):
                    filename = os.path.join(datapackage_dir, filename)
            return filename
        else:
            return None

    @classmethod
    def from_datapackage(
            cls, datapackage, datapackage_dir='./', load_species_data=True,
            species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
        """Create a study object from a datapackage dictionary

        Parameters
        ----------
        datapackage : dict
            A dictionary conforming to the datapackage specification,
            e.g. parsed from a datapackage.json file

        Returns
        -------
        study : flotilla.Study
            Study object
        """
        sys.stdout.write('{}\tParsing datapackage to create a Study '
                         'object\n'.format(timestamp()))
        dfs = {}
        kwargs = {}
        supplemental_data = {}
        datapackage_name = datapackage['name']

        for resource in datapackage['resources']:
            filename = cls._filename_from_resource(resource,
                                                   datapackage_dir,
                                                   datapackage_name)
            if filename is None:
                # This is supplemental data
                for supplemental in resource[u'resources']:
                    filename = cls._filename_from_resource(
                        supplemental, datapackage_dir, datapackage_name)
                    name = supplemental['name']
                    reader = cls.readers[supplemental['format']]
                    compression = None if 'compression' not in \
                        supplemental else supplemental['compression']
                    header = supplemental.pop('header', 0)
                    index_col = supplemental.pop('index_col', 0)
                    df = reader(filename, compression=compression,
                                header=header, index_col=index_col)
                    supplemental_data[name] = df
            else:
                name = resource['name']
                reader = cls.readers[resource['format']]
                compression = None if 'compression' not in resource else \
                    resource['compression']
                header = resource.pop('header', 0)
                index_col = resource.pop('index_col', 0)
                dfs[name] = reader(filename, compression=compression,
                                   header=header, index_col=index_col)
                for key in set(resource.keys()).difference(
                        DATAPACKAGE_RESOURCE_COMMON_KWS):
                    kwargs['{}_{}'.format(name, key)] = resource[key]

        species_kws = {}
        species = None if 'species' not in datapackage else datapackage[
            'species']
        if load_species_data and species is not None:
            species_kws = cls.load_species_data(
                species, cls.readers, species_datapackage_base_url)

        try:
            sample_metadata = dfs.pop('metadata')
        except KeyError:
            raise AttributeError('The datapackage.json file is required to '
                                 'have the "metadata" resource')

        dfs = dict(('{}_data'.format(k), v) for k, v in dfs.iteritems())

        nones = [k for k, v in kwargs.iteritems() if v is None]
        for key in nones:
            kwargs.pop(key)
        kwargs.update(species_kws)
        kwargs.update(dfs)

        license = None if 'license' not in datapackage else datapackage[
            'license']
        title = None if 'title' not in datapackage else datapackage[
            'title']
        sources = None if 'sources' not in datapackage else datapackage[
            'sources']
        version = None if 'datapackage_version' not in datapackage else \
            datapackage['datapackage_version']
        if not semantic_version.validate(version):
            raise ValueError(
                '{} is not a valid version string. Please use semantic '
                'versioning, with major.minor.patch, e.g. 0.1.2 is a valid '
                'version string'.format(version))
        study = Study(sample_metadata=sample_metadata, species=species,
                      license=license, title=title, sources=sources,
                      version=version, supplemental_data=supplemental_data,
                      **kwargs)
        return study
    @staticmethod
    def load_species_data(
            species, readers,
            species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
        dfs = {}
        try:
            species_data_url = '{}/{}/datapackage.json'.format(
                species_datapackage_base_url, species)
            species_datapackage = datapackage_url_to_dict(species_data_url)
            for resource in species_datapackage['resources']:
                if 'url' in resource:
                    resource_url = resource['url']
                    filename = check_if_already_downloaded(resource_url,
                                                           species)
                else:
                    filename = resource['path']
                reader = readers[resource['format']]
                compression = None if 'compression' not in resource else \
                    resource['compression']
                name = resource['name']
                dfs[name] = reader(filename, index_col=0,
                                   compression=compression)
                other_keys = set(resource.keys()).difference(
                    DATAPACKAGE_RESOURCE_COMMON_KWS)
                name_no_data = name.rstrip('_data')
                for key in other_keys:
                    new_key = '{}_{}'.format(name_no_data, key)
                    dfs[new_key] = resource[key]
        except (IOError, ValueError) as e:
            sys.stderr.write('Error loading species {} data:'
                             ' {}'.format(species, e))
        return dfs
    def detect_outliers(self, data_type='expression',
                        sample_subset=None, feature_subset=None,
                        featurewise=False, reducer=None, standardize=None,
                        reducer_kwargs=None, bins=None,
                        outlier_detection_method=None,
                        outlier_detection_method_kwargs=None):
        if sample_subset is None:
            sample_subset = self.default_sample_subset
        sample_ids = self.sample_subset_to_sample_ids(sample_subset)

        if feature_subset is None:
            feature_subset = self.default_feature_subset
        feature_ids = self.feature_subset_to_feature_ids(data_type,
                                                         feature_subset,
                                                         rename=False)

        if data_type == "expression":
            datamodel = self.expression
        elif data_type == "splicing":
            datamodel = self.splicing
        else:
            raise TypeError('{} not a supported data type'.format(data_type))

        reducer, outlier_detector = datamodel.detect_outliers(
            sample_ids=sample_ids, feature_ids=feature_ids,
            featurewise=featurewise, reducer=reducer,
            standardize=standardize, reducer_kwargs=reducer_kwargs,
            bins=bins, outlier_detection_method=outlier_detection_method,
            outlier_detection_method_kwargs=outlier_detection_method_kwargs)

        outlier_detector.predict(reducer.reduced_space)
        outlier_detector.title = "_".join(
            ['outlier', data_type, sample_subset, feature_subset])
        sys.stdout.write('setting outlier type: "{}" in '
                         'metadata\n'.format(outlier_detector.title))
        if outlier_detector.title not in self.metadata.data:
            self.metadata.data[outlier_detector.title] = False
        self.metadata.data[outlier_detector.title].update(
            outlier_detector.outliers)
        return reducer, outlier_detector
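    # Hypothetical usage sketch (assumes the study has expression data and
    # a "variant" feature subset):
    #
    #     >>> reducer, detector = study.detect_outliers(
    #     ...     data_type='expression', feature_subset='variant')
    #     >>> # Outlier calls are recorded as a boolean metadata column
    #     >>> study.metadata.data[detector.title].sum()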
    def drop_outliers(self):
        """Propagate samples marked as "outlier" in the metadata to the
        expression and splicing data"""
        outliers = self.metadata.data['outlier'][
            self.metadata.data['outlier']].index
        self.expression.outlier_samples = outliers
        self.splicing.outlier_samples = outliers
    @staticmethod
    def maybe_make_directory(filename):
        # Make the directory if it's not already there
        try:
            directory = os.path.abspath(os.path.dirname(filename))
            os.makedirs(os.path.abspath(directory))
        except OSError:
            pass
    def feature_subset_to_feature_ids(self, data_type, feature_subset=None,
                                      rename=False):
        """Given a name of a feature subset, get the associated feature ids

        Parameters
        ----------
        data_type : str
            A string describing the data type, e.g. "expression". Prefixes
            are accepted, e.g. "exp" matches "expression".
        feature_subset : str
            A string describing the subset of data type (must be already
            calculated)

        Returns
        -------
        feature_ids : list of strings
            List of feature ids from the specified data type
        """
        if 'expression'.startswith(data_type):
            return self.expression.feature_subset_to_feature_ids(
                feature_subset, rename)
        elif 'splicing'.startswith(data_type):
            return self.splicing.feature_subset_to_feature_ids(
                feature_subset, rename)
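    # Because matching is done with str.startswith, abbreviated data type
    # names work too. A hypothetical sketch:
    #
    #     >>> study.feature_subset_to_feature_ids('exp', 'variant')
    #     >>> # equivalent to
    #     >>> study.feature_subset_to_feature_ids('expression', 'variant')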
    def sample_subset_to_sample_ids(self, phenotype_subset=None):
        """Convert a string naming a subset of phenotypes in the data into
        sample ids

        Parameters
        ----------
        phenotype_subset : str
            A valid string describing a boolean phenotype described in the
            metadata data

        Returns
        -------
        sample_ids : list of strings
            List of sample ids in the data
        """
        # If this is a list of IDs
        try:
            return self.metadata.sample_subsets[phenotype_subset]
        except (KeyError, TypeError):
            pass

        try:
            ind = self.metadata.sample_id_to_phenotype == phenotype_subset
            if ind.sum() > 0:
                return self.metadata.sample_id_to_phenotype.index[ind]

            if phenotype_subset is None or 'all_samples'.startswith(
                    phenotype_subset):
                sample_ind = np.ones(self.metadata.data.shape[0],
                                     dtype=bool)
            elif phenotype_subset.startswith("~"):
                sample_ind = ~pd.Series(
                    self.metadata.data[phenotype_subset.lstrip("~")],
                    dtype='bool')
            else:
                sample_ind = pd.Series(
                    self.metadata.data[phenotype_subset], dtype='bool')
            sample_ids = self.metadata.data.index[sample_ind]
            return sample_ids
        except (AttributeError, ValueError):
            return phenotype_subset
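    # Subset strings can name a phenotype, a boolean metadata column, or a
    # negated boolean column. A hypothetical sketch, assuming an "outlier"
    # boolean column exists in the metadata:
    #
    #     >>> study.sample_subset_to_sample_ids('all_samples')
    #     >>> study.sample_subset_to_sample_ids('outlier')
    #     >>> study.sample_subset_to_sample_ids('~outlier')  # non-outliers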
    def plot_pca(self, data_type='expression', x_pc=1, y_pc=2,
                 sample_subset=None, feature_subset=None,
                 title='', featurewise=False, plot_violins=False,
                 show_point_labels=False, reduce_kwargs=None,
                 color_samples_by=None, bokeh=False,
                 most_variant_features=False, std_multiplier=2,
                 scale_by_variance=True, **kwargs):
        """Perform DataFramePCA on either expression or splicing study_data

        Parameters
        ----------
        data_type : str
            One of the names of the data types, e.g. "expression" or
            "splicing" (default "expression")
        x_pc : int, optional
            Which principal component to plot on the x-axis (default 1)
        y_pc : int, optional
            Which principal component to plot on the y-axis (default 2)
        sample_subset : str or None
            Which subset of the samples to use, based on some phenotype
            column in the experiment design data. If None, all samples
            are used. (default None)
        feature_subset : str or None
            Which subset of the features to use, based on some feature
            type in the expression data (e.g. "variant"). If None, all
            features are used. (default None)
        title : str, optional
            Title of the reduced space plot (default '')
        featurewise : bool, optional
            If True, the features are reduced on the samples, and the
            plotted points are features, not samples. (default False)
        plot_violins : bool
            Whether or not to make the violinplots of the top features.
            This can take a long time, so to save time you can turn it off
            if you just want a quick look at the PCA. (default False)
        show_point_labels : bool, optional
            Whether or not to show the labels of the points. If this is
            samplewise (default), then this labels the samples. If this is
            featurewise, then this labels the features. (default False)
        reduce_kwargs : dict, optional
            Keyword arguments to the reducer (default None)
        color_samples_by : str, optional
            Instead of coloring the samples by their phenotype, color them
            by this column in the metadata. (default None)
        bokeh : bool, optional
            If True, plot a javascripty/interactive bokeh plot instead of
            a static printable figure (default False)
        most_variant_features : bool, optional
            If True, then only take the most variant of the provided
            features. The most variant are determined by taking the
            features whose variance is ``std_multiplier`` standard
            deviations away from the mean feature variance (default False)
        std_multiplier : float, optional
            If ``most_variant_features`` is True, then use this as a
            cutoff for the minimum variance of a feature to be included
            (default 2)
        scale_by_variance : bool, optional
            If True, then scale the x- and y-axes by the explained
            variance ratio of the principal component dimensions. Only
            valid for PCA and its variations, not for NMF or tSNE.
            (default True)
        kwargs : other keyword arguments
            All other keyword arguments are passed to
            :py:meth:`DecompositionViz.plot`
        """
        sample_subset = self.default_sample_subset \
            if sample_subset is None else sample_subset
        feature_subset = self.default_feature_subset \
            if feature_subset is None else feature_subset

        sample_ids = self.sample_subset_to_sample_ids(sample_subset)
        feature_ids = self.feature_subset_to_feature_ids(data_type,
                                                         feature_subset,
                                                         rename=False)

        label_to_color = None
        label_to_marker = None
        groupby = None
        order = None

        color_samples_by_phenotype = \
            color_samples_by == self.metadata.phenotype_col

        if not featurewise:
            if color_samples_by is None or color_samples_by_phenotype:
                label_to_color = self.phenotype_to_color
                label_to_marker = self.phenotype_to_marker
                groupby = self.sample_id_to_phenotype
                order = self.phenotype_order
            else:
                groupby = self.metadata.data[color_samples_by]

        if "expression".startswith(data_type):
            reducer = self.expression.plot_pca(
                x_pc=x_pc, y_pc=y_pc, sample_ids=sample_ids,
                feature_ids=feature_ids,
                label_to_color=label_to_color,
                label_to_marker=label_to_marker,
                groupby=groupby, order=order,
                std_multiplier=std_multiplier,
                featurewise=featurewise,
                show_point_labels=show_point_labels,
                title=title, reduce_kwargs=reduce_kwargs,
                plot_violins=plot_violins,
                metadata=self.metadata.data, bokeh=bokeh,
                most_variant_features=most_variant_features,
                scale_by_variance=scale_by_variance,
                **kwargs)
        elif "splicing".startswith(data_type):
            reducer = self.splicing.plot_pca(
                x_pc=x_pc, y_pc=y_pc, sample_ids=sample_ids,
                feature_ids=feature_ids,
                label_to_color=label_to_color,
                label_to_marker=label_to_marker,
                groupby=groupby, order=order,
                std_multiplier=std_multiplier,
                featurewise=featurewise,
                show_point_labels=show_point_labels,
                title=title, reduce_kwargs=reduce_kwargs,
                plot_violins=plot_violins,
                metadata=self.metadata.data, bokeh=bokeh,
                most_variant_features=most_variant_features,
                scale_by_variance=scale_by_variance,
                **kwargs)
        else:
            raise ValueError('The data type {} does not exist in this study'
                             .format(data_type))
        return reducer
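    # Hypothetical usage sketch:
    #
    #     >>> study.plot_pca(data_type='expression',
    #     ...                feature_subset='variant', x_pc=1, y_pc=2)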
    def plot_graph(self, data_type='expression', sample_subset=None,
                   feature_subset=None, featurewise=False, **kwargs):
        """Plot the graph (network) of these data

        Parameters
        ----------
        data_type : str
            One of the names of the data types, e.g. "expression" or
            "splicing"
        sample_subset : str or None
            Which subset of the samples to use, based on some phenotype
            column in the experiment design data. If None, all samples
            are used.
        feature_subset : str or None
            Which subset of the features to use, based on some feature
            type in the expression data (e.g. "variant"). If None, all
            features are used.
        """
        sample_ids = self.sample_subset_to_sample_ids(sample_subset)
        feature_ids = self.feature_subset_to_feature_ids(data_type,
                                                         feature_subset,
                                                         rename=False)

        if not featurewise:
            label_to_color = self.phenotype_to_color
            label_to_marker = self.phenotype_to_marker
            groupby = self.sample_id_to_phenotype
        else:
            label_to_color = None
            label_to_marker = None
            groupby = None

        if data_type == "expression":
            return self.expression.networks.draw_graph(
                sample_ids=sample_ids, feature_ids=feature_ids,
                sample_id_to_color=self.sample_id_to_color,
                label_to_color=label_to_color,
                label_to_marker=label_to_marker, groupby=groupby,
                featurewise=featurewise, **kwargs)
        elif data_type == "splicing":
            return self.splicing.networks.draw_graph(
                sample_ids=sample_ids, feature_ids=feature_ids,
                sample_id_to_color=self.sample_id_to_color,
                label_to_color=label_to_color,
                label_to_marker=label_to_marker, groupby=groupby,
                featurewise=featurewise, **kwargs)
    def plot_classifier(self, trait, sample_subset=None,
                        feature_subset='all_genes',
                        data_type='expression', title='',
                        show_point_labels=False, **kwargs):
        """Plot a predictor for the specified data type and trait(s)

        Parameters
        ----------
        data_type : str
            One of the names of the data types, e.g. "expression" or
            "splicing"
        trait : str
            Column name in the metadata data that you would like to
            classify on
        """
        try:
            trait_data = self.metadata.data[trait]
        except KeyError:
            trait_ids = self.sample_subset_to_sample_ids(trait)
            trait_data = self.metadata.data.index.isin(trait_ids)
        if trait_data.dtype == bool:
            all_true = np.all(trait_data)
            all_false = np.all(~trait_data)
            too_few_categories = False
        else:
            all_false = False
            all_true = False
            too_few_categories = len(set(trait_data)) <= 1
        nothing_to_classify = all_true or all_false or too_few_categories

        if nothing_to_classify:
            raise ValueError("Cannot classify when all samples have the "
                             "same value for the trait (all True, all "
                             "False, or a single category)")
        trait_data = pd.Series(trait_data, name=trait,
                               index=self.metadata.data.index)

        sample_ids = self.sample_subset_to_sample_ids(sample_subset)
        feature_ids = self.feature_subset_to_feature_ids(data_type,
                                                         feature_subset,
                                                         rename=False)
        feature_subset = 'none' if feature_subset is None else feature_subset
        sample_subset = 'none' if sample_subset is None else sample_subset
        data_name = '_'.join([sample_subset, feature_subset])

        label_to_color = self.phenotype_to_color
        label_to_marker = self.phenotype_to_marker
        groupby = self.sample_id_to_phenotype
        order = self.phenotype_order
        color = self.phenotype_color_ordered

        if data_type == "expression":
            return self.expression.plot_classifier(
                data_name=data_name, trait=trait_data,
                sample_ids=sample_ids, feature_ids=feature_ids,
                label_to_color=label_to_color,
                label_to_marker=label_to_marker, groupby=groupby,
                show_point_labels=show_point_labels, title=title,
                order=order, color=color, **kwargs)
        elif data_type == "splicing":
            return self.splicing.plot_classifier(
                data_name=data_name, trait=trait_data,
                sample_ids=sample_ids, feature_ids=feature_ids,
                label_to_color=label_to_color,
                label_to_marker=label_to_marker, groupby=groupby,
                show_point_labels=show_point_labels, title=title,
                order=order, color=color, **kwargs)
    def modality_assignments(self, sample_subset=None, feature_subset=None,
                             expression_thresh=-np.inf, min_samples=10):
        """Get modality assignments of splicing data

        Parameters
        ----------
        sample_subset : str or None, optional
            Which subset of the samples to use, based on some phenotype
            column in the experiment design data. If None, all samples
            are used.
        feature_subset : str or None, optional
            Which subset of the features to use, based on some feature
            type in the expression data (e.g. "variant"). If None, all
            features are used.
        expression_thresh : float, optional
            Minimum expression value, of the original input. E.g. if the
            original input is already log-transformed, then this
            threshold is on the log values.

        Returns
        -------
        modalities : pandas.DataFrame
            A (n_phenotypes, n_events) shaped DataFrame of the assigned
            modality
        """
        min_expression = self.expression.data.min().min()
        if expression_thresh > -np.inf and \
                expression_thresh > min_expression:
            data = self.filter_splicing_on_expression(
                expression_thresh=expression_thresh,
                sample_subset=sample_subset)
            sample_ids = None
            feature_ids = None
        else:
            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            feature_ids = self.feature_subset_to_feature_ids(
                'splicing', feature_subset, rename=False)
            data = None
        return self.splicing.modality_assignments(
            sample_ids, feature_ids, data=data,
            groupby=self.sample_id_to_phenotype, min_samples=min_samples)
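    # Hypothetical usage sketch, keeping only events in genes expressed at
    # a minimum of 1 (in the original expression units):
    #
    #     >>> assignments = study.modality_assignments(expression_thresh=1)
    #     >>> assignments.head()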
    def modality_counts(self, sample_subset=None, feature_subset=None,
                        expression_thresh=-np.inf, min_samples=10):
        """Get the number of splicing events in each modality category

        Parameters
        ----------
        sample_subset : str or None, optional
            Which subset of the samples to use, based on some phenotype
            column in the experiment design data. If None, all samples
            are used.
        feature_subset : str or None, optional
            Which subset of the features to use, based on some feature
            type in the expression data (e.g. "variant"). If None, all
            features are used.
        expression_thresh : float, optional
            Minimum expression value, of the original input. E.g. if the
            original input is already log-transformed, then this
            threshold is on the log values.

        Returns
        -------
        modalities : pandas.DataFrame
            A (n_phenotypes, n_modalities) shaped DataFrame of the number
            of events assigned to each modality
        """
        min_expression = self.expression.data.min().min()
        if expression_thresh > -np.inf and \
                expression_thresh > min_expression:
            data = self.filter_splicing_on_expression(
                expression_thresh=expression_thresh,
                sample_subset=sample_subset)
            sample_ids = None
            feature_ids = None
        else:
            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            feature_ids = self.feature_subset_to_feature_ids(
                'splicing', feature_subset, rename=False)
            data = None
        return self.splicing.modality_counts(
            sample_ids, feature_ids, data=data,
            groupby=self.sample_id_to_phenotype, min_samples=min_samples)
    def plot_modalities_bars(self, sample_subset=None, feature_subset=None,
                             expression_thresh=-np.inf, percentages=True):
        """Make grouped barplots of the number of modalities per phenotype

        Parameters
        ----------
        sample_subset : str or None
            Which subset of the samples to use, based on some phenotype
            column in the experiment design data. If None, all samples
            are used.
        feature_subset : str or None
            Which subset of the features to use, based on some feature
            type in the expression data (e.g. "variant"). If None, all
            features are used.
        expression_thresh : float
            If greater than -inf, then filter on splicing events in genes
            with expression at least this value
        percentages : bool
            If True, plot percentages instead of counts
        """
        if expression_thresh > -np.inf:
            data = self.filter_splicing_on_expression(
                expression_thresh=expression_thresh,
                sample_subset=sample_subset)
            sample_ids = None
            feature_ids = None
        else:
            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            feature_ids = self.feature_subset_to_feature_ids(
                'splicing', feature_subset, rename=False)
            data = None

        self.splicing.plot_modalities_bars(sample_ids, feature_ids, data,
                                           self.sample_id_to_phenotype,
                                           self.phenotype_to_color,
                                           percentages=percentages)
    def plot_modalities_reduced(self, sample_subset=None,
                                feature_subset=None,
                                expression_thresh=-np.inf):
        """Plot splicing events with modality assignments in NMF space

        This will plot a separate NMF space for each celltype in the data,
        as well as one for all samples.

        Parameters
        ----------
        sample_subset : str or None
            Which subset of the samples to use, based on some phenotype
            column in the experiment design data. If None, all samples
            are used.
        feature_subset : str or None
            Which subset of the features to use, based on some feature
            type in the expression data (e.g. "variant"). If None, all
            features are used.
        expression_thresh : float
            If greater than -inf, then filter on splicing events in genes
            with expression at least this value
        """
        if expression_thresh > -np.inf:
            data = self.filter_splicing_on_expression(
                expression_thresh=expression_thresh,
                sample_subset=sample_subset)
            sample_ids = None
            feature_ids = None
        else:
            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            feature_ids = self.feature_subset_to_feature_ids(
                'splicing', feature_subset, rename=False)
            data = None

        grouped = self.sample_id_to_phenotype.groupby(
            self.sample_id_to_phenotype)

        # Account for the additional plot of the reduced space of ALL
        # samples
        n = grouped.ngroups + 1
        groups = ['all']
        fig, axes = plt.subplots(ncols=n, figsize=(n * 4, 4))
        all_ax = axes[0]
        self.splicing.plot_modalities_reduced(sample_ids, feature_ids,
                                              data=data, ax=all_ax,
                                              title='all samples')

        axes = axes[1:]
        for i, ((celltype, series), ax) in enumerate(zip(grouped, axes)):
            groups.append(celltype)
            samples = series.index.intersection(sample_ids)
            self.splicing.plot_modalities_reduced(samples, feature_ids,
                                                  data=data, ax=ax,
                                                  title=celltype)
    def celltype_sizes(self, data_type='splicing'):
        if data_type == 'expression':
            return self.expression.data.groupby(self.sample_id_to_phenotype,
                                                axis=0).size()
        if data_type == 'splicing':
            return self.splicing.data.groupby(self.sample_id_to_phenotype,
                                              axis=0).size()
    @property
    def celltype_event_counts(self):
        """Number of cells that detected each event, per celltype"""
        return self.splicing.data.groupby(
            self.sample_id_to_phenotype, axis=0).apply(
            lambda x: x.groupby(level=0, axis=0).transform(
                lambda x: x.count()).sum()).replace(0, np.nan)
    def unique_celltype_event_counts(self, n=1):
        celltype_event_counts = self.celltype_event_counts
        return celltype_event_counts[celltype_event_counts <= n]
    def percent_unique_celltype_events(self, n=1):
        n_unique = self.unique_celltype_event_counts(n).sum(axis=1)
        n_total = self.celltype_event_counts.sum(axis=1).astype(float)
        return n_unique / n_total * 100

    # @property
    # def celltype_modalities(self):
    #     """Return modality assignments of each celltype"""
    #     return self.splicing.data.groupby(
    #         self.sample_id_to_phenotype, axis=0).apply(
    #         lambda x: self.splicing.modalities(x.index))
    def plot_modalities_lavalamps(self, sample_subset=None,
                                  feature_subset=None,
                                  expression_thresh=-np.inf):
        """Plot each modality in each celltype on separate axes

        Parameters
        ----------
        sample_subset : str or None
            Which subset of the samples to use, based on some phenotype
            column in the experiment design data. If None, all samples
            are used.
        feature_subset : str or None
            Which subset of the features to use, based on some feature
            type in the expression data (e.g. "variant"). If None, all
            features are used.
        expression_thresh : float
            If greater than -inf, then filter on splicing events in genes
            with expression at least this value
        """
        if expression_thresh > -np.inf:
            data = self.filter_splicing_on_expression(
                expression_thresh=expression_thresh,
                sample_subset=sample_subset)
            sample_ids = None
            feature_ids = None
        else:
            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            feature_ids = self.feature_subset_to_feature_ids(
                'splicing', feature_subset, rename=False)
            data = None
        self.splicing.plot_modalities_lavalamps(
            groupby=self.sample_id_to_phenotype,
            phenotype_to_color=self.phenotype_to_color,
            sample_ids=sample_ids, data=data, feature_ids=feature_ids)
    def plot_event_modality_estimation(self, event_id, sample_subset=None,
                                       expression_thresh=-np.inf):
        if expression_thresh > -np.inf:
            data = self.filter_splicing_on_expression(
                expression_thresh=expression_thresh,
                sample_subset=sample_subset)
            sample_ids = None
        else:
            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            data = None
        self.splicing.plot_event_modality_estimation(
            event_id, groupby=self.sample_id_to_phenotype,
            sample_ids=sample_ids, data=data)
    def plot_event(self, feature_id, sample_subset=None, nmf_space=False):
        """Plot the violinplot and NMF transitions of a splicing event"""
        sample_ids = self.sample_subset_to_sample_ids(sample_subset)
        self.splicing.plot_feature(
            feature_id, sample_ids,
            phenotype_groupby=self.sample_id_to_phenotype,
            phenotype_order=self.phenotype_order,
            color=self.phenotype_color_ordered,
            phenotype_to_color=self.phenotype_to_color,
            phenotype_to_marker=self.phenotype_to_marker,
            nmf_space=nmf_space)
    def plot_gene(self, feature_id, sample_subset=None, nmf_space=False):
        sample_ids = self.sample_subset_to_sample_ids(sample_subset)
        self.expression.plot_feature(
            feature_id, sample_ids,
            phenotype_groupby=self.sample_id_to_phenotype,
            phenotype_order=self.phenotype_order,
            color=self.phenotype_color_ordered,
            phenotype_to_color=self.phenotype_to_color,
            phenotype_to_marker=self.phenotype_to_marker,
            nmf_space=nmf_space)
    def plot_lavalamp_pooled_inconsistent(
            self, sample_subset=None, feature_subset=None,
            fraction_diff_thresh=FRACTION_DIFF_THRESH,
            expression_thresh=-np.inf):
        celltype_groups = self.metadata.data.groupby(
            self.sample_id_to_phenotype, axis=0)

        if sample_subset is not None:
            # Only plotting one sample_subset
            celltype_samples = set(celltype_groups.groups[sample_subset])
        else:
            # Plotting all the celltypes
            celltype_samples = self.sample_subset_to_sample_ids(
                sample_subset)

        feature_ids = self.feature_subset_to_feature_ids(
            'splicing', feature_subset=feature_subset)

        celltype_and_sample_ids = celltype_groups.groups.iteritems()
        for i, (phenotype, sample_ids) in enumerate(celltype_and_sample_ids):
            # Assumes all samples of a sample_subset have the same color...
            # probably wrong
            color = self.phenotype_to_color[phenotype]
            sample_ids = celltype_samples.intersection(sample_ids)
            if len(sample_ids) == 0:
                continue
            data = self.filter_splicing_on_expression(expression_thresh)
            data = data.ix[sample_ids, :]
            self.splicing.plot_lavalamp_pooled_inconsistent(
                data, feature_ids, fraction_diff_thresh, color=color)
    def percent_pooled_inconsistent(
            self, sample_subset=None, feature_subset=None,
            fraction_diff_thresh=FRACTION_DIFF_THRESH,
            expression_thresh=-np.inf):
        celltype_groups = self.metadata.data.groupby(
            self.sample_id_to_phenotype, axis=0)

        if sample_subset is not None:
            # Only plotting one sample_subset
            celltype_samples = set(celltype_groups.groups[sample_subset])
        else:
            # Plotting all the celltypes
            celltype_samples = self.sample_subset_to_sample_ids(
                sample_subset)

        feature_ids = self.feature_subset_to_feature_ids(
            'splicing', feature_subset=feature_subset)

        celltype_and_sample_ids = celltype_groups.groups.iteritems()
        index = pd.MultiIndex.from_product([celltype_groups.groups.keys(),
                                            ['n_events', 'percent']])
        percents = pd.Series(index=index)
        for i, (phenotype, sample_ids) in enumerate(celltype_and_sample_ids):
            # Assumes all samples of a sample_subset have the same color...
            # probably wrong
            sample_ids = celltype_samples.intersection(sample_ids)
            if len(sample_ids) == 0:
                continue
            data = self.filter_splicing_on_expression(expression_thresh)
            data = data.ix[sample_ids, :]
            if not data.empty:
                singles, pooled, not_measured_in_pooled, \
                    pooled_inconsistent = self.splicing.pooled_inconsistent(
                        data, feature_ids, fraction_diff_thresh)
                percent = self.splicing._divide_inconsistent_and_pooled(
                    pooled, pooled_inconsistent)
            else:
                percent = np.nan
            percents[phenotype, 'percent'] = percent
            percents[phenotype, 'n_events'] = data.shape[1]
        return percents
    def expression_vs_inconsistent_splicing(self, bins=None):
        """Percent of events inconsistent with pooled, at each expression
        threshold

        Parameters
        ----------
        bins : list-like
            List of expression cutoffs

        Returns
        -------
        expression_vs_inconsistent : pd.DataFrame
            A (len(bins), n_phenotypes) dataframe of the percentage of
            events in single cells that are inconsistent with pooled
        """
        if bins is None:
            emin = int(np.floor(self.expression.data_original.min().min()))
            emax = int(np.ceil(self.expression.data_original.max().max()))
            bins = np.arange(emin, emax)

        expression_vs_inconsistent = pd.Series(bins).apply(
            lambda x: self.percent_pooled_inconsistent(expression_thresh=x))
        return expression_vs_inconsistent
    def plot_expression_vs_inconsistent_splicing(self, bins=None):
        expression_vs_inconsistent = self.expression_vs_inconsistent_splicing(
            bins=bins)
        fig, axes = plt.subplots(nrows=2, figsize=(6, 6))

        # Plot the percent inconsistent
        ax = axes[0]
        for phenotype in self.phenotype_order:
            s = expression_vs_inconsistent[(phenotype, 'percent')]
            color = self.phenotype_to_color[phenotype]
            ax.plot(s, 'o-', color=color)
        ax.set_xlabel('Expression threshold')
        ax.set_ylabel('Percent events inconsistent with pooled')
        ymin, ymax = ax.get_ylim()
        ax.set_ylim(0, ymax)

        # Plot the number of events at each cutoff
        ax = axes[1]
        for phenotype in self.phenotype_order:
            s = expression_vs_inconsistent[(phenotype, 'n_events')]
            color = self.phenotype_to_color[phenotype]
            ax.plot(s, 'o-', color=color)
        ax.set_xlabel('Expression threshold')
        ax.set_ylabel('Number of events')
        ymin, ymax = ax.get_ylim()
        ax.set_ylim(0, ymax)
        ax.legend()
        sns.despine()
    def plot_clustermap(self, sample_subset=None, feature_subset=None,
                        data_type='expression', metric='euclidean',
                        method='average', figsize=None,
                        scale_fig_by_data=True, **kwargs):
        """Visualize hierarchical relationships within samples and
        features"""
        if feature_subset is None:
            feature_subset = self.default_feature_subset

        sample_ids = self.sample_subset_to_sample_ids(sample_subset)
        feature_ids = self.feature_subset_to_feature_ids(data_type,
                                                         feature_subset,
                                                         rename=False)

        if data_type == "expression":
            return self.expression.plot_clustermap(
                sample_ids=sample_ids, feature_ids=feature_ids,
                method=method, metric=metric,
                sample_id_to_color=self.sample_id_to_color,
                figsize=figsize, scale_fig_by_data=scale_fig_by_data,
                **kwargs)
        elif data_type == "splicing":
            return self.splicing.plot_clustermap(
                sample_ids=sample_ids, feature_ids=feature_ids,
                method=method, metric=metric,
                sample_id_to_color=self.sample_id_to_color,
                figsize=figsize, scale_fig_by_data=scale_fig_by_data,
                **kwargs)
    def plot_correlations(self, sample_subset=None, feature_subset=None,
                          data_type='expression', metric='euclidean',
                          method='average', figsize=None, featurewise=False,
                          scale_fig_by_data=True, **kwargs):
        """Visualize clustered correlations of samples across features"""
        if feature_subset is None:
            feature_subset = self.default_feature_subset

        sample_ids = self.sample_subset_to_sample_ids(sample_subset)
        feature_ids = self.feature_subset_to_feature_ids(data_type,
                                                         feature_subset,
                                                         rename=False)
        if figsize is not None and scale_fig_by_data:
            raise ValueError('If "scale_fig_by_data" is true, then cannot '
                             'also specify "figsize"')

        if data_type == "expression":
            return self.expression.plot_correlations(
                sample_ids=sample_ids, feature_ids=feature_ids,
                sample_id_to_color=self.sample_id_to_color,
                figsize=figsize, scale_fig_by_data=scale_fig_by_data,
                metric=metric, method=method, featurewise=featurewise,
                **kwargs)
        elif data_type == "splicing":
            return self.splicing.plot_correlations(
                sample_ids=sample_ids, feature_ids=feature_ids,
                method=method, metric=metric,
                sample_id_to_color=self.sample_id_to_color,
                figsize=figsize, scale_fig_by_data=scale_fig_by_data,
                featurewise=featurewise, **kwargs)
    def plot_lavalamps(self, sample_subset=None, feature_subset=None,
                       expression_thresh=-np.inf):
        if expression_thresh > -np.inf:
            data = self.filter_splicing_on_expression(
                expression_thresh=expression_thresh,
                sample_subset=sample_subset)
            sample_ids = None
            feature_ids = None
        else:
            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            feature_ids = self.feature_subset_to_feature_ids(
                'splicing', feature_subset, rename=False)
            data = None

        self.splicing.plot_lavalamp(
            sample_ids=sample_ids, feature_ids=feature_ids, data=data,
            groupby=self.sample_id_to_phenotype,
            phenotype_to_color=self.phenotype_to_color,
            order=self.phenotype_order)
    def plot_big_nmf_space_transitions(self, data_type='expression', n=5):
        if data_type == 'expression':
            self.expression.plot_big_nmf_space_transitions(
                self.sample_id_to_phenotype, self.phenotype_transitions,
                self.phenotype_order, self.phenotype_color_ordered,
                self.phenotype_to_color, self.phenotype_to_marker, n=n)
        if data_type == 'splicing':
            self.splicing.plot_big_nmf_space_transitions(
                self.sample_id_to_phenotype, self.phenotype_transitions,
                self.phenotype_order, self.phenotype_color_ordered,
                self.phenotype_to_color, self.phenotype_to_marker, n=n)
    def plot_two_samples(self, sample1, sample2, data_type='expression',
                         **kwargs):
        """Plot a scatterplot of two samples' data

        Parameters
        ----------
        sample1 : str
            Name of the sample to plot on the x-axis
        sample2 : str
            Name of the sample to plot on the y-axis
        data_type : "expression" | "splicing"
            Type of data to plot. (default "expression")
        kwargs : dict
            Any other keyword arguments valid for seaborn.jointplot

        Returns
        -------
        jointgrid : seaborn.axisgrid.JointGrid
            Returns a JointGrid instance

        See Also
        --------
        seaborn.jointplot
        """
        if data_type == 'expression':
            return self.expression.plot_two_samples(sample1, sample2,
                                                    **kwargs)
        elif data_type == 'splicing':
            return self.splicing.plot_two_samples(sample1, sample2,
                                                  **kwargs)
    def plot_two_features(self, feature1, feature2, data_type='expression',
                          **kwargs):
        """Make a scatterplot of two features' data

        Parameters
        ----------
        feature1 : str
            Name of the feature to plot on the x-axis. If you have a
            feature_data dataframe for this data type, will attempt to
            map the common name, e.g. "RBFOX2", back to the crazy name,
            e.g. "ENSG00000100320"
        feature2 : str
            Name of the feature to plot on the y-axis. If you have a
            feature_data dataframe for this data type, will attempt to
            map the common name, e.g. "RBFOX2", back to the crazy name,
            e.g. "ENSG00000100320"
        """
        if data_type == 'expression':
            self.expression.plot_two_features(
                feature1, feature2, groupby=self.sample_id_to_phenotype,
                label_to_color=self.phenotype_to_color, **kwargs)
        if data_type == 'splicing':
            self.splicing.plot_two_features(
                feature1, feature2, groupby=self.sample_id_to_phenotype,
                label_to_color=self.phenotype_to_color, **kwargs)
    def nmf_space_positions(self, data_type='splicing'):
        if data_type == 'splicing':
            return self.splicing.nmf_space_positions(
                self.sample_id_to_phenotype)
    def nmf_space_transitions(self, phenotype_transitions='all',
                              data_type='splicing', n=0.5):
        """The change in NMF space of splicing events across phenotypes

        Parameters
        ----------
        phenotype_transitions : list of length-2 tuples of str
            List of ('phenotype1', 'phenotype2') transitions whose change
            in distribution you are interested in
        data_type : 'splicing' | 'expression'
            Which data type to calculate this on. (default 'splicing')
        n : int or float
            Minimum number of samples per phenotype, per event
            (default 0.5)

        Returns
        -------
        nmf_space_transitions : pandas.DataFrame
            A (n_events, n_transitions) dataframe of the NMF distances of
            splicing events between phenotypes
        """
        if phenotype_transitions == 'all':
            phenotype_transitions = self.phenotype_transitions
        if data_type == 'splicing':
            return self.splicing.nmf_space_transitions(
                self.sample_id_to_phenotype, phenotype_transitions, n=n)
    def big_nmf_space_transitions(self, phenotype_transitions='all',
                                  data_type='splicing', n=0.5):
        """Splicing events whose change in NMF space is large

        By "large", we mean that the difference is more than 2 standard
        deviations away from the mean difference.

        Parameters
        ----------
        phenotype_transitions : list of length-2 tuples of str
            List of ('phenotype1', 'phenotype2') transitions whose change
            in distribution you are interested in
        data_type : 'splicing' | 'expression'
            Which data type to calculate this on. (default 'splicing')
        n : int or float
            Minimum number of samples per phenotype, per event
            (default 0.5)

        Returns
        -------
        big_transitions : pandas.DataFrame
            A (n_events, n_transitions) dataframe of the NMF distances of
            splicing events between phenotypes
        """
        if phenotype_transitions == 'all':
            phenotype_transitions = self.phenotype_transitions
        if data_type == 'splicing':
            return self.splicing.big_nmf_space_transitions(
                self.sample_id_to_phenotype, phenotype_transitions, n=n)
    def save(self, study_name, flotilla_dir=FLOTILLA_DOWNLOAD_DIR):
        metadata = self.metadata.data_original
        metadata_kws = {'pooled_col': self.metadata.pooled_col,
                        'phenotype_col': self.metadata.phenotype_col,
                        'phenotype_order': self.metadata.phenotype_order,
                        'phenotype_to_color':
                            self.metadata.phenotype_to_color,
                        'phenotype_to_marker':
                            self.metadata.phenotype_to_marker,
                        'minimum_samples':
                            self.metadata.minimum_sample_subset,
                        'outlier_col': self.metadata.outlier_col}

        try:
            expression = self.expression.data_original
            expression_kws = {'log_base': self.expression.log_base,
                              'thresh': self.expression.thresh_original,
                              'plus_one': self.expression.plus_one}
        except AttributeError:
            expression = None
            expression_kws = None

        try:
            expression_feature_data = self.expression.feature_data
            expression_feature_kws = {
                'rename_col': self.expression.feature_rename_col,
                'ignore_subset_cols':
                    self.expression.feature_ignore_subset_cols}
        except AttributeError:
            expression_feature_data = None
            expression_feature_kws = None

        try:
            splicing = self.splicing.data_original
            splicing_kws = {}
        except AttributeError:
            splicing = None
            splicing_kws = None

        try:
            splicing_feature_data = self.splicing.feature_data
            splicing_feature_kws = {
                'rename_col': self.splicing.feature_rename_col,
                'ignore_subset_cols':
                    self.splicing.feature_ignore_subset_cols,
                'expression_id_col':
                    self.splicing.feature_expression_id_col}
        except AttributeError:
            splicing_feature_data = None
            splicing_feature_kws = None

        try:
            spikein = self.spikein.data_original
        except AttributeError:
            spikein = None

        try:
            gene_ontology = self.gene_ontology.data
        except AttributeError:
            gene_ontology = None

        try:
            mapping_stats = self.mapping_stats.data_original
            mapping_stats_kws = {
                'number_mapped_col': self.mapping_stats.number_mapped_col,
                'min_reads': self.mapping_stats.min_reads}
        except AttributeError:
            mapping_stats = None
            mapping_stats_kws = None

        supplemental_attributes = inspect.getmembers(
            self.supplemental, lambda a: not inspect.isroutine(a))
        supplemental_attributes = [
            a for a in supplemental_attributes
            if not (a[0].startswith('__') and a[0].endswith('__'))]
        supplemental_kws = {}
        for supplemental_name, df in supplemental_attributes:
            supplemental_kws[supplemental_name] = df

        # Increase the version number
        version = semantic_version.Version(self.version)
        version.patch = version.patch + 1
        version = str(version)

        return make_study_datapackage(
            study_name, metadata, expression, splicing, spikein,
            mapping_stats, metadata_kws=metadata_kws,
            expression_kws=expression_kws, splicing_kws=splicing_kws,
            mapping_stats_kws=mapping_stats_kws,
            expression_feature_kws=expression_feature_kws,
            expression_feature_data=expression_feature_data,
            splicing_feature_data=splicing_feature_data,
            splicing_feature_kws=splicing_feature_kws,
            species=self.species, license=self.license, title=self.title,
            sources=self.sources, version=version,
            flotilla_dir=flotilla_dir, gene_ontology=gene_ontology,
            supplemental_kws=supplemental_kws)
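    # The patch-bump that save() performs, sketched with the
    # semantic_version library (mirrors the code above):
    #
    #     >>> import semantic_version
    #     >>> version = semantic_version.Version('0.1.0')
    #     >>> version.patch = version.patch + 1
    #     >>> str(version)
    #     '0.1.1'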
    @staticmethod
    def _maybe_get_axis_name(df, axis=0, alt_name=None):
        if alt_name is None:
            alt_name = 'columns' if axis == 1 else 'index'
        axis = df.columns if axis == 1 else df.index
        if isinstance(axis, pd.MultiIndex):
            name = axis.names
        else:
            name = axis.name
        name = alt_name if name is None else name
        return name

    @cached_property()
    def tidy_splicing_with_expression(self):
        """A tall 'tidy' dataframe of samples with expression and splicing

        Returns
        -------
        splicing_tidy_with_expression : pandas.DataFrame
            One row per (sample, splicing event) pair, with the event's
            psi score and the expression of the event's gene.
        """
        # Establish common strings
        splicing_common_id = self.splicing.feature_data[
            self.splicing.feature_expression_id_col]

        # Tidify splicing
        splicing = self.splicing.data
        splicing_index_name = self._maybe_get_axis_name(splicing, axis=0)
        splicing_columns_name = self._maybe_get_axis_name(splicing, axis=1)

        splicing_tidy = pd.melt(splicing.reset_index(),
                                id_vars=splicing_index_name,
                                value_name='psi',
                                var_name=splicing_columns_name)

        s = splicing_common_id.dropna()
        event_name_to_ensembl_ids = list(itertools.chain(
            *[zip([k] * len(v.split(',')), v.split(','))
              for k, v in s.iteritems()]))
        index, data = zip(*event_name_to_ensembl_ids)
        event_name_to_ensembl_ids = pd.Series(data, index=index,
                                              name=self._common_id)

        rename_columns = {}
        if splicing_index_name == 'index':
            rename_columns[splicing_index_name] = self._sample_id
        if splicing_columns_name == 'columns':
            rename_columns[splicing_columns_name] = self._event_name
            splicing_columns_name = self._event_name
        splicing_tidy = splicing_tidy.rename(columns=rename_columns)

        splicing_tidy = splicing_tidy.set_index(splicing_columns_name)
        splicing_tidy = splicing_tidy.ix[event_name_to_ensembl_ids.index]
        splicing_tidy = splicing_tidy.join(event_name_to_ensembl_ids)

        splicing_tidy = splicing_tidy.dropna().reset_index()
        splicing_tidy = splicing_tidy.rename(
            columns={'index': self._event_name})

        # Tidify expression
        expression = self.expression.data_original
        expression_index_name = self._maybe_get_axis_name(expression,
                                                          axis=0)

        expression_tidy = pd.melt(expression.reset_index(),
                                  id_vars=expression_index_name,
                                  value_name='expression',
                                  var_name=self._common_id)
        # This will only do anything if there is a column named "index" so
        # no need to check anything
        expression_tidy = expression_tidy.rename(
            columns={'index': self._sample_id})
        expression_tidy = expression_tidy.dropna()

        splicing_tidy_with_expression = splicing_tidy.merge(
            expression_tidy, left_on=[self._sample_id, self._common_id],
            right_on=[self._sample_id, self._common_id])
        return splicing_tidy_with_expression
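    # The melt-then-merge idea above, as a minimal self-contained pandas
    # sketch (toy data, one splicing event mapped to one gene):
    #
    #     >>> import pandas as pd
    #     >>> psi = pd.DataFrame([[0.1], [0.9]], index=['s1', 's2'],
    #     ...                    columns=['event1'])
    #     >>> expr = pd.DataFrame([[5.0], [2.0]], index=['s1', 's2'],
    #     ...                     columns=['geneA'])
    #     >>> tidy_psi = pd.melt(psi.reset_index(), id_vars='index',
    #     ...                    value_name='psi', var_name='event_name')
    #     >>> tidy_expr = pd.melt(expr.reset_index(), id_vars='index',
    #     ...                     value_name='expression',
    #     ...                     var_name='common_id')
    #     >>> tidy_psi['common_id'] = 'geneA'  # event1 -> geneA
    #     >>> tidy = tidy_psi.merge(tidy_expr, on=['index', 'common_id'])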
    def filter_splicing_on_expression(self, expression_thresh,
                                      sample_subset=None):
        """Filter splicing events on expression values

        Parameters
        ----------
        expression_thresh : float
            Minimum expression value, of the original input. E.g. if the
            original input is already log-transformed, then this
            threshold is on the log values.

        Returns
        -------
        psi : pandas.DataFrame
            A (n_samples, n_features) dataframe of splicing events whose
            genes pass the expression threshold
        """
        min_expression = self.expression.data_original.min().min()
        if expression_thresh > -np.inf \
                and expression_thresh > min_expression:
            columns = self._maybe_get_axis_name(self.splicing.data, axis=1,
                                                alt_name=self._event_name)
            index = self._maybe_get_axis_name(self.splicing.data, axis=0,
                                              alt_name=self._sample_id)

            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            splicing_with_expression = \
                self.tidy_splicing_with_expression.ix[
                    self.tidy_splicing_with_expression.sample_id.isin(
                        sample_ids)]
            ind = splicing_with_expression.expression >= expression_thresh
            splicing_high_expression = splicing_with_expression.ix[ind]
            splicing_high_expression = \
                splicing_high_expression.reset_index().dropna()

            if isinstance(columns, list) or isinstance(index, list):
                filtered_psi = splicing_high_expression.pivot_table(
                    columns=columns, index=index, values='psi')
            else:
                filtered_psi = splicing_high_expression.pivot(
                    columns=columns, index=index, values='psi')
            return filtered_psi
        else:
            return self.splicing.data
    def go_enrichment(self, feature_ids, background=None, domain=None,
                      p_value_cutoff=1000000, min_feature_size=3,
                      min_background_size=5):
        """Calculate gene ontology enrichment of provided features

        Parameters
        ----------
        feature_ids : list-like
            Features to calculate gene ontology enrichment on
        background : list-like, optional
            Features to use as the background
        domain : str or list, optional
            Only calculate GO enrichment for a particular GO category or
            subset of categories. Valid domains: 'biological_process',
            'molecular_function', 'cellular_component'
        p_value_cutoff : float, optional
            Maximum accepted Bonferroni-corrected p-value
        min_feature_size : int, optional
            Minimum number of features of interest overlapping in a GO
            Term, to calculate enrichment
        min_background_size : int, optional
            Minimum number of features in the background overlapping a
            GO Term

        Returns
        -------
        enrichment : pandas.DataFrame
            A (go_categories, columns) dataframe showing the GO
            enrichment categories that were enriched in the features
        """
        if background is None:
            warnings.warn('No background provided, defaulting to all '
                          'expressed genes')
            background = self.expression.data.columns
        return self.gene_ontology.enrichment(
            feature_ids, background=background,
            cross_reference=self.expression.feature_renamer_series,
            domain=domain, p_value_cutoff=p_value_cutoff,
            min_feature_size=min_feature_size,
            min_background_size=min_background_size)
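    # Hypothetical usage sketch; the gene list is illustrative and the
    # default background of all expressed genes is used:
    #
    #     >>> genes_of_interest = ['ENSG00000100320']  # e.g. RBFOX2
    #     >>> enrichment = study.go_enrichment(
    #     ...     genes_of_interest, domain='biological_process')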
# Add interactive visualizations
Study.interactive_classifier = Interactive.interactive_classifier
Study.interactive_graph = Interactive.interactive_graph
Study.interactive_pca = Interactive.interactive_pca
# Study.interactive_localZ = Interactive.interactive_localZ
Study.interactive_lavalamp_pooled_inconsistent = \
    Interactive.interactive_lavalamp_pooled_inconsistent
Study.interactive_choose_outliers = Interactive.interactive_choose_outliers
Study.interactive_reset_outliers = Interactive.interactive_reset_outliers
Study.interactive_clustermap = Interactive.interactive_clustermap
Study.interactive_correlations = Interactive.interactive_correlations