"""
Data models for "studies" studies include attributes about the data and are
heavier in terms of data load
"""
import inspect
import itertools
import json
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import semantic_version
import seaborn as sns
from .metadata import MetaData, PHENOTYPE_COL, POOLED_COL, OUTLIER_COL
from .expression import ExpressionData, SpikeInData
from .gene_ontology import GeneOntologyData
from .quality_control import MappingStatsData, MIN_READS
from .splicing import SplicingData, FRACTION_DIFF_THRESH
from .supplemental import SupplementalData
from ..compute.predict import PredictorConfigManager
from ..datapackage import datapackage_url_to_dict, \
check_if_already_downloaded, make_study_datapackage
from ..visualize.color import blue
from ..visualize.ipython_interact import Interactive
from ..datapackage import FLOTILLA_DOWNLOAD_DIR
from ..util import load_csv, load_json, load_tsv, load_gzip_pickle_df, \
load_pickle_df, timestamp, cached_property
SPECIES_DATA_PACKAGE_BASE_URL = 'https://s3-us-west-2.amazonaws.com/' \
'flotilla-projects'
DATAPACKAGE_RESOURCE_COMMON_KWS = ('url', 'path', 'format', 'compression',
'name')
def _is_absolute_path(location):
return location.startswith('http') or location.startswith('/')
class Study(object):
    """A biological study, with associated metadata, expression, and splicing
    data.
    """

    # Feature subsets registered by default for every study
    default_feature_set_ids = []

    # Data types with enough data that we'd probably reduce them, and even
    # then we might want to take subsets. E.g. most variant genes for
    # expression. But we don't expect to do this for spikein or mapping_stats
    # data
    _subsetable_data_types = ['expression', 'splicing']

    # Constructor keyword name -> data-model class used to load it
    initializers = {'metadata_data': MetaData,
                    'expression_data': ExpressionData,
                    'splicing_data': SplicingData,
                    'mapping_stats_data': MappingStatsData,
                    'spikein_data': SpikeInData}

    # Datapackage "format" string -> file-reading function
    readers = {'tsv': load_tsv,
               'csv': load_csv,
               'json': load_json,
               'pickle_df': load_pickle_df,
               'gzip_pickle_df': load_gzip_pickle_df}

    # Default keyword arguments passed to dimensionality reducers
    _default_reducer_kwargs = {'whiten': False,
                               'show_point_labels': False,
                               'show_vectors': False}

    # Conventional column names used throughout the study
    _common_id = 'common_id'
    _sample_id = 'sample_id'
    _event_name = 'event_name'

    # Default matplotlib keyword arguments for scatter-style plots
    _default_plot_kwargs = {'marker': 'o', 'color': blue}
def __init__(self, sample_metadata, version='0.1.0',
metadata_pooled_col=POOLED_COL,
metadata_minimum_samples=0,
metadata_phenotype_col=PHENOTYPE_COL,
metadata_phenotype_order=None,
metadata_phenotype_to_color=None,
metadata_phenotype_to_marker=None,
metadata_outlier_col=OUTLIER_COL,
metadata_ignore_subset_cols=None,
metadata_batch_cols='batch',
metadata_batch_min_samples=3,
expression_data=None,
expression_feature_data=None,
expression_feature_rename_col=None,
expression_feature_ignore_subset_cols=None,
expression_log_base=None,
expression_thresh=-np.inf,
expression_plus_one=False,
expression_correct_batch_effects=False,
splicing_data=None,
splicing_feature_data=None,
splicing_feature_rename_col=None,
splicing_feature_ignore_subset_cols=None,
splicing_feature_expression_id_col=None,
mapping_stats_data=None,
mapping_stats_number_mapped_col=None,
mapping_stats_min_reads=MIN_READS,
spikein_data=None,
spikein_feature_data=None,
drop_outliers=True, species=None,
gene_ontology_data=None,
predictor_config_manager=None,
license=None, title=None, sources=None,
default_sample_subset="all_samples",
default_feature_subset="variant",
supplemental_data=None):
"""Construct a biological study
This class only accepts data, no filenames. All data must already
have been read in and exist as Python objects.
Parameters
----------
sample_metadata : pandas.DataFrame
The only required parameter. Samples as the index, with features as
columns. Required column: "phenotype". If there is a boolean
column "pooled", this will be used to separate pooled from single
cells. Similarly, the column "outliers" will also be used to
separate outlier cells from the rest.
version : str
A string describing the semantic version of the data. Must be in:
major.minor.patch format, as the "patch" number will be increased
if you change something in the study and then study.save() it.
(default "0.1.0")
expression_data : pandas.DataFrame
Samples x feature dataframe of gene expression measurements,
e.g. from an RNA-Seq or a microarray experiment. Assumed to be
log-transformed, i.e. you took the log of it. (default None)
expression_feature_data : pandas.DatFrame
Features x annotations dataframe describing other parameters
of the gene expression features, e.g. mapping Ensembl IDs to gene
symbols or gene biotypes. (default None)
expression_feature_rename_col : str
A column name in the expression_feature_data dataframe that you'd
like to rename the expression features to, in the plots. For
example, if your gene IDs are Ensembl IDs, but you want to plot
UCSC IDs, make sure the column you want, e.g. "ucsc_id" is in your
dataframe and specify that. (default "gene_name")
expression_log_base : float
If you want to log-transform your expression data (and it's not
already log-transformed), use this number as the base of the
transform. E.g. expression_log_base=10 will take the log10 of
your data. (default None)
thresh : float
Minimum (non log-transformed) expression value. (default -inf)
expression_plus_one : bool
Whether or not to add 1 to the expression data. (default False)
splicing_data : pandas.DataFrame
Samples x feature dataframe of percent spliced in scores, e.g. as
measured by the program MISO. Assumed that these values only fall
between 0 and 1.
splicing_feature_data : pandas.DataFrame
features x other_features dataframe describing other parameters
of the splicing features, e.g. mapping MISO IDs to Ensembl IDs or
gene symbols or transcript types
splicing_feature_rename_col : str
A column name in the splicing_feature_data dataframe that you'd
like to rename the splicing features to, in the plots. For
example, if your splicing IDs are MISO IDs, but you want to plot
Ensembl IDs, make sure the column you want, e.g. "ensembl_id" is
in your dataframe and specify that. Default "gene_name".
splicing_feature_expression_id_col : str
A column name in the splicing_feature_data dataframe that
corresponds to the row names of the expression data
mapping_stats_data : pandas.DataFrame
Samples x feature dataframe of mapping stats measurements.
Currently, this
mapping_stats_number_mapped_col : str
A column name in the mapping_stats_data which specifies the
number of (uniquely or not) mapped reads. Default "Uniquely
mapped reads number"
spikein_data : pandas.DataFrame
samples x features DataFrame of spike-in expression values
spikein_feature_data : pandas.DataFrame
Features x other_features dataframe, e.g. of the molecular
concentration of particular spikein transcripts
drop_outliers : bool
Whether or not to drop samples indicated as outliers in the
sample_metadata from the other data, i.e. with a column
named 'outlier' in sample_metadata, then remove those
samples from expression_data for further analysis
species : str
Name of the species and genome version, e.g. 'hg19' or 'mm10'.
gene_ontology_data : pandas.DataFrame
Gene ids x ontology categories dataframe used for GO analysis.
metadata_pooled_col : str
Column in metadata_data which specifies as a boolean
whether or not this sample was pooled.
supplemental_data : dict
str: dataframe mapping of the attribute name, and the pandas
dataframe
Note
----
This function explicitly specifies ALL the instance variables (except
those that are marked by the @property decorator), because,
as described [1], "If you write initialization functions separate from
__init__ then experienced developers will certainly see your code as a
kid's playground."
[1] http://stackoverflow.com/q/12513185/1628971
"""
sys.stdout.write("{}\tInitializing Study\n".format(timestamp()))
sys.stdout.write("{}\tInitializing Predictor configuration manager "
"for Study\n".format(timestamp()))
self.predictor_config_manager = predictor_config_manager \
if predictor_config_manager is not None \
else PredictorConfigManager()
# self.predictor_config_manager = None
self.species = species
if gene_ontology_data is not None:
self.gene_ontology = GeneOntologyData(gene_ontology_data)
self.license = license
self.title = title
self.sources = sources
self.version = version
sys.stdout.write('{}\tLoading metadata\n'.format(timestamp()))
self.metadata = MetaData(
sample_metadata, metadata_phenotype_order,
metadata_phenotype_to_color,
metadata_phenotype_to_marker, pooled_col=metadata_pooled_col,
ignore_subset_cols=metadata_ignore_subset_cols,
outlier_col=metadata_outlier_col,
phenotype_col=metadata_phenotype_col,
predictor_config_manager=self.predictor_config_manager,
minimum_sample_subset=metadata_minimum_samples)
self.default_feature_subset = default_feature_subset
self.default_sample_subset = default_sample_subset
if self.metadata.outlier_col in self.metadata.data and drop_outliers:
outliers = self.metadata.data.index[
self.metadata.data[self.metadata.outlier_col].astype(bool)]
else:
outliers = None
self.metadata.data[self.metadata.outlier_col] = False
# Get pooled samples
pooled = None
if self.metadata.pooled_col is not None:
if self.metadata.pooled_col in self.metadata.data:
try:
pooled = self.metadata.data.index[
self.metadata.data[
self.metadata.pooled_col].astype(bool)]
except KeyError:
pooled = None
self.pooled = pooled
if mapping_stats_data is not None:
self.mapping_stats = MappingStatsData(
mapping_stats_data,
number_mapped_col=mapping_stats_number_mapped_col,
predictor_config_manager=self.predictor_config_manager,
min_reads=mapping_stats_min_reads)
self.technical_outliers = self.mapping_stats.too_few_mapped
if len(self.technical_outliers) > 0:
outliers_ids = ', '.join(self.technical_outliers)
sys.stderr.write(
'Samples had too few mapped reads (<{:.1e} reads)'
':\n\t{}\n'.format(mapping_stats_min_reads, outliers_ids))
else:
self.technical_outliers = None
self.mapping_stats = None
feature_data_none = expression_feature_data is None or \
splicing_feature_data is None
if self.species is not None and feature_data_none:
sys.stdout.write('{}\tLoading species metadata from '
'~/flotilla_packages\n'.format(timestamp()))
species_kws = self.load_species_data(self.species, self.readers)
expression_feature_data = species_kws.pop(
'expression_feature_data',
None)
expression_feature_rename_col = species_kws.pop(
'expression_feature_rename_col', None)
splicing_feature_data = species_kws.pop('splicing_feature_data',
None)
splicing_feature_rename_col = species_kws.pop(
'splicing_feature_rename_col', None)
if expression_feature_data is None:
expression_feature_data = species_kws.pop(
'expression_feature_data',
None)
if expression_feature_rename_col is None:
expression_feature_rename_col = species_kws.pop(
'expression_feature_rename_col', None)
if splicing_feature_data is None:
splicing_feature_data = species_kws.pop(
'splicing_feature_data',
None)
if splicing_feature_rename_col is None:
splicing_feature_rename_col = species_kws.pop(
'splicing_feature_rename_col', None)
if expression_data is not None:
sys.stdout.write(
"{}\tLoading expression data\n".format(timestamp()))
feature_ignore_subset_cols = expression_feature_ignore_subset_cols
self.expression = ExpressionData(
expression_data,
feature_data=expression_feature_data,
thresh=expression_thresh,
feature_rename_col=expression_feature_rename_col,
outliers=outliers, plus_one=expression_plus_one,
log_base=expression_log_base, pooled=pooled,
predictor_config_manager=self.predictor_config_manager,
technical_outliers=self.technical_outliers,
minimum_samples=metadata_minimum_samples,
feature_ignore_subset_cols=feature_ignore_subset_cols)
self.default_feature_set_ids.extend(self.expression.feature_subsets
.keys())
else:
self.expression = None
if splicing_data is not None:
sys.stdout.write("{}\tLoading splicing data\n".format(
timestamp()))
self.splicing = SplicingData(
splicing_data, feature_data=splicing_feature_data,
feature_rename_col=splicing_feature_rename_col,
outliers=outliers, pooled=pooled,
predictor_config_manager=self.predictor_config_manager,
technical_outliers=self.technical_outliers,
minimum_samples=metadata_minimum_samples,
feature_ignore_subset_cols=splicing_feature_ignore_subset_cols,
feature_expression_id_col=splicing_feature_expression_id_col)
else:
self.splicing = None
if spikein_data is not None:
self.spikein = SpikeInData(
spikein_data, feature_data=spikein_feature_data,
technical_outliers=self.technical_outliers,
predictor_config_manager=self.predictor_config_manager)
else:
self.spikein = None
self.supplemental = SupplementalData(supplemental_data)
sys.stdout.write("{}\tSuccessfully initialized a Study "
"object!\n".format(timestamp()))
def __setattr__(self, key, value):
"""Check if the attribute already exists and warns on overwrite.
"""
if hasattr(self, key):
warnings.warn('Over-writing attribute {}'.format(key))
super(Study, self).__setattr__(key, value)
@property
[docs] def phenotype_col(self):
return self.metadata.phenotype_col
@property
[docs] def phenotype_order(self):
return self.metadata.phenotype_order
@property
[docs] def phenotype_to_color(self):
return self.metadata.phenotype_to_color
@property
[docs] def phenotype_to_marker(self):
return self.metadata.phenotype_to_marker
@property
[docs] def sample_id_to_phenotype(self):
return self.metadata.sample_id_to_phenotype
@property
[docs] def sample_id_to_color(self):
return self.metadata.sample_id_to_color
@property
[docs] def phenotype_transitions(self):
return self.metadata.phenotype_transitions
@property
[docs] def phenotype_color_ordered(self):
return self.metadata.phenotype_color_order
@property
[docs] def default_sample_subsets(self):
# move default_sample_subset to the front of the list, sort the rest
sorted_sample_subsets = list(sorted(list(set(
self.metadata.sample_subsets.keys()).difference(
set(self.default_sample_subset)))))
sorted_sample_subsets.insert(0, self.default_sample_subset)
return sorted_sample_subsets
@property
[docs] def default_feature_subsets(self):
feature_subsets = {}
for name in self._subsetable_data_types:
try:
data_type = getattr(self, name)
feature_subsets[name] = data_type.feature_subsets
except AttributeError:
continue
return feature_subsets
@classmethod
[docs] def from_datapackage_url(
cls, datapackage_url,
load_species_data=True,
species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
"""Create a study from a url of a datapackage.json file
Parameters
----------
datapackage_url : str
HTTP url of a datapackage.json file, following the specification
described here: http://dataprotocols.org/data-packages/ and
requiring the following data resources: metadata,
expression, splicing
species_data_pacakge_base_url : str
Base URL to fetch species-specific _ and splicing event
metadata frnm.
Default 'https://s3-us-west-2.amazonaws.com/flotilla-projects/'
Returns
-------
study : Study
A "study" object containing the data described in the
datapackage_url file
Raises
------
AttributeError
If the datapackage.json file does not contain the required
resources of metadata, expression, and splicing.
"""
datapackage = datapackage_url_to_dict(datapackage_url)
datapackage_dir = '{}/{}'.format(FLOTILLA_DOWNLOAD_DIR,
datapackage['name'])
return cls.from_datapackage(
datapackage, load_species_data=load_species_data,
datapackage_dir=datapackage_dir,
species_datapackage_base_url=species_datapackage_base_url)
@classmethod
[docs] def from_datapackage_file(
cls, datapackage_filename,
load_species_data=True,
species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
with open(datapackage_filename) as f:
sys.stdout.write('{}\tReading datapackage from {}\n'.format(
timestamp(), datapackage_filename))
datapackage = json.load(f)
datapackage_dir = os.path.dirname(datapackage_filename)
return cls.from_datapackage(
datapackage, datapackage_dir=datapackage_dir,
load_species_data=load_species_data,
species_datapackage_base_url=species_datapackage_base_url)
    @staticmethod
    def _filename_from_resource(resource, datapackage_dir,
                                datapackage_name):
        """Resolve a datapackage resource dict to a local filename.

        Remote resources ("url" key, or "path" starting with "http") are
        downloaded via ``check_if_already_downloaded``; local paths are
        resolved relative to ``datapackage_dir``. Returns None when the
        resource has neither a "url" nor a "path" key (e.g. a container
        of nested supplemental resources).
        """
        if 'url' in resource:
            resource_url = resource['url']
            # Relative URLs are resolved against the datapackage directory
            if not _is_absolute_path(resource_url):
                resource_url = '{}/{}'.format(datapackage_dir,
                                              resource_url)
            filename = check_if_already_downloaded(resource_url,
                                                   datapackage_name)
            return filename
        elif 'path' in resource:
            if resource['path'].startswith('http'):
                filename = check_if_already_downloaded(resource['path'],
                                                       datapackage_name)
            else:
                filename = resource['path']
                if not _is_absolute_path(filename):
                    filename = '{}/{}'.format(datapackage_dir,
                                              filename)
                # Test if the file exists, if not, then add the datapackage
                # file
                # NOTE(review): at this point ``filename`` may already be
                # prefixed with ``datapackage_dir``, so this join can yield
                # ``dir/dir/file`` -- confirm this fallback is intentional
                if not os.path.exists(filename):
                    filename = os.path.join(datapackage_dir, filename)
            return filename
        else:
            return None
@classmethod
[docs] def from_datapackage(
cls, datapackage, datapackage_dir='./',
load_species_data=True,
species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
"""Create a study object from a datapackage dictionary
Parameters
----------
datapackage : dict
Returns
-------
study : flotilla.Study
Study object
"""
sys.stdout.write('{}\tParsing datapackage to create a Study '
'object\n'.format(timestamp()))
dfs = {}
kwargs = {}
supplemental_data = {}
datapackage_name = datapackage['name']
for resource in datapackage['resources']:
filename = cls._filename_from_resource(resource, datapackage_dir,
datapackage_name)
if filename is None:
# This is supplemental data
for supplemental in resource[u'resources']:
filename = cls._filename_from_resource(supplemental,
datapackage_dir,
datapackage_name)
name = supplemental['name']
reader = cls.readers[supplemental['format']]
compression = None if 'compression' not in \
supplemental else \
supplemental['compression']
header = supplemental.pop('header', 0)
index_col = supplemental.pop('index_col', 0)
df = reader(filename, compression=compression,
header=header, index_col=index_col)
supplemental_data[name] = df
else:
name = resource['name']
reader = cls.readers[resource['format']]
compression = None if 'compression' not in resource else \
resource['compression']
header = resource.pop('header', 0)
index_col = resource.pop('index_col', 0)
dfs[name] = reader(filename, compression=compression,
header=header, index_col=index_col)
for key in set(resource.keys()).difference(
DATAPACKAGE_RESOURCE_COMMON_KWS):
kwargs['{}_{}'.format(name, key)] = resource[key]
species_kws = {}
species = None if 'species' not in datapackage else datapackage[
'species']
if load_species_data and species is not None:
species_kws = cls.load_species_data(species, cls.readers,
species_datapackage_base_url)
try:
sample_metadata = dfs.pop('metadata')
except KeyError:
raise AttributeError('The datapackage.json file is required to '
'have the "metadata" resource')
dfs = dict(('{}_data'.format(k), v) for k, v in dfs.iteritems())
nones = [k for k, v in kwargs.iteritems() if v is None]
for key in nones:
kwargs.pop(key)
kwargs.update(species_kws)
kwargs.update(dfs)
license = None if 'license' not in datapackage else datapackage[
'license']
title = None if 'title' not in datapackage else datapackage[
'title']
sources = None if 'sources' not in datapackage else datapackage[
'sources']
version = None if 'datapackage_version' not in datapackage else \
datapackage['datapackage_version']
if not semantic_version.validate(version):
raise ValueError(
'{} is not a valid version string. Please use semantic '
'versioning, with major.minor.patch, e.g. 0.1.2 is a valid '
'version string'.format(version))
study = Study(sample_metadata=sample_metadata, species=species,
license=license, title=title, sources=sources,
version=version, supplemental_data=supplemental_data,
**kwargs)
return study
@staticmethod
[docs] def load_species_data(
species, readers,
species_datapackage_base_url=SPECIES_DATA_PACKAGE_BASE_URL):
dfs = {}
try:
species_data_url = '{}/{}/datapackage.json'.format(
species_datapackage_base_url, species)
species_datapackage = datapackage_url_to_dict(
species_data_url)
for resource in species_datapackage['resources']:
if 'url' in resource:
resource_url = resource['url']
filename = check_if_already_downloaded(resource_url,
species)
else:
filename = resource['path']
reader = readers[resource['format']]
compression = None if 'compression' not in resource else \
resource['compression']
name = resource['name']
dfs[name] = reader(filename, index_col=0,
compression=compression)
other_keys = set(resource.keys()).difference(
DATAPACKAGE_RESOURCE_COMMON_KWS)
name_no_data = name.rstrip('_data')
for key in other_keys:
new_key = '{}_{}'.format(name_no_data, key)
dfs[new_key] = resource[key]
except (IOError, ValueError) as e:
sys.stderr.write('Error loading species {} data:'
' {}'.format(species, e))
return dfs
[docs] def detect_outliers(self, data_type='expression',
sample_subset=None, feature_subset=None,
featurewise=False,
reducer=None,
standardize=None,
reducer_kwargs=None,
bins=None,
outlier_detection_method=None,
outlier_detection_method_kwargs=None):
if sample_subset is None:
sample_subset = self.default_sample_subset
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
if feature_subset is None:
feature_subset = self.default_feature_subset
feature_ids = self.feature_subset_to_feature_ids(data_type,
feature_subset,
rename=False)
if data_type == "expression":
datamodel = self.expression
elif data_type == "splicing":
datamodel = self.splicing
else:
raise TypeError('{} not a supported data type'.format(data_type))
reducer, outlier_detector = datamodel.detect_outliers(
sample_ids=sample_ids, feature_ids=feature_ids,
featurewise=featurewise, reducer=reducer, standardize=standardize,
reducer_kwargs=reducer_kwargs, bins=bins,
outlier_detection_method=outlier_detection_method,
outlier_detection_method_kwargs=outlier_detection_method_kwargs)
outlier_detector.predict(reducer.reduced_space)
outlier_detector.title = "_".join(
['outlier', data_type, sample_subset, feature_subset])
print "setting outlier type:\"{}\" in metadata".format(
outlier_detector.title)
if outlier_detector.title not in self.metadata.data:
self.metadata.data[outlier_detector.title] = False
self.metadata.data[outlier_detector.title].update(
outlier_detector.outliers)
return reducer, outlier_detector
[docs] def drop_outliers(self):
"""Assign samples marked as "outlier" in metadata, to other datas"""
outliers = self.metadata.data['outlier'][
self.metadata.data['outlier']].index
self.expression.outlier_samples = outliers
self.splicing.outlier_samples = outliers
@staticmethod
[docs] def maybe_make_directory(filename):
# Make the directory if it's not already there
try:
directory = os.path.abspath(os.path.dirname(filename))
os.makedirs(os.path.abspath(directory))
except OSError:
pass
[docs] def feature_subset_to_feature_ids(self, data_type, feature_subset=None,
rename=False):
"""Given a name of a feature subset, get the associated feature ids
Parameters
----------
data_type : str
A string describing the data type, e.g. "expression"
feature_subset : str
A string describing the subset of data type (must be already
calculated)
Returns
-------
feature_ids : list of strings
List of features ids from the specified datatype
"""
if 'expression'.startswith(data_type):
return self.expression.feature_subset_to_feature_ids(
feature_subset, rename)
elif 'splicing'.startswith(data_type):
return self.splicing.feature_subset_to_feature_ids(
feature_subset, rename)
    def sample_subset_to_sample_ids(self, phenotype_subset=None):
        """Convert a string naming a subset of phenotypes in the data into
        sample ids

        Parameters
        ----------
        phenotype_subset : str
            A valid string describing a boolean phenotype described in the
            metadata data

        Returns
        -------
        sample_ids : list of strings
            List of sample ids in the data
        """
        # IF this is a list of IDs
        try:
            # A named sample subset from the metadata, e.g. "all_samples"
            return self.metadata.sample_subsets[phenotype_subset]
        except (KeyError, TypeError):
            pass
        try:
            # A single phenotype name: return the samples with it
            ind = self.metadata.sample_id_to_phenotype == phenotype_subset
            if ind.sum() > 0:
                return self.metadata.sample_id_to_phenotype.index[ind]
            # None, or any prefix of "all_samples", selects every sample
            if phenotype_subset is None or 'all_samples'.startswith(
                    phenotype_subset):
                sample_ind = np.ones(self.metadata.data.shape[0],
                                     dtype=bool)
            elif phenotype_subset.startswith("~"):
                # A leading "~" negates a boolean metadata column
                sample_ind = ~pd.Series(
                    self.metadata.data[phenotype_subset.lstrip("~")],
                    dtype='bool')
            else:
                # Otherwise treat the name as a boolean metadata column
                sample_ind = pd.Series(
                    self.metadata.data[phenotype_subset], dtype='bool')
            sample_ids = self.metadata.data.index[sample_ind]
            return sample_ids
        except (AttributeError, ValueError):
            # Fall back to returning the input unchanged -- presumably it
            # is already a list of sample ids
            return phenotype_subset
[docs] def plot_pca(self, data_type='expression', x_pc=1, y_pc=2,
sample_subset=None, feature_subset=None,
title='', featurewise=False, plot_violins=False,
show_point_labels=False, reduce_kwargs=None,
color_samples_by=None, bokeh=False,
most_variant_features=False, std_multiplier=2,
scale_by_variance=True,
**kwargs):
"""Performs DataFramePCA on both expression and splicing study_data
Parameters
----------
data_type : str
One of the names of the data types, e.g. "expression" or
"splicing" (default "expression")
x_pc : int, optional
Which principal component to plot on the x-axis (default 1)
y_pc : int, optional
Which principal component to plot on the y-axis (default 2)
sample_subset : str or None
Which subset of the samples to use, based on some phenotype
column in the experiment design data. If None, all samples are
used. (default None)
feature_subset : str or None
Which subset of the features to used, based on some feature type
in the expression data (e.g. "variant"). If None, all features
are used. (default None)
title : str, optional
Title of the reduced space plot (default '')
featurewise : bool, optional
If True, the features are reduced on the samples, and the plotted
points are features, not samples. (default False)
plot_violins : bool
Whether or not to make the violinplots of the top features. This
can take a long time, so to save time you can turn it off if you
just want a quick look at the PCA. (default False)
show_point_labels : bool, optional
Whether or not to show the labels of the points. If this is
samplewise (default), then this labels the samples. If this is
featurewise, then this labels the features. (default False)
reduce_kwargs : dict, optional
Keyword arguments to the reducer (default None)
color_samples_by : str, optional
Instead of coloring the samples by their phenotype, color them by
this column in the metadata. (default None)
bokeh : bool, optional
If True, plot a javascripty/interactive bokeh plot instead of a
static printable figure (default False)
most_variant_features : bool, optional
If True, then only take the most variant of the provided features.
The most variant are determined by taking the features whose
variance is ``std_multiplier``standard deviations away from the
mean feature variance (default False)
std_multiplier : float, optional
If ``most_variant_features`` is True, then use this as a cutoff
for the minimum variance of a feature to be included (default 2)
scale_by_variance : bool, optional
If True, then scale the x- and y-axes by the explained variance
ratio of the principal component dimensions. Only valid for PCA
and its variations, not for NMF or tSNE. (default True)
kwargs : other keyword arguments
All other keyword arguments are passed to
:py:meth:`DecomopsitionViz.plot`
"""
sample_subset = self.default_sample_subset \
if sample_subset is None else sample_subset
feature_subset = self.default_feature_subset \
if feature_subset is None else feature_subset
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(data_type,
feature_subset,
rename=False)
label_to_color = None
label_to_marker = None
groupby = None
order = None
color_samples_by_phenotype = color_samples_by \
== self.metadata.phenotype_col
if not featurewise:
if color_samples_by is None or color_samples_by_phenotype:
label_to_color = self.phenotype_to_color
label_to_marker = self.phenotype_to_marker
groupby = self.sample_id_to_phenotype
order = self.phenotype_order
else:
groupby = self.metadata.data[color_samples_by]
if "expression".startswith(data_type):
reducer = self.expression.plot_pca(
x_pc=x_pc, y_pc=y_pc, sample_ids=sample_ids,
feature_ids=feature_ids,
label_to_color=label_to_color,
label_to_marker=label_to_marker, groupby=groupby,
order=order, std_multiplier=std_multiplier,
featurewise=featurewise, show_point_labels=show_point_labels,
title=title, reduce_kwargs=reduce_kwargs,
plot_violins=plot_violins, metadata=self.metadata.data,
bokeh=bokeh, most_variant_features=most_variant_features,
scale_by_variance=scale_by_variance,
**kwargs)
elif "splicing".startswith(data_type):
reducer = self.splicing.plot_pca(
x_pc=x_pc, y_pc=y_pc, sample_ids=sample_ids,
feature_ids=feature_ids,
label_to_color=label_to_color,
label_to_marker=label_to_marker, groupby=groupby,
order=order, std_multiplier=std_multiplier,
featurewise=featurewise, show_point_labels=show_point_labels,
title=title, reduce_kwargs=reduce_kwargs,
plot_violins=plot_violins, metadata=self.metadata.data,
bokeh=bokeh, most_variant_features=most_variant_features,
scale_by_variance=scale_by_variance,
**kwargs)
else:
raise ValueError('The data type {} does not exist in this study'
.format(data_type))
return reducer
[docs] def plot_graph(self, data_type='expression', sample_subset=None,
feature_subset=None, featurewise=False,
**kwargs):
"""Plot the graph (network) of these data
Parameters
----------
data_type : str
One of the names of the data types, e.g. "expression" or "splicing"
sample_subset : str or None
Which subset of the samples to use, based on some phenotype
column in the experiment design data. If None, all samples are
used.
feature_subset : str or None
Which subset of the features to used, based on some feature type
in the expression data (e.g. "variant"). If None, all features
are used.
"""
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(data_type,
feature_subset,
rename=False)
if not featurewise:
label_to_color = self.phenotype_to_color
label_to_marker = self.phenotype_to_marker
groupby = self.sample_id_to_phenotype
else:
label_to_color = None
label_to_marker = None
groupby = None
if data_type == "expression":
return self.expression.networks.draw_graph(
sample_ids=sample_ids, feature_ids=feature_ids,
sample_id_to_color=self.sample_id_to_color,
label_to_color=label_to_color,
label_to_marker=label_to_marker, groupby=groupby,
featurewise=featurewise,
**kwargs)
elif data_type == "splicing":
return self.splicing.networks.draw_graph(
sample_ids=sample_ids, feature_ids=feature_ids,
sample_id_to_color=self.sample_id_to_color,
label_to_color=label_to_color,
label_to_marker=label_to_marker, groupby=groupby,
featurewise=featurewise,
**kwargs)
[docs] def plot_classifier(self, trait, sample_subset=None,
feature_subset='all_genes',
data_type='expression', title='',
show_point_labels=False,
**kwargs):
"""Plot a predictor for the specified data type and trait(s)
Parameters
----------
data_type : str
One of the names of the data types, e.g. "expression" or "splicing"
trait : str
Column name in the metadata data that you would like
to classify on
Returns
-------
"""
try:
trait_data = self.metadata.data[trait]
except KeyError:
trait_ids = self.sample_subset_to_sample_ids(trait)
trait_data = self.metadata.data.index.isin(trait_ids)
if isinstance(trait_data.dtype, bool):
all_true = np.all(trait_data)
all_false = np.all(~trait_data)
too_few_categories = False
else:
all_false = False
all_true = False
too_few_categories = len(set(trait_data)) <= 1
nothing_to_classify = all_true or all_false or too_few_categories
if nothing_to_classify:
raise ValueError("All samples are True (or all samples are "
"False) or all are the same, cannot classify"
"when all samples are the same")
trait_data = pd.Series(trait_data, name=trait,
index=self.metadata.data.index)
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(data_type,
feature_subset,
rename=False)
feature_subset = 'none' if feature_subset is None else feature_subset
sample_subset = 'none' if sample_subset is None else sample_subset
data_name = '_'.join([sample_subset, feature_subset])
label_to_color = self.phenotype_to_color
label_to_marker = self.phenotype_to_marker
groupby = self.sample_id_to_phenotype
order = self.phenotype_order
color = self.phenotype_color_ordered
if data_type == "expression":
return self.expression.plot_classifier(
data_name=data_name, trait=trait_data,
sample_ids=sample_ids, feature_ids=feature_ids,
label_to_color=label_to_color,
label_to_marker=label_to_marker, groupby=groupby,
show_point_labels=show_point_labels, title=title,
order=order, color=color,
**kwargs)
elif data_type == "splicing":
return self.splicing.plot_classifier(
data_name=data_name, trait=trait_data,
sample_ids=sample_ids, feature_ids=feature_ids,
label_to_color=label_to_color,
label_to_marker=label_to_marker, groupby=groupby,
show_point_labels=show_point_labels, title=title,
order=order, color=color,
**kwargs)
[docs] def modality_assignments(self, sample_subset=None, feature_subset=None,
expression_thresh=-np.inf, min_samples=10):
"""Get modality assignments of splicing data
Parameters
----------
sample_subset : str or None, optional
Which subset of the samples to use, based on some phenotype
column in the experiment design data. If None, all samples are
used.
feature_subset : str or None, optional
Which subset of the features to used, based on some feature type
in the expression data (e.g. "variant"). If None, all features
are used.
expression_thresh : float, optional
Minimum expression value, of the original input. E.g. if the
original input is already log-transformed, then this threshold is
on the log values.
Returns
-------
modalities : pandas.DataFrame
A (n_phenotypes, n_events) shaped DataFrame of the assigned
modality
"""
min_expression = self.expression.data.min().min()
if expression_thresh > -np.inf and expression_thresh > min_expression:
data = self.filter_splicing_on_expression(
expression_thresh=expression_thresh,
sample_subset=sample_subset)
sample_ids = None
feature_ids = None
else:
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(
'splicing', feature_subset, rename=False)
data = None
return self.splicing.modality_assignments(
sample_ids, feature_ids, data=data,
groupby=self.sample_id_to_phenotype, min_samples=min_samples)
[docs] def modality_counts(self, sample_subset=None, feature_subset=None,
expression_thresh=-np.inf, min_samples=10):
"""Get number of splicing events in modality categories
Parameters
----------
sample_subset : str or None, optional
Which subset of the samples to use, based on some phenotype
column in the experiment design data. If None, all samples are
used.
feature_subset : str or None, optional
Which subset of the features to used, based on some feature type
in the expression data (e.g. "variant"). If None, all features
are used.
expression_thresh : float, optional
Minimum expression value, of the original input. E.g. if the
original input is already log-transformed, then this threshold is
on the log values.
Returns
-------
modalities : pandas.DataFrame
A (n_phenotypes, n_modalities) shaped DataFrame of the number of
events assigned to each modality
"""
min_expression = self.expression.data.min().min()
if expression_thresh > -np.inf and expression_thresh > min_expression:
data = self.filter_splicing_on_expression(
expression_thresh=expression_thresh,
sample_subset=sample_subset)
sample_ids = None
feature_ids = None
else:
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(
'splicing', feature_subset, rename=False)
data = None
return self.splicing.modality_assignments(
sample_ids, feature_ids, data=data,
groupby=self.sample_id_to_phenotype, min_samples=min_samples)
[docs] def plot_modalities_bars(self, sample_subset=None, feature_subset=None,
expression_thresh=-np.inf, percentages=True):
"""Make grouped barplots of the number of modalities per phenotype
Parameters
----------
sample_subset : str or None
Which subset of the samples to use, based on some phenotype
column in the experiment design data. If None, all samples are
used.
feature_subset : str or None
Which subset of the features to used, based on some feature type
in the expression data (e.g. "variant"). If None, all features
are used.
expression_thresh : float
If greater than -inf, then filter on splicing events in genes
with expression at least this value
percentages : bool
If True, plot percentages instead of counts
"""
if expression_thresh > -np.inf:
data = self.filter_splicing_on_expression(
expression_thresh=expression_thresh,
sample_subset=sample_subset)
sample_ids = None
feature_ids = None
else:
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(
'splicing', feature_subset, rename=False)
data = None
self.splicing.plot_modalities_bars(sample_ids, feature_ids, data,
self.sample_id_to_phenotype,
self.phenotype_to_color,
percentages=percentages)
[docs] def plot_modalities_reduced(self, sample_subset=None, feature_subset=None,
expression_thresh=-np.inf):
"""Plot splicing events with modality assignments in NMF space
This will plot a separate NMF space for each celltype in the data, as
well as one for all samples.
Parameters
----------
sample_subset : str or None
Which subset of the samples to use, based on some phenotype
column in the experiment design data. If None, all samples are
used.
feature_subset : str or None
Which subset of the features to used, based on some feature type
in the expression data (e.g. "variant"). If None, all features
are used.
expression_thresh : float
If greater than -inf, then filter on splicing events in genes
with expression at least this value
"""
if expression_thresh > -np.inf:
data = self.filter_splicing_on_expression(
expression_thresh=expression_thresh,
sample_subset=sample_subset)
sample_ids = None
feature_ids = None
else:
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(
'splicing', feature_subset, rename=False)
data = None
grouped = self.sample_id_to_phenotype.groupby(
self.sample_id_to_phenotype)
# Account for bar plot and plot of the reduced space of ALL samples
n = grouped.ngroups + 1
groups = ['all']
fig, axes = plt.subplots(ncols=n, figsize=(n * 4, 4))
all_ax = axes[0]
self.splicing.plot_modalities_reduced(sample_ids, feature_ids,
data=data,
ax=all_ax, title='all samples')
axes = axes[1:]
for i, ((celltype, series), ax) in enumerate(zip(grouped, axes)):
groups.append(celltype)
# sys.stdout.write('\n---- {} ----\n'.format(celltype))
samples = series.index.intersection(sample_ids)
# legend = i == 0
self.splicing.plot_modalities_reduced(samples, feature_ids,
data=data,
ax=ax, title=celltype)
[docs] def celltype_sizes(self, data_type='splicing'):
if data_type == 'expression':
self.expression.data.groupby(self.sample_id_to_phenotype,
axis=0).size()
if data_type == 'splicing':
self.splicing.data.groupby(self.sample_id_to_phenotype,
axis=0).size()
    @property
    def celltype_event_counts(self):
        """Number of cells that detected each event, per celltype

        Returns
        -------
        pandas.DataFrame
            Per-phenotype counts of detected (non-null) observations per
            splicing event, with zeros replaced by NaN so events never
            detected in a celltype are marked missing.
        """
        # For each phenotype group of samples, total the non-null
        # measurements per event (via a per-sample transform of counts,
        # summed over the group), then mark zero-detection events as NaN.
        return self.splicing.data.groupby(
            self.sample_id_to_phenotype, axis=0).apply(
            lambda x: x.groupby(level=0, axis=0).transform(
                lambda x: x.count()).sum()).replace(0, np.nan)
[docs] def unique_celltype_event_counts(self, n=1):
celltype_event_counts = self.celltype_event_counts
return celltype_event_counts[celltype_event_counts <= n]
[docs] def percent_unique_celltype_events(self, n=1):
n_unique = self.unique_celltype_event_counts(n).sum(axis=1)
n_total = self.celltype_event_counts.sum(axis=1).astype(float)
return n_unique / n_total * 100
# @property
# def celltype_modalities(self):
# """Return modality assignments of each celltype
# """
# return self.splicing.data.groupby(
# self.sample_id_to_phenotype, axis=0).apply(
# lambda x: self.splicing.modalities(x.index))
[docs] def plot_modalities_lavalamps(self, sample_subset=None,
feature_subset=None,
expression_thresh=-np.inf):
"""Plot each modality in each celltype on a separate axes
Parameters
----------
sample_subset : str or None
Which subset of the samples to use, based on some phenotype
column in the experiment design data. If None, all samples are
used.
feature_subset : str or None
Which subset of the features to used, based on some feature type
in the expression data (e.g. "variant"). If None, all features
are used.
expression_thresh : float
If greater than -inf, then filter on splicing events in genes
with expression at least this value
"""
if expression_thresh > -np.inf:
data = self.filter_splicing_on_expression(
expression_thresh=expression_thresh,
sample_subset=sample_subset)
sample_ids = None
feature_ids = None
else:
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(
'splicing', feature_subset, rename=False)
data = None
self.splicing.plot_modalities_lavalamps(
groupby=self.sample_id_to_phenotype,
phenotype_to_color=self.phenotype_to_color,
sample_ids=sample_ids, data=data, feature_ids=feature_ids)
[docs] def plot_event_modality_estimation(self, event_id, sample_subset=None,
expression_thresh=-np.inf):
if expression_thresh > -np.inf:
data = self.filter_splicing_on_expression(
expression_thresh=expression_thresh,
sample_subset=sample_subset)
sample_ids = None
else:
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
data = None
self.splicing.plot_event_modality_estimation(
event_id, groupby=self.sample_id_to_phenotype,
sample_ids=sample_ids, data=data)
[docs] def plot_event(self, feature_id, sample_subset=None, nmf_space=False):
"""Plot the violinplot and NMF transitions of a splicing event
"""
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
self.splicing.plot_feature(
feature_id, sample_ids,
phenotype_groupby=self.sample_id_to_phenotype,
phenotype_order=self.phenotype_order,
color=self.phenotype_color_ordered,
phenotype_to_color=self.phenotype_to_color,
phenotype_to_marker=self.phenotype_to_marker, nmf_space=nmf_space)
[docs] def plot_gene(self, feature_id, sample_subset=None, nmf_space=False):
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
self.expression.plot_feature(
feature_id, sample_ids,
phenotype_groupby=self.sample_id_to_phenotype,
phenotype_order=self.phenotype_order,
color=self.phenotype_color_ordered,
phenotype_to_color=self.phenotype_to_color,
phenotype_to_marker=self.phenotype_to_marker, nmf_space=nmf_space)
    def plot_lavalamp_pooled_inconsistent(
            self, sample_subset=None, feature_subset=None,
            fraction_diff_thresh=FRACTION_DIFF_THRESH,
            expression_thresh=-np.inf):
        """Plot events whose pooled samples disagree with the single samples

        For each phenotype, draws a "lavalamp" of the splicing events where
        the pooled sample is inconsistent with the single samples beyond
        ``fraction_diff_thresh``.

        Parameters
        ----------
        sample_subset : str or None
            If given, restrict plotting to the samples of this one subset;
            otherwise all celltypes are plotted.
        feature_subset : str or None
            Which subset of splicing features to use. If None, all features.
        fraction_diff_thresh : float
            Threshold on the fraction of single samples differing from the
            pooled sample before an event is considered inconsistent.
        expression_thresh : float
            If greater than -inf, only use splicing events in genes with at
            least this expression value.
        """
        celltype_groups = self.metadata.data.groupby(
            self.sample_id_to_phenotype, axis=0)

        if sample_subset is not None:
            # Only plotting one sample_subset
            celltype_samples = set(celltype_groups.groups[sample_subset])
        else:
            # Plotting all the celltypes
            celltype_samples = self.sample_subset_to_sample_ids(sample_subset)

        feature_ids = self.feature_subset_to_feature_ids(
            'splicing', feature_subset=feature_subset)

        # NOTE(review): ``iteritems`` and ``.ix`` below are Python 2 /
        # legacy-pandas APIs, removed in Python 3 / modern pandas.
        celltype_and_sample_ids = celltype_groups.groups.iteritems()
        for i, (phenotype, sample_ids) in enumerate(celltype_and_sample_ids):
            # Assumes all samples of a sample_subset have the same color...
            # probably wrong
            color = self.phenotype_to_color[phenotype]

            # Restrict to the selected samples of this phenotype; skip
            # phenotypes with nothing selected.
            sample_ids = celltype_samples.intersection(sample_ids)
            if len(sample_ids) == 0:
                continue
            data = self.filter_splicing_on_expression(expression_thresh)
            data = data.ix[sample_ids, :]
            self.splicing.plot_lavalamp_pooled_inconsistent(
                data, feature_ids, fraction_diff_thresh, color=color)
    def percent_pooled_inconsistent(
            self, sample_subset=None, feature_subset=None,
            fraction_diff_thresh=FRACTION_DIFF_THRESH,
            expression_thresh=-np.inf):
        """Percent of events inconsistent between pooled and single samples

        For each phenotype, computes the percentage of splicing events whose
        pooled value is inconsistent with the single samples (beyond
        ``fraction_diff_thresh``), and the number of events tested.

        Parameters
        ----------
        sample_subset : str or None
            If given, restrict the calculation to the samples of this one
            subset; otherwise all celltypes are used.
        feature_subset : str or None
            Which subset of splicing features to use. If None, all features.
        fraction_diff_thresh : float
            Threshold on the fraction of single samples differing from the
            pooled sample before an event is considered inconsistent.
        expression_thresh : float
            If greater than -inf, only use splicing events in genes with at
            least this expression value.

        Returns
        -------
        percents : pandas.Series
            Series with a (phenotype, 'n_events'/'percent') MultiIndex;
            phenotypes with no selected samples are left as NaN.
        """
        celltype_groups = self.metadata.data.groupby(
            self.sample_id_to_phenotype, axis=0)

        if sample_subset is not None:
            # Only plotting one sample_subset
            celltype_samples = set(celltype_groups.groups[sample_subset])
        else:
            # Plotting all the celltypes
            celltype_samples = self.sample_subset_to_sample_ids(sample_subset)

        feature_ids = self.feature_subset_to_feature_ids(
            'splicing', feature_subset=feature_subset)

        # NOTE(review): ``iteritems`` and ``.ix`` below are Python 2 /
        # legacy-pandas APIs, removed in Python 3 / modern pandas.
        celltype_and_sample_ids = celltype_groups.groups.iteritems()

        # Two entries per phenotype: number of events and percent
        # inconsistent; unvisited phenotypes stay NaN.
        index = pd.MultiIndex.from_product([celltype_groups.groups.keys(),
                                            ['n_events', 'percent']])
        percents = pd.Series(index=index)
        for i, (phenotype, sample_ids) in enumerate(celltype_and_sample_ids):
            # Assumes all samples of a sample_subset have the same color...
            # probably wrong
            sample_ids = celltype_samples.intersection(sample_ids)
            if len(sample_ids) == 0:
                continue
            data = self.filter_splicing_on_expression(expression_thresh)
            data = data.ix[sample_ids, :]
            if not data.empty:
                singles, pooled, not_measured_in_pooled, pooled_inconsistent \
                    = self.splicing.pooled_inconsistent(data, feature_ids,
                                                        fraction_diff_thresh)
                percent = self.splicing._divide_inconsistent_and_pooled(
                    pooled, pooled_inconsistent)
            else:
                # No data after filtering: percent is undefined
                percent = np.nan
            percents[phenotype, 'percent'] = percent
            percents[phenotype, 'n_events'] = data.shape[1]
        return percents
[docs] def expression_vs_inconsistent_splicing(self, bins=None):
"""Percentage of events inconsistent with pooled at expression threshs
Parameters
----------
bins : list-like
List of expression cutoffs
Returns
-------
expression_vs_inconsistent : pd.DataFrame
A (len(bins), n_phenotypes) dataframe of the percentage of events
in single cells that are inconsistent with pooled
"""
if bins is None:
emin = int(np.floor(self.expression.data_original.min().min()))
emax = int(np.ceil(self.expression.data_original.max().max()))
bins = np.arange(emin, emax)
expression_vs_inconsistent = pd.Series(bins).apply(
lambda x: self.percent_pooled_inconsistent(expression_thresh=x))
return expression_vs_inconsistent
    def plot_expression_vs_inconsistent_splicing(self, bins=None):
        """Plot inconsistency with pooled vs expression threshold

        Draws two stacked panels, one line per phenotype: the top shows the
        percent of splicing events inconsistent with the pooled sample at
        each expression threshold, the bottom the number of events passing
        each threshold.

        Parameters
        ----------
        bins : list-like or None
            Expression cutoffs passed to
            ``expression_vs_inconsistent_splicing``. If None, integer bins
            spanning the expression range are used.
        """
        expression_vs_inconsistent = self.expression_vs_inconsistent_splicing(
            bins=bins)
        fig, axes = plt.subplots(nrows=2, figsize=(6, 6))

        # Plot the percent inconsistent
        ax = axes[0]
        for phenotype in self.phenotype_order:
            s = expression_vs_inconsistent[(phenotype, 'percent')]
            color = self.phenotype_to_color[phenotype]
            ax.plot(s, 'o-', color=color)
        ax.set_xlabel('Expression threshold')
        ax.set_ylabel('Percent events inconsistent with pooled')
        ymin, ymax = ax.get_ylim()
        # Percentages start at zero regardless of the autoscaled lower limit
        ax.set_ylim(0, ymax)

        # Plot number of events at each cutoff
        ax = axes[1]
        for phenotype in self.phenotype_order:
            s = expression_vs_inconsistent[(phenotype, 'n_events')]
            color = self.phenotype_to_color[phenotype]
            ax.plot(s, 'o-', color=color)
        ax.set_xlabel('Expression threshold')
        ax.set_ylabel('Number of events')
        ymin, ymax = ax.get_ylim()
        ax.set_ylim(0, ymax)
        ax.legend()
        sns.despine()
[docs] def plot_clustermap(self, sample_subset=None, feature_subset=None,
data_type='expression', metric='euclidean',
method='average', figsize=None,
scale_fig_by_data=True, **kwargs):
"""Visualize hierarchical relationships within samples and features
Parameters
----------
Returns
-------
Raises
------
"""
if feature_subset is None:
feature_subset = self.default_feature_subset
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(data_type,
feature_subset,
rename=False)
if data_type == "expression":
return self.expression.plot_clustermap(
sample_ids=sample_ids, feature_ids=feature_ids, method=method,
metric=metric, sample_id_to_color=self.sample_id_to_color,
figsize=figsize, scale_fig_by_data=scale_fig_by_data,
**kwargs)
elif data_type == "splicing":
return self.splicing.plot_clustermap(
sample_ids=sample_ids, feature_ids=feature_ids, method=method,
metric=metric, sample_id_to_color=self.sample_id_to_color,
figsize=figsize, scale_fig_by_data=scale_fig_by_data,
**kwargs)
[docs] def plot_correlations(self, sample_subset=None, feature_subset=None,
data_type='expression', metric='euclidean',
method='average', figsize=None, featurewise=False,
scale_fig_by_data=True, **kwargs):
"""Visualize clustered correlations of samples across features
Parameters
----------
Returns
-------
Raises
------
"""
if feature_subset is None:
feature_subset = self.default_feature_subset
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(data_type,
feature_subset,
rename=False)
if figsize is not None and scale_fig_by_data:
raise ValueError('If "scale_fig_by_data" is true, then cannot '
'also specify "figsize"')
if data_type == "expression":
return self.expression.plot_correlations(
sample_ids=sample_ids, feature_ids=feature_ids,
sample_id_to_color=self.sample_id_to_color,
figsize=figsize, scale_fig_by_data=scale_fig_by_data,
metric=metric, method=method, featurewise=featurewise,
**kwargs)
elif data_type == "splicing":
return self.splicing.plot_correlations(
sample_ids=sample_ids, feature_ids=feature_ids, method=method,
metric=metric, sample_id_to_color=self.sample_id_to_color,
figsize=figsize, scale_fig_by_data=scale_fig_by_data,
featurewise=featurewise, **kwargs)
[docs] def plot_lavalamps(self, sample_subset=None, feature_subset=None,
expression_thresh=-np.inf):
if expression_thresh > -np.inf:
data = self.filter_splicing_on_expression(
expression_thresh=expression_thresh,
sample_subset=sample_subset)
sample_ids = None
feature_ids = None
else:
sample_ids = self.sample_subset_to_sample_ids(sample_subset)
feature_ids = self.feature_subset_to_feature_ids(
'splicing', feature_subset, rename=False)
data = None
self.splicing.plot_lavalamp(sample_ids=sample_ids,
feature_ids=feature_ids, data=data,
groupby=self.sample_id_to_phenotype,
phenotype_to_color=self.phenotype_to_color,
order=self.phenotype_order)
[docs] def plot_big_nmf_space_transitions(self, data_type='expression', n=5):
if data_type == 'expression':
self.expression.plot_big_nmf_space_transitions(
self.sample_id_to_phenotype, self.phenotype_transitions,
self.phenotype_order, self.phenotype_color_ordered,
self.phenotype_to_color, self.phenotype_to_marker, n=n)
if data_type == 'splicing':
self.splicing.plot_big_nmf_space_transitions(
self.sample_id_to_phenotype, self.phenotype_transitions,
self.phenotype_order, self.phenotype_color_ordered,
self.phenotype_to_color, self.phenotype_to_marker, n=n)
[docs] def plot_two_samples(self, sample1, sample2, data_type='expression',
**kwargs):
"""Plot a scatterplot of two samples' data
Parameters
----------
sample1 : str
Name of the sample to plot on the x-axis
sample2 : str
Name of the sample to plot on the y-axis
data_type : "expression" | "splicing"
Type of data to plot. Default "expression"
Any other keyword arguments valid for seaborn.jointplot
Returns
-------
jointgrid : seaborn.axisgrid.JointGrid
Returns a JointGrid instance
See Also
-------
seaborn.jointplot
"""
if data_type == 'expression':
return self.expression.plot_two_samples(sample1, sample2, **kwargs)
elif data_type == 'splicing':
return self.splicing.plot_two_samples(sample1, sample2, **kwargs)
[docs] def plot_two_features(self, feature1, feature2, data_type='expression',
**kwargs):
"""Make a scatterplot of two features' data
Parameters
----------
feature1 : str
Name of the feature to plot on the x-axis. If you have a
feature_data dataframe for this data type, will attempt to map
the common name, e.g. "RBFOX2" back to the crazy name,
e.g. "ENSG00000100320"
feature2 : str
Name of the feature to plot on the y-axis. If you have a
feature_data dataframe for this data type, will attempt to map
the common name, e.g. "RBFOX2" back to the crazy name,
e.g. "ENSG00000100320"
Returns
-------
Raises
------
"""
if data_type == 'expression':
self.expression.plot_two_features(
feature1, feature2, groupby=self.sample_id_to_phenotype,
label_to_color=self.phenotype_to_color, **kwargs)
if data_type == 'splicing':
self.splicing.plot_two_features(
feature1, feature2, groupby=self.sample_id_to_phenotype,
label_to_color=self.phenotype_to_color, **kwargs)
[docs] def nmf_space_positions(self, data_type='splicing'):
if data_type == 'splicing':
return self.splicing.nmf_space_positions(
self.sample_id_to_phenotype)
[docs] def nmf_space_transitions(self, phenotype_transitions='all',
data_type='splicing', n=0.5):
"""The change in NMF space of splicing events across phenotypes
Parameters
----------
phenotype_transitions : list of length-2 tuples of str
List of ('phenotype1', 'phenotype2') transitions whose change in
distribution you are interested in
data_type : 'splicing' | 'expression'
Which data type to calculate this on. (default='splicing')
n : int
Minimum number of samples per phenotype, per event
Returns
-------
big_transitions : pandas.DataFrame
A (n_events, n_transitions) dataframe of the NMF distances between
splicing events
"""
if phenotype_transitions == 'all':
phenotype_transitions = self.phenotype_transitions
if data_type == 'splicing':
return self.splicing.nmf_space_transitions(
self.sample_id_to_phenotype, phenotype_transitions, n=n)
[docs] def big_nmf_space_transitions(self, phenotype_transitions='all',
data_type='splicing', n=0.5):
"""Splicing events whose change in NMF space is large
By large, we mean that difference is 2 standard deviations away from
the mean
Parameters
----------
phenotype_transitions : list of length-2 tuples of str
List of ('phenotype1', 'phenotype2') transitions whose change in
distribution you are interested in
data_type : 'splicing' | 'expression'
Which data type to calculate this on. (default='splicing')
n : int
Minimum number of samples per phenotype, per event
Returns
-------
big_transitions : pandas.DataFrame
A (n_events, n_transitions) dataframe of the NMF distances between
splicing events
"""
if phenotype_transitions == 'all':
phenotype_transitions = self.phenotype_transitions
if data_type == 'splicing':
return self.splicing.big_nmf_space_transitions(
self.sample_id_to_phenotype, phenotype_transitions, n=n)
    def save(self, study_name, flotilla_dir=FLOTILLA_DOWNLOAD_DIR):
        """Save this study as a flotilla datapackage

        Collects the original (untransformed) data of every data type the
        study has, together with the keyword arguments needed to reconstruct
        each one on load, bumps the patch version, and writes the
        datapackage via ``make_study_datapackage``.

        Parameters
        ----------
        study_name : str
            Name to give the saved datapackage
        flotilla_dir : str
            Directory to save the datapackage in

        Returns
        -------
        The return value of ``make_study_datapackage``.
        """
        metadata = self.metadata.data_original
        # Keyword arguments needed to reconstruct the metadata object
        metadata_kws = {'pooled_col': self.metadata.pooled_col,
                        'phenotype_col': self.metadata.phenotype_col,
                        'phenotype_order': self.metadata.phenotype_order,
                        'phenotype_to_color':
                        self.metadata.phenotype_to_color,
                        'phenotype_to_marker':
                        self.metadata.phenotype_to_marker,
                        'minimum_samples': self.metadata.minimum_sample_subset,
                        'outlier_col': self.metadata.outlier_col}

        # Every data type below is optional: a study built without it simply
        # lacks the attribute, so AttributeError means "not present" and we
        # record None instead.
        try:
            expression = self.expression.data_original
            expression_kws = {
                'log_base': self.expression.log_base,
                'thresh': self.expression.thresh_original,
                'plus_one': self.expression.plus_one}
        except AttributeError:
            expression = None
            expression_kws = None

        try:
            expression_feature_data = self.expression.feature_data
            expression_feature_kws = {
                'rename_col': self.expression.feature_rename_col,
                'ignore_subset_cols':
                    self.expression.feature_ignore_subset_cols}
        except AttributeError:
            expression_feature_data = None
            expression_feature_kws = None

        try:
            splicing = self.splicing.data_original
            splicing_kws = {}
        except AttributeError:
            splicing = None
            splicing_kws = None

        try:
            splicing_feature_data = self.splicing.feature_data
            splicing_feature_kws = \
                {'rename_col': self.splicing.feature_rename_col,
                 'ignore_subset_cols':
                     self.splicing.feature_ignore_subset_cols,
                 'expression_id_col': self.splicing.feature_expression_id_col}
        except AttributeError:
            splicing_feature_data = None
            splicing_feature_kws = None

        try:
            spikein = self.spikein.data_original
        except AttributeError:
            spikein = None

        try:
            gene_ontology = self.gene_ontology.data
        except AttributeError:
            gene_ontology = None

        try:
            mapping_stats = self.mapping_stats.data_original
            mapping_stats_kws = {
                'number_mapped_col': self.mapping_stats.number_mapped_col,
                'min_reads': self.mapping_stats.min_reads}
        except AttributeError:
            mapping_stats = None
            mapping_stats_kws = None

        # Collect every non-callable, non-dunder attribute of the
        # supplemental data holder; each becomes its own saved resource.
        supplemental_attributes = inspect.getmembers(self.supplemental,
                                                     lambda a: not
                                                     (inspect.isroutine(a)))
        supplemental_attributes = [a for a in supplemental_attributes
                                   if not(a[0].startswith('__') and
                                          a[0].endswith('__'))]
        supplemental_kws = {}
        for supplemental_name, df in supplemental_attributes:
            supplemental_kws[supplemental_name] = df

        # Increase the version number
        version = semantic_version.Version(self.version)
        version.patch = version.patch + 1
        version = str(version)

        return make_study_datapackage(
            study_name, metadata, expression, splicing,
            spikein, mapping_stats, metadata_kws=metadata_kws,
            expression_kws=expression_kws, splicing_kws=splicing_kws,
            mapping_stats_kws=mapping_stats_kws,
            expression_feature_kws=expression_feature_kws,
            expression_feature_data=expression_feature_data,
            splicing_feature_data=splicing_feature_data,
            splicing_feature_kws=splicing_feature_kws, species=self.species,
            license=self.license, title=self.title, sources=self.sources,
            version=version, flotilla_dir=flotilla_dir,
            gene_ontology=gene_ontology, supplemental_kws=supplemental_kws)
@staticmethod
def _maybe_get_axis_name(df, axis=0, alt_name=None):
if alt_name is None:
alt_name = 'columns' if axis == 1 else 'index'
axis = df.columns if axis == 1 else df.index
if isinstance(axis, pd.MultiIndex):
name = axis.names
else:
name = axis.name
name = alt_name if name is None else name
return name
    @cached_property()
    def tidy_splicing_with_expression(self):
        """A tall 'tidy' dataframe of samples with expression and splicing

        Melts the splicing and expression matrices into long format, maps
        each splicing event to its gene's expression id(s) (stored as a
        comma-separated string in the splicing feature data), and merges
        the two on (sample id, common id).

        Returns
        -------
        pandas.DataFrame
            One row per (sample, event, gene id) with 'psi' and
            'expression' columns.
        """
        # Establish common strings
        # Comma-separated expression ids of the gene(s) for each event
        splicing_common_id = self.splicing.feature_data[
            self.splicing.feature_expression_id_col]

        # Tidify splicing
        splicing = self.splicing.data
        splicing_index_name = self._maybe_get_axis_name(splicing, axis=0)
        splicing_columns_name = self._maybe_get_axis_name(splicing, axis=1)

        splicing_tidy = pd.melt(splicing.reset_index(),
                                id_vars=splicing_index_name,
                                value_name='psi',
                                var_name=splicing_columns_name)

        # Expand events mapped to several genes ("id1,id2") into one
        # (event, gene id) pair per row.
        # NOTE(review): ``iteritems`` and ``.ix`` below are Python 2 /
        # legacy-pandas APIs, removed in Python 3 / modern pandas.
        s = splicing_common_id.dropna()
        event_name_to_ensembl_ids = list(itertools.chain(
            *[zip([k] * len(v.split(',')), v.split(',')) for k, v in
              s.iteritems()]))
        index, data = zip(*event_name_to_ensembl_ids)
        event_name_to_ensembl_ids = pd.Series(data, index=index,
                                              name=self._common_id)

        # reset_index() produces literal 'index'/'columns' names when the
        # axes were unnamed; rename those to the canonical column names.
        rename_columns = {}
        if splicing_index_name == 'index':
            rename_columns[splicing_index_name] = self._sample_id
        if splicing_columns_name == 'columns':
            rename_columns[splicing_columns_name] = self._event_name
            splicing_columns_name = self._event_name
        splicing_tidy = splicing_tidy.rename(columns=rename_columns)

        # Attach the gene id(s) to each splicing row
        splicing_tidy = splicing_tidy.set_index(splicing_columns_name)
        splicing_tidy = splicing_tidy.ix[event_name_to_ensembl_ids.index]
        splicing_tidy = splicing_tidy.join(event_name_to_ensembl_ids)

        splicing_tidy = splicing_tidy.dropna().reset_index()
        splicing_tidy = splicing_tidy.rename(
            columns={'index': self._event_name})

        # Tidify expression
        expression = self.expression.data_original
        expression_index_name = self._maybe_get_axis_name(expression, axis=0)

        expression_tidy = pd.melt(expression.reset_index(),
                                  id_vars=expression_index_name,
                                  value_name='expression',
                                  var_name=self._common_id)
        # This will only do anything if there is a column named "index" so
        # no need to check anything
        expression_tidy = expression_tidy.rename(
            columns={'index': self._sample_id})
        expression_tidy = expression_tidy.dropna()

        # Inner merge keeps only (sample, gene) pairs present in both
        splicing_tidy_with_expression = splicing_tidy.merge(
            expression_tidy, left_on=[self._sample_id, self._common_id],
            right_on=[self._sample_id, self._common_id])
        return splicing_tidy_with_expression
    def filter_splicing_on_expression(self, expression_thresh,
                                      sample_subset=None):
        """Filter splicing events on expression values

        Parameters
        ----------
        expression_thresh : float
            Minimum expression value, of the original input. E.g. if the
            original input is already log-transformed, then this threshold is
            on the log values.
        sample_subset : str or None
            Which subset of samples to restrict to before filtering. If
            None, all samples are used.

        Returns
        -------
        psi : pandas.DataFrame
            A (n_samples, n_features) matrix of PSI values for events whose
            gene expression meets the threshold, or the unfiltered splicing
            data when the threshold cannot exclude anything.
        """
        min_expression = self.expression.data_original.min().min()
        # If the threshold is vacuous, skip the expensive tidy/merge/pivot
        # round-trip and return the splicing data unchanged.
        if expression_thresh > -np.inf \
                and expression_thresh > min_expression:
            columns = self._maybe_get_axis_name(self.splicing.data, axis=1,
                                                alt_name=self._event_name)
            index = self._maybe_get_axis_name(self.splicing.data, axis=0,
                                              alt_name=self._sample_id)

            sample_ids = self.sample_subset_to_sample_ids(sample_subset)
            # NOTE(review): ``.ix`` is a legacy pandas API, removed in
            # modern versions.
            splicing_with_expression = \
                self.tidy_splicing_with_expression.ix[
                    self.tidy_splicing_with_expression.sample_id.isin(
                        sample_ids)]
            ind = splicing_with_expression.expression >= expression_thresh
            splicing_high_expression = splicing_with_expression.ix[ind]
            splicing_high_expression = \
                splicing_high_expression.reset_index().dropna()

            # MultiIndex axes need pivot_table; plain axes can use pivot
            if isinstance(columns, list) or isinstance(index, list):
                filtered_psi = splicing_high_expression.pivot_table(
                    columns=columns, index=index, values='psi')
            else:
                filtered_psi = splicing_high_expression.pivot(
                    columns=columns, index=index, values='psi')
            return filtered_psi
        else:
            return self.splicing.data
[docs] def go_enrichment(self, feature_ids, background=None, domain=None,
p_value_cutoff=1000000, min_feature_size=3,
min_background_size=5):
"""Calculate gene ontology enrichment of provided features
Parameters
----------
feature_ids : list-like
Features to calculate gene ontology enrichment on
background : list-like, optional
Features to use as the background
domain : str or list, optional
Only calculate GO enrichment for a particular GO category or
subset of categories. Valid domains:
'biological_process', 'molecular_function', 'cellular_component'
p_value_cutoff : float, optional
Maximum accepted Bonferroni-corrected p-value
min_feature_size : int, optional
Minimum number of features of interest overlapping in a GO Term,
to calculate enrichment
min_background_size : int, optional
Minimum number of features in the background overlapping a GO Term
Returns
-------
enrichment : pandas.DataFrame
A (go_categories, columns) dataframe showing the GO
enrichment categories that were enriched in the features
"""
if background is None:
warnings.warn('No background provided, defaulting to all '
'expressed genes')
background = self.expression.data.columns
return self.gene_ontology.enrichment(
feature_ids, background=background,
cross_reference=self.expression.feature_renamer_series,
domain=domain, p_value_cutoff=p_value_cutoff,
min_feature_size=min_feature_size,
min_background_size=min_background_size)
# Add interactive visualizations
# These module-level assignments attach the IPython-widget entry points from
# the Interactive mixin onto Study after its class body, so e.g.
# ``study.interactive_pca()`` is available on every Study instance.
Study.interactive_classifier = Interactive.interactive_classifier
Study.interactive_graph = Interactive.interactive_graph
Study.interactive_pca = Interactive.interactive_pca
# Study.interactive_localZ = Interactive.interactive_localZ
Study.interactive_lavalamp_pooled_inconsistent = \
    Interactive.interactive_lavalamp_pooled_inconsistent
Study.interactive_choose_outliers = Interactive.interactive_choose_outliers
Study.interactive_reset_outliers = Interactive.interactive_reset_outliers
Study.interactive_clustermap = Interactive.interactive_clustermap
Study.interactive_correlations = Interactive.interactive_correlations