Source code for flotilla.data_model.metadata

from collections import defaultdict
import sys
import warnings
from itertools import cycle

import matplotlib as mpl
import pandas as pd
import seaborn as sns

from .base import BaseData, subsets_from_metadata
from ..visualize.color import str_to_color


# Default value of MetaData's ``pooled_col`` argument.
POOLED_COL = 'pooled'
# Default value of MetaData's ``phenotype_col`` argument: the metadata column
# from which each sample's phenotype is read.
PHENOTYPE_COL = 'phenotype'
# Default value of MetaData's ``minimum_sample_subset`` argument, which is
# forwarded to ``subsets_from_metadata`` when building sample subsets.
MINIMUM_SAMPLE_SUBSET = 10
# Default value of MetaData's ``outlier_col`` argument.
OUTLIER_COL = 'outlier'


class MetaData(BaseData):
    """Sample metadata plus phenotype ordering, colors and plot markers.

    Every sample's phenotype is read from the ``phenotype_col`` column of
    ``self.data``. On top of that column the class exposes a display order
    for the phenotypes, a phenotype -> color mapping and a
    phenotype -> matplotlib marker mapping, each falling back on generated
    defaults for anything the caller did not specify.

    Parameters
    ----------
    data : pandas.DataFrame
        Sample metadata, passed through to ``BaseData.__init__``.
        NOTE(review): ``self.data`` is assumed to be set by ``BaseData``;
        that class is not visible from this module.
    phenotype_order : list, optional
        Desired display order of the phenotypes. Phenotypes present in the
        data but absent from this list are appended (sorted) at the end.
    phenotype_to_color : dict, optional
        Maps phenotype -> color. Values that are keys of ``str_to_color``
        are translated through it; other values are kept as given.
    phenotype_to_marker : dict, optional
        Maps phenotype -> matplotlib marker. Markers that are not in
        ``MarkerStyle.filled_markers`` are replaced by a default.
    phenotype_col : str, optional
        Column holding the phenotype of each sample. If the column is
        missing it is created with the constant value ``'phenotype'``, and
        any supplied ``phenotype_order``/``phenotype_to_color`` is ignored.
    pooled_col, outlier_col : str, optional
        Column names, stored as attributes.
    predictor_config_manager : object, optional
        Passed through to ``BaseData.__init__``.
    ignore_subset_cols : optional
        Forwarded as ``ignore`` to ``subsets_from_metadata``.
    minimum_sample_subset : int, optional
        Forwarded to ``subsets_from_metadata``.
    """

    # Markers handed out, in this order, to phenotypes without a usable one.
    _DEFAULT_MARKERS = ('o', '^', 's', 'v', '*', 'D')

    def __init__(self, data, phenotype_order=None, phenotype_to_color=None,
                 phenotype_to_marker=None,
                 phenotype_col=PHENOTYPE_COL,
                 pooled_col=POOLED_COL,
                 outlier_col=OUTLIER_COL,
                 predictor_config_manager=None,
                 ignore_subset_cols=None,
                 minimum_sample_subset=MINIMUM_SAMPLE_SUBSET):
        super(MetaData, self).__init__(
            data, outliers=None,
            predictor_config_manager=predictor_config_manager)

        self.data_original = self.data

        # NOTE(review): ``_default_phenotype_col`` is not defined in this
        # class; presumably inherited from BaseData -- confirm.
        self.phenotype_col = phenotype_col if phenotype_col is not None else \
            self._default_phenotype_col
        self.pooled_col = pooled_col
        self.minimum_sample_subset = minimum_sample_subset
        self.outlier_col = outlier_col
        self.ignore_subset_cols = ignore_subset_cols

        # This check has to run before anything reads the phenotype column
        # (the phenotype_order setter and unique_phenotypes both do),
        # otherwise a missing column raises KeyError instead of falling
        # back on a single shared phenotype.
        if self.phenotype_col not in self.data:
            sys.stderr.write('The required column name "{}" does not exist in '
                             'the sample metadata. All samples will be '
                             'treated as the same phenotype. You may also '
                             'specify "phenotype_col" in the metadata section '
                             'of the '
                             'datapackage.\n'.format(self.phenotype_col))
            self.data[self.phenotype_col] = 'phenotype'
            phenotype_order = None
            phenotype_to_color = None

        # Work on copies: the mappings are filled in below and the caller's
        # containers should not change underneath them.
        if phenotype_order is not None:
            phenotype_order = list(phenotype_order)
        if phenotype_to_color is not None:
            phenotype_to_color = dict(phenotype_to_color)
        if phenotype_to_marker is not None:
            phenotype_to_marker = dict(phenotype_to_marker)

        self.phenotype_order = phenotype_order
        self.phenotype_to_color = phenotype_to_color

        # Append phenotypes that exist in the data but were left out of the
        # requested order. The ``phenotype_order`` property returns a fresh
        # list on every access, so the stored list must be replaced;
        # extending the property's return value would be lost.
        phenotypes_not_in_order = set(self.unique_phenotypes).difference(
            self.phenotype_order)
        if phenotypes_not_in_order:
            self._phenotype_order = (list(self._phenotype_order) +
                                     sorted(phenotypes_not_in_order))

        # Convert color strings to non-default matplotlib colors. The
        # property returns a new dict, so writing to the backing dict while
        # iterating over it is safe.
        for phenotype, color in self.phenotype_to_color.items():
            try:
                color = str_to_color[color]
            except KeyError:
                pass
            self._phenotype_to_color[phenotype] = color

        self.phenotype_to_marker = phenotype_to_marker

        markers = cycle(self._DEFAULT_MARKERS)
        # Snapshot the mapping once: validated markers are written to the
        # backing dict, and each phenotype should be judged against the
        # mapping as it was before validation started.
        requested_markers = self.phenotype_to_marker
        for phenotype in self.unique_phenotypes:
            try:
                marker = requested_markers[phenotype]
            except KeyError:
                marker = next(markers)
                sys.stderr.write(
                    '{} does not have marker style, '
                    'falling back on "{}"'.format(phenotype, marker))
            if marker not in mpl.markers.MarkerStyle.filled_markers:
                correct_marker = next(markers)
                sys.stderr.write(
                    '{} is not a valid matplotlib marker style, '
                    'falling back on "{}"'.format(marker, correct_marker))
                marker = correct_marker
            # Persist on the backing dict; the dict returned by the
            # ``phenotype_to_marker`` property is a throwaway copy.
            self._phenotype_to_marker[phenotype] = marker

    @classmethod
    def _new_default_phenotype_to_marker(cls):
        """Return an empty defaultdict that cycles through default markers."""
        markers = cycle(cls._DEFAULT_MARKERS)

        def marker_factory():
            return next(markers)

        return defaultdict(marker_factory)

    @property
    def sample_id_to_phenotype(self):
        """The phenotype column of the metadata (``data[phenotype_col]``)."""
        return self.data[self.phenotype_col]

    @property
    def unique_phenotypes(self):
        """Distinct values of the phenotype column, via ``.unique()``."""
        return self.sample_id_to_phenotype.unique()

    @property
    def n_phenotypes(self):
        """Number of distinct phenotypes."""
        return len(self.unique_phenotypes)

    @property
    def _default_phenotype_order(self):
        """Sorted list of the distinct phenotypes."""
        return sorted(self.unique_phenotypes)

    @property
    def phenotype_order(self):
        """Phenotypes in display order, restricted to those in the data.

        Returns the stored order filtered to phenotypes that occur in the
        data. If none of the stored phenotypes occur in the data, returns
        the sorted distinct phenotypes instead. A new list is built on every
        access, so mutating the returned list does not change the stored
        order.
        """
        # Compute the distinct phenotypes once rather than once per element.
        present = set(self.unique_phenotypes)
        ordered = [v for v in self._phenotype_order if v in present]
        if ordered:
            return ordered
        return self._default_phenotype_order

    @phenotype_order.setter
    def phenotype_order(self, value):
        """Store ``value``, or the sorted distinct phenotypes if ``None``."""
        if value is not None:
            self._phenotype_order = value
        else:
            self._phenotype_order = self._default_phenotype_order

    @property
    def phenotype_transitions(self):
        """List of ``(phenotype, next_phenotype)`` pairs in display order."""
        order = self.phenotype_order
        # list() keeps the Python 2 return type (a list) on Python 3.
        return list(zip(order[:-1], order[1:]))

    @property
    def _colors(self):
        """One hex color per distinct phenotype from seaborn's 'husl'."""
        palette = sns.color_palette('husl', n_colors=self.n_phenotypes)
        return [mpl.colors.rgb2hex(color) for color in palette]

    @property
    def _default_phenotype_to_color(self):
        """Empty defaultdict handing out ``_colors`` to new keys in turn."""
        colors = iter(self._colors)

        def color_factory():
            return next(colors)

        return defaultdict(color_factory)

    @property
    def phenotype_to_color(self):
        """Dict of phenotype -> color.

        Covers every key of the stored mapping plus every phenotype in
        ``phenotype_order``. Stored colors win; the remaining phenotypes get
        a generated default color. A new dict is built on every access.
        """
        defaults = self._default_phenotype_to_color
        explicit = self._phenotype_to_color
        all_phenotypes = list(explicit.keys())
        all_phenotypes.extend(self.phenotype_order)

        mapping = {}
        for phenotype in all_phenotypes:
            if phenotype in explicit:
                mapping[phenotype] = explicit[phenotype]
            else:
                mapping[phenotype] = defaults[phenotype]
        return mapping

    @phenotype_to_color.setter
    def phenotype_to_color(self, value):
        """Store ``value``; on ``None`` warn on stderr and use defaults."""
        if value is not None:
            self._phenotype_to_color = value
        else:
            sys.stderr.write('No phenotype to color mapping was provided, '
                             'falling back on reasonable defaults.\n')
            self._phenotype_to_color = self._default_phenotype_to_color

    @property
    def phenotype_to_marker(self):
        """Dict of phenotype -> matplotlib marker.

        Covers every key of the stored mapping plus every phenotype in
        ``phenotype_order``. Stored markers win; the remaining phenotypes
        get markers from the default cycle. A new dict is built on every
        access, so assigning into the returned dict does not change the
        stored mapping.
        """
        defaults = self._new_default_phenotype_to_marker()
        explicit = self._phenotype_to_marker
        all_phenotypes = list(explicit.keys())
        all_phenotypes.extend(self.phenotype_order)

        mapping = {}
        for phenotype in all_phenotypes:
            if phenotype in explicit:
                mapping[phenotype] = explicit[phenotype]
            else:
                mapping[phenotype] = defaults[phenotype]
        return mapping

    @phenotype_to_marker.setter
    def phenotype_to_marker(self, value):
        """Store ``value``; on ``None`` warn on stderr and use defaults."""
        if value is not None:
            self._phenotype_to_marker = value
        else:
            sys.stderr.write('No phenotype to marker (matplotlib plotting '
                             'symbol) was provided, falling back on reasonable'
                             ' defaults.\n')
            self._phenotype_to_marker = \
                self._new_default_phenotype_to_marker()

    @property
    def phenotype_color_order(self):
        """Colors of the phenotypes, listed in ``phenotype_order``."""
        # Build the mapping once instead of once per phenotype.
        phenotype_to_color = self.phenotype_to_color
        return [phenotype_to_color[p] for p in self.phenotype_order]

    @property
    def sample_id_to_color(self):
        """pandas.Series mapping each sample id to its phenotype's color."""
        # Build the mapping once instead of once per sample.
        phenotype_to_color = self.phenotype_to_color
        return pd.Series(
            dict((sample_id, phenotype_to_color[p])
                 for sample_id, p in self.sample_id_to_phenotype.items()))

    @property
    def sample_subsets(self):
        """Result of ``subsets_from_metadata`` for this metadata.

        Called with ``self.data``, ``minimum_sample_subset``, the label
        ``'samples'`` and ``ignore=ignore_subset_cols``.
        """
        return subsets_from_metadata(self.data, self.minimum_sample_subset,
                                     'samples',
                                     ignore=self.ignore_subset_cols)

    @property
    def phenotype_series(self):
        """Phenotype column of the metadata; emits a deprecation warning.

        Returns the same column as ``sample_id_to_phenotype``.
        """
        warnings.warn('MetaData.phenotype_series will be deprecated in 0.3.0')
        return self.data[self.phenotype_col]
Olga B. Botvinnik is funded by the NDSEG fellowship and is a NumFOCUS John Hunter Technology Fellow.
Michael T. Lovci was partially funded by a fellowship from Genentech.
Partially funded by NIH grants NS075449 and HG004659 and CIRM grants RB4-06045 and TR3-05676 to Gene Yeo.