Source code for flotilla.data_model.gene_ontology

from collections import defaultdict, Iterable
import sys
import warnings

import pandas as pd
from scipy.stats import hypergeom

from ..util import timestamp


[docs]class GeneOntologyData(object):

    domains = frozenset(['biological_process', 'molecular_function',
                         'cellular_component'])

    def __init__(self, data):
        """Object to calculate enrichment of Gene Ontology terms

        Acceptable Gene Ontology tables can be downloaded from ENSEMBL's
        BioMart tool: http://www.ensembl.org/biomart

        1. Choose "Ensembl Genes ##" (## = version number, for me it's 78)
        2. Click "Attributes"
        3. Expand "EXTERNAL"
        4. Check the boxes for 'GO Term Accession', 'Ensembl Gene ID',
           'GO Term Name', and 'GO domain'

        Parameters
        ----------
        data : pandas.DataFrame
            A dataframe with at least the following columns:
            'GO Term Accession', 'Ensembl Gene ID', 'GO Term Name', 'GO domain'
        """

        self.data = data.dropna()

        # Need "data_original" to be consistent with other datatypes
        self.data_original = self.data
        sys.stdout.write('{}\tBuilding Gene Ontology '
                         'database...\n'.format(timestamp()))
        self.ontology = defaultdict(dict)
        for go, df in data.groupby('GO Term Accession'):
            self.ontology[go]['genes'] = set(df['Ensembl Gene ID'])
            self.ontology[go]['name'] = df['GO Term Name'].values[0]
            self.ontology[go]['domain'] = df['GO domain'].values[0]
            self.ontology[go]['n_genes'] = len(self.ontology[go]['genes'])
        sys.stdout.write('{}\t\tDone.'.format(timestamp()))

        self.all_genes = self.data['Ensembl Gene ID'].unique()

[docs]    def enrichment(self, features_of_interest, background=None,
                   p_value_cutoff=1000000, cross_reference=None,
                   min_feature_size=3, min_background_size=5,
                   domain=None):
        """Bonferroni-corrected hypergeometric p-values of GO enrichment

        Calculates hypergeometric enrichment of the features of interest, in
        each GO category.

        Parameters
        ----------
        features_of_interest : list-like
            List of features. Must match the identifiers in the ontology
            database exactly, i.e. if your ontology database is ENSEMBL ids,
            then you can only provide those and not common names like "RBFOX2"
        background : list-like, optional
            Background genes to use. It is best to use a relevant background
            such as all expressed genes. If None, defaults to all genes.
        p_value_cutoff : float, optional
            Maximum accepted Bonferroni-corrected p-value
        cross_reference : dict-like, optional
            A mapping of gene ids to gene symbols, e.g. a pandas Series of
            ENSEMBL genes e.g. ENSG00000139675 to gene symbols e.g HNRNPA1L2
        min_feature_size : int, optional
            Minimum number of features of interest overlapping in a GO Term,
            to calculate enrichment
        min_background_size : int, optional
            Minimum number of features in the background overlapping a GO Term
        domain : str or list, optional
            Only calculate GO enrichment for a particular GO category or
            subset of categories. Valid domains:
            'biological_process', 'molecular_function', 'cellular_component'

        Returns
        -------
        enrichment_df : pandas.DataFrame
            A (n_go_categories, columns) DataFrame of the enrichment scores

        Raises
        ------
        ValueError
            If features of interest and background do not overlap, or invalid
            GO domains are given
        """
        cross_reference = {} if cross_reference is None else cross_reference
        background = self.all_genes if background is None else background
        if len(set(background) & set(features_of_interest)) == 0:
            raise ValueError('Features of interest and background do not '
                             'overlap! Not calculating GO enrichment')
        if len(set(features_of_interest) & set(self.all_genes)) == 0:
            raise ValueError('Features of interest do not overlap with GO term'
                             'gene ids. Not calculating GO enrichment.')
        domains = self.domains
        valid_domains = ",".join("'{}'".format(x) for x in self.domains)

        if isinstance(domain, str):
            if domain not in self.domains:
                raise ValueError(
                    "'{}' is not a valid GO domain. "
                    "Only {} are acceptable".format(domain, valid_domains))
            domains = frozenset([domain])
        elif isinstance(domain, Iterable):
            if len(set(domain) & self.domains) == 0:
                raise ValueError(
                    "'{}' are not a valid GO domains. "
                    "Only {} are acceptable".format(
                        ",".join("'{}'".format(x) for x in domain),
                        valid_domains))
            domains = frozenset(domain)

        n_all_genes = len(background)
        n_features_of_interest = len(features_of_interest)
        enrichment = defaultdict(dict)

        for go_term, go_genes in self.ontology.items():
            if go_genes['domain'] not in domains:
                continue

            features_in_go = go_genes['genes'].intersection(
                features_of_interest)
            background_in_go = go_genes['genes'].intersection(background)
            too_few_features = len(features_in_go) < min_feature_size
            too_few_background = len(background_in_go) < min_background_size
            if too_few_features or too_few_background:
                continue

            # Survival function is more accurate on small p-values
            p_value = hypergeom.sf(len(features_in_go), n_all_genes,
                                   len(background_in_go),
                                   n_features_of_interest)
            p_value = 0 if p_value < 0 else p_value
            symbols = [cross_reference[f] if f in cross_reference else f for f
                       in features_in_go]
            enrichment['p_value'][go_term] = p_value
            enrichment['n_features_of_interest_in_go_term'][go_term] = len(
                features_in_go)
            enrichment['n_background_in_go_term'][go_term] = len(
                background_in_go)
            enrichment['n_features_total_in_go_term'][go_term] = len(
                go_genes['genes'])
            enrichment['features_of_interest_in_go_term'][
                go_term] = ','.join(features_in_go)
            enrichment['features_of_interest_in_go_term_gene_symbols'][
                go_term] = ','.join(symbols)
            enrichment['go_domain'][go_term] = go_genes['domain']
            enrichment['go_name'][go_term] = go_genes['name']
        enrichment_df = pd.DataFrame(enrichment)

        if enrichment_df.empty:
            warnings.warn('No GO categories enriched in provided features')
            return

        # Bonferonni correction
        enrichment_df['bonferonni_corrected_p_value'] = \
            enrichment_df.p_value * enrichment_df.shape[0]
        ind = enrichment_df['bonferonni_corrected_p_value'] < p_value_cutoff
        enrichment_df = enrichment_df.ix[ind]
        enrichment_df = enrichment_df.sort(columns=['p_value'])

        return enrichment_df