Source code for outrigger.index.events

import itertools
import logging

import graphlite
import joblib
import numpy as np
import pandas as pd
from graphlite import V

from ..common import STRAND, ISOFORM_ORDER, ISOFORM_COMPONENTS, \
    EVENT_ID, INCOMPATIBLE_JUNCTIONS, SPLICE_ABBREVS, \
    SPLICE_TYPE_ALL_EXONS, SPLICE_TYPE_ALL_JUNCTIONS, CHROM, UPSTREAM, \
    DOWNSTREAM, DIRECTIONS
from outrigger.region import Region
from ..util import progress, done


[docs]def stringify_location(chrom, start, stop, strand, region=None): """Convert genome location to a string, optionally prefixing with region""" if region is not None: return '{0}:{1}:{2}-{3}:{4}'.format(region, chrom, start, stop, strand) else: return '{0}:{1}-{2}:{3}'.format(chrom, start, stop, strand)
[docs]def opposite(direction): return UPSTREAM if direction == DOWNSTREAM else DOWNSTREAM
[docs]class SpliceGraph(object): def __init__(self, junction_exon_triples, junction_col='junction', exon_col='exon', splice_types=SPLICE_ABBREVS): self.junction_exon_triples = junction_exon_triples self.junction_col = junction_col self.exon_col = exon_col self.log = logging.getLogger('SpliceGraph') self.splice_types = splice_types self.event_finders = tuple((splice_type, finder) for splice_type, finder in self._all_event_finders if splice_type in self.splice_types) self._make_graph(junction_exon_triples) def _make_graph(self, junction_exon_triples): self.graph = graphlite.connect(":memory:", graphs=DIRECTIONS) self.exons = tuple(junction_exon_triples[self.exon_col].unique()) self.n_exons = len(self.exons) self.junctions = tuple( junction_exon_triples[self.junction_col].unique()) # Exons are always first to make iteration easy self.items = tuple(np.concatenate([self.exons, self.junctions])) self.item_to_region = pd.Series(map(Region, self.items), index=self.items) with self.graph.transaction() as tr: for i, row in self.junction_exon_triples.iterrows(): junction = row[self.junction_col] exon = row[self.exon_col] junction_i = self.items.index(junction) exon_i = self.items.index(exon) self.log.debug('\n{} is {} of {}\n'.format( exon, row.direction, junction)) self.log.debug('{} is {} of {}\n'.format( junction, opposite(row.direction), exon)) tr.store(getattr(V(exon_i), row.direction)(junction_i)) tr.store(getattr(V(junction_i), opposite(row.direction))(exon_i)) # To speed up queries self.graph.db.execute("ANALYZE upstream") self.graph.db.execute("ANALYZE downstream")
[docs] def exons_one_junction_downstream(self, exon_i): """Get the exon(s) that are immediately downstream of this one Get exons that are downstream from this one, separated by one junction Parameters ---------- exon_i : int Integer identifier of the exon whose downstream exons you want. This is the exon's index location in self.exon_cols Returns ------- downstream_exons : graphlite.Query Integer identfiers of exons which are one junction downstream of the provided one """ return self.graph.find( V().downstream(exon_i)).traverse(V().upstream)
[docs] def exons_one_junction_upstream(self, exon_query): """Get the exon(s) that are immediately upstream of this one Get exons that are upstream from this one, separated by one junction Parameters ---------- exon_query : graphlite.Query Integer identifier of the exon whose upstream exons you want. This is the exon's index location in self.exons Returns ------- upstream_exons : graphlite.Query Integer identfiers of exon_cols which are one junction upstream of the provided one """ return exon_query.traverse(V().downstream).traverse( V().downstream)
[docs] def exons_two_junctions_downstream(self, exon_i): """Get the exon(s) that are two junction hops downstream Go one exon downstream, then one more exon. This is all the 2nd level exon_cols Parameters ---------- exon_i : int Integer identifier of the exon whose downstream exon_cols you want. This is the exon's index location in self.exon_cols Returns ------- downstream_exons : graphlite.Query Integer identfiers of exon_cols which are separated from the original exon by a junction, exon, and another junction """ return self.graph.find(V().downstream(exon_i)).traverse( V().upstream).traverse(V().upstream).traverse(V().upstream)
[docs] def junctions_between_exons(self, exon_a, exon_b): """Get the junctions between exonA and exonB""" return self.graph.find( V(exon_a).upstream) \ .intersection(V(exon_b).downstream)
def _skipped_exon(self, exon1_i, exon1_name): """Checks if this exon could be exon1 of an SE event""" events = {} exon23s = list(self.exons_one_junction_downstream(exon1_i)) exon23s = self.item_to_region[[self.items[i] for i in exon23s]] for exon_a, exon_b in itertools.combinations(exon23s, 2): if not exon_a.overlaps(exon_b): exon2 = min((exon_a, exon_b), key=lambda x: x._start) exon3 = max((exon_a, exon_b), key=lambda x: x._start) exon2_i = self.exons.index(exon2.name) exon3_i = self.exons.index(exon3.name) exon23_junction = list(self.graph.find( V(exon2_i).upstream).intersection( V().upstream(exon3_i))) if len(exon23_junction) > 0: # Isoform 1 - corresponds to Psi=0. Exclusion of exon2 exon13_junction = self.junctions_between_exons( exon1_i, exon3_i) # Isoform 2 - corresponds to Psi=1. Inclusion of exon2 exon12_junction = self.junctions_between_exons( exon1_i, exon2_i) junctions_i = list(itertools.chain( *[exon13_junction, exon12_junction, exon23_junction])) junctions = [self.items[i] for i in junctions_i] exons = exon1_name, exon2.name, exon3.name events[exons] = junctions return events def _mutually_exclusive_exon(self, exon1_i, exon1_name): """Checks if this exon could be exon1 of a MXE event""" events = {} exon23s_from1 = self.exons_one_junction_downstream(exon1_i) exon4s = self.exons_two_junctions_downstream(exon1_i) exon23s_from4 = self.exons_one_junction_upstream(exon4s) exon23s = set(exon23s_from4) & set(exon23s_from1) exon23s = [self.items[i] for i in exon23s] exon23s = self.item_to_region[exon23s] for exon_a, exon_b in itertools.combinations(exon23s, 2): if not exon_a.overlaps(exon_b): exon2 = min((exon_a, exon_b), key=lambda x: x._start) exon3 = max((exon_a, exon_b), key=lambda x: x._start) exon2_i = self.items.index(exon2.name) exon3_i = self.items.index(exon3.name) exon4_from2 = set( self.exons_one_junction_downstream(exon2_i)) exon4_from3 = set( self.exons_one_junction_downstream(exon3_i)) exon4_is = exon4_from2 & exon4_from3 try: for exon4_i in exon4_is: exon4_name = self.items[exon4_i] # print(exon1_name, exon2.name, exon3.name, exon4_name) # Isoform 1 - corresponds to Psi=0. Inclusion of # exon3 exon13_junction = self.junctions_between_exons( exon1_i, exon3_i) exon34_junction = self.junctions_between_exons( exon3_i, exon4_i) # Isoform 2 - corresponds to Psi=1. Inclusion of # exon2 exon12_junction = self.junctions_between_exons( exon1_i, exon2_i) exon24_junction = self.junctions_between_exons( exon2_i, exon4_i) exon_tuple = exon1_name, exon2.name, exon3.name, \ exon4_name # print exon12_junction.next() junctions_i = itertools.chain(*[exon13_junction, exon34_junction, exon12_junction, exon24_junction]) junctions = [self.items[i] for i in junctions_i] events[exon_tuple] = junctions except KeyError: pass return events
[docs] def single_exon_alternative_events(self, exon_i, exon_name): events = {} for event_type, event_finder in self.event_finders: events[event_type] = event_finder(exon_i, exon_name) return events
@property def _all_event_finders(self): finders = (('se', self._skipped_exon), ('mxe', self._mutually_exclusive_exon)) return finders
[docs] def alternative_events(self): events = {event_type: {} for event_type in SPLICE_ABBREVS} for exon_i, exon_name in enumerate(self.exons): new_events = self.single_exon_alternative_events( exon_i, exon_name) for key, value in new_events.items(): events[key].update(value) return events
[docs]class EventMaker(object): def __init__(self, junction_exon_triples, db=None, junction_col='junction', exon_col='exon'): """Combine splice junctions into splicing events Parameters ---------- junction_exon_triples : pandas.DataFrame of "exon, direction, junction", e.g.: exon1, upstream, junction12 db : gffutils.FeatureDB Gffutils Database of gene, transcript, and exon features. The exons must be accessible by the id provided on the `exon_col` columns. If not provided, certain splice types which require information about the transcript (AFE, ALE) cannot be annotated. """ self.log = logging.getLogger('EventMaker') self.junction_exon_triples = junction_exon_triples self.junction_exon_triples[CHROM] = self.junction_exon_triples[ junction_col].str.split(':').str.get(1) self.db = db self.junction_col = junction_col self.exon_col = exon_col @property def exon_progress_interval(self): return int(np.ceil(self.n_exons / 100.)) def _maybe_print_exon_progress(self, i): if (i + 1) % self.exon_progress_interval == 0: progress('\t{0}/{1} exons tested ({2:.1f}%)'.format( i + 1, self.n_exons, 100 * (i + 1) / float(self.n_exons)))
[docs] def event_dict_to_df(self, events, exon_names, junction_names): columns = list(exon_names) + list(junction_names) \ + ['exons', 'junctions'] data = pd.DataFrame(index=np.arange(len(events)), columns=columns) for i, (exons, junctions) in enumerate(events.items()): exon_ids = '@'.join(exons) junction_ids = '@'.join(junctions) data.loc[i, exon_names] = list(exons) data.loc[i, junction_names] = list(junctions) data.loc[i, 'exons'] = exon_ids data.loc[i, 'junctions'] = junction_ids data.loc[i, STRAND] = exons[0][-1] return data
[docs] def add_event_id_col(self, events, splice_type): isoform_components = ISOFORM_COMPONENTS[splice_type] events[EVENT_ID] = events.apply( lambda x: '|'.join( '{}={}'.format(isoform, '@'.join( x[list(isoform_components[isoform])])) for isoform in ISOFORM_ORDER), axis=1) events = events.set_index(EVENT_ID) return events
@staticmethod def _get_junction14(row): """Make incompatible junction between exons1 and 4 for MXE Parameters ---------- row : pandas.Series A row containing "junction12" and "junction34" names in the index, which contain outrigger.Region values """ junction12 = row['junction12'] junction34 = row['junction34'] if junction12.strand == "+": return Region(region='junction', chrom=junction12.chrom, start=junction12.start, stop=junction34.stop, strand=junction12.strand) if junction12.strand == "-": return Region(region='junction', chrom=junction12.chrom, start=junction34.start, stop=junction12.stop, strand=junction12.strand) @staticmethod def _get_junction23(row): """Make incompatible junction between exons2 and 3 for MXE Parameters ---------- row : pandas.Series A row containing "junction13" and "junction24" names in the index, which contain outrigger.Region values """ junction13 = row['junction13'] junction24 = row['junction24'] if junction13.strand == "+": return Region(region='junction', chrom=junction13.chrom, start=junction24.start, stop=junction13.stop, strand=junction13.strand) if junction13.strand == "-": return Region(region='junction', chrom=junction13.chrom, start=junction13.start, stop=junction24.stop, strand=junction13.strand)
[docs] def add_incompatible_junctions(self, events, splice_type): """Add junctions that are incompatible with splice type definition""" if splice_type == 'se': events[INCOMPATIBLE_JUNCTIONS] = np.nan elif splice_type == 'mxe': junction12s = events['junction12'].map(Region) junction13s = events['junction13'].map(Region) junction24s = events['junction24'].map(Region) junction34s = events['junction34'].map(Region) junction12_34 = pd.concat([junction12s, junction34s], axis=1) junction13_24 = pd.concat([junction13s, junction24s], axis=1) junction14 = junction12_34.apply(self._get_junction14, axis=1) junction23 = junction13_24.apply(self._get_junction23, axis=1) junction14 = junction14.apply(lambda x: x.name) junction23 = junction23.apply(lambda x: x.name) incompatible_junctions = junction14 + '|' + junction23 events[INCOMPATIBLE_JUNCTIONS] = incompatible_junctions return events
[docs] def find_events(self, splice_types=SPLICE_ABBREVS, n_jobs=-1): """For each exon, test if it is part of an alternative event""" events = {abbrev: {} for abbrev in splice_types} new_events = joblib.Parallel(n_jobs)( joblib.delayed(make_splice_graph_find_events)( df, self.junction_col, self.exon_col, splice_types=splice_types) for chrom, df in self.junction_exon_triples.groupby(CHROM)) for chrom_events in new_events: for key, value in chrom_events.items(): events[key].update(value) progress("Combining all events into large dataframes") events_dfs = dict.fromkeys(events.keys()) for event_type, event_subset in events.items(): exon_numbers = SPLICE_TYPE_ALL_EXONS[event_type] junction_numbers = SPLICE_TYPE_ALL_JUNCTIONS[event_type] events_df = self.event_dict_to_df(event_subset, exon_names=exon_numbers, junction_names=junction_numbers) if not events_df.empty: events_df = self.add_event_id_col(events_df, event_type) events_df = self.add_incompatible_junctions(events_df, event_type) events_dfs[event_type] = events_df done() return events_dfs
[docs]def make_splice_graph_find_events(df, junction_col, exon_col, splice_types=SPLICE_ABBREVS): splice_graph = SpliceGraph(df, junction_col, exon_col, splice_types=splice_types) return splice_graph.alternative_events()