Source code for outrigger.index.adjacencies

"""
Find exons adjacent to junctions
"""
import warnings

import joblib

from ..common import JUNCTION_ID, EXON_START, EXON_STOP, CHROM, STRAND, \
    ORDER_BY, UPSTREAM, DOWNSTREAM, NOVEL_EXON, \
    OUTRIGGER_DE_NOVO, MAX_DE_NOVO_EXON_LENGTH
from ..io.gtf import transform, maybe_analyze, location_to_feature
from ..region import Region
from ..util import done, progress

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import pandas as pd


def _unify_strand(strand1, strand2):
    """If strands are equal, return the strand, otherwise return None"""
    if strand1 != strand2:
        strand = '.'
    else:
        strand = strand1
    return strand


def _exons_from_neighboring_junctions(junction, neighbors, side='left'):
    """Returns (chrom, start, stop, strand) of neighboring exons from junctions

    Parameters
    ----------
    junction : outrigger.Region
        A single junction whose neighboring exons you want
    neighbors : pandas.DataFrame
        All junctions which are within

    Returns
    -------
    exons : pandas.Series
        A column containing (chrom, start, stop, strand) for each detected exon
    """
    if neighbors.empty:
        return pd.Series()
    if side == 'left':
        return neighbors.apply(lambda x: (
            x.chrom, x.stop + 1, junction.start - 1,
            _unify_strand(x.strand, junction.strand)), axis=1)
    elif side == 'right':
        return neighbors.apply(lambda x: (
            x.chrom, junction.stop + 1, x.start - 1,
            _unify_strand(x.strand, junction.strand)), axis=1)


def _neighboring_exons(junction, df, side='left',
                       max_de_novo_exon_length=MAX_DE_NOVO_EXON_LENGTH):
    """Get either the left or right neighbors of a particular junction

    Used to find novel exons between junctions. Not part of the
    ExonJunctionAdjacencies object so it can be parallelized with joblib.
    Internal function

    Parameters
    ----------
    junction : outrigger.Region
        A Region object with .start and .stop
    df : pandas.DataFrame
        A data table with "start" and "stop" columns of all junctions
    side : 'left' | 'right'
        Specifies whether you want the left or right side neighbors of a
        junction (not strand-specific)

    Returns
    -------
    neighboring_exon_locations : pandas.Series
        A column containing (chrom, start, stop, strand) for each detected exon
    """
    if side == 'left':
        rows = junction.start - df.stop
    elif side == 'right':
        rows = df.start - junction.stop
    rows = rows[rows > 0]
    rows = rows[rows <= max_de_novo_exon_length]
    neighboring_junctions = df.loc[rows.index]

    neighboring_exon_locations = _exons_from_neighboring_junctions(
        junction, neighboring_junctions, side=side)

    return neighboring_exon_locations


[docs]def is_there_an_exon_here(self, junction1, junction2): """Check if there could be an exon between these two junctions Parameters ---------- junction{1,2} : outrigger.Region Outrigger.Region objects Returns ------- start, stop : (int, int) or (False, False) Start and stop of the new exon if it exists, else False, False """ if junction1.overlaps(junction2): return False, False # These are junction start/stops, not exon start/stops # Move one nt upstream of starts for exon stops, # and one nt downstream of stops for exon starts. option1 = abs(junction1.stop - junction2.start) \ < self.max_de_novo_exon_length option2 = abs(junction2.stop - junction1.start) \ < self.max_de_novo_exon_length if option1: return junction1.stop + 1, junction2.start - 1 elif option2: return junction2.stop + 1, junction1.start - 1 return False, False
[docs]class ExonJunctionAdjacencies(object): """Annotate junctions with neighboring exons (upstream or downstream)""" exon_types = 'exon', NOVEL_EXON def __init__(self, metadata, db, junction_id=JUNCTION_ID, exon_start=EXON_START, exon_stop=EXON_STOP, chrom=CHROM, strand=STRAND, max_de_novo_exon_length=MAX_DE_NOVO_EXON_LENGTH, n_jobs=-1): """Initialize class to get upstream/downstream exons of junctions Parameters ---------- metadata : pandas.DataFrame A table of splice junctions with the columns indicated by the variables `junction_id`, `exon_start`, `exon_stop`, `chrom`, `strand` db : gffutils.FeatureDB Gffutils Database of gene, transcript, and exon features. junction_id, exon_start, exon_stop, chrom, strand : str Columns in `metadata` """ columns = junction_id, exon_start, exon_stop, chrom, strand for column in columns: if column not in metadata: raise ValueError('The required column {} is not in the splice ' 'junction dataframe'.format(column)) self.metadata = metadata.set_index(junction_id) self.metadata = self.metadata.sort_index() self.junction_id = junction_id self.exon_start = exon_start self.exon_stop = exon_stop self.chrom = chrom self.strand = strand self.db = db progress('\tLooking up which exons are already defined ...') self.existing_exons = set( i['id'] for i in self.db.execute( 'select id from features where featuretype = "exon"')) done(n_tabs=3) self.max_de_novo_exon_length = max_de_novo_exon_length self.n_jobs = n_jobs
[docs] def detect_exons_from_junctions(self): """Find exons based on gaps in junctions""" junctions = pd.Series(self.metadata.index.map(Region), name='region', index=self.metadata.index) junctions = junctions.to_frame() junctions['chrom'] = junctions['region'].map(lambda x: x.chrom) junctions['start'] = junctions['region'].map(lambda x: x.start) junctions['stop'] = junctions['region'].map(lambda x: x.stop) junctions['strand'] = junctions['region'].map(lambda x: x.strand) for chrom, df in junctions.groupby('chrom'): # Only get left-adjacent novel exons since there has to be a # junction on both sides, and since we iterate over ALL junctions, # if we get all left and right exons for all junctions, we're # double-counting exons progress('\tFinding all exons on chromosome {chrom} ' '...'.format(chrom=chrom)) exon_locations = pd.concat(joblib.Parallel(n_jobs=self.n_jobs)( joblib.delayed(_neighboring_exons)(junction, df, 'left') for junction in df.region)) done(n_tabs=3) progress('\t\tFiltering for only novel exons on chromosome ' '{chrom} ...'.format(chrom=chrom)) novel_exons = set(x for x in exon_locations if 'exon:{}:{}-{}:{}'.format(*x) not in self.existing_exons) done(n_tabs=4) progress('\t\tCreating gffutils.Feature objects for each novel ' 'exon, plus potentially its overlapping gene') exon_features = [location_to_feature(self.db, *x, source=OUTRIGGER_DE_NOVO, featuretype=NOVEL_EXON) for x in novel_exons] done(n_tabs=4) progress('\t\tUpdating gffutils database with {n} novel exons on ' 'chromosome {chrom} ...'.format(chrom=chrom, n=len(novel_exons))) try: self.db.update(exon_features, make_backup=False, id_spec={NOVEL_EXON: 'location_id'}, transform=transform) except ValueError: progress('\tNo novel exons found on chromosome ' '{chrom}'.format(chrom=chrom)) done(n_tabs=4) # For up to 1000x faster queries, re-Analyze the database now that it # has been updated maybe_analyze(self.db)
[docs] def write_de_novo_exons(self, filename='novel_exons.gtf'): """Write all de novo exons to a gtf""" with open(filename, 'w') as f: novel_exons = self.db.features_of_type(NOVEL_EXON, order_by=ORDER_BY) for noveL_exon in novel_exons: f.write(str(noveL_exon) + '\n')
@staticmethod def _single_junction_exon_triple(direction_ind, direction, exon_id): """Create exon, direction, junction triples for an exon + its junctions Parameters ---------- direction_ind : pandas.Series A boolean series of the indices of the junctions matching with the provided exon. The index of the series must be the junction ID direction : str The direction of the exon relative to the junctions, either "upstream" or "downstream" exon_id : str Unique identifier of the exon Returns ------- triples : pandas.DataFrame A (n, 3) sized dataframe of an exon and its adjacent junctions """ length = direction_ind.sum() exons = [exon_id] * length directions = [direction] * length junctions = direction_ind[direction_ind].index return pd.DataFrame(list(zip(exons, directions, junctions)), columns=['exon', 'direction', 'junction']) @staticmethod def _to_stranded_transcript_adjacency(adjacent_in_genome, strand): """If negative strand, swap the upstream/downstream adjacency Parameters ---------- adjacent_in_genome : dict dict of two keys, "upstream" and "downstream", mapping to a boolean series indicating whether the junction is upstream or downstream of a particular exon strand : "-" | "+" Positive or negative strand """ if strand == '+': return {UPSTREAM: adjacent_in_genome[UPSTREAM], DOWNSTREAM: adjacent_in_genome[DOWNSTREAM]} elif strand == '-': return {UPSTREAM: adjacent_in_genome[DOWNSTREAM], DOWNSTREAM: adjacent_in_genome[UPSTREAM]} else: # If strand is unknown, put both upstream and downstream for each # side adjacent = pd.concat(adjacent_in_genome.values()) return {UPSTREAM: adjacent, DOWNSTREAM: adjacent} def _junctions_genome_adjacent_to_exon(self, exon): """Get indices of junctions next to an exon, in genome coordinates""" chrom_ind = self.metadata[self.chrom] == exon.chrom strand_ind = self.metadata[self.strand] == exon.strand common_ind = chrom_ind & strand_ind upstream_in_genome = \ common_ind & (self.metadata[self.exon_stop] == exon.stop) downstream_in_genome = \ common_ind & (self.metadata[self.exon_start] == exon.start) return {UPSTREAM: upstream_in_genome, DOWNSTREAM: downstream_in_genome}
[docs] def junctions_adjacent_to_this_exon(self, exon): """Get junctions adjacent to this exon Parameters ---------- exon : gffutils.Feature An item in a gffutils database """ dfs = [] adjacent_in_genome = self._junctions_genome_adjacent_to_exon(exon) adjacent_in_transcriptome = self._to_stranded_transcript_adjacency( adjacent_in_genome, exon.strand) exon_id = exon.id for direction, ind in adjacent_in_transcriptome.items(): if ind.any(): df = self._single_junction_exon_triple(ind, direction, exon_id) dfs.append(df) if len(dfs) > 0: return pd.concat(dfs, ignore_index=True) else: return pd.DataFrame()
[docs] def upstream_downstream_exons(self): """Get upstream and downstream exons of each junction The "upstream" and "downstream" is relative to the **junction**, e.g. exonA upstream junctionX exonB downstream junctionX should be read as "exonA is upstream of juction X" and "exonB is downstream of junctionX" Use junctions defined in ``sj_metadata`` and exons in ``db`` to create triples of (exon, direction, junction), which are read like (subject, object, verb) e.g. ('exon1', 'upstream', 'junction12'), for creation of a graph database. Parameters ---------- sj_metadata : pandas.DataFrame A splice junction metadata dataframe with the junction id as the index, with columns defined by variables ``exon_start`` and ``exon_stop``. db : gffutils.FeatureDB A database of gene annotations created by gffutils. Must have features of type "exon" exon_start : str, optional Name of the column in sj_metadata corresponding to the start of the exon exon_stop : str, optional Name of the column in sj_metadata corresponding to the end of the exon Returns ------- junction_exon_triples : pandas.DataFrame A three-column dataframe describing the relationship of where an exon is relative to junctions """ n_exons = sum(1 for _ in self.db.features_of_type(self.exon_types)) dfs = [] progress('Starting annotation of all junctions with known ' 'neighboring exons ...') for i, exon in enumerate(self.db.features_of_type(self.exon_types)): if (i + 1) % 10000 == 0: progress('\t{}/{} exons completed'.format(i + 1, n_exons)) df = self.junctions_adjacent_to_this_exon(exon) dfs.append(df) junction_exon_triples = pd.concat(dfs, ignore_index=True) done() return junction_exon_triples