"""
Find exons adjacent to junctions
"""
import warnings
import joblib
from ..common import JUNCTION_ID, EXON_START, EXON_STOP, CHROM, STRAND, \
ORDER_BY, UPSTREAM, DOWNSTREAM, NOVEL_EXON, \
OUTRIGGER_DE_NOVO, MAX_DE_NOVO_EXON_LENGTH
from ..io.gtf import transform, maybe_analyze, location_to_feature
from ..region import Region
from ..util import done, progress
with warnings.catch_warnings():
warnings.simplefilter("ignore")
import pandas as pd
def _unify_strand(strand1, strand2):
"""If strands are equal, return the strand, otherwise return None"""
if strand1 != strand2:
strand = '.'
else:
strand = strand1
return strand
def _exons_from_neighboring_junctions(junction, neighbors, side='left'):
"""Returns (chrom, start, stop, strand) of neighboring exons from junctions
Parameters
----------
junction : outrigger.Region
A single junction whose neighboring exons you want
neighbors : pandas.DataFrame
All junctions which are within
Returns
-------
exons : pandas.Series
A column containing (chrom, start, stop, strand) for each detected exon
"""
if neighbors.empty:
return pd.Series()
if side == 'left':
return neighbors.apply(lambda x: (
x.chrom, x.stop + 1, junction.start - 1,
_unify_strand(x.strand, junction.strand)), axis=1)
elif side == 'right':
return neighbors.apply(lambda x: (
x.chrom, junction.stop + 1, x.start - 1,
_unify_strand(x.strand, junction.strand)), axis=1)
def _neighboring_exons(junction, df, side='left',
max_de_novo_exon_length=MAX_DE_NOVO_EXON_LENGTH):
"""Get either the left or right neighbors of a particular junction
Used to find novel exons between junctions. Not part of the
ExonJunctionAdjacencies object so it can be parallelized with joblib.
Internal function
Parameters
----------
junction : outrigger.Region
A Region object with .start and .stop
df : pandas.DataFrame
A data table with "start" and "stop" columns of all junctions
side : 'left' | 'right'
Specifies whether you want the left or right side neighbors of a
junction (not strand-specific)
Returns
-------
neighboring_exon_locations : pandas.Series
A column containing (chrom, start, stop, strand) for each detected exon
"""
if side == 'left':
rows = junction.start - df.stop
elif side == 'right':
rows = df.start - junction.stop
rows = rows[rows > 0]
rows = rows[rows <= max_de_novo_exon_length]
neighboring_junctions = df.loc[rows.index]
neighboring_exon_locations = _exons_from_neighboring_junctions(
junction, neighboring_junctions, side=side)
return neighboring_exon_locations
[docs]def is_there_an_exon_here(self, junction1, junction2):
"""Check if there could be an exon between these two junctions
Parameters
----------
junction{1,2} : outrigger.Region
Outrigger.Region objects
Returns
-------
start, stop : (int, int) or (False, False)
Start and stop of the new exon if it exists, else False, False
"""
if junction1.overlaps(junction2):
return False, False
# These are junction start/stops, not exon start/stops
# Move one nt upstream of starts for exon stops,
# and one nt downstream of stops for exon starts.
option1 = abs(junction1.stop - junction2.start) \
< self.max_de_novo_exon_length
option2 = abs(junction2.stop - junction1.start) \
< self.max_de_novo_exon_length
if option1:
return junction1.stop + 1, junction2.start - 1
elif option2:
return junction2.stop + 1, junction1.start - 1
return False, False
[docs]class ExonJunctionAdjacencies(object):
"""Annotate junctions with neighboring exons (upstream or downstream)"""
exon_types = 'exon', NOVEL_EXON
def __init__(self, metadata, db, junction_id=JUNCTION_ID,
exon_start=EXON_START, exon_stop=EXON_STOP,
chrom=CHROM, strand=STRAND,
max_de_novo_exon_length=MAX_DE_NOVO_EXON_LENGTH,
n_jobs=-1):
"""Initialize class to get upstream/downstream exons of junctions
Parameters
----------
metadata : pandas.DataFrame
A table of splice junctions with the columns indicated by the
variables `junction_id`, `exon_start`, `exon_stop`, `chrom`,
`strand`
db : gffutils.FeatureDB
Gffutils Database of gene, transcript, and exon features.
junction_id, exon_start, exon_stop, chrom, strand : str
Columns in `metadata`
"""
columns = junction_id, exon_start, exon_stop, chrom, strand
for column in columns:
if column not in metadata:
raise ValueError('The required column {} is not in the splice '
'junction dataframe'.format(column))
self.metadata = metadata.set_index(junction_id)
self.metadata = self.metadata.sort_index()
self.junction_id = junction_id
self.exon_start = exon_start
self.exon_stop = exon_stop
self.chrom = chrom
self.strand = strand
self.db = db
progress('\tLooking up which exons are already defined ...')
self.existing_exons = set(
i['id'] for i in self.db.execute(
'select id from features where featuretype = "exon"'))
done(n_tabs=3)
self.max_de_novo_exon_length = max_de_novo_exon_length
self.n_jobs = n_jobs
[docs] def detect_exons_from_junctions(self):
"""Find exons based on gaps in junctions"""
junctions = pd.Series(self.metadata.index.map(Region),
name='region', index=self.metadata.index)
junctions = junctions.to_frame()
junctions['chrom'] = junctions['region'].map(lambda x: x.chrom)
junctions['start'] = junctions['region'].map(lambda x: x.start)
junctions['stop'] = junctions['region'].map(lambda x: x.stop)
junctions['strand'] = junctions['region'].map(lambda x: x.strand)
for chrom, df in junctions.groupby('chrom'):
# Only get left-adjacent novel exons since there has to be a
# junction on both sides, and since we iterate over ALL junctions,
# if we get all left and right exons for all junctions, we're
# double-counting exons
progress('\tFinding all exons on chromosome {chrom} '
'...'.format(chrom=chrom))
exon_locations = pd.concat(joblib.Parallel(n_jobs=self.n_jobs)(
joblib.delayed(_neighboring_exons)(junction, df, 'left')
for junction in df.region))
done(n_tabs=3)
progress('\t\tFiltering for only novel exons on chromosome '
'{chrom} ...'.format(chrom=chrom))
novel_exons = set(x for x in exon_locations if
'exon:{}:{}-{}:{}'.format(*x)
not in self.existing_exons)
done(n_tabs=4)
progress('\t\tCreating gffutils.Feature objects for each novel '
'exon, plus potentially its overlapping gene')
exon_features = [location_to_feature(self.db, *x,
source=OUTRIGGER_DE_NOVO,
featuretype=NOVEL_EXON)
for x in novel_exons]
done(n_tabs=4)
progress('\t\tUpdating gffutils database with {n} novel exons on '
'chromosome {chrom} ...'.format(chrom=chrom,
n=len(novel_exons)))
try:
self.db.update(exon_features,
make_backup=False,
id_spec={NOVEL_EXON: 'location_id'},
transform=transform)
except ValueError:
progress('\tNo novel exons found on chromosome '
'{chrom}'.format(chrom=chrom))
done(n_tabs=4)
# For up to 1000x faster queries, re-Analyze the database now that it
# has been updated
maybe_analyze(self.db)
[docs] def write_de_novo_exons(self, filename='novel_exons.gtf'):
"""Write all de novo exons to a gtf"""
with open(filename, 'w') as f:
novel_exons = self.db.features_of_type(NOVEL_EXON,
order_by=ORDER_BY)
for noveL_exon in novel_exons:
f.write(str(noveL_exon) + '\n')
@staticmethod
def _single_junction_exon_triple(direction_ind, direction, exon_id):
"""Create exon, direction, junction triples for an exon + its junctions
Parameters
----------
direction_ind : pandas.Series
A boolean series of the indices of the junctions matching with the
provided exon. The index of the series must be the junction ID
direction : str
The direction of the exon relative to the junctions, either
"upstream" or "downstream"
exon_id : str
Unique identifier of the exon
Returns
-------
triples : pandas.DataFrame
A (n, 3) sized dataframe of an exon and its adjacent junctions
"""
length = direction_ind.sum()
exons = [exon_id] * length
directions = [direction] * length
junctions = direction_ind[direction_ind].index
return pd.DataFrame(list(zip(exons, directions, junctions)),
columns=['exon', 'direction', 'junction'])
@staticmethod
def _to_stranded_transcript_adjacency(adjacent_in_genome, strand):
"""If negative strand, swap the upstream/downstream adjacency
Parameters
----------
adjacent_in_genome : dict
dict of two keys, "upstream" and "downstream", mapping to a boolean
series indicating whether the junction is upstream or downstream of
a particular exon
strand : "-" | "+"
Positive or negative strand
"""
if strand == '+':
return {UPSTREAM: adjacent_in_genome[UPSTREAM],
DOWNSTREAM: adjacent_in_genome[DOWNSTREAM]}
elif strand == '-':
return {UPSTREAM: adjacent_in_genome[DOWNSTREAM],
DOWNSTREAM: adjacent_in_genome[UPSTREAM]}
else:
# If strand is unknown, put both upstream and downstream for each
# side
adjacent = pd.concat(adjacent_in_genome.values())
return {UPSTREAM: adjacent, DOWNSTREAM: adjacent}
def _junctions_genome_adjacent_to_exon(self, exon):
"""Get indices of junctions next to an exon, in genome coordinates"""
chrom_ind = self.metadata[self.chrom] == exon.chrom
strand_ind = self.metadata[self.strand] == exon.strand
common_ind = chrom_ind & strand_ind
upstream_in_genome = \
common_ind & (self.metadata[self.exon_stop] == exon.stop)
downstream_in_genome = \
common_ind & (self.metadata[self.exon_start] == exon.start)
return {UPSTREAM: upstream_in_genome, DOWNSTREAM: downstream_in_genome}
[docs] def junctions_adjacent_to_this_exon(self, exon):
"""Get junctions adjacent to this exon
Parameters
----------
exon : gffutils.Feature
An item in a gffutils database
"""
dfs = []
adjacent_in_genome = self._junctions_genome_adjacent_to_exon(exon)
adjacent_in_transcriptome = self._to_stranded_transcript_adjacency(
adjacent_in_genome, exon.strand)
exon_id = exon.id
for direction, ind in adjacent_in_transcriptome.items():
if ind.any():
df = self._single_junction_exon_triple(ind, direction, exon_id)
dfs.append(df)
if len(dfs) > 0:
return pd.concat(dfs, ignore_index=True)
else:
return pd.DataFrame()
[docs] def upstream_downstream_exons(self):
"""Get upstream and downstream exons of each junction
The "upstream" and "downstream" is relative to the **junction**, e.g.
exonA upstream junctionX
exonB downstream junctionX
should be read as "exonA is upstream of juction X" and "exonB is
downstream of junctionX"
Use junctions defined in ``sj_metadata`` and exons in ``db`` to
create triples of (exon, direction, junction), which are read like
(subject, object, verb) e.g. ('exon1', 'upstream', 'junction12'), for
creation of a graph database.
Parameters
----------
sj_metadata : pandas.DataFrame
A splice junction metadata dataframe with the junction id as the
index, with columns defined by variables ``exon_start`` and
``exon_stop``.
db : gffutils.FeatureDB
A database of gene annotations created by gffutils. Must have
features of type "exon"
exon_start : str, optional
Name of the column in sj_metadata corresponding to the start of the
exon
exon_stop : str, optional
Name of the column in sj_metadata corresponding to the end of the
exon
Returns
-------
junction_exon_triples : pandas.DataFrame
A three-column dataframe describing the relationship of where an
exon is relative to junctions
"""
n_exons = sum(1 for _ in self.db.features_of_type(self.exon_types))
dfs = []
progress('Starting annotation of all junctions with known '
'neighboring exons ...')
for i, exon in enumerate(self.db.features_of_type(self.exon_types)):
if (i + 1) % 10000 == 0:
progress('\t{}/{} exons completed'.format(i + 1, n_exons))
df = self.junctions_adjacent_to_this_exon(exon)
dfs.append(df)
junction_exon_triples = pd.concat(dfs, ignore_index=True)
done()
return junction_exon_triples