Source code for outrigger.io.gtf

"""
Functions for creating GTF databases using gffutils and using those databases
to annotate alternative events.
"""
from collections import Counter
import itertools
import os

import gffutils
from gffutils.helpers import merge_attributes
import pandas as pd

from ..common import SPLICE_TYPE_ISOFORM_EXONS, OUTRIGGER_DE_NOVO, NOVEL_EXON
from ..region import Region, STRANDS

# Annotations from:
# ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz

# Feature types that keep their native GTF ids (gene_id / transcript_id) and
# therefore never receive a synthetic ``location_id`` attribute
gene_transcript = {'gene', 'transcript'}


def maybe_analyze(db):
    """Run ``ANALYZE`` on the feature database, whichever API is available

    Parameters
    ----------
    db : gffutils.FeatureDB
        Database to analyze. If it has an ``analyze`` attribute that is
        called; otherwise ``ANALYZE features`` is executed through
        ``db.execute``.
    """
    # Probe for the method instead of wrapping the call in try/except, so
    # that an AttributeError raised *inside* a working ``analyze()`` is not
    # mistaken for "method missing" and silently swallowed.
    analyze = getattr(db, 'analyze', None)
    if analyze is None:
        # For compatibility with gffutils<=0.8.7.1
        db.execute('ANALYZE features')
    else:
        # For gffutils >0.8.7.1
        analyze()
def transform(f):
    """Attach a coordinate-based ``location_id`` attribute to a feature

    Genes and transcripts are returned untouched. Every other feature gets
    ``location_id`` set to ``featuretype:seqid:start-stop:strand``, with
    ``:frame`` appended for CDS features.

    Parameters
    ----------
    f : gffutils.Feature
        Feature to annotate; it is modified in place

    Returns
    -------
    f : gffutils.Feature
        The same feature object that was passed in
    """
    if f.featuretype not in gene_transcript:
        location_id = '{}:{}:{}-{}:{}'.format(
            f.featuretype, f.seqid, f.start, f.stop, f.strand)
        if f.featuretype == 'CDS':
            # CDS features at the same location can differ by frame
            location_id = location_id + ':' + f.frame
        f.attributes['location_id'] = [location_id]
    return f
def create_db(gtf_filename, db_filename=None):
    """Build a gffutils database from a GTF file

    Parameters
    ----------
    gtf_filename : str
        GTF file handed to ``gffutils.create_db``
    db_filename : str, optional
        Where to write the database. When None, ``':memory:'`` is used.

    Returns
    -------
    db : gffutils.FeatureDB
        The database, after ``maybe_analyze`` has been run on it
    """
    if db_filename is None:
        db_filename = ':memory:'

    # Genes and transcripts keep their GTF ids; everything else is keyed on
    # the ``location_id`` attribute that ``transform`` adds
    id_spec = {'gene': 'gene_id', 'transcript': 'transcript_id'}
    for featuretype in ('exon', 'CDS', 'start_codon', 'stop_codon', 'UTR'):
        id_spec[featuretype] = 'location_id'

    options = dict(
        merge_strategy='merge',
        id_spec=id_spec,
        transform=transform,
        force=True,
        verbose=True,
        disable_infer_genes=True,
        disable_infer_transcripts=True,
        force_merge_fields=['source'],
    )
    db = gffutils.create_db(gtf_filename, db_filename, **options)
    maybe_analyze(db)
    return db
class SplicingAnnotator(object):
    """Annotates basic features of splicing events: gene ids and names"""

    def __init__(self, db, events, splice_type):
        """Annotate splicing events with their respective genes

        Parameters
        ----------
        db : gffutils.FeatureDB
            Database including all the exons found in the events
        events : pandas.DataFrame
            Table of events, with the event ids as the index
        splice_type : 'se' | 'mxe'
            The type of alternative splicing, which informs the exon
            configurations for different isoforms
        """
        self.db = db
        self.events = events
        self.splice_type = splice_type

        self.isoform_exons = SPLICE_TYPE_ISOFORM_EXONS[
            self.splice_type.lower()]
        # Every exon column used by any isoform, deduplicated and sorted
        self.exon_cols = sorted(set(itertools.chain.from_iterable(
            self.isoform_exons.values())))

        # Make a dataframe with outrigger.Region objects
        self.regions = pd.DataFrame(index=self.events.index)
        self.region_cols = ['{}_region'.format(x) for x in self.exon_cols]
        for exon_col, region_col in zip(self.exon_cols, self.region_cols):
            self.regions[region_col] = self.events[exon_col].map(Region)

        # Make introns and copy-pastable genome locations for the whole event
        intron_regions = self.regions[self.region_cols].apply(
            self.event_introns_regions, axis=1)
        self.regions = pd.concat([self.regions, intron_regions], axis=1)
        self.region_cols.extend(['intron_region', 'event_region'])

        # Add the lengths of exons, introns, event region, and the genome
        # location ("name") of each intron. Column-wise ``Series.map`` is
        # used instead of ``DataFrame.applymap``, which newer pandas
        # deprecates.
        self.lengths = self.regions.apply(lambda column: column.map(len))
        self.lengths.columns = [x.replace('_region', '_length')
                                for x in self.lengths]
        self.lengths = self.lengths.astype(int)

        intron_names = intron_regions.apply(
            lambda column: column.map(lambda region: region.name))
        intron_names.columns = [x.replace('_region', '_location')
                                for x in intron_names]

        self.events = pd.concat([self.events, self.lengths, intron_names],
                                axis=1)

    def maybe_get_feature(self, feature_id):
        """Fetch a feature from the database, creating it if it is missing

        Parameters
        ----------
        feature_id : str
            Id to look up in ``self.db``. When the lookup fails it is parsed
            with ``Region`` to recover chrom, start, stop and strand.

        Returns
        -------
        feature : gffutils.Feature
            The stored feature, or a newly built ``NOVEL_EXON`` feature with
            source ``OUTRIGGER_DE_NOVO`` which is also added to ``self.db``
        """
        try:
            return self.db[feature_id]
        except gffutils.FeatureNotFoundError:
            r = Region(feature_id)
            feature = location_to_feature(self.db, r.chrom, r.start, r.stop,
                                          r.strand, source=OUTRIGGER_DE_NOVO,
                                          featuretype=NOVEL_EXON)
            self.db.update([feature], make_backup=False,
                           id_spec={NOVEL_EXON: 'location_id'},
                           transform=transform)
            return feature

    def attributes(self):
        """Retrieve all GTF attributes for each isoform's event

        Returns
        -------
        df : pandas.DataFrame
            ``self.events`` plus one ``<isoform>_<attribute>`` column per
            attribute key found on an isoform's exons. Each cell is the
            sorted, comma-joined values that occur on every exon of that
            isoform (an empty string when no value is shared).
        """
        # Skip the location IDs which is specific to the outrigger-built
        # database, and the exon ids which will never match up across all
        # exons
        ignore_keys = 'location_id', 'exon_id', 'exon_number'

        lines = []
        for event_id, row in self.events.iterrows():
            # Explicit dtype: the row holds strings, and an untyped empty
            # Series triggers a dtype warning on some pandas versions
            attributes = pd.Series(name=event_id, dtype=object)
            for isoform, exons in self.isoform_exons.items():
                # A list (not a tuple) so ``row[exons]`` is always read as a
                # collection of labels
                exons = list(exons)
                for e in exons:
                    attributes[e] = row[e]

                n_exons = len(exons)
                exon_features = [self.maybe_get_feature(exon_id)
                                 for exon_id in row[exons]]

                keys = set(itertools.chain.from_iterable(
                    exon.attributes.keys() for exon in exon_features))
                for key in keys:
                    if key in ignore_keys:
                        continue

                    # Reuse the features fetched above rather than querying
                    # the database again for every key
                    values = Counter()
                    for exon in exon_features:
                        try:
                            values.update(exon.attributes[key])
                        except KeyError:
                            continue

                    if len(values) > 0:
                        # Only use attributes that came up in for all exons
                        # of the isoform
                        shared = [value for value, count in values.items()
                                  if count == n_exons]
                        new_key = isoform + '_' + key
                        attributes[new_key] = ','.join(sorted(shared))
            lines.append(attributes)

        if lines:
            event_attributes = pd.concat(lines, axis=1).T
        else:
            # No events: pd.concat refuses an empty list
            event_attributes = pd.DataFrame(index=self.events.index)

        df = pd.concat([self.events, event_attributes], axis=1)
        # The exon columns appear in both tables; keep the first copy
        df = df.loc[:, ~df.columns.duplicated()]
        return df

    def exon_bedfiles(self, folder):
        """Write one bed file per region column

        Each file is named after the part of the region column before the
        first underscore (e.g. ``exon1_region`` -> ``exon1.bed``) and holds
        one ``region.to_bed_format(event_id)`` line per event.

        Parameters
        ----------
        folder : str
            Directory to write the bed files into
        """
        for region_col in self.region_cols:
            column = self.regions[region_col]
            # ``Series.items`` exists on every pandas version, whereas
            # ``iteritems`` was removed in pandas 2.0
            lines = (region.to_bed_format(event_id)
                     for event_id, region in column.items())

            name = region_col.split('_')[0]
            basename = name + '.bed'
            filename = os.path.join(folder, basename)
            with open(filename, 'w') as f:
                f.write('\n'.join(lines) + '\n')

    def event_introns_regions(self, exons):
        """Make intron and event regions for an event

        Parameters
        ----------
        exons : list-like of outrigger.Region
            The event's exons in order; only the first and last are used

        Returns
        -------
        regions : pandas.Series
            ``intron_region`` spans the gap between the first and last exon
            and ``event_region`` spans both exons, each as a ``Region``
        """
        # Materialize so first/last are taken by position whether a list or
        # a label-indexed pandas row is passed in
        exons = list(exons)
        first_exon = exons[0]
        last_exon = exons[-1]

        chrom = first_exon.chrom
        strand = first_exon.strand

        if strand == '-':
            intron_stop = first_exon.start
            intron_start = last_exon.stop

            event_start = last_exon.start
            event_stop = first_exon.stop
        else:
            # If strand is positive or undefined
            intron_start = first_exon.stop
            intron_stop = last_exon.start

            event_start = first_exon.start
            event_stop = last_exon.stop

        intron = Region('intron:{chrom}:{start}-{stop}:{strand}'.format(
            chrom=chrom, start=intron_start, stop=intron_stop,
            strand=strand))
        event = Region('event:{chrom}:{start}-{stop}:{strand}'.format(
            chrom=chrom, start=event_start, stop=event_stop, strand=strand))

        regions = pd.Series(dict(intron_region=intron, event_region=event))
        return regions
def location_to_feature(db, chrom, start, stop, strand, source, featuretype):
    """Build an exon feature for a genomic location

    The new feature's attributes are the merged attributes of every gene
    that ``db.region`` returns for the same location and strand.

    Parameters
    ----------
    db : gffutils.FeatureDB
        Database queried for overlapping genes
    chrom : str
        Chromosome (seqid) of the location
    start, stop : int
        Coordinates of the location
    strand : str
        Strand of the location; anything not in ``STRANDS`` becomes ``'.'``
    source : str
        Value for the feature's source field
    featuretype : str
        Value for the feature's featuretype field

    Returns
    -------
    exon : gffutils.Feature
        Feature with id ``exon:<chrom>:<start>-<stop>:<strand>``
    """
    strand = strand if strand in STRANDS else '.'

    feature_id = 'exon:{chrom}:{start}-{stop}:{strand}'.format(
        chrom=chrom, start=start, stop=stop, strand=strand)

    # Fold the attributes of all overlapping genes into one mapping
    merged = {}
    for gene in db.region(seqid=chrom, start=start, end=stop, strand=strand,
                          featuretype='gene'):
        merged = merge_attributes(merged, gene.attributes)

    return gffutils.Feature(chrom, source=source, featuretype=featuretype,
                            start=start, end=stop, strand=strand,
                            id=feature_id, attributes=merged)