Source code for outrigger.io.star

"""
Read splice junction output files from STAR aligner (SJ.out.tab)
"""
import os

import joblib
import numpy as np
import pandas as pd

from ..common import JUNCTION_ID, JUNCTION_START, JUNCTION_STOP, READS, \
    JUNCTION_MOTIF, EXON_START, EXON_STOP, CHROM, STRAND, ANNOTATED, \
    SAMPLE_ID, UNIQUE_READS, MULTIMAP_READS, MAX_OVERHANG

from .core import add_exons_and_junction_ids

COLUMN_NAMES = (CHROM, JUNCTION_START, JUNCTION_STOP, STRAND,
                JUNCTION_MOTIF, ANNOTATED, UNIQUE_READS, MULTIMAP_READS,
                MAX_OVERHANG)


[docs]def int_to_junction_motif(n): if n == 0: return 'non-canonical' if n == 1: return 'GT/AG' if n == 2: # Negative strand: CT/AC return 'GT/AG' if n == 3: return 'GC/AG' if n == 4: # Negative strand: CT/GC return 'GC/AG' if n == 5: return 'AT/AC' if n == 6: # Negative strand: GT/AT return 'AT/AC'
[docs]def read_sj_out_tab(filename): """Read an SJ.out.tab file as produced by the RNA-STAR aligner into a pandas Dataframe Parameters ---------- filename : str of filename or file handle Filename of the SJ.out.tab file you want to read in Returns ------- sj : pandas.DataFrame Dataframe of splice junctions with the columns, ('chrom', 'junction_start', 'junction_stop', 'strand', 'junction_motif', 'exon_start', 'exon_stop', 'annotated', 'unique_junction_reads', 'multimap_junction_reads', 'max_overhang') """ sj = pd.read_table(filename, header=None, names=COLUMN_NAMES, sep='\s+') sj[JUNCTION_MOTIF] = sj[JUNCTION_MOTIF].map(int_to_junction_motif) # Convert unknown strands to explicitly say "undefined" sj[STRAND] = sj[STRAND].replace(0, 'undefined') # Convert integer strand to symbol # Use index-based replacement because it's 100x faster than map rows = sj.strand == 1 sj.loc[rows, STRAND] = '+' rows = sj.strand == 2 sj.loc[rows, STRAND] = '-' # Translate negative strand intron motifs # rows = sj.strand == '-' # sj.loc[rows, 'intron_motif'] = sj.intron_motif[rows].map( # lambda x: NEG_STRAND_INTRON_MOTIF[x]) sj.annotated = sj.annotated.astype(bool) sj = add_exons_and_junction_ids(sj) return sj
def _read_single_filename(filename, sample_id_func, ignore_multimapping=False): splice_junction = read_sj_out_tab(filename) sample_id = sample_id_func(filename) sample_id = sample_id.split('SJ.out.tab')[0].rstrip('.') splice_junction[SAMPLE_ID] = sample_id if not ignore_multimapping: splice_junction[READS] = splice_junction[UNIQUE_READS] \ + splice_junction[MULTIMAP_READS] else: splice_junction[READS] = splice_junction[UNIQUE_READS] return splice_junction
[docs]def read_multiple_sj_out_tab(filenames, ignore_multimapping=False, sample_id_func=os.path.basename, n_jobs=-1): """Read the splice junction files and return a tall, tidy dataframe Adds a column called "sample_id" based on the basename of the file, minus "SJ.out.tab" Parameters ---------- filenames : iterator A list or other iterator of filenames to read multimapping : bool If True, include the multimapped reads in total read count sample_id_func : function A function to extract the sample id from the filenames Returns ------- metadata : pandas.DataFrame A tidy dataframe, where each row has the observed reads for a sample """ dfs = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_read_single_filename)( filename, sample_id_func, ignore_multimapping) for filename in filenames) splice_junctions = pd.concat(dfs, ignore_index=True) splice_junctions[CHROM] = splice_junctions[CHROM].astype(str) splice_junctions = splice_junctions.sort_values( by=[SAMPLE_ID, CHROM, JUNCTION_START, JUNCTION_STOP]) splice_junctions.index = np.arange(splice_junctions.shape[0]) return splice_junctions
[docs]def make_metadata(spliced_reads, columns=(JUNCTION_ID, CHROM, JUNCTION_START, JUNCTION_STOP, STRAND, ANNOTATED, EXON_START, EXON_STOP)): """Get barebones junction chrom, start, stop, strand information Parameters ---------- spliced_reads : pandas.DataFrame Concatenated SJ.out.tab files created by read_sj_out_tab columns : iterable Which columns to use to make the metadata Returns ------- junctions : pandas.DataFrame A (n_junctions, 9) dataframe containing the columns: - junction_id - chrom - intron_start - intron_stop - exon_start - exon_stop - strand - intron_motif - annotated """ columns = spliced_reads.columns.intersection(columns) metadata = spliced_reads[columns] metadata = metadata.drop_duplicates() metadata.index = np.arange(metadata.shape[0]) # Force all chromosomes to be strings to be compatible with ENSEMBL's # integer chromosomes metadata[CHROM] = metadata[CHROM].astype(str) return metadata