Source code for outrigger.io.bam

import collections
import os

import joblib
import numpy as np
import pandas as pd
import pysam

from ..common import UNIQUE_READS, MULTIMAP_READS, READS, CHROM, \
    JUNCTION_START, JUNCTION_STOP, STRAND
from .core import add_exons_and_junction_ids


def _report_read_positions(read, counter):
    chrom = read.reference_name
    strand = '-' if read.is_reverse else '+'

    last_read_pos = False
    for read_loc, genome_loc in read.get_aligned_pairs():
        if read_loc is None and last_read_pos:
            # Add one to be compatible with STAR output and show the
            # start of the intron (not the end of the exon)
            start = genome_loc + 1
        elif read_loc and last_read_pos is None:
            stop = genome_loc  # we are right exclusive ,so this is correct
            counter[(chrom, start, stop, strand)] += 1
            del start
            del stop
        last_read_pos = read_loc


def _choose_strand_and_sum(reads):
    """Use the strand with more counts and sum all reads with same junction

    STAR seems to take a simple majority to decide on strand when there are
    reads mapping to both, so we'll do the same

    Parameters
    ----------
    reads : pandas.Series
        A (chrom, start, stop, strand)-indexed series of read counts

    Returns
    -------
    reads_strand_chosen : pandas.Series
        A (chrom, start, stop, strand)-indexed series of read counts, with
        the majority strand as the "winner" and

    """
    if reads.empty:
        return pd.Series(name=reads.name)
    locations = reads.groupby(level=(0, 1, 2)).idxmax()
    counts = reads.groupby(level=(0, 1, 2)).sum()

    index = pd.MultiIndex.from_tuples(locations.values)

    return pd.Series(counts.values, index=index, name=reads.name)


def _combine_uniquely_multi(uniquely, multi, ignore_multimapping=False):
    """Combine uniquely and multi-mapped read counts into a single table

    Parameters
    ----------
    unqiuely, multi : dict
        A dictionary of {(chrom, start, end, strand) : n_reads} uniquely mapped
        and multi-mapped (reads that could map to multiple parts of the genome)
    ignore_multimapping : bool
        When summing all reads, whether or not to ignore the multimapping
        reads. Default is False.

    Returns
    -------
    reads : pandas.DataFrame
        A combined table of all uniquely and multi-mapped reads, with an
        additional column of "reads" which will ultimately be the reads used
        for creating an outrigger index and calculating percent spliced-in.
    """
    uniquely = pd.Series(uniquely, name=UNIQUE_READS)
    multi = pd.Series(multi, name=MULTIMAP_READS)

    uniquely = _choose_strand_and_sum(uniquely)
    multi = _choose_strand_and_sum(multi)

    # Join the data on the chromosome locations
    if multi.empty:
        reads = uniquely.to_frame()
        reads[MULTIMAP_READS] = np.nan
    elif uniquely.empty:
        reads = multi.to_frame()
        reads[UNIQUE_READS] = np.nan
    else:
        reads = uniquely.to_frame().join(multi)

    reads = reads.fillna(0)
    reads = reads.astype(int)

    if ignore_multimapping:
        reads[READS] = reads[UNIQUE_READS]
    else:
        reads[READS] = reads.sum(axis=1)
    reads = reads.reset_index()
    reads = reads.rename(columns={'level_0': CHROM, 'level_1': JUNCTION_START,
                                  'level_2': JUNCTION_STOP, 'level_3': STRAND})
    reads.index = np.arange(reads.shape[0])
    return reads


def _get_junction_reads(filename):
    """Read a sam file and extract unique and multi mapped junction reads"""
    samfile = pysam.AlignmentFile(filename, "rb")

    # Uniquely mapped reads
    uniquely = collections.Counter()

    # Multimapped reads
    multi = collections.Counter()

    for read in samfile.fetch():
        if "N" in read.cigarstring:
            if read.mapping_quality < 255:
                counter = multi
            else:
                counter = uniquely

            _report_read_positions(read, counter)
    samfile.close()
    return uniquely, multi


[docs]def bam_to_junction_reads_table(bam_filename, ignore_multimapping=False):
    """Create a table of reads for this bam file"""
    uniquely, multi = _get_junction_reads(bam_filename)
    reads = _combine_uniquely_multi(uniquely, multi, ignore_multimapping)

    # Remove "junctions" with same start and stop
    reads = reads.loc[reads[JUNCTION_START] != reads[JUNCTION_STOP]]
    reads.index = np.arange(reads.shape[0])

    reads['sample_id'] = os.path.basename(bam_filename)
    reads = add_exons_and_junction_ids(reads)
    return reads


[docs]def read_multiple_bams(bam_filenames, ignore_multimapping=False, n_jobs=-1):
    dfs = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(
            bam_to_junction_reads_table)(filename, ignore_multimapping)
        for filename in bam_filenames)
    reads = pd.concat(dfs, ignore_index=True)
    return reads