Source code for outrigger.psi.compute

import logging

import joblib
import pandas as pd

from ..common import INCOMPATIBLE_JUNCTIONS, MIN_READS, \
    UNEVEN_COVERAGE_MULTIPLIER, SAMPLE_ID, EVENT_ID, NOTES, PSI
from ..util import progress


logging.basicConfig()

idx = pd.IndexSlice


def _scale(x, n_junctions, method='mean'):
    if method == 'mean':
        return x.sum()/float(n_junctions)
    elif method == 'min':
        return x.min()


def _single_sample_maybe_sufficient_reads(isoform1, isoform2, n_junctions,
                                          min_reads, case, letters='ab'):
    """Check if the sum of reads is enough compared to number of junctions

    Parameters
    ----------
    isoform1, isoform2 : pandas.Series
        Number of reads found for a single sample's isoform1 and isoform2
        junctions
    n_junctions : int
        Total number of junctions, also len(isoform1) + len(isoform2)
    min_reads : int
        Minimum number of reads per junction
    case : str
        English explanation with case number for why an event was rejected or
        not. Must contain curly braces ("{}") to place the letter into.
    letters : str, optional
        Cases could have multiple clauses, by default the case is "option a" or
         "option b" but other letters could be used. (default='ab')

    Returns
    -------
    isoform1, isoform2 : pandas.Series or None
        If the sum of reads on all isoforms is equal to or greater than the
        minimum number of reads times the number of junctions, then return the
        event. Otherwise, there are insufficient reads and the event is
        rejected.
    """
    if (isoform1.sum() + isoform2.sum()) >= (min_reads * n_junctions):
        # Case 5a: There are sufficient junction reads
        return isoform1, isoform2, '{case}: There are sufficient junction ' \
                                   'reads'.format(case=case.format(letters[0]))
    else:
        # Case 5b: There are insufficient junction reads
        return None, None, '{case}: There are insufficient junction ' \
                           'reads'.format(case=case.format(letters[1]))


def _single_sample_check_unequal_read_coverage(
        isoform, uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER):
    """If one junction of an isoform is more heavily covered, reject it

    If the difference in read depth between two junctions of an isoform is
    higher than a multiplicative amount, reject the isoform.

    If the isoform only has one junction, don't reject it.

    Possible fallacy: assumes there can be at most two junctions per isoform...

    Parameters
    ----------
    isoform : pandas.Series
        Number of exon-exon junction-spanning reads found for an isoform
    uneven_coverage_multiplier : int
        Scale factor for the maximum amount bigger one side of a junction can
        be before rejecting the event, e.g. for an SE event with two junctions,
        junction12 and junction23, junction12=40 but junction23=500, then this
        event would be rejected because 500 > 40*10

    Returns
    -------
    isoform : pandas.Series or None
        If not rejected, the original is returned, otherwise return None
    """

    if len(isoform) == 1:
        return isoform

    multiplied = isoform * uneven_coverage_multiplier

    junction0 = isoform.iloc[0]
    junction1 = isoform.iloc[1]

    if junction0 > junction1 and junction0 > multiplied.iloc[1]:
        return None
    elif junction1 > junction0 and junction1 > multiplied.iloc[0]:
        return None
    else:
        return isoform


def _maybe_reject(reads, isoform1_ids, isoform2_ids, incompatible_ids,
                  n_junctions, min_reads=MIN_READS,
                  uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER):
    """Remove samples with reads that are incompatible with event definition

    Parameters
    ----------
    reads : pandas.DataFrame
        A (n_samples, n_junctions) table of the number of reads found in each
        samples' exon-exon junctions
    isoform1_ids : list of str
        Column names in ``reads`` tha correspond to the junction ids that are
        contained within isoform 1, e.g. ['junction1:chr1:100-400:+']
    isoform2_ids :
        Column names in ``reads`` tha correspond to the junction ids that are
        contained within isoform 2, e.g. ['junction:chr1:100-200:+',
        'junction:chr1:300-400:+']
    incompatible_ids : list of str
        Column names in ``reads`` tha correspond to the junction ids that are
        contained within junctions that are not compatible with the event
        definition
    n_junctions : int
        Total number of legal junctions,
        i.e. len(isoform1_ids) + len(isoform2_ids)
    min_reads : int
        Minimum number of reads for a junction to be viable. The rules
        governing compatibility of events are complex, and it is recommended to
        read the documentation for ``outrigger psi``
    uneven_coverage_multiplier : int
        Scale factor for the maximum amount bigger one side of a junction can
        be before rejecting the event, e.g. for an SE event with two junctions,
        junction12 and junction23, junction12=40 but junction23=500, then this
        event would be rejected because 500 > 40*10

    Returns
    -------

    """
    original_samples = reads.index
    if isinstance(incompatible_ids, list):
        incompatible_junctions_with_coverage = reads[incompatible_ids] \
                                               >= min_reads
        samples_with_incompatible_coverage = \
            incompatible_junctions_with_coverage.any(axis=1)
        reads = reads.loc[~samples_with_incompatible_coverage]

        # Make a dataframe with notes explaining why
        index = samples_with_incompatible_coverage[
            samples_with_incompatible_coverage].index
        columns = reads.columns
        incompatible_coverage = pd.DataFrame(None, index=index,
                                             columns=columns)
        incompatible_coverage[NOTES] = 'Case 1: >= {} reads on junctions ' \
                                       'that are incompatible with the ' \
                                       'annotation'.format(
            min_reads)

    maybe_rejected = reads.apply(
        lambda sample: _single_maybe_reject(
            sample, isoform1_ids, isoform2_ids,
            n_junctions=n_junctions, min_reads=min_reads,
            uneven_coverage_multiplier=uneven_coverage_multiplier), axis=1)
    if isinstance(incompatible_ids, list):
        all_rejected = pd.concat([maybe_rejected, incompatible_coverage])
    else:
        all_rejected = maybe_rejected

    # Return rejected or not samples in the same order as they were given
    all_rejected = all_rejected.loc[original_samples]
    return all_rejected


def _single_maybe_reject(
        sample, isoform1_ids, isoform2_ids, n_junctions, min_reads=MIN_READS,
        uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER):
    """Given a row of junction reads, return a filtered row of reads

    For a single sample's junction reads of an isoform, check if they should be
    rejected, and if they are, return a row with all NAs for the reads. Always
    include the case by which the reads were or were not rejected

    Parameters
    ----------
    sample : pandas.Series
        A single sample's junction reads across all isoforms of an alternative
        event
    isoform1_ids : list
        List of strings that correspond to indicies in ``sample``
    isoform2_ids : list
        List of strings that correspond to indicies in ``sample``
    n_junctions : int
        Total number of junctions expected in the splicing event
    min_reads : int, optional
        Minimum number of junction reads for an event to be valid. See
        documentation for much more detailed information regarding when events
        are rejected or retained. (default=10)
    uneven_coverage_multiplier : int, optional
        When checking for uneven coverage between two sides of a junction, one
        side must be this amount bigger than the other side to be rejected.
        For example, if one side has 10x (default) more read coverage than the
        other, then reject the event. (default=10)

    Returns
    -------
    single_maybe_rejected : pandas.Series
        Unrejected reads of a single samples' splicing event. If the event was
        rejected, the reads are replaced with NAs. This series has one more
        field than the input "sample", with the field of "notes" that explains
        why this event was or was not rejected.
    """
    isoform1, isoform2, case = _single_isoform_maybe_reject(
            sample[isoform1_ids], sample[isoform2_ids],
            n_junctions=n_junctions, min_reads=min_reads,
            uneven_coverage_multiplier=uneven_coverage_multiplier)
    if isoform1 is None:
        single_maybe_rejected = pd.Series(None, index=sample.index)
    else:
        single_maybe_rejected = sample
    single_maybe_rejected[NOTES] = case
    return single_maybe_rejected


def _single_isoform_maybe_reject(
        isoform1, isoform2, n_junctions, min_reads=MIN_READS,
        uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER):
    """Given junction reads of isoform1 and isoform2, remove if they are bad

    Parameters
    ----------
    isoform1, isoform2 : pandas.Series
        Number of reads found on exon-exon junctions for isoform1 and isoform2
    n_junctions : int
        Total number of junctions. Could also be found by
        len(isoform1) + len(isoform2)
    min_reads : int, optional
        Minimum number of reads for a junction to be counted, though the full
        explanation is a little more complicated, please see the documentation
        for more details. (default=10)
    uneven_coverage_multiplier : int, optional
        Scale factor for the maximum amount bigger one side of a junction can
        be before rejecting the event, e.g. for an SE event with two junctions,
        junction12 and junction23, junction12=40 but junction23=500, then this
        event would be rejected because 500 > 40*10

    Returns
    -------
    isoform1, isoform2 : pandas.Series or None
        If the event was not rejected, return the original event, otherwise
        return None
    case : str
        Reason for rejecting or retaining the event
    """
    # Make aliases for commonly used variables
    zero1 = isoform1 == 0
    zero2 = isoform2 == 0
    sufficient1 = isoform1 >= min_reads
    sufficient2 = isoform2 >= min_reads
    insufficient1 = isoform1 < min_reads
    insufficient2 = isoform2 < min_reads

    if zero1.all() and zero2.all():
        return None, None, 'Case 2: Zero observed reads'
    elif insufficient1.all() and insufficient2.all():
        # Case 12: isoform1 and isoform2 don't have sufficient reads
        return None, None, "Case 3: All junctions with insufficient reads"
    elif ((sufficient1.sum() < len(isoform1)) and insufficient2.all()) or \
            (insufficient1.all() and (sufficient2.sum() < len(isoform2))):
        # Case 13: isoform1 and isoform2 don't have sufficient reads
        return None, None, "Case 4: Only one junction with sufficient reads"
    # elif (zero1.any() and ~zero1.all()) or (zero2.any() and ~zero2.all()):
    #     return None, None, "Case 5: One or more junction is zero, but is " \
    #                        "incompatible with annotation"

    pseudo1 = isoform1 + 1
    pseudo2 = isoform2 + 1

    unequal1 = _single_sample_check_unequal_read_coverage(
        pseudo1, uneven_coverage_multiplier)
    unequal2 = _single_sample_check_unequal_read_coverage(
        pseudo2, uneven_coverage_multiplier)

    if unequal1 is None or unequal2 is None:
        return None, None, "Case 5: Unequal read coverage (one side has at " \
                           "least {}x more reads)".format(
                                uneven_coverage_multiplier)
    elif sufficient1.all() and zero2.all():
        return isoform1, isoform2, 'Case 6: Exclusion'
    elif zero1.all() and sufficient2.all():
        return isoform1, isoform2, 'Case 7: Inclusion'
    elif sufficient1.all() and sufficient2.all():
        return isoform1, isoform2, 'Case 8: Sufficient reads on all junctions'
    elif sufficient1.all() and insufficient2.any():
        return _single_sample_maybe_sufficient_reads(
            isoform1, isoform2, n_junctions, min_reads,
            'Case 9{}: Isoform1 with sufficient reads but Isoform2 has 1+ '
            'junctions with insufficient reads')
    elif insufficient1.any() and sufficient2.all():
        return _single_sample_maybe_sufficient_reads(
            isoform1, isoform2, n_junctions, min_reads,
            'Case 10{}: Isoform1 has 1+ junction with insufficient reads but '
            'Isoform2 with sufficient reads')
    elif (insufficient1.any() and sufficient1.any()) or \
            (insufficient2.any() and sufficient2.any()):
        return _single_sample_maybe_sufficient_reads(
            isoform1, isoform2, n_junctions, min_reads,
            'Case 11{}: Isoform1 and Isoform2 each have both sufficient and '
            'insufficient junctions')

    # If none of these is true, then there's some uncaught case
    return None, None, "Case ???"


def _make_summary_columns(isoform1_junction_numbers,
                          isoform2_junction_numbers,
                          incompatible_junctions=None):
    isoform1_numbers = ['isoform1_' + x for x in isoform1_junction_numbers]
    isoform2_numbers = ['isoform2_' + x for x in isoform2_junction_numbers]
    column_order = [SAMPLE_ID, EVENT_ID] + isoform1_numbers \
        + isoform2_numbers + [PSI, NOTES]
    if isinstance(incompatible_junctions, list):
        incompatible_numbers = ['incompatible_junction{}'.format(i)
                                for i, x in enumerate(incompatible_junctions)]
        column_order += incompatible_numbers

    return column_order


def _summarize_event(event_id, reads, maybe_rejected, psi,
                     isoform1_junction_ids, isoform2_junction_ids,
                     isoform1_junction_numbers, isoform2_junction_numbers,
                     incompatible_junctions=None):
    """Make table summarizing junction reads, psi, and notes for an event

    Parameters
    ----------
    event_id : str
        Uniquely identifying string for a splicing event
    reads : pandas.DataFrame
        A (n_samples, n_junctions) table of the number of reads found in each
        samples' exon-exon junctions for this event
    maybe_rejected : pandas.DataFrame
        A (n_samples, n_junctions + 1) table that is nearly a copy of ``reads``
        except has rejected events' junction reads replaced with NAs, and for
        all events, also has a column called "notes" which has the reason the
        event was or was not rejected
    psi : pandas.Series
        A (n_samples,) sized column of the percent spliced-in values for each
        sample, for this event
    isoform1_junction_ids : list of str
        Column names in ``reads`` tha correspond to the junction ids that are
        contained within isoform 1, e.g. ['junction1:chr1:100-400:+']
    isoform2_junction_ids : list of str
        Column names in ``reads`` tha correspond to the junction ids that are
        contained within isoform 2, e.g. ['junction:chr1:100-200:+',
        'junction:chr1:300-400:+']
    isoform1_junction_numbers : list of str
         Junction numbers corresponding to isoform 1, e.g. ['junction13']
    isoform2_junction_numbers : list of str
        Junction numbers corresponding to isoform 2, e.g. ['junction12',
        'junction23']

    Returns
    -------
    summary : pandas.DataFrame
        A (n_samples, 7) shaped table with the sample id, junction reads,
        percent spliced-in (Psi), and notes on each event in each sample, that
        explain why or why not Psi was calculated
    """
    summary_columns = _make_summary_columns(isoform1_junction_numbers,
                                            isoform2_junction_numbers,
                                            incompatible_junctions)

    if not maybe_rejected.empty:
        column_renamer = dict(zip(isoform1_junction_ids,
                                  [x for x in summary_columns
                                   if x.startswith('isoform1_')]))
        column_renamer.update(dict(zip(isoform2_junction_ids,
                                       [x for x in summary_columns
                                        if x.startswith('isoform2_')])))
        if isinstance(incompatible_junctions, list):
            column_renamer.update(dict(zip(incompatible_junctions,
                                           [x for x in summary_columns
                                            if x.startswith('incompatible')])))

        summary = reads.rename(columns=column_renamer)
        summary[NOTES] = maybe_rejected[NOTES]
        summary[PSI] = psi
        summary = summary.reset_index()
        summary[EVENT_ID] = event_id
        summary.columns.name = None

        summary = summary[summary_columns]
    else:
        summary = pd.DataFrame(columns=summary_columns)
    return summary


def _single_event_psi(event_id, event_df, reads2d,
                      isoform1_junction_numbers, isoform2_junction_numbers,
                      min_reads=MIN_READS, method='mean',
                      uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER):
    """Calculate percent spliced in for a single event across all samples

    Parameters
    ----------
    event_id : str
        Uniquely identifying string for a splicing event
    event_df : pandas.DataFrame
        A table with the event id as the index (row names) and the junction
        locations for the different isoforms. This may have multiple rows (or
        not) depending on the different widths of the flanking exons
    junction_reads_2d : pandas.DataFrame
        A (n_samples, n_total_junctions) table of the number of reads found in
        all samples' exon-exon, all junctions. Very very large, e.g.
        1000 samples x 50,000 junctions = 50 million elements
    isoform1_junction_numbers : list of str
         Junction numbers corresponding to the isoform,
         e.g. ``['junction13']``. Must be columns in ``event_df``
    isoform2_junction_numbers : list of str
        Junction numbers corresponding to the isoform, e.g. ``['junction12',
        'junction23']``. Must be columns in ``event_df``
    min_reads : int
        Minimum number of reads for a junction to be viable. The rules
        governing compatibility of events are complex, and it is recommended to
        read the documentation for ``outrigger psi``
    method : "mean" | "min"
        Denotes the method by which to aggregate junctions from the same
        isoform - either use the mean (default) or the minimum
    uneven_coverage_multiplier : int
        Scale factor for the maximum amount bigger one side of a junction can
        be before rejecting the event, e.g. for an SE event with two junctions,
        junction12 and junction23, junction12=40 but junction23=500, then this
        event would be rejected because 500 > 40*10

    Returns
    -------
    summary : pandas.DataFrame
        A (n_samples, 7) shaped table with the sample id, junction reads,
        percent spliced-in (Psi), and notes on each event in each sample, that
        explain why or why not Psi was calculated

    >>> # Example Summary output
    >>> summary.head()
                                               sample_id  junction13  \
    0  CAV_LP_Ipsi_tdTpos_cell_1-SRR2140356-GSM184094...          26
    1  CAV_LP_Ipsi_tdTpos_cell_10-SRR2140365-GSM18409...           0
    2  CAV_LP_Ipsi_tdTpos_cell_11-SRR2140366-GSM18409...          52
    3  CAV_LP_Ipsi_tdTpos_cell_12-SRR2140367-GSM18409...           0
    4  CAV_LP_Ipsi_tdTpos_cell_13-SRR2140368-GSM18409...          31

       junction12  junction23                      notes  psi  \
    0           0           0  Case 3: Perfect exclusion  0.0
    1           0           0  Case 2: No observed reads  NaN
    2           0           0  Case 3: Perfect exclusion  0.0
    3           0           0  Case 2: No observed reads  NaN
    4           0           0  Case 3: Perfect exclusion  0.0

                                                event_id
    0  isoform1=junction:chr10:128491034-128491719:-|...
    1  isoform1=junction:chr10:128491034-128491719:-|...
    2  isoform1=junction:chr10:128491034-128491719:-|...
    3  isoform1=junction:chr10:128491034-128491719:-|...
    4  isoform1=junction:chr10:128491034-128491719:-|...

    """
    junction_locations = event_df.iloc[0]

    n_junctions1 = len(isoform1_junction_numbers)
    n_junctions2 = len(isoform2_junction_numbers)
    n_junctions = n_junctions1 + n_junctions2

    isoform1_junction_ids = junction_locations[
        isoform1_junction_numbers].tolist()
    isoform2_junction_ids = junction_locations[
        isoform2_junction_numbers].tolist()
    incompatible_junctions = junction_locations[INCOMPATIBLE_JUNCTIONS]

    junction_cols = isoform1_junction_ids + isoform2_junction_ids

    # If this event from the index doesn't exist in the dataset, return an
    # empty dataframe
    junctions_in_data = reads2d.columns.intersection(junction_cols)
    if len(junctions_in_data) < n_junctions:
        summary_columns = _make_summary_columns(isoform1_junction_numbers,
                                                isoform2_junction_numbers,
                                                incompatible_junctions)
        return pd.DataFrame(columns=summary_columns)

    if not isinstance(incompatible_junctions, float):
        incompatible_junctions = incompatible_junctions.split('|')
        incompatible_junctions = reads2d.columns.intersection(
            incompatible_junctions).tolist()
        junction_cols += incompatible_junctions

    reads = reads2d[junction_cols]

    maybe_rejected = _maybe_reject(
        reads, isoform1_junction_ids, isoform2_junction_ids,
        incompatible_junctions, n_junctions, min_reads=min_reads,
        uneven_coverage_multiplier=uneven_coverage_multiplier)

    isoform1 = maybe_rejected[isoform1_junction_ids].apply(
        _scale, n_junctions=n_junctions1, method=method, axis=1)
    isoform2 = maybe_rejected[isoform2_junction_ids].apply(
        _scale, n_junctions=n_junctions2, method=method, axis=1)

    psi = isoform2 / (isoform2 + isoform1)

    summary = _summarize_event(event_id, reads, maybe_rejected, psi,
                               isoform1_junction_ids, isoform2_junction_ids,
                               isoform1_junction_numbers,
                               isoform2_junction_numbers,
                               incompatible_junctions=incompatible_junctions)
    return summary


def _maybe_parallelize_psi(
        event_annotation, reads2d, isoform1_junctions,
        isoform2_junctions, min_reads=MIN_READS, method='mean',
        uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER, n_jobs=-1):
    """If n_jobs!=1, run the parallelized version of psi

    Parameters
    ----------
    event_annotation : pandas.DataFrame
        A table of all possible events, with event ids as the index (row names)
        and all junctions described, and contains the columns described by
        ``isoform1_junctions`` and ``isoform_junctions``
    reads2d : pandas.DataFrame
        A (n_samples, n_total_junctions) table of the number of reads found in
        all samples' exon-exon, all junctions. Very very large, e.g.
        1000 samples x 50,000 junctions = 50 million elements
    isoform1_junctions : list of str
        Junction numbers corresponding to isoform 1, e.g. ['junction13']
    isoform2_junctions : list of str
        Junction numbers corresponding to isoform 2, e.g. ['junction12',
        'junction23']
    min_reads : int, optional
        Minimum number of reads for a junction to be viable. The rules
        governing compatibility of events are complex, and it is recommended to
        read the documentation for ``outrigger psi`` (default=10)
    method : "mean" | "min", optional
        Denotes the method by which to aggregate junctions from the same
        isoform - either use the mean (default) or the minimum.
        (default="mean")
    uneven_coverage_multiplier : int, optional
        Scale factor for the maximum amount bigger one side of a junction can
        be before rejecting the event, e.g. for an SE event with two junctions,
        junction12 and junction23, junction12=40 but junction23=500, then this
        event would be rejected because 500 > 40*10 (default=10)
    n_jobs : int, optional
        Number of subprocesses to create. Default is -1, which is to use as
        many processes/cores as possible

    Returns
    -------
    summary : pandas.DataFrame
        A (n_samples * n_events, 7) shaped table with the sample id, junction
        reads, percent spliced-in (Psi), and notes on each event in each
        sample, that explains why or why not Psi was calculated
    """
    # There are multiple rows with the same event id because the junctions
    # are the same, but the flanking exons may be a little wider or shorter,
    # but ultimately the event Psi is calculated only on the junctions so the
    # flanking exons don't matter for this. But, all the exons are in
    # exon\d.bed in the index! And you, the lovely user, can decide what you
    # want to do with them!
    grouped = event_annotation.groupby(level=0, axis=0)

    n_events = len(grouped.size())

    if n_jobs == 1:
        # Do a separate branch because joblib doesn't do a good job of
        # managing the python debugger so use --n-jobs=1 (n_jobs=1) when
        # debugging
        progress('\tIterating over {} events ...\n'.format(n_events))
        summaries = []
        for event_id, event_df in grouped:
            summary = _single_event_psi(
                event_id, event_df, reads2d,
                isoform1_junctions, isoform2_junctions,
                min_reads=min_reads,
                uneven_coverage_multiplier=uneven_coverage_multiplier,
                method=method)
            summaries.append(summary)
    else:
        processors = n_jobs if n_jobs > 0 else joblib.cpu_count()
        progress("\tParallelizing {} events' Psi calculation across {} "
                 "CPUs ...\n".format(n_events, processors))
        summaries = joblib.Parallel(n_jobs=n_jobs)(
            joblib.delayed(_single_event_psi)(
                event_id, event_df, reads2d,
                isoform1_junctions, isoform2_junctions,
                min_reads=min_reads,
                uneven_coverage_multiplier=uneven_coverage_multiplier,
                method=method)
            for event_id, event_df in grouped)

    return summaries


[docs]def calculate_psi(event_annotation, reads2d, isoform1_junctions, isoform2_junctions, min_reads=MIN_READS, method='mean', uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER, n_jobs=-1): """Compute percent-spliced-in of events based on junction reads Parameters ---------- event_annotation : pandas.DataFrame A table where each row represents a single splicing event. The required columns are the ones specified in `isoform1_junctions`, `isoform2_junctions`, and `event_col`. reads2d : pandas.DataFrame A (n_samples, n_total_junctions) table of the number of reads found in all samples' exon-exon, all junctions. Very very large, e.g. 1000 samples x 50,000 junctions = 50 million elements number of reads observed at a splice junction of a particular sample. isoform1_junctions : list Columns in `event_annotation` which represent junctions that correspond to isoform1, the Psi=0 isoform, e.g. ['junction13'] for SE (junctions between exon1 and exon3) isoform2_junctions : list Columns in `event_annotation` which represent junctions that correspond to isoform2, the Psi=1 isoform, e.g. ['junction12', 'junction23'] (junctions between exon1, exon2, and junction between exon2 and exon3) min_reads : int, optional Minimum number of reads for a junction to be viable. The rules governing compatibility of events are complex, and it is recommended to read the documentation for ``outrigger psi`` (default=10) method : "mean" | "min", optional Denotes the method by which to aggregate junctions from the same isoform - either use the mean (default) or the minimum. (default="mean") uneven_coverage_multiplier : int, optional Scale factor for the maximum amount bigger one side of a junction can be before rejecting the event, e.g. for an SE event with two junctions, junction12 and junction23, junction12=40 but junction23=500, then this event would be rejected because 500 > 40*10 (default=10) n_jobs : int, optional Number of subprocesses to create. Default is -1, which is to use as many processes/cores as possible Returns ------- psi : pandas.DataFrame An (samples, events) dataframe of the percent spliced-in values summary : pandas.DataFrame A (n_samples * n_events, 7) shaped table with the sample id, junction reads, percent spliced-in (Psi), and notes on each event in each sample, that explains why or why not Psi was calculated """ summaries = _maybe_parallelize_psi(event_annotation, reads2d, isoform1_junctions, isoform2_junctions, min_reads, method, uneven_coverage_multiplier, n_jobs) summary = pd.concat(summaries, ignore_index=True) psi = summary.pivot(index=SAMPLE_ID, columns=EVENT_ID, values=PSI) return psi, summary