Source code for outrigger.commandline

#!/usr/bin/env python

import argparse
import logging
import os
import pdb
import shutil
import sys
import traceback

import gffutils
import numpy as np
import pandas as pd

from outrigger import __version__
import outrigger.common
from outrigger import util, common
from outrigger.index import events, adjacencies
from outrigger.io import star, gtf, bam
from outrigger.psi import compute
from outrigger.validate import check_splice_sites


OUTPUT = os.path.join('.', 'outrigger_output')
JUNCTION_PATH = os.path.join(OUTPUT, 'junctions')
JUNCTION_READS_PATH = os.path.join(JUNCTION_PATH, 'reads.csv')
JUNCTION_METADATA_PATH = os.path.join(JUNCTION_PATH, 'metadata.csv')
INDEX = os.path.join(OUTPUT, 'index')
EVENTS_CSV = 'events.csv'
METADATA_CSV = 'metadata.csv'
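
# For reference, the constants above correspond to an output layout roughly
# like the following (relative to the directory where outrigger is invoked):
#
#   outrigger_output/
#       junctions/
#           reads.csv        <- JUNCTION_READS_PATH
#           metadata.csv     <- JUNCTION_METADATA_PATH
#       index/               <- INDEX; per-splice-type events.csv files
#                               (e.g. index/se/events.csv) are written here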


class CommandLine(object):
    def __init__(self, input_options=None):
        self.parser = argparse.ArgumentParser(
            description='outrigger ({version}). Calculate "percent-spliced '
                        'in" (Psi) scores of alternative splicing on a '
                        '*de novo*, custom-built splicing index -- '
                        'just for you!'.format(version=__version__))
        self.parser.add_argument(
            '--version', action='version',
            version='outrigger {version}'.format(version=__version__))
        self.subparser = self.parser.add_subparsers(help='Sub-commands')

        # --- Subcommand to build the index of splicing events --- #
        index_parser = self.subparser.add_parser(
            'index',
            help='Build an index of splicing events using a graph database '
                 'on your junction reads and an annotation')

        index_parser.add_argument(
            '-o', '--output', required=False, type=str, action='store',
            default=OUTPUT,
            help='Name of the folder in which to save the output of '
                 '"outrigger index" (default is {}, which is relative to '
                 'the directory where you called the program). You will '
                 'need this folder for the next step, "outrigger psi"'
                 ''.format(OUTPUT))

        index_junctions = index_parser.add_mutually_exclusive_group(
            required=True)
        index_junctions.add_argument(
            '-j', '--sj-out-tab', type=str, action='store', nargs='*',
            help='SJ.out.tab files from STAR aligner output. Not required '
                 'if you specify a compiled junction reads file with '
                 '"--junction-reads-csv"')
        index_junctions.add_argument(
            '-c', '--junction-reads-csv', required=False,
            help="Name of the splice junction file to detect novel exons "
                 "and build an index of alternative splicing events from. "
                 "Not required if you specify SJ.out.tab files with "
                 "'--sj-out-tab'")
        index_junctions.add_argument(
            '-b', '--bam', required=False, nargs='*',
            help="Location of bam files to use for finding events.")

        index_parser.add_argument(
            '-m', '--min-reads', type=int, action='store', required=False,
            default=10,
            help='Minimum number of reads per junction for that junction to '
                 'count in creating the index of splicing events '
                 '(default=10)')
        index_parser.add_argument(
            '--ignore-multimapping', action='store_true',
            help='Applies to STAR SJ.out.tab files only. If this flag is '
                 'used, then do not include reads that mapped to multiple '
                 'locations in the genome, not uniquely to a locus, in the '
                 'read count for a junction. If inputting "bam" files, then '
                 'this means that reads with a mapping quality (MAPQ) of '
                 'less than 255 are considered "multimapped." This is the '
                 'same thing as what the STAR aligner does. By default, '
                 'this is off, and all reads are used.')
        index_parser.add_argument(
            '-l', '--max-de-novo-exon-length',
            default=outrigger.common.MAX_DE_NOVO_EXON_LENGTH, action='store',
            help='Maximum length of an exon detected *de novo* from the '
                 'dataset. This is to prevent multiple-kilobase-long exons '
                 'from being accidentally created. '
                 '(default={})'.format(
                     outrigger.common.MAX_DE_NOVO_EXON_LENGTH))

        gtf_parser = index_parser.add_mutually_exclusive_group(required=True)
        gtf_parser.add_argument(
            '-g', '--gtf-filename', type=str, action='store',
            help="Name of the gtf file you want to use. If a gffutils "
                 "feature database doesn't already exist at this location "
                 "plus '.db' (e.g. if your gtf is gencode.v19.annotation.gtf,"
                 " then the database is inferred to be "
                 "gencode.v19.annotation.gtf.db), then a database will be "
                 "auto-created. Not required if you provide a pre-built "
                 "database with '--gffutils-db'")
        gtf_parser.add_argument(
            '-d', '--gffutils-db', type=str, action='store',
            help="Name of the gffutils database file you want to use. The "
                 "exon IDs defined here will be used in the function when "
                 "creating splicing event names. Not required if you "
                 "provide a gtf file with '--gtf-filename'")

        index_parser.add_argument(
            '--debug', required=False, action='store_true',
            help='If given, print debugging logging information to standard '
                 'out (Warning: LOTS of output. Not recommended unless you '
                 'think something is going wrong)')
        index_parser.add_argument(
            '--n-jobs', required=False, default=-1, action='store', type=int,
            help='Number of threads to use when parallelizing exon finding '
                 'and file reading. Default is -1, which means to use as '
                 'many threads as are available.')
        index_parser.add_argument(
            '--low-memory', required=False, default=False,
            action='store_true',
            help='If set, then use a smaller memory footprint. By default, '
                 'this is off.')
        index_parser.add_argument(
            '--splice-types', required=False, default='all', action='store',
            help='Which splice types to find. By default, "all" are used, '
                 'which at this point means skipped exon (SE) and mutually '
                 'exclusive exon (MXE) events. Can also specify only one, '
                 'e.g. "se", or both, "se,mxe"')

        overwrite_parser = index_parser.add_mutually_exclusive_group(
            required=False)
        overwrite_parser.add_argument(
            '--force', action='store_true',
            help="If the 'outrigger index' command was interrupted, there "
                 "will be intermediate files remaining. If you wish to "
                 "restart outrigger and overwrite them all, use this flag. "
                 "If you want to continue from where you left off, use the "
                 "'--resume' flag. If neither is specified, the program "
                 "exits and complains to the user.")
        overwrite_parser.add_argument(
            '--resume', action='store_true',
            help="If the 'outrigger index' command was interrupted, there "
                 "will be intermediate files remaining. If you want to "
                 "continue from where you left off, use this flag. The "
                 "default action is to do nothing and ask the user for "
                 "input.")
        index_parser.set_defaults(func=self.index)

        # --- Subcommand to validate exons of the built index --- #
        validate_parser = self.subparser.add_parser(
            'validate',
            help='Ensure that the splicing events found all have the '
                 'correct splice sites')
        validate_parser.add_argument(
            '-f', '--fasta', required=True,
            help='Location of the genome fasta file from which to get the '
                 'splice site sequences')
        validate_parser.add_argument(
            '-g', '--genome', required=True,
            help='Either the genome name (e.g. "mm10" or "hg19") or the '
                 'location of the genome chromosome sizes file for '
                 '"bedtools flank", to make sure we do not accidentally ask '
                 'for genome positions that are outside of the defined '
                 'range')
        validate_parser.add_argument(
            '-i', '--index', required=False, default=None,
            help='Name of the folder where you saved the output from '
                 '"outrigger index" (default is {}, which is relative to '
                 'the directory where you called this program, assuming '
                 'you have called "outrigger validate" in the same folder '
                 'as you called "outrigger index").'.format(INDEX))
        validate_parser.add_argument(
            '-o', '--output', required=False, type=str, action='store',
            default=None,
            help='Name of the folder where you saved the output from '
                 '"outrigger index" (default is {}, which is relative to '
                 'the directory where you called the program).'
                 ''.format(OUTPUT))
        validate_parser.add_argument(
            '-s', '--valid-splice-sites', required=False,
            default=check_splice_sites.MAMMALIAN_SPLICE_SITES,
            help="The intron-definition based splice sites that are allowed "
                 "in the data, which are in the format 5'/3' of the intron, "
                 "and separated by commas for different types. Default is "
                 "{}, which are the major and minor spliceosome splice "
                 "sites in mammalian systems.".format(
                     check_splice_sites.MAMMALIAN_SPLICE_SITES))
        validate_parser.add_argument(
            '--debug', required=False, action='store_true',
            help='If given, print debugging logging information to standard '
                 'out')
        validate_parser.add_argument(
            '--low-memory', required=False, default=False,
            action='store_true',
            help='If set, then use a smaller memory footprint. By default, '
                 'this is off.')
        validate_parser.set_defaults(func=self.validate)

        # --- Subcommand to calculate psi on the built index --- #
        psi_parser = self.subparser.add_parser(
            'psi',
            help='Calculate "percent spliced-in" (Psi) values using the '
                 'splicing event index built with "outrigger index"')
        psi_parser.add_argument(
            '-i', '--index', required=False, default=None,
            help='Name of the folder where you saved the output from '
                 '"outrigger index" (default is {}, which is relative to '
                 'the directory where you called this program, assuming you '
                 'have called "outrigger psi" in the same folder as you '
                 'called "outrigger index")'.format(INDEX))
        psi_parser.add_argument(
            '-o', '--output', required=False, type=str, action='store',
            default=None,
            help='Name of the folder where you saved the output from '
                 '"outrigger index" (default is {}, which is relative to '
                 'the directory where you called the program). Cannot '
                 'specify both an --index and --output with "psi"'
                 ''.format(OUTPUT))

        psi_junctions = psi_parser.add_mutually_exclusive_group(
            required=False)
        psi_junctions.add_argument(
            '-c', '--junction-reads-csv', required=False,
            help="Name of the compiled splice junction file to calculate "
                 "psi scores on. Default is the '--output' folder's "
                 "junctions/reads.csv file. Not required if you specify "
                 "SJ.out.tab files with '--sj-out-tab'")
        psi_junctions.add_argument(
            '-j', '--sj-out-tab', required=False, type=str, action='store',
            nargs='*',
            help='SJ.out.tab files from STAR aligner output. Not required '
                 'if you specify a file with "--junction-reads-csv"')
        psi_junctions.add_argument(
            '-b', '--bam', required=False, type=str, action='store',
            nargs='*',
            help='Bam files to use to calculate psi on')

        psi_parser.add_argument(
            '-m', '--min-reads', type=int, action='store', required=False,
            default=10,
            help='Minimum number of reads per junction for calculating Psi '
                 '(default=10)')
        psi_parser.add_argument(
            '-e', '--method', type=str, action='store', required=False,
            default='mean',
            help='How to deal with multiple junctions on an event: take the '
                 'mean (default) or the min (the other option)')
        psi_parser.add_argument(
            '-u', '--uneven-coverage-multiplier', type=int, action='store',
            required=False, default=10,
            help='If a junction on one side of an exon is bigger than the '
                 'junction on the other side of the exon by this amount '
                 '(default is 10, so 10x bigger), then do not use this '
                 'event')
        psi_parser.add_argument(
            '--ignore-multimapping', action='store_true',
            help='Applies to STAR SJ.out.tab files only. If this flag is '
                 'used, then do not include reads that mapped to multiple '
                 'locations in the genome, not uniquely to a locus, in the '
                 'read count for a junction. If inputting "bam" files, then '
                 'this means that reads with a mapping quality (MAPQ) of '
                 'less than 255 are considered "multimapped." This is the '
                 'same thing as what the STAR aligner does. By default, '
                 'this is off, and all reads are used.')
        psi_parser.add_argument(
            '--reads-col', default='reads',
            help="Name of column in --junction-reads-csv containing reads "
                 "to use. (default='reads')")
        psi_parser.add_argument(
            '--sample-id-col', default='sample_id',
            help="Name of column in --junction-reads-csv containing sample "
                 "ids to use. (default='sample_id')")
        psi_parser.add_argument(
            '--junction-id-col', default='junction_id',
            help="Name of column in --junction-reads-csv containing the ID "
                 "of the junction to use. Must match exactly with the "
                 "junctions in the index. (default='junction_id')")
        psi_parser.add_argument(
            '--debug', required=False, action='store_true',
            help='If given, print debugging logging information to standard '
                 'out')
        psi_parser.add_argument(
            '--n-jobs', required=False, default=-1, action='store', type=int,
            help='Number of threads to use when parallelizing psi '
                 'calculation and file reading. Default is -1, which means '
                 'to use as many threads as are available.')
        psi_parser.add_argument(
            '--low-memory', required=False, default=False,
            action='store_true',
            help='If set, then use a smaller memory footprint. By default, '
                 'this is off.')
        psi_parser.set_defaults(func=self.psi)

        if input_options is None or len(input_options) == 0:
            self.parser.print_usage()
            self.args = None
        else:
            self.args = self.parser.parse_args(input_options)

        if self.args is not None:
            if self.args.debug:
                print(self.args)
                print(input_options)
            self.args.func()

    def index(self):
        index = Index(**vars(self.args))
        index.execute()

    def validate(self):
        validate = Validate(**vars(self.args))
        validate.execute()

    def psi(self):
        psi = Psi(**vars(self.args))
        psi.execute()

    def do_usage_and_die(self, message):
        '''Cleanly exit if incorrect parameters are given

        If a critical error is encountered, where it is suspected that the
        program is not being called with consistent parameters or data, this
        method writes out an error string (``message``), then terminates
        execution of the program.
        '''
        sys.stderr.write(message)
        self.parser.print_usage()
        exc_type, value, tb = sys.exc_info()
        traceback.print_exc()
        debug = os.getenv('PYTHONDEBUG')
        # ``self.args`` is None when no arguments were given, so guard the
        # attribute access before checking the --debug flag
        if debug or (self.args is not None and self.args.debug):
            print(os.environ)
            # If the PYTHONDEBUG shell environment variable exists, then
            # launch the python debugger
            pdb.post_mortem(tb)
        return 2
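
# A minimal usage sketch (e.g. from a test); the file names below are
# hypothetical placeholders:
#
#   CommandLine(['index',
#                '--sj-out-tab', 'sample1.SJ.out.tab', 'sample2.SJ.out.tab',
#                '--gtf-filename', 'annotation.gtf'])
#
# Parsing the arguments dispatches straight to ``self.index()`` via the
# ``func`` default set on the 'index' subparser above.
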
# Class: Usage
class Usage(Exception):
    '''Used to signal a Usage error, evoking a usage statement and eventual
    exit when raised
    '''
    def __init__(self, msg):
        self.msg = msg


class Subcommand(object):

    # output_folder = OUTPUT
    sj_out_tab = None
    ignore_multimapping = False
    min_reads = common.MIN_READS
    reads_col = common.READS
    gtf_filename = None
    gffutils_db = None
    debug = False
    force = False
    resume = False

    def __init__(self, **kwargs):
        # Read all arguments and set as attributes of this class
        for key, value in kwargs.items():
            setattr(self, key, value)

        for folder in self.folders:
            self.maybe_make_folder(folder)

    def maybe_make_folder(self, folder):
        util.progress("Creating folder {} ...".format(folder))
        if not os.path.exists(folder):
            os.makedirs(folder)
        util.done()

    @property
    def folders(self):
        return (self.output_folder, self.index_folder, self.gtf_folder,
                self.junctions_folder)

    @property
    def output_folder(self):
        if not hasattr(self, 'output') or self.output is None:
            return OUTPUT
        else:
            return self.output

    @property
    def index_folder(self):
        return os.path.join(self.output_folder, 'index')

    @property
    def gtf_folder(self):
        return os.path.join(self.index_folder, 'gtf')

    @property
    def junctions_folder(self):
        return os.path.join(self.output_folder, 'junctions')

    @property
    def junction_reads_filename(self):
        if self.junction_reads_csv is not None:
            return self.junction_reads_csv
        else:
            return os.path.join(self.junctions_folder, 'reads.csv')

    def make_junction_reads_file(self):
        if self.bam is None:
            util.progress(
                'Reading SJ.out.tab files and creating a big splice '
                'junction table of reads spanning exon-exon junctions ...')
            splice_junctions = star.read_multiple_sj_out_tab(
                self.sj_out_tab,
                ignore_multimapping=self.ignore_multimapping)
        else:
            util.progress('Reading bam files and creating a big splice '
                          'junction table of reads spanning exon-exon '
                          'junctions ...')
            splice_junctions = bam.read_multiple_bams(
                self.bam, self.ignore_multimapping, self.n_jobs)
        dirname = os.path.dirname(self.junction_reads_filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        util.progress('Writing {} ...\n'.format(
            self.junction_reads_filename))
        splice_junctions.to_csv(self.junction_reads_filename, index=False)
        util.done()
        return splice_junctions

    def csv(self):
        """Create a csv file of compiled splice junctions"""
        if not os.path.exists(self.junction_reads_filename):
            splice_junctions = self.make_junction_reads_file()
        else:
            util.progress('Found compiled junction reads file in {} and '
                          'reading it in '
                          '...'.format(self.junction_reads_filename))
            splice_junctions = pd.read_csv(self.junction_reads_filename,
                                           low_memory=self.low_memory)
            util.done()
        return splice_junctions

    @staticmethod
    def junction_metadata(spliced_reads, csv):
        """Get just the junction info from the concatenated read files"""
        util.progress('Creating splice junction metadata of where junctions '
                      'start and stop')

        metadata = star.make_metadata(spliced_reads)
        util.done()

        if not os.path.exists(csv):
            util.progress('Writing metadata of junctions to {csv}'
                          ' ...'.format(csv=csv))
            metadata.to_csv(csv, index=False)

        return metadata

    def filter_junctions_on_reads(self, spliced_reads):
        # Filter junction metadata to only get junctions with minimum reads
        util.progress('Filtering for only junctions with minimum {} reads '
                      '...'.format(self.min_reads))
        original = len(spliced_reads.groupby(common.JUNCTION_ID))
        enough_reads_rows = spliced_reads[self.reads_col] >= self.min_reads
        spliced_reads = spliced_reads.loc[enough_reads_rows]
        enough_reads = len(spliced_reads.groupby(common.JUNCTION_ID))
        filtered = original - enough_reads
        util.progress('\t{enough}/{original} junctions remain after '
                      'filtering out {filtered} junctions with < '
                      '{min_reads} reads.'.format(
                          filtered=filtered, enough=enough_reads,
                          original=original, min_reads=self.min_reads))
        util.done(2)
        return spliced_reads
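
    # Sketch of the filter above with --min-reads 10: a row like
    # (sample_1, junction_A, reads=9) is dropped while
    # (sample_2, junction_A, reads=25) is kept, so a junction can survive in
    # some samples but not others; the before/after counts reported are of
    # distinct junction IDs, not rows.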

    def maybe_make_db(self):
        """Get GFFutils database from file or create from a gtf"""
        if self.gffutils_db is not None:
            copied_db = os.path.join(self.gtf_folder,
                                     os.path.basename(self.gffutils_db))
            util.progress('Copying gffutils database from {} to {} '
                          '...'.format(self.gffutils_db, copied_db))
            shutil.copyfile(self.gffutils_db, copied_db)
            util.done()

            util.progress('Reading gffutils database from {} ...'.format(
                copied_db))
            db = gffutils.FeatureDB(copied_db)
            util.done()
        else:
            basename = os.path.basename(self.gtf_filename)
            db_filename = os.path.join(self.gtf_folder,
                                       '{}.db'.format(basename))
            util.progress("Found GTF file in {}".format(self.gtf_filename))
            try:
                db = gffutils.FeatureDB(db_filename)
                util.progress(
                    "Found existing outrigger-built gffutils database file "
                    "in {}".format(db_filename))
            except (ValueError, TypeError):
                util.progress(
                    'Creating a "gffutils" '
                    'database {} ...'.format(db_filename))
                db = gtf.create_db(self.gtf_filename, db_filename)
                util.done()
        return db

    def maybe_overwrite(self, filename):
        """Ensure that ``filename`` is not overwritten unless user-specified

        - If the file doesn't exist, return True, as in "yes, please
          overwrite": even though there wasn't really a file there, the
          action is still to create the file in the next step
        - If the file exists and the user specified nothing, exit the
          program and complain that either --force or --resume must be
          specified
        - If the file exists and the user specified --force, then return
          True
        - If the file exists and the user specified --resume, then return
          False so that the file is not overwritten

        Parameters
        ----------
        filename : str
            Path to a file that you may want to overwrite

        Returns
        -------
        if_overwrite : bool
            If True, then the next step in the program has the go-ahead to
            "overwrite" or create the file. If False, the file exists and
            the user doesn't want to overwrite it
        """
        if not os.path.exists(filename):
            return True

        if self.force:
            util.progress("Found existing {filename}, overwriting with "
                          "--force flag".format(filename=filename))
            return True
        if self.resume:
            util.progress(
                "With the flag '--resume', found an existing file, "
                "{filename}, so not re-calculating it. To force "
                "overwriting, use the flag '--force'.".format(
                    filename=filename))
            return False
        else:
            raise ValueError(
                "Found existing {filename} but don't know whether you want "
                "me to continue where I stopped ('--resume') or force "
                "overwrite and restart from scratch ('--force')! "
                "Exiting.".format(filename=filename))
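
    # Behavior of maybe_overwrite, summarized (per the docstring above):
    #
    #   file missing               -> True   (go ahead and create it)
    #   file exists + --force      -> True   (overwrite it)
    #   file exists + --resume     -> False  (reuse the existing file)
    #   file exists, neither flag  -> ValueError (ask the user to choose)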


class Index(Subcommand):

    max_de_novo_exon_length = outrigger.common.MAX_DE_NOVO_EXON_LENGTH

    @property
    def splice_abbrevs(self):
        if self.splice_types == 'all':
            return common.SPLICE_ABBREVS
        return self.splice_types.split(',')

    def make_exon_junction_adjacencies(self, metadata, db):
        """Get annotated exon_cols next to junctions in data"""
        exon_junction_adjacencies = adjacencies.ExonJunctionAdjacencies(
            metadata, db,
            max_de_novo_exon_length=self.max_de_novo_exon_length)

        novel_exons_gtf = os.path.join(self.gtf_folder, 'novel_exons.gtf')
        if self.maybe_overwrite(novel_exons_gtf):
            util.progress('Detecting de novo exons based on gaps between '
                          'junctions ...')
            exon_junction_adjacencies.detect_exons_from_junctions()
            util.done()

            novel_exons = exon_junction_adjacencies.db.features_of_type(
                outrigger.common.NOVEL_EXON)
            n_novel_exons = sum(1 for _ in novel_exons)
            util.progress('Writing {n} novel exons to {gtf} ...'.format(
                n=n_novel_exons, gtf=novel_exons_gtf))
            exon_junction_adjacencies.write_de_novo_exons(novel_exons_gtf)
            util.done()

        csv = os.path.join(self.index_folder, 'exon_direction_junction.csv')
        if not os.path.exists(csv) or self.force:
            util.progress('Getting junction-direction-exon triples for '
                          'graph database ...')
            junction_exon_triples = \
                exon_junction_adjacencies.upstream_downstream_exons()
            util.done()

            util.progress('Writing junction-exon-direction triples'
                          ' to {} ...'.format(csv))
            junction_exon_triples.to_csv(csv, index=False)
            util.done()
        elif self.resume:
            junction_exon_triples = pd.read_csv(csv,
                                                low_memory=self.low_memory)
        else:
            raise ValueError(
                "Found existing junction-exon-triples file ({csv}) but "
                "don't know whether you want me to continue where I "
                "stopped ('--resume') or force restart from scratch "
                "('--force')! Exiting.".format(csv=csv))
        return junction_exon_triples

    @staticmethod
    def make_graph(junction_exon_triples, db):
        """Create graph database of exon-junction adjacencies"""
        util.progress('Populating graph database of the '
                      'junction-direction-exon triples ...')
        event_maker = events.EventMaker(junction_exon_triples, db)
        util.done()
        return event_maker

    def _exists_event_csv(self, splice_abbrev):
        return os.path.exists(
            os.path.join(self.index_folder, splice_abbrev, EVENTS_CSV))

    def make_events_by_traversing_graph(self, event_maker, db):
        """Search the splice graph for alternative exons"""
        existing_events = [self._exists_event_csv(splice_abbrev)
                           for splice_abbrev in self.splice_abbrevs]
        if all(existing_events) and not self.force:
            util.progress('Found existing splicing events files for all '
                          'splice types, so not searching. To force '
                          're-finding these splicing events, use the flag '
                          '"--force".')
            return

        undiscovered_splice_types = [
            splice_abbrev for splice_abbrev in self.splice_abbrevs
            if not self._exists_event_csv(splice_abbrev) or self.force]

        event_dfs = event_maker.find_events(
            n_jobs=self.n_jobs, splice_types=undiscovered_splice_types)

        for splice_abbrev, event_df in event_dfs.items():
            csv = os.path.join(self.index_folder, splice_abbrev.lower(),
                               EVENTS_CSV)
            dirname = os.path.dirname(csv)
            if not os.path.exists(dirname):
                os.makedirs(dirname)

            n_events = len(event_df.groupby(level=0, axis=0))
            if n_events > 0:
                util.progress('Found {n} {abbrev} events.'.format(
                    n=n_events, abbrev=splice_abbrev.upper()))
                self.get_event_attributes(db, event_df, splice_abbrev)
            else:
                util.progress(
                    'No {abbrev} events found in the junction and exon '
                    'data.'.format(abbrev=splice_abbrev.upper()))

    def get_event_attributes(self, db, event_df, splice_type):
        util.progress(
            'Making metadata file of {splice_type} events, annotating them '
            'with GTF attributes ...'.format(
                splice_type=splice_type.upper()))

        sa = gtf.SplicingAnnotator(db, event_df, splice_type.upper())
        util.progress('Making ".bed" files for exons in each event ...')
        folder = os.path.join(self.index_folder, splice_type)
        sa.exon_bedfiles(folder=folder)
        util.done()

        attributes = sa.attributes()
        util.done()

        # Write to a file
        csv = os.path.join(self.index_folder, splice_type, EVENTS_CSV)
        util.progress('Writing {splice_type} events to {csv} '
                      '...'.format(splice_type=splice_type.upper(), csv=csv))
        attributes.to_csv(csv, index=True,
                          index_label=outrigger.common.EVENT_ID)
        util.done()

    def write_new_gtf(self, db):
        # Use a local name that does not shadow the imported ``gtf`` module
        new_gtf = os.path.join(self.gtf_folder,
                               os.path.basename(self.gtf_filename))
        util.progress('Writing new GTF to {} ...'.format(new_gtf))
        with open(new_gtf, 'w') as f:
            for feature in db.all_features(order_by=common.ORDER_BY):
                f.write(str(feature) + '\n')
        util.done()

    def execute(self):
        # Must output the junction exon triples
        logger = logging.getLogger('outrigger.index')

        if self.debug:
            logger.setLevel(10)

        spliced_reads = self.csv()
        spliced_reads = self.filter_junctions_on_reads(spliced_reads)
        metadata_csv = os.path.join(self.junctions_folder, METADATA_CSV)
        metadata = self.junction_metadata(spliced_reads, metadata_csv)

        db = self.maybe_make_db()

        junction_exon_triples = self.make_exon_junction_adjacencies(
            metadata, db)

        event_maker = self.make_graph(junction_exon_triples, db)
        self.make_events_by_traversing_graph(event_maker, db)

        self.write_new_gtf(db)
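
# Shell usage sketch for this subcommand (the file names are placeholders):
#
#   outrigger index --sj-out-tab *SJ.out.tab --gtf-filename annotation.gtf
#
# which runs the steps in ``execute`` above: read and filter junctions, build
# (or copy) the gffutils database, detect de novo exons, find events, and
# write per-splice-type events.csv files under outrigger_output/index/.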


class SubcommandAfterIndex(Subcommand):
    """Can use different index folder than outrigger_output/index"""

    @property
    def input_index(self):
        """Where to read the index from"""
        if self.index is not None:
            return self.index
        else:
            return os.path.join(self.output_folder, 'index')

    @property
    def folders(self):
        return (self.output_folder, self.input_index, self.index_folder,
                self.gtf_folder, self.junctions_folder)


class Validate(SubcommandAfterIndex):

    def exon_pair_splice_sites(self, exonA, exonB, splice_abbrev):
        name = '{exonA}-{exonB}_splice_site'.format(exonA=exonA, exonB=exonB)

        exonA_splice_sites = self.individual_exon_splice_sites(
            exonA, splice_abbrev, 'downstream')
        exonB_splice_sites = self.individual_exon_splice_sites(
            exonB, splice_abbrev, 'upstream')

        intron_splice_site = exonA_splice_sites + '/' + exonB_splice_sites
        intron_splice_site.name = name
        return intron_splice_site

    def individual_exon_splice_sites(self, exon, splice_abbrev, direction):
        exon_bed = os.path.join(self.input_index, splice_abbrev,
                                '{}.bed'.format(exon))
        splice_sites = check_splice_sites.read_splice_sites(
            exon_bed, self.genome, self.fasta, direction)
        return splice_sites

    def execute(self):
        valid_splice_sites = check_splice_sites.splice_site_str_to_tuple(
            self.valid_splice_sites)

        for splice_name, splice_abbrev in common.SPLICE_TYPES:
            splice_name_spaces = splice_name.replace('_', ' ').title()
            util.progress('Finding valid splice sites in {} ({}) '
                          'splice type ...'.format(splice_name_spaces,
                                                   splice_abbrev.upper()))
            isoform_exons = common.SPLICE_TYPE_ISOFORM_EXONS[splice_abbrev]

            validated_folder = os.path.join(self.index_folder, splice_abbrev,
                                            'validated')
            self.maybe_make_folder(validated_folder)

            splice_sites_seriess = []
            for isoform, exons in isoform_exons.items():
                valid_str = ' or '.join(valid_splice_sites)
                util.progress('\tFinding valid splice sites for {isoform} of'
                              ' {splice_name} events which match '
                              '{valid_splice_sites}'
                              ' ...'.format(isoform=isoform,
                                            splice_name=splice_name_spaces,
                                            valid_splice_sites=valid_str))
                exon_pairs = zip(exons, exons[1:])
                for exonA, exonB in exon_pairs:
                    util.progress('\t\tFinding splice sites for {exonA} and '
                                  '{exonB} ...'.format(exonA=exonA,
                                                       exonB=exonB))
                    intron_splice_site = self.exon_pair_splice_sites(
                        exonA, exonB, splice_abbrev)
                    splice_sites_seriess.append(intron_splice_site)
                    util.done(4)
                util.done(3)

            splice_sites = pd.concat(splice_sites_seriess, axis=1)
            csv = os.path.join(self.index_folder, splice_abbrev,
                               'splice_sites.csv')
            util.progress('\tWriting splice sites to {csv} ...'.format(
                csv=csv))
            splice_sites.to_csv(csv)
            util.done(3)

            n_total = len(splice_sites.groupby(level=0, axis=0))
            splice_sites_is_valid = splice_sites.isin(valid_splice_sites)
            valid_events_rows = splice_sites_is_valid.all(axis=1)
            splice_sites_validated = splice_sites.loc[valid_events_rows]
            n_valid = len(splice_sites_validated.groupby(level=0, axis=0))
            util.progress("\tValidated {valid}/{total} {splice_name} "
                          "({splice_abbrev}) events."
                          "".format(valid=n_valid, total=n_total,
                                    splice_name=splice_name_spaces,
                                    splice_abbrev=splice_abbrev.upper()))

            original_events_csv = os.path.join(self.input_index,
                                               splice_abbrev, EVENTS_CSV)
            validated_events_csv = os.path.join(validated_folder, EVENTS_CSV)
            util.progress('\tWriting validated events to {csv} ...'.format(
                csv=validated_events_csv))
            with open(validated_events_csv, 'w') as f_validated:
                with open(original_events_csv) as f_original:
                    for i, line in enumerate(f_original):
                        if i == 0:
                            f_validated.write(line)
                            continue
                        if line.split(',')[0] in \
                                splice_sites_validated.index:
                            f_validated.write(line)
            util.done(3)
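
# Shell usage sketch (the fasta and genome names are placeholders):
#
#   outrigger validate --fasta genome.fa --genome mm10
#
# which writes splice_sites.csv and a validated/events.csv for each splice
# type under the index folder, keeping only events whose introns match
# --valid-splice-sites.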


class Psi(SubcommandAfterIndex):

    # Instantiate empty variables here so PyCharm doesn't get mad at me
    reads_col = None
    sample_id_col = None
    junction_id_col = None

    @property
    def required_cols(self):
        # Built from the instance attributes (set in __init__ from the
        # command line flags) rather than the class-level defaults above
        return {'--reads-col': self.reads_col,
                '--sample-id-col': self.sample_id_col,
                '--junction-id-col': self.junction_id_col}

    @property
    def psi_folder(self):
        return os.path.join(self.output_folder, 'psi')

    @property
    def splice_type_folders(self):
        return dict((splice_name, os.path.join(self.input_index,
                                               splice_abbrev))
                    for splice_name, splice_abbrev
                    in outrigger.common.SPLICE_TYPES)

    @property
    def folders(self):
        return self.output_folder, self.psi_folder

    def __init__(self, **kwargs):
        # Read all arguments and set as attributes of this class
        for key, value in kwargs.items():
            setattr(self, key, value)

        if not os.path.exists(self.input_index):
            raise OSError("The index folder ({}) doesn't exist! Cowardly "
                          "exiting because I don't know what events to "
                          "calculate psi on :(".format(self.input_index))

        for splice_name, splice_folder in self.splice_type_folders.items():
            if not os.path.exists(splice_folder):
                raise OSError(
                    "The splicing index of {} ({}) splice types doesn't "
                    "exist! Cowardly exiting because I don't know how to "
                    "define events :(".format(splice_name, splice_folder))

        if not os.path.exists(self.junction_reads_filename) and \
                self.bam is None:
            raise OSError(
                "The junction reads csv file ({}) doesn't exist! Cowardly "
                "exiting because I don't have the junction counts to "
                "calculate psi on :(".format(self.junction_reads_filename))

        for folder in self.folders:
            self.maybe_make_folder(folder)

    def maybe_read_junction_reads(self):
        try:
            dtype = {self.reads_col: np.float32}
            util.progress(
                'Reading splice junction reads from {} ...'.format(
                    self.junction_reads_filename))
            junction_reads = pd.read_csv(
                self.junction_reads_filename, dtype=dtype,
                low_memory=self.low_memory)
            util.done()
        except OSError:
            raise IOError(
                "There is no junction reads file at the expected location"
                " ({csv}). Are you in the correct directory?".format(
                    csv=self.junction_reads_filename))
        return junction_reads

    def validate_junction_reads_data(self, junction_reads):
        for flag, col in self.required_cols.items():
            if col not in junction_reads:
                raise ValueError(
                    "The required column name {col} does not exist in "
                    "{csv}. You can change this with the command line "
                    "flag, {flag}".format(
                        col=col, csv=self.junction_reads_filename,
                        flag=flag))

    def maybe_get_validated_events(self, splice_abbrev):
        splice_folder = os.path.join(self.input_index, splice_abbrev)
        events_csv = os.path.join(splice_folder, EVENTS_CSV)
        validated_events_csv = os.path.join(splice_folder, 'validated',
                                            EVENTS_CSV)

        if os.path.exists(validated_events_csv):
            return validated_events_csv
        else:
            return events_csv

    def execute(self):
        """Calculate percent spliced in (psi) of splicing events"""
        logger = logging.getLogger('outrigger.psi')

        if self.debug:
            logger.setLevel(10)

        junction_reads = self.csv()
        metadata_csv = os.path.join(self.junctions_folder, METADATA_CSV)
        self.junction_metadata(junction_reads, metadata_csv)

        junction_reads_2d = junction_reads.pivot(
            index=self.sample_id_col, columns=self.junction_id_col,
            values=self.reads_col)
        junction_reads_2d.fillna(0, inplace=True)
        junction_reads_2d = junction_reads_2d.astype(int)
        logger.debug('\n--- Splice Junction reads ---')
        logger.debug(repr(junction_reads.head()))

        psis = []
        summaries = []
        for splice_name, splice_abbrev in outrigger.common.SPLICE_TYPES:
            filename = self.maybe_get_validated_events(splice_abbrev)
            if not os.path.exists(filename):
                util.progress('No {name} ({abbrev}) events found, '
                              'skipping.'.format(name=splice_name,
                                                 abbrev=splice_abbrev))
                continue

            # event_type = os.path.basename(filename).split('.csv')[0]
            util.progress('Reading {name} ({abbrev}) events from {filename}'
                          ' ...'.format(name=splice_name,
                                        abbrev=splice_abbrev,
                                        filename=filename))
            event_annotation = pd.read_csv(filename, index_col=0,
                                           low_memory=self.low_memory)
            util.done()

            isoform_junctions = outrigger.common.ISOFORM_JUNCTIONS[
                splice_abbrev]
            logger.debug('\n--- Splicing event annotation ---')
            logger.debug(repr(event_annotation.head()))

            util.progress(
                'Calculating percent spliced-in (Psi) scores on '
                '{name} ({abbrev}) events ...'.format(
                    name=splice_name, abbrev=splice_abbrev))

            # Splice type percent spliced-in (psi) and summary
            type_psi, summary = compute.calculate_psi(
                event_annotation, junction_reads_2d,
                min_reads=self.min_reads, n_jobs=self.n_jobs,
                method=self.method,
                uneven_coverage_multiplier=self.uneven_coverage_multiplier,
                **isoform_junctions)

            # Write this splice type's percent spliced-in matrix
            csv = os.path.join(self.psi_folder, splice_abbrev, 'psi.csv')
            util.progress('Writing {name} ({abbrev}) Psi values to '
                          '{filename} ...'.format(name=splice_name,
                                                  abbrev=splice_abbrev,
                                                  filename=csv))
            self.maybe_make_folder(os.path.dirname(csv))
            type_psi.to_csv(csv, na_rep='NA')

            # Write this splice type's summary of events, including why a
            # Psi score was or was not calculated for each event
            csv = os.path.join(self.psi_folder, splice_abbrev, 'summary.csv')
            util.progress('Writing {name} ({abbrev}) event summaries (e.g. '
                          'number of reads, why an event does not have a '
                          'Psi score) to {filename} ...'
                          ''.format(name=splice_name, abbrev=splice_abbrev,
                                    filename=csv))
            self.maybe_make_folder(os.path.dirname(csv))
            summary.to_csv(csv, na_rep='NA', index=False)

            psis.append(type_psi)
            summary['splice_type'] = splice_abbrev
            summaries.append(summary)
            util.done()

        util.progress('Concatenating all calculated psi scores '
                      'into one big matrix ...')
        splicing = pd.concat(psis, axis=1)
        util.done()
        splicing = splicing.T
        csv = os.path.join(self.psi_folder, 'outrigger_psi.csv')
        util.progress('Writing a samples x features matrix of Psi '
                      'scores to {} ...'.format(csv))
        splicing.to_csv(csv, na_rep='NA')
        util.done()

        util.progress('Concatenating all summaries into one big matrix ...')
        summary = pd.concat(summaries, ignore_index=True)
        util.done()
        csv = os.path.join(self.psi_folder, 'outrigger_summary.csv')
        util.progress('Writing summary table of Psi scores, junction reads, '
                      'and cases to {} ...'.format(csv))
        summary.to_csv(csv, na_rep='NA')
        util.done()
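
# Shell usage sketch; with no arguments, psi reuses the outrigger_output/
# folder produced by the earlier "index" (and, optionally, "validate") steps:
#
#   outrigger psi
#
# and writes psi/<splice type>/psi.csv per event type plus the combined
# psi/outrigger_psi.csv and psi/outrigger_summary.csv tables.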


def main():
    try:
        cl = CommandLine(sys.argv[1:])
    except Usage as usage:
        # ``cl`` is not bound if construction itself raised, so build a bare
        # CommandLine just to report the error and usage before exiting
        cl = CommandLine()
        sys.exit(cl.do_usage_and_die(usage.msg))


if __name__ == '__main__':
    main()