Source code for outrigger.validate.check_splice_sites
from collections import OrderedDict
from Bio import SeqIO
import pandas as pd
import pybedtools
NT = 2
MAMMALIAN_SPLICE_SITES = 'GT/AG,GC/AG,AT/AC'
[docs]def splice_site_str_to_tuple(splice_site):
pairs = splice_site.split(',')
return tuple(pairs)
[docs]def maybe_read_chromsizes(genome):
try:
chromsizes = OrderedDict()
with open(genome) as f:
for line in f:
chrom, size = line.strip().split()
size = int(size)
chromsizes[chrom] = (0, size)
except OSError:
chromsizes = pybedtools.chromsizes(genome)
return chromsizes
[docs]def read_splice_sites(bed, genome, fasta, direction='upstream'):
"""Read splice sites of an exon
Parameters
----------
bed : pybedtools.BedTool | str
Exons whose splice sites you're interested in
genome : str
Location of a chromosome sizes file for the genome
fasta : str
Location of the genome fasta file
direction : 'upstream' | 'downstream'
Which direction of splice sites you want for the exon
"""
if isinstance(bed, str):
bed = pybedtools.BedTool(bed)
genome = maybe_read_chromsizes(genome)
if direction == 'upstream':
left = NT
right = 0
elif direction == 'downstream':
left = 0
right = NT
flanked = bed.flank(l=left, r=right, s=True, genome=genome)
seqs = flanked.sequence(fi=fasta, s=True)
with open(seqs.seqfn) as f:
records = SeqIO.parse(f, 'fasta')
records = pd.Series([str(r.seq) for r in records],
index=[b.name for b in bed])
# import pdb; pdb.set_trace()
return records