def __init__(self, task_queue, result_queue, bamName, referenceName, honH5):  # *args, **kwargs):
    multiprocessing.Process.__init__(self)
    self.task_queue = task_queue
    self.result_queue = result_queue
    self.bam = pysam.Samfile(bamName)
    if referenceName is not None:
        self.reference = pysam.Fastafile(referenceName)
    else:
        self.reference = None
    self.honH5 = honH5
    # self.args = args
    # self.kwargs = kwargs
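A consumer process initialized this way typically pulls task objects off task_queue in its run() method and invokes them with the handles opened above. The body below is a minimal sketch of that pattern, not part of the original source; it assumes task_queue is a multiprocessing.JoinableQueue and that None is used as a shutdown sentinel.

def run(self):
    # hypothetical run() loop: consume tasks until a None sentinel arrives
    while True:
        task = self.task_queue.get()
        if task is None:
            self.task_queue.task_done()
            break
        # each task is a callable taking (bam, reference, honH5), as in __call__ below
        task(self.bam, self.reference, self.honH5)
        self.task_queue.task_done()
        self.result_queue.put(task)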
def __call__(self, bam, reference, honH5):
    """
    Takes a pysam.Samfile
    """
    logging.info("Starting %s" % (self.name))
    size = self.end - self.start
    regName = "%s:%d-%d" % (self.chrom, self.start, self.end)
    logging.debug("Making container for %s (%s %d bp)" % (self.groupName, regName, size))
    logging.debug("Parsing bam")
    st = max(0, self.start)
    readCount = bam.count(self.chrom, st, self.end)
    reads = bam.fetch(self.chrom, st, self.end)
    if readCount == 0:
        logging.warning("No reads found in %s" % self.groupName)
        self.failed = True
        self.errMessage = "No reads found in %s" % self.groupName
        return
    else:
        logging.info("%d reads to parse in %s" % (readCount, self.groupName))
    myData = self.countErrors(reads, self.start, size, self.args)
    # request loc on honH5 and flush results
    with honH5.acquireH5('a') as h5dat:
        out = h5dat.create_group(self.groupName)
        out.attrs["reference"] = self.chrom
        out.attrs["start"] = self.start
        out.attrs["end"] = self.end
        if size < CHUNKSHAPE[1]:
            chunk = (CHUNKSHAPE[0], size - 1)
        else:
            chunk = CHUNKSHAPE
        container = out.create_dataset("data", data=myData,
                                       chunks=chunk, compression="gzip")
        h5dat.flush()
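The group written above can be inspected afterwards with h5py. A minimal sketch, assuming the HDF5 container is a file named results.h5 and the worker used group1 as its groupName (both names are hypothetical):

import h5py

with h5py.File("results.h5", "r") as h5dat:
    grp = h5dat["group1"]
    print(grp.attrs["reference"], grp.attrs["start"], grp.attrs["end"])
    data = grp["data"][:]  # the gzip-compressed, chunked dataset written by the worker
    print(data.shape)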
def mapTails(bam, args):
    if bam.endswith('.bam'):
        bam = pysam.Samfile(bam, 'rb')
    elif bam.endswith('.sam'):
        bam = pysam.Samfile(bam)
    else:
        logging.error("Cannot open input file! %s" % (bam))
        exit(1)
    logging.info("Extracting tails")
    tailfastq = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False, dir=args.temp)
    tailfastq.close(); tailfastq = tailfastq.name
    r, t, m = extractTails(bam, outFq=tailfastq, minLength=args.minTail)
    logging.info("Parsed %d reads" % (r))
    logging.info("Found %d tails" % (t))
    logging.info("%d reads had double tails" % (m))
    if t == 0:
        logging.info("No tails -- Exiting")
        exit(0)
    tailmap = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, dir=args.temp)
    tailmap.close(); tailmap = tailmap.name
    callBlasr(tailfastq, args.reference, args.params, args.nproc, tailmap)
    bam.close()  # reset
    return tailmap
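A hedged usage sketch for mapTails. The argument attributes are inferred from the accesses above (args.temp, args.minTail, args.reference, args.params, args.nproc); the values and the BAM path are hypothetical.

import argparse

args = argparse.Namespace(temp="/tmp", minTail=100, reference="ref.fasta",
                          params="", nproc=4)
tail_sam = mapTails("aligned_reads.bam", args)  # path to the SAM of tail alignments produced by callBlasr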
def __init__(self, threadidx, options, config, startline, endline, names):
    multiprocessing.Process.__init__(self)
    # Initializing variables
    self.threadidx = threadidx
    self.options = options
    self.config = config
    self.startline = startline
    self.endline = endline
    self.names = names
    # Initializing reads dictionary
    self.reads = dict()
    # Connecting to BAM file
    self.samfile = pysam.Samfile(options.input, "rb")
    # Connecting to transcript database file
    if not config['transcript_db'] is None:
        self.enstdb = pysam.Tabixfile(config['transcript_db'])
    else:
        self.enstdb = None
    # Initializing output files
    if int(options.threads) > 1:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions_tmp_' + str(threadidx) + '.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles_tmp_' + str(threadidx) + '.txt', 'w')
            if not config['transcript_db'] is None:
                self.out_poor = open(options.output + '_poor_tmp_' + str(threadidx) + '.txt', 'w')
    else:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles.txt', 'w')
            if not config['transcript_db'] is None:
                self.out_poor = open(options.output + '_poor.txt', 'w')
# Checking if target is fail or pass
def igv_reads(bamfile, virus_contig):
    igv_chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.igv.bam"
    if os.path.exists(igv_chimeric_file):
        return igv_chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
            pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(virus_contig)
        for read in in_handle:
            if skip_read(read, contig, virus_contig, in_handle, False):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return igv_chimeric_file
def keep_chimeric_reads(bamfile, virus_contig):
    chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.bam"
    if os.path.exists(chimeric_file):
        return chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
            pysam.Samfile(chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(virus_contig)
        for read in in_handle:
            if skip_read(read, contig, virus_contig, in_handle, True):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return chimeric_file
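A hedged usage sketch for the two filters above; the BAM path and the viral contig name are hypothetical.

igv_bam = igv_reads("sample.bam", "NC_001526")            # writes sample.chimeric.igv.bam
chimeric_bam = keep_chimeric_reads("sample.bam", "NC_001526")  # writes sample.chimeric.bam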
def get_duplicates(bam_file):
    s = set()
    with pysam.Samfile(bam_file, "rb") as in_handle:
        for read in in_handle:
            if read.is_duplicate:
                s.add(read.qname)
    return s
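The returned set of duplicate read names can be used to subset a BAM. A minimal sketch, assuming hypothetical file names; the filtering helper below is not part of the original source.

import pysam

def drop_duplicate_reads(in_bam, out_bam, dup_names):
    # write every read whose name is not in the duplicate set
    with pysam.Samfile(in_bam, "rb") as ih, \
            pysam.Samfile(out_bam, "wb", template=ih) as oh:
        for read in ih:
            if read.qname not in dup_names:
                oh.write(read)

# drop_duplicate_reads("sample.bam", "sample.nodup.bam", get_duplicates("sample.bam"))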
def concatenate_and_fix_bams(out_bamfile, bamfiles, drop_tags=None):
    """Concatenate bams with different references and populate tags.
    Assumes that the input bams have completely non-overlapping reference entries.
    No sorting assumed. Output reads are in the same order as the input.
    Tags are stripped from the header (which is assumed to look like an AugmentedFastqHeader)
    and are stored as bam tags. Tags in drop_tags are completely dropped.
    Args:
    - drop_tags: Set of tag names to ignore. If None, then all tags will be included.
    """
    template_bam = pysam.Samfile(bamfiles[0])
    header = template_bam.header
    for bam_fn in bamfiles[1:]:
        bam = pysam.Samfile(bam_fn)
        header['SQ'].extend(bam.header['SQ'])
    refname_to_tid = {ref_head['SN']: idx for (idx, ref_head) in enumerate(header['SQ'])}
    bam_out = pysam.Samfile(out_bamfile, "wb", header=header)
    for bam_fn in bamfiles:
        bam = pysam.Samfile(bam_fn)
        for rec in bam:
            if rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = -1
                rec.next_reference_id = -1
            elif rec.is_unmapped and not rec.mate_is_unmapped:
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]
                rec.reference_id = rec.next_reference_id
            elif not rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = rec.reference_id
            else:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]
            # Pull fields out of qname, and make tags, writing out to bam_out
            qname_split = rec.qname.split(cr_fastq.AugmentedFastqHeader.TAG_SEP)
            rec.qname = qname_split[0]
            tags = rec.tags
            for i in range(1, len(qname_split), 2):
                tag = (qname_split[i], qname_split[i + 1])
                # Don't add empty tags
                if len(tag[1]) > 0 and (drop_tags is None or not tag[0] in drop_tags):
                    tags.append(tag)
            rec.tags = tags
            bam_out.write(rec)
    bam_out.close()  # finalize the output BAM so the BGZF EOF block is written
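A hedged usage example for concatenate_and_fix_bams; the file names and the dropped tag are hypothetical, and the inputs are assumed to have non-overlapping reference entries as the docstring requires.

concatenate_and_fix_bams("merged.bam",
                         ["genome_hits.bam", "transcriptome_hits.bam"],
                         drop_tags={"XS"})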
def realigner(out_dir, tmp_dir, winsize=50, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'), 'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'), 'rb')
    obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template=mbam)
    chr_list = [x['SN'] for x in ubam.header['SQ']]
    # construct the mread_dict; this will be needed throughout
    mread_dict = defaultdict(list)
    for alignment in mbam:
        mread_dict[alignment.qname].append(alignment)
    # keep a record of processed reads
    processed_mreads = set()
    # iterate through all mreads
    for read_qname in mread_dict:
        if read_qname in processed_mreads:
            continue
        ## construct the fully-connected subgraph for each read
        read_to_locations, processed_mreads = \
            construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list,
                               winsize=winsize, unstranded=unstranded)
        subgraph = set()
        for read in read_to_locations:
            _ = map(subgraph.add, read_to_locations[read].keys())
        subgraph = list(subgraph)
        ## build the BIT tracks
        node_track, multi_reads_weights = \
            construct_BIT_track(subgraph, read_to_locations, ubam, unstranded)
        ## run EM
        multi_reads_weights = \
            run_EM(node_track, multi_reads_weights, w=winsize)
        ## write to obam
        for read in multi_reads_weights:
            for node in multi_reads_weights[read]:
                alignment = read_to_locations[read][node]
                score = round(multi_reads_weights[read][node][0], 3)
                alignment.set_tag('AS', score)
                alignment.set_tag('PG', 'CLAM')
                obam.write(alignment)
    # sort the final output
    logger.info('sorting output')
    obam.close()
    ubam.close()
    mbam.close()
    obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam')
    pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam'))
    pysam.index(obam_sorted_fn)
    os.remove(os.path.join(out_dir, 'realigned.bam'))
    return
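A hedged usage sketch; realigner expects tmp_dir to already contain multi.sorted.bam and unique.sorted.bam (the directory names below are hypothetical).

realigner("clam_out", "clam_tmp", winsize=50, unstranded=False)
# leaves clam_out/realigned.sorted.bam (plus its .bai index) on disk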
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'), 'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'), 'rb')
    bam_dict = {'ubam.ip': [ubam], 'mbam.ip': [mbam]}
    if unique_only:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
    else:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')
    # read in GTF
    logger.info('read gtf from "%s"' % gtf_fp)
    gene_annot = read_gtf(gtf_fp)
    # fetch tag counts in each gene
    ## gene_name: { 'ip': bin_interval_ip, 'con': bin_interval_con }
    logger.info('reading gene counts')
    gene_count = defaultdict(dict)
    for gene_name in tqdm(gene_annot):
        gene = gene_annot[gene_name]
        bin_interval_ip, bin_interval_con = \
            gene_to_count(bam_dict, gene,
                          unique_only=unique_only, with_control=with_control,
                          unstranded=unstranded)
        if bin_interval_ip is None:  ## if no IP reads in this gene
            continue
        gene_count[gene_name]['ip'] = bin_interval_ip
        gene_count[gene_name]['con'] = bin_interval_con
    # estimate global overdispersion param. for each dataset
    logger.info('estimating dispersion parameter')
    alpha_ip_vec = estim_dispersion_param(len(bam_dict['ubam.ip']), gene_count, 'ip')
    if with_control:
        alpha_con_vec = estim_dispersion_param(len(bam_dict['ubam.con']), gene_count, 'con')
    else:
        alpha_con_vec = np.asarray(alpha_ip_vec)
    # perform statistical test
    logger.info('calling peaks')
    peak_counter = 0
    for gene_name in gene_count:
        gene = gene_annot[gene_name]
        BED = test_gene_bin(gene, gene_count[gene_name]['ip'], gene_count[gene_name]['con'],
                            alpha_ip_vec, alpha_con_vec)
        ofile.write(BED)
        peak_counter += len(BED.split('\n'))
    ofile.close()
    logger.info('called %i peaks' % peak_counter)
    return
def genebody_coverage(bam, position_list):
    '''
    position_list is dict returned from genebody_percentile
    position is 1-based genome coordinate
    '''
    samfile = pysam.Samfile(bam, "rb")
    aggreagated_cvg = collections.defaultdict(int)
    gene_finished = 0
    for chrom, strand, positions in position_list.values():
        coverage = {}
        for i in positions:
            coverage[i] = 0.0
        chrom_start = positions[0] - 1
        if chrom_start < 0:
            chrom_start = 0
        chrom_end = positions[-1]
        try:
            samfile.pileup(chrom, 1, 2)
        except:
            continue
        for pileupcolumn in samfile.pileup(chrom, chrom_start, chrom_end, truncate=True):
            ref_pos = pileupcolumn.pos + 1
            if ref_pos not in positions:
                continue
            if pileupcolumn.n == 0:
                coverage[ref_pos] = 0
                continue
            cover_read = 0
            for pileupread in pileupcolumn.pileups:
                if pileupread.is_del: continue
                if pileupread.alignment.is_qcfail: continue
                if pileupread.alignment.is_secondary: continue
                if pileupread.alignment.is_unmapped: continue
                if pileupread.alignment.is_duplicate: continue
                cover_read += 1
            coverage[ref_pos] = cover_read
        tmp = [coverage[k] for k in sorted(coverage)]
        if strand == '-':
            tmp = tmp[::-1]
        for i in range(0, len(tmp)):
            aggreagated_cvg[i] += tmp[i]
        gene_finished += 1
        if gene_finished % 100 == 0:
            print >>sys.stderr, "\t%d transcripts finished\r" % (gene_finished),
    return aggreagated_cvg
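A hedged usage sketch; the BAM path is hypothetical and position_list mimics the structure consumed above (values are (chrom, strand, positions) tuples, as produced by genebody_percentile).

position_list = {"tx1": ("chr1", "+", [100, 150, 200, 250, 300])}
cvg = genebody_coverage("sample.sorted.bam", position_list)
# cvg maps each position index to the read depth aggregated across transcripts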
def read_bam(bam_fn, precursors, clean=True):
    """
    read bam file and perform realignment of hits
    """
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    for line in handle:
        if line.reference_id < 0:
            logger.debug("Sequence not mapped: %s" % line.reference_id)
            continue
        query_name = line.query_name
        if query_name not in reads and line.query_sequence is None:
            continue
        if line.query_sequence and line.query_sequence.find("N") > -1:
            continue
        if query_name not in reads:
            reads[query_name].set_sequence(line.query_sequence)
            reads[query_name].counts = _get_freq(query_name)
        if line.is_reverse:
            logger.debug("Sequence is reverse: %s" % line.query_name)
            continue
        chrom = handle.getrname(line.reference_id)
        # print "%s %s %s %s" % (line.query_name, line.reference_start, line.query_sequence, chrom)
        cigar = line.cigartuples
        iso = isomir()
        iso.align = line
        iso.set_pos(line.reference_start, len(reads[query_name].sequence))
        logger.debug("READ::From BAM start %s end %s" % (iso.start, iso.end))
        if len(precursors[chrom]) < line.reference_start + len(reads[query_name].sequence):
            continue
        iso.subs, iso.add, iso.cigar = filter.tune(reads[query_name].sequence, precursors[chrom],
                                                   line.reference_start, cigar)
        logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
        if len(iso.subs) < 2:
            reads[query_name].set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
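A hedged usage sketch; it assumes the surrounding module helpers (_sam_to_bam, _get_freq, isomir, filter, hits) are importable, and both the BAM path and the precursor entry are hypothetical. precursors maps precursor names (matching the BAM reference names) to their sequences.

precursors = {"precursor-1": "ACGT" * 20}  # hypothetical precursor sequence
hits_by_read = read_bam("collapsed_reads.bam", precursors, clean=True)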
def main(argv):
    args = parse_args(argv)
    print args
    if args.input == "-" and sys.stdin.isatty():
        sys.stderr.write("STDIN is a terminal, terminating!\n")
        return 1
    elif sys.stdout.isatty():
        sys.stderr.write("STDOUT is a terminal, terminating!\n")
        return 1
    with pysam.Samfile(args.input, "rb") as infile:
        with pysam.Samfile("-", "wb", template=infile) as outfile:
            curpos = None
            curvariants = {}
            for (read_num, read) in enumerate(infile):
                if curpos and ((read.tid, read.pos) != curpos):
                    # Sort order is defined as ascending 'tid's and positions
                    if curpos > (read.tid, read.pos) and not read.is_unmapped:
                        sys.stderr.write("ERROR: Input file does not appear "
                                         "to be sorted by coordinates at "
                                         "record %i, aborting ...\n"
                                         % (read_num,))
                        return 1
                    _flush_buffer(outfile, curvariants,
                                  args.remove_duplicates)
                    curpos = None
                is_filtered = read.flag & _FILTERED_FLAGS
                is_collapsed = read.qname.startswith("M_")
                if is_filtered or not (read.qual and is_collapsed):
                    outfile.write(read)
                    continue
                curpos = (read.tid, read.pos)
                nkey = (read.is_reverse, read.pos, read.alen)
                if nkey in curvariants:
                    curvariants[nkey][0].append(read)
                    curvariants[nkey][1] += 1
                else:
                    curvariants[nkey] = [[read], 1]
            _flush_buffer(outfile, curvariants, args.remove_duplicates)
    return 0
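Scripts structured like this are normally invoked through a small entry point; a minimal sketch, since the original wrapper is not shown in this snippet:

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))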
hints_db.py — from the Comparative-Annotation-Toolkit project (ComparativeGenomicsToolkit)
def setup_hints(job, input_file_ids):
    """
    Generates hints for a given genome with a list of BAMs. Will add annotation if it exists.
    """
    # RNA-seq hints
    filtered_bam_file_ids = {'BAM': collections.defaultdict(list), 'INTRONBAM': collections.defaultdict(list)}
    for dtype, bam_dict in input_file_ids['bams'].iteritems():
        if len(bam_dict) == 0:
            continue
        # Since BAMs are valid, we can assume that they all share the same header
        bam_file_id, bai_file_id = bam_dict.values()[0]
        bam_path = job.fileStore.readGlobalFile(bam_file_id)
        sam_handle = pysam.Samfile(bam_path)
        # triple disk usage to deal with name sorted bam
        disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id]) * 3
        # generate reference grouping that will be used downstream until final cat step
        grouped_references = [tuple(x) for x in group_references(sam_handle)]
        for original_path, (bam_file_id, bai_file_id) in bam_dict.iteritems():
            for reference_subset in grouped_references:
                j = job.addChildJobFn(namesort_bam, bam_file_id, bai_file_id, reference_subset, disk_usage,
                                      disk=disk_usage, cores=4, memory='16G')
                filtered_bam_file_ids[dtype][reference_subset].append(j.rv())
    # IsoSeq hints
    iso_seq_hints_file_ids = []
    iso_seq_file_ids = input_file_ids['iso_seq_bams']
    if len(iso_seq_file_ids) > 0:
        for bam_file_id, bai_file_id in iso_seq_file_ids:
            disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id])
            j = job.addChildJobFn(generate_iso_seq_hints, bam_file_id, bai_file_id, disk=disk_usage)
            iso_seq_hints_file_ids.append(j.rv())
    # protein hints
    if input_file_ids['protein_fasta'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['protein_fasta'])
        j = job.addChildJobFn(generate_protein_hints, input_file_ids['protein_fasta'], input_file_ids['genome_fasta'],
                              disk=disk_usage)
        protein_hints_file_id = j.rv()
    else:
        protein_hints_file_id = None
    # annotation hints
    if input_file_ids['annotation'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['annotation'])
        j = job.addChildJobFn(generate_annotation_hints, input_file_ids['annotation'], disk=disk_usage)
        annotation_hints_file_id = j.rv()
    else:
        annotation_hints_file_id = None
    return job.addFollowOnJobFn(merge_bams, filtered_bam_file_ids, annotation_hints_file_id,
                                iso_seq_hints_file_ids, protein_hints_file_id).rv()
def namesort_bam(job, bam_file_id, bai_file_id, reference_subset, disk_usage, num_reads=50 ** 6):
    """
    Slices out the reference subset from a BAM, name sorts that subset, then chunks the resulting reads up for
    processing by filterBam.
    """
    def write_bam(r, ns_handle):
        """Write to the path, returns file ID"""
        outf = tools.fileOps.get_tmp_toil_file()
        outf_h = pysam.Samfile(outf, 'wb', template=ns_handle)
        for rec in r:
            outf_h.write(rec)
        outf_h.close()
        return job.fileStore.writeGlobalFile(outf)

    bam_path = job.fileStore.readGlobalFile(bam_file_id)
    is_paired = bam_is_paired(bam_path)
    job.fileStore.readGlobalFile(bai_file_id, bam_path + '.bai')
    name_sorted = tools.fileOps.get_tmp_toil_file(suffix='name_sorted.bam')
    cmd = [['samtools', 'view', '-b', bam_path] + list(reference_subset),
           ['sambamba', 'sort', '-t', '4', '-m', '15G', '-o', '/dev/stdout', '-n', '/dev/stdin']]
    tools.procOps.run_proc(cmd, stdout=name_sorted)
    ns_handle = pysam.Samfile(name_sorted)
    # this group may come up empty -- check to see if we have at least one mapped read
    try:
        _ = ns_handle.next()
    except StopIteration:
        return None
    # reset file handle to start
    ns_handle = pysam.Samfile(name_sorted)
    filtered_file_ids = []
    r = []
    for qname, reads in itertools.groupby(ns_handle, lambda x: x.qname):
        r.extend(list(reads))
        if len(r) >= num_reads:
            file_id = write_bam(r, ns_handle)
            j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
            filtered_file_ids.append(j.rv())
            r = []
    # do the last bin, if it's non-empty
    if len(r) > 0:
        file_id = write_bam(r, ns_handle)
        j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
        filtered_file_ids.append(j.rv())
    return job.addFollowOnJobFn(merge_filtered_bams, filtered_file_ids, disk=disk_usage, memory='16G').rv()
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)  # keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))
    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")
    # do what you will.. from here
    # spot = SpotResult(chrom='11', start=2215290, end=2215798, svtype="DEL", size=208)
    # spot = SpotResult(chrom='22', start=45964261, end=45965596, svtype="DEL", size=-1)
    # This is what I need to start with
    # spot = SpotResult(chrom="22", start=45963975, end=45964532, svtype="DEL", size=57)
    fh = open("honeymissing.bed")
    for line in fh.readlines():
        data = line.strip().split('\t')
        spot = SpotResult(chrom=data[0], start=int(data[1]), end=int(data[2]),
                          size=int(data[3].split('=')[-1]), svtype="DEL")
        j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
        if j.supportingReadsFilter(spot, bam, args):
            consen = ConsensusCaller(spot, args)
            consen(bam, reference, 'none')
            for i in consen.newSpots:
                i.tags["seqmade"] = True
                print i
            if len(consen.newSpots) == 0:
                spot.tags["noseq"] = True
                print str(spot)
        else:
            spot.tags["filtfail"] = True
            print str(spot)
    # done with test code
    logging.info("Finished testing")