import os
import pickle
import subprocess

import pysam


def filter_multihits(filename, max_hits, verbose, tmp_dir):
    """
    Pre-process the input BAM file: drop alignments on random/unplaced
    contigs, keep only reads with fewer than max_hits reported alignments
    (NH tag), and record the names of multi-mapped reads. Returns the path
    to the sorted, indexed BAM and the set of multi-mapped read names.
    """
    if verbose:
        # print_time_stamp is a logging helper defined elsewhere in this module.
        print_time_stamp('filtering multi-mapped up to %d hits.' % max_hits)
    multiread_set = set()
    # Drop references whose names contain '_' (e.g. chrUn_* and *_random):
    # field 2 catches @SQ header lines (SN:<name>), field 3 catches the
    # RNAME column of alignment lines.
    subprocess.call(
        "samtools view -h %s | awk '{if($2 !~ /_/ && $3 !~ /_/) {print}}' "
        "| samtools view -bS - > %s/filter_random.bam" % (filename, tmp_dir),
        shell=True)
    oldfile = pysam.Samfile(tmp_dir + '/filter_random.bam', 'rb')
    new_filename = os.path.abspath(tmp_dir + '/filter%d.bam' % max_hits)
    sorted_filename = os.path.abspath(tmp_dir + '/filter%d.sorted.bam' % max_hits)
    newfile = pysam.Samfile(new_filename, 'wb', template=oldfile)
    for aligned_read in oldfile:
        try:
            if aligned_read.opt("NH") < max_hits:
                newfile.write(aligned_read)
                # Remember the names of kept reads that map to more than
                # one location.
                if aligned_read.opt("NH") > 1:
                    multiread_set.add(aligned_read.qname)
        except KeyError:
            # No NH tag present: keep the read and treat it as uniquely mapped.
            newfile.write(aligned_read)
    oldfile.close()
    newfile.close()
    # Coordinate-sort and index the filtered BAM, then remove intermediates.
    sort_cmd = 'samtools sort -T %s/ -o %s %s' % (tmp_dir, sorted_filename, new_filename)
    index_cmd = 'samtools index %s' % sorted_filename
    subprocess.call(sort_cmd, shell=True)
    subprocess.call(index_cmd, shell=True)
    subprocess.call('rm %s/filter_random.bam %s' % (tmp_dir, new_filename), shell=True)
    # Persist the multi-mapped read names for downstream steps
    # (-1 selects pickle.HIGHEST_PROTOCOL).
    with open(tmp_dir + '/multiread_set.pdata', 'wb') as f:
        pickle.dump(multiread_set, f, -1)
    return sorted_filename, multiread_set
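
# Hypothetical usage, as a minimal sketch: the input path, hit cap, and
# temporary directory below are illustrative values, not taken from the
# function above. Requires samtools on PATH and an existing tmp_dir.
#
#   sorted_bam, multireads = filter_multihits('input.bam', max_hits=100,
#                                             verbose=True, tmp_dir='tmp')
#   print('%s written; %d multi-mapped read names recorded'
#         % (sorted_bam, len(multireads)))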