def filter_bam(input_file, output_file, downweight_number, ctot, gtoa):
"""
Takes a bam file as input and weights the quality of the reads down.
Need to ensure we write the header out :)
Investigate pysam and look for a header,
this should really help us understand how to get this bam filter working
and writing the bam files directly back out to the terminal.
"""
bam = pysam.Samfile(input_file,'rb')
bam_out = pysam.Samfile(output_file, 'wb',template=bam)
for line in bam:
change_bases_c = None
change_bases_t = None
seq = line.seq
qual = line.qual
        if ctot:
            change_bases_c = [check_c_2_t(nuc) and i < downweight_number for i, nuc in enumerate(seq)]
        if gtoa:
            change_bases_t = [check_g_2_a(nuc) and (len(seq) - i) <= downweight_number for i, nuc in enumerate(seq)]
new_qual = downweight_quality(qual,change_bases_c, change_bases_t)
line.qual = new_qual
bam_out.write(line)
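check_c_2_t, check_g_2_a and downweight_quality are defined elsewhere in the original module. A minimal sketch of what they might look like (hypothetical implementations, assuming qualities are handled as the ASCII-encoded string that line.qual returns):

def check_c_2_t(nuc):
    # Hypothetical: a 5'-end C may be a deaminated base sequenced as T; flag C positions.
    return nuc.upper() == 'C'

def check_g_2_a(nuc):
    # Hypothetical: the complementary G>A damage pattern at the 3' end; flag G positions.
    return nuc.upper() == 'G'

def downweight_quality(qual, change_c, change_t, floor_char='!'):
    # Hypothetical: drop flagged positions to the minimum Phred quality ('!' encodes Q0).
    new_qual = list(qual)
    for i in range(len(new_qual)):
        if (change_c and change_c[i]) or (change_t and change_t[i]):
            new_qual[i] = floor_char
    return ''.join(new_qual)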
From hints_db.py in the Comparative-Annotation-Toolkit project (ComparativeGenomicsToolkit):
def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome):
"""
    Make sure that this BAM is actually aligned to this fasta: every sequence shared between the two must have
    the same length. Sequences may exist in the reference that do not exist in the BAM, but not the other way around.
"""
handle = pysam.Samfile(bam_path, 'rb')
    bam_sequences = {(n, s) for n, s in zip(handle.references, handle.lengths)}
difference = bam_sequences - fasta_sequences
if len(difference) > 0:
base_err = 'Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}.'
err = base_err.format(bam_path, genome, ','.join(['-'.join(map(str, x)) for x in difference]))
raise UserException(err)
missing_seqs = fasta_sequences - bam_sequences
if len(missing_seqs) > 0:
base_msg = 'BAM {} does not have the following sequence/length pairs in its header: {}.'
msg = base_msg.format(bam_path, ','.join(['-'.join(map(str, x)) for x in missing_seqs]))
logger.warning(msg)
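A fasta_sequences argument compatible with this check can be built from an indexed FASTA with pysam; a minimal usage sketch (the paths and genome label are placeholders):

fasta = pysam.FastaFile('genome.fa')  # requires the genome.fa.fai index
fasta_sequences = set(zip(fasta.references, fasta.lengths))
validate_bam_fasta_pairs('aligned.bam', fasta_sequences, genome='genome')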
def main(bam):
# get contig2size
#faidx = FastaIndex(fasta)
#contig2size = {c: faidx.id2stats[c][0] for c in faidx}
sam = pysam.Samfile(bam)
contig2size = {c: s for c, s in zip(sam.references, sam.lengths)}
# estimate on largest contigs
    longest_contigs = sorted(contig2size, key=lambda x: contig2size[x], reverse=True)
totdata = [0, 0]
for c in longest_contigs[:25]:
data = bam2matches(bam, regions=[c], mapq=10)
print c, contig2size[c], 1.*data[0] / sum(data), sum(data)
for i in range(2):
totdata[i] += data[i]
data = totdata
print 1.*data[0] / sum(data), sum(data)
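bam2matches is defined elsewhere in the original script; a hypothetical sketch consistent with how main() consumes its return value, counting matched vs mismatched aligned bases from the NM tag for reads above a mapping-quality cutoff:

def bam2matches(bam, regions, mapq=10):
    # Hypothetical stand-in: returns [matches, mismatches] for the given regions.
    sam = pysam.Samfile(bam)
    matches, mismatches = 0, 0
    for region in regions:
        for read in sam.fetch(region):
            if read.is_unmapped or read.mapq < mapq:
                continue
            nm = read.opt('NM') if read.has_tag('NM') else 0
            aligned = sum(length for op, length in read.cigartuples if op == 0)  # M operations
            matches += aligned - nm
            mismatches += nm
    return [matches, mismatches]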
def test_make_split_read_bam_file(self):
sorted_bam = path.join(TEST_DATA_DIR, 'sorted.bam')
with pysam.Samfile(sorted_bam, 'rb') as samfile:
for read in samfile:
if not read.cigarstring:
continue
for breakpoint in (10, 50, 100):
if breakpoint >= read.rlen:
continue
for is_left_split in (True, False):
split_read = make_split_read(read, breakpoint, is_left_split)
cigar_items = list(Cigar(split_read.cigarstring).items())
clipped_item = cigar_items[0] if is_left_split else cigar_items[-1]
min_clip_len = breakpoint if is_left_split else read.rlen - breakpoint # Can be longer if adjacent to another clip.
self.assertGreaterEqual(clipped_item[0], min_clip_len)
self.assertIn(clipped_item[1], ('S', 'H')) # Will be soft-clipped unless already hard-clipped.
def chimeric_reads(bamfile, virus_contig, duplicates):
    igv_chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.igv.bam"
if os.path.exists(igv_chimeric_file):
return igv_chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
for read in in_handle:
try:
chrom = in_handle.getrname(read.tid)
except ValueError:
continue
if not is_chimeric_read(read, chrom, virus_contig):
continue
if read.qname in duplicates:
continue
out_handle.write(read)
return igv_chimeric_file
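is_chimeric_read comes from the surrounding module; a hypothetical sketch of the idea (keep pairs that span the host/virus boundary):

def is_chimeric_read(read, chrom, virus_contig):
    # Hypothetical: chimeric if exactly one end of the pair maps to the virus contig.
    if read.is_unmapped or read.mate_is_unmapped:
        return False
    mate_chrom = read.next_reference_name
    return (chrom == virus_contig) != (mate_chrom == virus_contig)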
def merge_by_key(bam_filenames, key_func, bam_out):
file_cache = tk_cache.FileHandleCache(mode='rb', open_func=pysam.Samfile)
total_reads = 0
heap = []
for bam_filename in bam_filenames:
try:
bam = file_cache.get(bam_filename)
first_read = bam.next()
heapq.heappush(heap, (key_func(first_read), first_read, bam_filename))
except StopIteration:
pass
while len(heap) > 0:
# Get the minimum item and write it to the bam.
key, read, bam_filename = heapq.heappop(heap)
bam = file_cache.get(bam_filename)
bam_out.write(read)
total_reads += 1
# Get the next read from the source bam we just wrote from
# If that BAM file is out of reads, then we leave that one out
try:
next_read = bam.next()
heapq.heappush(heap, (key_func(next_read), next_read, bam_filename))
except StopIteration:
pass
return total_reads
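A typical key_func for coordinate-sorted inputs orders reads by (tid, pos); a minimal usage sketch (file names are placeholders):

def coord_key(read):
    # Sort unmapped reads (tid == -1) after all mapped reads.
    return (read.tid if read.tid >= 0 else float('inf'), read.pos)

template = pysam.Samfile('part0.bam', 'rb')
bam_out = pysam.Samfile('merged.bam', 'wb', template=template)
n_reads = merge_by_key(['part0.bam', 'part1.bam'], coord_key, bam_out)
bam_out.close()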
def create_bam_outfile(file_name, chrom_names, chrom_lengths, template=None, pgs=None, cos=None, rgs=None, replace_rg=False):
""" Creates a bam file with given chromosome names and lengths.
    template is an existing bam file object. If it is specified, chrom_names and chrom_lengths
    are ignored. pgs is a list of dicts, each specifying a 'PG' entry; the ID field is required and the
    PN/CL/PP/DS/VN fields are optional. cos is a list of 'CO' comment entries. rgs is a list of dicts,
    each specifying an 'RG' entry. If replace_rg is True, any existing 'RG' entries are replaced.
    """
if template:
header = template.header
if pgs is not None:
for pg in pgs:
if not header.has_key('PG'):
header['PG'] = []
# add in the PP field based on previous PG entry
if len(header['PG']) > 0:
pp = header['PG'][-1]['ID']
if pp is not None:
pg['PP'] = pp
header['PG'].append(pg)
if cos is not None:
for co in cos:
if not header.has_key('CO'):
header['CO'] = []
header['CO'].append(co)
if rgs is not None:
if replace_rg and header.has_key('RG') and len(rgs) > 0:
header['RG'] = []
for rg in rgs:
if not header.has_key('RG'):
header['RG'] = []
header['RG'].append(rg)
bam_file = pysam.Samfile(file_name, 'wb', header=header)
tids = {name:n for (n, name) in enumerate(template.references)}
else:
header = {'SQ': [{'SN': chrom_names[n], 'LN': chrom_lengths[n]} for n in xrange(len(chrom_names))]}
bam_file = pysam.Samfile(file_name, 'wb', header=header)
tids = {chrom_names[n]:n for n in xrange(len(chrom_names))}
return bam_file, tids
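Minimal usage sketch for the no-template branch, writing one record with the legacy AlignedRead API that matches Samfile (names, lengths and field values are placeholders):

bam_file, tids = create_bam_outfile('out.bam', ['chr1', 'chr2'], [1000000, 2000000])
read = pysam.AlignedRead()
read.qname = 'read1'
read.seq = 'ACGT'   # set seq before qual: assigning seq resets the quality string
read.qual = 'IIII'
read.tid = tids['chr1']
read.pos = 100
read.cigar = [(0, 4)]  # 4M
bam_file.write(read)
bam_file.close()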
def create_bam_infile(file_name):
bam_file = pysam.Samfile(file_name, 'rb')
return bam_file
def create_sam_infile(file_name):
sam_file = pysam.Samfile(file_name, 'r')
return sam_file
def sort_by_name(file_name, sorted_prefix=None):
""" Sorts a bam file by the read name, for paired-end
"""
if sorted_prefix is None:
sorted_prefix = file_name.replace('.bam', '') + '_namesorted'
sorted_name = sorted_prefix + '.bam'
# NOTE -- need to update our internal samtools in order to use pysam.sort
#pysam.sort('-n', file_name, sorted_prefix)
subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix])
return pysam.Samfile(sorted_name, 'rb')
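On recent pysam (0.9+, which wraps samtools 1.x), the commented-out in-process sort works again, with -o replacing the output prefix; a hedged alternative to shelling out (function name is hypothetical):

def sort_by_name_pysam(file_name, sorted_name):
    # Name-sort in-process; assumes a pysam built against samtools 1.x.
    pysam.sort('-n', '-o', sorted_name, file_name)
    return pysam.Samfile(sorted_name, 'rb')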
def filter_multihits(filename, max_hits, verbose, tmp_dir):
"""
Pre-processing function for cleaning up the input bam file.
"""
if verbose:
        print_time_stamp('filtering multi-mapped reads, keeping alignments with fewer than %d hits.' % max_hits)
multiread_set=set()
subprocess.call("samtools view -h %s | awk '{if($2 !~ /_/ && $3 !~ /_/) {print}}' | samtools view -bS - > %s/filter_random.bam" % (filename, tmp_dir), shell=True)
oldfile=pysam.Samfile(tmp_dir + '/filter_random.bam','rb')
new_filename=os.path.abspath(tmp_dir + '/filter%d.bam' % max_hits)
sorted_filename=os.path.abspath(tmp_dir + '/filter%d.sorted.bam' % max_hits)
newfile=pysam.Samfile(new_filename, 'wb', template=oldfile)
for aligned_read in oldfile:
try:
if aligned_read.opt("NH") < max_hits:
newfile.write(aligned_read)
if aligned_read.opt("NH")>1:
multiread_set.add(aligned_read.qname)
except KeyError:
newfile.write(aligned_read)
oldfile.close()
newfile.close()
sort_cmd='samtools sort -T %s/ -o %s %s' % (tmp_dir, sorted_filename, new_filename)
index_cmd='samtools index %s' % sorted_filename
subprocess.call(sort_cmd, shell=True)
subprocess.call(index_cmd, shell=True)
subprocess.call('rm %s/filter_random.bam %s' % (tmp_dir, new_filename), shell=True)
pickle.dump(multiread_set, open(tmp_dir + '/multiread_set.pdata', 'wb'), -1)
return(sorted_filename, multiread_set)
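Minimal usage sketch (paths are placeholders; tmp_dir must already exist and samtools must be on PATH):

sorted_bam, multireads = filter_multihits('input.bam', max_hits=10,
                                          verbose=True, tmp_dir='tmp')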
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
"""DOCSTRING
Args:
Returns:
"""
# file handlers
mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb')
ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]}
if unique_only:
ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
else:
ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')
# read in GTF
gene_annot = read_gtf(gtf_fp)
# iteratively call peaks in each gene
peak_counter = 0
for gene_name in tqdm(gene_annot):
gene = gene_annot[gene_name]
BED = call_gene_peak(bam_dict, gene,
unique_only=unique_only, with_control=with_control,
unstranded=unstranded)
ofile.write(BED)
#print BED
peak_counter += len(BED.split('\n'))
ofile.close()
logger.info('called %i peaks'%peak_counter)
return
def make_bam_handler_dict(ip_bam_list, con_bam_list):
"""DOCSTRING
Args
Returns
"""
bam_dict = defaultdict(list)
try:
ubam_ip, mbam_ip = ip_bam_list
    except ValueError:
ubam_ip = ip_bam_list[0]
mbam_ip = None
for bam_fn in ubam_ip.split(','):
if not os.path.isfile(bam_fn):
raise Exception('%s not found'%bam_fn)
bam_dict['ubam.ip'].append( pysam.Samfile(bam_fn, 'rb') )
if mbam_ip is not None:
for bam_fn in mbam_ip.split(','):
if not os.path.isfile(bam_fn):
raise Exception('%s not found'%bam_fn)
bam_dict['mbam.ip'].append( pysam.Samfile(bam_fn, 'rb') )
if con_bam_list is None:
return bam_dict
try:
ubam_con, mbam_con = con_bam_list
    except ValueError:
ubam_con = con_bam_list[0]
mbam_con = None
for bam_fn in ubam_con.split(','):
if not os.path.isfile(bam_fn):
raise Exception('%s not found'%bam_fn)
bam_dict['ubam.con'].append( pysam.Samfile(bam_fn, 'rb') )
if mbam_con is not None:
for bam_fn in mbam_con.split(','):
if not os.path.isfile(bam_fn):
raise Exception('%s not found'%bam_fn)
bam_dict['mbam.con'].append( pysam.Samfile(bam_fn, 'rb') )
return bam_dict
def main():
usage="%prog [options]" + '\n' + __doc__ + "\n"
parser = OptionParser(usage,version="%prog " + __version__)
parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM format. BAM file should be sorted and indexed.")
parser.add_option("-n","--subset-num",action="store",type="int",dest="subset_num",help="Number of small BAM files")
parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. Output \"Prefix_num.bam\".")
parser.add_option("-s","--skip-unmap",action="store_true",dest="skip_unmap", help="Skip unmapped reads.")
(options,args)=parser.parse_args()
if not (options.input_file and options.subset_num and options.output_prefix):
parser.print_help()
sys.exit(0)
if not os.path.exists(options.input_file):
        print >>sys.stderr, '\n\n' + options.input_file + " does NOT exist" + '\n'
sys.exit(0)
samfile = pysam.Samfile(options.input_file,'rb')
sub_bam = {}
count_bam={}
for i in range(0,options.subset_num):
sub_bam[i] = pysam.Samfile(options.output_prefix + '_' + str(i) +'.bam','wb',template=samfile)
count_bam[i] = 0
total_alignment = 0
print >>sys.stderr, "Dividing " + options.input_file + " ...",
try:
while(1):
aligned_read = samfile.next()
if aligned_read.is_unmapped and options.skip_unmap is True:
continue
total_alignment += 1
tmp = randrange(0,options.subset_num)
sub_bam[tmp].write(aligned_read)
count_bam[tmp] += 1
except StopIteration:
print >>sys.stderr, "Done"
for i in range(0,options.subset_num):
print "%-55s%d" % (options.output_prefix + '_' + str(i) +'.bam', count_bam[i])
def main():
usage="%prog [options]" + '\n' + __doc__ + "\n"
parser = OptionParser(usage,version="%prog " + __version__)
parser.add_option("-i","--input",action="store",type="string",dest="input_file",help="Input BAM file")
parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in BED format. Must be strandard 12-column BED file. [required]")
parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
parser.add_option("-n","--frag-num",action="store",type="int",dest="fragment_num",default=3,help="Minimum number of fragment. default=%default")
(options,args)=parser.parse_args()
if not (options.input_file and options.refgene_bed):
parser.print_help()
sys.exit(0)
if not os.path.exists(options.input_file + '.bai'):
print >>sys.stderr, "cannot find index file of input BAM file"
        print >>sys.stderr, options.input_file + '.bai' + " does not exist"
sys.exit(0)
for file in (options.input_file, options.refgene_bed):
if not os.path.exists(file):
            print >>sys.stderr, file + " does NOT exist" + '\n'
sys.exit(0)
print '\t'.join([str(i) for i in ("chrom","tx_start", "tx_end","symbol","frag_count","frag_mean","frag_median","frag_std")])
for tmp in fragment_size(options.refgene_bed, pysam.Samfile(options.input_file), options.map_qual, options.fragment_num):
print tmp
def __init__(self,inputFile):
'''constructor. input could be bam or sam'''
try:
self.samfile = pysam.Samfile(inputFile,'rb')
if len(self.samfile.header) ==0:
print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
sys.exit(1)
self.bam_format = True
except:
self.samfile = pysam.Samfile(inputFile,'r')
if len(self.samfile.header) ==0:
print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
sys.exit(1)
self.bam_format = False
def fixmate(infile, outfile):
inbam = pysam.Samfile(infile, 'rb')
outbam = pysam.Samfile(outfile, 'wb', header=inbam.header,
referencenames=inbam.references)
qname = None
nTotal = 0
nFixed = 0
    count = 0
reads = []
gc.disable()
for rseq in inbam.fetch(until_eof=True):
nTotal += 1
if qname is None or qname == rseq.qname:
qname = rseq.qname
reads.append(rseq)
else:
count = process(reads, inbam.getrname)
if count > 0:
for r in reads:
outbam.write(r)
nFixed += count
qname = rseq.qname
del reads
reads = [rseq]
if nTotal % 200000 == 0:
            logger.info('%d read(s) processed' % nTotal)
gc.enable()
gc.disable()
count = process(reads, inbam.getrname)
if count > 0:
for r in reads:
outbam.write(r)
nFixed += count
logger.info('%d read(s) processed' % nTotal)
logger.info('%d read(s) fixed and written to file' % nFixed)
inbam.close()
outbam.close()
return (nTotal, nFixed)
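process() is defined elsewhere in the original module; a hypothetical minimal stand-in consistent with how fixmate() uses it (repair mate coordinates within one QNAME group and return the number of records fixed):

def process(reads, getrname):
    # Hypothetical: getrname is accepted only for signature compatibility here.
    # For a simple pair, point each record's mate fields at the other record.
    if len(reads) != 2:
        return 0
    r1, r2 = reads
    r1.rnext, r1.pnext = r2.tid, r2.pos
    r2.rnext, r2.pnext = r1.tid, r1.pos
    return 2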
def genes_alignments(outdir):
genes_align = {}
align_file = pysam.Samfile(outdir+'ORFs_on_genes_align.sam', 'rb')
for rec in align_file:
if rec.cigarstring is not None:
gene_in_contig_len = int(re.findall(r'_len_(\d+)', rec.reference_name)[0])
total_match = sum([i[1] for i in rec.cigar if i[0] == 0])
conf = float(re.findall(r'_conf_([0-9]*\.[0-9]*)', rec.reference_name)[0])
            if total_match / gene_in_contig_len > 0.98:
if genes_align.get(rec.reference_name) is None:
genes_align[rec.reference_name] = conf, set()
genes_align[rec.reference_name][1].add(rec.qname)
return genes_align
From hints_db.py in the Comparative-Annotation-Toolkit project (ComparativeGenomicsToolkit):
def bam_is_paired(bam_path, num_reads=20000, paired_cutoff=0.75):
"""
Infers the paired-ness of a bam file.
"""
sam = pysam.Samfile(bam_path)
count = 0
for rec in itertools.islice(sam, num_reads):
if rec.is_paired:
count += 1
    if tools.mathOps.format_ratio(count, num_reads) > paired_cutoff:
return True
elif tools.mathOps.format_ratio(count, num_reads) < 1 - paired_cutoff:
return False
else:
raise UserException("Unable to infer pairing from bamfile {}".format(bam_path))
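Minimal usage sketch (the path is a placeholder):

if bam_is_paired('aligned.bam', num_reads=20000, paired_cutoff=0.75):
    mode = 'paired-end'
else:
    mode = 'single-end'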
From misc.py in the Comparative-Annotation-Toolkit project (ComparativeGenomicsToolkit):
def is_bam(path):
"""Checks if a path is a BAMfile"""
try:
pysam.Samfile(path)
except IOError:
raise RuntimeError('Path {} does not exist'.format(path))
except ValueError:
return False
return True
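Usage sketch for screening a mixed list of inputs (candidate_paths is a placeholder):

candidate_paths = ['aligned.bam', 'reads.fastq']
bam_paths = [p for p in candidate_paths if is_bam(p)]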
def renamereads(inbamfn, outbamfn):
inbam = pysam.Samfile(inbamfn, 'rb')
outbam = pysam.Samfile(outbamfn, 'wb', template=inbam)
paired = {}
n = 0
p = 0
u = 0
w = 0
m = 0
for read in inbam.fetch(until_eof=True):
n += 1
if read.is_paired:
p += 1
if read.qname in paired:
uuid = paired[read.qname]
del paired[read.qname]
read.qname = uuid
outbam.write(read)
w += 1
m += 1
else:
newname = str(uuid4())
paired[read.qname] = newname
read.qname = newname
outbam.write(read)
w += 1
else:
u += 1
read.qname = str(uuid4())
outbam.write(read)
w += 1
outbam.close()
inbam.close()
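Usage sketch; the uuid4-based names keep read names unique when BAMs are later merged or spiked together (paths are placeholders):

renamereads('original.bam', 'renamed.bam')
pysam.index('renamed.bam')  # only valid if renamed.bam is coordinate-sorted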
def removeReadsOverlappingHetRegion(inbamfn, bedfn,outbamfn,path):
print "___ removing reads overlapping heterozygous region ___"
inbamsorted = sub('.bam$','.sorted',inbamfn)
pysam.sort(inbamfn, inbamsorted)
pysam.index(inbamsorted+'.bam')
alignmentfile = pysam.AlignmentFile(inbamsorted+'.bam', "rb" )
outbam = pysam.Samfile(outbamfn, 'wb', template=alignmentfile )
bedfile = open(bedfn, 'r')
for bedline in bedfile:
c = bedline.strip().split()
if (len(c) == 3 ):
chr2 = c[0]
chr = c[0].strip("chr")
start = int(c[1])
end = int(c[2])
else :
continue
try:
readmappings = alignmentfile.fetch(chr2, start, end)
        except ValueError:
            print("problem fetching reads in region %s:%d-%d" % (chr2, start, end))
            continue
for shortread in readmappings:
try:
outbam.write(shortread)
except ValueError as e:
print ("problem removing read :" + shortread.qname)
outbamsorted = sub('.bam$','.sorted',outbamfn)
pysam.sort(outbamfn, outbamsorted)
bamDiff(inbamsorted+'.bam', outbamsorted +'.bam', path )
outbam.close()
def pileup(bamfile_name):
samfile = pysam.Samfile(bamfile_name,"rb")
output_pileup = open("test.cov", "w")
sum_cov=0
for pos_pileup in samfile.pileup():
sum_cov+=pos_pileup.n
if not pos_pileup.pos % 1e3:
av_cov=int(sum_cov/1e3)
sum_cov=0
print(pos_pileup.tid,pos_pileup.pos,av_cov,file=output_pileup)
def buildcov(self):
"""
Builds Barcode counts of the reads aligned to the genome
input : bam_file and bed_file
output : Returns bed file with Chr, start, end, barcode, no of occurences, strand
"""
before = datetime.now()
        barcode_pattern = re.compile(r'BX:[A-Z]?:[ACGT]*-[0-9]')
        self.__LOGGER.info("WARNING: dropping read alignments that are not barcoded")
output_bed = open(self.barcode_bed,"wb")
with open(self.bed_file) as regions:
with pysam.Samfile(self.obam_file, "rb") as samfile:
for line in regions:
chr, start, end , info = line.rstrip('\n').rstrip('\r').split("\t")
b_dict = {}
try:
readsinregion=samfile.fetch(chr,int(start),int(end))
for read in readsinregion:
tags = dict(read.tags)
if 'BX' in tags.keys():
if tags['BX'] in b_dict: b_dict[tags['BX']] += 1
else: b_dict[tags['BX']] = 1
except:
self.__LOGGER.info("Skipping region : {0}:{1}:{2}".format(chr,start,end))
continue
for k, v in b_dict.items():
output_bed.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chr,start,end,info,k,v))
regions.close()
samfile.close()
output_bed.close()
after = datetime.now()
self.__LOGGER.info('Computed Barcode profiles for the vcf regions completed in {0}'.format(str(after - before)))
def load_samfile(sam_file):
"""To load an indexed bam file"""
ftype = sam_file.split(".")[-1]
if ftype != "bam" and ftype != "sam":
print("Error: file type need suffix of bam or sam.")
sys.exit(1)
# print("Loading a %s" %ftype + " with file name: %s" %sam_file)
if ftype == "bam":
samfile = pysam.Samfile(sam_file, "rb")
else:
samfile = pysam.Samfile(sam_file, "r")
return samfile
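Minimal usage sketch (the path is a placeholder; region queries need the matching .bai index):

samfile = load_samfile('aligned.bam')
n_reads = sum(1 for read in samfile.fetch('chr1', 0, 10000))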
def run(self):
try:
proc_name = self.name
#Open the bams
nBams = [] # nonTrim Bams
tBams = [] # trim Bams
for i in self.args.bam:
nBams.append(pysam.Samfile(i))
for i in self.args.pacBam:
tBams.append(pysam.Samfile(i))
while True:
next_task = self.task_queue.get()
if next_task is None:
# Poison pill means shutdown
logging.info('Thread %s: Exiting\n' % proc_name)
self.task_queue.task_done()
break
try:
answer = next_task(nBams, tBams)
except Exception as e:
logging.error("Exception raised in task %s" % (str(e)))
self.task_queue.task_done()
self.result_queue.put("Failure - UNK - %s" % str(e))
logging.info("fail in groupid=%s" % next_task.data.name)
continue
self.task_queue.task_done()
self.result_queue.put(answer)
return
except Exception as e:
logging.error("Consumer %s Died\nERROR: %s" % (self.name, e))
return
def mapTails(bam, args):
if bam.endswith('.bam'):
bam = pysam.Samfile(bam,'rb')
elif bam.endswith('.sam'):
bam = pysam.Samfile(bam)
else:
logging.error("Cannot open input file! %s" % (bam))
exit(1)
logging.info("Extracting tails")
tailfastq = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False, dir=args.temp)
tailfastq.close(); tailfastq = tailfastq.name
r, t, m = extractTails(bam, outFq=tailfastq, minLength=args.minTail)
logging.info("Parsed %d reads" % (r))
logging.info("Found %d tails" % (t))
logging.info("%d reads had double tails" % (m))
if t == 0:
logging.info("No tails -- Exiting")
exit(0)
tailmap = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, dir=args.temp)
tailmap.close(); tailmap = tailmap.name
callBlasr(tailfastq, args.reference, args.params, args.nproc, tailmap)
bam.close() #reset
return tailmap