def merge_reads(bams, merged_output_dir):
"""
Merge bam files, can take odd numbers of reads.
"""
try:
os.mkdir(merged_output_dir)
except OSError:
pass
bam_list = get_bam_pairs(bams)
for sample_id, bams in bam_list.items():
header = get_header(bams[0])
bam_out = os.path.join(merged_output_dir, os.path.basename(sample_id)) + '.bam'
out = pysam.AlignmentFile(bam_out, 'wb', header=header)
for bam in bams:
sam = pysam.AlignmentFile(bam, 'rb')
for read in sam:
out.write(read)
out.close()
pysam.sort(bam_out, 'tmp')
os.rename('tmp.bam',bam_out)
pysam.index(bam_out)
python类sort()的实例源码
def sort(file_name, sorted_prefix=None):
""" Sorts and indexes the bam file given by file_name.
"""
if sorted_prefix is None:
sorted_prefix = file_name.replace('.bam', '') + '_sorted'
sorted_name = sorted_prefix + ".bam"
subprocess.check_call(['samtools','sort', '-o', sorted_name, file_name])
def sort_and_index(file_name, sorted_prefix=None):
""" Sorts and indexes the bam file given by file_name.
"""
if sorted_prefix is None:
sorted_prefix = file_name.replace('.bam', '') + '_sorted'
sorted_name = sorted_prefix + '.bam'
subprocess.check_call(['samtools','sort', '-o', sorted_name, file_name])
pysam.index(sorted_name)
def sort_by_name(file_name, sorted_prefix=None):
""" Sorts a bam file by the read name, for paired-end
"""
if sorted_prefix is None:
sorted_prefix = file_name.replace('.bam', '') + '_namesorted'
sorted_name = sorted_prefix + '.bam'
# NOTE -- need to update our internal samtools in order to use pysam.sort
#pysam.sort('-n', file_name, sorted_prefix)
subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix])
return pysam.Samfile(sorted_name, 'rb')
def _save_dict(bed, out_fname, val_index=None):
"""Save data from dict to BED file."""
sites = pybedtools.BedTool(
_iter_bed_dict(bed, val_index=val_index)
).saveas()
sites1 = sites.sort().saveas(out_fname)
return sites1
def find_roi_bam(chromosome_event):
chr,event = chromosome_event .split("_")
roi,sortbyname,sortbyCoord, hetsnp = init_file_names(chr, event, tmpbams_path, haplotype_path)
exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi_"+ chr +'.bed'])
success = True
try:
if not terminating.is_set():
roisort = sub('.bam$', '.sorted', roi)
if(os.path.isfile(exonsinroibed)):
cmd=" ".join(["sort -u", exonsinroibed, "-o", exonsinroibed]); runCommand(cmd)
extractPairedReadfromROI(sortbyname, exonsinroibed, roi)
removeIfEmpty(tmpbams_path,ntpath.basename(roi))
pysam.sort(roi ,roisort )
pysam.index(roisort+'.bam')
os.remove(roi)
except (KeyboardInterrupt):
logger.error('Exception Crtl+C pressed in the child process in find_roi_bam for chr ' +chr + event)
terminating.set()
success=False
return
except Exception as e:
logger.exception("Exception in find_roi_bam %s" ,e )
terminating.set()
success=False
return
if(success):
logger.debug("find_roi_bam complete successfully for "+chr + event)
return
def removeReadsOverlappingHetRegion(inbamfn, bedfn,outbamfn,path):
print "___ removing reads overlapping heterozygous region ___"
inbamsorted = sub('.bam$','.sorted',inbamfn)
pysam.sort(inbamfn, inbamsorted)
pysam.index(inbamsorted+'.bam')
alignmentfile = pysam.AlignmentFile(inbamsorted+'.bam', "rb" )
outbam = pysam.Samfile(outbamfn, 'wb', template=alignmentfile )
bedfile = open(bedfn, 'r')
for bedline in bedfile:
c = bedline.strip().split()
if (len(c) == 3 ):
chr2 = c[0]
chr = c[0].strip("chr")
start = int(c[1])
end = int(c[2])
else :
continue
try:
readmappings = alignmentfile.fetch(chr2, start, end)
except ValueError as e:
print("problem fetching the read ")
for shortread in readmappings:
try:
outbam.write(shortread)
except ValueError as e:
print ("problem removing read :" + shortread.qname)
outbamsorted = sub('.bam$','.sorted',outbamfn)
pysam.sort(outbamfn, outbamsorted)
bamDiff(inbamsorted+'.bam', outbamsorted +'.bam', path )
outbam.close()
def find_roi_amp(chromosome_event):
chr, event = chromosome_event.split("_")
roi, sortbyname, sortbyCoord, hetsnp = init_file_names(chr, event, tmpbams_path, haplotype_path)
exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi_" + chr + '.bed'])
success = True
try:
if not terminating.is_set():
roisort = sub('.bam$', '.sorted', roi)
if (os.path.isfile(exonsinroibed)):
cmd = " ".join(["sort -u", exonsinroibed, "-o", exonsinroibed]);
runCommand(cmd)
extractPairedReadfromROI(sortbyname, exonsinroibed, roi)
removeIfEmpty(tmpbams_path, ntpath.basename(roi))
pysam.sort(roi, roisort)
pysam.index(roisort + '.bam')
os.remove(roi)
except (KeyboardInterrupt):
logger.error('Exception Crtl+C pressed in the child process in find_roi_bam for chr ' + chr + event)
terminating.set()
success = False
return
except Exception as e:
logger.exception("Exception in find_roi_bam %s", e)
terminating.set()
success = False
return
if (success):
logger.debug("find_roi_bam complete successfully for " + chr + event)
return
def _sort_index(unsorted, output_bam):
pysam.sort("-o", output_bam, "-m", PYSAM_SORT_MEM, unsorted)
os.unlink(unsorted)
pysam.index(output_bam)
def realigner(out_dir, tmp_dir, winsize=50, unstranded=False):
"""DOCSTRING
Args:
Returns:
"""
# file handlers
mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'),'rb')
ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam)
chr_list=[x['SN'] for x in ubam.header['SQ']]
# construct the mread_dict; this will be needed throughout
mread_dict = defaultdict(list)
for alignment in mbam:
mread_dict[alignment.qname].append(alignment)
# keep a record of processed reads
processed_mreads = set()
# iterate through all mreads
for read_qname in mread_dict:
if read_qname in processed_mreads:
continue
## construct the fully-connected subgraph for each read
read_to_locations, processed_mreads = \
construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=winsize, unstranded=unstranded)
subgraph = set()
for read in read_to_locations:
_ = map(subgraph.add, read_to_locations[read].keys())
subgraph = list(subgraph)
## build the BIT tracks
node_track, multi_reads_weights = \
construct_BIT_track(subgraph, read_to_locations, ubam, unstranded)
## run EM
multi_reads_weights = \
run_EM(node_track, multi_reads_weights, w=winsize)
## write to obam
for read in multi_reads_weights:
for node in multi_reads_weights[read]:
alignment = read_to_locations[read][node]
score = round(multi_reads_weights[read][node][0], 3)
alignment.set_tag('AS', score)
alignment.set_tag('PG', 'CLAM')
obam.write(alignment)
# sort the final output
logger.info('sorting output')
obam.close()
ubam.close()
mbam.close()
obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam')
pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam'))
pysam.index(obam_sorted_fn)
os.remove(os.path.join(out_dir, 'realigned.bam'))
return
def filter_bam_maxtags(obam_fn, ibam_fn, max_tags=1):
"""DOCSTRING
Args
Returns
"""
assert max_tags>0
# prepare files
ibam = pysam.Samfile(ibam_fn, 'rb')
obam = pysam.Samfile(obam_fn, 'wb', template=ibam)
# init
collapse_dict = defaultdict(list)
chr_list=[x['SN'] for x in ibam.header['SQ']]
input_counter = 0
output_counter = 0
for chr in chr_list:
# empty stack for each new chromosome
stack = []
last_pos = -1
for read in ibam.fetch(chr):
input_counter += 1
if not (input_counter % (5*(10**6)) ):
logger.debug('collapsed %i alignments'%input_counter)
if read.positions[0] > last_pos:
new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags)
output_counter += len(new_alignment_list)
last_pos = read.positions[0]
stack = [read]
for new_alignment in new_alignment_list:
new_alignment.query_sequence = '*'
new_alignment.query_qualities = '0'
_ = obam.write(new_alignment)
else:
stack.append(read)
new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags)
output_counter += len(new_alignment_list)
last_pos = read.positions[0]
for new_alignment in new_alignment_list:
new_alignment.query_sequence = '*'
new_alignment.query_qualities = '0'
_ = obam.write(new_alignment)
ibam.close()
obam.close()
#os.rename(obam_fn, ibam_fn)
#pysam.sort(obam_fn)
pysam.index(obam_fn)
logger.info('Input = %s; Output = %s; Redundancy = %.2f'%(input_counter,output_counter, 1-float(output_counter)/input_counter))
return
def initialize(results_path,haplotype_path,cancer_dir_path):
try:
event_list=['gain','loss']
gaincnv = params.GetGainCNV()
losscnv = params.GetLossCNV()
logger.debug(' --- Initializing input files --- ')
vcf_path = bamhelp.GetVCF()
exons_path = bamhelp.GetExons()
reference_path = bamhelp.GetRef()
vpath, vcf = os.path.split(vcf_path)
phasedvcf = "/".join([results_path, sub('.vcf$', '_phased.vcf.gz', vcf)])
vcftobed = "/".join([results_path, sub('.vcf$', '.bed', vcf)])
hap1vcf = "/".join([results_path,"hap1_het.vcf"])
hap2vcf = "/".join([results_path, "hap2_het.vcf"])
hap1vcffiltered = "/".join([results_path, "hap1_het_filtered"])
hap2vcffiltered = "/".join([results_path, "hap2_het_filtered"])
hap1vcffilteredtobed = "/".join([results_path, "hap1_het_filtered.bed"])
hap2vcffilteredtobed = "/".join([results_path, "hap2_het_filtered.bed"])
phased_bed = "/".join([results_path, "PHASED.BED"])
phaseVCF(vcf_path, phasedvcf)
getVCFHaplotypes(phasedvcf, hap1vcf, hap2vcf)
thinVCF(hap1vcf, hap1vcffiltered)
thinVCF(hap2vcf, hap2vcffiltered)
convertvcftobed(hap1vcffiltered+".recode.vcf", hap1vcffilteredtobed)
convertvcftobed(hap2vcffiltered+".recode.vcf", hap2vcffilteredtobed)
cmd1 = """sed -i 's/$/\thap1/' """+ hap1vcffilteredtobed
cmd2 = """sed -i 's/$/\thap2/' """+ hap2vcffilteredtobed
cmd3 = "cat " + hap1vcffilteredtobed + " " + hap2vcffilteredtobed + " > " + 'tmp.bed'
cmd4 = "sort -V -k1,1 -k2,2 tmp.bed > " + phased_bed
runCommand(cmd1)
runCommand(cmd2)
runCommand(cmd3)
runCommand(cmd4)
os.remove('tmp.bed')
for event in event_list:
roibed = "/".join([haplotype_path, event + "_roi.bed"])
exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi.bed"])
nonhetbed = "/".join([haplotype_path, event + "_non_het.bed"])
hetbed = "/".join([haplotype_path, event + "_het.bed"])
hetsnpbed = "/".join([haplotype_path, event + "_het_snp.bed"])
if(locals()[event + 'cnv']):
intersectBed( exons_path, locals()[event + 'cnv'], exonsinroibed, wa=True)
intersectBed(phased_bed, exonsinroibed, hetsnpbed, wa=True)
splitBed(exonsinroibed, event+'_exons_in_roi_')
splitBed(hetsnpbed, event+'_het_snp_')
except:
logger.exception("Initialization error !")
raise
logger.debug("--- initialization complete ---")
return
def initialize_amp(results_path, haplotype_path, cancer_dir_path):
try:
event_list = ['gain', 'loss']
gaincnv = params.GetGainCNV()
losscnv = params.GetLossCNV()
logger.debug(' --- Initializing input files --- ')
vcf_path = bamhelp.GetVCF()
exons_path = bamhelp.GetExons()
reference_path = bamhelp.GetRef()
vpath, vcf = os.path.split(vcf_path)
phasedvcf = "/".join([results_path, sub('.vcf$', '_phased.vcf.gz', vcf)])
vcftobed = "/".join([results_path, sub('.vcf$', '.bed', vcf)])
hap1vcf = "/".join([results_path, "hap1_het.vcf"])
hap2vcf = "/".join([results_path, "hap2_het.vcf"])
hap1vcffiltered = "/".join([results_path, "hap1_het_filtered"])
hap2vcffiltered = "/".join([results_path, "hap2_het_filtered"])
hap1vcffilteredtobed = "/".join([results_path, "hap1_het_filtered.bed"])
hap2vcffilteredtobed = "/".join([results_path, "hap2_het_filtered.bed"])
phased_bed = "/".join([results_path, "PHASED.BED"])
phaseVCF(vcf_path, phasedvcf)
getVCFHaplotypes(phasedvcf, hap1vcf, hap2vcf)
thinVCF(hap1vcf, hap1vcffiltered)
thinVCF(hap2vcf, hap2vcffiltered)
convertvcftobed(hap1vcffiltered + ".recode.vcf", hap1vcffilteredtobed)
convertvcftobed(hap2vcffiltered + ".recode.vcf", hap2vcffilteredtobed)
cmd1 = """sed -i 's/$/\thap1/' """ + hap1vcffilteredtobed
cmd2 = """sed -i 's/$/\thap2/' """ + hap2vcffilteredtobed
cmd3 = "cat " + hap1vcffilteredtobed + " " + hap2vcffilteredtobed + " > " + 'tmp.bed'
cmd4 = "sort -V -k1,1 -k2,2 tmp.bed > " + phased_bed
runCommand(cmd1)
runCommand(cmd2)
runCommand(cmd3)
runCommand(cmd4)
os.remove('tmp.bed')
for event in event_list:
roibed = "/".join([haplotype_path, event + "_roi.bed"])
exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi.bed"])
nonhetbed = "/".join([haplotype_path, event + "_non_het.bed"])
hetbed = "/".join([haplotype_path, event + "_het.bed"])
hetsnpbed = "/".join([haplotype_path, event + "_het_snp.bed"])
if (locals()[event + 'cnv']):
intersectBed(exons_path, locals()[event + 'cnv'], exonsinroibed, wa=True)
intersectBed(phased_bed, exonsinroibed, hetsnpbed, wa=True)
splitBed(exonsinroibed, event + '_exons_in_roi_')
splitBed(hetsnpbed, event + '_het_snp_')
except:
logger.exception("Initialization error !")
raise
logger.debug("--- initialization complete ---")
return