def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
"""DOCSTRING
Args:
Returns:
"""
# file handlers
mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb')
ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]}
if unique_only:
ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
else:
ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')
# read in GTF
logger.info('read gtf from "%s" '% fn)
gene_annot = read_gtf(gtf_fp)
# fetch tag counts in each gene
## gene_name: { 'ip': bin_interval_ip, 'con': bin_interval_con }
logger.info('reading gene counts')
gene_count = defaultdict(dict)
for gene_name in tqdm(gene_annot):
gene = gene_annot[gene_name]
bin_interval_ip, bin_interval_con = \
gene_to_count(bam_dict, gene,
unique_only=unique_only, with_control=with_control,
unstranded=unstranded)
if bin_interval_ip is None: ## if no IP reads in this gene
continue
gene_count[gene_name]['ip'] = bin_interval_ip
gene_count[gene_name]['con'] = bin_interval_con
# estimate global overdispersion param. for each dataset
logger.info('estimating dispersion parameter')
alpha_ip_vec = estim_dispersion_param(len(bam_dict['ubam.ip']), gene_count, 'ip' )
if with_control:
alpha_con_vec = estim_dispersion_param(len(bam_dict['ubam.con']), gene_count, 'con' )
else:
alpha_con_vec = np.asarray(alpha_ip_vec)
# perform statistical test
logger.info('calling peaks')
for gene_name in gene_to_count:
gene = gene_annot[gene_name]
BED = test_gene_bin(gene, gene_count[gene_name]['ip'], gene_count[gene_name]['con'],
alpha_ip_vec, alpha_con_vec)
ofile.write(BED)
peak_counter += len(BED.split('\n'))
ofile.close()
logger.info('called %i peaks'%peak_counter)
return
评论列表
文章目录