def impute_missing_bins(hic_matrix, regions=None, per_chromosome=True, stat=np.ma.mean):
"""
Impute missing contacts in a Hi-C matrix.
For inter-chromosomal data uses the mean of all inter-chromosomal contacts,
for intra-chromosomal data uses the mean of intra-chromosomal counts at the corresponding diagonal.
:param hic_matrix: A square numpy array
:param regions: A list of :class:`~GenomicRegion`s - if omitted, will create a dummy list
:param per_chromosome: Do imputation on a per-chromosome basis (recommended)
:param stat: The aggregation statistic to be used for imputation, defaults to the mean.
"""
if regions is None:
for i in range(hic_matrix.shape[0]):
regions.append(GenomicRegion(chromosome='', start=i, end=i))
chr_bins = dict()
for i, region in enumerate(regions):
if region.chromosome not in chr_bins:
chr_bins[region.chromosome] = [i, i]
else:
chr_bins[region.chromosome][1] = i
n = len(regions)
if not hasattr(hic_matrix, "mask"):
hic_matrix = masked_matrix(hic_matrix)
imputed = hic_matrix.copy()
if per_chromosome:
for c_start, c_end in chr_bins.itervalues():
# Correcting intrachromoc_startmal contacts by mean contact count at each diagonal
for i in range(c_end - c_start):
ind = kth_diag_indices(c_end - c_start, -i)
diag = imputed[c_start:c_end, c_start:c_end][ind]
diag[diag.mask] = stat(diag)
imputed[c_start:c_end, c_start:c_end][ind] = diag
# Correcting interchromoc_startmal contacts by mean of all contact counts between
# each set of chromoc_startmes
for other_start, other_end in chr_bins.itervalues():
# Only correct upper triangle
if other_start <= c_start:
continue
inter = imputed[c_start:c_end, other_start:other_end]
inter[inter.mask] = stat(inter)
imputed[c_start:c_end, other_start:other_end] = inter
else:
for i in range(n):
diag = imputed[kth_diag_indices(n, -i)]
diag[diag.mask] = stat(diag)
imputed[kth_diag_indices(n, -i)] = diag
# Copying upper triangle to lower triangle
imputed[np.tril_indices(n)] = imputed.T[np.tril_indices(n)]
return imputed
评论列表
文章目录