def get_subtrees0(d, bin_chr, bin_position, method="ward", nchrom=1000, distfrac=0.4):
"""Recomputing linkage is slower than pruning the tree for large matrices"""
maxtdist = 0
i = 0
subtrees = []
for i in range(1, nchrom):
Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
names = get_names(bin_chr, bin_position)
t = distance_matrix2tree(Z, names)
#tree = sch.to_tree(Z, False); t = ete3.Tree(getNewick(tree, "", tree.dist, names))
# get longest branch
n, bestdist = get_longest(t)
tname, tdist = t.get_farthest_leaf()#[1]
if maxtdist < tdist:
maxtdist = t.get_farthest_leaf()[1]
# break if small subtree
if tdist / maxtdist < 1.1 * bestdist / tdist or tdist < maxtdist*distfrac:
break
pruned = n.get_leaf_names()
subtrees.append(pruned)
c = Counter(get_chr_name(n) for n in pruned)
print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)
t = truncate(t, maxd=5); t.render('tree_%s.pdf'%i)
# prune array
indices = np.array([False if name in set(pruned) else True for _i, name in enumerate(names)])
d = d[indices, :]
d = d[:, indices]
bin_chr = bin_chr[indices]
bin_position = bin_position[indices, :]
if i:
subtrees.append(t.get_leaf_names())
pruned = t.get_leaf_names()
c = Counter(get_chr_name(n) for n in pruned)
print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)
return subtrees
评论列表
文章目录