def cluster(target_sequence_ids, fasta_filename, method='average'):
""" Form distance-based hierachical clustering of sequences.
Looks up each entry in target_sequence_ids in the file
specified by fasta_filename to obtain an associated DNA
sequence.
In principle, we could just work with the Hamming distance, but
the sequences may be of different lengths (mostly small
differences.) So we need a more sophisticated approach: we use
pairwise global alignment, scoring 0 for a match, -1 for mismatch,
and -1.5 for opening or extending a gap. We then take the distance
to be -1.0*(score).
UPGMA clustering is used when method='average', the default.
Returns the distance matrix and the linkage matrix returned
by the clustering routine.
"""
# globalms arguments: seq1, seq2, match, mismatch, open, extend
distance = lambda seq1, seq2: -1.0*(
pairwise2.align.globalms(seq1,seq2,0,-1,-1.5,-1.5, score_only=True)
)
sequences = fasta_to_dict(fasta_filename)
N = len(target_sequence_ids)
distances = np.zeros((N,N))
# fill in the upper triangle
for i,seqid1 in enumerate(target_sequence_ids):
seq1 = sequences[seqid1]
for j_offset, seqid2 in enumerate(target_sequence_ids[i+1:]):
j = j_offset + i + 1
seq2 = sequences[seqid2]
distances[i][j] = distance(seq1, seq2)
# convert to the form expected by the scipy clustering routines
y = squareform(distances,checks=False)
return distances, hierarchy.linkage(y,method)
评论列表
文章目录