distance_based.py 文件源码-python代码片段

def cluster(target_sequence_ids, fasta_filename, method='average'):
    """ Form distance-based hierachical clustering of sequences.

    Looks up each entry in target_sequence_ids in the file 
    specified by fasta_filename to obtain an associated DNA 
    sequence. 

    In principle, we could just work with the Hamming distance, but 
    the sequences may be of different lengths (mostly small 
    differences.) So we need a more sophisticated approach: we use
    pairwise global alignment, scoring 0 for a match, -1 for mismatch,
    and -1.5 for opening or extending a gap. We then take the distance
    to be -1.0*(score). 

    UPGMA clustering is used when method='average', the default.

    Returns the distance matrix and the linkage matrix returned
    by the clustering routine.

    """
    # globalms arguments: seq1, seq2, match, mismatch, open, extend
    distance = lambda seq1, seq2: -1.0*(
        pairwise2.align.globalms(seq1,seq2,0,-1,-1.5,-1.5, score_only=True)
    )
    sequences = fasta_to_dict(fasta_filename)
    N = len(target_sequence_ids)
    distances = np.zeros((N,N))
    # fill in the upper triangle
    for i,seqid1 in enumerate(target_sequence_ids):
        seq1 = sequences[seqid1]
        for j_offset, seqid2 in enumerate(target_sequence_ids[i+1:]):
            j = j_offset + i + 1
            seq2 = sequences[seqid2]
            distances[i][j] = distance(seq1, seq2)
    # convert to the form expected by the scipy clustering routines
    y = squareform(distances,checks=False)
    return distances, hierarchy.linkage(y,method)