hu2004.py 文件源码-python代码片段

def _transactions_fuzzy_matching(transactions, match):
    """
    Runs fuzzy matching on the transactions, by applying a complete linkage
    hierarchical clustering algorithm to the set of different itemsets in the
    transactions. For clustering, the similarity ratio as given by
    fuzzywuzzy.ratio is used as the distance measure
    Input:
        transactions: list of tuples representing items on each transaction
        match: minimum similarity ratio (0 to 100) for clustering
    Output:
        transactions: new version of the transactions, where each item has been
                      replaced by the first item on its corresponding cluster
        word_clusters: dictionary that maps the cluster for each item
        in the transactions
    """
    words = set([])
    for transaction in transactions:
        words |= set(transaction)
    words = sorted(words)
    l = [((a, b), 100-Levenshtein.ratio(str(a), str(b)))
         for a, b in combinations(words, 2)]
    d = [value for pair, value in l]
    r = linkage(d, 'complete')
    clusters_index = fcluster(r, 100-match, "distance")
    clusters = {}
    for obs_i, cluster_i in enumerate(clusters_index):
        if cluster_i in clusters:
            clusters[cluster_i].append(words[obs_i])
        else:
            clusters[cluster_i] = [words[obs_i]]

    word_clusters = {word: clusters[clusters_index[i]]
                     for i, word in enumerate(words)}
    new_transactions = []
    for transaction in transactions:
        new_transaction = tuple(set(([word_clusters[word][0]
                                      for word in transaction])))
        new_transactions.append(new_transaction)
    return new_transactions, word_clusters