def _transactions_fuzzy_matching(transactions, match):
"""
Runs fuzzy matching on the transactions, by applying a complete linkage
hierarchical clustering algorithm to the set of different itemsets in the
transactions. For clustering, the similarity ratio as given by
fuzzywuzzy.ratio is used as the distance measure
Input:
transactions: list of tuples representing items on each transaction
match: minimum similarity ratio (0 to 100) for clustering
Output:
transactions: new version of the transactions, where each item has been
replaced by the first item on its corresponding cluster
word_clusters: dictionary that maps the cluster for each item
in the transactions
"""
words = set([])
for transaction in transactions:
words |= set(transaction)
words = sorted(words)
l = [((a, b), 100-Levenshtein.ratio(str(a), str(b)))
for a, b in combinations(words, 2)]
d = [value for pair, value in l]
r = linkage(d, 'complete')
clusters_index = fcluster(r, 100-match, "distance")
clusters = {}
for obs_i, cluster_i in enumerate(clusters_index):
if cluster_i in clusters:
clusters[cluster_i].append(words[obs_i])
else:
clusters[cluster_i] = [words[obs_i]]
word_clusters = {word: clusters[clusters_index[i]]
for i, word in enumerate(words)}
new_transactions = []
for transaction in transactions:
new_transaction = tuple(set(([word_clusters[word][0]
for word in transaction])))
new_transactions.append(new_transaction)
return new_transactions, word_clusters
# Web-page residue from the original listing: "Comment list" / "Table of contents".