def compute_affinity(item):
    """Cluster a document's token embeddings by cosine distance.

    Parameters
    ----------
    item : tuple
        ``(text, f_idx, table_name, f_sql)`` — raw document text plus
        bookkeeping identifiers that are passed through to the return
        value (``table_name`` is unpacked but unused here).

    Returns
    -------
    tuple
        ``(f_idx, f_sql, data)`` where ``data`` maps
        ``"token_clf_index"`` to the model indices of the clustered
        tokens and ``"y_labels"`` to their cluster assignments.

    Raises
    ------
    ValueError
        If the document contains no tokens known to the model ``M``.

    Notes
    -----
    Relies on module-level globals defined elsewhere in this file:
    ``M`` (embedding model exposing ``__contains__``, ``__getitem__``
    and ``word2index``), ``damping`` and ``cluster_clf`` (clustering
    configuration).
    """
    text, f_idx, table_name, f_sql = item
    tokens = text.split()

    # Keep only the tokens the embedding model knows about.
    valid_tokens = [w for w in tokens if w in M]

    # Sort for determinism: set iteration order is not guaranteed, and a
    # stable token order makes the clustering output reproducible.
    labels = np.array(sorted(set(valid_tokens)))

    # Fail fast, before doing any embedding/index lookups.
    if not labels.size:
        msg = "Document has no valid tokens! This is problem."
        raise ValueError(msg)

    token_clf_index = np.array([M.word2index[w] for w in labels])

    V = np.array([M[w] for w in labels])
    DV = cdist(V, V, metric='cosine')

    # Cosine distances can dip "slightly" below zero due to floating-point
    # rounding; clamp so downstream clustering sees valid distances.
    DV[DV < 0] = 0

    cluster = cluster_clf(damping=damping)
    y_labels = cluster.fit_predict(DV)

    data = {
        "token_clf_index": token_clf_index,
        "y_labels": y_labels,
    }
    return f_idx, f_sql, data
# NOTE(review): removed stray non-Python page-scrape residue that
# followed this function ("评论列表" / "文章目录" — "comment list" /
# "article table of contents"); it was a SyntaxError, not code.