def get_local_words(preds, vocab, NEs=[], k=50):
"""
given the word probabilities over many coordinates,
first normalize the probability of each word in different
locations to get a probability distribution, then compute
the entropy of the word's distribution over all coordinates
and return the words that are low entropy and are not
named entities.
"""
#normalize the probabilites of each vocab using entropy
normalized_preds = normalize(preds, norm='l1', axis=0)
entropies = stats.entropy(normalized_preds)
sorted_indices = np.argsort(entropies)
sorted_local_words = np.array(vocab)[sorted_indices].tolist()
filtered_local_words = []
NEset = set(NEs)
for word in sorted_local_words:
if word in NEset: continue
filtered_local_words.append(word)
return filtered_local_words[0:k]
评论列表
文章目录