def __call__(self, row):
    '''
    Compute partition-function statistics over a single document.

    Parameters
    ----------
    row : mapping
        Must contain 'text' (a whitespace-tokenizable string) and '_ref'
        (an opaque document reference, copied through to the output).

    Returns
    -------
    dict
        Keys 'Z_mu', 'Z_std', 'Z_skew', 'Z_kurtosis' (stats over the
        per-word partition values), 'I_mu', 'I_std', 'I_skew',
        'I_kurtosis' (stats over pairwise dot products of the embeddings
        of the most "descriptive" words), and '_ref'. All stats stay 0.0
        when fewer than four unique in-vocabulary words are present.

    NOTE(review): assumes self.Z maps word -> scalar partition value and
    self.model maps word -> embedding vector, with every key of self.Z
    also present in self.model — confirm against the class definition.
    '''
    text = row['text']
    stat_names = [
        'Z_mu', 'Z_std', 'Z_skew', 'Z_kurtosis',
        'I_mu', 'I_std', 'I_skew', 'I_kurtosis',
    ]
    # Default every statistic to 0.0 so short documents still yield a
    # complete, uniform record.
    stats = {key: 0.0 for key in stat_names}

    # Only keep words that are defined in the embedding.
    valid_tokens = [w for w in text.split() if w in self.Z]

    # Unique words only. Sort the set: plain set() iteration order
    # depends on string-hash randomization, which made the argsort
    # tie-breaking below — and hence the 'I' stats — vary across runs.
    all_tokens = np.array(sorted(set(valid_tokens)))

    if len(all_tokens) > 3:
        # Possibly clip the values here as very large Z don't contribute.
        doc_z = np.array([self.Z[w] for w in all_tokens])
        compute_stats(doc_z, stats, "Z")

        # Take the top fraction (at least 3) of most descriptive words,
        # i.e. those with the largest partition values.
        z_sort_idx = np.argsort(doc_z)[::-1]
        z_cut = max(int(self.intra_document_cutoff * len(doc_z)), 3)
        important_index = z_sort_idx[:z_cut]
        sub_tokens = all_tokens[important_index]

        # Pairwise similarities among the selected words: dot products
        # over the strict upper triangle (each unordered pair once).
        doc_v = np.array([self.model[w] for w in sub_tokens])
        upper_idx = np.triu_indices(doc_v.shape[0], k=1)
        dist = np.dot(doc_v, doc_v.T)[upper_idx]
        compute_stats(dist, stats, "I")

    # Carry the document reference through for downstream joins.
    stats['_ref'] = row['_ref']
    return stats
# (removed web-scrape residue: "评论列表" / "文章目录" — not part of the source)