def get_centroid_idf(text, emb, idf, stopwords, D):
    """Return the TF-IDF-weighted average of the embeddings of the words in *text*.

    The text is tokenized with ``bioclean``. Each distinct token that is
    present in ``emb``, absent from ``stopwords`` and present in ``idf``
    contributes ``emb[token]`` scaled by ``count * idf[token]``; the sum is
    then divided by the total of those weights.

    Args:
        text: Raw input passed unchanged to ``bioclean``.
        emb: Container supporting ``in`` and ``[]`` lookups by token.
            Presumably maps a token to a NumPy vector of length ``D`` --
            TODO confirm against the caller.
        idf: Container supporting ``in`` and ``[]`` lookups by token,
            giving the numeric weight multiplied with the term count.
        stopwords: Container of tokens to ignore (membership test only).
        D: Size of the second axis of the initial ``np.zeros((1, D))``
            accumulator.

    Returns:
        The weighted mean as a NumPy array. When no token contributes, or
        the weights sum to zero, the un-normalized accumulator is returned
        as is (all zeros of shape ``(1, D)`` when nothing was added).
    """
    # Term frequencies, counting only tokens that have an embedding and are
    # not stopwords.
    term_counts = defaultdict(int)
    for token in bioclean(text):
        if token not in emb or token in stopwords:
            continue
        term_counts[token] += 1

    # Accumulate the weighted embedding sum; tokens without an IDF entry are
    # skipped entirely, so they affect neither the sum nor the normalizer.
    weighted_sum = np.zeros((1, D))
    total_weight = 0
    for token, count in term_counts.items():
        if token not in idf:
            continue
        weight = count * idf[token]
        weighted_sum = np.add(weighted_sum, emb[token] * weight)
        total_weight += weight

    # Skip normalization when the total weight is zero to avoid dividing by 0.
    if total_weight == 0:
        return weighted_sum
    return np.divide(weighted_sum, total_weight)
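# Comment list       (web-page navigation text left over from extraction)
# Table of contents  (web-page navigation text left over from extraction)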