def compute_VwS(self, s):
    """Compute the V(w,S) token weights of string ``s`` (Cohen et al., IJCAI-03).

    The string is tokenized with a ``CountVectorizer``; every distinct
    token ``w`` then receives the weight ``log(tf(w) + 1) * idf(w)``, where
    ``idf(w)`` is looked up in ``self.LOG_IDF`` through ``self.CORPUS_VOCAB``
    and falls back to ``self.OOV_IDF_VAL`` for tokens the corpus vocabulary
    does not contain.

    The weights are returned un-normalized, together with their Euclidean
    norm.

    Returns a 3-tuple ``(vocab, weights, norm)``:
      * ``vocab``   -- the vectorizer's token -> column-index mapping for ``s``
      * ``weights`` -- dict mapping each token to its un-normalized weight
      * ``norm``    -- square root of the sum of the squared weights
    """
    # Term-frequency row and token -> column mapping for this one string.
    vectorizer = CountVectorizer(min_df=0.0, token_pattern=u'(?u)\\b\\w+\\b')
    term_counts = vectorizer.fit_transform([s]).tocsr()
    vocab = vectorizer.vocabulary_

    weights = {}
    squared_total = 0
    for token, column in vocab.items():
        if token in self.CORPUS_VOCAB:
            idf = self.LOG_IDF[self.CORPUS_VOCAB[token]]
        else:
            # Token never seen in the corpus: default to OOV_IDF_VAL.
            idf = self.OOV_IDF_VAL
        weight = math.log(term_counts[0, column] + 1) * idf
        weights[token] = weight
        squared_total += weight ** 2

    norm = math.sqrt(squared_total)
    return (vocab, weights, norm)
# [web-page residue, not code] "Comment list"
# [web-page residue, not code] "Table of contents"