def train_wordfilter_coefficient(self, seed_words, wordfilters):
    """Score each (range, context) word filter by how similar the target
    words it captures are to a set of seed words.

    Scans the corpus once; for every position whose surrounding context
    (offsets ``fb..fe`` excluding 0) matches a given filter, counts the
    word at that position. Each filter is then scored by the
    frequency-weighted mean cosine similarity between the counted words'
    embeddings and the seed words' embeddings.

    Parameters
    ----------
    seed_words : iterable of str
        Reference words; must be present in ``self.word2index``.
    wordfilters : iterable of ((int, int), tuple of str)
        Pairs of ``(rng, wordfilter)`` where ``rng = (fb, fe)`` gives the
        relative offsets of the context window around the target position
        and ``wordfilter`` is the tuple of context words to match.
        NOTE(review): the slicing below only lines up correctly when
        ``fb < 0 < fe`` (window straddles the target) — confirm callers
        never pass one-sided ranges.

    Returns
    -------
    list of (context, rng, score, frequency) tuples. ``score`` comes out
    of a 1-column similarity matrix, so it is an array of shape (1,),
    not a plain float — TODO confirm callers expect that.
    """
    # (rng, context) -> {target word: count} accumulator.
    mined_words = defaultdict(lambda: defaultdict(lambda: 0))
    # Deduplicate the context tuples and the window ranges separately.
    filter_set = {wordfilter for (rng, wordfilter) in wordfilters}
    ranges = {rng for (rng, wordfilter) in wordfilters}
    for num_doc, doc in enumerate(Word2vecCorpus(self.corpus_file)):
        len_doc = len(doc)
        for rng in ranges:
            (fb, fe) = rng
            # Document shorter than the full window: nothing can match.
            if len_doc < (fe - fb + 1):
                continue
            # Target words that have a complete window around them.
            # For fb < 0 < fe this is doc[-fb : len_doc - fe], aligned
            # index-for-index with `contexts` below (both start at
            # position -fb). NOTE(review): for fb >= 0 or fe <= 0 this
            # slice is empty or misaligned — see docstring.
            words = doc[-fb:-fe]
            contexts = []
            for i, word in enumerate(doc):
                # Skip positions whose window would run off either end.
                if (i + fb < 0) or (i + fe >= len_doc):
                    continue
                # Context = words at offsets fb..fe, excluding the
                # target position itself (offset 0).
                contexts.append(tuple([doc[i+r] for r in range(fb, fe+1) if r != 0]))
            for i, context in enumerate(contexts):
                # contexts[i] corresponds to target word words[i].
                if context in filter_set:
                    mined_words[(rng, context)][words[i]] += 1
    result = []
    # Embedding vectors of the seed words. `syn0` is presumably the raw
    # embedding matrix of an (older) gensim Word2Vec model — TODO confirm.
    seeds_idx = sorted([self.word2index[seed] for seed in seed_words])
    seeds_vec = [self.word2vec_model.syn0[idx] for idx in seeds_idx]
    # Score filters, most frequently matched first.
    for ((rng, context), word2freq) in sorted(mined_words.items(), key=lambda x:sum(x[1].values()), reverse=True):
        word_freq = [(self.word2index[word], freq) for (word, freq) in word2freq.items()]
        # NOTE(review): assumes word2index maps unknown words to -1
        # rather than raising — confirm against its implementation.
        word_freq = [v for v in word_freq if v[0] != -1]
        word_freq = sorted(word_freq)
        idx = [pair[0] for pair in word_freq]
        word_vec = self.word2vec_model.syn0[idx]
        sum_freq = sum([v[1] for v in word_freq])
        score = 0
        for seed_vec in seeds_vec:
            # Cosine similarity = 1 - cosine distance; `sim` has one
            # column, so each `s` below is a length-1 array.
            sim = 1 + -1 * pairwise_distances(word_vec, seed_vec, metric='cosine')
            # Frequency-weighted average similarity for this seed.
            score += sum([wf[1] * s for wf, s in zip(word_freq, sim)]) / sum_freq
        # Average over all seed words.
        score /= len(seed_words)
        result.append((context, rng, score, sum_freq))
    return result
# NOTE(review): the two lines below appear to be web-scrape page residue
# ("评论列表" = comment list, "文章目录" = table of contents), not code;
# commented out so the file remains valid Python. Preserved verbatim:
# 评论列表
# 文章目录