def _train(self, training_frame):
    """Build item-to-item similarity sets and persist them to Redis.

    Vectorizes the ``description`` column of *training_frame* using the
    hashing trick (word 1-3-grams, English stop words removed), computes the
    full pairwise cosine-similarity matrix, and for each row stores up to 98
    (score, id) neighbours in a Redis sorted set keyed ``self.SIMKEY % id``.

    Parameters
    ----------
    training_frame : pandas.DataFrame
        Must contain ``description`` (text) and ``id`` columns.

    NOTE(review): ``cosine_similarities[idx]`` is indexed with the DataFrame
    *label* from ``iterrows()`` while ``argsort()`` yields *positions* — this
    is only correct when the frame has a default RangeIndex (0..n-1).
    TODO: confirm with callers, or reset_index() defensively.
    """
    hashing_vectorizer = HashingVectorizer(analyzer="word", n_features=(2 ** 30),
                                           ngram_range=(1, 3), stop_words="english")
    # HashingVectorizer is stateless, so fit_transform is just transform here.
    training_hashing_matrix = hashing_vectorizer.fit_transform(training_frame["description"])
    self.log.info("starting kernel")
    start = time()
    # Dense n x n similarity matrix: O(n^2) memory — viable for small catalogs only.
    cosine_similarities = cosine_similarity(training_hashing_matrix, training_hashing_matrix)
    self.log.info("finished kernel. this took {} s".format(time() - start))
    self.log.info("starting adding to redis database")
    start = time()
    done = 0
    total = len(training_frame.index)
    print_progress(done, total, prefix="Progress:", suffix="Complete", bar_length=50)
    for idx, row in training_frame.iterrows():
        # Top ~99 entries in descending similarity; position 0 is the item
        # itself (similarity 1.0) and is dropped via similar_items[1:] below.
        similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
        similar_items = [(cosine_similarities[idx][pos], training_frame['id'][pos])
                         for pos in similar_indices]
        # Flatten [(score, id), ...] -> [score, id, score, id, ...].
        # (Replaces sum(..., ()), which is quadratic in the number of pairs.)
        flattened = [field for pair in similar_items[1:] for field in pair]
        # NOTE(review): positional zadd(key, *args) was removed in redis-py 3.x,
        # which requires zadd(key, mapping) — confirm the pinned redis version.
        self._r.zadd(self.SIMKEY % row['id'], *flattened)
        done += 1
        print_progress(done, total, prefix="Progress:", suffix="Complete", bar_length=50)
    self.log.info("finished adding {} rows to redis database. this took {} s".format(done, time() - start))
# NOTE(review): the two lines below were non-code residue scraped from a web
# page — "评论列表" ("comment list") and "文章目录" ("article table of
# contents"). Converted to this comment so the module parses; safe to delete.