def grab_articles(self, ids):
    """Find stored articles similar to a query article via cosine distance on tf-idf.

    Celery bound task. Loads candidate articles by id from the database,
    appends the query article's tf-idf vector (stashed in redis under this
    task's id), computes cosine distances, keeps matches closer than a fixed
    threshold, and enriches them with headline/url/date metadata.

    Progress is reported by rewriting the redis entry's ``status`` field so a
    polling endpoint can show task state.

    :param ids: one-element list wrapping the list of candidate article ids
                (shape produced by an upstream Celery chord/group).
    :return: tuple of (list of match dicts from ``helpers.make_article_array``,
             the query article's headline).
    """
    task_id = self.request.id
    # Upstream passes the id list wrapped in an outer list — unwrap it.
    ids = ids[0]
    print("Entering Grab Articles Task: ", len(ids))
    print("Task id from self: ", task_id)

    # Pull only id + tfidf for the candidates; chunked read keeps memory bounded
    # while fetching, then concat into a single frame.
    s = select([articles_db.c.id, articles_db.c.tfidf]).where(articles_db.c.id.in_(ids))
    all_articles = pd.read_sql(s, con=connection, chunksize=350)
    all_articles = pd.concat(all_articles, ignore_index=True)

    # Task state lives in redis keyed by the Celery task id.
    stored_data = json.loads(r.get(task_id))
    stored_data['status'] = "creating article matrix"
    r.set(task_id, json.dumps(stored_data))
    tfidf_dict = stored_data['tfidf_dict']

    # Append the query article as the LAST row so getrow(-1) below selects it.
    # NOTE(review): the placeholder id 1 could collide with a real article id
    # if 1 is ever in `ids` — verify against the id scheme.
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
    all_articles = pd.concat(
        [all_articles, pd.DataFrame([{'id': 1, 'tfidf': tfidf_dict}])],
        ignore_index=True,
    )
    corpus = helpers.generate_sparse_matrix(all_articles)
    query_article_vector = corpus.getrow(-1)
    all_articles['distance'] = pairwise_distances(
        corpus, query_article_vector, metric='cosine').flatten()

    stored_data['status'] = "computing best matches"
    r.set(task_id, json.dumps(stored_data))

    # Keep only sufficiently-close matches.
    max_distance_from_query = 0.75  # on a scale of 0 (exact match) to 1.0 (not even close)
    all_articles = all_articles[all_articles['distance'] < max_distance_from_query]
    print("Done computing matrix and distances")

    # Second query fetches display metadata for the surviving ids only,
    # then joins the computed distances back on and orders chronologically.
    s = select([articles_db.c.id, articles_db.c.headline, articles_db.c.url, articles_db.c.date]).where(
        articles_db.c.id.in_(all_articles['id'].tolist()))
    all_articles = pd.read_sql(s, connection).set_index('id').join(
        all_articles.set_index('id')).sort_values(by='date')

    # The query article itself is an exact match (distance 0) by definition.
    query_article = {'headline': stored_data['headline'],
                     'date': datetime.strptime(stored_data['date'], "%d-%b-%Y"),
                     'distance': 0, 'url': stored_data['url']}
    articles = helpers.make_article_array(all_articles, query_article)
    return articles, query_article['headline']
# NOTE(review): stray scraped page text below (not Python) — commented out so
# the module remains importable. Original text preserved:
# 评论列表  (comment list)
# 文章目录  (article table of contents)