def get_article_statistics(self, recompute_statistics=True):
    """
    Build or load the per-article question statistics index.

    When ``recompute_statistics`` is True, iterate over every question
    produced by ``loadDataGenerator()``, extract the simple links from the
    question text, resolve each link to an article via ``Link2Article``,
    and register the (question, link) pair under that article in
    ``self.article_index``; the finished index is then pickled to disk.
    When False, the previously pickled index is loaded instead.

    :param recompute_statistics: if True (default), recompute the index and
        persist it; if False, load the cached pickle from disk.

    Side effects: writes a progress line to stderr while recomputing and
    reads/writes the statistics pickle file.
    """
    # Single source of truth for the cache location (was duplicated).
    statistics_path = "./../data/statistics/article_statistics"

    if not recompute_statistics:
        # Fast path: reuse the index produced by an earlier run.
        with open(statistics_path, "rb") as f:
            self.article_index = cPickle.load(f)
        return

    self.get_article_index()
    data_generator = loadDataGenerator()
    cnt_not_match_links = 0
    links_cnt = 0
    l2a = Link2Article()
    for question_batch in data_generator:
        for question in question_batch:
            links = LinksSearcher(question.get_all_text()).get_simple_links()
            for link in links:
                # Resolve the raw link to an article; returns a falsy
                # value when the link cannot be matched.
                article = l2a.link2article(link)
                if article:
                    links_cnt += 1
                    self.article_index[article.article_ID].add_question(question, link)
                else:
                    cnt_not_match_links += 1
                # Progress indicator; leading \r rewrites the same
                # stderr line instead of scrolling.
                sys.stderr.write("\r\t\t\t\t\tALL LINKS: %d; CAN't MATCH: %d" % (links_cnt, cnt_not_match_links))
    # Persist the freshly built index for future recompute_statistics=False runs.
    with open(statistics_path, "wb") as f:
        cPickle.dump(self.article_index, f, protocol=pickle.HIGHEST_PROTOCOL)
# NOTE(review): the two lines below were stray page text pasted in from a
# Chinese blog site ("评论列表" = comment list, "文章目录" = article table of
# contents) — not code; kept here commented out so the file stays valid.
# 评论列表
# 文章目录