def parse_all_files(self, directory, dictionary, use_chars, cache_file):
    """
    Parse all ``*.question`` files directly under ``directory`` into a list
    of questions, using ``cache_file`` as an on-disk pickle cache.

    If ``cache_file`` already exists, its unpickled contents are returned
    unchanged and ``directory`` is not scanned at all. Otherwise every
    matching file is parsed with ``self.parse_one_file``, the file path is
    appended to each resulting tuple, the whole list is passed through
    ``self.parse_ner_pos``, and the result is pickled to ``cache_file``.

    Args:
        directory: directory that is globbed for ``*.question`` files.
        dictionary: forwarded unchanged to ``self.parse_one_file``.
        use_chars: forwarded unchanged to ``self.parse_one_file``.
        cache_file: path of the pickle cache to read from / write to.

    Returns:
        The list of questions; each element is in the form of
        (document, query, answer, filename) before ``self.parse_ner_pos``
        post-processes the list (that helper is defined elsewhere, so its
        effect on the element shape is not visible here).
    """
    if os.path.exists(cache_file):
        # NOTE(review): unpickling can execute arbitrary code; the cache
        # file must only ever come from a trusted location.
        # The collector is switched off while loading (presumably to speed
        # up unpickling of a large object graph) and is always switched
        # back on, even when loading raises.
        gc.disable()
        try:
            # Pickle data is binary, so the file must be opened in 'rb'.
            with open(cache_file, 'rb') as cache_in:
                return cPickle.load(cache_in)
        finally:
            gc.enable()

    questions = []
    for i, f in enumerate(glob.glob(directory + '/*.question')):
        if i % 10000 == 0:
            # Single-argument call form prints the same text on Python 2
            # and Python 3.
            print('parsing {}'.format(i))
        # parse_one_file returns a tuple; the source path is appended to it.
        questions.append(self.parse_one_file(f, dictionary, use_chars) + (f,))
    questions = self.parse_ner_pos(questions)

    # HIGHEST_PROTOCOL is a binary format: write in 'wb' and close the
    # handle so the cache is completely flushed before anyone reads it.
    with open(cache_file, 'wb') as cache_out:
        cPickle.dump(questions, cache_out, cPickle.HIGHEST_PROTOCOL)
    return questions
# Comment list (web page navigation residue)
# Article table of contents (web page navigation residue)