def _create_filtered_index(self, source=dir_path + '../data/character_index.csv',
                           destination=dir_path + '../data/character_index_filtered.csv'):
    with io.open(source, 'rb') as fin_index, io.open(destination, 'w', encoding='utf8') as fout:
        # Pass 1: collect the subject of every relation; these are the "important" articles.
        total_lines_relations = line_counting.cached_counter.count_lines(self.path_relations)
        self.logger.print_info('Collecting important entities...')
        important_articles = set()
        nt_reader = NTReader(self.path_relations)
        for subject, predicate, obj in tqdm(nt_reader.yield_cleaned_entry_names(), total=total_lines_relations):
            important_articles.add(subject)

        # Pass 2: copy only those index rows whose subject is an important article.
        # Note: passing a binary handle plus an encoding= keyword matches the unicodecsv
        # package's reader API, not the Python 3 stdlib csv module.
        total_lines_index = line_counting.cached_counter.count_lines(source)
        self.logger.print_info('Filtering important entities...')
        index_reader = csv.reader(fin_index, delimiter=self.delimiter, encoding='utf-8', quoting=csv.QUOTE_NONE)
        for line in tqdm(index_reader, total=total_lines_index):
            subject, character_offset = line
            if subject in important_articles:
                fout.write(subject + self.delimiter + character_offset + '\n')
Source: wikipedia_dump_index_creator.py (Python)
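For context, here is a minimal sketch of what the NTReader dependency used above is assumed to provide: streaming an N-Triples (.nt) relations file and yielding cleaned (subject, predicate, object) name tuples. The class and method names come from the calling code; the parsing and name-cleaning details below are illustrative assumptions, not the project's actual implementation.

import io
import re

class NTReader(object):
    """Hypothetical sketch: stream an N-Triples file and yield cleaned entry names."""

    def __init__(self, path):
        self.path = path

    def yield_cleaned_entry_names(self):
        # An N-Triples statement has the shape: <subject_uri> <predicate_uri> <object_uri> .
        triple = re.compile(r'<([^>]+)>\s+<([^>]+)>\s+<([^>]+)>\s*\.')
        with io.open(self.path, 'r', encoding='utf8') as fin:
            for line in fin:
                match = triple.match(line)
                if match is None:
                    continue  # skip comments, blank lines, and statements with literal objects
                # Keep only the final URI segment, e.g. the article/entity name.
                yield tuple(uri.rsplit('/', 1)[-1] for uri in match.groups())

With a reader like this, only the subject names are held in memory (the important_articles set); both the relations file and the character index are otherwise processed line by line, which keeps the filtering pass practical on full Wikipedia-scale dumps.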