def _create_filtered_index(self, source=dir_path + '../data/character_index.csv',
                           destination=dir_path + '../data/character_index_filtered.csv'):
    with io.open(source, 'rb') as fin_index, io.open(destination, 'w', encoding='utf8') as fout:
        # Pass 1: collect the subject of every relation; these are the "important" articles.
        total_lines_relations = line_counting.cached_counter.count_lines(self.path_relations)
        self.logger.print_info('Collecting important entities...')
        important_articles = set()
        nt_reader = NTReader(self.path_relations)
        for subject, predicate, obj in tqdm(nt_reader.yield_cleaned_entry_names(), total=total_lines_relations):
            important_articles.add(subject)

        # Pass 2: copy only those index rows whose subject is an important article.
        # Note: passing a binary handle plus an encoding= keyword matches the unicodecsv
        # package's reader API, not the Python 3 stdlib csv module.
        total_lines_index = line_counting.cached_counter.count_lines(source)
        self.logger.print_info('Filtering important entities...')
        index_reader = csv.reader(fin_index, delimiter=self.delimiter, encoding='utf-8', quoting=csv.QUOTE_NONE)
        for line in tqdm(index_reader, total=total_lines_index):
            subject, character_offset = line
            if subject in important_articles:
                fout.write(subject + self.delimiter + character_offset + '\n')
Source: wikipedia_dump_index_creator.py (Python)
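For context, here is a minimal sketch of what the NTReader dependency used above is assumed to provide: streaming an N-Triples (.nt) relations file and yielding cleaned (subject, predicate, object) name tuples. The class and method names come from the calling code; the parsing and name-cleaning details below are illustrative assumptions, not the project's actual implementation.

import io
import re

class NTReader(object):
    """Hypothetical sketch: stream an N-Triples file and yield cleaned entry names."""

    def __init__(self, path):
        self.path = path

    def yield_cleaned_entry_names(self):
        # An N-Triples statement has the shape: <subject_uri> <predicate_uri> <object_uri> .
        triple = re.compile(r'<([^>]+)>\s+<([^>]+)>\s+<([^>]+)>\s*\.')
        with io.open(self.path, 'r', encoding='utf8') as fin:
            for line in fin:
                match = triple.match(line)
                if match is None:
                    continue  # skip comments, blank lines, and statements with literal objects
                # Keep only the final URI segment, e.g. the article/entity name.
                yield tuple(uri.rsplit('/', 1)[-1] for uri in match.groups())

With a reader like this, only the subject names are held in memory (the important_articles set); both the relations file and the character index are otherwise processed line by line, which keeps the filtering pass practical on full Wikipedia-scale dumps.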