def read_and_clean_files(clueweb_file, ann_file, data_dir, ann_dir):
    """
    Replace entity mentions in one WARC file using its annotation file.

    Every line of the annotation file is parsed with
    Annotation.parse_annotation, the WARC file is opened, and both are handed
    to WarcEntry; the result of its replace_entity_mentions() call is
    returned unchanged. A progress message and the elapsed wall-clock time
    are printed to stdout.

    :param clueweb_file: WARC file name, joined onto data_dir
    :param ann_file: Annotation file name, joined onto ann_dir. It is opened
                     with fileinput.hook_compressed, so .gz / .bz2 files are
                     decompressed transparently.
    :param data_dir: Warc files directory
    :param ann_dir: Annotations directory
    :return: Whatever WarcEntry.replace_entity_mentions() returns; presumably
             records shaped like
             {'record_id': record_id,
              'replaced_record': cleaned_replaced_record,
              'cleaned_record': cleaned_record}
             -- TODO confirm against WarcEntry.
    """
    annotation_input = fileinput.FileInput(
        os.path.join(ann_dir, ann_file),
        openhook=fileinput.hook_compressed)
    try:
        annotation_list = [Annotation.parse_annotation(line)
                           for line in annotation_input]
    finally:
        # Release the annotation file handle even if a line fails to parse.
        annotation_input.close()

    warc_path = os.path.join(data_dir, clueweb_file)
    warc_file = warc.open(warc_path)
    try:
        # Single-argument print(...) produces the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Replacing entity mentions for  %s : %s ..."
              % (clueweb_file, ann_file))
        start = time.time()
        warc_entry = WarcEntry(warc_path, warc_file, annotation_list)
        cleaned_records = warc_entry.replace_entity_mentions()
        end = time.time()
        print("Time used:  %s" % (end - start))
    finally:
        # Close the WARC file whether or not the replacement succeeded.
        warc_file.close()
    return cleaned_records
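# Web-page residue, not part of the code: "Comment list"
# Web-page residue, not part of the code: "Table of contents"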