def parse_xml_all(self, data_file, doc_type, language='english'):
    """Parse an XML corpus file into sentence-split documents grouped by topic.

    Every direct child of the XML root is treated as one topic. Within a
    topic, each child element whose tag equals ``doc_type`` is treated as
    one document: the text of its ``text`` child is passed through
    ``text_normalization`` and then split with ``sent_tokenize``.

    Args:
        data_file: Source handed to ``ET.parse`` (presumably
            ``xml.etree.ElementTree``, so a path or file object -- confirm
            against the file's imports).
        doc_type: Tag name of the document elements to collect under each
            topic, e.g. ``'document'``.
        language: Forwarded unchanged as the second argument of
            ``sent_tokenize``.

    Returns:
        A dict mapping each topic's ``id`` attribute (``None`` when the
        attribute is absent) to a list of ``[doc_id, sentences]`` pairs,
        one pair per matching document, in document order. Topics that
        share an ``id`` overwrite one another; the last one wins.

    Raises:
        AttributeError: If a matching document element has no ``text``
            child, because ``find`` then returns ``None``.
    """
    root = ET.parse(data_file).getroot()
    cluster_data = {}
    for topic in root:
        topic_docs = []
        for document in topic.findall(doc_type):
            # The <title> child is intentionally not read: its value was
            # never used, and looking it up made title-less documents fail
            # with AttributeError for no benefit.
            normalized = text_normalization(document.find('text').text)
            sentences = sent_tokenize(normalized, language)
            topic_docs.append([document.get('id'), sentences])
        cluster_data[topic.get('id')] = topic_docs
    return cluster_data
corpus_cleaner.py 文件源码
python
阅读 29
收藏 0
点赞 0
评论 0
评论列表
文章目录