def parse_xml_language_similarity(file_read, file_write):
    """Extract sentences from XML ``row`` lines and write them one per line.

    Reads ``file_read`` line by line. Every line containing the text
    ``"row Id"`` is stripped and parsed as a standalone XML element; its
    ``Body`` and ``Title`` attributes are passed through ``remove_tags`` and
    then ``sent_tokenize``. The resulting body sentences, followed by the
    title sentences, are written to ``file_write``, each terminated by a
    newline. A progress counter is printed every 1000 input lines.

    Rows for which tag removal or tokenization raises (for example when an
    attribute is missing and ``root.get`` returns ``None``) are skipped
    entirely; nothing is written for them.

    NOTE(review): ``remove_tags`` and ``sent_tokenize`` are defined outside
    this block -- presumably an HTML tag stripper and NLTK's sentence
    tokenizer; confirm against the file's imports.

    Args:
        file_read: Path of the input file to read.
        file_write: Path of the output file; opened in ``'w'`` mode, so any
            existing content is replaced.

    Raises:
        xml.etree.ElementTree.ParseError: If a line containing ``"row Id"``
            is not well-formed XML (parsing is not guarded, as before).
        OSError: If a file cannot be opened, read, or written.
    """
    with open(file_read, 'r') as f, open(file_write, 'w') as out:
        for count, raw_line in enumerate(f, start=1):
            if count % 1000 == 0:
                print(count)
            if "row Id" not in raw_line:
                continue

            root = xml.etree.ElementTree.fromstring(raw_line.strip())
            try:
                body = remove_tags(root.get('Body'))
                title = remove_tags(root.get('Title'))
                body_sentences = sent_tokenize(body)
                title_sentences = sent_tokenize(title)
            except Exception:
                # Best-effort: skip rows that cannot be processed. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop this long-running loop.
                continue

            # Written outside the try so I/O failures are not silently
            # swallowed along with per-row extraction errors.
            for sentences in (body_sentences, title_sentences):
                out.writelines(sentence + "\n" for sentence in sentences)
# [web page residue, not code] Comment list
# [web page residue, not code] Table of contents