def maybe_build_sentences(text_filename, sent_filename):
    """Return every sentence of a corpus, using a sentence file as a cache.

    If ``sent_filename`` exists, it is read as tab-separated lines of the form
    ``docid<TAB>sent_id<TAB>sentence`` and the sentence column is returned;
    ``text_filename`` is not touched in that case.

    Otherwise ``text_filename`` is read as ``docid<TAB>text`` lines, each text
    is split with ``nltk.sent_tokenize``, and every sentence is written to
    ``sent_filename`` together with its integer docid and a 1-based sentence
    index that restarts for each input line.

    Both files are handled as UTF-8 text. Blank lines are skipped.

    Args:
        text_filename: Path of the ``docid<TAB>text`` input file. Only read
            when the sentence file does not exist yet.
        sent_filename: Path of the sentence cache file to read or create.

    Returns:
        A list of sentence strings, in file order.

    Raises:
        ValueError: If a non-blank line has too few tab-separated fields, or
            a docid that is being written out is not an integer.
        OSError: If a file cannot be opened, written or renamed.
    """
    sents = []

    if os.path.exists(sent_filename):
        # newline="\n" keeps the line splitting of the former binary-mode
        # read: only "\n" ends a line, a stray "\r" is removed by strip().
        with open(sent_filename, encoding="utf-8", newline="\n") as fsent:
            for line in fsent:
                line = line.strip()
                if not line:
                    continue
                # maxsplit=2 so that tabs inside the sentence stay part of it.
                _docid, _sent_id, sent = line.split("\t", 2)
                sents.append(sent)
        return sents

    # Build into a temporary file and rename it at the very end, so that a
    # failure half-way through never leaves a truncated cache behind that a
    # later call would silently treat as complete.
    tmp_filename = sent_filename + ".tmp"
    try:
        with open(text_filename, encoding="utf-8", newline="\n") as ftext, \
                open(tmp_filename, "w", encoding="utf-8", newline="\n") as fsent:
            for line in ftext:
                line = line.strip()
                if not line:
                    continue
                # maxsplit=1 so that tabs inside the text do not break parsing.
                docid, text = line.split("\t", 1)
                sentences = nltk.sent_tokenize(text)
                for sent_id, sent in enumerate(sentences, start=1):
                    sents.append(sent)
                    fsent.write("{:d}\t{:d}\t{:s}\n"
                                .format(int(docid), sent_id, sent))
    except BaseException:
        # Clean up the partial output, then let the original error propagate.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

    os.replace(tmp_filename, sent_filename)
    return sents
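# 评论列表 (comment list) -- web-page navigation text, not part of the program
# 文章目录 (table of contents) -- web-page navigation text, not part of the program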