def load_unannotated_file(filepath='test.txt', nb_instances=None, tokenized_input=False):
    """Load raw text instances from *filepath* (read as UTF-8).

    Parameters
    ----------
    filepath : str
        Path to a UTF-8 text file.
    nb_instances : int or None
        Maximum number of instances (lines or tokens) to return.
        None (or 0) means no limit.
    tokenized_input : bool
        If True, each non-empty line is one pre-tokenized instance.
        If False, the whole file is whitespace-normalized and split
        with NLTK's wordpunct tokenizer.

    Returns
    -------
    list of str
        Stripped lines (tokenized_input=True) or tokens
        (tokenized_input=False).
    """
    if tokenized_input:
        instances = []
        # Context manager guarantees the handle is closed, including on
        # the early break (the original leaked the file object).
        with codecs.open(filepath, 'r', 'utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # skip blank lines
                instances.append(line)
                if nb_instances:
                    nb_instances -= 1
                    if nb_instances <= 0:
                        break
        return instances
    else:
        # Lazy import: nltk is only required for this branch.
        from nltk.tokenize import wordpunct_tokenize
        whitespace = re.compile(r'\s+')
        with codecs.open(filepath, 'r', 'utf8') as f:
            # BUG FIX: re.sub signature is sub(repl, string). The original
            # call was W.sub(f.read(), ' ') — arguments swapped — so `text`
            # was never the whitespace-normalized file contents.
            text = whitespace.sub(' ', f.read())
        tokens = wordpunct_tokenize(text)
        if nb_instances:
            return tokens[:nb_instances]
        return tokens
# NOTE(review): removed stray scraped-page residue ("评论列表" / "文章目录",
# i.e. blog "comment list" / "table of contents" navigation text) — bare
# identifiers here would raise NameError at import time.