def preprocess_questions(examples, nlp='nltk'):
    """Tokenize the question of every example, storing the tokens in place.

    Each item of ``examples`` is read at key ``'question'`` and gets a new
    ``'question_words'`` entry holding the resulting tokens. The tokenizer
    is chosen by ``nlp``:

    * ``'nltk'``: ``nltk.tokenize.word_tokenize`` applied to the lower-cased
      string form of the question;
    * ``'mcb'``: the ``tokenize_mcb`` helper, given the raw question;
    * any other value: the ``tokenize`` helper, given the raw question.

    The tokens of the first ten examples are printed, and a progress line
    is written to stdout every 1000 examples.

    Args:
        examples: Sized iterable of mutable mappings that each have a
            ``'question'`` key.
        nlp: Name of the tokenizer to use (see above).

    Returns:
        The same ``examples`` object that was passed in, with its items
        updated.
    """
    use_nltk = nlp == 'nltk'
    if use_nltk:
        # Imported here so nltk is only needed when it is actually selected.
        from nltk.tokenize import word_tokenize

    def split_into_words(text):
        # Tokenizer names are resolved at call time, once per question.
        if use_nltk:
            return word_tokenize(str(text).lower())
        if nlp == 'mcb':
            return tokenize_mcb(text)
        return tokenize(text)

    print('Example of generated tokens after preprocessing some questions:')

    # Ends with a carriage return and no newline, so successive progress
    # updates are written over the same terminal line.
    progress_template = "processing %d/%d (%.2f%% done) \r"

    for index, example in enumerate(examples):
        question = example['question']
        example['question_words'] = split_into_words(question)

        if index < 10:
            print(example['question_words'])

        if index % 1000 == 0:
            total = len(examples)
            progress = progress_template % (index, total, index * 100.0 / total)
            sys.stdout.write(progress)
            sys.stdout.flush()

    return examples
评论列表
文章目录