def tokenize(self, file_name):
    """Tokenizes the file and produces a dataset."""
    lines = read_lines(file_name)
    random.shuffle(lines)

    unk = self.word_dict.get_idx('<unk>')
    dataset, total, unks = [], 0, 0
    for line in lines:
        tokens = line.split()
        # map each tagged span of the line to integer indices via its dictionary
        input_idxs = self.context_dict.w2i(get_tag(tokens, 'input'))
        word_idxs = self.word_dict.w2i(get_tag(tokens, 'dialogue'))
        item_idxs = self.item_dict.w2i(get_tag(tokens, 'output'))
        dataset.append((input_idxs, word_idxs, item_idxs))
        # compute statistics
        total += len(input_idxs) + len(word_idxs) + len(item_idxs)
        unks += np.count_nonzero([idx == unk for idx in word_idxs])

    if self.verbose:
        print('dataset %s, total %d, unks %d, ratio %0.2f%%' % (
            file_name, total, unks, 100. * unks / total))

    return dataset
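
For context, here is a minimal sketch of the two module-level helpers the method depends on, `read_lines` and `get_tag`, which are not shown in this excerpt. It assumes each data line wraps its fields in `<tag> ... </tag>` markers (e.g. `<input> ... </input> <dialogue> ... </dialogue> <output> ... </output>`), and that the surrounding module also does `import random` and `import numpy as np`; treat these as assumptions rather than the exact implementation.

import os

def read_lines(file_name):
    # Assumed helper: read the whole file into a list of stripped lines.
    assert os.path.exists(file_name), 'file does not exist: %s' % file_name
    with open(file_name, 'r') as f:
        return [line.strip() for line in f]

def get_tag(tokens, tag):
    # Assumed helper: return the tokens enclosed between <tag> and </tag>.
    start = tokens.index('<' + tag + '>') + 1
    end = tokens.index('</' + tag + '>')
    return tokens[start:end]

With these in place, `get_tag(tokens, 'dialogue')` yields the dialogue tokens of one line, and each dictionary's `w2i` presumably maps that token list to integer indices, substituting the `<unk>` index for out-of-vocabulary words, which is exactly what the `unks` counter measures.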