def load_data(data_dir, order='pre_order'):
    """Build the vocabularies and read every data split found in ``data_dir``.

    The files ``train``, ``valid`` and ``test`` are read from ``data_dir`` as
    UTF-8 text.  Each line is split on a tab into exactly two fields: a
    space-separated sentence and an s-expression used to construct a ``Tree``.

    Args:
        data_dir: Directory containing the ``train``, ``valid`` and ``test``
            files.
        order: Name of the ``Tree`` method used to traverse each parse tree.
            It is looked up with ``getattr`` and called with ``_ROOT``; it
            must return a ``(tree_token, action)`` pair.

    Returns:
        A 7-tuple ``(word_vocab, nt_vocab, ter_vocab, act_vocab, word_tokens,
        tree_tokens, tran_actions)``.  The first four are ``Vocab`` objects;
        the last three are ``defaultdict(list)`` instances keyed by split
        name, holding one entry per line read from that split.
    """
    word_vocab = Vocab()
    nt_vocab = Vocab()
    ter_vocab = Vocab()
    act_vocab = Vocab()
    # The action vocabulary is fixed up front rather than collected from data.
    act_vocab.feed_all(['NT', 'TER', 'ACT'])

    word_tokens = collections.defaultdict(list)
    tree_tokens = collections.defaultdict(list)
    tran_actions = collections.defaultdict(list)

    for split in ('train', 'valid', 'test'):
        print('reading', split)
        split_path = os.path.join(data_dir, split)
        with codecs.open(split_path, 'r', 'utf-8') as handle:
            for raw_line in handle:
                # Each line holds exactly two tab-separated fields.
                sentence, sexp = raw_line.rstrip().split('\t')

                # Sentence side: tokenize on single spaces.
                words = sentence.split(' ')
                word_vocab.feed_all(words)
                word_tokens[split].append(words)

                # Tree side: build the tree and collect its symbols.
                tree = Tree()
                tree.construct_from_sexp(sexp)
                nonterminals, terminals = tree.get_nt_ter()
                nt_vocab.feed_all(nonterminals)
                ter_vocab.feed_all(terminals)

                # Traversal method is chosen by name, bound to this tree.
                tree_token, action = getattr(tree, order)(_ROOT)
                tree_tokens[split].append(tree_token)
                tran_actions[split].append(action)

    return (word_vocab, nt_vocab, ter_vocab, act_vocab,
            word_tokens, tree_tokens, tran_actions)
# Comment list
# Article table of contents