def load_data(data_dir, order='top_down'):
    '''Construct the vocabularies and load every data split from ``data_dir``.

    Reads the ``general_nts`` and ``actions`` files (split on newlines into
    one entry per line) and then the ``train``, ``valid`` and ``test`` files.
    Each non-blank line of a split file must hold a sentence and an
    s-expression separated by a single tab; the sentence is tokenised on
    single spaces and the s-expression is parsed into a ``Tree``.

    Args:
        data_dir: directory containing the five files named above.
        order: traversal order, passed through to ``Tree.get_oracle``.

    Returns:
        The tuple ``(word_vocab, nt_vocab, ter_vocab, act_vocab,
        word_tokens, tree_tokens, tran_actions)``. The last three are
        ``defaultdict(list)`` objects keyed by split name, each holding one
        entry per loaded example in file order.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.
    '''
    general_predicate_path = os.path.join(data_dir, "general_nts")
    action_path = os.path.join(data_dir, "actions")

    word_vocab = Vocab()
    nt_vocab = Vocab()
    ter_vocab = Vocab()
    act_vocab = Vocab()

    # NOTE(review): split('\n') yields a trailing '' entry when the file ends
    # with a newline, and that entry is fed to the vocabulary. It is kept
    # as-is here because changing what is fed may alter the vocabulary that
    # downstream code relies on -- confirm whether it is intended.
    with open(general_predicate_path, 'r') as f:
        general_predicate = f.read().split('\n')
    nt_vocab.feed_all(general_predicate)

    with open(action_path, 'r') as f:
        actions = f.read().split('\n')
    act_vocab.feed_all(actions)

    word_tokens = collections.defaultdict(list)
    tree_tokens = collections.defaultdict(list)
    tran_actions = collections.defaultdict(list)

    for fname in ('train', 'valid', 'test'):
        print('reading', fname)
        pname = os.path.join(data_dir, fname)
        with codecs.open(pname, 'r', 'utf-8') as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # A blank line (e.g. a trailing empty line at the end of
                    # the file) has no tab-separated fields; unpacking it
                    # would raise ValueError and abort the whole load.
                    continue
                sen, sexp = line.split('\t')

                sen = sen.split(' ')
                word_vocab.feed_all(sen)
                word_tokens[fname].append(sen)

                parse_tree = Tree()
                parse_tree.construct_from_sexp(sexp)
                nt, ter = parse_tree.get_nt_ter()
                nt_vocab.feed_all(nt)
                ter_vocab.feed_all(ter)

                tree_token, action = parse_tree.get_oracle(order, general_predicate)
                tree_tokens[fname].append(tree_token)
                tran_actions[fname].append(action)

    return word_vocab, nt_vocab, ter_vocab, act_vocab, word_tokens, tree_tokens, tran_actions
# (Web-page navigation residue, not part of the code: "Comment list", "Table of contents")