def train(args):
    trace('loading corpus ...')
    with open(args.source) as fp:
        trees = [make_tree(l) for l in fp]
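
    # The leaf words form the parser input; the gold operation sequence is the training target.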
    trace('extracting leaf nodes ...')
    word_lists = [extract_words(t) for t in trees]

    trace('extracting gold operations ...')
    op_lists = [make_operations(t) for t in trees]

    trace('making vocabulary ...')
    word_vocab = Vocabulary.new(word_lists, args.vocab)
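    # Collect the full inventories of phrase and semi-terminal labels from the treebank.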
    phrase_set = set()
    semi_set = set()
    for tree in trees:
        phrase_set |= set(extract_phrase_labels(tree))
        semi_set |= set(extract_semi_labels(tree))
    phrase_vocab = Vocabulary.new(
        [list(phrase_set)], len(phrase_set), add_special_tokens=False)
    semi_vocab = Vocabulary.new(
        [list(semi_set)], len(semi_set), add_special_tokens=False)

    trace('converting data ...')
    word_lists = [convert_word_list(x, word_vocab) for x in word_lists]
    op_lists = [convert_op_list(x, phrase_vocab, semi_vocab) for x in op_lists]
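    # Words and operations are now integer IDs, ready to feed to the network.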

    trace('start training ...')
    parser = Parser(
        args.vocab, args.embed, args.queue, args.stack,
        len(phrase_set), len(semi_set),
    )
    if USE_GPU:
        parser.to_gpu()
    opt = optimizers.AdaGrad(lr=0.005)
    opt.setup(parser)
    opt.add_hook(optimizer.GradientClipping(5))
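
    # Accumulate the loss over each minibatch, then run a single backward/update per batch.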
    for epoch in range(args.epoch):
        n = 0
        for samples in batch(zip(word_lists, op_lists), args.minibatch):
            parser.zerograds()
            loss = my_zeros((), np.float32)
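            # batch() is assumed to yield the (word_list, op_list) pairs transposed into
            # parallel lists, so zip(*samples) pairs them back up.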
            for word_list, op_list in zip(*samples):
                trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
                loss += parser.forward(word_list, op_list, 0)
                n += 1
            loss.backward()
            opt.update()
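
        # Snapshot the vocabularies, network spec, and weights after every epoch.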
        trace('saving model ...')
        prefix = args.model + '.%03d' % (epoch + 1)
        word_vocab.save(prefix + '.words')
        phrase_vocab.save(prefix + '.phrases')
        semi_vocab.save(prefix + '.semiterminals')
        parser.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', parser)

    trace('finished.')
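
For reference, train() reads the fields source, vocab, embed, queue, stack, epoch, minibatch, and model from args. A minimal argparse driver along those lines, sketched here as an assumption (the option names simply mirror the attribute accesses above; the defaults are illustrative, not taken from the original code), might look like:

import argparse

def parse_args():
    # Hypothetical CLI; only the field names are fixed by the code above.
    p = argparse.ArgumentParser(description='Train the shift-reduce parser.')
    p.add_argument('source', help='training treebank, one tree per line')
    p.add_argument('model', help='prefix for the saved model files')
    p.add_argument('--vocab', type=int, default=4096, help='word vocabulary size')
    p.add_argument('--embed', type=int, default=256, help='word embedding size')
    p.add_argument('--queue', type=int, default=256, help='queue (input buffer) state size')
    p.add_argument('--stack', type=int, default=256, help='stack state size')
    p.add_argument('--epoch', type=int, default=20, help='number of training epochs')
    p.add_argument('--minibatch', type=int, default=64, help='minibatch size')
    return p.parse_args()

if __name__ == '__main__':
    train(parse_args())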