def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    """Convert tagged sentences into padded word-index and label arrays.

    Args:
        data: Sequence of sentences. Each sentence is a sequence of tokens
            indexable as ``token[0]`` (word, a string), ``token[1]`` (POS
            tag) and ``token[2]`` (chunk tag).
        vocab: Iterable of known words; a word's id is its position in
            ``vocab``. Words are lower-cased before lookup. Unknown words
            map to index 1 (assumed to be the ``<unk>`` entry -- confirm
            against the vocabulary builder).
        pos_tags: Sequence of POS tags; a tag's label is
            ``pos_tags.index(tag)``.
        chunk_tags: Sequence of chunk tags; a tag's label is
            ``chunk_tags.index(tag)``.
        maxlen: Length every sentence is padded to. Defaults to the length
            of the longest sentence in ``data``.
        onehot: If true, labels are returned as one-hot ``float32`` vectors
            of width ``len(pos_tags)`` / ``len(chunk_tags)``. Otherwise the
            integer labels are returned with an extra axis inserted at
            position 2.

    Returns:
        Tuple ``(x, y_pos, y_chunk)`` of padded arrays.

    Raises:
        ValueError: If ``maxlen`` is None and ``data`` is empty, or (for
            list tag sets) if a token carries a tag missing from
            ``pos_tags`` / ``chunk_tags``.
    """
    if maxlen is None:
        maxlen = max(len(s) for s in data)

    word2idx = {w: i for i, w in enumerate(vocab)}
    # Out-of-vocabulary words fall back to index 1 (<unk>).
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    # pad_sequences is used with its default padding side (left padding,
    # per the original author's note).
    x = pad_sequences(x, maxlen)
    # Labels are padded with -1; the original author notes that any integer
    # works here because padded positions are expected to be masked.
    y_pos = pad_sequences(y_pos, maxlen, value=-1)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        # Index each identity matrix with its own label array (the previous
        # code indexed with an undefined name `y`).
        # NOTE: the -1 padding label selects the LAST row of the identity
        # matrix (NumPy negative indexing), so padded positions look like the
        # last tag; this relies on those positions being masked downstream.
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk
评论列表
文章目录