import numpy as np
from gensim import models

# local_ref, one_hot_encoding, prepare_sentence, and util.split_data are
# helper functions defined elsewhere in the project.


def gen_dataset(sentences,
                categories,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded word vector and the output is the
        one-hot-encoded category.

        Args
        ----
        sentences : list
            list of sentences where each sentence is a list of tokens
        categories : list
            list of category sequences, one per sentence
        max_words : integer
            maximum number of words allowed in a sentence
        train_test_split : boolean
            whether to split the data into train and test sets

        Returns
        -------
        if train_test_split:
            (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
        else:
            (X, y, K), param_dict
    '''
    num_sentences = len(sentences)
    # load the pre-trained 300-dim GoogleNews word2vec embeddings
    # (for gensim >= 1.0, use models.KeyedVectors.load_word2vec_format instead)
    model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)
    # out-of-vocabulary words map to the zero vector
    vectorizer = lambda x: model[x] if x in model else np.zeros(300)
    encoder = one_hot_encoding(categories)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, len(encoder)))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words
    param_dict['encoder'] = encoder

    for sent_i in I:
        words = sentences[sent_i]
        cats = categories[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, categories=cats,
                             vectorizer=vectorizer,
                             encoder=encoder,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict

    return (X, y, K), param_dict
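
# --- Usage sketch (illustrative, not part of the original function) --------
# A minimal, hypothetical example of how gen_dataset might be called. The toy
# sentences and POS tags below are made up, and the sketch assumes the project
# helpers (local_ref, one_hot_encoding, prepare_sentence, util.split_data) and
# the GoogleNews binary are available on disk.
toy_sentences = [['the', 'dog', 'barks'], ['cats', 'sleep']]
toy_categories = [['DT', 'NN', 'VBZ'], ['NNS', 'VBP']]

(X_tr, X_te), (y_tr, y_te), (K_tr, K_te), params = gen_dataset(
    toy_sentences, toy_categories, max_words=78, train_test_split=True)

print(X_tr.shape)         # (num_train_sentences, 78, 300)  embedded word vectors
print(y_tr.shape)         # (num_train_sentences, 78, num_tags)  one-hot tags
print(K_tr)               # true (unpadded) length of each training sentence
print(params['encoder'])  # tag -> one-hot mapping produced by one_hot_encoding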