def prep_data(self):
    """Load the data splits, fit the word indexer and pad every split.

    After this call ``self.train``, ``self.validation`` and ``self.test``
    are 3-tuples: items 0 and 1 of each split converted to index sequences
    and padded to ``self.SentMaxLen``, followed by item 2 unchanged.
    """
    # Step 1: raw training / validation / test data.
    self.train, self.validation, self.test = self.load_data()

    # Step 2: word indexer (case is kept, no characters are filtered),
    # fitted on items 0 and 1 of the training split.
    self.indexer = Tokenizer(lower=False, filters='')
    self.indexer.fit_on_texts(self.train[0] + self.train[1])  # todo remove test
    # NOTE(review): the extra 1 presumably accounts for the padding index 0.
    self.Vocab = 1 + len(self.indexer.word_counts)

    # Step 3: words -> indices, then pad to the common sentence length.
    def to_padded(texts):
        indexed = self.indexer.texts_to_sequences(texts)
        return pad_sequences(sequences=indexed, maxlen=self.SentMaxLen)

    def convert(split):
        return to_padded(split[0]), to_padded(split[1]), split[2]

    self.train = convert(self.train)
    self.validation = convert(self.validation)
    self.test = convert(self.test)
python类pad_sequences()的实例源码
def data():
    """Load the IMDB data and pad both splits to a fixed length.

    Returns:
        ``(X_train, X_test, y_train, y_test, max_features, maxlen)``.
    """
    max_features = 20000  # forwarded to imdb.load_data as nb_words
    maxlen = 100          # sequence length requested from pad_sequences

    print('Loading data...')
    train_split, test_split = imdb.load_data(nb_words=max_features)
    X_train, y_train = train_split
    X_test, y_test = test_split
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    return X_train, X_test, y_train, y_test, max_features, maxlen
pos-tagging-explore.py 文件源码
项目:Deep-Learning-with-Keras
作者: PacktPublishing
项目源码
文件源码
阅读 37
收藏 0
点赞 0
评论 0
def build_tensor(filename, numrecs, word2index, maxlen,
                 make_categorical=False):
    """Read a whitespace-tokenised file and return a padded index tensor.

    Each line of ``filename`` is one record. Every token is replaced by
    ``word2index[token]``; tokens missing from the mapping are replaced by
    ``word2index["UNK"]``.

    Args:
        filename: path of the file to read (opened in binary mode).
        numrecs: number of record slots to allocate; a file with more lines
            than this raises ``IndexError``.
        word2index: mapping from token to integer id. It must contain
            "UNK" if any token is unknown.
        maxlen: length forwarded to ``sequence.pad_sequences``.
        make_categorical: when true, each record is converted with
            ``np_utils.to_categorical`` using ``len(word2index)`` classes.

    Returns:
        The result of ``sequence.pad_sequences`` over all records.
    """
    data = np.empty((numrecs, ), dtype=list)
    # The context manager closes the file even if a lookup below raises.
    with open(filename, "rb") as fin:
        for i, line in enumerate(fin):
            wids = [word2index[word] if word in word2index
                    else word2index["UNK"]
                    for word in line.strip().split()]
            if make_categorical:
                data[i] = np_utils.to_categorical(
                    wids, num_classes=len(word2index))
            else:
                data[i] = wids
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata
pos_tagging_gru.py 文件源码
项目:Deep-Learning-with-Keras
作者: PacktPublishing
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def generate_batch(s_sents, s_word2index, t_sents, t_word2index,
                   batch_size, maxlen):
    """Endlessly yield ``(source, target)`` batches in a fresh random order.

    Each pass reshuffles the sentence pairs, converts words to indices and
    yields ``len(s_sents) // batch_size`` batches; leftover sentences that
    do not fill a batch are skipped for that pass.

    Yields:
        ``(sp_batch, tpc_batch)``: the padded source index batch and the
        padded target batch expanded by ``np_utils.to_categorical`` and
        reshaped to ``(batch_size, -1, len(t_word2index))``.
    """
    while True:
        # New ordering for this pass, applied to both sides alike.
        order = np.random.permutation(np.arange(len(s_sents)))
        shuffled_src = [s_sents[pos] for pos in order]
        shuffled_tgt = [t_sents[pos] for pos in order]

        # Source words go through get_or_else with the UNK id as fallback;
        # target words are looked up directly (unknown ones raise KeyError).
        src_ids = []
        for sent in shuffled_src:
            src_ids.append(
                [get_or_else(s_word2index, word, s_word2index["UNK"])
                 for word in sent])
        tgt_ids = []
        for sent in shuffled_tgt:
            tgt_ids.append([t_word2index[word] for word in sent])

        # One pass over the data.
        batches_per_pass = len(s_sents) // batch_size
        for batch_no in range(batches_per_pass):
            start = batch_no * batch_size
            stop = (batch_no + 1) * batch_size
            src_padded = sequence.pad_sequences(src_ids[start:stop],
                                                maxlen=maxlen)
            tgt_padded = sequence.pad_sequences(tgt_ids[start:stop],
                                                maxlen=maxlen)
            flat_one_hot = np_utils.to_categorical(
                tgt_padded.reshape(-1, 1),
                num_classes=len(t_word2index))
            tgt_one_hot = flat_one_hot.reshape(batch_size, -1,
                                               len(t_word2index))
            yield src_padded, tgt_one_hot
def label_test_file(self):
    """Write every misclassified validation pair to ``pred_vld.txt``.

    Reads ``validation.json`` (items 0, 1 and 2 are iterated in lockstep as
    premise, hypothesis and label), runs ``self.model.predict`` on each pair
    and, when the arg-max of the prediction differs from the label, writes
    the two sentences, the true label name from ``self.rLabels`` and the
    three predicted percentages.
    """
    def prep_alfa(X):
        # Index the texts and pad them to the model's sentence length.
        return pad_sequences(sequences=self.indexer.texts_to_sequences(X),
                             maxlen=self.SentMaxLen)

    # Both files are closed by their context managers, even on error.
    with open("pred_vld.txt", "w") as outfile:
        with open('validation.json', 'r') as vld_file:
            vld = json.load(vld_file)
        for prem, hypo, label in zip(vld[0], vld[1], vld[2]):
            prem_pad, hypo_pad = prep_alfa([prem]), prep_alfa([hypo])
            # Flatten the single-sample prediction to a 1-D score vector.
            ans = np.reshape(
                self.model.predict(x=[prem_pad, hypo_pad], batch_size=1), -1)
            if np.argmax(ans) != label:
                outfile.write(prem + "\n" + hypo + "\n")
                outfile.write("Truth: " + self.rLabels[label] + "\n")
                outfile.write('Contradiction \t{:.1f}%\n'.format(float(ans[0]) * 100) +
                              'Neutral \t\t{:.1f}%\n'.format(float(ans[1]) * 100) +
                              'Entailment \t{:.1f}%\n'.format(float(ans[2]) * 100))
                outfile.write("-"*15 + "\n")
QnARecurAtteLatest2GRU.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised contexts, questions and answer spans into padded arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). For each example, two vectors of the context's length are
    built with a single 1 at the answer-begin and answer-end positions.

    Returns ``(X, Xq, YBegin, YEnd)`` as ``pad_sequences`` results: questions
    use ``question_maxlen``; contexts and both answer vectors use
    ``context_maxlen``. Padding is appended ('post').
    NOTE(review): ``truncating`` is left at the library default, so over-long
    sequences are presumably cut from the front -- confirm this is intended.
    '''
    contexts, questions = [], []
    begin_marks, end_marks = [], []
    for i, context in enumerate(xContext):
        contexts.append([word_index[w] for w in context])
        questions.append([word_index[w] for w in xQuestion[i]])
        # Mark the first and last word of the answer span.
        begin_vec = np.zeros(len(context))
        begin_vec[xAnswerBeing[i]] = 1
        end_vec = np.zeros(len(context))
        end_vec[xAnswerEnd[i]] = 1
        begin_marks.append(begin_vec)
        end_marks.append(end_vec)
    X = pad_sequences(contexts, maxlen=context_maxlen, padding='post')
    Xq = pad_sequences(questions, maxlen=question_maxlen, padding='post')
    YBegin = pad_sequences(begin_marks, maxlen=context_maxlen, padding='post')
    YEnd = pad_sequences(end_marks, maxlen=context_maxlen, padding='post')
    return X, Xq, YBegin, YEnd
# for validation dataset
QnARecurAtteLatest2GRU.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 27
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
QnARecurAtteLatest3Atten.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 25
收藏 0
点赞 0
评论 0
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised contexts, questions and answer spans into padded arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). For each example, two vectors of the context's length are
    built with a single 1 at the answer-begin and answer-end positions.

    Returns ``(X, Xq, YBegin, YEnd)`` as ``pad_sequences`` results: questions
    use ``question_maxlen``; contexts and both answer vectors use
    ``context_maxlen``. Padding is appended ('post').
    NOTE(review): ``truncating`` is left at the library default, so over-long
    sequences are presumably cut from the front -- confirm this is intended.
    '''
    contexts, questions = [], []
    begin_marks, end_marks = [], []
    for i, context in enumerate(xContext):
        contexts.append([word_index[w] for w in context])
        questions.append([word_index[w] for w in xQuestion[i]])
        # Mark the first and last word of the answer span.
        begin_vec = np.zeros(len(context))
        begin_vec[xAnswerBeing[i]] = 1
        end_vec = np.zeros(len(context))
        end_vec[xAnswerEnd[i]] = 1
        begin_marks.append(begin_vec)
        end_marks.append(end_vec)
    X = pad_sequences(contexts, maxlen=context_maxlen, padding='post')
    Xq = pad_sequences(questions, maxlen=question_maxlen, padding='post')
    YBegin = pad_sequences(begin_marks, maxlen=context_maxlen, padding='post')
    YEnd = pad_sequences(end_marks, maxlen=context_maxlen, padding='post')
    return X, Xq, YBegin, YEnd
# for validation dataset
QnARecurAtteLatest3Atten.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 28
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
QnARecurAtteLatest2GRUUnidirect.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 26
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
QnARecurAtteLatest2Attenenhance.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 27
收藏 0
点赞 0
评论 0
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised contexts, questions and answer spans into padded arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). For each example, two vectors of the context's length are
    built with a single 1 at the answer-begin and answer-end positions.

    Returns ``(X, Xq, YBegin, YEnd)`` as ``pad_sequences`` results: questions
    use ``question_maxlen``; contexts and both answer vectors use
    ``context_maxlen``. Padding is appended ('post').
    NOTE(review): ``truncating`` is left at the library default, so over-long
    sequences are presumably cut from the front -- confirm this is intended.
    '''
    contexts, questions = [], []
    begin_marks, end_marks = [], []
    for i, context in enumerate(xContext):
        contexts.append([word_index[w] for w in context])
        questions.append([word_index[w] for w in xQuestion[i]])
        # Mark the first and last word of the answer span.
        begin_vec = np.zeros(len(context))
        begin_vec[xAnswerBeing[i]] = 1
        end_vec = np.zeros(len(context))
        end_vec[xAnswerEnd[i]] = 1
        begin_marks.append(begin_vec)
        end_marks.append(end_vec)
    X = pad_sequences(contexts, maxlen=context_maxlen, padding='post')
    Xq = pad_sequences(questions, maxlen=question_maxlen, padding='post')
    YBegin = pad_sequences(begin_marks, maxlen=context_maxlen, padding='post')
    YEnd = pad_sequences(end_marks, maxlen=context_maxlen, padding='post')
    return X, Xq, YBegin, YEnd
# for validation dataset
QnARecurAtteLatest2Attenenhance.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
QnARecurAtteLatest2GRU1SATTE.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised contexts, questions and answer spans into padded arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). For each example, two vectors of the context's length are
    built with a single 1 at the answer-begin and answer-end positions.

    Returns ``(X, Xq, YBegin, YEnd)`` as ``pad_sequences`` results: questions
    use ``question_maxlen``; contexts and both answer vectors use
    ``context_maxlen``. Padding is appended ('post').
    NOTE(review): ``truncating`` is left at the library default, so over-long
    sequences are presumably cut from the front -- confirm this is intended.
    '''
    contexts, questions = [], []
    begin_marks, end_marks = [], []
    for i, context in enumerate(xContext):
        contexts.append([word_index[w] for w in context])
        questions.append([word_index[w] for w in xQuestion[i]])
        # Mark the first and last word of the answer span.
        begin_vec = np.zeros(len(context))
        begin_vec[xAnswerBeing[i]] = 1
        end_vec = np.zeros(len(context))
        end_vec[xAnswerEnd[i]] = 1
        begin_marks.append(begin_vec)
        end_marks.append(end_vec)
    X = pad_sequences(contexts, maxlen=context_maxlen, padding='post')
    Xq = pad_sequences(questions, maxlen=question_maxlen, padding='post')
    YBegin = pad_sequences(begin_marks, maxlen=context_maxlen, padding='post')
    YEnd = pad_sequences(end_marks, maxlen=context_maxlen, padding='post')
    return X, Xq, YBegin, YEnd
# for validation dataset
QnARecurAtteLatest2GRU1SATTE.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
QnA.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised contexts, questions and answer spans into padded arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). For each example, two vectors of the context's length are
    built with a single 1 at the answer-begin and answer-end positions.

    Returns ``(X, Xq, YBegin, YEnd)`` as ``pad_sequences`` results: questions
    use ``question_maxlen``; contexts and both answer vectors use
    ``context_maxlen``. Padding is appended ('post').
    NOTE(review): ``truncating`` is left at the library default, so over-long
    sequences are presumably cut from the front -- confirm this is intended.
    '''
    contexts, questions = [], []
    begin_marks, end_marks = [], []
    for i, context in enumerate(xContext):
        contexts.append([word_index[w] for w in context])
        questions.append([word_index[w] for w in xQuestion[i]])
        # Mark the first and last word of the answer span.
        begin_vec = np.zeros(len(context))
        begin_vec[xAnswerBeing[i]] = 1
        end_vec = np.zeros(len(context))
        end_vec[xAnswerEnd[i]] = 1
        begin_marks.append(begin_vec)
        end_marks.append(end_vec)
    X = pad_sequences(contexts, maxlen=context_maxlen, padding='post')
    Xq = pad_sequences(questions, maxlen=question_maxlen, padding='post')
    YBegin = pad_sequences(begin_marks, maxlen=context_maxlen, padding='post')
    YEnd = pad_sequences(end_marks, maxlen=context_maxlen, padding='post')
    return X, Xq, YBegin, YEnd
# Note: download and unzip the GloVe pre-trained model files into the same directory as this script.
QnARecurAtteLatest1Attenenhance.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised contexts, questions and answer spans into padded arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). For each example, two vectors of the context's length are
    built with a single 1 at the answer-begin and answer-end positions.

    Returns ``(X, Xq, YBegin, YEnd)`` as ``pad_sequences`` results: questions
    use ``question_maxlen``; contexts and both answer vectors use
    ``context_maxlen``. Padding is appended ('post').
    NOTE(review): ``truncating`` is left at the library default, so over-long
    sequences are presumably cut from the front -- confirm this is intended.
    '''
    contexts, questions = [], []
    begin_marks, end_marks = [], []
    for i, context in enumerate(xContext):
        contexts.append([word_index[w] for w in context])
        questions.append([word_index[w] for w in xQuestion[i]])
        # Mark the first and last word of the answer span.
        begin_vec = np.zeros(len(context))
        begin_vec[xAnswerBeing[i]] = 1
        end_vec = np.zeros(len(context))
        end_vec[xAnswerEnd[i]] = 1
        begin_marks.append(begin_vec)
        end_marks.append(end_vec)
    X = pad_sequences(contexts, maxlen=context_maxlen, padding='post')
    Xq = pad_sequences(questions, maxlen=question_maxlen, padding='post')
    YBegin = pad_sequences(begin_marks, maxlen=context_maxlen, padding='post')
    YEnd = pad_sequences(end_marks, maxlen=context_maxlen, padding='post')
    return X, Xq, YBegin, YEnd
# for validation dataset
QnARecurAtteLatest1Attenenhance.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
QnARecurAtteLatest3Attenenhance.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised contexts, questions and answer spans into padded arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). For each example, two vectors of the context's length are
    built with a single 1 at the answer-begin and answer-end positions.

    Returns ``(X, Xq, YBegin, YEnd)`` as ``pad_sequences`` results: questions
    use ``question_maxlen``; contexts and both answer vectors use
    ``context_maxlen``. Padding is appended ('post').
    NOTE(review): ``truncating`` is left at the library default, so over-long
    sequences are presumably cut from the front -- confirm this is intended.
    '''
    contexts, questions = [], []
    begin_marks, end_marks = [], []
    for i, context in enumerate(xContext):
        contexts.append([word_index[w] for w in context])
        questions.append([word_index[w] for w in xQuestion[i]])
        # Mark the first and last word of the answer span.
        begin_vec = np.zeros(len(context))
        begin_vec[xAnswerBeing[i]] = 1
        end_vec = np.zeros(len(context))
        end_vec[xAnswerEnd[i]] = 1
        begin_marks.append(begin_vec)
        end_marks.append(end_vec)
    X = pad_sequences(contexts, maxlen=context_maxlen, padding='post')
    Xq = pad_sequences(questions, maxlen=question_maxlen, padding='post')
    YBegin = pad_sequences(begin_marks, maxlen=context_maxlen, padding='post')
    YEnd = pad_sequences(end_marks, maxlen=context_maxlen, padding='post')
    return X, Xq, YBegin, YEnd
# for validation dataset
QnARecurAtteLatest3Attenenhance.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
QnARecurAtteLatest.py 文件源码
项目:recurrent-attention-for-QA-SQUAD-based-on-keras
作者: wentaozhu
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Turn tokenised validation contexts and questions into padded index arrays.

    Every token is looked up in ``word_index`` (a missing token raises
    ``KeyError``). No answer vectors are built here.

    Returns ``(X, Xq)``: contexts padded to ``context_maxlen`` and questions
    padded to ``question_maxlen``, both with padding appended ('post').
    '''
    X = []
    Xq = []
    for i, context in enumerate(xContext):
        X.append([word_index[w] for w in context])
        Xq.append([word_index[w] for w in xQuestion[i]])
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    """Tokenise and pad the four text collections, then cache the results.

    A single ``Tokenizer`` limited to ``TrainConfig.MAX_NB_WORDS`` is fitted
    on all four collections together (training and test texts alike). Each
    collection is converted to index sequences and padded to
    ``TrainConfig.MAX_SEQUENCE_LENGTH``. The four arrays and the tokenizer's
    ``word_index`` are written with ``np.save`` to the paths in ``DirConfig``.

    Returns:
        ``(train_x1, train_x2, test_x1, test_x2, word_index)``.
    """
    # fit tokenizer
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    def to_padded(texts):
        # Texts -> index sequences -> padded array.
        return pad_sequences(tk.texts_to_sequences(texts),
                             maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    # q1, q2 training text sequence
    train_x1 = to_padded(train_ori1)
    train_x2 = to_padded(train_ori2)
    # q1, q2 testing text sequence
    test_x1 = to_padded(test_ori1)
    test_x2 = to_padded(test_ori2)

    cache_targets = (
        (DirConfig.Q1_CACHE_TRAIN, train_x1),
        (DirConfig.Q2_CACHE_TRAIN, train_x2),
        (DirConfig.Q1_CACHE_TEST, test_x1),
        (DirConfig.Q2_CACHE_TEST, test_x2),
        (DirConfig.WORD_INDEX_CACHE, word_index),
    )
    for cache_path, payload in cache_targets:
        # An open file object (not a path) is passed to np.save so the file
        # name is used exactly as configured; ``with`` closes the handle.
        with open(cache_path, 'wb') as cache_file:
            np.save(cache_file, payload)
    return train_x1, train_x2, test_x1, test_x2, word_index
def words_to_char_sequence(words_list, tk):
    """Build a fixed-size int32 tensor from per-sentence word lists.

    # Arguments
        words_list: one word list per sentence
        tk: tokenizer whose ``texts_to_sequences`` is applied to each
            sentence's word list
    # Output shape
        (len(words_list), MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)

    Each word's sequence is padded to ``MAX_CHAR_PER_WORD``; only the first
    ``MAX_SEQUENCE_LENGTH`` words of a sentence are kept and unused rows
    stay zero.
    """
    max_words = TrainConfig.MAX_SEQUENCE_LENGTH
    max_chars = TrainConfig.MAX_CHAR_PER_WORD
    c_seqs = np.zeros((len(words_list), max_words, max_chars), dtype='int32')
    for sent_no, words in enumerate(words_list):
        per_word = pad_sequences(tk.texts_to_sequences(words),
                                 maxlen=max_chars)
        # Number of word rows that fit into the fixed-size slot.
        kept = min(max_words, len(words))
        sentence_block = np.zeros((max_words, max_chars), dtype='int32')
        sentence_block[:kept, :] = per_word[:kept, :]
        c_seqs[sent_no] = sentence_block
    return c_seqs
def preprocess_batch(self, captions_label_encoded):
    """Turn label-encoded captions into model input and one-hot output.

    Args:
        captions_label_encoded: caption index sequences for one batch.

    Returns:
        ``(captions_input, captions_output)``: the post-padded captions with
        every positive index decreased by 1, and the one-hot matrices (one
        step longer than the input) with their first column dropped.
    """
    captions = keras_seq.pad_sequences(captions_label_encoded,
                                       padding='post')
    # Because the number of timesteps/words resulted by the model is
    # maxlen(captions) + 1 (because the first "word" is the image).
    captions_extended1 = keras_seq.pad_sequences(captions,
                                                 maxlen=captions.shape[-1] + 1,
                                                 padding='post')
    # A list comprehension (not ``map``) is used because ``map`` is lazy on
    # Python 3 and ``np.array`` cannot build an int array from an iterator.
    captions_one_hot = np.array(
        [self._tokenizer.sequences_to_matrix(caption)
         for caption in np.expand_dims(captions_extended1, -1)],
        dtype='int')
    # Decrease/shift word index by 1.
    # Shifting `captions_one_hot` makes the padding word
    # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
    # so its cross entropy loss will be zero.
    captions_decreased = captions.copy()
    captions_decreased[captions_decreased > 0] -= 1
    captions_one_hot_shifted = captions_one_hot[:, :, 1:]
    captions_input = captions_decreased
    captions_output = captions_one_hot_shifted
    return captions_input, captions_output
def build_vectors(keyword="", data_label="", lower_limit=None, upper_limit=None, folder_path="dataset"):
    """Build, pad and pickle the training and evaluation vectors.

    ``create_dataset`` is called on ``<folder_path>/train`` and
    ``<folder_path>/test``. Both vector sets are padded (float32) to the
    larger of the two reported lengths and pickled, together with their
    labels, under ``pickled_vectors/``. The common length is also written
    to the text file ``maxlen_<keyword>``.

    Args:
        keyword: forwarded to ``create_dataset`` and used in file names.
        data_label: prefix used in the pickle file names.
        lower_limit: forwarded to ``create_dataset``.
        upper_limit: forwarded to ``create_dataset``.
        folder_path: directory holding the ``train`` and ``test`` data.
    """
    def _dump(obj, path):
        # Pickle ``obj`` to ``path``; ``with`` closes the handle.
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)

    # training
    training_vector, labels, maxlen_training = create_dataset(
        dataset_path=folder_path + "/train", keyword=keyword,
        lower_limit=lower_limit, upper_limit=upper_limit)
    # validation
    evaluation_training_vector, evaluation_labels, maxlen_evaluation = create_dataset(
        dataset_path="{0}/test".format(folder_path), keyword=keyword,
        lower_limit=lower_limit, upper_limit=upper_limit)

    # Both sets share one length so they can feed the same model input.
    maxlen = np.max([maxlen_training, maxlen_evaluation])

    # X_training
    training_vector = sequence.pad_sequences(training_vector, maxlen=maxlen, dtype='float32')
    _dump(training_vector, "pickled_vectors/{1}{0}_training_vector.pickle".format(keyword, data_label))
    # y
    _dump(labels, "pickled_vectors/{1}{0}_label.pickle".format(keyword, data_label))

    # evaluation
    evaluation_training_vector = sequence.pad_sequences(evaluation_training_vector, maxlen=maxlen, dtype='float32')
    _dump(evaluation_training_vector, "pickled_vectors/{1}{0}_evaluation_training_vector.pickle".format(keyword, data_label))
    # evaluation labels
    _dump(evaluation_labels, "pickled_vectors/{1}{0}_evaluation_label.pickle".format(keyword, data_label))

    with open("maxlen_{0}".format(keyword), "w") as _f:
        _f.write(str(maxlen))
def test_pad_sequences():
    """Check padding side, truncation side and fill value of pad_sequences."""
    a = [[1], [1, 2], [1, 2, 3]]
    cases = [
        # (keyword arguments, expected result)
        # padding
        (dict(maxlen=3, padding='pre'), [[0, 0, 1], [0, 1, 2], [1, 2, 3]]),
        (dict(maxlen=3, padding='post'), [[1, 0, 0], [1, 2, 0], [1, 2, 3]]),
        # truncating
        (dict(maxlen=2, truncating='pre'), [[0, 1], [1, 2], [2, 3]]),
        (dict(maxlen=2, truncating='post'), [[0, 1], [1, 2], [1, 2]]),
        # fill value
        (dict(maxlen=3, value=1), [[1, 1, 1], [1, 1, 2], [1, 2, 3]]),
    ]
    for kwargs, expected in cases:
        b = pad_sequences(a, **kwargs)
        assert_allclose(b, expected)
__init__.py 文件源码
项目:text-classification-with-convnets
作者: osmanbaskaya
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def testset_read(fn, word_idx, maxlen):
    """Read a test file, map its tokens to ids and pad the sequences.

    The file is read as UTF-8; if decoding fails it is re-read without an
    explicit encoding. Each line is tokenised with ``TreebankWordTokenizer``
    and every token is mapped through ``word_idx`` (id 1 for unknown words).
    A summary of the unknown-token count is written to ``sys.stderr``.

    Args:
        fn: path of the file to read, one sentence per line.
        word_idx: mapping from token to integer id.
        maxlen: length forwarded to ``sequence.pad_sequences``.

    Returns:
        ``(X, sentences)``: the padded id array and the raw lines.
    """
    tokenizer = TreebankWordTokenizer()
    try:
        with codecs.open(fn, encoding='utf8') as fin:
            lines = fin.read().splitlines()
    except UnicodeDecodeError:
        with codecs.open(fn) as fin:
            lines = fin.read().splitlines()
    total_num_of_unk = 0
    X = []
    sentences = []
    for line in lines:
        # 1 is UNKNOWN word id
        s = [word_idx.get(token, 1) for token in tokenizer.tokenize(line)]
        total_num_of_unk += s.count(1)
        X.append(s)
        sentences.append(line)
    X = sequence.pad_sequences(X, maxlen=maxlen)
    # An empty file has no sentences; report 0.0 instead of dividing by zero.
    avg_unk = total_num_of_unk / float(len(sentences)) if sentences else 0.0
    # sys.stderr.write behaves the same on Python 2 and 3, unlike `print >>`.
    sys.stderr.write("Total number of UNK={}, Avg. {}".format(total_num_of_unk, avg_unk) + "\n")
    return X, sentences
def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    """Convert tagged sentences into padded word, POS and chunk arrays.

    Args:
        data: sentences; each item of a sentence is indexed as
            ``(word, pos_tag, chunk_tag)``.
        vocab: words in index order; a lower-cased word missing from it
            maps to index 1.
        pos_tags: list of POS tags; a tag's position is its label.
        chunk_tags: list of chunk tags; a tag's position is its label.
        maxlen: padded length; defaults to the longest sentence in ``data``.
        onehot: when true, labels are expanded to float32 one-hot rows;
            otherwise a trailing axis of size 1 is added.

    Returns:
        ``(x, y_pos, y_chunk)``.
    """
    if maxlen is None:
        maxlen = max(len(s) for s in data)

    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    # Labels are left-padded with -1; any integer works as it will be masked.
    y_pos = pad_sequences(y_pos, maxlen, value=-1)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        # Index each identity matrix with its own label array. Padded
        # positions (-1) select the last row and rely on masking downstream.
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk
def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    """Tokenise and pad the four text collections, then cache the results.

    A single ``Tokenizer`` limited to ``TrainConfig.MAX_NB_WORDS`` is fitted
    on all four collections together (training and test texts alike). Each
    collection is converted to index sequences and padded to
    ``TrainConfig.MAX_SEQUENCE_LENGTH``. The four arrays and the tokenizer's
    ``word_index`` are written with ``np.save`` to the paths in ``DirConfig``.

    Returns:
        ``(train_x1, train_x2, test_x1, test_x2, word_index)``.
    """
    # fit tokenizer
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    def to_padded(texts):
        # Texts -> index sequences -> padded array.
        return pad_sequences(tk.texts_to_sequences(texts),
                             maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    # q1, q2 training text sequence
    train_x1 = to_padded(train_ori1)
    train_x2 = to_padded(train_ori2)
    # q1, q2 testing text sequence
    test_x1 = to_padded(test_ori1)
    test_x2 = to_padded(test_ori2)

    cache_targets = (
        (DirConfig.Q1_CACHE_TRAIN, train_x1),
        (DirConfig.Q2_CACHE_TRAIN, train_x2),
        (DirConfig.Q1_CACHE_TEST, test_x1),
        (DirConfig.Q2_CACHE_TEST, test_x2),
        (DirConfig.WORD_INDEX_CACHE, word_index),
    )
    for cache_path, payload in cache_targets:
        # An open file object (not a path) is passed to np.save so the file
        # name is used exactly as configured; ``with`` closes the handle.
        with open(cache_path, 'wb') as cache_file:
            np.save(cache_file, payload)
    return train_x1, train_x2, test_x1, test_x2, word_index
def words_to_char_sequence(words_list, tk):
    """Build a fixed-size int32 tensor from per-sentence word lists.

    # Arguments
        words_list: one word list per sentence
        tk: tokenizer whose ``texts_to_sequences`` is applied to each
            sentence's word list
    # Output shape
        (len(words_list), MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)

    Each word's sequence is padded to ``MAX_CHAR_PER_WORD``; only the first
    ``MAX_SEQUENCE_LENGTH`` words of a sentence are kept and unused rows
    stay zero.
    """
    max_words = TrainConfig.MAX_SEQUENCE_LENGTH
    max_chars = TrainConfig.MAX_CHAR_PER_WORD
    c_seqs = np.zeros((len(words_list), max_words, max_chars), dtype='int32')
    for sent_no, words in enumerate(words_list):
        per_word = pad_sequences(tk.texts_to_sequences(words),
                                 maxlen=max_chars)
        # Number of word rows that fit into the fixed-size slot.
        kept = min(max_words, len(words))
        sentence_block = np.zeros((max_words, max_chars), dtype='int32')
        sentence_block[:kept, :] = per_word[:kept, :]
        c_seqs[sent_no] = sentence_block
    return c_seqs
def preprocess_input_sequences(self, data, shuffle=True):
    """
    Prepare one dataset for the model.

    ``data`` unpacks into documents, questions, answer and candidates; when
    ``shuffle`` is true it is first passed through ``self.union_shuffle``.
    Questions, documents and candidates are padded/truncated at the end
    ('post') to ``self.q_len``, ``self.d_len`` and ``self.A_len`` as int32.

    Returns ``(questions_ok, documents_ok, context_mask, candidates_ok,
    y_true)``. ``context_mask`` is the evaluated ``tf.sequence_mask`` of the
    original document lengths, and ``y_true`` has the shape of
    ``candidates_ok`` with column 0 set to 1.
    NOTE(review): this presumably assumes the correct answer is always the
    first candidate -- confirm against the data loader.
    """
    if shuffle:
        documents, questions, answer, candidates = self.union_shuffle(data)
    else:
        documents, questions, answer, candidates = data

    # Document lengths are recorded before padding so the mask reflects them.
    d_lens = [len(doc) for doc in documents]

    pad_options = dict(dtype="int32", padding="post", truncating="post")
    questions_ok = pad_sequences(questions, maxlen=self.q_len, **pad_options)
    documents_ok = pad_sequences(documents, maxlen=self.d_len, **pad_options)
    context_mask = K.eval(tf.sequence_mask(d_lens, self.d_len, dtype=tf.float32))
    candidates_ok = pad_sequences(candidates, maxlen=self.A_len, **pad_options)

    y_true = np.zeros_like(candidates_ok)
    y_true[:, 0] = 1
    return questions_ok, documents_ok, context_mask, candidates_ok, y_true