import numpy as np

# `train_sg_pair` comes from the same word2veckeras/gensim code base; it
# yields (word index, point index, code) triples for one skip-gram pair.
def train_batch_sg(model, sentences, alpha=None, work=None,
                   sub_batch_size=256, batch_size=256):
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')  # context word indices
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')  # hierarchical-softmax point indices
    train_y = np.zeros((batch_size, sub_batch_size), dtype='int8')    # binary codes (targets)
    while True:  # loop over the corpus forever; a Keras-style generator must never terminate
        for sentence in sentences:
            # keep in-vocabulary words, randomly discarding frequent ones (subsampling)
            word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, word2 in enumerate(
                        word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y) = xy
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y[batch_count][sub_batch_count] = y
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                if batch_count >= batch_size:
                                    yield {'index': train_x0, 'point': train_x1, 'code': train_y}
                                    batch_count = 0
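
Because the outer `while True:` loops over `sentences` indefinitely, the function is meant to be consumed as an infinite generator feeding a Keras-style training loop. A minimal sketch of drawing one batch, assuming `model` is a gensim-style Word2Vec instance (exposing `vocab`, `index2word`, `window` and a `random` RandomState) and `sentences` is an iterable of token lists:

# hypothetical usage: pull a single batch and inspect its shapes
gen = train_batch_sg(model, sentences, sub_batch_size=256, batch_size=256)
batch = next(gen)
# each array has shape (batch_size, sub_batch_size)
print(batch['index'].shape, batch['point'].shape, batch['code'].shape)

Note that the three NumPy arrays are allocated once and reused across batches, so a consumer that needs to keep a batch around must copy it before pulling the next one.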
Source: word2veckeras.py (Python)