scoreword2veckeras.py 文件源码-python代码片段

def train_batch_score_sg(model, scored_word_sentences,
                         score_vector_size,
                         alpha=None, work=None,
                         sub_batch_size=256,
                         batch_size=256):

    batch_count=0
    sub_batch_count=0
    train_x0 =np.zeros((batch_size,sub_batch_size),dtype='int32')
    train_x1 =np.zeros((batch_size,sub_batch_size),dtype='int32')
    train_y0  =np.zeros((batch_size,sub_batch_size),dtype='int8')
    train_y1  =np.zeros((batch_size,sub_batch_size,score_vector_size),dtype='float32')
    # train_x0=[[0]]*batch_size
    # train_x1=[[0]]*batch_size
    # train_y0=[[0]]*batch_size
    # train_y1=[[0]]*batch_size
    while 1:
        for scored_word_sentence in scored_word_sentences:
            #sentence=[scored_word2word(scored_word) for scored_word in scored_word_sentence]

            word_vocabs = [[model.vocab[w],s] for [w,s] in scored_word_sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, scored_word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                word=scored_word2word(scored_word)
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, scored_word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    word2=scored_word2word(scored_word2)
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen=train_sg_pair(model, model.index2word[word.index], word2.index) #, alpha)
                        for xy in xy_gen :
                            if xy !=None:
                                (x0,x1,y0)=xy
                                y1=scored_word2score(scored_word)
                                train_x0[batch_count][sub_batch_count]=x0
                                train_x1[batch_count][sub_batch_count]=x1
                                train_y0[batch_count][sub_batch_count]=y0
                                train_y1[batch_count][sub_batch_count]=y1
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size :
                                    batch_count += 1
                                    sub_batch_count=0
                                if batch_count >= batch_size :
                                    yield { 'index':train_x0, 'point':train_x1, 'code':train_y0,'score':train_y1}
                                    batch_count=0

                                # train_x0[batch_count]=[x0]
                                # train_x1[batch_count]=x1
                                # train_y0[batch_count]=y0
                                # train_y1[batch_count]=y1
                                # #print train_x0,train_y1,
                                # batch_count += 1
                                # if batch_count >= batch_size :
                                #     #print { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)}
                                #     #yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1,dtype=float32)}
                                #     yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)}
                                #     batch_count=0