def train_word2vec(self, min_count=10, size=100, window=5, workers=3):
    self.word2vec_model = Word2Vec(Word2vecCorpus(self.corpus_file),
                                   min_count=min_count, size=size,
                                   window=window, workers=workers)
Example source code using the Python Word2Vec() class
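The snippets below follow the pre-4.0 gensim API (size=, iter=, model.vocab, save_word2vec_format). As a minimal, self-contained sketch of that API, assuming a toy in-memory corpus of tokenized sentences (the sentences and words are illustrative only):

from gensim.models import Word2Vec

sentences = [["hello", "world"], ["word", "embeddings", "example"]]   # toy corpus (assumption)
model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)  # pre-4.0 keyword names
vector = model.wv["hello"]                       # 100-dimensional vector for a vocabulary word
similar = model.wv.most_similar("hello", topn=3)  # nearest neighbours by cosine similarity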
def __init__(self, df, columns, model_param):
    self.df = df
    self.columns = columns
    self.model_param = model_param
    self.model = Word2Vec(sg=self.model_param["sg"],
                          hs=self.model_param["hs"],
                          alpha=self.model_param["alpha"],
                          min_alpha=self.model_param["alpha"],
                          min_count=self.model_param["min_count"],
                          size=self.model_param["size"],
                          sample=self.model_param["sample"],
                          window=self.model_param["window"],
                          workers=self.model_param["workers"])
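A hypothetical model_param dictionary for this constructor might look as follows; the keys mirror the lookups above, and the values are illustrative defaults rather than anything taken from the original project:

model_param = {
    "sg": 1,          # 1 = skip-gram, 0 = CBOW
    "hs": 0,          # 0 = negative sampling, 1 = hierarchical softmax
    "alpha": 0.025,   # also reused as min_alpha above, keeping the learning rate fixed
    "min_count": 5,
    "size": 100,
    "sample": 1e-3,
    "window": 5,
    "workers": 4,
}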
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # unsupervised data
    hashtag_tweets = 'tweets/hashtag_tweets.gz'
    files = [hashtag_tweets]
    sentences = MySentences(files=files)
    model = models.Word2Vec(sentences, size=100, window=5, min_count=15, workers=8,
                            sg=1, sample=1e-5, hs=1)
    model.save_word2vec_format('embeddings/hashtag_tweets_embedding', binary=False)
def main(in_dir, out_loc, task=1, size=128, window=5, min_count=10,
         n_workers=4, hs=1, nr_iter=5):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger = logging.getLogger(__name__)
    model = Word2Vec(
        sg=task,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        hs=hs,          # pass the parameter through instead of hard-coding 1
        iter=nr_iter
    )
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            try:
                text = file_.read()
            except UnicodeDecodeError:
                print(text_loc)
                continue  # skip undecodable files instead of reusing a stale `text`
        total_sents += text.count('\n')
        total_words += corpus.count_doc(text.split())
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    # Build the vocabulary by hand from the corpus counts, then train.
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for key, string in corpus.strings.items():
        model.raw_vocab[string] = corpus.counts[key]
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    # Trim down the model to just the vectors needed for querying.
    model.init_sims(replace=True)
    model.save(out_loc)
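The snippet above wires the vocabulary into the model by hand through raw_vocab, scale_vocab(), and finalize_vocab(). For comparison, a sketch of the standard path in the same older gensim API, assuming sentences is an iterable of token lists:

model = Word2Vec(sg=1, size=128, window=5, min_count=10, workers=4, hs=1, iter=5)
model.build_vocab(sentences)   # scans the corpus and builds the vocabulary internally
model.train(sentences)         # pre-1.0 gensim call; newer versions also require total_examples and epochs
model.init_sims(replace=True)  # trim to unit-normalized vectors for querying
model.save('word2vec.model')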
def train(self, **kargs):
    self.config.update(kargs)
    self.model = _Word2Vec(list(self.database.sentences), **self.config)
    delattr(self, "database")
def train_rnas(seq_file='utrs.fa', outfile='rnadocEmbedding25.pickle'):
    min_count = 5
    dim = 50
    window = 5
    print('dim: ' + str(dim) + ', window: ' + str(window))
    seq_dict = read_fasta_file(seq_file)
    #text = seq_dict.values()
    tris = get_6_trids()
    sentences = []
    for seq in seq_dict.values():
        seq = seq.replace('T', 'U')
        bag_sen = []
        bag_seqs = split_overlap_seq(seq)
        for new_seq in bag_seqs:
            trvec = get_4_nucleotide_composition(tris, new_seq)
            bag_sen.append(trvec)
        #for aa in range(len(text)):
        sentences.append(bag_sen)
    #pdb.set_trace()
    print(len(sentences))
    model = None
    docs = train_tag_doc(sentences)
    #model = Word2Vec(sentences, min_count=min_count, size=dim, window=window, sg=1, iter=10, batch_words=100)
    #model = gensim.models.doc2vec.Doc2Vec(docs, size=50, window=300, min_count=min_count, workers=4)
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=min_count, iter=50)
    model.build_vocab(docs)
    model.train(docs)
    '''vocab = list(model.vocab.keys())
    print vocab
    fw = open('rna_doc_dict', 'w')
    for val in vocab:
        fw.write(val + '\n')
    fw.close()
    #print model.syn0
    #pdb.set_trace()
    embeddingWeights = np.empty([len(vocab), dim])
    for i in range(len(vocab)):
        embeddingWeights[i,:] = model[vocab[i]]
    allWeights.append(embeddingWeights)
    '''
    #model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])
    #with open(outfile, 'w') as f:
    #    pickle.dump(model, f)
    # store the model to mmap-able files
    pdb.set_trace()
    model.save(outfile)
    # load the model back
    #model_loaded = Doc2Vec.load(outfile)
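As the trailing comment hints, the saved Doc2Vec model can be loaded back and used to infer a vector for a new k-mer "sentence". A minimal sketch, assuming the same output path as above and hypothetical 6-mer tokens:

from gensim.models.doc2vec import Doc2Vec

model_loaded = Doc2Vec.load('rnadocEmbedding25.pickle')
new_doc = ['AUGGCU', 'UGGCUA', 'GGCUAG']        # hypothetical 6-mer tokens
vector = model_loaded.infer_vector(new_doc)     # 50-dimensional document vector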
def load_w2v(self):
    """
    Load Word2Vec embeddings from P2FA files and a pre-trained Word2Vec
    KeyedVectors text file, and store them in the directory path given
    by self.embedding_dir.
    :returns: segment-wise feature dictionary for embeddings
    :Note: Do not provide the KeyedVectors file in binary format
    """
    from gensim.models.keyedvectors import KeyedVectors
    from gensim.models import Word2Vec
    is_binary = True if self.embed_model_type == "binary" else False
    model = KeyedVectors.load_word2vec_format(self.embed_model_path,
                                              binary=is_binary)
    print "Word2Vec model loaded"
    self.embed_model = model
    self.embed_length = model.vector_size
    if not self.word_dict:
        self.load_words()
    features = {}
    system("mkdir -p " + self.embedding_dir)
    for video_id, video_word_data in self.word_dict.iteritems():
        video_feats = {}
        for segment_id, segment_word_data in video_word_data.iteritems():
            video_feats[segment_id] = []
            for word_feat in segment_word_data:
                start, end, word = word_feat
                try:
                    embed = self.embed_model[word]
                except:
                    embed = np.zeros(self.embed_length)
                video_feats[segment_id].append((start, end, embed))
            fname = video_id + "_" + segment_id + ".csv"
            fpath = join(self.embedding_dir, fname)
            with open(fpath, "wb") as fh:
                # Write each (start, end, embedding) feature of the segment as one CSV row
                for f in video_feats[segment_id]:
                    f_start = str(f[0])
                    f_end = str(f[1])
                    f_val = [str(val) for val in f[2].tolist()]
                    str2write = ",".join([f_start, f_end] + f_val)
                    str2write += "\n"
                    fh.write(str2write)
        features[video_id] = video_feats
    return features
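A minimal standalone sketch of the KeyedVectors loading and lookup pattern used above, with an illustrative model path and word; out-of-vocabulary words fall back to a zero vector as in the method:

from gensim.models.keyedvectors import KeyedVectors
import numpy as np

kv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
try:
    vec = kv['hello']                   # 300-dimensional vector
except KeyError:
    vec = np.zeros(kv.vector_size)      # zero vector for out-of-vocabulary words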
def makeFeature(df_features):
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get sentence vectors')
    model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    # model = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False)
    # model = Word2Vec(brown.sents())
    df_features['vec1'] = df_features.q1_expand.map(lambda x: getVec(x, model))
    df_features['vec2'] = df_features.q2_expand.map(lambda x: getVec(x, model))
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get six similarity coefficients between sentence vectors')
    df_features['f_cosine'] = df_features.apply(lambda x: Cosine(x['vec1'], x['vec2']), axis=1)
    df_features['f_manhatton'] = df_features.apply(lambda x: Manhatton(x['vec1'], x['vec2']), axis=1)
    df_features['f_euclidean'] = df_features.apply(lambda x: Euclidean(x['vec1'], x['vec2']), axis=1)
    df_features['f_pearson'] = df_features.apply(lambda x: PearsonSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_spearman'] = df_features.apply(lambda x: SpearmanSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_kendall'] = df_features.apply(lambda x: KendallSimilar(x['vec1'], x['vec2']), axis=1)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get three word2vec-based document similarity coefficients')
    df_features['f_cosine_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Cosine, model), axis=1)
    df_features['f_euclidean_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Euclidean, model), axis=1)
    df_features['f_manhatton_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Manhatton, model), axis=1)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get set-overlap coefficients for raw, expanded, noun, verb, and adjective tokens')
    df_features['f_raw_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1'], x['question2']), axis=1)
    df_features['f_raw_dice'] = df_features.apply(lambda x: Dice(x['question1'], x['question2']), axis=1)
    df_features['f_raw_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1'], x['question2']), axis=1)
    df_features['f_expand_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_expand_dice'] = df_features.apply(lambda x: Dice(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_expand_ochiai'] = df_features.apply(lambda x: Ochiai(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_nouns_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_nouns_dice'] = df_features.apply(lambda x: Dice(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_nouns_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_verbs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_verbs_dice'] = df_features.apply(lambda x: Dice(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_verbs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_adjs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_adjs'], x['question2_adjs']), axis=1)
    df_features['f_adjs_dice'] = df_features.apply(lambda x: Dice(x['question1_adjs'], x['question2_adjs']), axis=1)
    df_features['f_adjs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_adjs'], x['question2_adjs']), axis=1)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get weighted overlap for the expanded questions')
    weights = word_weights(df_features)
    df_features['f_weighted_overlap'] = df_features.apply(lambda x: weighted_Overlap(x['q1_expand'], x['q2_expand'], weights), axis=1)
    print('all done')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a new DataFrame; assign it back
    return df_features
def get_word_embeddings(num_dimensions=500,
                        cache_loc=EMBEDDINGS_FILE):
    """Generates word embeddings.
    Args:
        num_dimensions: int, number of embedding dimensions.
        cache_loc: str, where to cache the word embeddings.
    Returns:
        numpy array representing the embeddings, with shape (NUM_TOKENS,
        num_dimensions).
    """
    if os.path.exists(cache_loc):
        embeddings = np.load(cache_loc)
    else:
        class SentenceGenerator(object):
            def __iter__(self):
                iterable = itertools.islice(iterate_qa_pairs(), 1000000)
                for i, (question, answer) in enumerate(iterable, 1):
                    q, a, _, _ = tokenize(question=question, answer=answer,
                                          use_pad=False, include_rev=False)
                    yield [str(w) for w in q]
                    yield [str(w) for w in a]
                    del q, a
                    if i % 1000 == 0:
                        sys.stderr.write('\rprocessed %d' % i)
                        sys.stderr.flush()
                sys.stderr.write('\rprocessed %d\n' % i)
                sys.stderr.flush()
        # The default embeddings.
        embeddings = np.random.normal(size=(NUM_TOKENS, num_dimensions))
        sentences = SentenceGenerator()
        model = models.Word2Vec(sentences, size=num_dimensions)
        word_vectors = model.wv
        del model
        # Put the Word2Vec weights into the right order.
        weights = word_vectors.syn0
        vocab = word_vectors.vocab
        for k, v in vocab.items():
            embeddings[int(k)] = weights[v.index]
        with open(cache_loc, 'wb') as f:
            np.save(f, embeddings)
    assert embeddings.shape == (NUM_TOKENS, num_dimensions)
    return embeddings
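The reordering step above relies on the pre-4.0 gensim attributes wv.syn0 (the trained weight matrix) and wv.vocab (a dict mapping each token to an entry with an .index). A small sketch of that mapping, under the same assumption that tokens are stringified integer ids:

from gensim.models import Word2Vec
import numpy as np

sentences = [['0', '1', '2'], ['1', '2', '3']]           # tokens are stringified ids (assumption)
model = Word2Vec(sentences, size=8, min_count=1)
embeddings = np.zeros((4, 8))
for token, entry in model.wv.vocab.items():
    embeddings[int(token)] = model.wv.syn0[entry.index]  # row for token id = its trained vector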
def get_global_embeddings(self, filenames, embedding_size, embedding_dir):
    """ Construct the embedding matrix for the sentences in filenames.
    Args:
        filenames: Names of the training files on which the vocab is
            built. These are used when no pretrained embeddings are
            present; instead of using random embeddings, Word2Vec is
            trained on the available dataset.
        embedding_size: Dimensionality of the embeddings.
        embedding_dir: Directory in which embeddings are cached.
    Returns:
        Embedding matrix.
    """
    sentences = []
    if os.path.exists(embedding_dir + 'vocab_len.pkl'):
        vocab_len_stored = pickle.load(open(embedding_dir + "vocab_len.pkl", "rb"))
    else:
        vocab_len_stored = 0
    if vocab_len_stored == self.len_vocab and os.path.exists(embedding_dir + "embeddings.pkl"):
        print("Loading cached embeddings")
        self.embeddings = pickle.load(open(embedding_dir + "embeddings.pkl", "rb"))
        return None
    if os.path.exists(embedding_dir + 'embeddings'):
        model = KeyedVectors.load_word2vec_format(embedding_dir + 'embeddings', binary=False)
        print("Loading pretrained embeddings")
    else:
        for file in filenames:
            with open(file, 'rb') as f:
                for lines in f:
                    words = [lines.split()]
                    sentences.extend(words)
        model = Word2Vec(sentences, size=embedding_size, min_count=0)
        # Save in word2vec text format so the load_word2vec_format branch above can read it back
        model.wv.save_word2vec_format(embedding_dir + 'embeddings', binary=False)
    self.embeddings_model = model
    return model
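A minimal sketch of the matching save/load pair assumed above: the text format written by wv.save_word2vec_format is what KeyedVectors.load_word2vec_format expects (the path and corpus are illustrative):

from gensim.models import Word2Vec, KeyedVectors

model = Word2Vec([['a', 'b', 'c'], ['b', 'c', 'd']], size=16, min_count=0)
model.wv.save_word2vec_format('embeddings.txt', binary=False)   # "<vocab_size> <dim>" header, one word per line
reloaded = KeyedVectors.load_word2vec_format('embeddings.txt', binary=False)
assert reloaded.vector_size == 16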
def train_batch_score_cbow_xy_generator(model, scored_word_sentences):
    for scored_word_sentence in scored_word_sentences:
        #print scored_word_sentence
        scored_word_vocabs = [[model.vocab[w], s] for [w, s] in scored_word_sentence
                              if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, scored_word in enumerate(scored_word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(scored_word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [scored_word2[0].index for pos2, scored_word2 in window_pos
                             if (scored_word2 is not None and scored_word2[0] is not None and pos2 != pos)]
            xy_gen = train_cbow_pair(model, scored_word[0], word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    xy1 = [xy[0], xy[1], xy[2], [scored_word[1]]]
                    yield xy1
            # if xy != None:
            #     xy1 = [xy[0], xy[1], xy[2], scored_word[1]]
            #     yield xy1
def __init__(self, fname='data/korean_word2vec', dim=300):
    self.dim = dim
    try:
        # load saved model
        print('Loading korean word2vec model')
        self.model = word2vec.Word2Vec.load(fname)
    except:
        print(':: There is no word2vec model')
def extract_countries():
    countries_vec = {}
    vec = word2vec.Word2Vec.load("word2vec")
    for line in open("../chapter09/countries.txt", "r"):
        country = line.strip().replace(" ", "_")
        if country in vec.vocab.keys():
            countries_vec[country] = vec[country]
    return countries_vec
utterance_embed.py (project: dstc6_dialogue_breakdown_task, author: JudeLee19)
def __init__(self, file_name, dim=300):
    self.dim = dim
    try:
        print('Loading english word2vec model')
        self.word2vec_model = word2vec.Word2Vec.load(file_name)
    except:
        print('Error while loading word2vec model')
def load_embedding(data, embedding_file, binary=True, prefix=None, file_name='embedding.pkl'):
    """
    :param data: iterable of (sentence, label) pairs used to build the vocabulary
    :param embedding_file: path to a pretrained word2vec file
    :param binary: whether embedding_file is in binary word2vec format
    :param prefix: if prefix is None, build the embedding and write it to file_name,
        else load the pickled embedding from prefix
    :param file_name: pickle file to write the embedding to
    :return: (vocab_size, word_idx, embedding)
    """
    if prefix is None:
        vocab = sorted(reduce(lambda x, y: x | y, (set(sentence) for sentence, _ in data)))
        word_idx = dict((c, i) for i, c in enumerate(vocab))
        vocab_size = len(word_idx) + 1  # +1 for nil word
        # e.g. "/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin"
        model = word2vec.Word2Vec.load_word2vec_format(embedding_file, binary=binary)
        embedding = []
        for c in word_idx:
            if c in model:
                embedding.append(model[c])
            else:
                # random vector in [-0.1, 0.1) for out-of-vocabulary words
                embedding.append(np.random.uniform(-0.1, 0.1, 300))
        embedding = np.array(embedding, dtype=np.float32)
        with open(file_name, 'wb') as f:
            pickle.dump(embedding, f)
            pickle.dump(vocab_size, f)
            pickle.dump(word_idx, f)
    else:
        with open(prefix, 'rb') as f:
            embedding = pickle.load(f)
            vocab_size = pickle.load(f)
            word_idx = pickle.load(f)
    return vocab_size, word_idx, embedding
def train_batch_sg(model, sentences, alpha=None, work=None, sub_batch_size=256, batch_size=256):
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y = np.zeros((batch_size, sub_batch_size), dtype='int8')
    while 1:
        for sentence in sentences:
            word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                #window_length = len(word_vocabs[start:(pos + model.window + 1 - reduced_window)])
                #print window_length,
                for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y) = xy
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y[batch_count][sub_batch_count] = y
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                    if batch_count >= batch_size:
                                        yield {'index': train_x0, 'point': train_x1, 'code': train_y}
                                        batch_count = 0
def train_batch_cbow_xy_generator(model, sentences):
    for sentence in sentences:
        word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
            xy_gen = train_cbow_pair(model, word, word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    yield xy
def train_batch_score_sg(model, scored_word_sentences,
                         score_vector_size,
                         alpha=None, work=None,
                         sub_batch_size=256,
                         batch_size=256):
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y0 = np.zeros((batch_size, sub_batch_size), dtype='int8')
    train_y1 = np.zeros((batch_size, sub_batch_size, score_vector_size), dtype='float32')
    # train_x0 = [[0]]*batch_size
    # train_x1 = [[0]]*batch_size
    # train_y0 = [[0]]*batch_size
    # train_y1 = [[0]]*batch_size
    while 1:
        for scored_word_sentence in scored_word_sentences:
            #sentence = [scored_word2word(scored_word) for scored_word in scored_word_sentence]
            word_vocabs = [[model.vocab[w], s] for [w, s] in scored_word_sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, scored_word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                word = scored_word2word(scored_word)
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, scored_word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    word2 = scored_word2word(scored_word2)
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)  # , alpha)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y0) = xy
                                y1 = scored_word2score(scored_word)
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y0[batch_count][sub_batch_count] = y0
                                train_y1[batch_count][sub_batch_count] = y1
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                    if batch_count >= batch_size:
                                        yield {'index': train_x0, 'point': train_x1, 'code': train_y0, 'score': train_y1}
                                        batch_count = 0
                                # train_x0[batch_count] = [x0]
                                # train_x1[batch_count] = x1
                                # train_y0[batch_count] = y0
                                # train_y1[batch_count] = y1
                                # #print train_x0, train_y1,
                                # batch_count += 1
                                # if batch_count >= batch_size :
                                #     #print { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)}
                                #     #yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1,dtype=float32)}
                                #     yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)}
                                #     batch_count = 0