import numpy as np
from gensim import models

# local_ref, one_hot_encoding, prepare_sentence, and util.split_data are
# helper functions defined elsewhere in the project.


def gen_dataset(sentences,
                categories,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded word vector and the output is the
        one-hot-encoded category.

        Args
        ----
        sentences : list
            list of sentences where each sentence is a list of tokens
        categories : list
            list of category sequences, one per sentence
        max_words : integer
            maximum number of words allowed in a sentence
        train_test_split : boolean
            whether to split the data into train and test sets

        Returns
        -------
        if train_test_split:
            (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
        else:
            (X, y, K), param_dict
    '''
    num_sentences = len(sentences)
    # load the pre-trained 300-dim GoogleNews word2vec embeddings
    # (for gensim >= 1.0, use models.KeyedVectors.load_word2vec_format instead)
    model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)
    # out-of-vocabulary words map to the zero vector
    vectorizer = lambda x: model[x] if x in model else np.zeros(300)
    encoder = one_hot_encoding(categories)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, len(encoder)))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words
    param_dict['encoder'] = encoder

    for sent_i in I:
        words = sentences[sent_i]
        cats = categories[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, categories=cats,
                             vectorizer=vectorizer,
                             encoder=encoder,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict

    return (X, y, K), param_dict
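
# --- Usage sketch (illustrative, not part of the original function) --------
# A minimal, hypothetical example of how gen_dataset might be called. The toy
# sentences and POS tags below are made up, and the sketch assumes the project
# helpers (local_ref, one_hot_encoding, prepare_sentence, util.split_data) and
# the GoogleNews binary are available on disk.
toy_sentences = [['the', 'dog', 'barks'], ['cats', 'sleep']]
toy_categories = [['DT', 'NN', 'VBZ'], ['NNS', 'VBP']]

(X_tr, X_te), (y_tr, y_te), (K_tr, K_te), params = gen_dataset(
    toy_sentences, toy_categories, max_words=78, train_test_split=True)

print(X_tr.shape)         # (num_train_sentences, 78, 300)  embedded word vectors
print(y_tr.shape)         # (num_train_sentences, 78, num_tags)  one-hot tags
print(K_tr)               # true (unpadded) length of each training sentence
print(params['encoder'])  # tag -> one-hot mapping produced by one_hot_encoding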