useful_functions.py 文件源码-python代码片段

def sentence2vec(sentence, model=WORD2VEC, stopwords=STOPWORDS, metadata=None, section=None, wordvecs_only=True):
    """
    Changes a sentence into a vector by averaging the word vectors of every non-stopword word in the sentence.
    :param sentence: the sentence to turn into a vector, as a list of words
    :param model: the word2vec model to use to convert words to vectors
    :param stopwords: stopwords to not include in the averaging of each sentence.
    :param metadata: dictionaries of metadata for the paper.
    :param section: the section of the paper the sentence occurs in.
    :param wordvecs_only: will turn a sentence into a vector using only the the word vectors from the model, no extra
                          features.
    :return: the sentence in vector representation
    """
    # The shape of the model, used to get the number of features and its vocab
    model_shape = model.syn0.shape
    vocab = set(model.index2word)

    # The array that will be used to calculate the average word vector
    average = np.zeros((model_shape[1]), dtype="float32")
    total_word_count = 0

    for word in sentence:

        if word in stopwords:
            continue

        if word in vocab:
            word_rep = model[word]
            average += word_rep
            total_word_count += 1

    if total_word_count == 0:
        total_word_count = 1

    average = np.divide(average, total_word_count)

    sentence_vec = average

    return sentence_vec