# transformers.py -- Python source code snippet

def word_vector_sum(
        documents: Iterable[str],
        vector_dict: Dict[str, np.ndarray],
        *,
        separator: Optional[FrozenSet] = None,
        lowercase: bool = True,
        stop_words: Optional[Set] = None
) -> np.ndarray:
    """
    Calculate the word vector sum for each document in an iterable.

    Each document is split into tokens with ``cypunct.split``; every
    token that has an entry in ``vector_dict`` (and is not a stop word)
    contributes its vector to the document's element-wise sum.

    Args:
        documents: An iterable of strings representing text
            documents.

        vector_dict(dict): A dictionary of words and their
            corresponding vectors. Must not be empty, because the
            vector dimension is read from its first value.

        separator(frozenset): A frozenset of Unicode code points
            to use as separators. It is passed unchanged to
            ``cypunct.split``. NOTE(review): the fallback to
            ``cypunct.unicode_classes.COMMON_SEPARATORS`` when this is
            None is not applied here; it presumably happens inside
            cypunct -- confirm.

        lowercase(bool): If True, lowercase each document before
            processing it.

        stop_words(set): A set of words whose vectors should not
            be included in the sum. None or an empty set disables
            stop-word filtering.

    Returns:
        (np.ndarray): One row per document, where each row is the
            sum of that document's word vectors. A document with no
            usable words yields a row of zeros. If ``documents`` is
            empty the result has shape ``(0, dimension)``.

    Raises:
        ValueError: If ``vector_dict`` is empty, since the vector
            dimension cannot be determined.

    """
    # Dimension of the vectors we're using, taken from the first value.
    try:
        first_vector = next(iter(vector_dict.values()))
    except StopIteration:
        raise ValueError(
            "vector_dict must contain at least one word vector"
        ) from None
    dimension = len(first_vector)

    # Used for documents in which every word is either a stop word or
    # missing from the dictionary.
    zero_vector = np.zeros(dimension)

    # An empty frozenset makes the stop-word test a no-op, so a single
    # code path serves both the filtered and unfiltered cases.
    excluded = stop_words or frozenset()

    if lowercase:
        documents = (doc.lower() for doc in documents)

    rows = []
    for doc in documents:
        vectors = [
            vector_dict[word]
            for word in cypunct.split(doc, separator)
            if word in vector_dict and word not in excluded
        ]
        rows.append(np.sum(vectors, axis=0) if vectors else zero_vector)

    if not rows:
        # Keep the result two-dimensional even when there are no documents.
        return np.zeros((0, dimension))
    return np.asarray(rows)