def word_vector_sum(
    documents: Iterable[str],
    vector_dict: Dict[str, np.ndarray],
    *,
    separator: Optional[FrozenSet] = None,
    lowercase: bool = True,
    stop_words: Optional[Set] = None
) -> np.ndarray:
    """
    Calculate the word vector sum for each document in an iterable.

    Each document is split into tokens with ``cypunct.split``; the
    vectors of all tokens found in ``vector_dict`` (and not listed in
    ``stop_words``) are added together element-wise.

    Args:
        documents: An iterable of strings representing text
            documents.
        vector_dict(dict): A dictionary of words and their
            corresponding vectors. Must not be empty. The vector
            dimension is taken from the first value, so all vectors
            are assumed to share that length.
        separator(frozenset): A frozenset of Unicode code points
            to use as separators. It is passed unchanged to
            ``cypunct.split``; when ``None``, ``cypunct`` is expected
            to fall back to
            ``cypunct.unicode_classes.COMMON_SEPARATORS``
            (NOTE(review): confirm against cypunct).
        lowercase(bool): If True, lowercase each document before
            processing it. Tokens are looked up in ``vector_dict``
            and ``stop_words`` after lowercasing.
        stop_words(set): A set of words whose vectors should not
            be included in the sum. ``None`` or an empty set
            disables stop-word filtering.

    Returns:
        (np.ndarray): A numpy array with one entry per document,
            where each entry is the sum of that document's word
            vectors. A document with no usable tokens contributes a
            vector of zeros.

    Raises:
        ValueError: If ``vector_dict`` is empty, since the vector
            dimension cannot be determined.
    """
    # Dimension of the vectors we're using.
    try:
        dimension = len(next(iter(vector_dict.values())))
    except StopIteration:
        raise ValueError(
            "vector_dict must contain at least one word vector"
        ) from None

    # Used when every word in a document is either a stop word or
    # missing from the dictionary, so there is nothing to add up.
    zero_vector = np.zeros(dimension)

    # An empty frozenset makes the stop-word test a no-op, which lets
    # a single code path serve both the filtered and unfiltered case.
    excluded = stop_words or frozenset()

    def _document_sum(doc: str) -> np.ndarray:
        """Return the element-wise sum of the known word vectors in *doc*."""
        if lowercase:
            doc = doc.lower()
        vectors = [
            vector_dict[word]
            for word in cypunct.split(doc, separator)
            if word in vector_dict and word not in excluded
        ]
        if not vectors:
            return zero_vector
        # Sum (not average): this is what the function name and its
        # documented contract promise.
        return np.sum(vectors, axis=0)

    return np.asarray([_document_sum(doc) for doc in documents])
# (Web-page residue, not part of this module: "Comment list" / "Table of contents".)