def is_embedded(sentence, embedding, analyzer):
    """Return True if every token of *sentence* is contained in *embedding*.

    The sentence is tokenized with ``analyzer`` and each token is checked
    with ``in`` against ``embedding``. On the first token that is missing,
    a "Dropping:" notice naming the whole sentence is written to stderr and
    False is returned. A sentence that yields no tokens is considered
    embedded.

    Args:
        sentence: The text to check; passed unchanged to ``analyzer``.
        embedding: Any container supporting the ``in`` operator for tokens.
        analyzer: Callable mapping ``sentence`` to an iterable of tokens.

    Returns:
        True if all tokens are found in ``embedding``, otherwise False.

    >>> embedding = ["a", "b", "c"]
    >>> queries = ["a b c", "a", "b", "c", "a b c d", "d", "a b c" ]
    >>> analyzer = lambda x: x.split()
    >>> [query for query in queries if is_embedded(query, embedding, analyzer)]
    ['a b c', 'a', 'b', 'c', 'a b c']
    >>> analyzer = CountVectorizer().build_analyzer()
    >>> [query for query in queries if is_embedded(query, embedding, analyzer)]
    ['a b c', 'a', 'b', 'c', 'a b c']
    """
    for word in analyzer(sentence):
        if word not in embedding:
            # Report on stderr so the notice does not mix with regular
            # output on stdout; stop at the first unknown token.
            print("Dropping:", sentence, file=sys.stderr)
            return False
    return True
# Comment list
# Article table of contents