def ngram_list(n, word_list, stop_word_list=None):
    """
    Generate ngrams of width n, excluding those formed entirely of stop words.

    Stop-word matching is case-insensitive: both the ngram tokens and the stop
    words are lowercased before they are compared. The returned ngrams keep the
    original casing of the tokens.

    Args:
        n (int): ngram width, i.e. 1, 2, 3...
        word_list (list of str): list of words
        stop_word_list (list of str, Optional): words to treat as stop words. An ngram
            is dropped only when every one of its tokens is a stop word. When omitted
            or empty, no ngram is dropped.

    Returns:
        list of str: Space-joined ngrams formed from the given word list, except for
            those that have all their tokens in the stop words list
    """
    # Tokens are lowercased for the membership test below, so the stop words must be
    # lowercased as well; otherwise an entry such as "The" could never match a token.
    if stop_word_list:
        stop_word_set = {stop_word.lower() for stop_word in stop_word_list}
    else:
        stop_word_set = set()

    # Keep an ngram as soon as one of its tokens is not a stop word.
    return [
        ' '.join(ngram)
        for ngram in nltk.ngrams(word_list, n)
        if any(token.lower() not in stop_word_set for token in ngram)
    ]
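# NOTE(review): stray non-code text followed the function here, apparently
# web-page labels ("Comment list", "Table of contents"); kept only as a comment.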