ngram.py 文件源码-python代码片段

ngram.py 文件源码

python

阅读 50 收藏 0 点赞 0 评论 0

项目：chatbot_ner 作者: hellohaptik 项目源码文件源码

def ngram_list(n, word_list, stop_word_list=None):
        """
        Generate ngrams with width n excluding those that are entirely formed of stop words

        Args:
            n (int): i.e. 1, 2, 3...
            word_list (list of str): list of words
            stop_word_list (list of str, Optional): list of words that should be excluded while obtaining
                                                    list of ngrams

        Returns:
            list of str: List of ngrams formed from the given word list except for those that have all their tokes in
                         stop words list
        """
        stop_word_set = set(stop_word_list) if stop_word_list else []
        all_ngrams = nltk.ngrams(word_list, n)
        ngram_list = []
        for ngram in all_ngrams:
            lowered_ngram_tokens = map(lambda token: token.lower(), ngram)
            if any(token not in stop_word_set for token in lowered_ngram_tokens):
                ngram_list.append(' '.join(ngram))
        return ngram_list