import speech_recognition as sr  # assumption: the SpeechRecognition package provides this module

def SpeechToText():
    r = sr.Recognizer()  # speech recognizer
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
    message = ""
    try:
        # recognize_google() is called inside the try block so failures are handled below
        message = r.recognize_google(audio)
        print("User: " + message)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
    return message
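# A minimal usage sketch (assumptions: a working microphone and PyAudio are available,
# and there is network access for the Google recognizer):
if __name__ == "__main__":
    spoken = SpeechToText()
    print("You said: " + spoken)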
# Function to score word importance, used to infer which topic is being asked about most.
# Python words() usage examples
# Source: readdata.py, from the project Natural-Language-Processing-Python-and-NLTK (author: PacktPublishing)
import nltk
from nltk.corpus import stopwords          # requires the NLTK 'stopwords' resource
from nltk.stem import WordNetLemmatizer    # requires the NLTK 'wordnet' resource

def preprocessing(text):
    # accept both bytes and str input (the original assumed Python 2 byte strings)
    if isinstance(text, bytes):
        text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # lowercase first so that capitalized stopwords are also removed
    tokens = [word.lower() for word in tokens]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
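# Usage sketch (assumption: the NLTK 'punkt', 'stopwords', and 'wordnet' resources have been
# downloaded, e.g. with nltk.download()):
sample = "The striped bats are hanging on their feet for best"
print(preprocessing(sample))
# -> e.g. "striped bat hanging foot best" (exact output depends on the NLTK version)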
from nltk.corpus import reuters  # requires the NLTK 'reuters' corpus

def collection_stats():
    # list of documents
    documents_stat = reuters.fileids()
    print(str(len(documents_stat)) + " documents")
    train_docs_stat = list(filter(lambda doc: doc.startswith("train"), documents_stat))
    print(str(len(train_docs_stat)) + " total training documents")
    test_docs_stat = list(filter(lambda doc: doc.startswith("test"), documents_stat))
    print(str(len(test_docs_stat)) + " total test documents")
    # list of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")
    # get the documents in a category
    category_docs = reuters.fileids("acq")
    # words for a document
    document_id = category_docs[0]
    document_words = reuters.words(document_id)
    print(document_words)
    # print the raw document
    print(reuters.raw(document_id))
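# Usage sketch (assumption: the Reuters corpus must be downloaded once before calling it):
import nltk
nltk.download('reuters')
collection_stats()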
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2).
    :type window_size: int
    """
    # Method of nltk.text.Text; BigramCollocationFinder, BigramAssocMeasures and tokenwrap
    # are imported at module level in nltk.text.
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
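# Usage sketch with the stock NLTK Text class (assumption: the Gutenberg corpus and the
# stopwords list have been downloaded):
from nltk.corpus import gutenberg
from nltk.text import Text
moby = Text(gutenberg.words('melville-moby_dick.txt'))
moby.collocations(num=10)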
def get_user_to_word_proportion(user_to_text, word):
    """
    Maps each user to the proportion of their words that consist of a specified word.
    """
    user_to_word_proportion = {}
    for user in user_to_text:
        # LanuageModel is a project-specific helper (name kept as in the original source)
        lm = LanuageModel(user_to_text[user])
        n_tokens = len(lm.lowercase_tokens)
        if n_tokens > 0:
            fd = nltk.FreqDist(lm.lowercase_tokens)
            user_to_word_proportion[user] = fd[word] / float(n_tokens)
        else:
            user_to_word_proportion[user] = 0.0
        print('Finished user {}'.format(user))
    return user_to_word_proportion
from numpy.random import choice  # assumption: numpy's weighted choice was intended (it accepts p=)

def generate(cfd, start_word, n):
    word = start_word
    words = []
    for i in range(n):
        words.append(word)
        # word = cfd[word].max()
        fd = cfd[word]
        n_next_words = sum(fd.values())
        if n_next_words > 0:
            probabilities = [fd[w] / float(n_next_words) for w in sorted(fd.keys())]
            word = choice(sorted(fd.keys()), p=probabilities)
        else:
            # pick a random word when the current word has no recorded successors
            # TODO: use unigram probabilities later
            word = choice(list(cfd.keys()))
    sentence = ' '.join(words)
    # TODO: modify above for punctuation
    return sentence
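# Usage sketch: build a bigram ConditionalFreqDist and generate 20 words from it
# (assumptions: numpy is installed and the NLTK Genesis corpus is downloaded).
import nltk
from nltk.corpus import genesis
cfd = nltk.ConditionalFreqDist(nltk.bigrams(genesis.words('english-kjv.txt')))
print(generate(cfd, 'living', 20))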
def rm_stop_words(data, mode="nltk", silent=1):
    """
    Input:
        data is a list, dict/{} or Counter of words
    """
    if silent == 0:
        print("remove stop words ...")
    if mode == "nltk":
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
    else:
        print("unknown mode", mode)
        assert 0
    if isinstance(data, list):
        data = [i for i in data if i.lower() not in stop_words]
        return data
    else:
        # dict-like input: drop stopword keys in place
        for word in stop_words:
            if word in data:
                del data[word]
        return data
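# Usage sketch (assumption: the NLTK stopwords corpus is downloaded):
from collections import Counter
print(rm_stop_words(["The", "quick", "brown", "fox"]))   # -> ['quick', 'brown', 'fox']
print(rm_stop_words(Counter({"the": 3, "fox": 1})))      # -> Counter({'fox': 1})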
import numpy as np
from keras.preprocessing.sequence import pad_sequences  # assumption: Keras supplies pad_sequences

def words_to_char_sequence(words_list, tk):
    """Convert a list of word lists to a character-index sequence.
    # Arguments
        words_list: word lists, (sentence_len, word_len)
        tk: a fitted character-level Keras Tokenizer
    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    # TrainConfig is a project-specific configuration object (as in the original source)
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in range(len(words_list)):  # range() replaces the Python 2 xrange()
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        # truncate sentences longer than MAX_SEQUENCE_LENGTH
        max_word_len = min(TrainConfig.MAX_SEQUENCE_LENGTH, len(words))
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
import re
import string
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer

def tiny_tokenize(text, stem=False, stop_words=[]):
    # accept both bytes and str input (the original assumed Python 2 byte strings)
    if isinstance(text, bytes):
        text = text.decode(encoding='UTF-8', errors='ignore')
    stemmer = EnglishStemmer() if stem else None
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', text)):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = stemmer.stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)
    return words
# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
# re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
# not token.isdigit() and not token in stop_words]
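# Usage sketch (assumption: NLTK and its stopwords corpus are installed):
from nltk.corpus import stopwords
print(tiny_tokenize("Cats, dogs & 42 birds!", stem=True, stop_words=set(stopwords.words('english'))))
# -> e.g. ['cat', 'dog', 'bird'] (stemming output may vary across NLTK versions)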
def build_vocab(word_freq, threshold=5, topn=None, start_idx=0):
    """
    threshold only takes effect when topn is None.
    Words are indexed by overall frequency in the dataset.
    """
    word_freq = sorted(word_freq.items(), key=lambda d: d[1], reverse=True)  # .items() replaces Python 2 .iteritems()
    if topn:
        words = list(zip(*word_freq[:topn]))[0]  # zip() returns an iterator in Python 3, so materialize it
        vocab_dict = dict(zip(words, range(start_idx, len(words) + start_idx)))
    else:
        idx = start_idx
        vocab_dict = {}
        for word, freq in word_freq:
            if freq < threshold:
                return vocab_dict
            vocab_dict[word] = idx
            idx += 1
    return vocab_dict
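# Usage sketch: index words by frequency, skipping rare ones.
freqs = {"the": 10, "cat": 4, "sat": 2, "mat": 1}
print(build_vocab(freqs, threshold=2, start_idx=2))   # -> {'the': 2, 'cat': 3, 'sat': 4}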
# The following n-gram helpers are static methods of an NgramUtil class
# (note the NgramUtil.* calls in their fallback branches).
def bigrams(words, join_string, skip=0):
    """
    Input: a list of words, e.g., ["I", "am", "Denny"]
    Output: a list of bigrams, e.g., ["I_am", "am_Denny"]
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L - 1):
            for k in range(1, skip + 2):
                if i + k < L:
                    lst.append(join_string.join([words[i], words[i + k]]))
    else:
        # set it as unigram
        lst = NgramUtil.unigrams(words)
    return lst
def trigrams(words, join_string, skip=0):
    """
    Input: a list of words, e.g., ["I", "am", "Denny"]
    Output: a list of trigrams, e.g., ["I_am_Denny"]
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L - 2):
            for k1 in range(1, skip + 2):
                for k2 in range(1, skip + 2):
                    if i + k1 < L and i + k1 + k2 < L:
                        lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]]))
    else:
        # set it as bigram
        lst = NgramUtil.bigrams(words, join_string, skip)
    return lst
def biterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
    Output: a list of biterms, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"]
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L - 1):
            for j in range(i + 1, L):
                lst.append(join_string.join([words[i], words[j]]))
    else:
        # set it as uniterm
        lst = NgramUtil.uniterms(words)
    return lst
def triterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
    Output: a list of triterms, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L - 2):        # range() replaces the Python 2 xrange()
            for j in range(i + 1, L - 1):
                for k in range(j + 1, L):
                    lst.append(join_string.join([words[i], words[j], words[k]]))
    else:
        # set it as biterm
        lst = NgramUtil.biterms(words, join_string)
    return lst
def fourterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
    Output: a list of fourterms, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
    """
    assert type(words) == list
    L = len(words)
    if L > 3:
        lst = []
        for i in range(L - 3):        # range() replaces the Python 2 xrange()
            for j in range(i + 1, L - 2):
                for k in range(j + 1, L - 1):
                    for l in range(k + 1, L):
                        lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
    else:
        # set it as triterm
        lst = NgramUtil.triterms(words, join_string)
    return lst
def ngrams(words, ngram, join_string=" "):
    """
    wrapper for ngram
    """
    if ngram == 1:
        return NgramUtil.unigrams(words)
    elif ngram == 2:
        return NgramUtil.bigrams(words, join_string)
    elif ngram == 3:
        return NgramUtil.trigrams(words, join_string)
    elif ngram == 4:
        return NgramUtil.fourgrams(words, join_string)
    elif ngram == 12:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram
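# Usage sketch (assumption: the functions above are @staticmethods of an NgramUtil class
# that also defines unigrams()/uniterms()/fourgrams()):
print(NgramUtil.bigrams(["I", "am", "Denny"], "_"))          # -> ['I_am', 'am_Denny']
print(NgramUtil.biterms(["I", "am", "Denny", "boy"], "_"))   # -> ['I_am', 'I_Denny', 'I_boy', 'am_Denny', 'am_boy', 'Denny_boy']
print(NgramUtil.ngrams(["I", "am", "Denny"], 12))            # unigrams plus space-joined bigrams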
from collections import Counter, OrderedDict

def build_vocabulary(words, max_size):
    # cfg and Progress_bar are project-specific helpers (as in the original source);
    # note that cfg.vocabulary_size, not max_size, caps the vocabulary here.
    vocab_instances = 0
    unique_counts = Counter(words)
    d = dict(unique_counts.most_common(cfg.vocabulary_size - 2))
    vocabulary = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=True))
    # start at 2 to leave room for padding & unknown
    pb = Progress_bar(len(d) - 1)
    for i, (key, value) in enumerate(vocabulary.items(), start=2):
        vocab_instances += value
        vocabulary[key] = i
        pb.tick()
    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1
    # reverse the vocabulary (for reverse lookup)
    rev_vocabulary = {v: k for k, v in vocabulary.items()}
    vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
    return vocab
def tokenize_text(sample_text):
    # cfg, tknzr, t_table, stopwords and sequence_lengths are module-level objects
    # defined elsewhere in the original project.
    global sequence_lengths
    processed_text = []
    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text
    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')
    if cfg.remove_stopwords:
        tokens = [w for w in tokens if w not in stopwords.words('english')]
    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)
    return processed_text
# Source: NewsAutosummarize.py, from the project Python-Scripts-Repo-on-Data-Science (author: qalhata)
def __init__(self, min_cut=0.1, max_cut=0.9):
    # Indentation changes: we are inside the constructor of the frequency-summarizer class.
    # This method runs each time an object of the class is created (instantiated).
    self._min_cut = min_cut
    self._max_cut = max_cut
    # The two parameters are saved as member variables; the 'self.' prefix marks them as
    # belonging to the instance, and the leading underscore marks them as internal.
    self._stopwords = set(stopwords.words('english') + list(punctuation))
    # This is a list of all common words and punctuation symbols.
# Indentation changes again: we are back in the class body, outside the constructor.
# A variable defined here (outside a member function) but inside the class becomes STATIC:
# it belongs to the class itself rather than to any individual instance (object).
# Source: NewsArticleClass.py, from the project Python-Scripts-Repo-on-Data-Science (author: qalhata)
def extractFeatures(self, article, n, customStopWords=None):
    # The article is passed in as a tuple (text, title).
    text = article[0]    # extract the text
    title = article[1]   # extract the title
    sentences = sent_tokenize(text)   # split the text into sentences
    # split each sentence into words (the original iterated over the wrong variable)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    # calculate word frequencies using the member function defined above
    self._freq = self._compute_frequencies(word_sent, customStopWords)
    if n < 0:
        # a negative n means no feature (word) selection: return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # otherwise return only the n largest features, i.e. the most important words
        # (important == frequent, once stopwords are removed)
        return nlargest(n, self._freq, key=self._freq.get)
from math import log
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

def similarity(c1, c2):
    '''Stop words are words like "it" and "the" that carry little meaning on their own,
    so they are removed before comparing the two texts.'''
    stop_words = list(stopwords.words("english"))
    # remove stop words from both texts (lowercase first so capitalized stopwords match)
    c1_cleaned = [x for x in word_tokenize(c1.lower()) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2.lower()) if x not in stop_words]
    # dedupe() is a project-specific helper that removes duplicate tokens
    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words
    similarity_between_words = 0
    for key, val in total_words.items():
        # a combined count greater than 1 means the two articles share this word
        if total_words[key] > 1:
            similarity_between_words += 1
    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
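# Usage sketch (assumptions: NLTK 'punkt' and 'stopwords' are downloaded, and dedupe() is
# available from the surrounding project):
a = "The cat sat on the mat in the sunny garden"
b = "A dog sat on the mat near the garden gate"
print(similarity(a, b))   # larger values mean more shared non-stopword vocabulary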
# Source: feature_construction.py, from the project Automatic-Question-Generation (author: bwanglzu)
def _answer_stop_word_density(self, row):
    """Percentage of tokens in the answer that are stopwords.
    - Args:
        row(pandas.dataframe): input row vector
    - Returns:
        row(pandas.dataframe): output row vector with the new feature
    """
    stop = stopwords.words('english')
    answer = row.Answer
    if answer:
        tokens = answer.split()
        num_tokens = len(tokens)
        stop_word_in_answer = [i for i in tokens if i in stop]
        num_stop_word_in_answer = len(stop_word_in_answer)
        row['ANSWER_STOPWORD_DENSITY'] = float(num_stop_word_in_answer) / num_tokens
        return row
    else:
        row['ANSWER_STOPWORD_DENSITY'] = 0
        return row
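# Usage sketch (assumptions: pandas is available and `fc` is an instance of the feature
# construction class these methods belong to; the column names follow the snippet above):
import pandas as pd
df = pd.DataFrame({'Answer': ['the quick brown fox', '']})
df = df.apply(fc._answer_stop_word_density, axis=1)
print(df['ANSWER_STOPWORD_DENSITY'])   # -> 0.25 and 0.0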
def _answer_quantifier_density(self, row):
    """Percentage of tokens in the answer that are quantifier words.
    - Args:
        row(pandas.dataframe): input row vector
    - Returns:
        row(pandas.dataframe): output row vector with the new feature
    """
    # ling.QUANTIFIER_WORDS is a word list defined elsewhere in the project
    answer = row.Answer
    if answer:
        tokens = answer.split()
        answer_len = len(tokens)
        quantifier_tokens = [i for i in tokens if i in ling.QUANTIFIER_WORDS]
        quantifier_tokens_len = len(quantifier_tokens)
        row['ANSWER_QUANTIFIER_DENSITY'] = float(quantifier_tokens_len) / answer_len
        return row
    else:
        row['ANSWER_QUANTIFIER_DENSITY'] = 0
        return row
def _percentage_capitalized_word_in_answer(self, row):
    """Percentage of capitalized words in the sentence that are in the answer.
    - Args:
        row(pandas.dataframe): input row vector
    - Returns:
        row(pandas.dataframe): output row vector with the new feature
    """
    answer = row.Answer
    sentence = row.Sentence
    if answer is not None and sentence is not None:
        tokens = sentence.split()
        num_tokens = len(tokens)
        # isupper() matches fully upper-cased tokens, as in the original implementation
        cap_tokens = [i for i in tokens if i.isupper()]
        cap_tokens_in_answer = [i for i in cap_tokens if i in answer]
        row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = float(len(cap_tokens_in_answer)) / num_tokens
        return row
    else:
        row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = 0
        return row
import math
from nltk.corpus import stopwords

def get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt):
    """
    Get overlap, idf-weighted overlap, overlap excluding stopwords, and idf-weighted
    overlap excluding stopwords for each sentence pair.
    """
    stoplist = set(stopwords.words('english'))
    num_docs = len(sent_list_1)
    overlap_feats = []
    for s1, s2 in zip(sent_list_1, sent_list_2):
        tokens_a_set, tokens_b_set = set(s1), set(s2)
        intersect = tokens_a_set & tokens_b_set
        overlap = len(intersect) / (len(tokens_a_set) + len(tokens_b_set))
        # math.log replaces the original np.math.log, which newer NumPy versions no longer expose
        idf_intersect = sum(math.log(num_docs / word_to_doc_cnt[w]) for w in intersect)
        idf_weighted_overlap = idf_intersect / (len(tokens_a_set) + len(tokens_b_set))
        tokens_a_set_no_stop = set(w for w in s1 if w not in stoplist)
        tokens_b_set_no_stop = set(w for w in s2 if w not in stoplist)
        intersect_no_stop = tokens_a_set_no_stop & tokens_b_set_no_stop
        overlap_no_stop = len(intersect_no_stop) / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
        idf_intersect_no_stop = sum(math.log(num_docs / word_to_doc_cnt[w]) for w in intersect_no_stop)
        idf_weighted_overlap_no_stop = idf_intersect_no_stop / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
        overlap_feats.append([overlap, idf_weighted_overlap, overlap_no_stop, idf_weighted_overlap_no_stop])
    return overlap_feats
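# Usage sketch: sentences are given as token lists; word_to_doc_cnt maps each word to the
# number of sentences containing it (assumption: this is how the counts are built upstream).
from collections import Counter
s1 = [["the", "cat", "sat"], ["dogs", "bark"]]
s2 = [["a", "cat", "slept"], ["dogs", "sleep"]]
word_to_doc_cnt = Counter(w for pair in zip(s1, s2) for sent in pair for w in set(sent))
print(get_pairwise_overlap_features(s1, s2, word_to_doc_cnt))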
def get_similar_documents_for_query(model_id, text):
    """
    Return documents similar to the query, or an empty list if the query has no words
    left after preprocessing.
    :param model_id:
    :param text:
    :return:
    """
    # db_utils, assign_topics_for_query, transform_topics_assignment_from_lda_to_vector and
    # get_similar_documents_by_vector are defined elsewhere in the original project.
    model = db_utils.get_model(model_id)
    topics_assignment = assign_topics_for_query(model_id, text)
    if len(topics_assignment) != 0:
        topics_vector = transform_topics_assignment_from_lda_to_vector(model['number_of_topics'],
                                                                       topics_assignment[0])
        # print(topics_vector)
        return get_similar_documents_by_vector(model_id, topics_vector)
    else:
        return []