Example source code for the Python class TfidfModel()
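All of the examples collected below follow the same basic gensim workflow: build a corpora.Dictionary from tokenized documents, turn each document into a bag-of-words vector with doc2bow(), fit a models.TfidfModel on that bag-of-words corpus, and apply the fitted model to re-weight vectors. A minimal sketch of that pattern (the toy documents are made up purely for illustration):

from gensim import corpora, models

# toy tokenized documents (hypothetical, for illustration only)
documents = [["human", "machine", "interface"],
             ["graph", "minors", "survey", "graph"]]

dictionary = corpora.Dictionary(documents)                   # word <-> id mapping
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]  # sparse (word_id, count) vectors
tfidf = models.TfidfModel(bow_corpus)                        # collect IDF statistics from the corpus
vec_tfidf = tfidf[bow_corpus[0]]                             # transform a single document
corpus_tfidf = tfidf[bow_corpus]                             # lazily transform the whole corpus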

kmeans_cluster.py (project: ParseLawDocuments, author: FanhuaandLuomu)
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents=[[word for word in document.text.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc:
            d[index] = value
        ds.append(d)
    return ds
TF_IDF.py (project: OpinionSpam, author: Coder-Yu)
def fitAndPredict(self):
        corpus = self.trainingSet+self.testSet
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        model = models.TfidfModel(corpus)
        corpus = [text for text in model[corpus]]
        text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        classifier = LogisticRegression()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'Logistic:'
        print classification_report(self.testLabel, pred_labels)

        classifier = SVC()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'SVM:'
        print classification_report(self.testLabel, pred_labels)
topic_modeling.py (project: text-analytics-with-python, author: dipanjanS)
def train_lda_model_gensim(corpus, total_topics=2):

    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
tf-idf.py (project: Answer_Selection, author: xjtushilei)
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build a dictionary from the answer corpus
    dictionary = corpora.Dictionary(Corp)
    # convert each document to its bag-of-words representation
    corpus = [dictionary.doc2bow(text) for text in Corp]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]

    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
test_miislita.py (project: topical_word_embeddings, author: thunlp)
def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2)
nlp.py (project: weibo_scrawler_app, author: coolspiderghy)
def train_by_lsi(lib_texts):
    """
        Train an LSI model on the given texts and build a similarity index.
    """
    from gensim import corpora, models, similarities

    # enable logging below to see progress output
    #import logging
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]     # doc2bow(): convert a collection of words into a sparse list of (word_id, word_frequency) tuples
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # train an LSI model, keeping 10 latent topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    index = similarities.MatrixSimilarity(lsi[corpus])     # index is a gensim.similarities.docsim.MatrixSimilarity instance

    return (index, dictionary, lsi)


#????? -- ??????????????????????
tfidf.py (project: DeepNews, author: kabrapratik28)
def load_model_and_dictionary(self):
        self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
        self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
        print ("Dictionary & Model Loaded Successfully")
test_tfidf.py (project: ParseLawDocuments, author: FanhuaandLuomu)
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents=[[word for word in document.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf
reader.py (project: YelpDataChallenge, author: fujunswufe)
def load_tfidf(corpus, dictionary):
    if not os.path.isfile(TFIDF_MODEL_PATH):
        print('Creating TF-IDF')
        tfidf = models.TfidfModel(corpus)
        print('TF-IDF created')
        tfidf.save(TFIDF_MODEL_PATH)

    print('Loading TF-IDF model')
    tfidf = models.TfidfModel.load(TFIDF_MODEL_PATH)
    return tfidf
# doc_list = get_data()
# print(len(doc_list))
RecSvd.py (project: readmeinfo, author: taozhijiang)
def do_calc_svd(self):

        print("?????%d" %(nlp_master.get_dict_len()))
        self.k_value = int(0.1*(nlp_master.get_dict_len()))
        if self.k_value < 300:
            self.k_value = 300
        if self.k_value > 1000:
            self.k_value = 1000
        print("k??%d" %(self.k_value))            

        tfidf = models.TfidfModel(list(nlp_master._id_docs.values()))
        tfidf_corpus = tfidf[list(nlp_master._id_docs.values())]

        # num_topics: recommended range is roughly 200–500 latent topics
        # LSI transformation
        self.lsi = models.LsiModel(tfidf_corpus, id2word=nlp_master.dictionary, num_topics=self.k_value, chunksize=2000)

        # persist the trained model to disk
        today = datetime.date.today()
        self.dumpfile = "dumpdir/recsvd_dump.%d_%d" %(today.month, today.day)        

        with open(self.dumpfile,'wb', -1) as fp:
            dump_data = []
            dump_data.append(self._user_classifier)
            dump_data.append(self.k_value)
            dump_data.append(self.lsi)
            pickle.dump(dump_data, fp, -1)

        return


    # ???????NULL???
    # ???????site_news?????????????
iqss_interface.py (project: tRECS, author: TeeOhh)
def build_tfidf_base(self, corpus, bow_matrix):
        ## Description: Build and save objects common to TFIDF and LSA
        ## Params: Corpus, BOW matrix
        ## Returns: TF-IDF corpus and matrix

        tfidf_model = models.TfidfModel(corpus)
        tfidf_corpus= tfidf_model[corpus]
        tfidf_matrix = bow_matrix.apply(lambda x: tfidf_model[x[0]], 1)
        return tfidf_corpus, tfidf_matrix


    #MODEL OBJECTS
    #A model object consists of gensim similarity index and matrix containing transformed data
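The snippet above ends before the model objects it describes are actually built. As a rough sketch only (not this project's code), such an object could pair a gensim similarity index with the dense transformed matrix; the dictionary and tfidf_corpus names below are assumed from the build_tfidf_base step:

    # hypothetical sketch: a 'model object' = similarity index + transformed data matrix
    tfidf_index = similarities.MatrixSimilarity(tfidf_corpus, num_features=len(dictionary))
    tfidf_dense = gensim.matutils.corpus2dense(tfidf_corpus, num_terms=len(dictionary)).T
    tfidf_model_object = (tfidf_index, tfidf_dense)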
keyphrase_extraction.py (project: text-analytics-with-python, author: dipanjanS)
def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):

    valid_chunks = get_chunks(sentences, grammar=grammar)

    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    weighted_phrases = {dictionary.get(id): round(value,3) 
                        for doc in corpus_tfidf 
                        for id, value in doc}

    weighted_phrases = sorted(weighted_phrases.items(), 
                              key=itemgetter(1), reverse=True)

    return weighted_phrases[:top_n]
topic_modeling.py (project: text-analytics-with-python, author: dipanjanS)
def train_lsi_model_gensim(corpus, total_topics=2):

    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, 
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi
svm_tfidf.py (project: SinaWeiboSpider, author: SuperSaiyanSSS)
def reduce_tfidf(dictionary, weibo_test):
    corpus_tfidf = None
    # # # # stage 2: convert the documents into tf-idf vectors
    if not os.path.exists(path_tmp_tfidf):
        print('=== no tf-idf folder detected; generating tf-idf vectors ===')
        # if the previous stage was skipped, the dictionary must be loaded from disk
        if not dictionary:  # load the dictionary from disk if it was not passed in
            dictionary = corpora.Dictionary.load(path_dictionary)
        os.makedirs(path_tmp_tfidf)
        files = os_path.LoadFiles(path_doc_root)
        tfidf_model = models.TfidfModel(dictionary=dictionary)
        corpus_tfidf = {}
        for i, msg in enumerate(files):
            catg = msg[0]
            file = msg[1]
            word_list = convert_doc_to_wordlist(file, cut_all=False)
            file_bow = dictionary.doc2bow(word_list)
            file_tfidf = tfidf_model[file_bow]
            tmp = corpus_tfidf.get(catg, [])
            tmp.append(file_tfidf)
            if tmp.__len__() == 1:
                corpus_tfidf[catg] = tmp
        # persist the tf-idf vectors, one corpus file per category
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
                                       corpus_tfidf.get(catg),
                                       id2word=dictionary
                                       )
            print('catg {c} has been transformed into tfidf vector'.format(c=catg))
        print('=== tf-idf vectors generated ===')
    else:
        print('=== tf-idf folder already exists; skipping tf-idf generation ===')

    svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
svm_result.py (project: SinaWeiboSpider, author: SuperSaiyanSSS)
def reduce_result(dictionary, lsi_model, predictor, weibo_test):
    # # # # final stage: classify the new (test) document
    if not dictionary:
        dictionary = corpora.Dictionary.load(path_dictionary)
    if not lsi_model:
        lsi_file = open(path_tmp_lsimodel,'rb')
        lsi_model = pkl.load(lsi_file)
        lsi_file.close()
    if not predictor:
        x = open(path_tmp_predictor,'rb')
        predictor = pkl.load(x)
        x.close()
    files = os.listdir(path_tmp_lsi)
    catg_list = []
    for file in files:
        t = file.split('.')[0]
        if t not in catg_list:
            catg_list.append(t)

    demo_doc = weibo_test
    print(demo_doc)
    demo_doc = list(jieba.cut(demo_doc,cut_all=False))
    demo_bow = dictionary.doc2bow(demo_doc)
    tfidf_model = models.TfidfModel(dictionary=dictionary)
    demo_tfidf = tfidf_model[demo_bow]
    demo_lsi = lsi_model[demo_tfidf]
    data = []
    cols = []
    rows = []
    for item in demo_lsi:
        data.append(item[1])
        cols.append(item[0])
        rows.append(0)
    demo_matrix = csr_matrix((data,(rows,cols))).toarray()
    x = predictor.predict(demo_matrix)
    print('predicted category: {x}'.format(x=catg_list[x[0]]))
topic_models.py (project: OpinionMining728, author: stasi009)
def save_tfidf():
    corpus_bow = corpora.MmCorpus(BowFile)
    tfidf_model = models.TfidfModel(corpus_bow)

    corpus_tfidf = tfidf_model[corpus_bow]
    corpora.MmCorpus.serialize(TfidfFile,corpus_tfidf)

    print "==================== TF-IDF data Generated and Saved ===================="
test_web.py (project: memex-dossier-open, author: dossier)
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    corpus = map(sip.noun_phrases, [doc1, doc2, doc3])
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
make_wikicorpus.py (project: lsi-document-similarity, author: dvictor)
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
corpusbuilder.py (project: simsearch, author: chrisjmccormick)
def buildCorpus(self):
        """
        Build the corpus from the documents:
            1. Remove words that only appeared once.
            2. Create the Dictionary object.
            3. Convert the documents to simple bag-of-words representation.
            4. Convert the bag-of-words vectors to tf-idf.
        """
        # Remove words that only appear once.
        self.documents = [[token for token in doc if self.frequency[token] > 1]
                          for doc in self.documents]

        # Build a dictionary from the text.
        self.dictionary = corpora.Dictionary(self.documents)

        # Map the documents to vectors.
        corpus = [self.dictionary.doc2bow(text) for text in self.documents]

        # Delete the tokenized representation of the documents--no need to
        # carry this around!
        del self.documents[:]

        # Convert the simple bag-of-words vectors to a tf-idf representation.        
        self.tfidf_model = TfidfModel(corpus)
        self.corpus_tfidf = self.tfidf_model[corpus]
similarity.py (project: narrative-prediction, author: roemmele)
def load_tfidf_model(self):
        print "loading tfidf from", self.tfidf_filepath
        self.tfidf_model = models.TfidfModel.load(self.tfidf_filepath, mmap='r')
similarity.py (project: narrative-prediction, author: roemmele)
def make_tfidf_model(self, seqs):
        self.tfidf_model = models.TfidfModel((self.lexicon.doc2bow(tokenize(seq)) for seq in seqs))
        self.tfidf_model.save(self.tfidf_filepath)
        print "saved tfidf to", self.tfidf_filepath
lsi_stream_train.py (project: recommended_system, author: wac81)
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print 'dictionary loaded'
    # load the corpus
    corpus = corpora.MmCorpus(lsipath +'viva.mm')
    print ('mm load')

    t31 = time.time()

    # tfidf
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    print "tfidf_corpus time = ", t32 - t31

    # baobao change 3 lines
    # corpus = MyCorpus()
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC,power_iters=2,chunksize=50000,onepass=True,distributed=False)
    # lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,chunksize=20000)
    lsi = None
    try:
         lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=60000, power_iters=2, onepass=True)  # train the LSI model
         lsi.save(lsipath  + 'viva.lsi')
         print('lsi model saved')
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception, e:
        logging.error('Failed to lsi train', exc_info=True)

    return lsi
clusters.py (project: idealoom, author: conversence)
def tfidf_model(self):
        if self._tfidf_model is None:
            doc_count = self.post_ids_query.count()
            if doc_count < 10:
                return None
            dictionary = self.dictionary
            tfidf_model = gmodels.TfidfModel(id2word=dictionary)
            tfidf_fname = join(self.dirname, "tfidf_%d.model" % (
                self.discussion.id,))
            subcorpus = self.subcorpus
            if exists(tfidf_fname):
                tfidf_model = tfidf_model.load(tfidf_fname)
                # assumption: count implies identity.
                # Wrong in corner cases: hidden, etc.
                if tfidf_model.num_docs != doc_count:
                    unlink(tfidf_fname)
                    tfidf_model = gmodels.TfidfModel(id2word=dictionary)
            if tfidf_model.num_docs != doc_count:
                tfidf_model.initialize(subcorpus)
                tfidf_model.save(tfidf_fname)
            self._tfidf_model = tfidf_model
        return self._tfidf_model
gensimLDA.py (project: quoll, author: LanguageMachines)
def tfidf_weight(self):
        self.corpus = models.TfidfModel(self.corpus, normalize=True)
LDAModel_English.py (project: LDA_RecEngine, author: easonchan1213)
def trainModel(self):
        '''
        Train a LDA model, inclusive of 4 steps:
        1. Parse the whole corpora into unigram token collections and document mapping (for later use)
        2. Filter tokens which are not common (no_below_this_number), and too common (no_above_fraction_of_doc)
        3. Indexing the token collections and do TF-IDF transformation
        4. Call gensim.models.LdaModel and generate topic distributions of the corpora
        '''
        print 'Start preparing unigram tokens....'      
        ## Start of preparing list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise Bag-Of-Words (BOW)
        # Get document_count, tokens, and document-index mapping from the corpora
        doc_count,train_set,doc_mapping,link_mapping = self.__tokenizeWholeCorpora(path_corpora) 
        # Put the training data into gensim.corpora for later use
        dic = corpora.Dictionary(train_set) 
        denominator = len(dic)
        # Filtering infrequent words & common stopwords, thus reducing the dimension of terms (which prevents curse of dimensionality)
        dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
        nominator = len(dic)
        corpus = [dic.doc2bow(text) for text in train_set]  # transform every token into BOW
        print 'There are %i documents in the pool' % (doc_count)
        print "In the corpus there are ", denominator, " raw tokens"
        print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1-(nominator/denominator)),"%"
        print 'Finished preparing unigram tokens....'   
        ##END 

        print 'Start training LDA model....'
        ## Implementing TF-IDF as a vector for each document, and train LDA model on top of that
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word = dic, num_topics = self.num_topics,iterations=self.num_of_iterations,passes = self.passes)
        corpus_lda = lda[corpus_tfidf]
        # Once done training, print all the topics and related words
        print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'    
        for i in range(self.num_topics):
            print 'Topic %s : ' % (str(i)) + lda.print_topic(i)
        # Exhibit perplexity of current model under specific topic hyperparameter : k. The lower the better
        print '==============================='
        print 'Model perplexity : ',lda.bound(corpus_lda),' when topic k =', str(self.num_topics)
        print '==============================='   

        return lda,doc_mapping,link_mapping,corpus
CCIR_lda_test_new_new.py (project: CCIR, author: xiaogang00)
def get_score_for_question(question_answer_word_dir,question_num,question_answer_score_label_file_dir):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1)%1000 == 1:
            print 'Now for line : ' + str(question_index+1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir,str(index))
        file_write_name = os.path.join(question_answer_score_label_file_dir,str(index))
        file_read = open(file_read_name,'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus=[dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf,id2word=dic,num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        list_simes = list(enumerate(sims))
        sort_sims = sorted(enumerate(sims),key=lambda item:-item[1])
        #answer_label_list = []
        for item in list_simes:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_score = str(item[1])
            file_write = open(file_write_name,'ab+')
            file_write.write(str(answer_label)+'\t'+str(answer_score)+'\n')
            file_write.close()
            #answer_label_list.append(answer_label)
        #DCG_score = calu_DCG(answer_label_list,k)
        #DCG_score_list.append(DCG_score)
    #DCG_avg = calu_avg_answer_length(DCG_score_list)
    #print 'DCG_avg : \t' + str(DCG_avg)
CCIR_lda_test.py (project: CCIR, author: xiaogang00)
def get_score_for_question(question_answer_word_dir,question_num,k):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1)%1000 == 1:
            print 'Now for line : ' + str(question_index+1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir,str(index))
        file_read = open(file_read_name,'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus=[dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf,id2word=dic,num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        sort_sims = sorted(enumerate(sims),key=lambda item:-item[1])
        answer_label_list = []
        for item in sort_sims:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_label_list.append(answer_label)
        DCG_score = calu_DCG(answer_label_list,k)
        DCG_score_list.append(DCG_score)
    DCG_avg = calu_avg_answer_length(DCG_score_list)
    print 'DCG_avg : \t' + str(DCG_avg)
tfidf.py (project: DeepNews, author: kabrapratik28)
def train_tfidf_model(self,file_path='../../temp_results/corpus.txt'):
        textfile = codecs.open(file_path, "r", "utf-8")   

        print("Reading and Processing Text File")
        first_lines=[]
        for line in textfile:
            first_lines.append(line.strip())

        print ("--------Building Corpora Dictionary---------------" )
        dictionary = corpora.Dictionary(line.split('#|#')[1].split() for line in first_lines)

        #remove words that appear less than 2 times
        #twoids = [tokenid for tokenid,docfreq in iteritems(dictionary.dfs) if docfreq < 2]
        #dictionary.filter_tokens(fiveids)

        #Remove Gaps
        dictionary.compactify()
        dictionary.save_as_text('../../temp_results/tfidf_dictionary.txt',sort_by_word=False)
        dictionary.save('../../temp_results/tfidf_dictionary')
        print("Dictionary Saved")

        print ("--Now Transforming to Bag of Words Vectors on the Fly--")
        class MyCorpus(object):
            def __iter__(self):
                for line in first_lines:
                    yield dictionary.doc2bow(line.split()) 

        news_corpus  = MyCorpus()
        print("Corpus Built...Now Starting Model Training")
        tfidf_model = models.TfidfModel(news_corpus)
        tfidf_model.save('../../temp_results/tfidf_model')
        print("Model Trained & Saved")

