Example source code for the Python class Dictionary()
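The snippets below were collected from open-source projects that call gensim's corpora.Dictionary (plus a few project-local classes of the same name). As a minimal, self-contained sketch of the pattern they all share (the texts here are invented for illustration):

from gensim import corpora

texts = [['human', 'computer', 'interaction'],
         ['survey', 'computer', 'systems']]
dictionary = corpora.Dictionary(texts)                 # token -> integer id mapping
print(dictionary.token2id)                             # e.g. {'computer': 0, 'human': 1, ...}
corpus = [dictionary.doc2bow(text) for text in texts]  # sparse (id, count) vectors per document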

intentparser.py (project: IntentParser, author: nonkung51)
def getTextConfidence(self, text):
        if self.typeOfSim == 'jaccard':
            intend_confidenceList = []
            for i in self.know_words:
                intend_confidenceList.append(jaccard_compare(text, i))
            if len(self.know_words) > 0:
                return max(intend_confidenceList)
            else:
                return 0
        elif self.typeOfSim == 'gensim':
            try:
                from gensim import corpora, models, similarities
            except ImportError as e:
                print(e)
                return 0
            dictionary = corpora.Dictionary(self.know_words_remove_stopwords)
            corpus = [dictionary.doc2bow(doc) for doc in self.know_words_remove_stopwords]
            lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
            new_doc = text
            vec_bow = dictionary.doc2bow(new_doc.lower().split())
            vec_lsi = lsi[vec_bow]
            index = similarities.MatrixSimilarity(lsi[corpus])
            sims = index[vec_lsi]
            sims = sorted(enumerate(sims), key=lambda item: -item[1])
            most_sim = sims[0]
            return most_sim[1]
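Note that Dictionary.doc2bow expects an already-tokenized list of strings, which is why the snippet above splits the query with new_doc.lower().split(). A minimal illustration (the toy documents are invented):

from gensim import corpora
dictionary = corpora.Dictionary([['how', 'are', 'you'], ['fine', 'thanks']])
vec = dictionary.doc2bow('How are you'.lower().split())  # e.g. [(0, 1), (1, 1), (2, 1)]
# passing a raw string instead would be iterated character by character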
lsi_model.py (project: Answer_Selection, author: xjtushilei)
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build a dictionary mapping each token to an integer id
    dictionary = corpora.Dictionary(Corp)
    # convert every document to its sparse bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]

    lsi = models.LsiModel(corpus)
    corpus_lsi = lsi[corpus]

    vec_bow = dictionary.doc2bow(query)
    vec_lsi = lsi[vec_bow]

    index = similarities.MatrixSimilarity(corpus_lsi)
    sims = index[vec_lsi]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug: similarity list length does not match answer list length')
    return similarity
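A hypothetical call (both arguments must be token lists, matching what doc2bow expects):

ans_list = [['paris', 'is', 'the', 'capital', 'of', 'france'],
            ['berlin', 'is', 'in', 'germany']]
scores = get_similarity(['capital', 'of', 'france'], ans_list)
# scores[i] is the LSI cosine similarity between the query and ans_list[i]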
W2V.py (project: USTC_AILab2, author: overflocat)
def getCorpus():
    documents = []
    txtNames = glob.glob("original/*.txt")
    for fileName in txtNames:
        with open(fileName) as fp:
            documents.append(fp.readline())

    stoplist = set('for a of the and to in at'.split())
    texts = [[word for word in document.translate(string.maketrans("", ""), string.punctuation).lower().split() if word not in stoplist]
             for document in documents]

    #Actually dictionary and corpus are of no use here
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=50000)
    dictionary.save('tmp/imdb.dict')

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/imdb.mm', corpus)

    return texts
corpus_processor.py (project: DataScience-And-MachineLearning-Handbook-For-Coders, author: wxyyxc1992)
def corpus2bow(self, tokenized_corpus=default_documents):
        """returns (vocab,corpus_in_bow)

        ??????? BOW ??

        Arguments:
        tokenized_corpus -- ?????????

        Return:
        vocab -- {'human': 0, ... 'minors': 11}
        corpus_in_bow -- [[(0, 1), (1, 1), (2, 1)]...]
        """
        dictionary = corpora.Dictionary(tokenized_corpus)

        # vocabulary: token -> integer id
        vocab = dictionary.token2id

        # corpus as bag-of-words vectors
        corpus_in_bow = [dictionary.doc2bow(text) for text in tokenized_corpus]

        return (vocab, corpus_in_bow)
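A hypothetical invocation (processor stands for an instance of this class; the documents are invented), showing the shape of the two return values:

vocab, bow = processor.corpus2bow([['human', 'computer'], ['computer', 'survey']])
# vocab: {'computer': 0, 'human': 1, 'survey': 2}
# bow:   [[(0, 1), (1, 1)], [(0, 1), (2, 1)]]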
lda.py (project: Sentences-analysis, author: sungminoh)
def build_id2word(self, fname=None, save_to=None):
        # read words.csv file
        if not fname:
            fname = self.words_fname or click.prompt('words file')
        fname = self.__dest(fname)
        assert os.path.isfile(fname), 'No such file: %s' % fname
        if save_to:
            self.id2word_fname = self.__dest(save_to)
        else:
            self.id2word_fname = LdaUtils.change_ext(fname, 'id2word')
        # if there is no id2word file or the user wants to rebuild, build .id2word
        if not os.path.isfile(self.id2word_fname) or click.confirm('There already is an id2word file. Do you want to rebuild?'):
            print 'start building id2word'
            start = time()
            id2word = corpora.Dictionary(LdaUtils.filter_words(LdaUtils.iter_csv(fname, -1).split()))
            id2word.save(self.id2word_fname)  # save
            print 'building id2word takes: %s' % LdaUtils.human_readable_time(time() - start)
        self.id2word = corpora.Dictionary.load(self.id2word_fname)
        return self.id2word
kmeans_cluster.py (project: ParseLawDocuments, author: FanhuaandLuomu)
def get_tfidf(documents):  # compute dense tf-idf vectors with gensim
    documents = [document.text.split() for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc:
            d[index] = value
        ds.append(d)
    return ds
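The densification loop above can equivalently use gensim's own helper, as the OpinionSpam snippets further down do; a sketch of the drop-in replacement (same corpus_tfidf and n_items as above):

from gensim import matutils
ds = matutils.corpus2dense(corpus_tfidf, num_terms=n_items).T.tolist()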
topicModel.py (project: TextSummarization, author: g-deoliveira)
def fit(self, documents):
        '''
        parameters:
          documents: list of strings, each represents a document
        '''

        # tokens, dictionary, corpus for LDA
        self.tokens = self.preProcessCorpus(documents)
        self.dictionary = corpora.Dictionary(self.tokens)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]

        self.lda = self.getLDA(dictionary=self.dictionary, 
                               corpus=self.corpus, 
                               num_topics=self.num_topics, 
                               random_state=self.random_state)

        self.num_dominant_topics=min(10, self.num_topics)
        self.dominant_topic_ids = self.getDominantTopics(self.corpus, 
                                                         self.lda, 
                                                         self.num_dominant_topics)
unigram.py (project: OpinionSpam, author: Coder-Yu)
def fitAndPredict(self):
        corpus = self.trainingSet+self.testSet
        dictionary = corpora.Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]
        text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        classifier = LogisticRegression()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'Logistic:'
        print classification_report(self.testLabel, pred_labels)

        classifier = SVC()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'SVM:'
        print classification_report(self.testLabel, pred_labels)
TF_IDF.py (project: OpinionSpam, author: Coder-Yu)
def fitAndPredict(self):
        corpus = self.trainingSet+self.testSet
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        model = models.TfidfModel(corpus)
        corpus = [text for text in model[corpus]]
        text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        classifier = LogisticRegression()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'Logistic:'
        print classification_report(self.testLabel, pred_labels)

        classifier = SVC()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'SVM:'
        print classification_report(self.testLabel, pred_labels)
load_data.py (project: sequence-labeling, author: BUAAQingYuan)
def transfer_corpus(sents):
    words_dict = invert_dict(corpora.Dictionary.load('words.dict'))
    max_length = 40
    sentence = numpy.zeros(shape=(len(sents), max_length),dtype=numpy.int32)
    label = numpy.zeros(shape=(len(sents), max_length), dtype=numpy.int32)
    lengths = []
    for i in range(len(sents)):
        current_sent = sents[i]
        words = []
        labels = []
        lengths.append(len(current_sent))
        for item in current_sent:
            words.append(words_dict[item[0]])
            labels.append(label_str[item[1]])
        sentence[i] = numpy.asarray(words + (max_length - len(current_sent)) * [28782], dtype=numpy.int32)  # pad word ids up to max_length
        label[i] = numpy.asarray(labels + (max_length - len(current_sent)) * [8], dtype=numpy.int32)  # pad label ids up to max_length

    return sentence,label,numpy.asarray(lengths,dtype=numpy.int32)


# train = train_ + valid_ = 16551
# test = test = 3327
preprocess.py (project: DeepBot, author: IgorWang)
def build_dictionary(generator, min_freq=5):
    dictionary_path = os.path.join(DATA_PATH, DICT_NAME)

    if os.path.exists(dictionary_path) and os.path.isfile(dictionary_path):
        print("Delete dictionary and rebuild")
        os.remove(dictionary_path)

    dictionary = corpora.Dictionary(c + u for c, u in generator)

    # collect ids of tokens whose document frequency is below min_freq
    filter_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if
                  docfreq < min_freq]

    dictionary.filter_tokens(filter_ids)
    dictionary.compactify()

    dictionary.add_documents([_START_VOCAB])

    pickle.dump(dictionary, open(dictionary_path, 'wb'))
    print("SVAE dictionary to %s" % (dictionary_path))

    return dictionary
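The manual dfs scan plus filter_tokens/compactify above can also be collapsed into gensim's built-in pruning call; a hedged equivalent with the same min_freq semantics:

# keep only tokens occurring in at least min_freq documents;
# filter_extremes recompacts the id mapping internally
dictionary.filter_extremes(no_below=min_freq, no_above=1.0, keep_n=None)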
ContextExtractor.py (project: quetch, author: juliakreutzer)
def corpus2dict15(corpusfiles, lowercase=True): 
    """ From a given corpus, create a gensim dictionary for mapping words to ints, important: WMT15 data is already tokenized! """
    corpus = list()
    corpus.append(["PADDING"]) #has word index 0
    corpus.append(["UNKNOWN"]) #has word index 1
    for cf in corpusfiles:
        if cf is not None: #source can be none

#just for huge lookuptable that contains all words from pretraining
#           if lowercase:
#               corpus.extend([l.lower().split() for l in codecs.open(cf,"r","utf8").readlines()])
#           else:
#               corpus.extend([l.split() for l in codecs.open(cf,"r","utf8").readlines()])

            corpus.extend([l.split() for l in codecs.open(cf,"r","utf8").readlines()])
    wordDictionary = corpora.Dictionary(corpus)
    #print "... build word dictionary with vocabulary size =", len(wordDictionary)
    return wordDictionary
topic_modeling.py (project: text-analytics-with-python, author: dipanjanS)
def train_lda_model_gensim(corpus, total_topics=2):

    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
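A hypothetical way to inspect the result (raw_docs is a stand-in for whatever documents normalize_corpus accepts):

lda = train_lda_model_gensim(raw_docs, total_topics=2)
for topic_id, topic_terms in lda.show_topics(num_topics=2):
    print(topic_id, topic_terms)  # each topic as a weighted-term string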
svm_dict.py (project: SinaWeiboSpider, author: SuperSaiyanSSS)
def reduce_dict(weibo_test):
    dictionary = None
    if not os.path.exists(path_tmp):
        os.makedirs(path_tmp)
    # if no dictionary exists yet, build one from the raw documents
    if not os.path.exists(path_dictionary):
        dictionary = corpora.Dictionary()
        files = os_path.LoadFiles(path_doc_root)
        for i, msg in enumerate(files):
            catg = msg[0]
            file = msg[1]
            file = convert_doc_to_wordlist(file, cut_all=False)
            dictionary.add_documents([file])
        # drop tokens that appear in fewer than 5 documents
        small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5]
        dictionary.filter_tokens(small_freq_ids)
        dictionary.compactify()
        dictionary.save(path_dictionary)
    svm_tfidf.reduce_tfidf(dictionary, weibo_test)
train.py (project: topic_modelling, author: jorotenev)
def createDictionary(extraLabel=""):
    # TODO in the report note the optimization done on the dict - it was ~700 000 words, now ~90 000
    dic = Dictionary()
    d = corpora.Dictionary(dic)

    d.filter_extremes(no_below=10, no_above=0.6, keep_n=None)
    d.compactify()
    # add the visual terms as words in the vocabulary too
    d.add_documents([get_visual_terms_labels(config)])
    extraLabel = extraLabel+"_"+config.dictionary_label
    fName = 'data/dics/%s_%s.dict' % (pretty_current_time(), extraLabel)
    d.save(fName+'.bin')

    d.save_as_text(fName+'.txt')
    setLastDictFileName(fName+'.bin')
    logger.info('Dict created and saved to %s. Size: %i' % (fName, len(d)))
    return d
one_to_one.py (project: semantic_selector, author: toshiya)
def generate_training_data(self, options):
        """
        set self.dictionary and self.label_types, and
        generate train_x(y) and test_x(y)
        """
        input_table = InputTable(options['threashold'])
        (training, test) = input_table.fetch_data(options['ratio_test'],
                                                  options['seed'])

        word_vecs_train = self.convert_to_word_vecs(training)
        topic_vecs_train = self.convert_to_topic_vecs(training)
        word_vecs_test = self.convert_to_word_vecs(test)
        topic_vecs_test = self.convert_to_topic_vecs(test)

        # use dictionary and topic_types of training set
        dictionary = corpora.Dictionary(word_vecs_train)
        all_topics = list(set(topic_vecs_train))

        x_train = self.adjust_x_format(dictionary, word_vecs_train)
        y_train = self.adjust_y_format(all_topics, topic_vecs_train)
        x_test = self.adjust_x_format(dictionary, word_vecs_test)
        y_test = self.adjust_y_format(all_topics, topic_vecs_test)
        return (x_train, y_train, x_test, y_test, dictionary, all_topics)
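Reusing the training-set dictionary for the test set (as above) is safe because doc2bow silently drops tokens it has never seen, so out-of-vocabulary test words simply disappear instead of raising errors; a small illustration with hypothetical tokens:

# only ids for tokens present in the training dictionary appear in the result;
# 'brand_new_token' is silently ignored
bow = dictionary.doc2bow(['known_token', 'brand_new_token'])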
utils.py (project: attention-over-attention, author: marshmelloX)
def create_vocabulary(input_stream, vocab_size, sentence_to_tokens_fn=None):
  t0 = time.time()
  print(" [*] Creating a new vocabulary...")

  if not sentence_to_tokens_fn:
    sentence_to_tokens_fn = default_sentence_to_tokens

  docs = []
  lines = []
  for line in input_stream:
    rline = line.strip()
    tokens = sentence_to_tokens_fn(rline)
    if '##########' not in tokens and len(rline) > 0:
      lines += [token.lower() for token in tokens if token.lower() not in cachedStopWords]
    elif '##########' in tokens:
      docs.append(lines)
      lines = []

  limit = np.abs(vocab_size - 4)
  vocab = corpora.Dictionary(docs)
  vocab.filter_extremes(no_below=1, no_above=0.7, keep_n=limit)
  print(" [*] Tokenize : %.4fs" % (time.time() - t0))

  return vocab
tf-idf.py (project: Answer_Selection, author: xjtushilei)
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build a dictionary mapping each token to an integer id
    dictionary = corpora.Dictionary(Corp)
    # convert every document to its sparse bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]

    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug: similarity list length does not match answer list length')
    return similarity
gensimLDA.py (project: quoll, author: LanguageMachines)
def load_dict(self, infile):
        self.dict = corpora.Dictionary.load(infile)
gensimLDA.py (project: quoll, author: LanguageMachines)
def save_corpus(self, corpusfile, dictfile):
        dictionary = corpora.Dictionary(self.lines)
        corpus = [dictionary.doc2bow(line) for line in self.lines]
        dictionary.save(dictfile)
        corpora.MmCorpus.serialize(corpusfile, corpus)
ucicorpus.py (project: paragraph2vec, author: thunlp; the identical file also appears in thunlp's topical_word_embeddings)
def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
responsesEvaluate.py (project: PTTChatBot_DL2017, author: thisray)
def buildTokenDictionary(self):
        """
        ? self.segResponses ??????? id
        """
        self.tokenDictionary = corpora.Dictionary(self.segResponses)
        logging.info("?????????%s" % str(self.tokenDictionary))
textprocessing.py (project: weibo_scrawler_app, author: coolspiderghy)
def getWordFreq(lib_texts):
    from gensim import corpora, models, similarities
    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]
    return corpus
nlp.py (project: weibo_scrawler_app, author: coolspiderghy)
def train_by_lsi(lib_texts):
    """
        ??LSI?????
    """
    from gensim import corpora, models, similarities

    # uncomment to enable gensim logging:
    #import logging
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]     # doc2bow(): convert a collection of words to a sparse list of (word_id, word_frequency) tuples
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # train an LSI model with num_topics=10 latent topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    index = similarities.MatrixSimilarity(lsi[corpus])     # index is a gensim.similarities.docsim.MatrixSimilarity instance

    return (index, dictionary, lsi)
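A hypothetical query against the returned index (the query tokens are invented):

index, dictionary, lsi = train_by_lsi(lib_texts)
vec_lsi = lsi[dictionary.doc2bow(['微博', '热点'])]
sims = index[vec_lsi]                 # cosine similarity to every library text
top5 = sims.argsort()[::-1][:5]       # indices of the five most similar texts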


lda_train.py (project: Sentences-analysis, author: sungminoh)
def query_tag(id2word, model, split_word):
  # id2word = corpora.Dictionary.load(path+'.id2word')
  # model = LdaMulticore.load(path+'.lda')
  bow = id2word.doc2bow(split_word)
  if len(bow) == 0:
    return None
  gamma, _ = model.inference([bow])
  topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution
  # [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)]
  return topic_dist
lda_train.py (project: Sentences-analysis, author: sungminoh)
def main(argv):
  cli_parser = make_cli_parser()
  opts, args = cli_parser.parse_args(argv)
  if len(args) != 2:
    cli_parser.error("Please provide an input/output file")

  if not os.path.isfile(args[1]+'.lda'):
    if os.path.isfile(args[1]+'.bow2mm') and os.path.isfile(args[1]+'.id2word'):
      id2word = corpora.Dictionary.load(args[1]+'.id2word')
    else:
      id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
      # ignore words that appear in fewer than 5 documents or in more than 20% of documents
      # (disabled: after filtering, some documents end up with empty vectors, which breaks later steps)
      # id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
      # save dictionary
      id2word.save(args[1]+'.id2word')
      # save doc2bow vector
      corpora.MmCorpus.serialize(args[1]+'.bow2mm', iter_doc2bow(args[0], opts.numlines, id2word))
    mm_corpus = corpora.MmCorpus(args[1]+'.bow2mm')
    model=LdaMulticore(mm_corpus, id2word=id2word, num_topics=opts.numtopics, workers=opts.numprocs, passes=opts.numepochs)
    model.save(args[1]+'.lda')

  infile = open(args[0])
  outfile = open(args[1]+'.csv', "w")
  out_csvfile = csv.writer(outfile, delimiter =',')
  in_csvfile = csv.reader(infile, delimiter=',')
  for row in in_csvfile:
    if row[0] == '0':  # csv fields are strings, so compare against '0'
      break
    processed_post = preprocess(row[3]).split()
    if len(processed_post) == 0: # skip empty documents (useless for tagging)
      continue
    result_list = row[1:3]
    result_list.extend(query_tag(id2word, model, processed_post))
    out_csvfile.writerow(result_list)
  infile.close()
  outfile.close()

  #print query_tag(id2word, model, "Hello über, world is awesome!")
tfidf.py (project: DeepNews, author: kabrapratik28)
def load_model_and_dictionary(self):
        self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
        self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
        print ("Dictionary & Model Loaded Successfully")

