Example source code for the Python class Dictionary()

test_tfidf.py (project: ParseLawDocuments, author: FanhuaandLuomu)
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents = [document.split() for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)  # vocabulary size
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf
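A minimal usage sketch, assuming the gensim imports the snippet's project already has in scope; the sample documents are invented:

from gensim import corpora, models  # imports assumed by get_tfidf above

docs = ["the court ruled today", "the court denied the appeal"]  # invented sample
corpus_tfidf = get_tfidf(docs)
for doc in corpus_tfidf:
    print(doc)  # each doc is a list of (token_id, tfidf_weight) pairs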
reader.py (project: YelpDataChallenge, author: fujunswufe)
def load_dict_corpus_all_review():
    '''
    Return the gensim dictionary & corpus built over the whole review corpus.
    :return: corpus, dictionary
    '''
    if not (os.path.isfile(DICT_PATH) and os.path.isfile(CORPUS_PATH)):
        generate_dict_corpus_all_review()
    print('Reading dict & corpus')
    dictionary = corpora.Dictionary.load(DICT_PATH)  # renamed from dict to avoid shadowing the builtin
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print('Reading completed')
    return corpus, dictionary
CorpusHandler.py (project: Rnews, author: suemi994)
def generateDictionary(self):
        dictionary = corpora.Dictionary(self.wordProvider)
        stop_ids = []
        # ids of tokens that occur in only one document
        once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
        dictionary.filter_tokens(stop_ids + once_ids)
        dictionary.compactify()  # reassign ids to close the gaps left by removal
        self.dictionary = dictionary
        return self.dictionary
stopwords.py (project: natural-language-preprocessings, author: Hironsan)
def create_dictionary(texts):
    dictionary = corpora.Dictionary(texts)
    return dictionary
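Since every snippet on this page revolves around corpora.Dictionary, a short self-contained sketch of its core API may help; the sample texts are invented:

from gensim import corpora

texts = [["human", "machine", "interface"], ["machine", "learning"]]  # invented sample
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)  # token -> integer id, e.g. {'human': 0, ...}
print(dictionary.doc2bow(["machine", "machine", "learning"]))  # [(id, count), ...]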
pdf_to_video.py (project: Opened, author: Veerendra-Gopi)
def get_topics_from_text(line):
    doc_complete = line.split('.')
    doc_clean = [clean_txt_to_clean_words(doc).split() for doc in doc_complete]
    # skip the topic analysis when every cleaned doc is empty
    doc_clean_empty = True
    all_topics = []
    for doc in doc_clean:
        if len(doc) > 0:
            doc_clean_empty = False
    if len(doc_clean) >= 1 and not doc_clean_empty:
        dictionary = corpora.Dictionary(doc_clean)
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
        Lda = gensim.models.ldamodel.LdaModel
        num_topics = 3
        ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=25)
        for i in range(num_topics):
            topic = ldamodel.get_topic_terms(i, topn=2)
            topic_list = []
            for word in topic:
                word_name = dictionary.get(word[0])
                if len(word_name) > 1:
                    topic_list.append(word_name)
            topic_list.sort()
            topic_name = " ".join(topic_list)
            add = False
            for ch in topic_name:  # ignore purely numerical topics
                if ch in "abcdefghijklmnopqrstuvwxyz":
                    add = True
            if add and topic_name not in all_topics:
                all_topics.append(str(topic_name))

    return all_topics
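A hypothetical call; clean_txt_to_clean_words is project-specific, so this only illustrates the expected input and output shape, and the exact topics vary between LDA runs:

topics = get_topics_from_text("The cat sat on the mat. The dog barked. Cats and dogs play.")
print(topics)  # e.g. ['cat mat', 'barked dog'], depending on the LDA run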
load_data.py (project: sequence-labeling, author: BUAAQingYuan)
def generate_dic():
    train_sents = load_corpus('CoNLL-2003/train.txt')
    valid_sents = load_corpus('CoNLL-2003/valid.txt')
    test_sents = load_corpus('CoNLL-2003/test.txt')
    train_ = [get_sent(sent) for sent in train_sents]
    print("train size: "+str(len(train_sents)))
    valid_ = [get_sent(sent) for sent in valid_sents]
    print("valid size: "+str(len(valid_sents)))
    test_ = [get_sent(sent) for sent in test_sents]
    print("test size: "+str(len(test_sents)))
    all_ = train_ + valid_ + test_
    lengths = [len(text) for text in all_]
    print("all data: "+str(len(lengths)))
    print_lengths(lengths)
    dic_words = corpora.Dictionary(all_)
    dic_words.save('words.dict')
    print(len(dic_words))
    # label
    train_.clear()
    valid_.clear()
    test_.clear()
    train_ = [get_label(sent) for sent in train_sents]
    valid_ = [get_label(sent) for sent in valid_sents]
    test_ = [get_label(sent) for sent in test_sents]
    all_ = train_ + valid_ + test_
    dic_labels = corpora.Dictionary(all_)
    for key, value in dic_labels.items():
        print(value)
    print(len(dic_labels))
ContextExtractor.py (project: quetch, author: juliakreutzer)
def corpus2dict(corpusfiles):
    """ From a given corpus, create a gensim dictionary for mapping words to ints """
    corpus = list()
    corpus.append(["PADDING"])  # gets word index 0
    corpus.append(["UNKNOWN"])  # gets word index 1
    for cf in corpusfiles:
        if cf is not None:  # source can be None
            corpus.extend(preprocess(codecs.open(cf, "r", "utf8").readlines()))
    wordDictionary = corpora.Dictionary(corpus)
    return wordDictionary
CPTCorpus.py (project: cptm, author: NLeSC)
def __init__(self, input=None, topicDict=None, opinionDict=None,
                 testSplit=None, file_dict=None, topicLines=[0],
                 opinionLines=[1]):
        if file_dict is not None:
            logger.info('initialize CPT Corpus with file_dict: {} perspectives'
                        .format(len(file_dict)))
            self.perspectives = [Perspective(file_dict=file_dict.get(str(p)),
                                             topicLines=topicLines,
                                             opinionLines=opinionLines)
                                 for p in range(len(file_dict))]
        else:
            logger.info('initialize CPT Corpus with {} perspectives'
                        .format(len(input)))
            input.sort()
            self.perspectives = [Perspective(input=glob.glob('{}/*.txt'.
                                             format(d)), testSplit=testSplit,
                                             topicLines=topicLines,
                                             opinionLines=opinionLines)
                                 for d in input]
            self.input = input

        if isinstance(topicDict, (str, unicode)):
            self.load_dictionaries(topicDict=topicDict)
        elif isinstance(topicDict, corpora.Dictionary):
            self.topicDictionary = topicDict

        if isinstance(opinionDict, (str, unicode)):
            self.load_dictionaries(opinionDict=opinionDict)
        elif isinstance(opinionDict, corpora.Dictionary):
            self.opinionDictionary = opinionDict

        if not topicDict or not opinionDict:
            self._create_corpus_wide_dictionaries()

        self.testSplit = testSplit
        self.nPerspectives = len(self.perspectives)
CPTCorpus.py (project: cptm, author: NLeSC)
def load_dictionaries(self, topicDict=None, opinionDict=None):
        if topicDict:
            self.topicDictionary = corpora.Dictionary.load(topicDict)
            logger.info('topic dictionary {}'.format(self.topicDictionary))
        if opinionDict:
            self.opinionDictionary = corpora.Dictionary.load(opinionDict)
            logger.info('opinion dictionary {}'.format(self.opinionDictionary))
keyphrase_extraction.py (project: text-analytics-with-python, author: dipanjanS)
def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):

    valid_chunks = get_chunks(sentences, grammar=grammar)

    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    weighted_phrases = {dictionary.get(term_id): round(value, 3)
                        for doc in corpus_tfidf
                        for term_id, value in doc}

    weighted_phrases = sorted(weighted_phrases.items(), 
                              key=itemgetter(1), reverse=True)

    return weighted_phrases[:top_n]
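One subtlety: weighted_phrases is built as a dict comprehension keyed on the phrase, so a chunk that occurs in several documents keeps only the weight from the last document containing it, as this toy reduction shows:

pairs = [("muffin man", 0.4), ("muffin man", 0.7)]
print({k: v for k, v in pairs})  # {'muffin man': 0.7}; the earlier weight is overwritten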
topic_modeling.py (project: text-analytics-with-python, author: dipanjanS)
def train_lsi_model_gensim(corpus, total_topics=2):

    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, 
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi
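A hypothetical call, assuming normalize_corpus comes from the book's companion code and returns tokenized documents when tokenize=True:

toy_corpus = ["the sky is blue", "the sun is bright"]  # invented sample
lsi = train_lsi_model_gensim(toy_corpus, total_topics=2)
print(lsi.print_topics(2))  # one weighted-term summary per topic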
topic_models.py (project: OpinionMining728, author: stasi009)
def build_dictionary(hotel_files, extra_stopwords=None):
    stream_of_words = words_stream(hotel_files, extra_stopwords)
    dictionary = corpora.Dictionary(stream_of_words)
    dictionary.save(DictionaryFile)  # store the dictionary for future reference
    print "==================== Dictionary Generated and Saved ===================="
topic_models.py (project: OpinionMining728, author: stasi009)
def __init__(self, hotel_files, extra_stopwords=None):
        self._dictionary = corpora.Dictionary.load(DictionaryFile)
        self._hotel_files = hotel_files
topic_models.py (project: OpinionMining728, author: stasi009)
def lsi_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_tfidf = corpora.MmCorpus(TfidfFile)

    N_TOPICS = 300
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LSI MODEL IS BUILT ================="

    lsi_model.save(LsiModelFile)
    save_topics(lsi_model, LsiTopicsFile)
topic_models.py (project: OpinionMining728, author: stasi009)
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)

    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LDA MODEL IS BUILT ================="

    model.save(LdaModelFile)
    save_topics(model, LdaTopicsFile)
load_data.py (project: TopicModel, author: BUAAQingYuan)
def load_corpus(data_file):
    texts = load_texts(data_file)
    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary
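The hand-rolled hapax filter above can also be approximated with Dictionary's built-in filtering. A variant sketch, not the project's code; note that filter_extremes counts document frequency rather than raw occurrences, so the two filters agree only when no token repeats within a document:

from gensim import corpora

def load_corpus_alt(texts):
    dictionary = corpora.Dictionary(texts)
    # drop tokens appearing in fewer than 2 documents; keep_n=None disables
    # the default cap on vocabulary size
    dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary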
load_data.py (project: TopicModel, author: BUAAQingYuan)
def load_corpus(data_file):
    texts = load_texts(data_file)
    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus = [[token[0] for token in text] for text in corpus]  # keep only the token ids, dropping the counts
    return corpus, dictionary
test_web.py (project: memex-dossier-open, author: dossier)
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    # list() so the corpus can be iterated twice below (map is lazy on Python 3)
    corpus = list(map(sip.noun_phrases, [doc1, doc2, doc3]))
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
segmentation.py (project: email-segmentation, author: gorgias)
def LSI_fit(self, data):
        '''
        Fits an LSI model and returns it with associated dictionary
        '''
        texts = [[tag for tag in sent] for sent in self.get_pos(data)]
        dictionary = corpora.Dictionary(texts)
        texts = map(dictionary.doc2bow, texts)
        lsi = models.LsiModel(texts, id2word=dictionary,
                              num_topics=self.num_topics)

        return dictionary, lsi
shallow_rank.py (project: liveqa2017, author: codekansas)
def train(self, corpus, passes=1):
        """Updates dictionary and model given a corpus.

        Args:
            corpus: list of str, the documents to tokenize.
        """

        if self.dictionary is not None or self.model is not None:
            # raw_input is Python 2; on Python 3 this would be input()
            x = raw_input('You are about to overwrite an existing '
                          'model file (%s). Are you sure? [y/N] '
                          % self.model_file)

            if not x.startswith('y'):  # also safe when the reply is empty
                raise RuntimeError('You chose not to overwrite the '
                                   'existing model and dictionary.')

        # Tokenizes the corpus.
        documents = [self.tokenize(document) for document in corpus]

        # Builds a dictionary from the existing documents.
        self.dictionary = corpora.Dictionary(documents)

        # Dumps the dictionary to a pickled file to use later.
        pickle.dump(self.dictionary, open(self.dictionary_file, 'wb'))

        # Converts each document to its bag-of-words representation.
        corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]

        # Trains the LDA model.
        self.model = models.LdaModel(corpus_bow,
                                     passes=passes,
                                     id2word=self.dictionary,
                                     num_topics=self.num_topics)

        # Saves the model to use later.
        self.model.save(self.model_file)

        # Flag to remember that training has taken place.
        self._trained = True
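For completeness, a sketch of reloading the artifacts train() persists; the two paths are hypothetical stand-ins for the instance attributes used above:

import pickle

from gensim import models

DICTIONARY_FILE = 'dictionary.pkl'  # stands in for self.dictionary_file
MODEL_FILE = 'lda.model'            # stands in for self.model_file

with open(DICTIONARY_FILE, 'rb') as f:
    dictionary = pickle.load(f)
model = models.LdaModel.load(MODEL_FILE)
print(model.print_topics(num_topics=3, num_words=5))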

