from gensim import corpora, models


def get_tfidf(documents):
    """Build a TF-IDF-weighted corpus with gensim."""
    documents = [document.split() for document in documents]
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf
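
# A minimal usage sketch; the toy documents below are illustrative only:
example_docs = ["the cat sat on the mat", "the dog sat on the log"]
for doc_weights in get_tfidf(example_docs):
    print(doc_weights)  # a list of (token_id, tfidf_weight) pairs per document
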
Example source code for Python's Dictionary() class
def load_dict_corpus_all_review():
    '''
    Return the gensim dictionary & corpus for the whole review corpus.
    :return: corpus, dictionary
    '''
    if not (os.path.isfile(DICT_PATH) and os.path.isfile(CORPUS_PATH)):
        generate_dict_corpus_all_review()
    print('Reading dict & corpus')
    dictionary = corpora.Dictionary.load(DICT_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print('Reading completed')
    return corpus, dictionary
def generateDictionary(self):
    dictionary = corpora.Dictionary(self.wordProvider)
    stop_ids = []
    # collect the ids of tokens that appear in only one document
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()  # reassign ids to close the gaps left by removal
    self.dictionary = dictionary
    return self.dictionary
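
# Standalone illustration of the hapax filtering above (toy data, not from
# the original project):
from gensim import corpora

demo_texts = [["cat", "sat"], ["cat", "ran"], ["dog"]]
demo_dict = corpora.Dictionary(demo_texts)
hapax_ids = [tokenid for tokenid, docfreq in demo_dict.dfs.items() if docfreq == 1]
demo_dict.filter_tokens(hapax_ids)
demo_dict.compactify()
print(demo_dict.token2id)  # only 'cat' survives; the other tokens occur once each
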
def create_dictionary(texts):
    dictionary = corpora.Dictionary(texts)
    return dictionary
def get_topics_from_text(line):
    doc_complete = line.split('.')
    doc_clean = [clean_txt_to_clean_words(doc).split() for doc in doc_complete]
    # skip topic analysis when every cleaned sentence is empty
    doc_clean_empty = True
    all_topics = []
    for doc in doc_clean:
        if len(doc) > 0:
            doc_clean_empty = False
    if len(doc_clean) >= 1 and not doc_clean_empty:
        dictionary = corpora.Dictionary(doc_clean)
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
        Lda = gensim.models.ldamodel.LdaModel
        num_topics = 3
        ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=25)
        for i in range(num_topics):
            topic = ldamodel.get_topic_terms(i, topn=2)
            topic_list = []
            for word_id, _prob in topic:
                word_name = dictionary.get(word_id)
                if len(word_name) > 1:
                    topic_list.append(word_name)
            topic_list.sort()
            topic_name = " ".join(topic_list)
            # ignore purely numerical topics: keep only names containing a letter
            add = False
            for ch in topic_name:
                if ch in "abcdefghijklmnopqrstuvwxyz":
                    add = True
            if add and topic_name not in all_topics:
                all_topics.append(topic_name)
    return all_topics
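
# Hypothetical driver: clean_txt_to_clean_words is not shown in this snippet,
# so a bare lowercasing stand-in is assumed here purely for illustration.
import gensim
from gensim import corpora

def clean_txt_to_clean_words(text):
    return text.lower()

print(get_topics_from_text("Cats chase mice around. Dogs chase cats around."))
# e.g. ['cats chase', 'around dogs', ...] -- topics vary from run to run
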
def generate_dic():
    train_sents = load_corpus('CoNLL-2003/train.txt')
    valid_sents = load_corpus('CoNLL-2003/valid.txt')
    test_sents = load_corpus('CoNLL-2003/test.txt')
    train_ = [get_sent(sent) for sent in train_sents]
    print("train size: " + str(len(train_sents)))
    valid_ = [get_sent(sent) for sent in valid_sents]
    print("valid size: " + str(len(valid_sents)))
    test_ = [get_sent(sent) for sent in test_sents]
    print("test size: " + str(len(test_sents)))
    all_ = train_ + valid_ + test_
    lengths = [len(text) for text in all_]
    print("all data: " + str(len(lengths)))
    print_lengths(lengths)
    dic_words = corpora.Dictionary(all_)
    dic_words.save('words.dict')
    print(len(dic_words))
    # build the label dictionary
    train_.clear()
    valid_.clear()
    test_.clear()
    train_ = [get_label(sent) for sent in train_sents]
    valid_ = [get_label(sent) for sent in valid_sents]
    test_ = [get_label(sent) for sent in test_sents]
    all_ = train_ + valid_ + test_
    dic_labels = corpora.Dictionary(all_)
    for key, value in dic_labels.items():
        print(value)
    print(len(dic_labels))
def corpus2dict(corpusfiles):
    """From a given corpus, create a gensim dictionary for mapping words to ints."""
    corpus = list()
    corpus.append(["PADDING"])   # gets word index 0
    corpus.append(["UNKNOWN"])   # gets word index 1
    for cf in corpusfiles:
        if cf is not None:  # source can be None
            with codecs.open(cf, "r", "utf8") as f:
                corpus.extend(preprocess(f.readlines()))
    wordDictionary = corpora.Dictionary(corpus)
    return wordDictionary
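
# Quick check of the index convention above, as a sketch over an in-memory
# corpus (the real function reads files through a project-specific preprocess):
from gensim import corpora

toy_corpus = [["PADDING"], ["UNKNOWN"], ["the", "cat"]]
toy_dict = corpora.Dictionary(toy_corpus)
print(toy_dict.token2id["PADDING"], toy_dict.token2id["UNKNOWN"])  # 0 1
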
def __init__(self, input=None, topicDict=None, opinionDict=None,
             testSplit=None, file_dict=None, topicLines=[0],
             opinionLines=[1]):
    if file_dict is not None:
        logger.info('initialize CPT Corpus with file_dict: {} perspectives'
                    .format(len(file_dict)))
        self.perspectives = [Perspective(file_dict=file_dict.get(str(p)),
                                         topicLines=topicLines,
                                         opinionLines=opinionLines)
                             for p in range(len(file_dict))]
    else:
        logger.info('initialize CPT Corpus with {} perspectives'
                    .format(len(input)))
        input.sort()
        self.perspectives = [Perspective(input=glob.glob('{}/*.txt'.format(d)),
                                         testSplit=testSplit,
                                         topicLines=topicLines,
                                         opinionLines=opinionLines)
                             for d in input]
    self.input = input
    if isinstance(topicDict, str):
        self.load_dictionaries(topicDict=topicDict)
    elif isinstance(topicDict, corpora.Dictionary):
        self.topicDictionary = topicDict
    if isinstance(opinionDict, str):
        self.load_dictionaries(opinionDict=opinionDict)
    elif isinstance(opinionDict, corpora.Dictionary):
        self.opinionDictionary = opinionDict
    if not topicDict or not opinionDict:
        self._create_corpus_wide_dictionaries()
    self.testSplit = testSplit
    self.nPerspectives = len(self.perspectives)
def load_dictionaries(self, topicDict=None, opinionDict=None):
    if topicDict:
        self.topicDictionary = corpora.Dictionary.load(topicDict)
        logger.info('topic dictionary {}'.format(self.topicDictionary))
    if opinionDict:
        self.opinionDictionary = corpora.Dictionary.load(opinionDict)
        logger.info('opinion dictionary {}'.format(self.opinionDictionary))
# keyphrase_extraction.py (project: text-analytics-with-python, author: dipanjanS)
def get_tfidf_weighted_keyphrases(sentences,
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    valid_chunks = get_chunks(sentences, grammar=grammar)
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    weighted_phrases = {dictionary.get(term_id): round(value, 3)
                        for doc in corpus_tfidf
                        for term_id, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(),
                              key=itemgetter(1), reverse=True)
    return weighted_phrases[:top_n]
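
# Hypothetical call, assuming get_chunks (defined earlier in
# keyphrase_extraction.py) yields one list of noun-phrase chunks per sentence:
#   keyphrases = get_tfidf_weighted_keyphrases(sentences, top_n=5)
#   # -> [(phrase, weight), ...] sorted by descending TF-IDF weight
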
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf,
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi
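
# Sketch with a stand-in normalize_corpus; the real one (from the same book)
# performs fuller cleanup, while this stub only lowercases and splits:
def normalize_corpus(corpus, tokenize=True):
    return [doc.lower().split() for doc in corpus]

lsi = train_lsi_model_gensim(["The fox jumps over the dog.",
                              "The cat naps on the mat."], total_topics=2)
print(lsi.print_topics(2))
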
def build_dictionary(hotel_files, extra_stopwords=None):
    stream_of_words = words_stream(hotel_files, extra_stopwords)
    dictionary = corpora.Dictionary(stream_of_words)
    dictionary.save(DictionaryFile)  # store the dictionary for future reference
    print("==================== Dictionary Generated and Saved ====================")
def __init__(self, hotel_files, extra_stopwords=None):
    self._dictionary = corpora.Dictionary.load(DictionaryFile)
    self._hotel_files = hotel_files
def lsi_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_tfidf = corpora.MmCorpus(TfidfFile)
    N_TOPICS = 300
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
    print("================= LSI MODEL IS BUILT =================")
    lsi_model.save(LsiModelFile)
    save_topics(lsi_model, LsiTopicsFile)
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)
    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print("================= LDA MODEL IS BUILT =================")
    model.save(LdaModelFile)
    save_topics(model, LdaTopicsFile)
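
# Note: both model builders above assume an earlier pipeline step has
# serialized the dictionary (DictionaryFile) and the matrix-market corpora
# (TfidfFile, BowFile); corpora.MmCorpus then streams documents from disk
# instead of holding the whole corpus in memory.
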
def load_corpus(data_file):
    texts = load_texts(data_file)
    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary
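
# The core of load_corpus on an in-memory toy corpus (load_texts is
# project-specific and not shown):
from collections import defaultdict
from gensim import corpora

toy_texts = [["human", "interface", "computer"],
             ["human", "computer", "survey"]]
freq = defaultdict(int)
for text in toy_texts:
    for token in text:
        freq[token] += 1
kept = [[token for token in text if freq[token] > 1] for text in toy_texts]
d2 = corpora.Dictionary(kept)
print([d2.doc2bow(text) for text in kept])
# both documents reduce to {computer, human}; the singletons are dropped
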
def load_corpus(data_file):
    texts = load_texts(data_file)
    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # keep only the token ids, dropping the per-document counts
    corpus = [[token[0] for token in text] for text in corpus]
    return corpus, dictionary
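
# Design note: unlike the first variant, this version discards the
# bag-of-words counts and keeps only the token ids per document, presumably
# for consumers that want id sequences rather than (id, count) pairs.
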
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    # materialize as a list: a lazy map would be exhausted by the Dictionary
    # pass, leaving nothing for the doc2bow loop below
    corpus = [sip.noun_phrases(doc) for doc in (doc1, doc2, doc3)]
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
def LSI_fit(self, data):
    '''
    Fits an LSI model and returns it with the associated dictionary.
    '''
    texts = [[tag for tag in sent] for sent in self.get_pos(data)]
    dictionary = corpora.Dictionary(texts)
    # use a list, not a lazy map, so the corpus can be iterated safely
    texts = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(texts, id2word=dictionary,
                          num_topics=self.num_topics)
    return dictionary, lsi
def train(self, corpus, passes=1):
    """Updates dictionary and model given a corpus.

    Args:
        corpus: list of str, the documents to tokenize.
    """
    if self.dictionary is not None or self.model is not None:
        x = input('You are about to overwrite an existing '
                  'model file (%s). Are you sure? [y/N] '
                  % self.model_file)
        if not x.lower().startswith('y'):
            raise RuntimeError('You chose not to overwrite the '
                               'existing model and dictionary.')
    # Tokenizes the corpus.
    documents = [self.tokenize(document) for document in corpus]
    # Builds a dictionary from the existing documents.
    self.dictionary = corpora.Dictionary(documents)
    # Dumps the dictionary to a pickled file to use later.
    with open(self.dictionary_file, 'wb') as f:
        pickle.dump(self.dictionary, f)
    # Converts the corpus to bag-of-words vectors.
    corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]
    # Trains the LDA model.
    self.model = models.LdaModel(corpus_bow,
                                 passes=passes,
                                 id2word=self.dictionary,
                                 num_topics=self.num_topics)
    # Saves the model to use later.
    self.model.save(self.model_file)
    # Flag to remember that training has taken place.
    self._trained = True
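
# Hypothetical usage, assuming this method belongs to a topic-model wrapper
# class (not shown) that defines tokenize(), model_file, dictionary_file,
# num_topics, and the dictionary/model attributes:
#   bot = TopicModelWrapper(...)
#   bot.train(["first document", "second document"], passes=2)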