def load(cls, save_dir='./'):
"""
Load the corpus from a save directory.
"""
tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
tagsToDocs = tables[0]
docsToTags = tables[1]
titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
ksearch = KeySearch(dictionary, tfidf_model,
corpus_tfidf, titles, tagsToDocs,
docsToTags, files, doc_line_nums)
return ksearch
def build_corpus(self, fname=None, save_to=None):
# read sentences file
if not fname:
fname = click.prompt('sentences file')
fname = self.__dest(fname)
assert os.path.isfile(fname), 'No such file: %s' % fname
if save_to:
self.corpus_fname = self.__dest(save_to)
else:
self.corpus_fname = LdaUtils.change_ext(fname, 'corpus')
# if there is no corpus file or the user wants to rebuild, build .corpus
if not os.path.isfile(self.corpus_fname) or click.confirm('A corpus file already exists. Do you want to rebuild it?'):
print 'start building corpus'
start = time()
corpora.MmCorpus.serialize(self.corpus_fname, self.__iter_doc2bow(LdaUtils.iter_csv(fname, -1).split())) # save
print 'building corpus took: %s' % LdaUtils.human_readable_time(time() - start)
self.corpus = corpora.MmCorpus(self.corpus_fname)
return self.corpus
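A minimal usage sketch of build_corpus (the class that owns it is not shown in this snippet, so lda_builder and the file name are hypothetical):
corpus = lda_builder.build_corpus(fname='sentences.csv', save_to='sentences.corpus')
print(len(corpus))  # MmCorpus reports the number of documents it holds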
def getIndex(lsipath='./lsi/', NUM_TOPIC=300):
# Load the serialized corpus
corpus = corpora.MmCorpus( lsipath + 'viva.mm')
print 'mm loaded'
# Load the LSI model
# lsi = models.LsiModel.load( lsipath + 'viva.lsi')
# baobao change 1 line
lsi = models.lsimodel.LsiModel.load( lsipath + 'viva.lsi')
print 'lsi model loaded'
# Build and save the similarity index
# index = similarities.MatrixSimilarity(lsi[corpus])
# index.save( lsipath + 'viva.index')
# baobao changed 1 line
index = similarities.docsim.Similarity(lsipath + 'viva.index', lsi[corpus], num_features=NUM_TOPIC)
index.save(lsipath + 'viva.index')
print('index saved')
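A minimal sketch of querying the saved index, assuming the viva.dict, viva.lsi and viva.index files written by the snippets in this collection; the query text is illustrative:
from gensim import corpora, models, similarities
lsipath = './lsi/'
dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
lsi = models.lsimodel.LsiModel.load(lsipath + 'viva.lsi')
index = similarities.docsim.Similarity.load(lsipath + 'viva.index')
query_bow = dictionary.doc2bow('some query text'.split())
sims = index[lsi[query_bow]]  # similarity of the query against every document
print(sorted(enumerate(sims), key=lambda pair: -pair[1])[:10])  # top 10 matches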
def load_corpus(self, infile):
self.corpus = corpora.MmCorpus(infile)
def save_corpus(self, corpusfile, dictfile):
dictionary = corpora.Dictionary(self.lines)
corpus = [dictionary.doc2bow(line) for line in self.lines]
dictionary.save(dictfile)
corpora.MmCorpus.serialize(corpusfile, corpus)
def save(self, save_dir='./'):
"""
Write out the built corpus to a save directory.
"""
# Store the tag tables.
pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))
# Store the document titles.
pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))
# Write out the tfidf model.
self.tfidf_model.save(save_dir + 'documents.tfidf_model')
# Write out the tfidf corpus.
corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)
# Write out the dictionary.
self.dictionary.save(save_dir + 'documents.dict')
# Save the filenames.
pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))
# Save the file ID and line numbers for each document.
pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))
# Objects that are not saved:
# - stop_list - You don't need to filter stop words for new input
# text, they simply aren't found in the dictionary.
# - frequency - This preliminary word count object is only used for
# removing infrequent words. Final word counts are in
# the `dictionary` object.
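The save()/load() pair round-trips the whole corpus through a directory. A minimal sketch of using them together (the owning class is not shown in these snippets, so builder is a hypothetical instance; note that save_dir is joined by plain string concatenation, so it must end with a slash):
builder.save(save_dir='./corpus_data/')  # writes the pickles, tfidf model, mm corpus and dictionary
ksearch = type(builder).load(save_dir='./corpus_data/')  # load() is a classmethod returning a KeySearch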
def gensim(self):
# https://radimrehurek.com/gensim/dist_lsi.html
# https://radimrehurek.com/gensim/models/lsimodel.html
corpus = corpora.MmCorpus('../lda/lda_sources/documents_corpus.mm')
id2word = corpora.Dictionary.load('../lda/lda_sources/documents_dictionary.dict')
lsi = models.LsiModel(corpus, id2word=id2word, num_topics=self.dimensions)
return lsi
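The returned LsiModel can fold new bag-of-words vectors into the reduced space. A minimal sketch, assuming lsi is the model returned by gensim() above and the query text is illustrative:
id2word = corpora.Dictionary.load('../lda/lda_sources/documents_dictionary.dict')
bow = id2word.doc2bow('example query text'.split())
print(lsi[bow])  # list of (topic_id, weight) pairs in the LSI space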
def test_textcorpus(self):
"""Make sure TextCorpus can be serialized to disk. """
# construct corpus from file
miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
# make sure serializing works
ftmp = get_tmpfile('test_textcorpus.mm')
corpora.MmCorpus.save_corpus(ftmp, miislita)
self.assertTrue(os.path.exists(ftmp))
# make sure deserializing gives the same result
miislita2 = corpora.MmCorpus(ftmp)
self.assertEqual(list(miislita), list(miislita2))
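For reference, MmCorpus.save_corpus() as used in this test writes only the .mm file, while MmCorpus.serialize() (used in several snippets above) also writes an offset index next to it, which is what enables random access on the loaded corpus. A small sketch continuing from the test above:
ftmp2 = get_tmpfile('test_textcorpus_indexed.mm')
corpora.MmCorpus.serialize(ftmp2, miislita)  # also writes ftmp2 + '.index'
indexed = corpora.MmCorpus(ftmp2)
print(indexed[0])  # random access works because the index file exists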
def set_corpus(self, corpus):
if isinstance(corpus, str):
self.corpus = corpora.MmCorpus(corpus)
elif isinstance(corpus, corpora.MmCorpus):
self.corpus = corpus
def set_data(self, corpus):
if isinstance(corpus, str):
self.data = corpora.MmCorpus(corpus)
elif isinstance(corpus, corpora.MmCorpus):
self.data = corpus
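Both setters accept either a path to a serialized Matrix Market file or an already-loaded MmCorpus instance. A minimal sketch (model is a hypothetical object exposing these methods and the file name is illustrative):
model.set_corpus('documents_tfidf.mm')  # loaded from disk
model.set_data(corpora.MmCorpus('documents_tfidf.mm'))  # or pass an instance directly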
def main(argv):
cli_parser = make_cli_parser()
opts, args = cli_parser.parse_args(argv)
if len(args) != 2:
cli_parser.error("Please provide an input file and an output file prefix")
if not os.path.isfile(args[1]+'.lda'):
if os.path.isfile(args[1]+'.bow2mm') and os.path.isfile(args[1]+'.id2word'):
id2word = corpora.Dictionary.load(args[1]+'.id2word')
else :
id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
# ignore words that appear in less than 5 documents or more than 20% documents
# when we do filtering, some vector becomes empty! it generates a huge problem!!
# id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
# save dictionary
id2word.save(args[1]+'.id2word')
# save doc2bow vector
corpora.MmCorpus.serialize(args[1]+'.bow2mm', iter_doc2bow(args[0], opts.numlines, id2word))
mm_corpus = corpora.MmCorpus(args[1]+'.bow2mm')
model=LdaMulticore(mm_corpus, id2word=id2word, num_topics=opts.numtopics, workers=opts.numprocs, passes=opts.numepochs)
model.save(args[1]+'.lda')
infile = open(args[0])
outfile = open(args[1]+'.csv', "w")
out_csvfile = csv.writer(outfile, delimiter =',')
in_csvfile = csv.reader(infile, delimiter=',')
for row in in_csvfile:
if row[0] == '0':  # csv fields are read as strings
break
processed_post = preprocess(row[3]).split()
if len(processed_post) == 0: # skip empty documents (they are useless)
continue
result_list = row[1:3]
result_list.extend(query_tag(id2word, model, processed_post))
out_csvfile.writerow(result_list)
infile.close()
outfile.close()
#print query_tag(id2word, model, "Hello über, world is awesome!")
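query_tag() is not defined in this snippet; with a trained gensim LDA model, per-document topic inference typically looks like the sketch below (illustrative only, reusing id2word, model and preprocess from above):
bow = id2word.doc2bow(preprocess('Hello world, this is awesome!').split())
topic_dist = model[bow]  # list of (topic_id, probability) pairs
best = max(topic_dist, key=lambda t: t[1])[0] if topic_dist else None
print(model.show_topic(best) if best is not None else [])  # top words of the dominant topic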
def load_dict_corpus_all_review():
'''
return the gensim dict&corpus on the whole review corpus
:return: dict&corpus
'''
if not (os.path.isfile(DICT_PATH) and os.path.isfile(CORPUS_PATH)):
generate_dict_corpus_all_review()
print('Reading dict & corpus')
dict = corpora.Dictionary.load(DICT_PATH)
corpus = corpora.MmCorpus(CORPUS_PATH)
print('Reading completed')
return corpus, dict
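A minimal sketch of using the loaded pair to train a topic model (the topic count is illustrative):
from gensim import models
corpus, dictionary = load_dict_corpus_all_review()
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=50)
print(lda.print_topics(5))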
def save_bow(hotel_files,extra_stopwords=None):
corpus = Corpus(hotel_files,extra_stopwords)
corpora.MmCorpus.serialize(BowFile,corpus)
print "==================== BOW data Generated and Saved ===================="
def save_tfidf():
corpus_bow = corpora.MmCorpus(BowFile)
tfidf_model = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf_model[corpus_bow]
corpora.MmCorpus.serialize(TfidfFile,corpus_tfidf)
print "==================== TF-IDF data Generated and Saved ===================="
def lsi_model_topics():
dictionary = corpora.Dictionary.load(DictionaryFile)
corpus_tfidf = corpora.MmCorpus(TfidfFile)
N_TOPICS = 300
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
print "================= LSI MODEL IS BUILT ================="
lsi_model.save(LsiModelFile)
save_topics(lsi_model,LsiTopicsFile)
def lda_model_topics():
dictionary = corpora.Dictionary.load(DictionaryFile)
corpus_bow = corpora.MmCorpus(BowFile)
N_TOPICS = 100
model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
print "================= LDA MODEL IS BUILT ================="
model.save(LdaModelFile)
save_topics(model,LdaTopicsFile)
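These four helpers form a small pipeline: the BOW corpus must exist before the TF-IDF step, and DictionaryFile must have been saved elsewhere (that step is not shown in these snippets) before the two model-building steps. A sketch of the intended order, where hotel_files is whatever file list the Corpus class expects:
save_bow(hotel_files)  # writes BowFile
save_tfidf()  # reads BowFile, writes TfidfFile
lsi_model_topics()  # reads DictionaryFile and TfidfFile, saves LsiModelFile
lda_model_topics()  # reads DictionaryFile and BowFile, saves LdaModelFile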
evaluation_runner.py (project: moviegeek, author: practical-recommender-systems)
def evaluate_cb_recommender():
K = 20
timestr = time.strftime("%Y%m%d-%H%M%S")
file_name = '{}-cb-k.csv'.format(timestr)
lda_path = './lda/'
corpus = corpora.MmCorpus(lda_path + 'corpus.mm')
index = similarities.MatrixSimilarity.load(lda_path + 'index.lda')
with open(file_name, 'a', 1) as logfile:
logfile.write("rak, pak, mae, min_overlap, min_sim, K, min_num_of_ratings, min_rank, user_coverage, "
"movie_coverage\n")
for K in np.arange(5, 20, 3):
recommender = ContentBasedRecs()
er = EvaluationRunner(0,
None,
recommender,
K)
result = er.calculate(1, 5, number_test_users=-1)
user_coverage, movie_coverage = RecommenderCoverage(recommender).calculate_coverage()
pak = result['pak']
mae = result['mae']
rak = result['rak']
logfile.write("{}, {}, {}, {}, {}, {}\n".format(rak, pak, mae, K, user_coverage, movie_coverage))
logfile.flush()
def getLsiModel(lsipath='./lsi/', num_topics=300):
# Load the dictionary
dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
print 'dictionary loaded'
# Load the corpus
corpus = corpora.MmCorpus(lsipath +'viva.mm')
print('mm loaded')
t31 = time.time()
# tfidf
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
t32 = time.time()
print "tfidf_corpus time = ", t32 - t31
# baobao change 3 lines
# corpus = MyCorpus()
# lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC,power_iters=2,chunksize=50000,onepass=True,distributed=False)
# lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,chunksize=20000)
lsi = None
try:
lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=60000, power_iters=2, onepass=True)  # train the LSI model
lsi.save(lsipath + 'viva.lsi')
print('lsi model trained and saved')
except (SystemExit, KeyboardInterrupt):
raise
except Exception, e:
logging.error('Failed to lsi train', exc_info=True)
return lsi
def main(text_dir):
topics = list(range(10, 101, 10)) + list(range(120, 201, 20)) + list(range(250, 451, 50))
#topics = range(10, 21, 10)
#corpus = DocCorpus(text_dir)
#dictionary = corpus.dictionary
corpus = MmCorpus('../twitter_LDA_topic_modeling/simple-wiki.mm')
dictionary = Dictionary.load('../twitter_LDA_topic_modeling/simple-wiki.dict')
print('Building LDA models')
lda_models = [models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=i, passes=5) for i in tqdm(topics)]
print('Generating coherence models')
texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
func = partial(build_coherence_models,
corpus=corpus,
dictionary=dictionary,
texts=texts)
coherence_models = pool.map(func, lda_models)
pool.close()
# print('Extracting data from models')
# model_data = [extract_data(model, corpus, dictionary) for model in tqdm(lda_models)]
# d = defaultdict(list)
# print('Generating output data')
# for i, data in tqdm(enumerate(model_data)):
# d['num_topics'].append(data['num_topics'])
# d['cao_juan_2009'].append(cao_juan_2009(data['topic_term_dists'], data['num_topics']))
# d['arun_2010'].append(arun_2010(data['topic_term_dists'], data['doc_topic_dists'], data['doc_lengths'], data['num_topics']))
# d['deveaud_2014'].append(deveaud_2014(data['topic_term_dists'], data['num_topics']))
# d['u_mass_coherence'].append(data['u_mass_coherence'])
d = defaultdict(list)
print('Generating output data')
for data in tqdm(coherence_models):
d['num_topics'].append(data['num_topics'])
d['u_mass'].append(data['u_mass'])
d['c_v'].append(data['c_v'])
d['c_uci'].append(data['c_uci'])
d['c_npmi'].append(data['c_npmi'])
df = pd.DataFrame(d)
df = df.set_index('num_topics')
df.to_csv('coherence_simple_wiki', sep='\t')
df.plot(xticks=df.index, style=['bs-', 'yo-', 'r^-', 'gx-'])
ax1 = df.plot(xticks=df.index, style='bs-', grid=True, y='u_mass')
ax2 = df.plot(xticks=df.index, style='yo-', grid=True, y='c_v', ax=ax1)
ax3 = df.plot(xticks=df.index, style='r^-', grid=True, y='c_npmi', ax=ax2)
df.plot(xticks=df.index, style='gx-', grid=True, y='c_uci', ax=ax3)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.17), fancybox=True, shadow=True, ncol=4, fontsize=9)
plt.subplots_adjust(bottom=0.2)
plt.xticks(df.index, rotation=45, ha='right', fontsize=8)
plt.savefig('coherence_simple_wiki')
plt.close()
def generate_dict_corpus_all_review():
'''
generate the gensim dict&corpus on the whole review corpus
:return:
'''
print('Generating new dict and corpus on all Yelp reviews')
review_file = open(FULL_YELP_REVIEW_PATH, 'r')
# output_review = open("review.json", 'w')
# output_tip = open("tip.json", 'w')
texts = []
stoplist = load_stopword(STOPWORD_PATH)
count = 0
for line in review_file:
count += 1
if count % 10000 == 0:
print(count)
json_review = json.loads(line.strip())
text = json_review.get("text").decode('utf-8').lower()
# tokenize and clean. Split non-word&number: re.sub(r'\W+|\d+', '', word.decode('utf-8')). Keep all words:r'\d+'
tokens = [re.sub(r'\W+|\d+', '', word) for word in text.split()]
# remove stop words and short tokens
tokens = [token for token in tokens if ((not token.strip()=='') and (not token in stoplist))]
# stemming: experiments showed that stemming did not help
# if (stemming):
# stemmer = PorterStemmer()
# texts = [[ stemmer.stem(token) for token in text] for text in texts]
texts.append(tokens)
review_file.close()
# remove words that appear only once
# from collections import defaultdict
# frequency = defaultdict(int)
# for token in tokens:
# frequency[token] += 1
# for text in texts:
# tokens = []
# for token in text:
# if (frequency[token] > 1):
# tokens.append(token)
# text = tokens
# texts = [[token for token in text if (frequency[token] > 1)] for text in texts]
print('Corpus preprocessing and counting completed!')
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=5)
dictionary.save(DICT_PATH) # store the dictionary, for future reference
dictionary.save_as_text(DICT_TXT_PATH)
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize(CORPUS_PATH, corpus) # store to disk, for later use
print('Dict and corpus generation completed!')
def reduce_lsi(dictionary, corpus_tfidf, weibo_test):
corpus_lsi = None
lsi_model = None
# Transform the tfidf corpus into the LSI space
if not os.path.exists(path_tmp_lsi):
print('=== no lsi directory found, transforming the tfidf corpus into an lsi corpus ===')
if not dictionary:
dictionary = corpora.Dictionary.load(path_dictionary)
if not corpus_tfidf:  # the tfidf corpus was not passed in, so load it from disk
print('--- tfidf corpus not found in memory, loading the tfidf files from disk ---')
# collect the category names from the tfidf files
files = os.listdir(path_tmp_tfidf)
catg_list = []
for file in files:
t = file.split('.')[0]
if t not in catg_list:
catg_list.append(t)
# load the tfidf corpus for each category
corpus_tfidf = {}
for catg in catg_list:
path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg)
corpus = corpora.MmCorpus(path)
corpus_tfidf[catg] = corpus
print('--- tfidf corpus loaded, building the lsi model ---')
# build the lsi model
os.makedirs(path_tmp_lsi)
corpus_tfidf_total = []
catgs = list(corpus_tfidf.keys())
for catg in catgs:
tmp = corpus_tfidf.get(catg)
corpus_tfidf_total += tmp
lsi_model = models.LsiModel(corpus=corpus_tfidf_total, id2word=dictionary, num_topics=50)
# persist the lsi model to disk
lsi_file = open(path_tmp_lsimodel, 'wb')
pkl.dump(lsi_model, lsi_file)
lsi_file.close()
del corpus_tfidf_total  # no longer needed once the lsi model is trained
print('--- lsi model built ---')
# build the lsi corpus for each category, releasing the tfidf corpus as we go
corpus_lsi = {}
for catg in catgs:
corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)]
corpus_lsi[catg] = corpu
corpus_tfidf.pop(catg)
corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg),
corpu,
id2word=dictionary)
print('=== lsi corpus built and saved ===')
else:
print('=== existing lsi directory found, skipping the lsi build ===')
svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test)