def getTextConfidence(self, text):
if self.typeOfSim == 'jaccard':
intend_confidenceList = []
for i in self.know_words:
intend_confidenceList.append(jaccard_compare(text, i))
if len(self.know_words) > 0:
return max(intend_confidenceList)
else :
return 0
elif self.typeOfSim == 'gensim':
try:
from gensim import corpora, models, similarities
except Exception as e:
print(e)
dictionary = corpora.Dictionary(self.know_words_remove_stopwords)
        corpus = [dictionary.doc2bow(doc) for doc in self.know_words_remove_stopwords]  # distinct loop name so the `text` argument is not shadowed
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
new_doc = text
vec_bow = dictionary.doc2bow(new_doc.lower().split())
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
most_sim = sims[0]
return most_sim[1]
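The jaccard_compare helper used in the 'jaccard' branch is not shown above; a minimal sketch of what it is assumed to compute (token-set Jaccard similarity between two strings) could look like this:

def jaccard_compare(text_a, text_b):
    # hypothetical helper: Jaccard similarity of the two token sets
    set_a, set_b = set(text_a.lower().split()), set(text_b.lower().split())
    if not set_a and not set_b:
        return 0.0
    return float(len(set_a & set_b)) / len(set_a | set_b)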
def get_similarity(query, ans_list):
s_lenth = len(ans_list)
Corp = ans_list
    # build a dictionary from the candidate answers
dictionary = corpora.Dictionary(Corp)
    # convert each answer into its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in Corp]
lsi = models.LsiModel(corpus)
corpus_lsi = lsi[corpus]
vec_bow = dictionary.doc2bow(query)
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(corpus_lsi)
sims = index[vec_lsi]
similarity = list(sims)
# print(similarity)
end_lenth = len(similarity)
if s_lenth != end_lenth:
        print('warning: similarity list length does not match the number of answers')
return similarity
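A hedged usage sketch of get_similarity: it expects both the query and the candidate answers as token lists, and assumes corpora, models and similarities are available at module level (the example texts are made up):

from gensim import corpora, models, similarities

ans_list = [['machine', 'learning', 'is', 'fun'],
            ['deep', 'learning', 'uses', 'neural', 'networks'],
            ['cooking', 'pasta', 'at', 'home']]
query = ['learning', 'with', 'neural', 'networks']
scores = get_similarity(query, ans_list)  # one cosine score per answer, in input order
best_idx = max(range(len(scores)), key=lambda i: scores[i])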
def run_model(name):
if name == 'lsi':
lsi = models.LsiModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics)
print('Saving lsi_model...')
lsi.save('exports/lsi.model')
print('lsi_model saved!')
# lsi_matrix = gensim.matutils.corpus2dense(lsi[corpus_gensim], len(lsi.projection.s)).T / lsi.projection.s
# print('Saving lsi_matrix...')
# pickle.dump(lsi_matrix, open('exports/lsi_matrix.p','wb'))
# print('lsi_matrix saved!')
elif name == 'lda':
# lda = models.LdaModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)
lda = models.ldamulticore.LdaMulticore(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)#, alpha='auto') #auto needs non multicore LDA
print('Saving lda_model...')
lda.save('exports/lda.model')
print('lda_model saved!')
# lda_matrix = gensim.matutils.corpus2dense(lda[corpus_gensim], lda.num_topics)
# print('Saving lda_matrix...')
# pickle.dump(lda_matrix, open('exports/lda_matrix.p','wb'))
# print('lda_matrix saved!')
gc.collect()
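After run_model has finished, the saved models can be reloaded in a later session; a minimal sketch using the same paths as the save calls above:

from gensim import models

lsi = models.LsiModel.load('exports/lsi.model')
lda = models.ldamulticore.LdaMulticore.load('exports/lda.model')
print(lsi.show_topics(num_topics=5, num_words=8))
print(lda.show_topics(num_topics=5, num_words=8))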
def train_model(model_name, corpus, id2word, num_topics):
"""
Train specified model
"""
# LDA
if model_name == 'lda':
model = models.LdaModel(
corpus,
id2word=id2word,
num_topics=num_topics,
alpha='auto',
eval_every=5,
)
return model
# LSI
elif model_name == 'lsi':
model = models.LsiModel(
corpus,
id2word=id2word,
num_topics=num_topics,
)
return model
else:
print('Invalid model name')
return None
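A hedged usage sketch for train_model, assuming models has been imported from gensim and that corpus is a bag-of-words corpus with id2word the matching gensim Dictionary built elsewhere:

model = train_model('lsi', corpus, id2word, num_topics=100)
if model is not None:
    for topic_id, terms in model.show_topics(num_topics=5, num_words=8, formatted=False):
        print(topic_id, [term for term, weight in terms])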
def gensim(self):
# https://radimrehurek.com/gensim/dist_lsi.html
# https://radimrehurek.com/gensim/models/lsimodel.html
corpus = corpora.MmCorpus('../lda/lda_sources/documents_corpus.mm')
id2word = corpora.Dictionary.load('../lda/lda_sources/documents_dictionary.dict')
lsi = models.LsiModel(corpus, id2word=id2word, num_topics=self.dimensions)
return lsi
def train_by_lsi(lib_texts):
"""
    Train an LSI model on the tokenized library texts and build a similarity index.
"""
from gensim import corpora, models, similarities
    # uncomment the two lines below to see gensim's training progress in the log
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]  # doc2bow() maps each document to a list of (word_id, word_frequency) pairs
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
    # train an LSI model with 10 topics
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    index = similarities.MatrixSimilarity(lsi[corpus])  # index is a gensim.similarities.docsim.MatrixSimilarity instance
return (index, dictionary, lsi)
# Querying -- fold a new document into the trained LSI space and rank the library documents by similarity
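A sketch of that query step, using the triple returned by train_by_lsi (the query tokens are illustrative):

index, dictionary, lsi = train_by_lsi(lib_texts)   # lib_texts: list of token lists
query_tokens = ['some', 'new', 'document', 'tokens']
query_lsi = lsi[dictionary.doc2bow(query_tokens)]  # fold the query into LSI space
sims = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])
print(sims[:5])                                    # top-5 most similar library documents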
def generateTopic(self,method=TopicMethod.LDA,numTopics=10):
corpus=[self.dictionary.doc2bow(article.wordList) for article in Article.objects.only("wordList")]
if method==TopicMethod.LDA:
instance=ldamodel.LdaModel(corpus,id2word=self.dictionary,num_topics=numTopics)
elif method==TopicMethod.LSI:
instance=models.LsiModel(corpus,id2word=self.dictionary,num_topics=numTopics)
dstCorpus=instance[corpus]
return dstCorpus
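The returned dstCorpus streams (topic_id, weight) pairs per article; a hedged sketch of turning it into a dense article-by-topic matrix, assuming builder is an instance of the class that defines generateTopic:

from gensim import matutils

topic_corpus = builder.generateTopic(method=TopicMethod.LSI, numTopics=10)
X = matutils.corpus2dense(topic_corpus, num_terms=10).T  # rows: articles, columns: topic weights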
def do_calc_svd(self):
print("?????%d" %(nlp_master.get_dict_len()))
self.k_value = int(0.1*(nlp_master.get_dict_len()))
if self.k_value < 300:
self.k_value = 300
if self.k_value > 1000:
self.k_value = 1000
print("k??%d" %(self.k_value))
tfidf = models.TfidfModel(list(nlp_master._id_docs.values()))
tfidf_corpus = tfidf[list(nlp_master._id_docs.values())]
    # num_topics: a commonly recommended range for LSI is roughly 200-500
    # train the LSI model
self.lsi = models.LsiModel(tfidf_corpus, id2word=nlp_master.dictionary, num_topics=self.k_value, chunksize=2000)
    # dump the trained state to disk
today = datetime.date.today()
self.dumpfile = "dumpdir/recsvd_dump.%d_%d" %(today.month, today.day)
with open(self.dumpfile,'wb', -1) as fp:
dump_data = []
dump_data.append(self._user_classifier)
dump_data.append(self.k_value)
dump_data.append(self.lsi)
pickle.dump(dump_data, fp, -1)
return
def build_lsa(self, nt, dictionary, tfidf_corpus, tfidf_matrix):
## Description: Builds LSA model and performs document similarity
## Params: Number of topics, dict, TFIDF corpus, TFIDF matrix
## Returns: Similarity index and matrix
lsa_model = models.LsiModel(tfidf_corpus, id2word= dictionary, num_topics=nt)
index = similarities.MatrixSimilarity(lsa_model[tfidf_corpus])
matrix = tfidf_matrix.apply(lambda x: lsa_model[x], 1)
return (index, matrix)
def train_lsi_model_gensim(corpus, total_topics=2):
norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
dictionary = corpora.Dictionary(norm_tokenized_corpus)
mapped_corpus = [dictionary.doc2bow(text)
for text in norm_tokenized_corpus]
tfidf = models.TfidfModel(mapped_corpus)
corpus_tfidf = tfidf[mapped_corpus]
lsi = models.LsiModel(corpus_tfidf,
id2word=dictionary,
num_topics=total_topics)
return lsi
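normalize_corpus is an external helper that is not shown; a minimal stand-in with the same call signature (lower-casing, tokenizing and dropping English stopwords) might look like this:

import re
from gensim.parsing.preprocessing import STOPWORDS

def normalize_corpus(corpus, tokenize=True):
    # hypothetical stand-in for the preprocessing helper assumed above
    normalized = []
    for doc in corpus:
        tokens = [t for t in re.findall(r'[a-z]+', doc.lower()) if t not in STOPWORDS]
        normalized.append(tokens if tokenize else ' '.join(tokens))
    return normalized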
def lsi_model_topics():
dictionary = corpora.Dictionary.load(DictionaryFile)
corpus_tfidf = corpora.MmCorpus(TfidfFile)
N_TOPICS = 300
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
print "================= LSI MODEL IS BUILT ================="
lsi_model.save(LsiModelFile)
save_topics(lsi_model,LsiTopicsFile)
def LSI_fit(self, data):
'''
Fits an LSI model and returns it with associated dictionary
'''
texts = [[tag for tag in sent] for sent in self.get_pos(data)]
dictionary = corpora.Dictionary(texts)
texts = map(dictionary.doc2bow, texts)
lsi = models.LsiModel(texts, id2word=dictionary,
num_topics=self.num_topics)
return dictionary, lsi
def load_lsi_model(self):
print "loading lsi model from", self.lsi_filepath
self.lsi_model = models.LsiModel.load(self.lsi_filepath, mmap='r')
def make_lsi_model(self, seqs):
if self.use_tfidf:
seqs = (self.tfidf_model[self.lexicon.doc2bow(tokenize(seq))] for seq in seqs)
else:
seqs = (self.lexicon.doc2bow(tokenize(seq)) for seq in seqs)
self.lsi_model = models.LsiModel(seqs, num_topics=self.n_lsi_dim, id2word=self.lexicon)
self.lsi_model.save(self.lsi_filepath)
print "saved lsi model to", self.lsi_filepath
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print 'dictionary loaded'
    # load the bag-of-words corpus
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
print ('mm load')
t31 = time.time()
# tfidf
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
t32 = time.time()
print "tfidf_corpus time = ", t32 - t31
# baobao change 3 lines
# corpus = MyCorpus()
# lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC,power_iters=2,chunksize=50000,onepass=True,distributed=False)
# lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,chunksize=20000)
lsi = None
try:
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=60000, power_iters=2, onepass=True)  # train the LSI model
        lsi.save(lsipath + 'viva.lsi')
        print('lsi model saved')
except (SystemExit, KeyboardInterrupt):
raise
except Exception, e:
logging.error('Failed to lsi train', exc_info=True)
return lsi
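A sketch of reloading the persisted model and folding a new document in (paths mirror getLsiModel; the tokens are illustrative). Note that training applied TF-IDF before LSI, so for a faithful fold-in the fitted TfidfModel would also have to be saved and applied to the query bag-of-words first:

lsipath = './lsi/'
lsi = models.LsiModel.load(lsipath + 'viva.lsi')
dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
new_doc_tokens = [u'word1', u'word2']  # illustrative; real input would be segmented text
print(lsi[dictionary.doc2bow(new_doc_tokens)])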
def process(self, unused, site, config):
try:
num_related_posts = config.num_related_posts
# Tokenize
docs = []
valid_posts = [] #exclude pages that are not posts
for post in site.posts:
if post.meta.microdata_type not in RelatedPosts.VALID_FORMAT:
continue
txt = post.md
docs.append(gensim.utils.simple_preprocess(txt, deacc=True, min_len=3, max_len=15))
valid_posts.append(post)
# Fixme stemming
# build model
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = models.tfidfmodel.TfidfModel(corpus=corpus)
# Fixme: get correct number of topics
num_topics = site.posts_by_tag.get_num_collections() * 2 # use collections as a proxy for the number of topics
topic_model = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=num_topics)
index = similarities.MatrixSimilarity(topic_model[tfidf[corpus]], num_best=num_related_posts + 1) #+1 because the best one is itself
        # find similar posts and store them
log_details = ""
for post, sims in zip(valid_posts, index):
if post.meta.microdata_type not in RelatedPosts.VALID_FORMAT:
continue
post.meta.related_posts = []
log_details += '<div class="subsection"><h3>%s</h3>Related posts:<ol>' % (post.meta.title)
for idx, score in sims[1:]: #1: > first one is the article itself
p = valid_posts[idx]
o = utils.create_objdict()
o.meta = p.meta
o.score = score
o.html = p.score
post.meta.related_posts.append(o)
log_details += '<li>%s (%s)</li>' % (o.meta.title, round(score,2))
            log_details += '</ol></div>'
return (SiteFab.OK, "Related posts via LSI", log_details)
except Exception as e:
return (SiteFab.ERROR, "Related posts via LSI", e)
def test_lee(self):
"""correlation with human data > 0.6
(this is the value which was achieved in the original paper)
"""
global bg_corpus, corpus
# create a dictionary and corpus (bag of words)
dictionary = corpora.Dictionary(bg_corpus)
bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
corpus = [dictionary.doc2bow(text) for text in corpus]
# transform the bag of words with log_entropy normalization
log_ent = models.LogEntropyModel(bg_corpus)
bg_corpus_ent = log_ent[bg_corpus]
# initialize an LSI transformation from background corpus
lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
# transform small corpus to lsi bow->log_ent->fold-in-lsi
corpus_lsi = lsi[log_ent[corpus]]
# compute pairwise similarity matrix and extract upper triangular
res = np.zeros((len(corpus), len(corpus)))
for i, par1 in enumerate(corpus_lsi):
for j, par2 in enumerate(corpus_lsi):
res[i, j] = matutils.cossim(par1, par2)
flat = res[matutils.triu_indices(len(corpus), 1)]
cor = np.corrcoef(flat, human_sim_vector)[0, 1]
logging.info("LSI correlation coefficient is %s" % cor)
self.assertTrue(cor > 0.6)
# def test_lee_mallet(self):
# global bg_corpus, corpus, bg_corpus2, corpus2
# # create a dictionary and corpus (bag of words)
# dictionary = corpora.Dictionary(bg_corpus2)
# bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
# corpus = [dictionary.doc2bow(text) for text in corpus2]
# # initialize an LDA transformation from background corpus
# lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
# corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
# corpus_lda = lda[corpus]
# # compute pairwise similarity matrix and extract upper triangular
# res = np.zeros((len(corpus), len(corpus)))
# for i, par1 in enumerate(corpus_lda):
# for j, par2 in enumerate(corpus_lda):
# res[i, j] = matutils.cossim(par1, par2)
# flat = res[matutils.triu_indices(len(corpus), 1)]
# cor = np.corrcoef(flat, human_sim_vector)[0, 1]
# logging.info("LDA correlation coefficient is %s" % cor)
# self.assertTrue(cor > 0.35)
def __init__(self, itemInfos):
lastTime = time.time()
# itemInfos : dict[(pid, description)]
# train model
jieba.load_userdict('./dict.txt.big.txt')
stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
stopWords.add('\n')
stopWords.add(' ')
stopWords.add(u'\u2022')
stopWords.add(u'\xa9')
texts = []
self.name2id = {}
self.id2name = []
for k, v in itemInfos.iteritems():
seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
texts.append(list(seg_list))
self.name2id[k] = len(self.id2name)
self.id2name.append(k)
frequency = defaultdict(int)
for text in texts:
for token in text:
frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]
print "start cast :", (time.time() - lastTime)
lastTime = time.time()
dictionary = corpora.Dictionary(texts)
print "dictionary cast :", (time.time() - lastTime)
lastTime = time.time()
corpus = [dictionary.doc2bow(text) for text in texts]
print "doc2bow cast :", (time.time() - lastTime)
lastTime = time.time()
tfidf = models.TfidfModel(corpus)
print "tfid model cast :", (time.time() - lastTime)
lastTime = time.time()
lastTime = time.time()
corpus_tfidf = tfidf[corpus]
print "tfidf corpus cast :", (time.time() - lastTime)
lastTime = time.time()
self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
print "lsi model cast :", (time.time() - lastTime)
lastTime = time.time()
#corpus_lsi = lsi[corpus_tfidf]
self.index = similarities.MatrixSimilarity(self.lsi[corpus])
self.corpus = corpus
self.pidName = getPidName()
print "init finish"
def reduce_lsi(dictionary, corpus_tfidf, weibo_test):
corpus_lsi = None
lsi_model = None
    # # # # third stage: transform the tfidf corpus into an LSI corpus
    if not os.path.exists(path_tmp_lsi):
        print('=== no LSI folder found, starting LSI computation ===')
if not dictionary:
dictionary = corpora.Dictionary.load(path_dictionary)
        if not corpus_tfidf:  # if the tfidf corpus was not passed in, load it from disk
            print('--- tfidf corpus not in memory, reading the existing tfidf files ---')
            # collect the category names from the tfidf directory
files = os.listdir(path_tmp_tfidf)
catg_list = []
for file in files:
t = file.split('.')[0]
if t not in catg_list:
catg_list.append(t)
            # load each category's tfidf corpus from disk
corpus_tfidf = {}
for catg in catg_list:
path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg)
corpus = corpora.MmCorpus(path)
corpus_tfidf[catg] = corpus
            print('--- tfidf corpus loaded, starting to train the LSI model ---')
        # train the lsi model
os.makedirs(path_tmp_lsi)
corpus_tfidf_total = []
catgs = list(corpus_tfidf.keys())
for catg in catgs:
tmp = corpus_tfidf.get(catg)
corpus_tfidf_total += tmp
lsi_model = models.LsiModel(corpus=corpus_tfidf_total, id2word=dictionary, num_topics=50)
        # persist the LSI model to disk
lsi_file = open(path_tmp_lsimodel, 'wb')
pkl.dump(lsi_model, lsi_file)
lsi_file.close()
        del corpus_tfidf_total  # the LSI model is trained, free the combined tfidf corpus
        print('--- LSI model generated ---')
        # build the corpus of lsi, releasing the corpus of tfidf category by category
corpus_lsi = {}
for catg in catgs:
corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)]
corpus_lsi[catg] = corpu
corpus_tfidf.pop(catg)
corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg),
corpu,
id2word=dictionary)
        print('=== LSI corpus generated and serialized ===')
else:
        print('=== LSI folder detected, skipping this stage ===')
svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test)
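When the cached branch is taken, the pickled model dumped above can be reloaded before calling svm_module; a minimal sketch assuming the same path constant:

import pickle as pkl

with open(path_tmp_lsimodel, 'rb') as lsi_file:
    lsi_model = pkl.load(lsi_file)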
def __init__(self, filename):
self.docs = loads(open(filename, "r").read())
self.docmap = hoist_dict(self.docs, "id")
if isfile("data.dict"):
self.dictionary = Dictionary.load("data.dict")
else:
self.dictionary = Dictionary(iterate_summaries(self.docs))
self.dictionary.save("data.dict")
if isfile("data.mm"):
self.corpus = MmCorpus("data.mm")
else:
corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
MmCorpus.serialize("data.mm", corpus)
self.corpus = MmCorpus("data.mm")
self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)
if isfile("data.sim"):
self.sim = MatrixSimilarity.load("data.sim")
else:
self.sim = MatrixSimilarity(self.lsi[self.corpus])
self.sim.save("data.sim")
# self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100, update_every=1, chunksize=10000, passes=1)
self.sentiment_model = Doc2Vec.load("imdb.d2v")
self.sentiment = LogisticRegression()
self.sentiment.fit([self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
[self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))
if isfile("arxiv.d2v"):
self.doc_model = Doc2Vec.load("arxiv.d2v")
else:
tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
doc_model.build_vocab(tagged)
shuffle(tagged) # Replace with functional stuff
for epoch in range(10):
doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
doc_model.save("arxiv.d2v")