def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents = [[word for word in document.text.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc:
            d[index] = value
        ds.append(d)
    return ds
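# A minimal usage sketch (not part of the original snippet). It assumes each input
# document exposes a .text attribute, as get_tfidf() above expects; the Doc class
# and the sample texts are hypothetical stand-ins.
from collections import namedtuple
from gensim import corpora, models

Doc = namedtuple('Doc', ['text'])
docs = [Doc('human machine interface'), Doc('graph of trees'), Doc('human interface')]
dense_rows = get_tfidf(docs)
print(len(dense_rows), len(dense_rows[0]))  # one dense tf-idf row per document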
Example source code for the Python class TfidfModel()
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def train_lda_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf,
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
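# A hedged usage sketch (not part of the original project): it assumes normalize_corpus
# is the author's helper that tokenizes raw strings, so plain texts are passed in;
# toy_corpus is made up.
toy_corpus = ['the sky is blue', 'the sun is bright', 'the sun in the sky is bright']
lda_model = train_lda_model_gensim(toy_corpus, total_topics=2)
for topic in lda_model.show_topics(num_topics=2):
    print(topic)  # each entry describes one topic and its top words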
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build a dictionary from the tokenized candidate answers
    dictionary = corpora.Dictionary(Corp)
    # convert each answer into a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
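# A hedged usage sketch (not in the original source): both the query and the candidate
# answers are passed in as token lists, and gensim's corpora/models/similarities are
# assumed to be imported as in the snippet above; the sample tokens are made up.
from gensim import corpora, models, similarities

answers = [['cats', 'drink', 'milk'], ['dogs', 'chase', 'cats'], ['the', 'sun', 'is', 'bright']]
scores = get_similarity(['cats', 'and', 'dogs'], answers)
best = max(range(len(scores)), key=lambda i: scores[i])  # index of the most similar answer
print(best, scores[best])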
def test_miislita_high_level(self):
    # construct corpus from file
    miislita = CorpusMiislita(datapath('miIslita.cor'))
    # initialize tfidf transformation and similarity index
    tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
    index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))
    # compare to query
    query = 'latent semantic indexing'
    vec_bow = miislita.dictionary.doc2bow(query.lower().split())
    vec_tfidf = tfidf[vec_bow]
    # perform a similarity query against the corpus
    sims_tfidf = index[vec_tfidf]
    # for the expected results see the article
    expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
    for i, value in enumerate(expected):
        self.assertAlmostEqual(sims_tfidf[i], value, 2)
def train_by_lsi(lib_texts):
    """
    Train an LSI model from the tokenized texts.
    """
    from gensim import corpora, models, similarities
    # uncomment to enable logging
    #import logging
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]  # doc2bow(): converts a list of words into a sparse vector of (word_id, word_frequency) tuples
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # train an LSI model with 10 topics on the tf-idf corpus
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    index = similarities.MatrixSimilarity(lsi[corpus])  # index is a gensim.similarities.docsim.MatrixSimilarity instance
    return (index, dictionary, lsi)
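# A hedged usage sketch (not in the original source): query the index returned by
# train_by_lsi() above; lib_texts and the query string are hypothetical examples.
lib_texts = [['human', 'machine', 'interface'], ['graph', 'of', 'trees'], ['human', 'interface']]
index, dictionary, lsi = train_by_lsi(lib_texts)
query_bow = dictionary.doc2bow('human computer interface'.split())
query_lsi = lsi[query_bow]  # map the query into LSI space
sims = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])
print(sims)  # (document index, cosine similarity), best match first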
def load_model_and_dictionary(self):
    self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
    self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
    print("Dictionary & Model Loaded Successfully")
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents = [[word for word in document.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf
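# A hedged usage sketch (not in the original source): this variant takes plain strings
# and returns a sparse tf-idf corpus; the sample texts are made up.
from gensim import corpora, models

texts = ['the sky is blue', 'the sun is bright', 'the sun in the sky is bright']
for doc in get_tfidf(texts):
    print(doc)  # list of (term_id, tf-idf weight) pairs for one document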
def load_tfidf(corpus, dictionary):
    if not os.path.isfile(TFIDF_MODEL_PATH):
        print('Creating TF-IDF')
        tfidf = models.TfidfModel(corpus)
        print('TF-IDF created')
        tfidf.save(TFIDF_MODEL_PATH)
    print('Loading TF-IDF model')
    tfidf = models.TfidfModel.load(TFIDF_MODEL_PATH)
    return tfidf
# doc_list = get_data()
# print(len(doc_list))
def do_calc_svd(self):
    print("dictionary size: %d" % (nlp_master.get_dict_len()))
    self.k_value = int(0.1 * (nlp_master.get_dict_len()))
    if self.k_value < 300:
        self.k_value = 300
    if self.k_value > 1000:
        self.k_value = 1000
    print("k value: %d" % (self.k_value))
    tfidf = models.TfidfModel(list(nlp_master._id_docs.values()))
    tfidf_corpus = tfidf[list(nlp_master._id_docs.values())]
    # num_topics (the SVD rank) is usually chosen in the 200-500 range
    # LSI model
    self.lsi = models.LsiModel(tfidf_corpus, id2word=nlp_master.dictionary, num_topics=self.k_value, chunksize=2000)
    # dump the trained objects to disk
    today = datetime.date.today()
    self.dumpfile = "dumpdir/recsvd_dump.%d_%d" % (today.month, today.day)
    with open(self.dumpfile, 'wb', -1) as fp:
        dump_data = []
        dump_data.append(self._user_classifier)
        dump_data.append(self.k_value)
        dump_data.append(self.lsi)
        pickle.dump(dump_data, fp, -1)
    return
def build_tfidf_base(self, corpus, bow_matrix):
    ## Description: Build and save objects common to TFIDF and LSA
    ## Params: Corpus, BOW matrix
    ## Returns: TF-IDF corpus and matrix
    tfidf_model = models.TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]
    tfidf_matrix = bow_matrix.apply(lambda x: tfidf_model[x[0]], 1)
    return tfidf_corpus, tfidf_matrix
#MODEL OBJECTS
#A model object consists of gensim similarity index and matrix containing transformed data
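# A hedged sketch (not in the original source) of the "model object" described above:
# a gensim similarity index paired with the transformed data returned by
# build_tfidf_base(); tfidf_corpus, tfidf_matrix, and num_features are assumed to come
# from the surrounding class.
from gensim import similarities

def build_tfidf_model_object(tfidf_corpus, tfidf_matrix, num_features):
    index = similarities.MatrixSimilarity(tfidf_corpus, num_features=num_features)
    return {'index': index, 'matrix': tfidf_matrix}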
# keyphrase_extraction.py (project: text-analytics-with-python, author: dipanjanS)
def get_tfidf_weighted_keyphrases(sentences,
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    valid_chunks = get_chunks(sentences, grammar=grammar)
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    weighted_phrases = {dictionary.get(id): round(value, 3)
                        for doc in corpus_tfidf
                        for id, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(),
                              key=itemgetter(1), reverse=True)
    return weighted_phrases[:top_n]
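# A hedged usage sketch (not in the original book code): it assumes get_chunks is the
# author's helper that extracts noun-phrase chunks per sentence using the given grammar;
# the sample sentences are made up.
sentences = ['elephants are large animals', 'large animals live in the savanna']
for phrase, weight in get_tfidf_weighted_keyphrases(sentences, top_n=5):
    print(phrase, weight)  # keyphrase and its tf-idf weight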
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf,
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi
def reduce_tfidf(dictionary, weibo_test):
    corpus_tfidf = None
    # stage: convert the documents into tf-idf vectors
    if not os.path.exists(path_tmp_tfidf):
        print('=== tfidf directory not found, generating the tf-idf vectors ===')
        # the dictionary from the previous stage is needed before tf-idf can be computed
        if not dictionary:  # the previous stage was skipped, so load the dictionary from disk
            dictionary = corpora.Dictionary.load(path_dictionary)
        os.makedirs(path_tmp_tfidf)
        files = os_path.LoadFiles(path_doc_root)
        tfidf_model = models.TfidfModel(dictionary=dictionary)
        corpus_tfidf = {}
        for i, msg in enumerate(files):
            catg = msg[0]
            file = msg[1]
            word_list = convert_doc_to_wordlist(file, cut_all=False)
            file_bow = dictionary.doc2bow(word_list)
            file_tfidf = tfidf_model[file_bow]
            tmp = corpus_tfidf.get(catg, [])
            tmp.append(file_tfidf)
            if tmp.__len__() == 1:
                corpus_tfidf[catg] = tmp
        # persist the tf-idf vectors of each category
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
                                       corpus_tfidf.get(catg),
                                       id2word=dictionary
                                       )
            print('catg {c} has been transformed into tfidf vector'.format(c=catg))
        print('=== tf-idf generation finished ===')
    else:
        print('=== tfidf directory detected, skipping tf-idf generation ===')
    svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
def reduce_result(dictionary, lsi_model, predictor, weibo_test):
    # stage: use the trained predictor to classify the test text
    if not dictionary:
        dictionary = corpora.Dictionary.load(path_dictionary)
    if not lsi_model:
        lsi_file = open(path_tmp_lsimodel, 'rb')
        lsi_model = pkl.load(lsi_file)
        lsi_file.close()
    if not predictor:
        x = open(path_tmp_predictor, 'rb')
        predictor = pkl.load(x)
        x.close()
    files = os.listdir(path_tmp_lsi)
    catg_list = []
    for file in files:
        t = file.split('.')[0]
        if t not in catg_list:
            catg_list.append(t)
    demo_doc = weibo_test
    print(demo_doc)
    demo_doc = list(jieba.cut(demo_doc, cut_all=False))
    demo_bow = dictionary.doc2bow(demo_doc)
    tfidf_model = models.TfidfModel(dictionary=dictionary)
    demo_tfidf = tfidf_model[demo_bow]
    demo_lsi = lsi_model[demo_tfidf]
    data = []
    cols = []
    rows = []
    for item in demo_lsi:
        data.append(item[1])
        cols.append(item[0])
        rows.append(0)
    demo_matrix = csr_matrix((data, (rows, cols))).toarray()
    x = predictor.predict(demo_matrix)
    print('predicted category: {x}'.format(x=catg_list[x[0]]))
def save_tfidf():
    corpus_bow = corpora.MmCorpus(BowFile)
    tfidf_model = models.TfidfModel(corpus_bow)
    corpus_tfidf = tfidf_model[corpus_bow]
    corpora.MmCorpus.serialize(TfidfFile, corpus_tfidf)
    print "==================== TF-IDF data Generated and Saved ===================="
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    corpus = map(sip.noun_phrases, [doc1, doc2, doc3])
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE
    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))
    lemmatize = True  # 'lemma' in program
    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')
    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')
    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
    logger.info("finished running %s" % program)
def buildCorpus(self):
    """
    Build the corpus from the documents:
      1. Remove words that only appeared once.
      2. Create the Dictionary object.
      3. Convert the documents to simple bag-of-words representation.
      4. Convert the bag-of-words vectors to tf-idf.
    """
    # Remove words that only appear once.
    self.documents = [[token for token in doc if self.frequency[token] > 1]
                      for doc in self.documents]
    # Build a dictionary from the text.
    self.dictionary = corpora.Dictionary(self.documents)
    # Map the documents to vectors.
    corpus = [self.dictionary.doc2bow(text) for text in self.documents]
    # Delete the tokenized representation of the documents -- no need to
    # carry this around!
    del self.documents[:]
    # Convert the simple bag-of-words vectors to a tf-idf representation.
    self.tfidf_model = TfidfModel(corpus)
    self.corpus_tfidf = self.tfidf_model[corpus]
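# A hedged sketch (not in the original source) of how the self.frequency counter used
# in step 1 above might be built before calling buildCorpus(): a plain token count over
# the tokenized documents; `documents` is a hypothetical list of token lists.
from collections import defaultdict

frequency = defaultdict(int)
for doc in documents:
    for token in doc:
        frequency[token] += 1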
def load_tfidf_model(self):
    print "loading tfidf from", self.tfidf_filepath
    self.tfidf_model = models.TfidfModel.load(self.tfidf_filepath, mmap='r')

def make_tfidf_model(self, seqs):
    self.tfidf_model = models.TfidfModel((self.lexicon.doc2bow(tokenize(seq)) for seq in seqs))
    self.tfidf_model.save(self.tfidf_filepath)
    print "saved tfidf to", self.tfidf_filepath
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print 'dictionary loaded'
    # load the corpus
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print ('mm load')
    t31 = time.time()
    # tfidf
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    print "tfidf_corpus time = ", t32 - t31
    # baobao change 3 lines
    # corpus = MyCorpus()
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC, power_iters=2, chunksize=50000, onepass=True, distributed=False)
    # lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=20000)
    lsi = None
    try:
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=60000, power_iters=2, onepass=True)  # train the LSI model
        lsi.save(lsipath + 'viva.lsi')
        print('LSI model saved')
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception, e:
        logging.error('Failed to lsi train', exc_info=True)
    return lsi
def tfidf_model(self):
    if self._tfidf_model is None:
        doc_count = self.post_ids_query.count()
        if doc_count < 10:
            return None
        dictionary = self.dictionary
        tfidf_model = gmodels.TfidfModel(id2word=dictionary)
        tfidf_fname = join(self.dirname, "tfidf_%d.model" % (
            self.discussion.id,))
        subcorpus = self.subcorpus
        if exists(tfidf_fname):
            tfidf_model = tfidf_model.load(tfidf_fname)
            # assumption: count implies identity.
            # Wrong in corner cases: hidden, etc.
            if tfidf_model.num_docs != doc_count:
                unlink(tfidf_fname)
                tfidf_model = gmodels.TfidfModel(id2word=dictionary)
        if tfidf_model.num_docs != doc_count:
            tfidf_model.initialize(subcorpus)
            tfidf_model.save(tfidf_fname)
        self._tfidf_model = tfidf_model
    return self._tfidf_model
def tfidf_weight(self):
    self.corpus = models.TfidfModel(self.corpus, normalize=True)
def trainModel(self):
    '''
    Train a LDA model, inclusive of 4 steps:
    1. Parse the whole corpora into unigram token collections and document mapping (for later use)
    2. Filter tokens which are not common (no_below_this_number), and too common (no_above_fraction_of_doc)
    3. Indexing the token collections and do TF-IDF transformation
    4. Call gensim.models.LdaModel and generate topic distributions of the corpora
    '''
    print 'Start preparing unigram tokens....'
    ## Start of preparing list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise Bag-Of-Words (BOW)
    # Get document_count, tokens, and document-index mapping from the corpora
    doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
    # Put the training data into gensim.corpora for later use
    dic = corpora.Dictionary(train_set)
    denominator = len(dic)
    # Filtering infrequent words & common stopwords, thus reducing the dimension of terms (which prevents curse of dimensionality)
    dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
    nominator = len(dic)
    corpus = [dic.doc2bow(text) for text in train_set]  # transform every token list into BOW
    print 'There are %i documents in the pool' % (doc_count)
    print "In the corpus there are ", denominator, " raw tokens"
    print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1 - float(nominator) / denominator), "%"
    print 'Finished preparing unigram tokens....'
    ## END
    print 'Start training LDA model....'
    ## Implementing TF-IDF as a vector for each document, and train LDA model on top of that
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics, iterations=self.num_of_iterations, passes=self.passes)
    corpus_lda = lda[corpus_tfidf]
    # Once done training, print all the topics and related words
    print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'
    for i in range(self.num_topics):
        print 'Topic %s : ' % (str(i)) + lda.print_topic(i)
    # Exhibit perplexity of current model under specific topic hyperparameter : k. The lower the better
    print '==============================='
    print 'Model perplexity : ', lda.bound(corpus_lda), ' when topic k =', str(self.num_topics)
    print '==============================='
    return lda, doc_mapping, link_mapping, corpus
def get_score_for_question(question_answer_word_dir, question_num, question_answer_score_label_file_dir):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index + 1) % 1000 == 1:
            print 'Now for line : ' + str(question_index + 1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_write_name = os.path.join(question_answer_score_label_file_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        list_simes = list(enumerate(sims))
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        #answer_label_list = []
        for item in list_simes:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_score = str(item[1])
            file_write = open(file_write_name, 'ab+')
            file_write.write(str(answer_label) + '\t' + str(answer_score) + '\n')
            file_write.close()
            #answer_label_list.append(answer_label)
        #DCG_score = calu_DCG(answer_label_list,k)
        #DCG_score_list.append(DCG_score)
    #DCG_avg = calu_avg_answer_length(DCG_score_list)
    #print 'DCG_avg : \t' + str(DCG_avg)
def get_score_for_question(question_answer_word_dir, question_num, k):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index + 1) % 1000 == 1:
            print 'Now for line : ' + str(question_index + 1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        answer_label_list = []
        for item in sort_sims:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_label_list.append(answer_label)
        DCG_score = calu_DCG(answer_label_list, k)
        DCG_score_list.append(DCG_score)
    DCG_avg = calu_avg_answer_length(DCG_score_list)
    print 'DCG_avg : \t' + str(DCG_avg)
def train_tfidf_model(self, file_path='../../temp_results/corpus.txt'):
    textfile = codecs.open(file_path, "r", "utf-8")
    print("Reading and Processing Text File")
    first_lines = []
    for line in textfile:
        first_lines.append(line.strip())
    print("--------Building Corpora Dictionary---------------")
    dictionary = corpora.Dictionary(line.split('#|#')[1].split() for line in first_lines)
    # remove words that appear less than 2 times
    #twoids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq < 2]
    #dictionary.filter_tokens(fiveids)
    # Remove Gaps
    dictionary.compactify()
    dictionary.save_as_text('../../temp_results/tfidf_dictionary.txt', sort_by_word=False)
    dictionary.save('../../temp_results/tfidf_dictionary')
    print("Dictionary Saved")
    print("--Now Transforming to Bag of Words Vectors on the Fly--")

    class MyCorpus(object):
        def __iter__(self):
            for line in first_lines:
                yield dictionary.doc2bow(line.split())

    news_corpus = MyCorpus()
    print("Corpus Built...Now Starting Model Training")
    tfidf_model = models.TfidfModel(news_corpus)
    tfidf_model.save('../../temp_results/tfidf_model')
    print("Model Trained & Saved")