def train_tfidf_model(self,file_path='../../temp_results/corpus.txt'):
    """Train a TF-IDF model from a text corpus file and save it to disk.

    Each line of the corpus is expected to contain fields separated by
    ``#|#``; the second field holds the whitespace-separated tokens of one
    document. Lines without that separator (for example blank lines) are
    skipped instead of raising ``IndexError``.

    The method writes three artifacts under ``../../temp_results/``:
    ``tfidf_dictionary.txt`` (text form of the dictionary),
    ``tfidf_dictionary`` (saved dictionary) and ``tfidf_model`` (saved
    model).

    NOTE(review): ``corpora`` and ``models`` are presumably
    ``gensim.corpora`` and ``gensim.models`` imported at file level --
    confirm against the module's imports.

    Args:
        file_path: Path to the UTF-8 encoded corpus file.

    Returns:
        None. The results are persisted to disk only.
    """
    separator = '#|#'
    print("Reading and Processing Text File")
    texts = []
    skipped = 0
    # The context manager guarantees the file handle is closed even if
    # decoding fails part-way through the file.
    with codecs.open(file_path, "r", "utf-8") as textfile:
        for line in textfile:
            fields = line.strip().split(separator)
            if len(fields) < 2:
                # No document field on this line; nothing to index.
                skipped += 1
                continue
            # Keep only the document text (second field), as a string, so
            # memory use stays comparable to holding the raw lines.
            texts.append(fields[1])
    if skipped:
        print("Skipped %d line(s) without a '#|#' field separator" % skipped)

    print ("--------Building Corpora Dictionary---------------" )
    dictionary = corpora.Dictionary(text.split() for text in texts)
    # Reassign token ids so that they form a gap-free range.
    dictionary.compactify()
    dictionary.save_as_text('../../temp_results/tfidf_dictionary.txt',sort_by_word=False)
    dictionary.save('../../temp_results/tfidf_dictionary')
    print("Dictionary Saved")

    print ("--Now Transforming to Bag of Words Vectors on the Fly--")

    class MyCorpus(object):
        """Re-iterable stream of bag-of-words vectors, one per document."""

        def __iter__(self):
            for text in texts:
                # Tokenize exactly as for the dictionary above; splitting
                # the whole raw line would fuse the first token with the
                # id/separator prefix and drop it from the vector.
                yield dictionary.doc2bow(text.split())

    news_corpus = MyCorpus()
    print("Corpus Built...Now Starting Model Training")
    tfidf_model = models.TfidfModel(news_corpus)
    tfidf_model.save('../../temp_results/tfidf_model')
    print("Model Trained & Saved")
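# Web-page residue from the page this code was copied from
# ("Comment list" / "Table of contents"); not part of the program.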