# Assumed module-level imports (not shown in this excerpt):
#   import time
#   import jieba
#   from collections import defaultdict
#   from gensim import corpora, models, similarities
def __init__(self, itemInfos):
    # itemInfos: dict mapping pid -> item description text
    lastTime = time.time()

    # Load the extra user dictionary so jieba segments domain terms correctly.
    jieba.load_userdict('./dict.txt.big.txt')

    # Build the stop-word set (the file is GBK-encoded, one word per line).
    stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
    stopWords.add('\n')
    stopWords.add(' ')
    stopWords.add(u'\u2022')  # bullet character
    stopWords.add(u'\xa9')    # copyright sign

    # Segment every description and remember the pid <-> row-id mapping.
    texts = []
    self.name2id = {}
    self.id2name = []
    for k, v in itemInfos.iteritems():
        seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
        texts.append(list(seg_list))
        self.name2id[k] = len(self.id2name)
        self.id2name.append(k)

    # Drop tokens that appear only once in the whole collection.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    print "preprocessing cost :", (time.time() - lastTime)
    lastTime = time.time()

    # Map tokens to integer ids and convert each document to a bag of words.
    dictionary = corpora.Dictionary(texts)
    print "dictionary cost :", (time.time() - lastTime)
    lastTime = time.time()
    corpus = [dictionary.doc2bow(text) for text in texts]
    print "doc2bow cost :", (time.time() - lastTime)
    lastTime = time.time()

    # Train the TF-IDF model and re-weight the corpus with it.
    tfidf = models.TfidfModel(corpus)
    print "tfidf model cost :", (time.time() - lastTime)
    lastTime = time.time()
    corpus_tfidf = tfidf[corpus]
    print "tfidf corpus cost :", (time.time() - lastTime)
    lastTime = time.time()

    # Train a 100-topic LSI model on the TF-IDF-weighted corpus.
    self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
    print "lsi model cost :", (time.time() - lastTime)
    lastTime = time.time()

    # Index the documents in LSI space; the TF-IDF corpus is fed through the
    # LSI model so the index matches the space the model was trained in
    # (the original passed the raw bag-of-words corpus here).
    self.index = similarities.MatrixSimilarity(self.lsi[corpus_tfidf])
    self.corpus = corpus
    self.pidName = getPidName()  # helper defined elsewhere
    print "init finish"