def newDBSCANModel(vectorFile, outputFile, eps=0.03):
    """Cluster the document vectors of a saved Doc2Vec model with DBSCAN.

    Loads the model, fits a cosine-distance DBSCAN on every document vector,
    persists the fitted estimator, and writes a per-document cluster table.

    Args:
        vectorFile: File name of the saved Doc2Vec model; it is appended to
            the literal prefix ``"Models\\"``.
        outputFile: Destination passed to ``joblib.dump`` for the fitted
            DBSCAN estimator.
        eps: DBSCAN neighbourhood radius (cosine distance). Defaults to the
            previously hard-coded value of 0.03.

    Returns:
        None. Side effects: writes ``outputFile`` and ``DBSCAN.csv`` (in the
        current working directory), and prints the table and the cluster
        count to stdout.
    """
    # NOTE(review): the backslash separator is only a directory separator on
    # Windows. Left unchanged so this keeps matching whatever code saved the
    # model under the same literal prefix -- confirm before making portable.
    model = Doc2Vec.load("Models\\" + vectorFile)

    # Build the (n_docs, vector_size) matrix directly. The width comes from
    # the model itself instead of assuming 300-dimensional vectors.
    # docvecs is indexed by integer offset here, as in the original code.
    doc_count = len(model.docvecs)
    doc_vecs = np.array(
        [model.docvecs[offset] for offset in range(doc_count)],
        dtype='float',
    )

    # Brute-force neighbour search is used together with the cosine metric.
    db = DBSCAN(eps=eps, algorithm="brute", metric='cosine').fit(doc_vecs)
    joblib.dump(db, outputFile)

    labels = db.labels_
    # The label -1 is counted as "no cluster" and excluded from the total.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = labels.tolist()

    doc_tags = model.docvecs.offset2doctag
    cluster_info = {
        'labels': doc_tags,
        # Per-tag entry from docvecs.doctags; presumably (index, word count,
        # repeat count) as the column name suggests -- TODO confirm.
        "index, wordcount and repeated words": [
            model.docvecs.doctags[tag] for tag in doc_tags
        ],
        'clusters': clusters,
    }
    # NOTE(review): index=[clusters] (a list wrapping the list) is kept
    # exactly as before so the layout of DBSCAN.csv does not change.
    sentenceDF = pd.DataFrame(
        cluster_info,
        index=[clusters],
        columns=['labels', "index, wordcount and repeated words", 'clusters'],
    )
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")
    print('Estimated number of clusters: %d' % n_clusters_)
# [non-code page text] Comment list
# [non-code page text] Table of contents