def cluster_process(filenames, key_part, s, n_clusters=2):
    """Cluster the given files on selected sections and group them by cluster.

    Each file is passed to ``extract`` together with ``key_part``; the
    extracted documents are vectorised with ``get_tfidf`` and handed to
    ``clustering``, which returns one label per document plus a score.
    Documents and file names are then grouped by label.

    Args:
        filenames: Sequence of file names; each one is passed to ``extract``.
        key_part: List of section names forwarded to ``extract``. They are
            joined with '_' and decoded as UTF-8, so they are expected to be
            UTF-8 byte strings.
        s: Prefix placed at the front of every cluster tag. It is
            interpolated as-is (NOTE(review): if it is a non-ASCII byte
            string it is still decoded implicitly -- confirm with callers).
        n_clusters: Number of clusters requested from ``clustering``; also
            the number of groups produced.

    Returns:
        A list with one ``(cluster_filenames, tag)`` tuple per cluster index
        ``0 .. n_clusters - 1``, where ``tag`` is the unicode string
        ``u'<s>_<joined key_part>_<cluster index>'``.

    Side effects:
        Prints the key, the size of each cluster and the score, and passes
        the per-cluster ``(documents, tag)`` list to ``write2file``.
    """
    documents = [extract(fname, key_part=key_part) for fname in filenames]

    # Presumably a gensim-based tf-idf representation -- see get_tfidf.
    docs = get_tfidf(documents)
    # labels holds one cluster index per document, e.g. [0, 1, 0, 1, 1, ...].
    labels, score = clustering(docs, n_clusters)

    # Decode the joined key exactly once and reuse it below. Interpolating
    # the raw byte string into a unicode format would trigger an implicit
    # ASCII decode and fail on non-ASCII section names.
    key_label = '_'.join(key_part).decode('utf-8')
    print('key_part: %s' % key_label)

    item_parts = []
    filename_parts = []
    for i in range(n_clusters):
        # Positions of the documents assigned to cluster i.
        members = [j for j, label in enumerate(labels) if label == i]
        item = [documents[j] for j in members]
        tag = u'%s_%s_%d' % (s, key_label, i)

        filename_parts.append(([filenames[j] for j in members], tag))
        item_parts.append((item, tag))
        print('class_%d:%d' % (i, len(item)))

    # The one-element tuple keeps the formatting safe if score is a tuple.
    print('score: %s' % (score,))
    print('-' * 20)

    write2file(item_parts)
    return filename_parts
kmeans_cluster.py 文件源码
python
阅读 25
收藏 0
点赞 0
评论 0
评论列表
文章目录