def main():
    """Cluster triples by the TF-IDF of their relation text and report clusters.

    Steps performed:
      1. Read tab-separated rows from ``triples.csv``; the first five columns
         are used as ``e1, e1_type, rel, e2, e2_type`` to build ``Triple``
         objects. Empty rows are skipped.
      2. Vectorize the relation strings with ``TfidfVectorizer`` and cluster
         them with ``DBSCAN`` (cosine metric).
      3. Write one label per line to ``triples_labels.txt``, read the labels
         back as ints and store each one on its triple as ``triple.label``.
      4. Print the ``{label: size}`` dict, then for every label (ascending)
         print ``"<label> <size>"`` followed by the relation of each member
         triple and two blank lines.

    Raises:
        ValueError: if a non-empty row has fewer than five columns.
    """
    # Disabled alternative input path, kept for reference. It appears to
    # cluster on precomputed embedding vectors instead of TF-IDF features.
    #
    #   compute_embeddings_vectors()
    #   print "Reading embedding vectors"
    #   with open('triples_vectors.pkl', 'r') as in_file:
    #       triples = pickle.load(in_file)
    #   vectors = []
    #   for t in triples:
    #       vectors.append(t.vector)

    relations = []
    triples = []
    with open('triples.csv', 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter='\t'):
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            e1, e1_type, rel, e2, e2_type = row[:5]
            relations.append(rel)
            triples.append(Triple(e1, e1_type, rel, e2, e2_type))

    tfidf_matrix = TfidfVectorizer().fit_transform(relations)

    # Single-argument print(...) is used throughout so the output is the same
    # whether print is a statement (Python 2) or a function (Python 3).
    print("Clustering")
    dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine', algorithm='brute',
                    leaf_size=30, p=None, n_jobs=1)
    labels = dbscan.fit_predict(tfidf_matrix)

    # Persist the labels so they are available as an artifact of the run.
    with open('triples_labels.txt', 'w') as out_file:
        for label in labels:
            out_file.write(str(label) + '\n')

    print("Reading cluster labels")
    with open('triples_labels.txt', 'r') as in_file:
        labels = [int(line.strip()) for line in in_file]

    for triple, label in zip(triples, labels):
        triple.label = label

    # clusters: label -> number of triples; members: label -> relation texts.
    clusters = {}
    members = {}
    for triple in triples:
        clusters[triple.label] = clusters.get(triple.label, 0) + 1
        members.setdefault(triple.label, []).append(triple.rel)
    print(clusters)

    # Iterate over the labels that actually occur instead of assuming they
    # form the contiguous range -1..len(clusters)-1 (the noise label -1 may
    # be absent, and when present the old range overshot by one).
    for label in sorted(clusters):
        print("%d %d" % (label, clusters[label]))
        for rel in members[label]:
            print(rel)
        print("")
        print("")
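# Source file: cluster-triples.py (Python).
# Residue from the hosting web page: views 16, favorites 0, likes 0,
# comments 0; "comment list" and "table of contents" navigation labels.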