def compute_suggested_adv_keyterms_dataset(relative_dataset_filename, min_members_per_cluster = 5):
    """Suggest top adversarial keyterms for every sufficiently large cluster in a dataset.

    Loads the per-URL keyterm-cluster dataset ``dataset/keyterm_clustering/<relative_dataset_filename>.json``
    and the precomputed top adversarial keyterm clusters, then for each cluster with at
    least ``min_members_per_cluster`` members computes suggested adversarial keyterms via
    ``suggest_top_adv_keyterms``. Results are written to
    ``dataset/keyterm_clustering/<relative_dataset_filename>_suggested_adv.xlsx`` (sheet
    "adv_matching") and also returned.

    :param relative_dataset_filename: dataset file name without directory or ".json" extension
    :param min_members_per_cluster: minimum cluster size ('len' field) to be processed
    :return: pandas DataFrame with one row per processed cluster
             (columns: url, cl_idx, cl_center, cl_len, suggested_keyterms)
    """
    filepath = "dataset/keyterm_clustering/" + relative_dataset_filename + ".json"
    top_adv_clusters_filepath = "dataset/keyterm_clustering/top_adv_keyterm_clusters.dump"

    ## load dataset and embedding model
    print("Loading Embedding model ...")
    embedding_model = load_embedding_model(True)

    print("Loading datasets ...")
    # BUGFIX: the .dump file is a binary numpy dump — it must be opened in binary
    # mode; text mode makes np.load fail (or corrupt data) on most platforms.
    with open(top_adv_clusters_filepath, "rb") as fp:
        top_adv_clusters = np.load(fp)
    with open(filepath) as fp:
        df = pd.read_json(fp)

    ## compute suggestions for each large-enough cluster of each URL
    result_dataset = []
    print("Starting computation ...")
    for index, row in df.iterrows():
        url = row['url']
        print("Processing clusters for URL: " + url + " ...")
        for cl_data in row['clusters']:
            # skip small clusters — too few members to yield reliable suggestions
            if cl_data['len'] >= min_members_per_cluster:
                suggested_keyterms = suggest_top_adv_keyterms(cl_data, top_adv_clusters, embedding_model)
                result_dataset.append({
                    'url': url,
                    'cl_idx': cl_data['idx'],
                    'cl_center': cl_data['center'],
                    'cl_len': cl_data['len'],
                    'suggested_keyterms': suggested_keyterms
                })

    ## persist results as an Excel sheet and return the DataFrame
    df_matching = pd.DataFrame.from_records(result_dataset)
    writer = pd.ExcelWriter("dataset/keyterm_clustering/" + relative_dataset_filename + "_suggested_adv" + ".xlsx")
    try:
        df_matching.to_excel(writer, "adv_matching")
    finally:
        # save() also closes the writer; the finally guarantees the file handle
        # is released even when to_excel raises.
        writer.save()
    return df_matching
# Source: keyterm_clustering.py (scraped page footer — view counters and
# table-of-contents labels removed; they were not part of the program).