def calculate_similarity(self, article1, article2, type):
    """
    Calculate the similarity between two articles with the metric configured on the instance.

    The metric is selected by ``self.type``: ``'Cos'`` delegates to
    ``self.cosine_similarity`` and ``'Euc'`` delegates to
    ``self.euclidean_distance``.

    :param article1: coordinates (feature values) of article 1
    :param article2: coordinates (feature values) of article 2
    :param type: ignored; the metric is read from ``self.type``. The parameter
        is kept only so existing callers keep working.
    :return: the metric value rounded to two decimal places, as a built-in float
    :raises ValueError: if ``self.type`` is not one of the supported metrics
    """
    if self.type == 'Cos':
        similarity = self.cosine_similarity(article1, article2)  # Cosine similarity formula
    elif self.type == 'Euc':
        similarity = self.euclidean_distance(article1, article2)  # Euclidean distance formula
    else:
        # NOTE: a 'Jac' (Jaccard) branch existed in the original only as disabled code,
        # so it is treated as unsupported here. Failing loudly replaces the
        # UnboundLocalError the original raised for any unrecognised metric.
        raise ValueError(
            "Unsupported similarity type {0!r}; expected 'Cos' or 'Euc'.".format(self.type)
        )
    # Going through the formatted string always yields a plain Python float
    # with at most two decimals, whatever numeric type the metric returned.
    return float("{0:.2f}".format(round(similarity, 2)))
python类jaccard()的实例源码
def calculate_similarity(self, article1, article2, type):
    """
    Calculate the similarity between two articles, e.g. the cosine similarity or the Euclidean distance.

    Which metric is used depends on ``self.type`` (``'Cos'`` or ``'Euc'``);
    the computation itself is delegated to ``self.cosine_similarity`` or
    ``self.euclidean_distance`` respectively.

    :param article1: coordinates (feature values) of article 1
    :param article2: coordinates (feature values) of article 2
    :param type: not used by this method (``self.type`` decides the metric);
        retained for backward compatibility with existing call sites.
    :return: the metric value rounded to two decimal places, as a built-in float
    :raises ValueError: if ``self.type`` is neither ``'Cos'`` nor ``'Euc'``
    """
    metric = self.type
    if metric == 'Cos':
        raw_value = self.cosine_similarity(article1, article2)  # Cosine similarity formula
    elif metric == 'Euc':
        raw_value = self.euclidean_distance(article1, article2)  # Euclidean distance formula
    else:
        # The Jaccard ('Jac') option was only present as disabled code, so any
        # other value is rejected explicitly instead of crashing later with an
        # UnboundLocalError on the result variable.
        raise ValueError(
            "Unsupported similarity type {0!r}; expected 'Cos' or 'Euc'.".format(metric)
        )
    # Format-then-parse keeps the original contract: a plain Python float
    # with at most two decimal places.
    formatted = "{0:.2f}".format(round(raw_value, 2))
    return float(formatted)
def features(self, q1, q2):
    """
    Build a tuple of distance/shape features for a pair of questions.

    Both inputs are converted to ``str``, lower-cased, split on whitespace and
    stripped of tokens found in the module-level ``stopwords``. The word
    mover's distance comes from ``self.model.wmdistance`` and is capped at 10.
    Sentence vectors come from ``self.sent2vec``; when either vector is
    ``None`` the seven vector distances are reported as ``-1`` and the four
    skew/kurtosis values as ``0``.

    :param q1: first question (any object; ``str()`` is applied)
    :param q2: second question (any object; ``str()`` is applied)
    :return: ``(wmd, cos, city, jacc, canb, eucl, mink, bray,
        q1_skew, q2_skew, q1_kurt, q2_kurt)``
    """
    tokens1 = [tok for tok in str(q1).lower().split() if tok not in stopwords]
    tokens2 = [tok for tok in str(q2).lower().split() if tok not in stopwords]

    # Cap the word mover's distance at 10.
    wmd = min(self.model.wmdistance(tokens1, tokens2), 10)

    vec1 = self.sent2vec(tokens1)
    vec2 = self.sent2vec(tokens2)
    if vec1 is None or vec2 is None:
        # No usable sentence vector: placeholder values for every vector feature.
        return (wmd, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0)

    pair_distances = (
        cosine(vec1, vec2),
        cityblock(vec1, vec2),
        jaccard(vec1, vec2),
        canberra(vec1, vec2),
        euclidean(vec1, vec2),
        minkowski(vec1, vec2, 3),
        braycurtis(vec1, vec2),
    )
    shape_stats = (skew(vec1), skew(vec2), kurtosis(vec1), kurtosis(vec2))
    return (wmd,) + pair_distances + shape_stats
def features(self, q1, q2):
    """
    Build a tuple of distance/shape features for a pair of questions.

    Each input is converted to ``str``, lower-cased, split on whitespace and
    filtered against the module-level ``stopwords``. Two word mover's
    distances are computed, one with ``self.model`` and one with
    ``self.model_norm``, each capped at 10. Sentence vectors come from
    ``self.sent2vec``; if either is ``None`` the seven vector distances fall
    back to ``-1`` and the skew/kurtosis values to ``0``.

    :param q1: first question (any object; ``str()`` is applied)
    :param q2: second question (any object; ``str()`` is applied)
    :return: ``(wmd, wmd_norm, cos, city, jacc, canb, eucl, mink, bray,
        q1_skew, q2_skew, q1_kurt, q2_kurt)``
    """
    words_a = [word for word in str(q1).lower().split() if word not in stopwords]
    words_b = [word for word in str(q2).lower().split() if word not in stopwords]

    # Both word mover's distances are capped at 10.
    wm_distances = (
        min(self.model.wmdistance(words_a, words_b), 10),
        min(self.model_norm.wmdistance(words_a, words_b), 10),
    )

    vec_a = self.sent2vec(words_a)
    vec_b = self.sent2vec(words_b)
    if vec_a is None or vec_b is None:
        # Missing sentence vector: -1 for each distance, 0 for each moment.
        return wm_distances + (-1,) * 7 + (0,) * 4

    vector_distances = (
        cosine(vec_a, vec_b),
        cityblock(vec_a, vec_b),
        jaccard(vec_a, vec_b),
        canberra(vec_a, vec_b),
        euclidean(vec_a, vec_b),
        minkowski(vec_a, vec_b, 3),
        braycurtis(vec_a, vec_b),
    )
    moments = (skew(vec_a), skew(vec_b), kurtosis(vec_a), kurtosis(vec_b))
    return wm_distances + vector_distances + moments
def getDistanceFunction(requested_metric):
    """
    This function returns a specified distance function.
    **PARAMETERS**
    :'requested_metric': name of the metric, e.g. 'hamming' or 'euclidean'. Must be one of the names listed in ``supported_metrics`` below, i.e. functions from https://docs.scipy.org/doc/scipy/reference/spatial.distance.html which only require u and v as input.
    **OUTPUT**
    returns distance function (as function)
    **RAISES**
    ValueError if the metric name is not supported, or if the installed SciPy no longer provides it.
    **HISTORY**
    :Created: Dec 2016, WHT
    :Updated (v0.2.1): Aug 2017, WHT. Changed from distance functions being in misc to using scipy.
    """
    supported_metrics = frozenset([
        'braycurtis',
        'canberra',
        'chebyshev',
        'cityblock',
        'correlation',
        'cosine',
        'euclidean',
        'sqeuclidean',
        'dice',
        'hamming',
        'jaccard',
        'kulsinski',
        'matching',
        'rogerstanimoto',
        'russellrao',
        'sokalmichener',
        'sokalsneath',
        'yule',
    ])
    # 'matching' was a deprecated synonym for 'hamming' in SciPy; fall back to
    # the canonical function when the alias is no longer available.
    fallback_names = {'matching': 'hamming'}

    if requested_metric not in supported_metrics:
        raise ValueError('Distance function cannot be found.')

    # Resolve only the requested attribute. Building a dict of every function
    # up front fails with AttributeError on SciPy releases that removed some of
    # them, even when the caller asked for a metric that still exists.
    distance_function = getattr(distance, requested_metric, None)
    if distance_function is None and requested_metric in fallback_names:
        distance_function = getattr(distance, fallback_names[requested_metric], None)
    if distance_function is None:
        raise ValueError('Distance function cannot be found.')
    return distance_function
btm_dis_features.py 文件源码
项目:zhihu-machine-learning-challenge-2017
作者: HouJP
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def _write_btm_distance_rows(doc_vec, topic_btm_vec, dis_funcs, out_files, topic_num=1999):
    """Write one CSV row per distance function: the distance from doc_vec to each of the first topic_num topic vectors."""
    for (dis_name, dis_func), out_f in zip(dis_funcs, out_files):
        if 'minkowski' == dis_name:
            # minkowski takes the order p as a third argument; p=3 is used here.
            row = [dis_func(doc_vec, topic_btm_vec[topic_id], 3) for topic_id in range(topic_num)]
        else:
            row = [dis_func(doc_vec, topic_btm_vec[topic_id]) for topic_id in range(topic_num)]
        out_f.write('%s\n' % ','.join([str(num) for num in row]))


def generate(config, argv):
    """
    Generate BTM distance features between every document vector and the topic vectors.

    For each distance function (cosine, cityblock, jaccard, canberra,
    euclidean, minkowski with p=3, braycurtis) a file
    ``<dataset_pt>/vote_fs_btm_dis_<name>.<data_name>.csv`` is written with one
    line per document, holding 1999 comma-separated distances (one per topic).

    :param config: configuration object; ``config.get(section, option)`` is used
        to read ``DIRECTORY/index_pt``, ``DIRECTORY/dataset_pt`` and
        ``TITLE_CONTENT_CNN/valid_index_offline_fn``.
    :param argv: argument list; ``argv[0]`` is the data name. ``'offline'``
        loads the document vectors through ``load_features_from_file``; any
        other value streams them from ``fs_btm_tw_cw.<data_name>.csv``.
    :return: None
    """
    # load valid dataset index
    valid_index_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                              config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index = DataUtil.load_vector(valid_index_fp, 'int')
    # shift every index down by one (presumably 1-based -> 0-based; confirm with the index file format)
    valid_index = [num - 1 for num in valid_index]

    # load topic btm vec
    topic_btm_vec = load_topic_btm_vec(config)

    # offline / online
    data_name = argv[0]

    # Direct references replace eval() on the metric names; the functions are
    # resolved from the same module-level names the eval() calls used.
    dis_funcs = [("cosine", cosine),
                 ("cityblock", cityblock),
                 ("jaccard", jaccard),
                 ("canberra", canberra),
                 ("euclidean", euclidean),
                 ("minkowski", minkowski),
                 ("braycurtis", braycurtis)]

    dataset_pt = config.get('DIRECTORY', 'dataset_pt')
    btm_dis_feature_f = []
    try:
        # Open the outputs one at a time inside the try block so that every
        # handle opened so far is closed even if a later open() or the
        # feature computation raises.
        for dis_name, _ in dis_funcs:
            btm_dis_feature_f.append(
                open('%s/%s.%s.csv' % (dataset_pt, 'vote_fs_btm_dis_%s' % dis_name, data_name), 'w'))

        if 'offline' == data_name:
            btm_tw_cw_features = load_features_from_file(config, 'fs_btm_tw_cw', data_name, valid_index)
            LogUtil.log('INFO', 'load_features_from_file, len=%d' % len(btm_tw_cw_features))
            for line_id in range(len(btm_tw_cw_features)):
                _write_btm_distance_rows(btm_tw_cw_features[line_id], topic_btm_vec,
                                         dis_funcs, btm_dis_feature_f)
        else:
            btm_vec_fp = '%s/fs_btm_tw_cw.%s.csv' % (dataset_pt, data_name)
            # The context manager closes the input file, which was previously never closed.
            with open(btm_vec_fp, 'r') as btm_vec_f:
                for line in btm_vec_f:
                    # Replace NaN/inf entries with finite numbers before computing distances.
                    doc_vec = np.nan_to_num(parse_feature_vec(line))
                    _write_btm_distance_rows(doc_vec, topic_btm_vec, dis_funcs, btm_dis_feature_f)
    finally:
        for f in btm_dis_feature_f:
            f.close()