def similarity_function(x, y):
""" Similarity function for comparing user features.
This actually really should be implemented in taar.similarity_recommender
and then imported here for consistency.
"""
def safe_get(field, row, default_value):
# Safely get a value from the Row. If the value is None, get the
# default value.
return row[field] if row[field] is not None else default_value
# Extract the values for the categorical and continuous features for both
# the x and y samples. Use an empty string as the default value for missing
# categorical fields and 0 for the continuous ones.
x_categorical_features = [safe_get(k, x, "") for k in CATEGORICAL_FEATURES]
x_continuous_features = [safe_get(k, x, 0) for k in CONTINUOUS_FEATURES]
y_categorical_features = [safe_get(k, y, "") for k in CATEGORICAL_FEATURES]
y_continuous_features = [safe_get(k, y, 0) for k in CONTINUOUS_FEATURES]
# Here a larger distance indicates a poorer match between categorical variables.
j_d = (distance.hamming(x_categorical_features, y_categorical_features))
j_c = (distance.canberra(x_continuous_features, y_continuous_features))
# Take the product of similarities to attain a univariate similarity score.
# Add a minimal constant to prevent zero values from categorical features.
# Note: since both the distance function return a Numpy type, we need to
# call the |item| function to get the underlying Python type. If we don't
# do that this job will fail when performing KDE due to SPARK-20803 on
# Spark 2.2.0.
return abs((j_c + 0.001) * j_d).item()
评论列表
文章目录