def test_similarity():
    """Exercise ``taar_similarity.similarity_function`` on hand-built user rows.

    Five rows are built from the same ``Row`` schema. They are compared
    pairwise to check that:

    * a row compared with itself yields (approximately) 0 and a plain Python
      number rather than a Numpy scalar;
    * rows differing only in their categorical values yield approximately
      0.001;
    * rows differing only in their continuous values yield approximately 0;
    * rows differing in both yield a value of at least 1.0;
    * a row containing ``None`` values still yields a truthy result.
    """
    UserDataRow = Row(
        "normalized_channel", "geo_city", "subsession_length", "os", "locale",
        "active_addons", "bookmark_count", "tab_open_count", "total_uri", "unique_tlds"
    )

    test_user_1 = UserDataRow("release", "Boston", 10, "Windows", "en-US", [], 1, 2, 3, 4)
    test_user_2 = UserDataRow("release", "notsoB", 10, "swodniW", "SU-ne", [], 1, 2, 3, 4)
    test_user_3 = UserDataRow("release", "Boston", 0, "Windows", "en-US", [], 0, 0, 0, 0)
    test_user_4 = UserDataRow("release", "notsoB", 0, "swodniW", "SU-ne", [], 0, 0, 0, 0)
    # The following user contains a None value for "tab_open_count" (the
    # eighth field) and for geo_city (categorical feature). The latter should
    # never be possible, but let's be cautious.
    test_user_5 = UserDataRow("release", None, 10, "swodniW", "SU-ne", [], 1, None, 3, 4)

    # Identical users should be very close (0 distance) and the result must not
    # be a Numpy number.
    similarity_result = taar_similarity.similarity_function(test_user_1, test_user_1)
    assert not isinstance(similarity_result, np.generic)
    assert np.isclose(similarity_result, 0.0)

    # Users with completely different categorical features but identical
    # continuous features should be slightly different.
    assert np.isclose(taar_similarity.similarity_function(test_user_1, test_user_2), 0.001)

    # Users with completely different continuous features but identical
    # categorical features should be very close.
    assert np.isclose(taar_similarity.similarity_function(test_user_1, test_user_3), 0.0)

    # Completely different users should be far away.
    assert taar_similarity.similarity_function(test_user_1, test_user_4) >= 1.0

    # Partial user information should not break the similarity function.
    # NOTE(review): this only checks that the result is truthy (non-zero), so
    # a NaN result would also pass. Consider a stricter check (e.g. finiteness)
    # once the function's return contract is confirmed.
    assert taar_similarity.similarity_function(test_user_1, test_user_5)
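# NOTE(review): the two lines originally here ("Comment list" and "Article
# table of contents") appear to be web-page residue, not part of the test.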