# test_taar_similarity.py -- Python source code snippet

def test_similarity():
    """Exercise ``taar_similarity.similarity_function`` on hand-built user rows.

    The cases cover: a user compared with itself, users that differ only in
    their string-valued (categorical) fields, users that differ only in their
    numeric (continuous) fields, users that differ in both, and a user whose
    row contains ``None`` values.
    """
    # Row factory: the positional values passed below are matched to these
    # field names in order.
    UserDataRow = Row(
        "normalized_channel", "geo_city", "subsession_length", "os", "locale",
        "active_addons", "bookmark_count", "tab_open_count", "total_uri", "unique_tlds"
    )

    test_user_1 = UserDataRow("release", "Boston", 10, "Windows", "en-US", [], 1, 2, 3, 4)
    test_user_2 = UserDataRow("release", "notsoB", 10, "swodniW", "SU-ne", [], 1, 2, 3, 4)
    test_user_3 = UserDataRow("release", "Boston", 0, "Windows", "en-US", [], 0, 0, 0, 0)
    test_user_4 = UserDataRow("release", "notsoB", 0, "swodniW", "SU-ne", [], 0, 0, 0, 0)
    # The following user contains a None value for "tab_open_count" (the
    # eighth positional field) and for "geo_city" (categorical feature).
    # The latter should never be possible, but let's be cautious.
    test_user_5 = UserDataRow("release", None, 10, "swodniW", "SU-ne", [], 1, None, 3, 4)

    # Identical users should be very close (0 distance) and the result must not
    # be a Numpy number.
    similarity_result = taar_similarity.similarity_function(test_user_1, test_user_1)
    assert not isinstance(similarity_result, np.generic)
    assert np.isclose(similarity_result, 0.0)
    # Users with completely different categorical features but identical
    # continuous features should be slightly different.
    assert np.isclose(taar_similarity.similarity_function(test_user_1, test_user_2), 0.001)
    # Users with completely different continuous features but identical
    # categorical features should be very close.
    assert np.isclose(taar_similarity.similarity_function(test_user_1, test_user_3), 0.0)
    # Completely different users should be far away.
    assert taar_similarity.similarity_function(test_user_1, test_user_4) >= 1.0
    # Partial user information should not break the similarity function.
    # Check for a usable number explicitly: a bare truthiness assertion would
    # accept NaN (which is truthy) and reject a legitimate 0.0 distance.
    partial_result = taar_similarity.similarity_function(test_user_1, test_user_5)
    assert partial_result is not None and np.isfinite(partial_result)