test_norm_query.py 文件源码

python
阅读 33 收藏 0 点赞 0 评论 0

项目:search-MjoLniR 作者: wikimedia 项目源码 文件源码
def test_vectorized_jaccard_sim():
    """Check the vectorized Jaccard similarity against a naive reference.

    A straightforward pairwise implementation is evaluated on a boolean
    matrix, and its result must equal, element for element, what
    ``mjolnir.norm_query._binary_sim`` returns for the same matrix.
    """
    # The vectorized version of jaccard similarity is 20x faster, but it is
    # harder to understand. Compute it the simple way and compare to the
    # vectorized version
    def jaccard_sim(X, Y):
        # Jaccard similarity of two equal-length binary vectors.
        assert len(X) == len(Y)
        # a: positions set in both vectors; d: positions set in neither.
        a = np.sum((X == 1) & (Y == 1))
        d = np.sum((X == 0) & (Y == 0))
        # The denominator counts positions set in at least one vector.
        return a / float(len(X) - d)

    def binary_sim(mat):
        # Build the full symmetric row-vs-row similarity matrix.
        n_rows = mat.shape[0]
        out = np.empty((n_rows, n_rows), dtype=np.float64)
        for i in range(n_rows):
            out[i, i] = 1.
            # Only the lower triangle is computed; the value is mirrored.
            for j in range(i):
                out[i, j] = out[j, i] = jaccard_sim(mat[i], mat[j])
        return out

    # Simulate 200 queries with 100 shared page ids. A locally seeded
    # generator keeps the input identical on every run, so a failure can be
    # reproduced, and it leaves the global numpy random state untouched.
    rng = np.random.RandomState(0)
    matrix = rng.rand(200, 100) > 0.7
    simple = binary_sim(matrix)
    vectorized = mjolnir.norm_query._binary_sim(matrix)
    assert np.array_equal(simple, vectorized)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号