import numpy as np


def _binary_sim(matrix):
"""Compute a jaccard similarity matrix.

Vectorization based on: https://stackoverflow.com/a/40579567

Parameters
----------
matrix : np.array
    Binary matrix of shape (n_rows, n_cols); each row is compared
    against every other row.

Returns
-------
np.array
    Matrix of shape (n_rows, n_rows) giving the Jaccard similarity
    between rows of the input matrix.
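
Examples
--------
A small worked example on a 3x3 binary matrix:

>>> m = np.array([[1, 1, 0],
...               [1, 0, 1],
...               [1, 1, 1]])
>>> sim = _binary_sim(m)
>>> float(sim[0, 1])  # rows 0 and 1 share 1 of their 3 combined items
0.3333333333333333
>>> float(sim[0, 0])  # self-similarity is always 1
1.0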
"""
# Generate the indices of the lower triangle of our result matrix.
# The diagonal is offset by -1 because self-similarity is always 1,
# so there is no need to compute it.
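# For a 4-row input, for example, this yields the row/column index pairs
# (1, 0), (2, 0), (2, 1), (3, 0), (3, 1), (3, 2).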
r, c = np.tril_indices(matrix.shape[0], -1)
# Particularly large groups can blow out memory usage. Chunk the calculation
# into steps that require no more than ~100MB of memory at a time.
# We have four 2d intermediate arrays in memory at any given time, plus the
# input and output. Approximate bytes used per chunk:
#   p1           = max_rows * matrix.shape[1]
#   p2           = max_rows * matrix.shape[1]
#   intersection = max_rows * matrix.shape[1] * 4
#   union        = max_rows * matrix.shape[1] * 8
# This adds up to:
#   memory usage = max_rows * matrix.shape[1] * 14
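# As a concrete (hypothetical) example, a 10,000-column input allows roughly
# 100MB / (14 * 10,000) ≈ 748 row pairs per batch.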
mem_limit = 100 * pow(2, 20)
max_rows = mem_limit // (14 * matrix.shape[1])
# Build the result matrix with 1's on the diagonal.
out = np.eye(matrix.shape[0])
for c_batch, r_batch in _batch(c, r, max_rows):
# Use those indices to build two matrices that contain all
# the rows we need to compare for similarity.
p1 = matrix[c_batch]
p2 = matrix[r_batch]
# Run the main jaccard calculation
intersection = np.logical_and(p1, p2).sum(1)
union = np.logical_or(p1, p2).sum(1).astype(np.float64)
# Insert the results of the similarity calculations at their original indices.
out[c_batch, r_batch] = intersection / union
# The above only populated one triangle of the matrix; the mirrored half
# still contains only zeros. Fix that up by adding the transpose. Adding the
# transposed matrix double counts the diagonal, so subtract that back out.
# We could skip this step and leave half the matrix empty, but it takes a
# fraction of a ms to be correct even on mid-sized inputs (~200 queries).
return out + out.T - np.diag(np.diag(out))
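

# ``_batch`` is used above but defined elsewhere in the original module. A
# minimal sketch of the behaviour relied on here (yielding the two index
# arrays in aligned chunks of at most ``max_rows`` pairs) could look like the
# following. This is an illustrative assumption, not the original helper.
def _batch(a, b, max_rows):
    """Yield aligned slices of ``a`` and ``b`` at most ``max_rows`` long.

    Hypothetical stand-in; the original implementation may differ.
    """
    step = max(1, int(max_rows))
    for start in range(0, len(a), step):
        yield a[start:start + step], b[start:start + step]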