def fit(self, X, y=None):
    """Compute and store one simhash fingerprint per row of X.

    Every feature (column) index is hashed once with ``self.hash_func``.
    The fingerprint of a sample is ``simhash.compute`` applied to the
    hashes of the column indices stored in that sample's row.

    The method sets ``self._fit_X`` (the validated input),
    ``self._fit_shash`` (uint64 array of fingerprints, one per row) and
    ``self._fit_shash_dict`` (fingerprint -> row index).

    Parameters
    ----------
    X : {array, sparse matrix}, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.
    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        Returns self.
    """
    from scipy.sparse import csr_matrix, issparse
    from simhash import compute

    self._fit_X = X = check_array(X, accept_sparse='csr')

    # check_array hands dense input back as an ndarray, which has no
    # ``indices`` attribute. Use a CSR view for hashing so the documented
    # dense-array input works as well; sparse input is used as is.
    X_csr = X if issparse(X) else csr_matrix(X)

    n_features = X.shape[1]
    hash_func = self.hash_func

    # One hash per feature index, computed once and looked up per row.
    # NOTE(review): the second argument (0) is presumably a seed - confirm
    # against the hash_func implementation.
    hashing_table = np.array(
        [hash_func(el, 0) for el in range(n_features)],
        dtype='uint64')

    # Loop-invariant: whether per-feature hashes must be stretched from a
    # 32-bit range before being combined.
    rescale = self.hash_func_nbytes == 32
    # Evaluates to 2**32 - 2 because of operator precedence.
    # NOTE(review): a full-range 32 -> 64 bit scaling would be
    # (2**64 - 1) // (2**32 - 1) == 2**32 + 1; the value is deliberately
    # left unchanged here so that fingerprints stay identical - confirm
    # the intent before changing it.
    scale = (2**64 - 1) // 2**32 - 1

    # Slice the CSR index array directly instead of materialising a
    # one-row matrix per sample; the column indices obtained are the same
    # as ``X[idx].indices``.
    indptr, col_indices = X_csr.indptr, X_csr.indices
    shash = []
    for start, stop in zip(indptr[:-1], indptr[1:]):
        # get hashes of indices
        mhash = hashing_table[col_indices[start:stop]]
        if rescale:
            mhash = mhash * scale
        shash.append(compute(mhash))

    self._fit_shash = np.asarray(shash, dtype='uint64')
    # When several rows share a fingerprint, the last row index wins.
    self._fit_shash_dict = {val: key
                            for key, val in enumerate(self._fit_shash)}
    return self
评论列表
文章目录