def laplace_gpu(y_gpu, mode='valid'):
    """Run the ``laplace_valid`` / ``laplace_same`` CUDA kernel on ``y_gpu``.

    The kernel source (``kernel_code``) and the preprocessor prelude built by
    ``_generate_preproc`` are defined elsewhere in this file; judging by the
    names this is presumably a discrete Laplacian stencil -- confirm against
    the kernel source.

    Args:
        y_gpu: 2-D device array. It must expose ``shape``, ``dtype`` and
            ``gpudata`` (NOTE(review): presumably a
            ``pycuda.gpuarray.GPUArray`` -- confirm against callers).
        mode: ``'valid'`` launches ``laplace_valid`` and returns an array
            that is 2 smaller along each axis; ``'same'`` launches
            ``laplace_same`` and returns an array with the input's shape.

    Returns:
        A newly allocated device array (via ``cua.empty``) with the dtype of
        ``y_gpu``, written by the selected kernel.

    Raises:
        ValueError: if ``mode`` is neither ``'valid'`` nor ``'same'``.
    """
    # Resolve the mode before any GPU work: an unknown mode previously fell
    # through both branches and failed later with an unbound-local error,
    # after the kernel had already been compiled.
    if mode == 'valid':
        kernel_name = "laplace_valid"
        out_shape = (y_gpu.shape[0] - 2, y_gpu.shape[1] - 2)
    elif mode == 'same':
        kernel_name = "laplace_same"
        out_shape = (y_gpu.shape[0], y_gpu.shape[1])
    else:
        raise ValueError(
            "mode must be 'valid' or 'same', got %r" % (mode,))

    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype

    block_size = (16, 16, 1)
    # Enough blocks to cover the whole input: axis 1 is mapped to the block
    # x-dimension, axis 0 to the block y-dimension.
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))
    # Bytes of dynamic shared memory: one element per thread of the block
    # plus 2 extra elements along each axis (presumably a one-element halo
    # on every side of the tile -- confirm against the kernel source).
    shared_size = int((2 + block_size[0]) * (2 + block_size[1])
                      * dtype.itemsize)

    # The prelude depends on dtype and shape, so the module is rebuilt from
    # source on every call.
    preproc = _generate_preproc(dtype, shape)
    mod = SourceModule(preproc + kernel_code, keep=True)

    laplace_fun_gpu = mod.get_function(kernel_name)
    out_gpu = cua.empty(out_shape, y_gpu.dtype)

    laplace_fun_gpu(out_gpu.gpudata, y_gpu.gpudata,
                    block=block_size, grid=grid_size, shared=shared_size)

    return out_gpu
评论列表
文章目录