def ola_GPU(xs_gpu, sy, csf, hop):
    """Launch ``copy_Kernel`` over ``xs_gpu`` and return the output on the host.

    A float32 device array of shape ``sy`` is allocated, the kernel named
    ``"copy_Kernel"`` is loaded from the ``cubin`` buffer and launched
    ``csf[0] * csf[1]`` times, and the device array is then copied back and
    passed through ``np.real``.

    Presumably this is the overlap-add ("ola") step that reassembles an
    image from patches -- TODO confirm against the kernel source.

    Args:
        xs_gpu: Device array handed to the kernel as its input buffer.
        sy: Shape of the output array; ``sy[0]`` is also passed to the kernel.
        csf: Two-element sequence; ``csf[0]`` and ``csf[1]`` give the outer
            and inner launch counts.
        hop: Not referenced anywhere in this function.

    Returns:
        ``np.real`` applied to the host copy of the output device array.

    NOTE(review): ``sx``, ``sz``, ``offset``, ``startrow``, ``cubin``,
    ``cua`` and ``cu`` are not bound inside this function, so they must be
    provided by the enclosing module. The loop indices are not used either,
    so every launch receives identical arguments -- it looks like the
    per-patch ``offset``/``startrow`` computation is missing; verify against
    the upstream implementation.
    """
    # NOTE(review): if ``cua`` is pycuda.gpuarray, ``empty`` does not
    # initialise the buffer -- confirm the kernel writes every element.
    out_gpu = cua.empty(sy, np.float32)

    threads = (16, 16, 1)
    # First grid dimension is sized from sx[0] * sz[0] against threads[1],
    # the second from sz[1] against threads[0]; both are rounded up.
    grid_x = int(np.ceil(np.float32(sx[0] * sz[0]) / threads[1]))
    grid_y = int(np.ceil(np.float32(sz[1]) / threads[0]))
    blocks = (grid_x, grid_y)

    module = cu.module_from_buffer(cubin)
    launch_copy = module.get_function("copy_Kernel")

    for _outer in range(csf[0]):
        for _inner in range(csf[1]):
            # NOTE(review): sy[0] is passed for both output-size arguments;
            # confirm the second one is not meant to be sy[1].
            kernel_args = (
                out_gpu,
                np.uint32(sy[0]),
                np.uint32(sy[0]),
                xs_gpu,
                np.uint32(sx[0]),
                np.uint32(sx[1]),
                np.uint32(sx[2]),
                np.uint32(offset[0]),
                np.uint32(offset[1]),
                np.uint32(startrow),
            )
            launch_copy(*kernel_args, block=threads, grid=blocks)

    host_result = out_gpu.get()
    return np.real(host_result)
评论列表
文章目录