def run_benchmark_matmul_aa_correlator_kernel(self, ntime, nstand, nchan):
x_shape = (ntime, nchan, nstand*2)
perm = [1,0,2]
x8 = ((np.random.random(size=x_shape+(2,))*2-1)*127).astype(np.int8)
x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
x = x.transpose(perm)
b_gold = np.matmul(H(x[:,[0],:]), x[:,[0],:])
triu = np.triu_indices(x_shape[-1], 1)
b_gold[..., triu[0], triu[1]] = 0
x = x8.view(bf.DataType.ci8).reshape(x_shape)
x = bf.asarray(x, space='cuda')
x = x.transpose(perm)
b = bf.zeros_like(b_gold, space='cuda')
bf.device.stream_synchronize();
t0 = time.time()
nrep = 200
for _ in xrange(nrep):
self.linalg.matmul(1, None, x, 0, b)
bf.device.stream_synchronize();
dt = time.time() - t0
nflop = nrep * nchan * ntime * nstand*(nstand+1)/2 * 2*2 * 8
print nstand, '\t', nflop / dt / 1e9, 'GFLOP/s'
print '\t\t', nrep*ntime*nchan / dt / 1e6, 'MHz'
评论列表
文章目录