Cython函数比纯Python需要更多时间
我正在尝试加速我的代码,这部分给我带来了问题,
我尝试使用Cython,然后按照此处给出的建议进行操作,但是我的纯python函数的性能优于cython和cython_optimized函数
cython代码如下:
import numpy as np
cimport numpy as np
DTYPE = np.float
ctypedef np.float_t DTYPE_t
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray DensityProfile):
assert u.dtype == DTYPE
assert PorosityProfile.dtype == DTYPE
assert DensityIceProfile.dtype == DTYPE
assert DensityDustProfile.dtype == DTYPE
assert DensityProfile.dtype == DTYPE
cdef float DustJ = 250.0
cdef float DustF = 633.0
cdef float DustG = 2.513
cdef float DustH = -2.2e-3
cdef float DustI = -2.8e-6
cdef float IceI = 273.16
cdef float IceC = 1.843e5
cdef float IceD = 1.6357e8
cdef float IceE = 3.5519e9
cdef float IceF = 1.6670e2
cdef float IceG = 6.4650e4
cdef float IceH = 1.6935e6
cdef np.ndarray[DTYPE_t, ndim=1] delta = u-DustJ
cdef np.ndarray[DTYPE_t, ndim=1] result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
cdef np.ndarray[DTYPE_t, ndim=1] x= u/IceI;
cdef np.ndarray[DTYPE_t, ndim=1] result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
然后,我运行以下命令:
def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
import sublimation
import numpy as np
%timeit compute_python(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
%timeit compute_cython(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
%timeit compute_cythonOptimized(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
结果如下:
对于纯python: 68.9 µs ± 851 ns per loop (mean ± std. dev. of 7 runs, 10000 loops
each)
对于非优化的cython: 68.2 µs ± 685 ns per loop (mean ± std. dev. of 7 runs, 10000
loops each)
对于优化的: 72.7 µs ± 416 ns per loop (mean ± std. dev. of 7 runs, 10000 loops
each)
我究竟做错了什么 ?
谢谢你的帮助,
-
使用Numba的解决方案
使用Cython,CodeSurgeon已经给出了很好的答案。在这个答案中,我不想展示使用Numba的另一种方法。
我创建了三个版本。在
naive_numba
我只添加了一个功能装饰。在improved_Numba
我手动组合的循环中(每个矢量化命令实际上都是一个循环)。在improved_Numba_p
我已经并行化了功能。请注意,使用并行加速器时,显然存在一个错误,不允许定义常量值。还应注意,并行化版本仅对较大的输入阵列有利。但是,您还可以添加一个小的包装器,该包装器根据输入数组的大小调用单线程或并行化版本。代码dtype = float64
import numba as nb import numpy as np import time @nb.njit(fastmath=True) def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile): DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6 IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6 delta = u-DustJ result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3); x= u/IceI; result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8)) return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile #error_model='numpy' sets divison by 0 to NaN instead of throwing a exception, this allows vectorization @nb.njit(fastmath=True,error_model='numpy') def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile): DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6 IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6 res=np.empty(u.shape[0],dtype=u.dtype) for i in range(u.shape[0]): delta = u[i]-DustJ result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3); x= u[i]/IceI result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8)) res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i] return res #there is obviously a bug in Numba (declaring const values in the function) @nb.njit(fastmath=True,parallel=True,error_model='numpy') def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH): res=np.empty((u.shape[0]),dtype=u.dtype) for i in nb.prange(u.shape[0]): delta = u[i]-DustJ result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3); x= u[i]/IceI result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8)) res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i] return res u=np.array(np.random.rand(1000000),dtype=np.float32) PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float32) DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float32) DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float32) DensityProfile=np.array(np.random.rand(1000000),dtype=np.float32) DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6 IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6 #don't measure compilation overhead on first call res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH) for i in range(1000): res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH) print(time.time()-t1) print(time.time()-t1)
性能
Arraysize np.random.rand(100) Numpy 46.8µs naive Numba 3.1µs improved Numba: 1.62µs improved_Numba_p: 17.45µs #Arraysize np.random.rand(1000000) Numpy 255.8ms naive Numba 18.6ms improved Numba: 6.13ms improved_Numba_p: 3.54ms
代码dtype = np.float32
如果np.float32足够,则必须在函数中显式声明所有常量值给float32。否则,Numba将使用float64。
@nb.njit(fastmath=True,error_model='numpy') def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile): DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6) IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6) res=np.empty(u.shape[0],dtype=u.dtype) for i in range(u.shape[0]): delta = u[i]-DustJ result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3) x= u[i]/IceI result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8)) res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i] return res @nb.njit(fastmath=True,parallel=True,error_model='numpy') def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile): res=np.empty((u.shape[0]),dtype=u.dtype) DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6) IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6) for i in nb.prange(u.shape[0]): delta = u[i]-DustJ result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3) x= u[i]/IceI result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8)) res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i] return res
性能
Arraysize np.random.rand(100).astype(np.float32) Numpy 29.3µs improved Numba: 1.33µs improved_Numba_p: 18µs Arraysize np.random.rand(1000000).astype(np.float32) Numpy 117ms improved Numba: 2.46ms improved_Numba_p: 1.56ms
与@CodeSurgeon提供的Cython版本的比较并不十分公平,因为他没有使用启用的AVX2和FMA3指令编译该功能。Numba默认使用-march =
native进行编译,这会在我的Core i7-4xxx上启用AVX2和FMA3指令。但是,如果您不希望分发已编译的Cython版本的代码,就会产生这种感觉,因为如果启用了该优化功能,默认情况下,它将不会在Haswell之前的处理器(或所有Pentium和Celerons)上运行。应该可以编译多个代码路径,但这是编译器的依赖和更多的工作。