Tests & examples with pyvkfft
[1]:
# Use the following to install pyvkfft on google colab
if False:
# Install pyvkfft & dependencies only once using pip
import os
if not os.path.exists('dev/pyvkfft'):
!mkdir dev
!cd dev && git clone --recursive https://github.com/vincefn/pyvkfft.git
!pip install pycuda
# !pip install cupy
!cd dev/pyvkfft && pip install .
# scipy, matplotlib not required for pyvkfft, but for tests
!pip install scipy matplotlib
Select PyCUDA or CuPy
[2]:
try:
# raise ImportError() # Uncomment to force using cupy if you have both
import pycuda.autoinit
import pycuda.driver as cu_drv
import pycuda.gpuarray as cua
synchronize = cu_drv.Context.synchronize
gpu_empty = cua.empty
gpu_empty_like = cua.empty_like
to_gpu = cua.to_gpu
stream = cu_drv.Stream()
print("Using PyCUDA")
except ImportError:
try:
import cupy
synchronize = cupy.cuda.stream.get_current_stream().synchronize
gpu_empty = cupy.empty
gpu_empty_like = cupy.empty_like
to_gpu = cupy.asarray
stream = cupy.cuda.Stream()
print("Using CuPy")
except ImportError:
raise ImportError("You need either PyCUDA or Cupy")
import pyvkfft.cuda
from pyvkfft.cuda import VkFFTApp
from pyvkfft.fft import fftn as vkfftn, ifftn as vkifftn, rfftn as vkrfftn, irfftn as vkirfftn
from pyvkfft.accuracy import l2, li
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
try:
from scipy.datasets import ascent
except:
from scipy.misc import ascent
import numpy as np
from scipy.fft import fftn, ifftn, fftshift, rfftn, irfftn, dctn, idctn
import timeit
Using PyCUDA
[3]:
def speed(shape, ndim, nb=10, stream=None, inplace=True, norm=0):
"""
Perform a speed test using VkFFT (
:param shape: array shape to use
:param ndim: number of dimensions for the FFT (e.g. can be 1, 2 or 3 for a 3D array, etc..)
:param nb: number of repeats for timing
:param stream: the pycuda.driver.Stream to be sued for calculations. If None,
the default stream for the active context will be used.
:param inplace: if True (default), do an in-place FFT
:param norm: norm=0 will have the L2 norm multiplied by the FT size for each transform,
whereas norm=1 will divide the L2 norm by the FT size for a backwards transform,
similarly to numpy's fft norm='backwards'.
:return: a tuple with the time per couple of FFT and iFFT, and the idealised memory throughput
assuming one read and one write of the array per transform axis, in Gbytes/s.
"""
d = to_gpu(np.random.uniform(0, 1, shape).astype(np.complex64))
if not inplace:
d1 = gpu_empty_like(d)
# print(d.shape)
app = VkFFTApp(d.shape, d.dtype, ndim=ndim, stream=stream, inplace=inplace, norm=norm)
synchronize()
t0 = timeit.default_timer()
for i in range(nb):
if inplace:
d = app.ifft(d)
d = app.fft(d)
else:
d1 = app.ifft(d, d1)
d = app.fft(d1, d)
synchronize()
dt = timeit.default_timer() - t0
shape = list(shape)
if len(shape) < 3:
shape += [1] * (3 - len(shape))
gbps = d.nbytes * nb * ndim * 2 * 2 / dt / 1024 ** 3
s = ""
if not inplace:
s= "[out-of-place]"
print("(%4d %4d %4d)[%dD] dt=%6.2f ms %7.2f Gbytes/s %s [norm=%d]" %
(shape[2], shape[1], shape[0], ndim, dt / nb * 1000, gbps, s, norm))
return dt, gbps
Speed tests
[4]:
if False:
for inplace in [True, False]:
speed((256, 256, 256), 3, inplace=inplace)
speed((400, 400, 400), 3, inplace=inplace)
speed((512, 512, 512), 3, inplace=inplace)
speed((16, 504, 504), 2, inplace=inplace)
speed((16, 512, 512), 2, inplace=inplace)
speed((16, 1024, 1024), 2, inplace=inplace)
speed((16, 2048, 2048), 2, inplace=inplace)
speed((1, 512, 512), 2, inplace=inplace)
speed((1, 1024, 1024), 2, inplace=inplace)
speed((1, 2600, 2048), 2, inplace=inplace)
speed((1, 2048, 2600), 2, inplace=inplace)
speed((8, 2600, 2048), 2, inplace=inplace)
speed((8, 2048, 2600), 2, inplace=inplace)
speed((16, 2200, 2400), 2, inplace=inplace)
speed((16, 2310, 2730), 2, inplace=inplace)
speed((16, 2730, 2310), 2, inplace=inplace) # 2310=2*3*5*7*11, 2730=2*3*5*7*13
speed((16, 2400, 2200), 2, inplace=inplace)
speed((1, 3080, 3080), 2, inplace=inplace)
speed((8, 3072, 3072), 2, inplace=inplace)
# Also test with a supplied stream
speed((16, 1000, 1000), 2, stream=stream, inplace=inplace)
Complex<->Complex, inplace
[5]:
# single precision
d = to_gpu(ascent().astype(np.complex64))
app = VkFFTApp(d.shape, d.dtype, ndim=2)
print((abs(d.get()) ** 2).sum())
d = app.fft(d)
print((abs(d.get()) ** 2).sum())
print()
d = app.ifft(d)
print((abs(d.get()) ** 2).sum())
# Double precision
d = to_gpu(ascent().astype(np.complex128))
app = VkFFTApp(d.shape, d.dtype, ndim=2)
plt.figure(figsize=(15, 5))
plt.subplot(121)
plt.imshow(np.fft.fftshift(abs(np.fft.fftn(ascent().astype(np.complex128)))), norm=LogNorm())
plt.colorbar()
print((abs(d.get()) ** 2).sum())
d = app.fft(d)
plt.subplot(122)
plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())
plt.colorbar()
print((abs(d.get()) ** 2).sum())
d = app.ifft(d)
print((abs(d.get()) ** 2).sum())
2629743400.0
689371240000000.0
2629743000.0
2629743734.0
689371541405696.1
2629743734.0

Complex<->Complex, inplace, y-axis transform
Use axes=(-2,)
to perform a transform along specific axes, skipping the first dimension
[6]:
# single precision
d = to_gpu(ascent().astype(np.complex64))
axes = (-2,)
app = VkFFTApp(d.shape, d.dtype, axes=axes)
print((abs(d.get()) ** 2).sum())
d = app.fft(d)
print((abs(d.get()) ** 2).sum())
print()
d = app.ifft(d)
print((abs(d.get()) ** 2).sum())
# Double precision
d = to_gpu(ascent().astype(np.complex128))
app = VkFFTApp(d.shape, d.dtype, axes=axes)
plt.figure(figsize=(15, 5))
plt.subplot(121)
plt.imshow(np.fft.fftshift(abs(np.fft.fftn(ascent().astype(np.complex128), axes=axes))), norm=LogNorm())
plt.colorbar()
print((abs(d.get()) ** 2).sum())
d = app.fft(d)
plt.subplot(122)
plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())
plt.colorbar()
print((abs(d.get()) ** 2).sum())
d = app.ifft(d)
print((abs(d.get()) ** 2).sum())
2629743400.0
1346428200000.0
2629743400.0
2629743734.0
1346428791808.0
2629743734.0

complex<->complex, out-of-place
[7]:
# single precision, out-of-place
d = to_gpu(ascent().astype(np.complex64))
d1 = gpu_empty_like(d)
app = VkFFTApp(d.shape, d.dtype, inplace=False, ndim=2)
print((abs(d.get()) ** 2).sum())
plt.figure(figsize=(15, 4), dpi=100)
plt.subplot(141)
plt.imshow(ascent())
plt.title('Original array')
d1 = app.fft(d, d1)
print((abs(d1.get()) ** 2).sum())
dn = fftn(ascent().astype(np.complex64))
print(np.allclose(dn, d1.get(), rtol=1e-6,atol=abs(dn).max()*1e-6))
plt.subplot(142)
plt.imshow(abs(d.get()))
plt.title('Source array after FFT')
plt.subplot(143)
plt.imshow(fftshift(abs(d1.get())), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title('FFT')
plt.subplot(144)
plt.imshow(fftshift(abs(fftn(ascent().astype(np.complex64)))), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title('FFT (numpy)')
d = app.ifft(d1, d)
print((abs(d.get()) ** 2).sum())
print(np.allclose(ascent(), d.get(), rtol=1e-6,atol=ascent().max()*1e-6))
# double precision, out-of-place
d = to_gpu(ascent().astype(np.complex128))
d1 = gpu_empty_like(d)
app = VkFFTApp(d.shape, d.dtype, inplace=False, ndim=2)
print((abs(d.get()) ** 2).sum())
d1 = app.fft(d, d1)
print((abs(d1.get()) ** 2).sum())
dn = fftn(ascent().astype(np.complex128))
print(np.allclose(dn, d1.get(), rtol=1e-12,atol=abs(dn).max()*1e-12))
d = app.ifft(d1, d)
print((abs(d.get()) ** 2).sum())
2629743400.0
689371240000000.0
True
2629743000.0
True
2629743734.0
689371541405696.1
True
2629743734.0

Real<->Complex (half-Hermitian) inplace transform
[8]:
# single precision, R2C inplace
d0 = np.zeros((512, 514), dtype=np.float32)
d0[:, :512] = ascent()
d = to_gpu(d0)
app = VkFFTApp(d.shape, d.dtype, ndim=2, r2c=True)
print((abs(d.get()) ** 2).sum())
print(d.shape, d.dtype)
d = app.fft(d)
print(d.shape, d.dtype)
print((abs(d.get()) ** 2).sum())
dn = np.fft.rfftn(d0[:, :-2].astype(np.float32))
print(dn.shape)
print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))
plt.figure(figsize=(15,5), dpi=100)
plt.subplot(121)
plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C")
plt.subplot(122)
plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C (numpy)")
d = app.ifft(d)
print(d.shape, d.dtype)
print((abs(d.get()) ** 2).sum())
print(np.allclose(d.get()[:, :-2], d0[:, :-2], rtol=1e-6, atol = 255e-6))
2629743600.0
(512, 514) float32
(512, 257) complex64
610018400000000.0
(512, 257)
True
(512, 514) float32
2636300000.0
True

Real<->Complex (half-Hermitian) out-of-place transform
[9]:
# single precision, R2C inplace
d0 = to_gpu(ascent().astype(np.float32))
d1 = gpu_empty((512, 257), dtype=np.complex64)
app = VkFFTApp(d0.shape, d0.dtype, ndim=2, r2c=True, inplace=False)
print((abs(d0.get()) ** 2).sum())
print(d0.shape, d0.dtype)
d1 = app.fft(d0, d1)
print(d1.shape, d1.dtype)
print((abs(d1.get()) ** 2).sum())
dn = np.fft.rfftn(ascent().astype(np.float32))
print(dn.shape)
print(np.allclose(d1.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))
plt.figure(figsize=(15,5), dpi=100)
plt.subplot(121)
plt.imshow(fftshift(abs(d1.get()), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C out-of-place")
plt.subplot(122)
plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C (numpy)")
d0 = app.ifft(d1, d0)
print(d0.shape, d0.dtype)
print((abs(d0.get()) ** 2).sum())
print(np.allclose(d0.get(), ascent(), rtol=1e-6, atol = 255e-6))
plt.figure(figsize=(15,5), dpi=100)
plt.subplot(121)
plt.imshow(d0.get())
plt.colorbar()
plt.title("R->C & C->R out-of-place")
plt.subplot(122)
plt.imshow(ascent())
plt.colorbar()
plt.title("Original array")
2629743400.0
(512, 512) float32
(512, 257) complex64
610018400000000.0
(512, 257)
True
(512, 512) float32
2629743000.0
True
[9]:
Text(0.5, 1.0, 'Original array')


Real<->Real transform = Direct Cosine Transform (DCT)
[10]:
# single precision, DCT inplace
for dct_type in (1,2,3,4):
print("Testing DCT type %d" % dct_type)
d0 = to_gpu(ascent().astype(np.float32))
# dct type (default if dct=True)
app = VkFFTApp(d0.shape, d0.dtype, ndim=2, dct=dct_type, inplace=True)
print((abs(d0.get()) ** 2).sum())
d0 = app.fft(d0, d0)
print((abs(d0.get()) ** 2).sum())
dn = dctn(ascent().astype(np.float32), type=dct_type)
print(np.allclose(d0.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))
fig = plt.figure(figsize=(15,5), dpi=100)
plt.subplot(121)
plt.imshow(abs(d0.get()), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("DCT-%d in-place" % dct_type)
plt.subplot(122)
plt.imshow(abs(dn), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("DCT-%d (scipy)" % dct_type)
# inverse transform:
d0 = app.ifft(d0, d0)
idn = idctn(dn, type=dct_type)
print((abs(d0.get()) ** 2).sum(), (abs(idn) ** 2).sum())
print(np.allclose(d0.get(), idn, rtol=1e-4, atol = 1e-3),
np.allclose(d0.get(), ascent(), rtol=1e-4, atol = 1e-3))
fig = plt.figure(figsize=(15,5), dpi=100)
plt.subplot(121)
plt.imshow(d0.get())
plt.colorbar()
plt.title("DCT + iDCT type %d in-place" % dct_type)
plt.subplot(122)
plt.imshow(ascent())
plt.colorbar()
plt.title("Original array")
print()
Testing DCT type 1
2629743400.0
9056434000000000.0
True
2629742000.0 2629743000.0
True True
Testing DCT type 2
2629743400.0
9135847000000000.0
True
2629743000.0 2629743400.0
True True
Testing DCT type 3
2629743400.0
2751634700000000.0
True
2629741800.0 2629743400.0
True True
Testing DCT type 4
2629743400.0
2757484400000000.0
True
2629741600.0 2629743000.0
True True








Complex<->Complex, inplace, F-ordering
[11]:
d0 = np.asarray(ascent().astype(np.complex64), order='F')
for axes in [None, (-1,), (-2,)]:
d = to_gpu(d0)
app = VkFFTApp(d.shape, d.dtype, strides=d.strides, axes=axes)
plt.figure(figsize=(15, 5))
plt.subplot(121)
plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())
plt.colorbar()
d = app.fft(d)
plt.subplot(122)
plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())
plt.colorbar()
d = app.ifft(d)



Complex<->Complex, out-of-place, F-ordering
[12]:
d0 = np.asarray(ascent().astype(np.complex64), order='F')
axes = (-1,)
d1 = to_gpu(d0)
d2 = gpu_empty_like(d1)
app = VkFFTApp(d1.shape, d1.dtype, strides=d1.strides, axes=axes, inplace=False)
plt.figure(figsize=(15, 5))
plt.subplot(121)
plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())
plt.colorbar()
print((abs(d.get()) ** 2).sum())
d2 = app.fft(d1, d2)
plt.subplot(122)
plt.imshow(np.fft.fftshift(abs(d2.get())), norm=LogNorm())
plt.colorbar()
d1 = app.ifft(d2, d1)
2629743600.0

Complex<->Complex, inplace, F-ordering, simple FFT interface
[13]:
order = 'F'
for axes in [None, (-1,), (-2,)]:
d0 = np.asarray(ascent().astype(np.complex64), order=order)
d = to_gpu(d0)
plt.figure(figsize=(15, 5))
plt.subplot(121)
dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))
plt.imshow(dn, norm=LogNorm())
plt.colorbar()
d = vkfftn(d, d, axes=axes)
plt.subplot(122)
plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())
plt.colorbar()
d = vkifftn(d, d, axes=axes)
# Test accuracy on random array
shape, dtype = d0.shape, d0.dtype
d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)
d0 = np.asarray(d0, order=order)
d = to_gpu(d0)
d = vkfftn(d, d, axes=axes)
dn = np.fft.fftn(d0, axes=axes)
print(l2(dn, d.get()), li(dn, d.get()))
d = to_gpu(d0)
d = vkifftn(d, d, axes=axes)
dn = np.fft.ifftn(d0, axes=axes)
print(l2(dn, d.get()), li(dn, d.get()))
print()
4.444979922273623e-07 5.908248388740862e-07
4.4354686916232555e-07 6.16391201187473e-07
2.6663396324512363e-07 3.872400644200587e-07
2.664766305948513e-07 4.079651360917785e-07
2.659319511021375e-07 3.5171515628801596e-07
2.6613409491012233e-07 3.6890486999237857e-07



Complex<->Complex, out-of-place, F-ordering, simple FFT interface
The destination array is automatically allocated
[ ]:
order = 'F'
for axes in [None, (-1,), (-2,)]:
d0 = np.asarray(ascent().astype(np.complex64), order=order)
d = to_gpu(d0)
plt.figure(figsize=(15, 5))
plt.subplot(121)
dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))
plt.imshow(dn, norm=LogNorm())
plt.colorbar()
d = vkfftn(d, axes=axes)
plt.subplot(122)
plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())
plt.colorbar()
d = vkifftn(d, axes=axes)
# Test accuracy on random array
shape, dtype = d0.shape, d0.dtype
d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)
d0 = np.asarray(d0, order=order)
d = to_gpu(d0)
d = vkfftn(d, axes=axes)
dn = np.fft.fftn(d0, axes=axes)
print(l2(dn, d.get()), li(dn, d.get()))
d = to_gpu(d0)
d = vkifftn(d, axes=axes)
dn = np.fft.ifftn(d0, axes=axes)
print(l2(dn, d.get()), li(dn, d.get()))
print()
Complex<->Complex, out-of-place pre-allocated, F-ordering, simple FFT interface
The destination array is pre-allocated here
[ ]:
order = 'F'
for axes in [None, (-1,), (-2,)]:
d0 = np.asarray(ascent().astype(np.complex64), order=order)
d = to_gpu(d0)
dest = gpu_empty_like(d)
plt.figure(figsize=(15, 5))
plt.subplot(121)
dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))
plt.imshow(dn, norm=LogNorm())
plt.colorbar()
dest = vkfftn(d, dest, axes=axes)
plt.subplot(122)
plt.imshow(np.fft.fftshift(abs(dest.get())), norm=LogNorm())
plt.colorbar()
dest = vkifftn(d, axes=axes)
# Test accuracy on random array
shape, dtype = d0.shape, d0.dtype
d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)
d0 = np.asarray(d0, order=order)
d = to_gpu(d0)
dest = vkfftn(d, dest, axes=axes)
dn = np.fft.fftn(d0, axes=axes)
print(l2(dn, dest.get()), li(dn, dest.get()))
d = to_gpu(d0)
dest = vkifftn(d, dest, axes=axes)
dn = np.fft.ifftn(d0, axes=axes)
print(l2(dn, dest.get()), li(dn, dest.get()))
print()
Real<->Complex (half-Hermitian) inplace transform, F-ordered, simple fft interface
[ ]:
# single precision, R2C inplace, # F order
order = 'F'
d0 = np.zeros((514, 512), dtype=np.float32)
d0[:512] = ascent()
d0 = np.asarray(d0, order=order)
d = to_gpu(d0)
d = vkrfftn(d, d)
dn = rfftn(d0[:-2], axes=(-1,-2)) # Need to force the Hermitian axis as the fast one
print(dn.shape)
print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))
plt.figure(figsize=(15,5), dpi=100)
plt.subplot(121)
plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C")
plt.subplot(122)
plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C (numpy)")
d = vkirfftn(d, d)
print(np.allclose(d.get()[:-2], d0[:-2], rtol=1e-6, atol = 255e-6))
[ ]:
d.strides , dn.strides
Real<->Complex (half-Hermitian) out-of-place, F-ordered, simple fft interface
[ ]:
# single precision, R2C inplace, # F order
order = 'F'
d0 = ascent().astype(np.float32)
d0 = np.asarray(d0, order=order)
d = to_gpu(d0)
d = vkrfftn(d)
dn = rfftn(d0, axes=(-1,-2)) # Need to force the Hermitian axis as the fast one
print(dn.shape, d.shape)
print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))
plt.figure(figsize=(15,5), dpi=100)
plt.subplot(121)
plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C")
plt.subplot(122)
plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))
plt.colorbar()
plt.title("R->C (numpy)")
d = vkirfftn(d)
print(np.allclose(d.get(), d0, rtol=1e-6, atol = 255e-6))
[ ]: