1D FFTs of columns and rows of a 3D matrix in CUDA

I'm trying to compute batch 1D FFTs using cufftPlanMany. The data set comes from a 3D field, stored in a 1D array, where I want to compute 1D FFTs in the x and y direction. The data is stored as shown in the figure below; continuous in x then y then z.

Doing batch FFTs in the x-direction is (I believe) straighforward; with input stride=1, distance=nx and batch=ny * nz, it computes the FFTs over elements {0,1,2,3}, {4,5,6,7}, ..., {28,29,30,31}. However, I can't think of a way to achieve the same for the FFTs in the y-direction. A batch for each xy plane is again straightforward (input stride=nx, dist=1, batch=nx results in FFTs over {0,4,8,12}, {1,5,9,13}, etc.). But with batch=nx * nz, going from {3,7,11,15} to {16,20,24,28}, the distance is larger than 1. Can this somehow be done with cufftPlanMany?

回答1:

I think that the short answer to your question (possibility of using a single cufftPlanMany to perform 1D FFTs of the columns of a 3D matrix) is NO.

Indeed, transformations performed according to cufftPlanMany, that you call like

cufftPlanMany(&handle, rank, n, 
              inembed, istride, idist,
              onembed, ostride, odist, CUFFT_C2C, batch);

must obey the Advanced Data Layout. In particular, 1D FFTs are worked out according to the following layout

input[b * idist + x * istride]

where b addresses the b-th signal and istride is the distance between two consecutive items in the same signal. If the 3D matrix has dimensions M * N * Q and if you want to perform 1D transforms along the columns, then the distance between two consecutive elements will be M, while the distance between two consecutive signals will be 1. Furthermore, the number of batched executions must be set equal to M. With those parameters, you are able to cover only one slice of the 3D matrix. Indeed, if you try increasing M, then the cuFFT will start trying to compute new column-wise FFTs starting from the second row. The only solution to this problem is an iterative call to cufftExecC2C to cover all the Q slices.

For the record, the following code provides a fully worked example on how performing 1D FFTs of the columns of a 3D matrix.

#include <thrust/device_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main() {

    const int M = 3;
    const int N = 4;
    const int Q = 2;

    thrust::host_vector<float2> h_matrix(M * N * Q);

    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) {
                float2 temp;
                temp.x = (float)(j + k * M); 
                //temp.x = 1.f; 
                temp.y = 0.f;
                h_matrix[k*M*N+j*M+i] = temp;
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }
    printf("\n");

    thrust::device_vector<float2> d_matrix(h_matrix);

    thrust::device_vector<float2> d_matrix_out(M * N * Q);

    // --- Advanced data layout
    //     input[b * idist + x * istride]
    //     output[b * odist + x * ostride]
    //     b = signal number
    //     x = element of the b-th signal

    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { N };                        // --- Size of the Fourier transform
    int istride = M, ostride = M;           // --- Distance between two successive input/output elements
    int idist = 1, odist = 1;               // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = M;                          // --- Number of batched executions
    cufftPlanMany(&handle, rank, n, 
                  inembed, istride, idist,
                  onembed, ostride, odist, CUFFT_C2C, batch);

    for (int k=0; k<Q; k++)
        cufftExecC2C(handle, (cufftComplex*)(thrust::raw_pointer_cast(d_matrix.data()) + k * M * N), (cufftComplex*)(thrust::raw_pointer_cast(d_matrix_out.data()) + k * M * N), CUFFT_FORWARD);
    cufftDestroy(handle);

    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) { 
                float2 temp = d_matrix_out[k*M*N+j*M+i];
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }

}

The situation is different for the case when you want to perform 1D transforms of the rows. In that case, the distance between two consecutive elements is 1, while the distance between two consecutive signals is M. This allows you to set a number of N * Q transformations and then invoking cufftExecC2C only one time. For the record, the code below provides a full example of 1D transformations of the rows of a 3D matrix.

#include <thrust/device_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main() {

    const int M = 3;
    const int N = 4;
    const int Q = 2;

    thrust::host_vector<float2> h_matrix(M * N * Q);

    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) {
                float2 temp;
                temp.x = (float)(j + k * M); 
                //temp.x = 1.f; 
                temp.y = 0.f;
                h_matrix[k*M*N+j*M+i] = temp;
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }
    printf("\n");

    thrust::device_vector<float2> d_matrix(h_matrix);

    thrust::device_vector<float2> d_matrix_out(M * N * Q);

    // --- Advanced data layout
    //     input[b * idist + x * istride]
    //     output[b * odist + x * ostride]
    //     b = signal number
    //     x = element of the b-th signal

    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { M };                        // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = M, odist = M;               // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = N * Q;                      // --- Number of batched executions
    cufftPlanMany(&handle, rank, n, 
                  inembed, istride, idist,
                  onembed, ostride, odist, CUFFT_C2C, batch);

    cufftExecC2C(handle, (cufftComplex*)(thrust::raw_pointer_cast(d_matrix.data())), (cufftComplex*)(thrust::raw_pointer_cast(d_matrix_out.data())), CUFFT_FORWARD);
    cufftDestroy(handle);

    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) { 
                float2 temp = d_matrix_out[k*M*N+j*M+i];
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }

}

回答2:

I guess, idist=nx*nz could also jump a whole plane and batch=nz would then cover one yx plane. The decision should be made according to whether nx or nz is larger.