CUDA中3D矩阵的列和行的1D FFT

Bar*_*art 5 cuda cufft

我正在尝试使用 cufftPlanMany 计算批量 1D FFT。数据集来自一个三维场,存储在一维数组中,我想分别在 x 方向和 y 方向上计算 1D FFT。数据的存储方式如下图所示:先沿 x 连续,然后是 y,最后是 z。

在 x 方向上进行批量 FFT(我相信)很直接:输入 stride=1、distance=nx、batch=ny * nz 时,它会对元素 {0,1,2,3}、{4,5,6,7}、...、{28,29,30,31} 计算 FFT。但是,我想不出如何在 y 方向上实现同样的批量 FFT。对每个 xy 平面单独做一批仍然很简单(输入 stride=nx、dist=1、batch=nx,得到 {0,4,8,12}、{1,5,9,13} 等的 FFT)。但是当 batch=nx * nz 时,从 {3,7,11,15} 到 {16,20,24,28} 的距离大于 1。能否用 cufftPlanMany 以某种方式做到这一点?

在此输入图像描述

Jac*_*ern 4

我认为对你的问题(能否使用单个 cufftPlanMany 对 3D 矩阵的列执行 1D FFT)的简短回答是:不能。

事实上,通过 cufftPlanMany 执行的变换(其调用方式如下)

cufftPlanMany(&handle, rank, n, 
              inembed, istride, idist,
              onembed, ostride, odist, CUFFT_C2C, batch);
Run Code Online (Sandbox Code Playgroud)

必须遵守高级数据布局。特别地,一维 FFT 是根据以下布局计算出来的

input[b * idist + x * istride]
Run Code Online (Sandbox Code Playgroud)

其中 b 表示第 b 个信号,istride 是同一信号中两个连续元素之间的距离,idist 是两个连续信号起始元素之间的距离。如果 3D 矩阵的维度为 M * N * Q,并且要沿列执行 1D 变换,则两个连续元素之间的距离为 M,而两个连续信号之间的距离为 1。此外,批量执行的数量必须设置为 M。使用这些参数,您只能覆盖 3D 矩阵的一个切片。事实上,如果您尝试将批量数增加到超过 M,cuFFT 将从第二行开始计算新的按列 FFT(因为第 M 个信号的起始偏移为 M * idist = M,即第二行的第一个元素),而不会进入下一个切片。此问题的唯一解决方案是循环调用 cufftExecC2C,以覆盖全部 Q 个切片。

作为记录,以下代码提供了有关如何对 3D 矩阵的列执行 1D FFT 的完整示例。

#include <thrust/device_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do { ... } while (0) wrapper makes the macro act
// as a single statement, so `if (c) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Prints a diagnostic to stderr when `code` is not cudaSuccess.
 *
 * @param code   Status returned by a CUDA runtime call.
 * @param file   Source file of the call site (supplied by gpuErrchk).
 * @param line   Source line of the call site (supplied by gpuErrchk).
 * @param abort  When true (the default), terminates the process using the
 *               error code as exit status; when false, only prints the message.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**
 * Terminates the program with a diagnostic when a cuFFT call fails.
 *
 * @param status  Value returned by the cuFFT call.
 * @param what    Name of the call, used in the error message.
 */
static void cufftCheck(cufftResult status, const char *what)
{
    if (status != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFT error: %s returned %d\n", what, (int)status);
        exit((int)status);
    }
}

/**
 * Example: 1D FFTs along the columns of an M x N x Q matrix.
 *
 * Element (i, j, k) is stored at index k*M*N + j*M + i. A column is the set of
 * N elements sharing the same (i, k), so consecutive column elements are M
 * apart. One plan transforms the M columns of a single slice; the plan is
 * executed once per slice k. Input and output values are printed to stdout.
 */
int main() {

    const int M = 3;
    const int N = 4;
    const int Q = 2;

    thrust::host_vector<float2> h_matrix(M * N * Q);

    // --- Fill the input; the value depends on (j, k) only, and is echoed to stdout
    for (int k=0; k<Q; k++)
        for (int j=0; j<N; j++)
            for (int i=0; i<M; i++) {
                float2 temp;
                temp.x = (float)(j + k * M);
                //temp.x = 1.f;
                temp.y = 0.f;
                h_matrix[k*M*N+j*M+i] = temp;
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y);
            }
    printf("\n");

    thrust::device_vector<float2> d_matrix(h_matrix);

    thrust::device_vector<float2> d_matrix_out(M * N * Q);

    // --- Advanced data layout
    //     input[b * idist + x * istride]
    //     output[b * odist + x * ostride]
    //     b = signal number
    //     x = element of the b-th signal

    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { N };                        // --- Size of the Fourier transform
    int istride = M, ostride = M;           // --- Distance between two successive input/output elements
    int idist = 1, odist = 1;               // --- Distance between batches
    // --- inembed/onembed values are not used for 1D transforms, but the pointers
    //     must be non-NULL: with NULL cuFFT ignores the stride/dist parameters.
    int inembed[] = { 0 };
    int onembed[] = { 0 };
    int batch = M;                          // --- Number of batched executions (columns per slice)
    cufftCheck(cufftPlanMany(&handle, rank, n,
                             inembed, istride, idist,
                             onembed, ostride, odist, CUFFT_C2C, batch),
               "cufftPlanMany");

    cufftComplex *d_in  = (cufftComplex*)thrust::raw_pointer_cast(d_matrix.data());
    cufftComplex *d_out = (cufftComplex*)thrust::raw_pointer_cast(d_matrix_out.data());

    // --- One execution per slice: slice k starts k * M * N elements into the buffers
    for (int k=0; k<Q; k++)
        cufftCheck(cufftExecC2C(handle, d_in + k * M * N, d_out + k * M * N, CUFFT_FORWARD),
                   "cufftExecC2C");

    // --- Surface any asynchronous execution error before reading the results
    gpuErrchk(cudaDeviceSynchronize());
    cufftCheck(cufftDestroy(handle), "cufftDestroy");

    // --- Copy the whole result back once instead of one device read per element
    thrust::host_vector<float2> h_matrix_out(d_matrix_out);

    for (int k=0; k<Q; k++)
        for (int j=0; j<N; j++)
            for (int i=0; i<M; i++) {
                float2 temp = h_matrix_out[k*M*N+j*M+i];
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y);
            }

}
Run Code Online (Sandbox Code Playgroud)

当您想要对行执行一维变换时,情况会有所不同。在这种情况下,两个连续元素之间的距离是 1,而两个连续信号之间的距离是 M。这样就可以把批量变换的数量设置为 N * Q,然后只调用一次 cufftExecC2C。作为记录,下面的代码提供了对 3D 矩阵的行执行 1D 变换的完整示例。

#include <thrust/device_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do { ... } while (0) wrapper makes the macro act
// as a single statement, so `if (c) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Prints a diagnostic to stderr when `code` is not cudaSuccess.
 *
 * @param code   Status returned by a CUDA runtime call.
 * @param file   Source file of the call site (supplied by gpuErrchk).
 * @param line   Source line of the call site (supplied by gpuErrchk).
 * @param abort  When true (the default), terminates the process using the
 *               error code as exit status; when false, only prints the message.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**
 * Terminates the program with a diagnostic when a cuFFT call fails.
 *
 * @param status  Value returned by the cuFFT call.
 * @param what    Name of the call, used in the error message.
 */
static void cufftCheck(cufftResult status, const char *what)
{
    if (status != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFT error: %s returned %d\n", what, (int)status);
        exit((int)status);
    }
}

/**
 * Example: 1D FFTs along the rows of an M x N x Q matrix.
 *
 * Element (i, j, k) is stored at index k*M*N + j*M + i. A row is the set of M
 * contiguous elements sharing the same (j, k), and successive rows start M
 * elements apart, so all N * Q rows are transformed by a single batched
 * execution. Input and output values are printed to stdout.
 */
int main() {

    const int M = 3;
    const int N = 4;
    const int Q = 2;

    thrust::host_vector<float2> h_matrix(M * N * Q);

    // --- Fill the input; the value depends on (j, k) only, and is echoed to stdout
    for (int k=0; k<Q; k++)
        for (int j=0; j<N; j++)
            for (int i=0; i<M; i++) {
                float2 temp;
                temp.x = (float)(j + k * M);
                //temp.x = 1.f;
                temp.y = 0.f;
                h_matrix[k*M*N+j*M+i] = temp;
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y);
            }
    printf("\n");

    thrust::device_vector<float2> d_matrix(h_matrix);

    thrust::device_vector<float2> d_matrix_out(M * N * Q);

    // --- Advanced data layout
    //     input[b * idist + x * istride]
    //     output[b * odist + x * ostride]
    //     b = signal number
    //     x = element of the b-th signal

    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { M };                        // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = M, odist = M;               // --- Distance between batches
    // --- inembed/onembed values are not used for 1D transforms, but the pointers
    //     must be non-NULL: with NULL cuFFT ignores the stride/dist parameters.
    int inembed[] = { 0 };
    int onembed[] = { 0 };
    int batch = N * Q;                      // --- Number of batched executions (all rows of all slices)
    cufftCheck(cufftPlanMany(&handle, rank, n,
                             inembed, istride, idist,
                             onembed, ostride, odist, CUFFT_C2C, batch),
               "cufftPlanMany");

    cufftComplex *d_in  = (cufftComplex*)thrust::raw_pointer_cast(d_matrix.data());
    cufftComplex *d_out = (cufftComplex*)thrust::raw_pointer_cast(d_matrix_out.data());

    // --- A single execution covers every row of the matrix
    cufftCheck(cufftExecC2C(handle, d_in, d_out, CUFFT_FORWARD), "cufftExecC2C");

    // --- Surface any asynchronous execution error before reading the results
    gpuErrchk(cudaDeviceSynchronize());
    cufftCheck(cufftDestroy(handle), "cufftDestroy");

    // --- Copy the whole result back once instead of one device read per element
    thrust::host_vector<float2> h_matrix_out(d_matrix_out);

    for (int k=0; k<Q; k++)
        for (int j=0; j<N; j++)
            for (int i=0; i<M; i++) {
                float2 temp = h_matrix_out[k*M*N+j*M+i];
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y);
            }

}
Run Code Online (Sandbox Code Playgroud)