CUFFT:当输入是一个 pitched 数组(由 cudaMallocPitch 分配的带行间距的数组)时,如何计算 FFT

Opt*_*mus 2 c++ cuda fft dynamic-arrays cufft

我想对一个动态分配的数组计算 FFT.先用 cudaMemcpy2D 把输入数组从主机复制到设备,然后执行 FFT(cufftExecR2C),再把结果从设备复制回主机.

所以我最初的问题是如何在 FFT 中使用 pitch(行间距)信息.然后我在这里找到了答案 - CUFFT:如何计算 pitched 指针的 FFT?

但不幸的是它不起作用.我得到的结果是垃圾值.以下是我的代码.

#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h> 
#include <iomanip> 
#include <iostream>
#include <vector>

using namespace std;

// Dimensions of the real-valued input matrix: NX rows by NY columns.
const size_t NX = 4;
const size_t NY = 6;

/*
 * Runs a single 2D real-to-complex cuFFT transform on an NX x NY float
 * matrix that is stored on the device in pitched memory (cudaMallocPitch),
 * copies the NX x (NY/2+1) complex result back to the host and prints it.
 *
 * This is the program as posted in the question; it is reported to print
 * garbage values. The code is kept as posted and only annotated.
 *
 * Returns 0 unconditionally; CUDA status strings are printed but not acted on,
 * and cuFFT return codes are ignored.
 */
int main()
    { 
    // Host-side input matrix, NX rows of NY samples, as a static 2D array.
    float h_in_data_static[NX][NY] ={ 
        {0.7943 ,   0.6020 ,   0.7482  ,  0.9133  ,  0.9961 , 0.9261},
        {0.3112 ,   0.2630 ,   0.4505  ,  0.1524  ,  0.0782 ,  0.1782},
        {0.5285 ,   0.6541 ,   0.0838  ,  0.8258  ,  0.4427,  0.3842},
        {0.1656 ,   0.6892 ,   0.2290  ,  0.5383  ,  0.1067,  0.1712}
        };

    // --------------------------------
    // Host-side input as a flat, dynamically allocated row-major buffer.
    // NOTE(review): never released with delete[].
    float *h_in_data_dynamic = new float[NX*NY];  

    // Copy the static matrix into the flat buffer (element [r][c] -> NY*r + c).
    // NOTE(review): h_ipitch is declared but never read or written.
    size_t h_ipitch;
    for (int r = 0; r < NX; ++r)  // this can be also done on GPU
        {    
        for (int c = 0; c < NY; ++c)
            {   h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c];   }
        }
    // --------------------------------

    // Host-side output buffer: NX rows of (NY/2+1) complex values.
    // NOTE(review): never released with delete[].
    float2 *h_out_data_temp = new float2[NX*(NY/2+1)] ; 


    // Device-side input/output pointers, the cuFFT plan handle, and the
    // transform dimensions passed to cufftPlanMany.
    cufftHandle plan;
    cufftReal *d_in_data;      
    cufftComplex * d_out_data;
    int n[NRANK] = {NX, NY};

    // Allocate pitched device memory for the input (rows of NY reals) and
    // copy the tightly packed host buffer into it row by row.
    // ipitch receives the device row pitch; it is used below as the
    // destination pitch of cudaMemcpy2D, next to byte widths.
    size_t ipitch;
    cudaError  cudaStat1 =  cudaMallocPitch((void**)&d_in_data,&ipitch,NY*sizeof(cufftReal),NX);    
    cout << cudaGetErrorString(cudaStat1) << endl;
    cudaError  cudaStat2 =  cudaMemcpy2D(d_in_data,ipitch,h_in_data_dynamic,NY*sizeof(float),NY*sizeof(float),NX,cudaMemcpyHostToDevice);   
    cout << cudaGetErrorString(cudaStat2) << endl;

    // Allocate pitched device memory for the output (rows of NY/2+1 complex).
    size_t opitch;
    cudaError  cudaStat3 =  cudaMallocPitch((void**)&d_out_data,&opitch,(NY/2+1)*sizeof(cufftComplex),NX);  
    cout << cudaGetErrorString(cudaStat3) << endl;

    // Build the plan and run the forward R2C transform.
    int rank = 2; // 2D fft     
    int istride = 1, ostride = 1; // Stride lengths
    int idist = 1, odist = 1;     // Distance between batches
    // NOTE(review): suspected cause of the garbage output. n[] is ordered
    // {NX, NY}, but these arrays put the pitch first and NX second, and the
    // pitch is passed unscaled (the same value used as a byte pitch in
    // cudaMemcpy2D) rather than divided by the element size.
    int inembed[] = {ipitch, NX}; // Input size with pitch
    int onembed[] = {opitch, NX}; // Output size with pitch
    int batch = 1;
    // NOTE(review): the return values of the three cuFFT calls are not checked.
    cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch);
    //cufftPlan2d(&plan, NX, NY , CUFFT_R2C);
    cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
    cufftExecR2C(plan, d_in_data, d_out_data);
    // NOTE(review): presumably superseded by cudaDeviceSynchronize() in
    // current toolkits - confirm against the CUDA version in use.
    cudaThreadSynchronize();

    // Copy the transform result (d_out_data) from the pitched device buffer
    // into the tightly packed host output buffer.
    cudaError  cudaStat4 = cudaMemcpy2D(h_out_data_temp,(NY/2+1)*sizeof(float2), d_out_data, opitch, (NY/2+1)*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost); 
    cout << cudaGetErrorString(cudaStat4) << endl;

    // Print one output row per line as " re + imi" pairs.
    for (int i = 0; i < NX; i++)    
        {
        for (int j =0 ; j< NY/2 + 1; j++)       
            printf(" %f + %fi",h_out_data_temp[i*(NY/2+1) + j].x ,h_out_data_temp[i*(NY/2+1) + j].y);
        printf("\n");    
        }
    // NOTE(review): only d_in_data is freed; d_out_data and the plan are not
    // released in this function.
    cudaFree(d_in_data);

    return 0;
    }
Run Code Online (Sandbox Code Playgroud)

我认为问题在于cufftPlanMany.我该如何解决这个问题?

Rob*_*lla 7

您可能需要仔细研究文档的高级数据布局部分.

我认为之前被链接的那个问题有些令人困惑,因为该问题传递 width 和 height 参数的顺序,与我对 cuFFT 2D 计划(plan)所预期的顺序相反.不过,那里的答案沿用了同样的顺序,所以至少是前后一致的.

其次,你忽略了之前那个问题中的一点:传入 inembed 和 onembed 的"pitch"参数,与你从 cudaMallocPitch 操作得到的 pitch 参数并不相同.它们必须按输入和输出数据集中每个数据元素的字节数进行缩放(即用字节 pitch 除以元素大小,得到以元素个数计的行宽).我实际上并不完全确定这是 inembed 和 onembed 参数的预期用途,但它似乎有效.

当我调整您的代码以解释上述两个更改时,我似乎得到了有效的结果,至少它们似乎在合理的范围内.您现在已经发布了几个关于2D FFT的问题,您已经说过结果不正确.我不能在脑子里做这些2D FFT,所以我建议将来你指出你期望的数据.

以下是我修改后的代码:

#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;

// Dimensions of the real-valued input matrix: NX rows by NY columns.
const size_t NX = 4;
const size_t NY = 6;

/*
 * Runs a single 2D real-to-complex cuFFT transform on an NX x NY float
 * matrix held in pitched device memory (cudaMallocPitch), copies the
 * NX x (NY/2+1) complex result back to the host and prints one row per line.
 *
 * The row pitches returned by cudaMallocPitch are converted to element
 * counts before being handed to cufftPlanMany as the innermost entries of
 * inembed/onembed, so that cuFFT steps over the row padding.
 *
 * The status string of each CUDA runtime call is printed, as before. The
 * first failing CUDA or cuFFT call stops further processing; every resource
 * that was acquired (device buffers, plan, host buffers) is released on all
 * paths.
 *
 * Returns 0 on success, 1 if any CUDA/cuFFT call failed.
 */
int main()
    {
    // Host-side input matrix, NX rows of NY samples.
    float h_in_data_static[NX][NY] ={
        {0.7943 ,   0.6020 ,   0.7482  ,  0.9133  ,  0.9961 , 0.9261},
        {0.3112 ,   0.2630 ,   0.4505  ,  0.1524  ,  0.0782 ,  0.1782},
        {0.5285 ,   0.6541 ,   0.0838  ,  0.8258  ,  0.4427,  0.3842},
        {0.1656 ,   0.6892 ,   0.2290  ,  0.5383  ,  0.1067,  0.1712}
        };

    // Flat, tightly packed row-major copy of the input (element [r][c] -> NY*r + c).
    float *h_in_data_dynamic = new float[NX*NY];
    for (size_t r = 0; r < NX; ++r)  // this can be also done on GPU
        {
        for (size_t c = 0; c < NY; ++c)
            {   h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c];   }
        }

    // An R2C transform keeps only NY/2+1 complex values per row.
    const int owidth = (NY/2)+1;

    // Host-side output buffer, tightly packed.
    float2 *h_out_data_temp = new float2[NX*owidth];

    // Device-side state. Pointers start out null so the cleanup code at the
    // end can run unconditionally.
    cufftHandle plan;
    bool plan_created = false;
    cufftReal *d_in_data = NULL;
    cufftComplex *d_out_data = NULL;
    int n[NRANK] = {static_cast<int>(NX), static_cast<int>(NY)};

    int exit_status = 1;   // flipped to 0 only when every step succeeded

    // Single-pass block: 'break' jumps to the shared cleanup code below.
    do
        {
        // Allocate the pitched input buffer and upload the packed host data.
        size_t ipitch = 0;
        cudaError  cudaStat1 =  cudaMallocPitch((void**)&d_in_data,&ipitch,NY*sizeof(cufftReal),NX);
        cout << cudaGetErrorString(cudaStat1) << endl;
        if (cudaStat1 != cudaSuccess) break;

        cudaError  cudaStat2 =  cudaMemcpy2D(d_in_data,ipitch,h_in_data_dynamic,NY*sizeof(float),NY*sizeof(float),NX,cudaMemcpyHostToDevice);
        cout << cudaGetErrorString(cudaStat2) << endl;
        if (cudaStat2 != cudaSuccess) break;

        // Allocate the pitched output buffer.
        size_t opitch = 0;
        cudaError  cudaStat3 =  cudaMallocPitch((void**)&d_out_data,&opitch,owidth*sizeof(cufftComplex),NX);
        cout << cudaGetErrorString(cudaStat3) << endl;
        if (cudaStat3 != cudaSuccess) break;

        // The plan describes row lengths in elements, so the byte pitches must
        // divide evenly by the element size for this layout to be expressible.
        if (ipitch % sizeof(cufftReal) != 0 || opitch % sizeof(cufftComplex) != 0)
            {
            cout << "pitch is not a multiple of the element size" << endl;
            break;
            }

        // Plan parameters. Same order as n[]: {rows, padded row length in elements}.
        int rank = 2; // 2D fft
        int istride = 1, ostride = 1; // Stride lengths
        int idist = 1, odist = 1;     // Distance between batches (single batch here)
        int inembed[] = {static_cast<int>(NX), static_cast<int>(ipitch/sizeof(cufftReal))};    // Input size with pitch
        int onembed[] = {static_cast<int>(NX), static_cast<int>(opitch/sizeof(cufftComplex))}; // Output size with pitch
        int batch = 1;

        if ((cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch)) != CUFFT_SUCCESS)
            {
            cout<< "cufft error 1" << endl;
            break;
            }
        plan_created = true;

        //cufftPlan2d(&plan, NX, NY , CUFFT_R2C);
        // Kept non-fatal, as in the original flow.
        // NOTE(review): cufftSetCompatibilityMode is believed to be deprecated
        // and absent from newer cuFFT releases - drop this call if the
        // toolkit in use no longer declares it.
        if ((cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)) != CUFFT_SUCCESS) cout << "cufft error 2" << endl;

        if ((cufftExecR2C(plan, d_in_data, d_out_data)) != CUFFT_SUCCESS)
            {
            cout << "cufft error 3" << endl;
            break;
            }

        // Wait for the transform and surface any asynchronous execution error.
        cudaError  syncStat = cudaDeviceSynchronize();
        if (syncStat != cudaSuccess)
            {
            cout << cudaGetErrorString(syncStat) << endl;
            break;
            }

        // Copy the result (d_out_data) from the pitched device buffer into the
        // tightly packed host output buffer.
        cudaError  cudaStat4 = cudaMemcpy2D(h_out_data_temp,owidth*sizeof(float2), d_out_data, opitch, owidth*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost);
        cout << cudaGetErrorString(cudaStat4) << endl;
        if (cudaStat4 != cudaSuccess) break;

        // Print one output row per line as " re + imi" pairs.
        for (size_t i = 0; i < NX; i++)
            {
            for (int j = 0; j < owidth; j++)
                printf(" %f + %fi",h_out_data_temp[i*owidth + j].x ,h_out_data_temp[i*owidth + j].y);
            printf("\n");
            }

        exit_status = 0;
        } while (false);

    // Release everything that was acquired; cudaFree on a null pointer is a no-op.
    if (plan_created) cufftDestroy(plan);
    cudaFree(d_out_data);
    cudaFree(d_in_data);
    delete[] h_out_data_temp;
    delete[] h_in_data_dynamic;

    return exit_status;
    }
Run Code Online (Sandbox Code Playgroud)

  • +1为你的答案.我用Matlab计算了上面代码的输出,似乎输出的非冗余部分与Matlab的`fft2`完全匹配. (4认同)