N-Body CUDA优化

Col*_*acX 2 simulation cuda

我正在开发一个CUDA中的N体算法,我想学习一些优化的技巧和窍门.

我已经设法让NVIDIA Geforce GTX 260 16384运行机身20Flops,它拥有27流媒体多处理器.

这个KernelcomputeForces函数是95%时间慢的捅,我想知道是否还有我可以做的优化我的代码.

据我所知,我已针对内存空间局部性和内存写入进行了优化.在CUDA文档的某个地方,它说共享内存更快但我不知道如何利用它.我已经将16块中的工作与512每个线程分开,但这对我来说有点模糊.

请帮助和感谢阅读本文.

n   is number of bodies

gm  is the gpu mass pointer

gpx is the gpu position x pointer

gpy is the gpu position y pointer

gpz is the gpu position z pointer

gfx is the gpu force x pointer

gfy is the gpu force y pointer

gfz is the gpu force z pointer
Run Code Online (Sandbox Code Playgroud)

相关的内核函数

__global__ void KernelcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int numThreads = blockDim.x * gridDim.x;

    float GRAVITY = 0.00001f;

    //compare all with all
    for( unsigned int ia=tid; ia<n; ia+=numThreads ){
        float lfx = 0.0f;
        float lfy = 0.0f;
        float lfz = 0.0f;

        for( unsigned int ib=0; ib<n; ib++ ){
            //compute distance
            float dx = ( gpx[ib] - gpx[ia]);
            float dy = ( gpy[ib] - gpy[ia] );
            float dz = ( gpz[ib] - gpz[ia] );
            //float distance = sqrt( dx*dx + dy*dy + dz*dz );
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            //prevent slingshots and division by zero
            //distance += 0.1f;
            distanceSquared += 0.01f;

            //calculate gravitational magnitude between the bodies
            //float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distance * distance * distance * distance );
            float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distanceSquared );

            //calculate forces for the bodies
            //magnitude times direction
            lfx += magnitude * ( dx );
            lfy += magnitude * ( dy );
            lfz += magnitude * ( dz );
        }

        //stores local memory to global memory
        gfx[ia] = lfx;
        gfy[ia] = lfy;
        gfz[ia] = lfz;
    }
}

extern void GPUcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){  
    dim3 gridDim( 16, 1, 1 ); //specifys how many blocks in three possible dimensions
    dim3 blockDim( 512, 1, 1 ); //threads per block
    KernelcomputeForces<<<gridDim, blockDim>>>( n, gm, gpx, gpy, gpz, gfx, gfy, gfz );
}
Run Code Online (Sandbox Code Playgroud)

tal*_*ies 5

在做任何其他事情之前,尝试运行更多块.给定的块只能在单个SM上运行 - 通过仅使用16个块,您可以保证大约40%的GPU容量将处于空闲状态.27的一些倍数应该是GTX260-216上的最佳块数.您可能还会发现减少每个块的线程数不会影响性能,因此您可以保持每个线程的工作量相同,但只需使用足够的块来覆盖GPU中的所有SM.

编辑:只是为了说明这一点,考虑一下内核的这个小测试工具:

template<int blocksize, int gridsize>
extern float GPUcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){  
    float time;

    dim3 gridDim( gridsize, 1, 1 ); //specifys how many blocks in three possible dimensions
    dim3 blockDim( blocksize, 1, 1 ); //threads per block

    cudaEvent_t start, stop;
    errchk( cudaEventCreate(&start) );
    errchk( cudaEventCreate(&stop) );

    errchk( cudaEventRecord(start, 0) );
    KernelcomputeForces<<<gridDim, blockDim>>>( n, gm, gpx, gpy, gpz, gfx, gfy, gfz );
    rterrchk;

    errchk( cudaEventRecord(stop, 0) );
    errchk( cudaEventSynchronize(stop) );
    errchk( cudaEventElapsedTime(&time, start, stop) );

    return time;
}

int main(void)
{
    const int n = 16384;
    size_t gsize = sizeof(float) * size_t(n);

    float * g[4], * _g[7];

    errchk( cudaSetDevice(1) );  // GTX 275

    for(int i=0; i<7; i++) 
        errchk( cudaMalloc((void **)&_g[i], gsize) ); 

    for(int i=0; i<4; i++)
        g[i] = (float *)malloc(gsize);

    for(int i=0; i<n; i++)
    for(int j=0; j<4; j++)
    *(g[j]+i) = (float)drand48();

    for(int i=0; i<4; i++) 
        errchk( cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice) ); 

    // Warm up to take context establishment time out.
    GPUcomputeForces<16,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]);

    // Bench runs
    printf("(1,1)@(512,1,1): %f\n", GPUcomputeForces<1,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(8,1)@(512,1,1): %f\n", GPUcomputeForces<8,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(16,1)@(512,1,1): %f\n", GPUcomputeForces<16,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(30,1)@(256,1,1): %f\n", GPUcomputeForces<30,256>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(60,1)@(128,1,1): %f\n", GPUcomputeForces<60,128>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(120,1)@(64,1,1): %f\n", GPUcomputeForces<120,64>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(240,1)@(32,1,1): %f\n", GPUcomputeForces<240,32>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );

    cudaThreadExit();

    return 0;
}
Run Code Online (Sandbox Code Playgroud)

当我运行它时,我在库存GTX 275上获得以下执行时间(所有时间都以毫秒为单位):

(1,1)@(512,1,1): 1087.107910
(8,1)@(512,1,1): 135.582458
(16,1)@(512,1,1): 67.876671
(30,1)@(256,1,1): 54.881279
(60,1)@(128,1,1): 35.261280
(120,1)@(64,1,1): 36.316288
(240,1)@(32,1,1): 39.870495
Run Code Online (Sandbox Code Playgroud)

即:即使使用非常小的块大小,运行至少与卡上的MP一样多的块对于提高性能也是至关重要的.


tal*_*ies 5

共享内存将成为此类内核中的有用优化 - 它允许合并粒子位置和质量的读取,这在 GT200 上将非常重要。我发现这大约是你的版本的两倍(使用 128 个 128 个线程的 16384 个粒子启动):

template<int blocksize>
__global__
void KernelcomputeForces1( unsigned int n1, float* gm, float* gpx,
                float* gpy, float* gpz, float* gfx, float* gfy, float* gfz )
{
    __shared__ float lgpx[blocksize], lgpy[blocksize],
               lgpz[blocksize], lgm[blocksize];

    const float GRAVITY = 0.00001f;

    //compare all with all
    float lfx = 0.0f, lfy = 0.0f, lfz = 0.0f;
    int ia = blockDim.x * blockIdx.x + threadIdx.x;
    float lgpx0 = gpx[ia], lgpy0 = gpy[ia],
          lgpz0 = gpz[ia], lgm0 = gm[ia];

    for( unsigned int ib=0; ib<n1; ib+=blocksize ){

        lgpx[threadIdx.x] = gpx[ib + threadIdx.x];
        lgpy[threadIdx.x] = gpy[ib + threadIdx.x];
        lgpz[threadIdx.x] = gpz[ib + threadIdx.x];
        lgm[threadIdx.x] = gm[ib + threadIdx.x];
        __syncthreads();

#pragma unroll
        for(unsigned int ic=0; ic<blocksize; ic++) {

            //compute distance
            float dx = ( lgpx[ic] - lgpx0 );
            float dy = ( lgpy[ic] - lgpy0 );
            float dz = ( lgpz[ic] - lgpz0 );
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            //prevent slingshots and division by zero
            distanceSquared += 0.01f;

            //calculate gravitational magnitude between the bodies
            float magnitude = GRAVITY * ( lgm0 * lgm[ic] )
                    / ( distanceSquared );

            //calculate forces for the bodies
            //magnitude times direction
            lfx += magnitude * ( dx );
            lfy += magnitude * ( dy );
            lfz += magnitude * ( dz );
        }
        __syncthreads();
    }

    //stores local memory to global memory
    gfx[ia] = lfx;
    gfy[ia] = lfy;
    gfz[ia] = lfz;
}
Run Code Online (Sandbox Code Playgroud)

您需要对超出块大小倍数的粒子数量做一些不同的事情,可能是不会展开的第二节。观察 __syncthreads() 调用可能出现的扭曲发散,如果您不小心,这可能会使内核挂起。


Jac*_*ern 5

@talonmies已经回答了这个问题,展示了共享内存如何有助于提高性能.所以,没有更多的补充.我只想提供一个完整的代码,包括不同的优化步骤,包括平铺,共享内存随机操作,并显示在Kepler K20c上执行测试的一些结果.

下面的代码围绕@talonmies呈现的代码,具有5内核函数,即

KernelcomputeForces:没有优化

KernelcomputeForces_Shared:利用源块和共享内存中的平铺

KernelcomputeForces_Tiling:利用目标群众中的平铺

KernelcomputeForces_Tiling_Shared:利用源和目标质量中的平铺并利用共享内存

KernelcomputeForces_Tiling_Shuffle:与上面相同,但使用shuffle操作而不是共享内存.

也许,与已经在文献中发表的文章(使用CUDA的快速N-Body Simulation)和已经作为代码提供的内容相比(参见上面的答案和Mark Harris的GitHub N-body页面,最后一个内核是唯一的新内容.但是我对N-body起了一点作用,发现发布这个答案很有用,可能对下一个用户有用.

这是代码

#include <stdio.h>

#define GRAVITATIONAL_CONST 6.67*1e?11
#define SOFTENING 1e-9f

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < N) {

        float invDist, invDist3;

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/**********************************/
/* KERNEL FUNCTION: SHARED MEMORY */
/**********************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < N) {

        float invDist, invDist3;

        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

            __syncthreads();

#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/***************************/
/* KERNEL FUNCTION: TILING */
/***************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }
        __syncthreads();

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*******************************************/
/* KERNEL FUNCTION: TILING + SHARED MEMORY */
/*******************************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

            __syncthreads();

#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/************************************************/
/* KERNEL FUNCTION: TILING + SHUFFLE OPERATIONS */
/************************************************/
__global__
oid KernelcomputeForces_Tiling_Shuffle(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    const int laneid = threadIdx.x & 31;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=32) {

            // --- Loading data to shared memory
            float x_src = x_d[ib + laneid];
            float y_src = y_d[ib + laneid];
            float z_src = z_d[ib + laneid];
            float m_src = m_d[ib + laneid];

#pragma unroll 32
            for(unsigned int ic=0; ic<32; ic++) {

                // --- Compute relative distances
                float dx = (__shfl(x_src, ic) - x_reg);
                float dy = (__shfl(y_src, ic) - y_reg);
                float dz = (__shfl(z_src, ic) - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * __shfl(m_src, ic)) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int BLOCKSIZE>
float GPUcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;

    dim3 grid(iDivUp(N,BLOCKSIZE), 1, 1);       // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);                // --- Threads per block

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start, 0));
    KernelcomputeForces_Shared<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));

    return time;
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int GRIDSIZE, int BLOCKSIZE>
float GPUcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;

    dim3 grid(GRIDSIZE, 1, 1);      // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);    // --- Threads per block

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start, 0));
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    KernelcomputeForces_Tiling_Shuffle<<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));

    return time;
}

/********************/
/* CPU CALCULATIONS */
/********************/
void CPUcomputeForces(float* m_h, float* x_h, float* y_h, float* z_h, float* fx_h, float* fy_h, float* fz_h, unsigned int N) {  

    for (unsigned int i=0; i<N; i++) {

        float invDist, invDist3;

        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_h[ib] - x_h[i]);
            float dy = (y_h[ib] - y_h[i]);
            float dz = (z_h[ib] - z_h[i]);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = 1.f / sqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_h[i] * m_h[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;

        }

        // --- Stores local memory to global memory
        fx_h[i] = fx_temp;
        fy_h[i] = fy_temp;
        fz_h[i] = fz_temp;
    }
}

/********/
/* MAIN */
/********/
int main(void)
{
    const int N = 16384;

    size_t gsize = sizeof(float) * size_t(N);

    float * g[10], * _g[7];

    for(int i=0; i<7; i++) gpuErrchk( cudaMalloc((void **)&_g[i], gsize)); 

    for(int i=0; i<10; i++) g[i] = (float *)malloc(gsize);

    for(int i=0; i<N; i++)
        for(int j=0; j<4; j++)
            *(g[j]+i) = (float)rand();

    for(int i=0; i<4; i++) gpuErrchk(cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice)); 

    // --- Warm up to take context establishment time out.
    GPUcomputeForces<512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);
    //GPUcomputeForces_Tiling<32,512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);

    // --- Bench runs
    printf("1024: %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512:  %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256:  %f\n",    GPUcomputeForces<256>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128:  %f\n",    GPUcomputeForces<128>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64:   %f\n",    GPUcomputeForces<64>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32:   %f\n",    GPUcomputeForces<32>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

    printf("16, 1024: %f\n",    GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  1024: %f\n",    GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("4,  1024: %f\n",    GPUcomputeForces_Tiling<4,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 512: %f\n",     GPUcomputeForces_Tiling<32,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 512: %f\n",     GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  512: %f\n",     GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 256:  %f\n",    GPUcomputeForces_Tiling<64,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 256:  %f\n",    GPUcomputeForces_Tiling<32,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 256:  %f\n",    GPUcomputeForces_Tiling<16,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,128:  %f\n",    GPUcomputeForces_Tiling<128,128>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 128:  %f\n",    GPUcomputeForces_Tiling<64,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 128:  %f\n",    GPUcomputeForces_Tiling<32,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,64:   %f\n",    GPUcomputeForces_Tiling<256,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,64:   %f\n",    GPUcomputeForces_Tiling<128,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 64:   %f\n",    GPUcomputeForces_Tiling<64,64>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

    for(int i=8; i<10; i++) gpuErrchk(cudaMemcpy(g[i], _g[i-3], gsize, cudaMemcpyDeviceToHost)); 

    CPUcomputeForces(g[0],g[1],g[2],g[3],g[4],g[5],g[6],N);

    for(int i=0; i<N; i++)
        for(int j=8; j<10; j++) {

            if (abs(*(g[j]+i) - *(g[j-3]+i)) > 0.001f) {
                printf("Error at %i, %i; GPU = %f; CPU = %f\n",i, (j-8), *(g[j-3]+i), *(g[j]+i));
                return;
            }
        }

    printf("Test passed!\n");

    cudaDeviceReset();

    return 0;
}
Run Code Online (Sandbox Code Playgroud)

以下是一些结果N = 16384.

KernelcomputeForces:最佳BLOCKSIZE = 128; t = 10.07ms.

KernelcomputeForces_Shared:最佳BLOCKSIZE = 128; t = 7.04ms.

KernelcomputeForces_Tiling:最佳BLOCKSIZE = 128; t = 11.14ms.

KernelcomputeForces_Tiling_Shared:最佳GRIDSIZE = 64; 最佳的BLOCKSIZE = 256; t = 7.20ms.

KernelcomputeForces_Tiling_Shuffle:最佳GRIDSIZE = 128; 最佳的BLOCKSIZE = 128; t = 6.84ms.

Warp shuffle操作似乎可以略微提高共享内存的性能.