Structure of Arrays vs Array of Structures in CUDA

Bug*_*tGG 41 c c++ arrays struct cuda

From some comments that I have read here, for some reason it is preferable to have Structure of Arrays (SoA) over Array of Structures (AoS) for parallel implementations like CUDA? If that is true, can anyone explain why? Thanks in advance!

Pau*_*l R 49

The choice of AoS versus SoA for optimum performance usually depends on the access pattern. This is not limited to CUDA, however - similar considerations apply to any architecture where performance can be significantly affected by the memory access pattern, e.g. where you have caches or where performance is better with contiguous memory access (such as coalesced memory accesses in CUDA).

E.g. for RGB pixels versus separate RGB planes:

struct {
    uint8_t r, g, b;
} AoS[N];

struct {
    uint8_t r[N];
    uint8_t g[N];
    uint8_t b[N];
} SoA;

If you're going to be accessing the R/G/B components of each pixel together, then AoS usually makes sense, since the successive reads of the R, G, B components will be contiguous and usually contained within the same cache line. For CUDA this also means memory read/write coalescing.
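As a minimal sketch of what that looks like in a kernel (the Pixel struct, the rgbToGray name and the grayscale weights are illustrative, not from the question):

#include <cstdint>

struct Pixel { uint8_t r, g, b; };

// Each thread consumes one whole pixel, so consecutive threads read
// consecutive 3-byte structs - contiguous bytes, cache-line friendly.
__global__ void rgbToGray(const Pixel *in, uint8_t *out, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        Pixel p = in[i];
        // integer grayscale approximation: the weights sum to 256
        out[i] = (uint8_t)((77 * p.r + 150 * p.g + 29 * p.b) >> 8);
    }
}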

However, if you are going to process the color planes separately, then SoA might be preferred, e.g. if you want to scale all R values by some scale factor, then SoA means that all the R components will be contiguous.
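A sketch of that plane-wise case (the scaleR name and the clamping policy are my own choices):

// Scaling the R plane only touches the contiguous r array; the g and b
// planes are never read, so no bandwidth is wasted on them.
__global__ void scaleR(uint8_t *r, float factor, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        float v = r[i] * factor;
        r[i] = (uint8_t)(v < 255.0f ? v : 255.0f);  // clamp to avoid wrap-around
    }
}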

One further consideration is padding/alignment. For the RGB example above, each element in the AoS layout is aligned to a multiple of 3 bytes, which may be inconvenient for CUDA, SIMD, et al. - in some cases it may even require padding within the struct to make the alignment more convenient (e.g. adding a dummy uint8_t element to ensure 4-byte alignment). In the SoA case, however, the planes are byte-aligned, which can be more convenient for certain algorithms/architectures.
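A sketch of the padding idea (the struct and field names are illustrative):

struct __align__(4) PixelRGBX {
    uint8_t r, g, b;
    uint8_t pad;  // dummy element, as suggested above
};
static_assert(sizeof(PixelRGBX) == 4, "one 32-bit word per pixel");

CUDA's built-in uchar4 type gives essentially this layout with the 4-byte alignment already specified.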

For most image-processing-type applications the AoS scenario is much more common, but for other applications, or for specific image processing tasks, this may not always be the case. When there is no obvious choice, I would recommend AoS as the default.

See also this answer for a more general discussion of AoS versus SoA.


Jac*_*ern 7

I just want to provide a simple example showing how a Struct of Arrays (SoA) performs better than an Array of Structs (AoS).

In the example, I am considering three different versions of the same code:

  1. AoS (v1)
  2. straight arrays (v2)
  3. SoA (v3)

In particular, version 2 considers the use of straight arrays. The timings of versions 2 and 3 are the same for this example, and both turn out better than version 1. I suspect that, in general, straight arrays could be preferable, although at the expense of readability, since, for example, loading from the unified cache could be enabled through const __restrict__ for this case.
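To illustrate that last point, here is a hedged sketch (this variant, with separate non-aliased input and output pointers, is not part of the timed code below):

// Illustrative variant of the v2 kernel: const __restrict__ on the input
// pointers allows the compiler to route the loads through the read-only
// (unified texture) cache, e.g. LDG instructions on Maxwell.
__global__ void AoSvsSoA_v2_ldg(const unsigned int * __restrict__ d_in1,
                                const unsigned int * __restrict__ d_in2,
                                unsigned int * __restrict__ d_out1,
                                unsigned int * __restrict__ d_out2,
                                const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {
        d_out1[tid] = d_in1[tid] + 10;
        d_out2[tid] = d_in2[tid] + 10;
    }

}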

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <thrust/device_vector.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCKSIZE   1024

/******************************************/
/* CELL STRUCT LEADING TO ARRAY OF STRUCT */
/******************************************/
struct cellAoS {

    unsigned int    x1;
    unsigned int    x2;
    unsigned int    code;
    bool            done;

};

/*******************************************/
/* CELL STRUCT LEADING TO STRUCT OF ARRAYS */
/*******************************************/
struct cellSoA {

    unsigned int    *x1;
    unsigned int    *x2;
    unsigned int    *code;
    bool            *done;

};


/*******************************************/
/* KERNEL MANIPULATING THE ARRAY OF STRUCT */
/*******************************************/
__global__ void AoSvsSoA_v1(cellAoS *d_cells, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {
        cellAoS tempCell = d_cells[tid];

        tempCell.x1 = tempCell.x1 + 10;
        tempCell.x2 = tempCell.x2 + 10;

        d_cells[tid] = tempCell;
    }

}

/******************************/
/* KERNEL MANIPULATING ARRAYS */
/******************************/
__global__ void AoSvsSoA_v2(unsigned int * __restrict__ d_x1, unsigned int * __restrict__ d_x2, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        d_x1[tid] = d_x1[tid] + 10;
        d_x2[tid] = d_x2[tid] + 10;

    }

}

/********************************************/
/* KERNEL MANIPULATING THE STRUCT OF ARRAYS */
/********************************************/
__global__ void AoSvsSoA_v3(cellSoA cell, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        cell.x1[tid] = cell.x1[tid] + 10;
        cell.x2[tid] = cell.x2[tid] + 10;

    }

}

/********/
/* MAIN */
/********/
int main() {

    const int N = 2048 * 2048 * 4;

    TimingGPU timerGPU;

    thrust::host_vector<cellAoS>    h_cells(N);
    thrust::device_vector<cellAoS>  d_cells(N);

    thrust::host_vector<unsigned int>   h_x1(N);
    thrust::host_vector<unsigned int>   h_x2(N);

    thrust::device_vector<unsigned int> d_x1(N);
    thrust::device_vector<unsigned int> d_x2(N);

    for (int k = 0; k < N; k++) {

        h_cells[k].x1 = k + 1;
        h_cells[k].x2 = k + 2;
        h_cells[k].code = k + 3;
        h_cells[k].done = true;

        h_x1[k] = k + 1;
        h_x2[k] = k + 2;

    }

    d_cells = h_cells;

    d_x1 = h_x1;
    d_x2 = h_x2;

    cellSoA cell;
    cell.x1 = thrust::raw_pointer_cast(d_x1.data());
    cell.x2 = thrust::raw_pointer_cast(d_x2.data());
    cell.code = NULL;
    cell.done = NULL;

    timerGPU.StartCounter();
    AoSvsSoA_v1<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_cells.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing AoSvsSoA_v1 = %f\n", timerGPU.GetCounter());

    // --- v2 is left commented out because it updates the same d_x1/d_x2
    //     arrays as v3; run it on its own (with v3 disabled) so that the
    //     result checks below still hold.
    //timerGPU.StartCounter();
    //AoSvsSoA_v2<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_x1.data()), thrust::raw_pointer_cast(d_x2.data()), N);
    //gpuErrchk(cudaPeekAtLastError());
    //gpuErrchk(cudaDeviceSynchronize());
    //printf("Timing AoSvsSoA_v2 = %f\n", timerGPU.GetCounter());

    timerGPU.StartCounter();
    AoSvsSoA_v3<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(cell, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing AoSvsSoA_v3 = %f\n", timerGPU.GetCounter());

    h_cells = d_cells;

    h_x1 = d_x1;
    h_x2 = d_x2;

    // --- Check results
    for (int k = 0; k < N; k++) {
        if (h_x1[k] != k + 11) {
            printf("h_x1[%i] = %u, expected %i\n", k, h_x1[k], k + 11);
            break;
        }
        if (h_x2[k] != k + 12) {
            printf("h_x2[%i] = %u, expected %i\n", k, h_x2[k], k + 12);
            break;
        }
        if (h_cells[k].x1 != k + 11) {
            printf("h_cells[%i].x1 = %u, expected %i\n", k, h_cells[k].x1, k + 11);
            break;
        }
        if (h_cells[k].x2 != k + 12) {
            printf("h_cells[%i].x2 = %u, expected %i\n", k, h_cells[k].x2, k + 12);
            break;
        }
    }

}

These are the timings (runs performed on a GTX960):

Array of struct        9.1ms (v1 kernel)
Struct of arrays       3.3ms (v3 kernel)
Straight arrays        3.2ms (v2 kernel)