cudaMallocPitch和cudaMemcpy2D

Question

cudaMallocPitch和cudaMemcpy2D

将C++ 2D数组转换为CUDA 1D数组时出错.让我展示一下我的源代码.

int main(void)
{
      float h_arr[1024][256];
      float *d_arr;

      // --- Some codes to populate h_arr

      // --- cudaMallocPitch
      size_t pitch;
      cudaMallocPitch((void**)&d_arr, &pitch, 256, 1024);

      // --- Copy array to device
      cudaMemcpy2D(d_arr, pitch, h_arr, 256, 256, 1024, cudaMemcpyHostToDevice);
}

Run Code Online (Sandbox Code Playgroud)

我试图运行代码,但它弹出一个错误.

如何使用cudaMallocPitch()和cudaMemcpy2D()是否正确？

Answer 1

Jac*_*ern 6

Talonmies 已经满意地回答了这个问题。在这里，一些可能对社区有用的进一步解释。

在 CUDA 中访问二维数组时，如果每一行都正确对齐，内存事务会快得多。

CUDA 提供了cudaMallocPitch用额外字节“填充”2D 矩阵行的功能，以实现所需的对齐。请参阅“CUDA C 编程指南”，第 3.2.2 和 5.3.2 节，了解更多信息。

假设我们要分配一个二维填充的浮点（单精度）元素数组，语法cudaMallocPitch如下：

cudaMallocPitch(&devPtr, &devPitch, Ncols * sizeof(float), Nrows);

Run Code Online (Sandbox Code Playgroud)

在哪里

devPtr是指向 float ( float *devPtr)的输出指针。
devPitch是一个size_t输出变量，表示填充行的长度（以字节为单位）。
Nrows和Ncols是size_t表示矩阵大小的输入变量。

回想一下 C/C++ 和 CUDA 按行存储 2D 矩阵，cudaMallocPitch将分配一个大小（以字节为单位）的内存空间，等于Nrows * pitch。但是，只有Ncols * sizeof(float)每行的第一个字节将包含矩阵数据。因此，cudaMallocPitch消耗比 2D 矩阵存储严格必需的更多的内存，但这会在更高效的内存访问中返回。CUDA 还提供了cudaMemcpy2D将数据从/到主机内存空间复制到/从设备内存空间分配的功能cudaMallocPitch。在上述假设下（单精度二维矩阵），语法如下：

cudaMemcpy2D(devPtr, devPitch, hostPtr, hostPitch, Ncols * sizeof(float), Nrows, cudaMemcpyHostToDevice)

Run Code Online (Sandbox Code Playgroud)

在哪里

devPtr和hostPtr是浮点（float *devPtr和float *hostPtr）的输入指针，分别指向（源）设备和（目标）主机内存空间；
devPitch和hostPitch是size_t输入变量，分别表示设备和主机内存空间的填充行的长度（以字节为单位）；
Nrows和Ncols是size_t表示矩阵大小的输入变量。

请注意，这cudaMemcpy2D也允许在主机端进行倾斜内存分配。如果主机内存没有音调，则hostPtr = Ncols * sizeof(float). 此外，cudaMemcpy2D是双向的。对于上面的示例，我们将数据从主机复制到设备。如果我们想将数据从设备复制到主机，则上面的行更改为

cudaMemcpy2D(hostPtr, hostPitch, devPtr, devPitch, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost)

Run Code Online (Sandbox Code Playgroud)

cudaMallocPitch可以按以下示例执行对分配的二维矩阵元素的访问：

int    tidx = blockIdx.x*blockDim.x + threadIdx.x;
int    tidy = blockIdx.y*blockDim.y + threadIdx.y;

if ((tidx < Ncols) && (tidy < Nrows))
{
    float *row_a = (float *)((char*)devPtr + tidy * pitch);
    row_a[tidx] = row_a[tidx] * tidx * tidy;
}

Run Code Online (Sandbox Code Playgroud)

在这样的示例，tidx并且tidy被用作列和行索引，分别为（记住，在CUDA，x-threads跨越列和y-threads跨越行青睐聚结）。指向行首元素的指针是通过将初始指针偏移以字节为单位devPtr的行长度tidy * pitch（char *是指向字节的指针，sizeof(char)是1字节）来计算的，其中每行的长度是通过使用间距信息计算的。

下面，我将提供一个完整的示例来展示这些概念。

#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<conio.h>

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16

#define Nrows 3
#define Ncols 5

/*****************/
/* CUDA MEMCHECK */
/*****************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int hostPtr, int b){ return ((hostPtr % b) != 0) ? (hostPtr / b + 1) : (hostPtr / b); }

/******************/
/* TEST KERNEL 2D */
/******************/
__global__ void test_kernel_2D(float *devPtr, size_t pitch)
{
    int    tidx = blockIdx.x*blockDim.x + threadIdx.x;
    int    tidy = blockIdx.y*blockDim.y + threadIdx.y;

    if ((tidx < Ncols) && (tidy < Nrows))
    {
        float *row_a = (float *)((char*)devPtr + tidy * pitch);
        row_a[tidx] = row_a[tidx] * tidx * tidy;
    }
}

/********/
/* MAIN */
/********/
int main()
{
    float hostPtr[Nrows][Ncols];
    float *devPtr;
    size_t pitch;

    for (int i = 0; i < Nrows; i++)
        for (int j = 0; j < Ncols; j++) {
            hostPtr[i][j] = 1.f;
            //printf("row %i column %i value %f \n", i, j, hostPtr[i][j]);
        }

    // --- 2D pitched allocation and host->device memcopy
    gpuErrchk(cudaMallocPitch(&devPtr, &pitch, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMemcpy2D(devPtr, pitch, hostPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));

    dim3 gridSize(iDivUp(Ncols, BLOCKSIZE_x), iDivUp(Nrows, BLOCKSIZE_y));
    dim3 blockSize(BLOCKSIZE_y, BLOCKSIZE_x);

    test_kernel_2D << <gridSize, blockSize >> >(devPtr, pitch);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy2D(hostPtr, Ncols * sizeof(float), devPtr, pitch, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nrows; i++) 
        for (int j = 0; j < Ncols; j++) 
            printf("row %i column %i value %f \n", i, j, hostPtr[i][j]);

    return 0;    
}

Run Code Online (Sandbox Code Playgroud)

Answer 2

tal*_*ies 5

cudaMallocPitch你写的电话看起来不错,但是这个:

  cudaMemcpy2D(d_arr, pitch, h_arr, 256, 256, 1024, cudaMemcpyHostToDevice);

Run Code Online (Sandbox Code Playgroud)

是不正确的.引用文档

将src指向的内存区域的矩阵(每行宽度字节的高度行)复制到dst指向的内存区域,其中kind是cudaMemcpyHostToHost,cudaMemcpyHostToDevice,cudaMemcpyDeviceToHost或cudaMemcpyDeviceToDevice之一,并指定副本的方向.dpitch和spitch是dst和src指向的2D数组的内存宽度,以字节为单位,包括添加到每行末尾的任何填充.存储区域可能不重叠.宽度不得超过dpitch或spitch.使用与副本方向不匹配的dst和src指针调用cudaMemcpy2D()会导致未定义的行为.如果dpitch或spitch超过允许的最大值,cudaMemcpy2D()将返回错误.

因此,要复制的源间距和宽度必须以字节为单位指定.你的主机矩阵有一个sizeof(float) * 256字节,因为你要复制的源间距和源宽度是相同的,这意味着你的cudaMemcpy2D调用应该是这样的:

 cudaMemcpy2D(d_arr, pitch, h_arr, 256*sizeof(float), 
                256*sizeof(float), 1024, cudaMemcpyHostToDevice);

Run Code Online (Sandbox Code Playgroud)

归档时间：	9 年，8 月前
查看次数：	3235 次
最近记录：	6 年，9 月前