我正在使用 CUDA 将两个矩阵相加，并把结果存入第三个矩阵。我希望利用共享内存功能，为此编写了以下代码：
#include <stdio.h>
#include <cuda.h>
#define grid 1024
#define BSZ 16
__global__ void addition(int *dev_a, int *dev_b, int *dev_c)
{
__shared__ int as[BSZ][BSZ];
__shared__ int bs[BSZ][BSZ];
int by = blockIdx.y;
int bx = blockIdx.x;
int cvalue;
int ty = threadIdx.y;
int tx = threadIdx.x;
int row = by * BSZ + ty;
int col = bx * BSZ + tx;
as[ty][tx] = dev_a[row*grid + col];
bs[ty][tx] = dev_b[row*grid + col];
__syncthreads();
cvalue = as[ty][tx] + bs[ty][tx];
__syncthreads();
dev_c[row*grid + col] …Run Code Online (Sandbox Code Playgroud)