我想在CUDA上实现Inter-block障碍,但遇到了严重的问题.
我无法弄清楚为什么它不起作用.
#include <iostream>
#include <cstdlib>
#include <ctime>
#define SIZE 10000000
#define BLOCKS 100
using namespace std;
struct Barrier {
int *count;
__device__ void wait() {
atomicSub(count, 1);
while(*count)
;
}
Barrier() {
int blocks = BLOCKS;
cudaMalloc((void**) &count, sizeof(int));
cudaMemcpy(count, &blocks, sizeof(int), cudaMemcpyHostToDevice);
}
~Barrier() {
cudaFree(count);
}
};
__global__ void sum(int* vec, int* cache, int *sum, Barrier barrier)
{
int tid = blockIdx.x;
int temp = 0;
while(tid < SIZE) {
temp += vec[tid];
tid += gridDim.x;
} …Run Code Online (Sandbox Code Playgroud)