OpenCL:一个内核可以调用另一个内核吗?

Ash*_*win 4 opencl

嗨,
我正在尝试在OpenCL中运行可用的卷积代码.
我有异构系统 -
1)CPU
2)GPU
以下是在我的系统上运行的代码:

convolution.cl

// 2-D convolution kernel: each work-item produces exactly one output element.
//
//   input      - source signal, read as a row-major array with `inputWidth`
//                elements per row.
//   mask       - square convolution mask, read as a row-major array of
//                maskWidth x maskWidth elements.
//   output     - destination signal; the element for work-item (x, y) is
//                stored at y * get_global_size(0) + x, i.e. the output row
//                length equals the global size in dimension 0.
//   inputWidth - number of elements per row of `input`.
//   maskWidth  - side length of the square mask.
//
// The kernel reads get_global_id(1), so it is meant to be enqueued over a
// 2-D range. No bounds checks are performed: the host must size the buffers
// so that every index computed below is valid.
__kernel
void convolve(
    const __global uint * const input,
    __constant uint     * const mask,
    __global uint       * const output,
    const int                   inputWidth,
    const int                   maskWidth)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    uint acc = 0;

    for (int maskRow = 0; maskRow < maskWidth; ++maskRow)
    {
        // Offsets of the first element of the current mask row and of the
        // matching row of the input window anchored at (col, row).
        const int maskRowBase  = maskRow * maskWidth;
        const int inputRowBase = (row + maskRow) * inputWidth + col;

        for (int maskCol = 0; maskCol < maskWidth; ++maskCol)
        {
            acc += mask[maskRowBase + maskCol] * input[inputRowBase + maskCol];
        }
    }

    output[row * get_global_size(0) + col] = acc;
}
Run Code Online (Sandbox Code Playgroud)

和convolution.cpp -

// Convolution: applies a 3x3 mask to an 8x8 input signal, producing a 6x6 output signal.

    #include "CL/cl.h"
    #include "vector"
    #include "iostream"
    #include "time.h"

    #include <fstream>
    #include <sstream>
    #include <string>

using namespace std;

// Problem dimensions and host-side data for the convolution example.
//
// All 2-D arrays are declared [height][width] (row-major, one row per
// initializer line) so that their memory layout matches the
// `row * width + column` indexing used by the kernel.

// Input signal dimensions.
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;

// Input signal, indexed as inputSignal[row][column].
cl_uint inputSignal[inputSignalHeight][inputSignalWidth] =
{
    {3, 1, 1, 4, 8, 2, 1, 3},
    {4, 2, 1, 1, 2, 1, 2, 3},
    {4, 4, 4, 4, 3, 2, 2, 2},
    {9, 8, 3, 8, 9, 0, 0, 0},
    {9, 3, 3, 9, 0, 0, 0, 0},
    {0, 9, 0, 8, 0, 0, 0, 0},
    {3, 0, 8, 8, 9, 4, 4, 4},
    {5, 9, 8, 1, 8, 1, 1, 1}
};

// Convolution mask dimensions.
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;

// Convolution mask, indexed as mask[row][column].
cl_uint mask[maskHeight][maskWidth] =
{
    {1, 1, 1},
    {1, 0, 1},
    {1, 1, 1}
};

// The mask is only applied where it fits entirely inside the input, so the
// output shrinks by (mask size - 1) in each dimension: 8 - 3 + 1 = 6.
const unsigned int outputSignalWidth = inputSignalWidth - maskWidth + 1;
const unsigned int outputSignalHeight = inputSignalHeight - maskHeight + 1;

// Result read back from the device, indexed as outputSignal[row][column].
cl_uint outputSignal[outputSignalHeight][outputSignalWidth];

// Terminates the program when an OpenCL status code signals failure.
//
//   err  - status code to inspect; CL_SUCCESS means "nothing to do".
//   name - label of the failed operation, echoed in the diagnostic.
//
// On failure, writes "ERROR: <name> (<err>)" to std::cerr and exits with
// EXIT_FAILURE; on success it returns without side effects.
inline void checkErr(cl_int err, const char * name)
{
    if (err == CL_SUCCESS)
    {
        return;
    }

    std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;
    exit(EXIT_FAILURE);
}

// Error-notification callback registered with clCreateContext.
//
//   errInfo      - error description supplied by the OpenCL runtime; printed.
//   private_info - unused.
//   cb           - unused.
//   user_data    - unused.
//
// Prints the error description to std::cout and terminates the program with
// EXIT_FAILURE; it never returns to the caller.
void CL_CALLBACK contextCallback(
    const char * errInfo,
    const void * private_info,
    size_t cb,
    void * user_data)
{
    // Only the message is of interest; silence unused-parameter warnings.
    (void)private_info;
    (void)cb;
    (void)user_data;

    std::cout << "Error occurred during context use: ";
    std::cout << errInfo;
    std::cout << std::endl;

    exit(EXIT_FAILURE);
}

int main(int argc,char argv[]){
    cl_int errNum;

    cl_uint numPlatforms;
    cl_uint numDevices;

    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;

    cl_context context = NULL;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    cl_mem inputSignalBuffer;
    cl_mem outputSignalBuffer;
    cl_mem maskBuffer;

    double start,end,Totaltime;//Timer variables

    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);

    checkErr(
        (errNum != CL_SUCCESS) ? errNum :
        (numPlatforms <= 0 ? -1 : CL_SUCCESS),
        "clGetPlatformIDs");

    platformIDs = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);

    checkErr(
        (errNum != CL_SUCCESS) ? errNum :
        (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

    deviceIDs = NULL;

    cl_uint i;

    for (i = 0; i < numPlatforms; i++)
    {
        errNum = clGetDeviceIDs(
            platformIDs[i],
            CL_DEVICE_TYPE_GPU,
            0,
            NULL,
            &numDevices);
        if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
        {
            checkErr(errNum, "clGetDeviceIDs");
        }
        else if (numDevices > 0)
        {
            deviceIDs = (cl_device_id *)malloc(
                sizeof(cl_device_id) * numDevices);

            errNum = clGetDeviceIDs(
                platformIDs[i], 
                CL_DEVICE_TYPE_GPU, 
                numDevices,
                &deviceIDs[0], 
                NULL);

            checkErr(errNum, "clGetDeviceIDs");

            break;
        }
    }
    if (deviceIDs == NULL) {
        std::cout << "No CPU device found" << std::endl;
        exit(-1);
    }
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,(cl_context_properties)platformIDs[i], 0
    };

    context = clCreateContext(
        contextProperties, numDevices, deviceIDs,
        &contextCallback, NULL, &errNum);

    checkErr(errNum, "clCreateContext");

    std::ifstream srcFile("convolution.cl");

    checkErr(srcFile.is_open() ? CL_SUCCESS : -1,
        "reading convolution.cl");

    std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));

    const char * src = srcProg.c_str();
    size_t length = srcProg.length();

    program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);

    checkErr(errNum, "clCreateProgramWithSource");

    errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);

    checkErr(errNum, "clBuildProgram");

    kernel = clCreateKernel(program, "convolve", &errNum);

    checkErr(errNum, "clCreateKernel");

    inputSignalBuffer = clCreateBuffer(
        context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
        static_cast<void *>(inputSignal), &errNum);

    checkErr(errNum, "clCreateBuffer(inputSignal)");    

    maskBuffer = clCreateBuffer(
        context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_uint) * maskHeight * maskWidth,
        static_cast<void *>(mask), &errNum);

    checkErr(errNum, "clCreateBuffer(mask)");

    outputSignalBuffer = clCreateBuffer(
        context, CL_MEM_WRITE_ONLY,
        sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
        NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(outputSignal)");

    queue = clCreateCommandQueue(
        context, deviceIDs[0], 0, &errNum);
    checkErr(errNum, "clCreateCommandQueue");

    errNum = clSetKernelArg(
        kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
    errNum |= clSetKernelArg(
        kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(
        kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
    errNum |= clSetKernelArg(
        kernel, 3, sizeof(cl_uint), &inputSignalWidth);
    errNum |= clSetKernelArg(
        kernel, 4, sizeof(cl_uint), &maskWidth);

    checkErr(errNum, "clSetKernelArg");

    const size_t globalWorkSize[1] ={ outputSignalWidth * outputSignalHeight };
    const size_t localWorkSize[1] = { 1 };

    start = clock();

    errNum = clEnqueueNDRangeKernel(
                                    queue,
                                    kernel,
                                    1,
                                    NULL,
                                    globalWorkSize,
                                    localWorkSize,
                                    0,
                                    NULL,
                                    NULL
                                    );

    checkErr(errNum, "clEnqueueNDRangeKernel");

    errNum = clEnqueueReadBuffer(
        queue, outputSignalBuffer, CL_TRUE, 0,
        sizeof(cl_uint) * outputSignalHeight * outputSignalHeight,
        outputSignal, 0, NULL, NULL);

    checkErr(errNum, "clEnqueueReadBuffer");

    end= clock(); - start;
    cout<<"Time in ms = "<<((end/CLOCKS_PER_SEC) * 1000) << endl;

    for (int y = 0; y < outputSignalHeight; y++)
    {
        for (int x = 0; x < outputSignalWidth; x++)
        {
            std::cout << outputSignal[x][y] << " ";
        }
        std::cout << std::endl;
    }

    return 0;
}
Run Code Online (Sandbox Code Playgroud)

问题:我有疑问 -
1)当我使用设备类型CL_DEVICE_TYPE_GPU时,执行时间为267 ms;当我使用CL_DEVICE_TYPE_CPU时,执行时间变为467 ms.我想知道:在没有GPU的CPU上运行卷积代码,与在带有GPU的CPU上(通过选择设备类型CL_DEVICE_TYPE_CPU)运行卷积代码,两者之间有什么区别?
2)可以看到,convolution.cl文件中有一个执行3次的for循环.我能否从现有的内核文件中调用另一个内核来完成这部分工作?

我问这个问题,因为我是OpenCL编码的新手,想知道那件事.

Rom*_*yan 5

  1. CPU和GPU都是OpenCL设备.因此,选择CL_DEVICE_TYPE_CPU时,您是在告诉OpenCL运行时把内核代码编译成CPU汇编代码并在CPU上运行;选择CL_DEVICE_TYPE_GPU时,内核代码会被编译成GPU汇编代码并在您的显卡上执行.无需重写源代码即可更改设备类型,这是OpenCL的主要特性之一.无论您的CPU是否集成了GPU,和/或是否安装了独立GPU,都没有关系:您只需选择一个可用的设备并在其上运行内核即可.

  2. 对于OpenCL 1.2及更早版本,您无法从内核调用内核.动态并行性在OpenCL 2.0中实现.