GPGPU OpenCL编程步骤与简单实例

副标题#e#

1.OpenCL概念

OpenCL是一个为异构平台编写程序的框架，此异构平台可由CPU、GPU或其他类型的处理器组成。 OpenCL由一门用于编写kernels （在OpenCL设备上运行的函数）的语言（基于C99）和一组用于定义并控制平台的API组成。

OpenCL提供了两种层面的并行机制：任务并行与数据并行。

2.OpenCL与CUDA的区别

不同点：OpenCL是通用的异构平台编程语言，为了兼顾不同设备，使用繁琐。

CUDA是nvidia公司发明的专门在其GPGPU上编程的框架，使用简单，好入门。

相同点：都是基于任务并行与数据并行。

3.OpenCL的编程步骤

（1）Discover and initialize the platforms

调用两次clGetPlatformIDs函数，第一次获取可用的平台数量，第二次获取一个可用的平台。

（2）Discover and initialize the devices

调用两次clGetDeviceIDs函数，第一次获取可用的设备数量，第二次获取一个可用的设备。

（3）Create a context(调用clCreateContext函数）

上下文context可能会管理多个设备device。

（4）Create a command queue(调用clCreateCommandQueue函数）

一个设备device对应一个command queue。

上下文context将命令发送到设备对应的command queue，设备就可以执行命令队列里的命令。

（5）Create device buffers(调用clCreateBuffer函数）

Buffer中保存的是数据对象，就是设备执行程序需要的数据保存在其中。

Buffer由上下文context创建，这样上下文管理的多个设备就会共享Buffer中的数据。

（6）Write host data to device buffers(调用clEnqueueWriteBuffer函数）

（7）Create and compile the program

创建程序对象，程序对象就代表你的程序源文件或者二进制代码数据。

（8）Create the kernel(调用clCreateKernel函数）

根据你的程序对象，生成kernel对象，表示设备程序的入口。

（9）Set the kernel arguments(调用clSetKernelArg函数）

（10）Configure the work-item structure(设置worksize）

设置work-item的组织形式（维数，group组成等）

（11）Enqueue the kernel for execution(调用clEnqueueNDRangeKernel函数）

将kernel对象，以及 work-item参数放入命令队列中进行执行。

（12）Read the output buffer back to the host(调用clEnqueueReadBuffer函数）

（13）Release OpenCL resources（至此结束整个运行过程）

#p#副标题#e#

4.说明

OpenCL中的核函数通常单独放在一个.cl文件中（本例即如此），也可以以字符串形式内嵌在主机代码中。

OpenCL的编程一般步骤就是上面的13步，太长了，以至于要想做个向量加法都是那么困难。

不过上面的步骤前3步一般是固定的，可以单独写在一个.h/.cpp文件中，其他的一般也不会有什么大的变化。

5.程序实例，向量运算

5.1通用前3个步骤，生成一个文件

tool.h

#ifndef TOOLH
#define TOOLH
    
#include <CL/cl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
using namespace std;
    
/**
 * Read the whole file `filename` (opened in binary mode) into `s`.
 * Returns 0 on success and -1 when the file cannot be opened.
 */
int convertToString(const char *filename, std::string& s);
    
/**
 * Query the available OpenCL platforms and store the first one in `platform`.
 * `platform` is reset to NULL before the query; returns -1 when the query
 * fails or no platform is reported.
 */
int getPlatform(cl_platform_id &platform);
    
/**
 * Step 2: query `platform` for its GPU devices.
 * Returns a malloc()-allocated array of device ids (the caller releases it
 * with free()), or NULL when the platform reports no GPU device.
 */
cl_device_id *getCl_device_id(cl_platform_id &platform);
    
#endif

tool.cpp

#include <CL/cl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include "tool.h"
using namespace std;
    
/**
 * Read an entire file (typically an OpenCL kernel source) into a string.
 *
 * The file is opened in binary mode so its bytes are copied verbatim,
 * including any embedded NUL bytes.
 *
 * @param filename Path of the file to read; a null pointer is treated as
 *                 a file that cannot be opened.
 * @param s        Receives the file contents on success; left untouched on
 *                 failure.
 * @return 0 on success, -1 if the file cannot be opened or read.
 */
int convertToString(const char *filename, std::string& s)
{
    std::ifstream f;
    if (filename != NULL)
        f.open(filename, std::ios::in | std::ios::binary);

    if (!f.is_open())
    {
        std::cout << "Error: failed to open file\n:"
                  << (filename != NULL ? filename : "(null)") << std::endl;
        return -1;
    }

    // Determine the size by seeking to the end; tellg() reports -1 on failure,
    // which must not be turned into a huge unsigned allocation size.
    f.seekg(0, std::ios::end);
    const std::streamoff length = f.tellg();
    if (length < 0)
        return -1;
    f.seekg(0, std::ios::beg);

    // Read straight into a std::string: no manual new[]/delete[] and the
    // explicit length keeps bytes that follow an embedded '\0'.
    std::string contents(static_cast<std::string::size_type>(length), '\0');
    if (length > 0 &&
        !f.read(&contents[0], static_cast<std::streamsize>(length)))
        return -1;

    s.swap(contents);
    return 0;
}
    
/**
 * Select the first available OpenCL platform.
 *
 * clGetPlatformIDs is called twice: once to learn how many platforms exist
 * and once to fetch their ids.
 *
 * @param platform Receives the first platform id on success; it is set to
 *                 NULL on every failure path.
 * @return 0 on success, -1 if the query fails, no platform is reported, or
 *         the temporary id array cannot be allocated.
 */
int getPlatform(cl_platform_id &platform)
{
    platform = NULL; // the chosen platform

    // First call: only ask for the number of platforms.
    cl_uint numPlatforms = 0;
    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (status != CL_SUCCESS)
    {
        std::cout << "Error: Getting platforms!" << std::endl;
        return -1;
    }
    if (numPlatforms == 0)
        return -1;

    cl_platform_id *platforms = static_cast<cl_platform_id *>(
        malloc(numPlatforms * sizeof(cl_platform_id)));
    if (platforms == NULL)
        return -1;

    // Second call: fetch the ids; for clarity the first one is chosen.
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    if (status == CL_SUCCESS)
        platform = platforms[0];
    free(platforms);

    if (status != CL_SUCCESS)
    {
        std::cout << "Error: Getting platforms!" << std::endl;
        return -1;
    }
    // The original fell off the end here, leaving the success return value
    // undefined.
    return 0;
}
    
/**
 * Step 2: query the platform for its GPU devices.
 *
 * clGetDeviceIDs is called twice: once to count the GPU devices and once to
 * fetch their ids.
 *
 * @param platform Platform to query.
 * @return A malloc()-allocated array holding the ids of all GPU devices of
 *         the platform (the caller owns it and must release it with free()),
 *         or NULL if there is no GPU device, the query fails, or the
 *         allocation fails.
 */
cl_device_id *getCl_device_id(cl_platform_id &platform)
{
    // First call: count the GPU devices. The count is an output parameter
    // and must be passed by address.
    cl_uint numDevices = 0;
    cl_int status =
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
    if (status != CL_SUCCESS || numDevices == 0)
        return NULL; // no GPU available

    cl_device_id *devices = static_cast<cl_device_id *>(
        malloc(numDevices * sizeof(cl_device_id)));
    if (devices == NULL)
        return NULL;

    // Second call: fetch the ids. Never hand back an unfilled array.
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
    if (status != CL_SUCCESS)
    {
        free(devices);
        return NULL;
    }
    return devices;
}

查察本栏目

#p#副标题#e#

5.2核函数文件

HelloWorld_Kernel.cl

/* double is an optional feature: OpenCL 1.0/1.1 compilers reject it unless
 * the cl_khr_fp64 extension is enabled explicitly. */
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/**
 * Element-wise kernel: out[i] = in[i] / 2.4 * (in[i] / 6).
 *
 * Each work-item handles the element selected by its global id in
 * dimension 0; there is no bounds check, so the global work size must not
 * exceed the number of elements in either buffer.
 */
__kernel void helloworld(__global double* in, __global double* out)
{
    /* get_global_id returns size_t; keep that type instead of narrowing to int. */
    size_t num = get_global_id(0);
    out[num] = in[num] / 2.4 * (in[num] / 6);
}

5.3主函数文件

#p#分页标题#e#

HelloWorld.cpp

//For clarity,error checking has been omitted.
#include <CL/cl.h>
#include "tool.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
using namespace std;
    
int main(int argc, char* argv[])
{
    cl_int    status;
    /**Step 1: Getting platforms and choose an available one(first).*/
    cl_platform_id platform;
    getPlatform(platform);
    
    /**Step 2:Query the platform and choose the first GPU device if has one.*/
    cl_device_id *devices=getCl_device_id(platform);
    
    /**Step 3: Create context.*/
    cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);
    
    /**Step 4: Creating command queue associate with the context.*/
    cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
    
    /**Step 5: Create program object */
    const char *filename = "HelloWorld_Kernel.cl";
    string sourceStr;
    status = convertToString(filename, sourceStr);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = {strlen(source)};
    cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
    
    /**Step 6: Build program. */
    status=clBuildProgram(program, 1,devices,NULL,NULL,NULL);
    
    /**Step 7: Initial input,output for the host and create memory objects for the kernel*/
    const int NUM=512000;
    double* input = new double[NUM];
    for(int i=0;i<NUM;i++)
        input[i]=i;
    double* output = new double[NUM];
    
    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (NUM) * sizeof(double),(void *) input, NULL);
    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , NUM * sizeof(double), NULL, NULL);
    
    /**Step 8: Create kernel object */
    cl_kernel kernel = clCreateKernel(program,"helloworld", NULL);
    
    /**Step 9: Sets Kernel arguments.*/
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
    
    /**Step 10: Running the kernel.*/
    size_t global_work_size[1] = {NUM};
    cl_event enentPoint;
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &enentPoint);
    clWaitForEvents(1,&enentPoint); ///wait
    clReleaseEvent(enentPoint);
    
    /**Step 11: Read the cout put back to host memory.*/
    status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, NUM * sizeof(double), output, 0, NULL, NULL);
    cout<<output[NUM-1]<<endl;
    
    /**Step 12: Clean the resources.*/
    status = clReleaseKernel(kernel);//*Release kernel.
    status = clReleaseProgram(program);    //Release the program object.
    status = clReleaseMemObject(inputBuffer);//Release mem object.
    status = clReleaseMemObject(outputBuffer);
    status = clReleaseCommandQueue(commandQueue);//Release  Command queue.
    status = clReleaseContext(context);//Release context.
    
    if (output != NULL)
    {
        free(output);
        output = NULL;
    }
    
    if (devices != NULL)
    {
        free(devices);
        devices = NULL;
    }
    return 0;
}

编译、链接、执行：

g++ -I /opt/AMDAPP/include/ -o A *.cpp -lOpenCL ; ./A

关键字：