副标题#e#
Reduction操作:规约操作就是由多个数生成一个数,如求最大值、最小值、向量点积、求和等操作,都属于这一类操作。
在有大量数据的情况下,使用GPU进行任务并行与数据并行,可以收到很好的效果。
group同步:OpenCL只提供了工作组内各线程之间的同步机制,并没有提供所有线程之间的同步。提供组内work-item同步的方法:
void barrier (cl_mem_fence_flags flags)
参数说明:cl_mem_fence_flags 可以取CLK_LOCAL_MEM_FENCE、CLK_GLOBAL_MEM_FENCE
函数说明:(1)一个work-group中所有work-item遇到barrier方法,都要等待其他work-item也到达该语句,才能执行后面的程序;
(2)还可以保证组内的work-item对local或global memory的读写操作按顺序完成。
如下图中每个大框表示任务并行,对应每个group线程;框中的计算是数据并行,对应每个work-item线程:
作为练习,给出一个完整的使用OpenCL计算整数序列求和的程序,在数据并行中使用Local Memory加速,group组内并行同步使用CLK_LOCAL_MEM_FENCE。
程序实例(整数序列求和):
1.核函数(Own_Reduction_Kernels.cl):
/* Capacity of the per-work-group scratch array used by reduce(). The
 * work-group size chosen at enqueue time must not exceed this value,
 * otherwise work-items would write past the end of the array. */
#define REDUCE_MAX_LOCAL_SIZE 64

/**
 * Sum a buffer of unsigned integers four lanes at a time and write one
 * uint4 partial sum per work-group.
 *
 * @param input   Source data, viewed as NUM / 4 uint4 elements.
 * @param output  One uint4 per work-group, indexed by get_group_id(0).
 *                Each lane holds a partial sum; adding every lane of every
 *                element yields the total (modulo 2^32).
 * @param NUM     Number of scalar values in `input`. Only the first
 *                NUM / 4 * 4 values are summed; a non-positive NUM sums
 *                nothing.
 *
 * The work-group size may be any value from 1 to REDUCE_MAX_LOCAL_SIZE; it
 * does not have to be a power of two.
 */
__kernel void reduce(__global uint4* input, __global uint4* output, int NUM)
{
    // Every four scalars are handled as one uint4. Clamp negative counts to
    // zero so the unsigned comparison below cannot turn them into a huge bound.
    const unsigned int vecCount = (NUM > 0) ? ((unsigned int)NUM / 4u) : 0u;

    const unsigned int tid        = get_local_id(0);
    const unsigned int localSize  = get_local_size(0);
    const unsigned int globalSize = get_global_size(0);

    __local uint4 resArray[REDUCE_MAX_LOCAL_SIZE];

    // Each work-item accumulates the elements at its global id, stepping by
    // the total number of work-items.
    uint4 res = (uint4)(0);
    for (unsigned int i = get_global_id(0); i < vecCount; i += globalSize) {
        res += input[i];
    }

    // Publish this work-item's partial sum to local memory.
    resArray[tid] = res;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory. `keep` is the number of slots that
    // survive each round (rounded up), so an odd slot count leaves its middle
    // element in place instead of dropping it. The loop bounds depend only on
    // localSize, so every work-item reaches every barrier.
    for (unsigned int active = localSize; active > 1u; ) {
        const unsigned int keep = (active + 1u) >> 1;
        if (tid + keep < active) {
            resArray[tid] += resArray[tid + keep];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        active = keep;
    }

    // The first work-item writes the group's result to global memory.
    if (tid == 0) {
        output[get_group_id(0)] = resArray[0];
    }
}
#p#副标题#e#
2.tool.h 、tool.cpp
见:http://www.cnblogs.com/xudong-bupt/p/3582780.html
3.Reduction.cpp
#include <CL/cl.h>
#include "tool.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
using namespace std;

/**
 * Verify the partial sums read back from the device.
 *
 * Adds the first groupNUM * 4 values of `res` and compares the total with
 * the closed form 1 + 2 + ... + NUM.
 *
 * @param NUM       Number of input values (the input is 1..NUM).
 * @param groupNUM  Number of work-groups; each contributes 4 values.
 * @param res       Host copy of the output buffer.
 * @return 0 when the sums match, -1 otherwise.
 */
int isVerify(int NUM, int groupNUM, int *res)
{
    // The kernel accumulates in 32-bit unsigned lanes, so compare modulo 2^32.
    // The 64-bit intermediate keeps NUM * (NUM + 1) from overflowing int.
    const unsigned int expected =
        (unsigned int)(((long long)NUM + 1) * NUM / 2);

    unsigned int actual = 0;
    for (int i = 0; i < groupNUM * 4; i++)
        actual += (unsigned int)res[i];

    return (expected == actual) ? 0 : -1;
}

/** Print "RIGHT" when `status` is CL_SUCCESS and "ERROR" otherwise. */
void isStatusOK(cl_int status)
{
    if (status == CL_SUCCESS)
        cout<<"RIGHT"<<endl;
    else
        cout<<"ERROR"<<endl;
}

/**
 * Host driver: builds the "reduce" kernel from Own_Reduction_Kernels.cl,
 * sums the integers 1..NUM on the device and verifies the result.
 */
int main(int argc, char* argv[])
{
    cl_int status;

    /** Step 1: Get the platforms and choose an available one (the first). */
    cl_platform_id platform;
    getPlatform(platform);

    /** Step 2: Query the platform and choose the first GPU device if there is one. */
    cl_device_id *devices = getCl_device_id(platform);

    /** Step 3: Create the context. */
    cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);

    /** Step 4: Create the command queue associated with the context. */
    cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);

    /** Step 5: Create the program object. */
    const char *filename = "Own_Reduction_Kernels.cl";
    string sourceStr;
    status = convertToString(filename, sourceStr);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = {strlen(source)};
    cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);

    /** Step 6: Build the program. */
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);

    /** Step 7: Initialise host input/output and create the memory objects for the kernel. */
    int NUM = 25600;                        // 6400 * 4
    size_t global_work_size[1] = {640};
    size_t local_work_size[1] = {64};
    size_t groupNUM = global_work_size[0] / local_work_size[0];
    const size_t outputCount = groupNUM * 4;   // one uint4 (4 ints) per work-group

    int *input = new int[NUM];
    for (int i = 0; i < NUM; i++)
        input[i] = i + 1;

    // Value-initialised so a failed read-back is verified against zeros
    // rather than indeterminate memory.
    int *output = new int[outputCount]();

    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                        (NUM) * sizeof(int), (void *) input, NULL);
    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                         outputCount * sizeof(int), NULL, NULL);

    /** Step 8: Create the kernel object. */
    cl_kernel kernel = clCreateKernel(program, "reduce", NULL);

    /** Step 9: Set the kernel arguments. */
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
    status = clSetKernelArg(kernel, 2, sizeof(int), &NUM);

    /** Step 10: Run the kernel. */
    cl_event eventPoint;
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                    global_work_size, local_work_size,
                                    0, NULL, &eventPoint);
    // The event is only valid when the enqueue succeeded; waiting on or
    // releasing it otherwise would use an uninitialised handle.
    if (status == CL_SUCCESS) {
        clWaitForEvents(1, &eventPoint);
        clReleaseEvent(eventPoint);
    }
    isStatusOK(status);

    /** Step 11: Read the output back to host memory (blocking read). */
    status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0,
                                 outputCount * sizeof(int), output, 0, NULL, NULL);
    isStatusOK(status);

    if (isVerify(NUM, static_cast<int>(groupNUM), output) == 0)
        cout<<"The result is right!!!"<<endl;
    else
        cout<<"The result is wrong!!!"<<endl;

    /** Step 12: Clean up the resources. */
    status = clReleaseKernel(kernel);              // Release the kernel.
    status = clReleaseProgram(program);            // Release the program object.
    status = clReleaseMemObject(inputBuffer);      // Release the memory objects.
    status = clReleaseMemObject(outputBuffer);
    status = clReleaseCommandQueue(commandQueue);  // Release the command queue.
    status = clReleaseContext(context);            // Release the context.

    // input/output were allocated with new[], so they must be released with
    // delete[] rather than free().
    delete[] input;
    delete[] output;

    // NOTE(review): devices is returned by getCl_device_id() in tool.cpp (not
    // shown here); free() is kept from the original on the assumption that it
    // is malloc-allocated - confirm against tool.cpp.
    free(devices);

    return 0;
}
作者:cnblogs 旭东的博客