下面通过一个向量相加的程序来了解 OpenCL。有 A、B 两个各含 10 个元素的向量,相加后的结果存入向量 C。OpenCL 根据用户提供的工作项数量(global work size),将向量运算分解成多个任务,分发给 DSP 进行并行计算。
OpenCL 程序分为两个部分:一部分是内核代码,负责具体算法;另一部分是主程序,负责初始化 OpenCL 和准备数据。主程序加载内核代码,并按照既定方法进行运算。
kernel代码如下
/*
 * Element-wise vector addition kernel: c[id] = a[id] + b[id].
 *
 * Each work-item handles the single element selected by its global id in
 * dimension 0. There is no bounds check, so the host must launch no more
 * work-items than there are elements in the three buffers.
 *
 * a, b: input vectors in global device memory (read only here).
 * c:    output vector in global device memory.
 */
__kernel void vectoradd(__global const int *a, __global const int *b, __global int *c)
{
  int id = get_global_id(0);
  c[id] = a[id] + b[id];
}
__kernel 指明这是一个 OpenCL 内核函数,__global 说明指针指向的是全局的设备内存空间。
HOST端代码如下
/*
 * Read the entire contents of the file `file_name` into `ocl_string`.
 *
 * file_name:  path of the OpenCL source file to load.
 * ocl_string: caller-provided destination buffer. It must be large enough
 *             to hold the whole file; this function has no way to check the
 *             capacity. The data is NOT NUL-terminated.
 *
 * Returns the number of bytes stored in `ocl_string`, or -1 if the file
 * cannot be opened, its size cannot be determined, or it cannot be read
 * completely. The file is closed on every path.
 */
int get_ocl_string(const char *file_name, char *ocl_string)
{
  FILE *fp;
  long file_length;
  size_t bytes_read;
  int result = -1;

  /* Binary mode: the size reported by ftell then matches what fread delivers
   * (text mode may translate line endings on some platforms). */
  fp = fopen(file_name, "rb");
  if (fp == NULL) {
    return -1;
  }

  if (fseek(fp, 0, SEEK_END) != 0) {
    goto out;
  }
  file_length = ftell(fp);
  if (file_length < 0) {
    goto out;
  }
  if (fseek(fp, 0, SEEK_SET) != 0) {
    goto out;
  }

  /* fread returns a size_t item count and never -1; a short count here
   * means the read failed or the file changed underneath us. */
  bytes_read = fread(ocl_string, 1, (size_t)file_length, fp);
  if (bytes_read != (size_t)file_length) {
    goto out;
  }

  result = (int)bytes_read;

out:
  fclose(fp);
  return result;
}
int main(void)
{
  int array_a[10] = {0, 1, 2, 3, 4, 5, 6, 7,8, 9};
  int array_b[10] = {9, 8, 7, 6, 5, 4, 3, 2,1, 0};
  int array_c[10] = {0, 0, 0, 0, 0, 0, 0, 0,0, 0};
  size_t datasize = 10 * sizeof(int);
  size_t ocl_string_size;
  char *ocl_string;
  ocl_string = (char *)malloc(1024*1024);
  cl_platform_id platform_id;
  cl_device_id device_id;
  cl_context context;
  cl_command_queue command_queue;
  cl_mem buffer_a, buffer_b, buffer_c;
  cl_program program;
  cl_kernel kernel;
 
  clGetPlatformIDs(1, &platform_id,NULL);
  clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_ACCELERATOR, 1, &device_id, NULL);
 
  //创建上下文
  context = clCreateContext(NULL, 1,&device_id, NULL, NULL, NULL);
  command_queue = clCreateCommandQueue(context,device_id, 0, NULL);
 
 //分配内存
  buffer_a = clCreateBuffer(context,CL_MEM_READ_ONLY, datasize, NULL, NULL);
  buffer_b = clCreateBuffer(context,CL_MEM_READ_ONLY, datasize, NULL, NULL);
  buffer_c = clCreateBuffer(context,CL_MEM_READ_ONLY, datasize, NULL, NULL);
 
 //读取核函数,并且上传到DSP端
  ocl_string_size =get_ocl_string("vectoradd.cl", ocl_string);
  clEnqueueWriteBuffer(command_queue,buffer_a, CL_FALSE, 0, datasize, array_a, 0, NULL, NULL);
  clEnqueueWriteBuffer(command_queue,buffer_b, CL_FALSE, 0, datasize, array_b, 0, NULL, NULL);
  program =clCreateProgramWithSource(context, 1, (const char **)&ocl_string,&ocl_string_size, NULL);
  clBuildProgram(program, 1, &device_id,NULL, NULL, NULL);
  kernel = clCreateKernel(program,"vectoradd", NULL);
 //传递参数
  clSetKernelArg(kernel, 0,sizeof(cl_mem),&buffer_a);
  clSetKernelArg(kernel, 1,sizeof(cl_mem),&buffer_b);
  clSetKernelArg(kernel, 2,sizeof(cl_mem),&buffer_c);
  size_t global_work_size[1] = {10};
  //执行核函数
  clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
  //从核函数取回计算结果
  clEnqueueReadBuffer(command_queue,buffer_c, CL_TRUE, 0, datasize, array_c, 0, NULL, NULL);
  for (int i = 0 ; i < 10; i ++) {
  printf("%d ", array_a);
  }
  printf("\n");
  for (int i = 0 ; i < 10; i ++) {
  printf("%d ", array_b);
  }
  printf("\n");
  for (int i = 0 ; i < 10; i ++) {
  printf("%d ", array_c);
  }
  printf("\n");
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(command_queue);
  clReleaseMemObject(buffer_a);
  clReleaseMemObject(buffer_b);
  clReleaseMemObject(buffer_c);
  clReleaseContext(context);
  return 0;
}
将上述程序编译并运行,结果如下:
0 1 2 3 4 5 6 7 8 9
9 8 7 6 5 4 3 2 1 0
9 9 9 9 9 9 9 9 9 9