The following sample uses OpenCL to calculate the element-wise sum of two distinct vectors.
I use an NVIDIA video card, so I installed the CUDA SDK in order to get the OpenCL headers and libraries.
The C++ calling program:
#include "stdafx.h"
#define PROGRAM_FILE "vecSuming.cl"
#define KERNEL_FUNC "vec_sum"
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\CL\opencl.h"
int _tmain(int argc, _TCHAR* argv[]) {cl_platform_id platform;cl_device_id device;cl_context context;cl_command_queue queue;cl_int i, err;cl_program program;FILE *program_handle;char *program_buffer, *program_log;
size_t program_size, log_size;cl_kernel kernel;size_t work_units_per_kernel;float vec1[4], vec2[4], result[4];
cl_mem vec1_buff, vec2_buff, res_buff;for(i=0; i<4; i++)
{vec1[i] = i ;vec2[i] = i * 2 ;}clGetPlatformIDs(1, &platform, NULL);clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1,&device, NULL);context = clCreateContext(NULL, 1, &device, NULL,NULL, &err);program_handle = fopen("C:\\Learn\\OpenCL\\MeshamemLiMavetOpenCL\\matvec.cl", "r");fseek(program_handle, 0, SEEK_END);program_size = ftell(program_handle);rewind(program_handle);program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';fread(program_buffer, sizeof(char), program_size,program_handle);fclose(program_handle);program_buffer[program_size - 6 ] = '\0';program = clCreateProgramWithSource(context, 1,(const char**)&program_buffer, &program_size, &err);free(program_buffer);cl_int theBuildResult = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);if (theBuildResult != 0) {
// Determine the size of the log
size_t log_size;clGetProgramBuildInfo(program, device , CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);// Allocate memory for the log
char *log = (char *) malloc(log_size);// Get the log
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);// Print the log
printf("%s\n", log);
return -1;
}kernel = clCreateKernel(program, KERNEL_FUNC, &err);queue = clCreateCommandQueue(context, device, 0, &err);vec1_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |CL_MEM_COPY_HOST_PTR, sizeof(float)*4, vec1, &err);vec2_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |CL_MEM_COPY_HOST_PTR, sizeof(float)*4, vec2 , &err);res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY,sizeof(float)*4, NULL, &err);clSetKernelArg(kernel, 0, sizeof(cl_mem), &vec1_buff);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &vec2_buff);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &res_buff);
work_units_per_kernel = 4;clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL);clEnqueueReadBuffer(queue, res_buff, CL_TRUE, 0,sizeof(float)*4, result, 0, NULL, NULL);for ( i = 0 ; i < 4 ; i ++ )
{printf ( "result of Index %d = %f " ,i, result[i]);
}clReleaseMemObject(vec1_buff);clReleaseMemObject(vec2_buff);clReleaseMemObject(res_buff);clReleaseKernel(kernel);clReleaseCommandQueue(queue);clReleaseProgram(program);clReleaseContext(context);return 0;
}
The kernel function program:
אין תגובות:
הוסף רשומת תגובה