יום ראשון, 4 במאי 2014

OpenCL Simple Sample

The following sample uses OpenCL to calculate the element-wise sum of two vectors.
I use an Nvidia video card, so I installed the CUDA SDK to get the OpenCL headers and libraries.
The C++ calling program:

#include "stdafx.h"
#define PROGRAM_FILE "vecSuming.cl"
#define KERNEL_FUNC "vec_sum"
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\CL\opencl.h"
int _tmain(int argc, _TCHAR* argv[]) {
	cl_platform_id platform;
	cl_device_id device;
	cl_context context;
	cl_command_queue queue;
	cl_int i, err;
	cl_program program;
	FILE *program_handle;
	char *program_buffer, *program_log;
	size_t program_size, log_size;
	cl_kernel kernel;
	size_t work_units_per_kernel;
	float vec1[4], vec2[4], result[4];
	cl_mem vec1_buff, vec2_buff, res_buff;
	for(i=0; i<4; i++)
	{
		   vec1[i] = i ;
		   vec2[i] = i * 2 ;
	}
	 
	clGetPlatformIDs(1, &platform, NULL);
	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1,
	&device, NULL);
	context = clCreateContext(NULL, 1, &device, NULL,
	NULL, &err);
	program_handle = fopen("C:\\Learn\\OpenCL\\MeshamemLiMavetOpenCL\\matvec.cl", "r");
	fseek(program_handle, 0, SEEK_END);
	program_size = ftell(program_handle);
	rewind(program_handle);
	program_buffer = (char*)malloc(program_size + 1);
	program_buffer[program_size] = '\0';
	fread(program_buffer, sizeof(char), program_size,
	program_handle);
	fclose(program_handle);
	program_buffer[program_size - 6 ] = '\0';
	program = clCreateProgramWithSource(context, 1,
	(const char**)&program_buffer, &program_size, &err);
	free(program_buffer);
	cl_int theBuildResult =  clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (theBuildResult != 0) {
		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(program, device , CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		// Allocate memory for the log
		char *log = (char *) malloc(log_size);
		// Get the log
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
		// Print the log
		printf("%s\n", log);
		
		return -1;
	}
	kernel = clCreateKernel(program, KERNEL_FUNC, &err);
	queue = clCreateCommandQueue(context, device, 0, &err);
	vec1_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |
		   CL_MEM_COPY_HOST_PTR, sizeof(float)*4, vec1, &err);
	vec2_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |
		   CL_MEM_COPY_HOST_PTR, sizeof(float)*4, vec2 , &err);
	res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
		sizeof(float)*4, NULL, &err);
	clSetKernelArg(kernel, 0, sizeof(cl_mem), &vec1_buff);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &vec2_buff);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &res_buff);
	work_units_per_kernel = 4;
	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL);
	clEnqueueReadBuffer(queue, res_buff, CL_TRUE, 0,sizeof(float)*4, result, 0, NULL, NULL);
	for ( i = 0 ; i < 4 ; i ++ )
	{
		printf ( "result of Index %d = %f " ,i,  result[i]);
	}
	clReleaseMemObject(vec1_buff);
	clReleaseMemObject(vec2_buff);
	clReleaseMemObject(res_buff);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(queue);
	clReleaseProgram(program);
	clReleaseContext(context);
	return 0;
}

The kernel program:


/*
 * Element-wise vector addition kernel.
 * Each work-item handles the element whose index equals its global id in
 * dimension 0: result[id] = vector1[id] + vector2[id].
 * There is no bounds check, so the global work size must not exceed the
 * number of elements in the three buffers.
 */
__kernel void vec_sum(__global float* vector1,__global float* vector2,
	__global float* result)
{
	const int idx = get_global_id(0);
	const float lhs = vector1[idx];
	const float rhs = vector2[idx];

	result[idx] = lhs + rhs;
}
The result:
Capture45

אין תגובות:

הוסף רשומת תגובה