Hi all. I have been trying to code reductions for the CPU and GPU. The kernels attached below work really
well on Intel and NVIDIA GPUs, but when I compile for the Intel CPU the results are not consistent:
sometimes the result is right, sometimes it is wrong. There are two kernels: reduction_vector
is called repeatedly by the host, and once global_size has been reduced down to local_size I issue
reduction_complete to finalize the reduction.
__kernel void reduction_vector(__global int* data, __local int* partial_sums)
{
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    /* Each work-item loads one element into local memory. */
    partial_sums[lid] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Interleaved-pairs reduction in local memory. */
    for (int i = 1; i < group_size; i <<= 1) {
        int mask = (i << 1) - 1;
        if ((lid & mask) == 0) {
            partial_sums[lid] += partial_sums[lid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Work-item 0 writes the group's partial sum back to global memory. */
    if (lid == 0) {
        data[get_group_id(0)] = partial_sums[0];
    }
}
__kernel void reduction_complete(__global int* data, __local int* partial_sums,
                                 __global int* sum)
{
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    /* Only one work-group runs this kernel, so the local id equals the global id. */
    partial_sums[lid] = data[get_local_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Interleaved-pairs reduction in local memory. */
    for (int i = 1; i < group_size; i <<= 1) {
        int mask = (i << 1) - 1;
        if ((lid & mask) == 0) {
            partial_sums[lid] += partial_sums[lid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Work-item 0 writes the final result. */
    if (lid == 0) {
        *sum = partial_sums[0];
    }
}
This is the host code:
local_size = 128;

/* Create data buffer */
data_buffer = clCreateBuffer(oclobjects.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             sizeof(cl_int) * ARRAY_SIZE, data, &err);
sum_buffer = clCreateBuffer(oclobjects.context, CL_MEM_WRITE_ONLY, sizeof(cl_int), NULL, &err);
if (err < 0) {
    perror("Couldn't create a buffer");
    exit(1);
}

clEnqueueWriteBuffer(oclobjects.queue, data_buffer, CL_TRUE, 0, sizeof(cl_int) * ARRAY_SIZE,
                     data, 0, NULL, NULL);
clFinish(oclobjects.queue);

/* Set arguments for vector kernel */
err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);
err |= clSetKernelArg(vector_kernel, 1, local_size * sizeof(cl_int), NULL);

/* Set arguments for complete kernel */
err |= clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);
err |= clSetKernelArg(complete_kernel, 1, local_size * sizeof(cl_int), NULL);
err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);
if (err < 0) {
    perror("Couldn't create a kernel argument");
    exit(1);
}
/* Enqueue kernels */
global_size = ARRAY_SIZE;
err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,
                             &local_size, 0, NULL, NULL);
if (err < 0) {
    perror("Couldn't enqueue the kernel");
    exit(1);
}
printf("Global size = %zu\n", global_size);

/* Perform successive stages of the reduction */
while (global_size / local_size > local_size) {
    global_size = global_size / local_size;
    err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,
                                 &local_size, 0, NULL, NULL);
    printf("Global size = %zu\n", global_size);
    if (err < 0) {
        perror("Couldn't enqueue the kernel");
        exit(1);
    }
}

/* Final stage: a single work-group sums the remaining partial results */
global_size = global_size / local_size;
local_size = global_size;
err = clEnqueueNDRangeKernel(oclobjects.queue, complete_kernel, 1, NULL, &global_size,
                             &local_size, 0, NULL, NULL);
printf("Global size = %zu\n", global_size);
/* Read the result */
err = clEnqueueReadBuffer(oclobjects.queue, sum_buffer, CL_TRUE, 0, sizeof(cl_int), &sum,
                          0, NULL, NULL);
clFinish(oclobjects.queue);
if (err < 0) {
    perror("Couldn't read the buffer");
    exit(1);
}

/* Finish processing the queue and get profiling information */
clFinish(oclobjects.queue);
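To make the pass structure concrete (a hypothetical example, since ARRAY_SIZE is not shown above): with ARRAY_SIZE = 1048576 and local_size = 128, the first launch covers 1048576 items, the while loop launches one more pass over 8192 items, and reduction_complete then sums the remaining 64 partial results in a single work-group. The small host-only sketch below just replays that arithmetic:

/* Stand-alone sketch: replays the global_size arithmetic from the host loop above.
   ARRAY_SIZE = 1048576 is an assumed value, for illustration only. */
#include <stdio.h>

int main(void)
{
    size_t global_size = 1048576;   /* assumed ARRAY_SIZE */
    size_t local_size = 128;
    int pass = 1;

    printf("pass %d (reduction_vector): global_size = %zu\n", pass++, global_size);
    while (global_size / local_size > local_size) {
        global_size = global_size / local_size;
        printf("pass %d (reduction_vector): global_size = %zu\n", pass++, global_size);
    }
    global_size = global_size / local_size;
    printf("final (reduction_complete): global_size = local_size = %zu\n", global_size);
    return 0;
}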
To me this looks like a bug in Intel's CPU runtime. Note that I tried two runtimes:
1. Runtime 14.2 x64
2. Runtime 15.1 x64
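For reference, "right" vs. "wrong" here is relative to a plain serial sum of the input computed on the host; a minimal check along these lines (a sketch that reuses the data, ARRAY_SIZE, and sum variables from the host code above) is enough to see the inconsistency:

/* Minimal host-side check (sketch): compare the OpenCL result against a serial reference.
   Reuses data, ARRAY_SIZE, and sum from the host code above. */
cl_int reference = 0;
for (size_t i = 0; i < ARRAY_SIZE; i++)
    reference += data[i];

if (sum != reference)
    printf("Mismatch: OpenCL sum = %d, reference = %d\n", sum, reference);
else
    printf("OK: sum = %d\n", sum);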
Thanks for your help.
Diego