Quantcast
Channel: Intel® Software - OpenCL*
Viewing all articles
Browse latest Browse all 1182

Problems with reduction done in CPU

$
0
0

Hi all. I have been trying to code reductions for CPU and GPU. The kernels attached below work really well on Intel GPUs and on an Nvidia GPU, but when I compile for the (Intel) CPU the results are not consistent: sometimes the result is right, sometimes it is wrong. There are two kernels. reduction_vector is called many times by the host; once the global size has been reduced to the local size, I issue reduction_complete to finalize the reduction.

 

/*
 * reduction_vector: per-work-group integer sum over a 1-D range.
 *
 * Every work-item copies data[get_global_id(0)] into local memory, the
 * work-group tree-reduces those values, and work-item 0 stores the group's
 * total in data[get_group_id(0)].
 *
 * data         - global buffer; read at the global id, written at the group id.
 * partial_sums - local scratch; must hold at least get_local_size(0) ints.
 *
 * NOTE(review): the reduction is done in place. Work-group g overwrites
 * data[g], an element that a different work-group reads as its input, and
 * OpenCL defines no execution order or synchronization between work-groups
 * of one kernel launch. With more than one work-group the result therefore
 * depends on scheduling. Removing the race needs a separate output buffer,
 * i.e. a signature change, so it is only flagged here.
 */
__kernel void reduction_vector(__global int* data, __local int* partial_sums)
{
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    partial_sums[lid] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int i = 1; i < group_size; i <<= 1) {
        int mask = (i << 1) - 1;

        /* The bound check keeps lid + i inside the group's elements when
           the work-group size is not a power of two. */
        if ((lid & mask) == 0 && lid + i < group_size) {
            partial_sums[lid] += partial_sums[lid + i];
        }

        /* Reached by every work-item, outside the conditional. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0) {
        data[get_group_id(0)] = partial_sums[0];
    }
}

/*
 * reduction_complete: final reduction stage for a single work-group.
 *
 * Each work-item copies data[get_local_id(0)] into local memory, the
 * work-group tree-reduces those values, and work-item 0 stores the total
 * in *sum.
 *
 * data         - global buffer; only the first get_local_size(0) elements
 *                are read.
 * partial_sums - local scratch; must hold at least get_local_size(0) ints.
 * sum          - global location that receives the total.
 *
 * The input is indexed by local id, so the kernel only covers the whole
 * input when it is launched as one work-group.
 */
__kernel void reduction_complete(__global int* data,
                                 __local int* partial_sums, __global int *sum) {
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    partial_sums[lid] = data[lid];
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int i = 1; i < group_size; i <<= 1) {
        int mask = (i << 1) - 1;

        /* The bound check keeps lid + i inside the group's elements when
           the work-group size is not a power of two. */
        if ((lid & mask) == 0 && lid + i < group_size) {
            partial_sums[lid] += partial_sums[lid + i];
        }

        /* Reached by every work-item, outside the conditional. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0) {
        *sum = partial_sums[0];
    }
}

This is the host code

local_size = 128;

/* Create data buffer */

data_buffer = clCreateBuffer(oclobjects.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_int)* ARRAY_SIZE, data, &err);

sum_buffer = clCreateBuffer(oclobjects.context, CL_MEM_WRITE_ONLY,sizeof(cl_int), NULL, &err);

if(err < 0) {

perror("Couldn't create a buffer");

exit(1);

};

 

clEnqueueWriteBuffer(oclobjects.queue, data_buffer, CL_TRUE, 0, sizeof(cl_int) * ARRAY_SIZE, data, 0, NULL, NULL);

clFinish(oclobjects.queue);

/* Set arguments for vector kernel: the data buffer plus local scratch
   space of local_size ints (a NULL value with a size requests local memory). */
err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);
err |= clSetKernelArg(vector_kernel, 1, local_size * sizeof(cl_int), NULL);

/* Set arguments for complete kernel.  The first call accumulates with |=
   so that a failure from the vector-kernel arguments above is not lost. */
err |= clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);
err |= clSetKernelArg(complete_kernel, 1, local_size * sizeof(cl_int), NULL);
err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);

/* OpenCL error codes are negative, so the OR of the results is negative if
   any call failed.  The combined value is not itself a meaningful code, and
   errno is not set by OpenCL, so only a plain message is printed. */
if (err < 0) {
    fprintf(stderr, "Couldn't create a kernel argument\n");
    exit(1);
}

/* Enqueue kernels */

/*
 * First pass: one work-item per input element.
 *
 * NOTE(review): vector_kernel reads data[] and writes its per-group sums
 * back into the front of the same buffer.  Work-groups of one launch are
 * not ordered against each other, so a group may overwrite an element
 * another group has not read yet.  This is a likely cause of
 * run-to-run differences; fixing it needs a separate output buffer.
 */
global_size = ARRAY_SIZE;
err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,
                             &local_size, 0, NULL, NULL);
if (err != CL_SUCCESS) {
    /* OpenCL does not set errno, so print its code instead of perror(). */
    fprintf(stderr, "Couldn't enqueue the kernel (OpenCL error %d)\n", (int)err);
    exit(1);
}
printf("Global size = %zu\n", global_size);   /* %zu: global_size is a size_t */

/* Perform successive stages of the reduction: each pass leaves
   global_size / local_size partial sums, which are the next pass's input. */
while (global_size / local_size > local_size) {
    global_size = global_size / local_size;
    err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,
                                 &local_size, 0, NULL, NULL);
    printf("Global size = %zu\n", global_size);
    if (err != CL_SUCCESS) {
        fprintf(stderr, "Couldn't enqueue the kernel (OpenCL error %d)\n", (int)err);
        exit(1);
    }
}

/* Final pass: the remaining partial sums fit in one work-group, so launch
   complete_kernel with a single group of exactly that size. */
global_size = global_size / local_size;
local_size = global_size;

err = clEnqueueNDRangeKernel(oclobjects.queue, complete_kernel, 1, NULL, &global_size,
                             &local_size, 0, NULL, NULL);
/* Checked here; otherwise a later call reusing err would hide a failure. */
if (err != CL_SUCCESS) {
    fprintf(stderr, "Couldn't enqueue the kernel (OpenCL error %d)\n", (int)err);
    exit(1);
}
printf("Global size = %zu\n", global_size);

/* Read the result.  The read is blocking (CL_TRUE), so sum is valid as soon
   as the call returns successfully; check the status before anything else. */
err = clEnqueueReadBuffer(oclobjects.queue, sum_buffer, CL_TRUE, 0, sizeof(cl_int), &sum, 0, NULL, NULL);
if (err != CL_SUCCESS) {
    /* OpenCL does not set errno, so print its code instead of perror(). */
    fprintf(stderr, "Couldn't read the buffer (OpenCL error %d)\n", (int)err);
    exit(1);
}

/* Finish processing the queue and get profiling information */
clFinish(oclobjects.queue);

It looks to me like this is a bug in Intel's CPU runtime. Note that I tried two runtimes:

1.  Runtime 14.2 x64

2. Runtime 15.1.x64

Thanks for your help.

 

Diego


Viewing all articles
Browse latest Browse all 1182

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>