Hi all. I have been trying to code reductions for the CPU and GPU. The kernels attached below work really
well on Intel and NVIDIA GPUs, but when I compile for the Intel CPU the results are not consistent:
sometimes the result is right, sometimes it is wrong. There are two kernels: reduction_vector
is called repeatedly by the host, and once global_size has been reduced down to local_size I issue
reduction_complete to finalize the reduction.
__kernel void reduction_vector(__global int* data, __local int* partial_sums)
{
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    /* Each work-item loads one element into local memory. */
    partial_sums[lid] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Interleaved-pairs reduction in local memory. */
    for (int i = 1; i < group_size; i <<= 1) {
        int mask = (i << 1) - 1;
        if ((lid & mask) == 0) {
            partial_sums[lid] += partial_sums[lid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Work-item 0 writes the group's partial sum back to global memory. */
    if (lid == 0) {
        data[get_group_id(0)] = partial_sums[0];
    }
}
__kernel void reduction_complete(__global int* data, __local int* partial_sums,
                                 __global int* sum)
{
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    /* Only one work-group runs this kernel, so the local id equals the global id. */
    partial_sums[lid] = data[get_local_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Interleaved-pairs reduction in local memory. */
    for (int i = 1; i < group_size; i <<= 1) {
        int mask = (i << 1) - 1;
        if ((lid & mask) == 0) {
            partial_sums[lid] += partial_sums[lid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Work-item 0 writes the final result. */
    if (lid == 0) {
        *sum = partial_sums[0];
    }
}
This is the host code:
local_size = 128;

/* Create data buffer */
data_buffer = clCreateBuffer(oclobjects.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             sizeof(cl_int) * ARRAY_SIZE, data, &err);
sum_buffer = clCreateBuffer(oclobjects.context, CL_MEM_WRITE_ONLY, sizeof(cl_int), NULL, &err);
if (err < 0) {
    perror("Couldn't create a buffer");
    exit(1);
}

clEnqueueWriteBuffer(oclobjects.queue, data_buffer, CL_TRUE, 0, sizeof(cl_int) * ARRAY_SIZE,
                     data, 0, NULL, NULL);
clFinish(oclobjects.queue);

/* Set arguments for vector kernel */
err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);
err |= clSetKernelArg(vector_kernel, 1, local_size * sizeof(cl_int), NULL);

/* Set arguments for complete kernel */
err |= clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);
err |= clSetKernelArg(complete_kernel, 1, local_size * sizeof(cl_int), NULL);
err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);
if (err < 0) {
    perror("Couldn't create a kernel argument");
    exit(1);
}
/* Enqueue kernels */
global_size = ARRAY_SIZE;
err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,
                             &local_size, 0, NULL, NULL);
if (err < 0) {
    perror("Couldn't enqueue the kernel");
    exit(1);
}
printf("Global size = %zu\n", global_size);

/* Perform successive stages of the reduction */
while (global_size / local_size > local_size) {
    global_size = global_size / local_size;
    err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,
                                 &local_size, 0, NULL, NULL);
    printf("Global size = %zu\n", global_size);
    if (err < 0) {
        perror("Couldn't enqueue the kernel");
        exit(1);
    }
}

/* Final stage: a single work-group sums the remaining partial results */
global_size = global_size / local_size;
local_size = global_size;
err = clEnqueueNDRangeKernel(oclobjects.queue, complete_kernel, 1, NULL, &global_size,
                             &local_size, 0, NULL, NULL);
printf("Global size = %zu\n", global_size);
/* Read the result */
err = clEnqueueReadBuffer(oclobjects.queue, sum_buffer, CL_TRUE, 0, sizeof(cl_int), &sum,
                          0, NULL, NULL);
clFinish(oclobjects.queue);
if (err < 0) {
    perror("Couldn't read the buffer");
    exit(1);
}

/* Finish processing the queue and get profiling information */
clFinish(oclobjects.queue);
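To make the pass structure concrete (a hypothetical example, since ARRAY_SIZE is not shown above): with ARRAY_SIZE = 1048576 and local_size = 128, the first launch covers 1048576 items, the while loop launches one more pass over 8192 items, and reduction_complete then sums the remaining 64 partial results in a single work-group. The small host-only sketch below just replays that arithmetic:

/* Stand-alone sketch: replays the global_size arithmetic from the host loop above.
   ARRAY_SIZE = 1048576 is an assumed value, for illustration only. */
#include <stdio.h>

int main(void)
{
    size_t global_size = 1048576;   /* assumed ARRAY_SIZE */
    size_t local_size = 128;
    int pass = 1;

    printf("pass %d (reduction_vector): global_size = %zu\n", pass++, global_size);
    while (global_size / local_size > local_size) {
        global_size = global_size / local_size;
        printf("pass %d (reduction_vector): global_size = %zu\n", pass++, global_size);
    }
    global_size = global_size / local_size;
    printf("final (reduction_complete): global_size = local_size = %zu\n", global_size);
    return 0;
}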
To me this looks like a bug in Intel's CPU runtime. Note that I tried two runtimes:
1. Runtime 14.2 x64
2. Runtime 15.1 x64
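For reference, "right" vs. "wrong" here is relative to a plain serial sum of the input computed on the host; a minimal check along these lines (a sketch that reuses the data, ARRAY_SIZE, and sum variables from the host code above) is enough to see the inconsistency:

/* Minimal host-side check (sketch): compare the OpenCL result against a serial reference.
   Reuses data, ARRAY_SIZE, and sum from the host code above. */
cl_int reference = 0;
for (size_t i = 0; i < ARRAY_SIZE; i++)
    reference += data[i];

if (sum != reference)
    printf("Mismatch: OpenCL sum = %d, reference = %d\n", sum, reference);
else
    printf("OK: sum = %d\n", sum);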
Thanks for your help.
Diego