I was able to reduce the kernel to a very small test case in order to make it easier to track down the bug. Logically, this kernel may not be correct; however, the syntax seems OK to me, and it compiles on the CPU as well as on other platforms (AMD CPU and GPU).
This is on an Intel Core i3 32xx CPU; the IGP is an Intel HD 2500. I'm using the latest driver build (3345), Windows 7 x64, and the Kernel Builder x64 application.
/* Please Write the OpenCL Kernel(s) code here*/
/*
 * ntn_snh: component-wise hyperbolic sine over a run of vectors.
 *
 * Reads the 16 consecutive float16 values src[0] .. src[15] and stores
 * sinh() of each one into the matching position dst[0] .. dst[15],
 * walking the indices in ascending order.
 *
 * Both pointers are restrict-qualified, so the caller must pass ranges
 * that do not overlap.
 */
void ntn_snh(__global float16 const *restrict src, __global float16 *restrict dst)
{
    for (__private int idx = 0; idx < 16; ++idx)
    {
        dst[idx] = sinh(src[idx]);
    }
}
/*
 * ntn_csh: component-wise hyperbolic cosine over a run of vectors.
 *
 * Reads the 16 consecutive float16 values src[0] .. src[15] and stores
 * cosh() of each one into the matching position dst[0] .. dst[15],
 * walking the indices in ascending order.
 *
 * Both pointers are restrict-qualified, so the caller must pass ranges
 * that do not overlap.
 */
void ntn_csh(__global float16 const *restrict src, __global float16 *restrict dst)
{
    for (__private int idx = 0; idx < 16; ++idx)
    {
        dst[idx] = cosh(src[idx]);
    }
}
/*
 * ntn_tnh: component-wise hyperbolic tangent over a run of vectors.
 *
 * Reads the 16 consecutive float16 values src[0] .. src[15] and stores
 * tanh() of each one into the matching position dst[0] .. dst[15],
 * walking the indices in ascending order.
 *
 * Both pointers are restrict-qualified, so the caller must pass ranges
 * that do not overlap.
 */
void ntn_tnh(__global float16 const *restrict src, __global float16 *restrict dst)
{
    for (__private int idx = 0; idx < 16; ++idx)
    {
        dst[idx] = tanh(src[idx]);
    }
}
/*
 * eval: run a short, fixed sequence of hyperbolic operations driven by
 * the byte table gcom.
 *
 * For every slot from 6 down to 0, gcom[slot] selects the operation:
 *     1 -> ntn_snh, 2 -> ntn_csh, 3 -> ntn_tnh,
 *     any other value (0 and 4 included) -> nothing is done.
 * The result of a slot is written at gdata_out + slot*64 (offsets are
 * counted in float16 elements).
 *
 * The slots are handled in two passes that differ only in where the
 * operand comes from:
 *   - slots 6, 5, 4: the operand is read from gdata_in, at the block
 *     whose index is the byte gcom[slot + 3];
 *   - slots 3, 2, 1, 0: the operand is read from gdata_out, at block
 *     slot + 3, i.e. a location an earlier iteration may have written.
 *
 * gcom[slot + 3] is only read when the opcode is 1, 2 or 3.
 * No work-item id is consulted, so every work-item performs the same
 * reads and writes.
 */
__kernel void eval(__global uchar const *restrict const gcom,
                   __global float16 const *restrict const gdata_in,
                   __global float16 *restrict const gdata_out)
{
    /* Pass 1: operands come from the input buffer, selected through gcom. */
    for (__private int slot = 6; slot >= 4; --slot)
    {
        __private uchar const op = gcom[slot];

        if (op == 1)
        {
            ntn_snh(gdata_in + gcom[slot + 3]*64, gdata_out + slot*64);
        }
        else if (op == 2)
        {
            ntn_csh(gdata_in + gcom[slot + 3]*64, gdata_out + slot*64);
        }
        else if (op == 3)
        {
            ntn_tnh(gdata_in + gcom[slot + 3]*64, gdata_out + slot*64);
        }
        /* every other opcode leaves this slot untouched */
    }

    /* Pass 2: operands come from the output buffer, three blocks further on. */
    for (__private int slot = 3; slot >= 0; --slot)
    {
        __private uchar const op = gcom[slot];

        if (op == 1)
        {
            ntn_snh(gdata_out + (slot + 3)*64, gdata_out + slot*64);
        }
        else if (op == 2)
        {
            ntn_csh(gdata_out + (slot + 3)*64, gdata_out + slot*64);
        }
        else if (op == 3)
        {
            ntn_tnh(gdata_out + (slot + 3)*64, gdata_out + slot*64);
        }
        /* every other opcode leaves this slot untouched */
    }
}