Quantcast
Channel: Intel® Software - OpenCL*
Viewing all 1182 articles
Browse latest View live

Segfault on clBuildProgram for simple CPU Kernel

$
0
0

Hi all, I have run into a simple, reproducible segfault that occurs during compilation of a kernel, i.e. in clBuildProgram().  I am using the Intel opencl-1.2-6.4.0.25 runtime on an Intel(R) Xeon(R) CPU X5650, on Red Hat Enterprise Linux Server release 7.3 (Maipo).

In main.c:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <CL/cl.h>

#define MEM_SIZE (81)
#define MAX_SOURCE_SIZE (0x100000)

/*
 * Simple error checking, not strictly necessary: forwards an OpenCL status
 * code to cpu_assert() together with the file and line of the call site.
 * The do { } while (0) wrapper makes `err(x);` a single statement, so the
 * macro stays safe inside an unbraced if/else.
 */
#define err(ans) do { cpu_assert((ans), __FILE__, __LINE__); } while (0)
/*
 * Translate an OpenCL status code into the name of the matching constant.
 *
 * error: status code as returned by an OpenCL API call.
 * Returns a pointer to a string literal (never NULL, must not be freed);
 * codes that are not listed yield "Unknown OpenCL error".
 */
const char *getErrorString(cl_int error)
{
    static const struct {
        cl_int code;
        const char *name;
    } known[] = {
        /* run-time and JIT compiler errors */
        {0, "CL_SUCCESS"},
        {-1, "CL_DEVICE_NOT_FOUND"},
        {-2, "CL_DEVICE_NOT_AVAILABLE"},
        {-3, "CL_COMPILER_NOT_AVAILABLE"},
        {-4, "CL_MEM_OBJECT_ALLOCATION_FAILURE"},
        {-5, "CL_OUT_OF_RESOURCES"},
        {-6, "CL_OUT_OF_HOST_MEMORY"},
        {-7, "CL_PROFILING_INFO_NOT_AVAILABLE"},
        {-8, "CL_MEM_COPY_OVERLAP"},
        {-9, "CL_IMAGE_FORMAT_MISMATCH"},
        {-10, "CL_IMAGE_FORMAT_NOT_SUPPORTED"},
        {-11, "CL_BUILD_PROGRAM_FAILURE"},
        {-12, "CL_MAP_FAILURE"},
        {-13, "CL_MISALIGNED_SUB_BUFFER_OFFSET"},
        {-14, "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"},
        {-15, "CL_COMPILE_PROGRAM_FAILURE"},
        {-16, "CL_LINKER_NOT_AVAILABLE"},
        {-17, "CL_LINK_PROGRAM_FAILURE"},
        {-18, "CL_DEVICE_PARTITION_FAILED"},
        {-19, "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"},

        /* compile-time errors */
        {-30, "CL_INVALID_VALUE"},
        {-31, "CL_INVALID_DEVICE_TYPE"},
        {-32, "CL_INVALID_PLATFORM"},
        {-33, "CL_INVALID_DEVICE"},
        {-34, "CL_INVALID_CONTEXT"},
        {-35, "CL_INVALID_QUEUE_PROPERTIES"},
        {-36, "CL_INVALID_COMMAND_QUEUE"},
        {-37, "CL_INVALID_HOST_PTR"},
        {-38, "CL_INVALID_MEM_OBJECT"},
        {-39, "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"},
        {-40, "CL_INVALID_IMAGE_SIZE"},
        {-41, "CL_INVALID_SAMPLER"},
        {-42, "CL_INVALID_BINARY"},
        {-43, "CL_INVALID_BUILD_OPTIONS"},
        {-44, "CL_INVALID_PROGRAM"},
        {-45, "CL_INVALID_PROGRAM_EXECUTABLE"},
        {-46, "CL_INVALID_KERNEL_NAME"},
        {-47, "CL_INVALID_KERNEL_DEFINITION"},
        {-48, "CL_INVALID_KERNEL"},
        {-49, "CL_INVALID_ARG_INDEX"},
        {-50, "CL_INVALID_ARG_VALUE"},
        {-51, "CL_INVALID_ARG_SIZE"},
        {-52, "CL_INVALID_KERNEL_ARGS"},
        {-53, "CL_INVALID_WORK_DIMENSION"},
        {-54, "CL_INVALID_WORK_GROUP_SIZE"},
        {-55, "CL_INVALID_WORK_ITEM_SIZE"},
        {-56, "CL_INVALID_GLOBAL_OFFSET"},
        {-57, "CL_INVALID_EVENT_WAIT_LIST"},
        {-58, "CL_INVALID_EVENT"},
        {-59, "CL_INVALID_OPERATION"},
        {-60, "CL_INVALID_GL_OBJECT"},
        {-61, "CL_INVALID_BUFFER_SIZE"},
        {-62, "CL_INVALID_MIP_LEVEL"},
        {-63, "CL_INVALID_GLOBAL_WORK_SIZE"},
        {-64, "CL_INVALID_PROPERTY"},
        {-65, "CL_INVALID_IMAGE_DESCRIPTOR"},
        {-66, "CL_INVALID_COMPILER_OPTIONS"},
        {-67, "CL_INVALID_LINKER_OPTIONS"},
        {-68, "CL_INVALID_DEVICE_PARTITION_COUNT"},

        /* extension errors */
        {-1000, "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"},
        {-1001, "CL_PLATFORM_NOT_FOUND_KHR"},
        {-1002, "CL_INVALID_D3D10_DEVICE_KHR"},
        {-1003, "CL_INVALID_D3D10_RESOURCE_KHR"},
        {-1004, "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"},
        {-1005, "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"},
    };
    const size_t count = sizeof known / sizeof known[0];

    /* Linear scan: the table is small and this only runs on the error path. */
    for (size_t k = 0; k < count; ++k) {
        if (known[k].code == error) {
            return known[k].name;
        }
    }
    return "Unknown OpenCL error";
}
/*
 * Terminate the program when an OpenCL call did not succeed.
 *
 * x:    status code to check; CL_SUCCESS makes the function return normally.
 * file: source file of the call site (the err() macro passes __FILE__).
 * line: source line of the call site (the err() macro passes __LINE__).
 *
 * On failure, prints "cpu_assert: <error name> <file> <line>" to stderr and
 * calls exit() with the status code as the exit value.
 */
void cpu_assert(cl_int x, const char *file, int line) {
	if (x == CL_SUCCESS)
	{
		return;
	}

	const char *reason = getErrorString(x);
	fprintf(stderr,"cpu_assert: %s %s %d\n", reason, file, line);
	exit(x);
}

//main program
/*
 * Host program: reads ./kernel.cl, builds it for the CPU device of the first
 * Intel OpenCL platform, runs loopy_kernel once with a single work-group of
 * 4 work-items and prints the MEM_SIZE doubles the kernel wrote.
 *
 * Returns 0 on success. Every failing step terminates the process: OpenCL
 * errors go through err()/cpu_assert(), local failures call exit(-1) after
 * printing a message to stderr.
 */
int main(void)
{
	enum { MAX_PLATFORMS = 10 };
	double mem[MEM_SIZE] = {0};

	cl_platform_id platform_id[MAX_PLATFORMS];
	cl_platform_id pid = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem memobj = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_uint ret_num_devices = 0;
	cl_uint ret_num_platforms = 0;
	cl_int ret;

	const char fileName[] = "./kernel.cl";

	/* Load kernel source code */
	FILE *fp = fopen(fileName, "r");
	if (!fp) {
		perror(fileName);
		exit(-1);
	}
	char *source_str = malloc(MAX_SOURCE_SIZE);
	if (!source_str) {
		fprintf(stderr, "out of memory while loading %s\n", fileName);
		fclose(fp);
		exit(-1);
	}
	size_t source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	int read_failed = ferror(fp);
	fclose(fp);
	/*
	 * The buffer is not NUL-terminated, and a length of 0 tells
	 * clCreateProgramWithSource to treat the string as NUL-terminated,
	 * so an empty read must not be passed on.
	 */
	if (read_failed || source_size == 0) {
		fprintf(stderr, "could not read any kernel source from %s\n", fileName);
		free(source_str);
		exit(-1);
	}

	/* Get platform/device information */
	err(clGetPlatformIDs(MAX_PLATFORMS, platform_id, &ret_num_platforms));
	/* The reported count covers all installed platforms and may exceed the array. */
	if (ret_num_platforms > MAX_PLATFORMS) {
		ret_num_platforms = MAX_PLATFORMS;
	}
	for (cl_uint p = 0; p < ret_num_platforms; ++p)
	{
		char pvendor[500];
		err(clGetPlatformInfo(platform_id[p], CL_PLATFORM_VENDOR, sizeof pvendor, pvendor, NULL));
		/* choose the first Intel platform */
		if (strstr(pvendor, "Intel") != NULL)
		{
			pid = platform_id[p];
			break;
		}
	}
	if (pid == NULL) {
		fprintf(stderr, "no Intel OpenCL platform found\n");
		free(source_str);
		exit(-1);
	}

	/* get the Intel CPU */
	err(clGetDeviceIDs(pid, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices));

	/* Create OpenCL Context */
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	err(ret);

	/* Create Command Queue */
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	err(ret);

	/* Create memory buffer */
	memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(double), NULL, &ret);
	err(ret);

	/* Create Kernel program from the read in source */
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, &source_size, &ret);
	err(ret);

	/* Build Kernel Program */
	err(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

	/* Create OpenCL Kernel; the name must match the __kernel function in kernel.cl */
	kernel = clCreateKernel(program, "loopy_kernel", &ret);
	err(ret);

	/* Set OpenCL kernel argument */
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
	err(ret);

	/*
	 * One work-group of 4 work-items: the kernel declares
	 * reqd_work_group_size(4, 1, 1), and the global size has to be a
	 * multiple of the local size.
	 */
	size_t global_work_size[1] = {4};
	size_t local_work_size[1] = {4};

	/* Execute OpenCL kernel */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
	err(ret);

	/* Transfer result from the memory buffer (blocking read) */
	ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(double), mem, 0, NULL, NULL);
	err(ret);

	/* Display result */
	for (int i = 0; i < MEM_SIZE; i++) {
		printf("%e\t", mem[i]);
	}

	/* Finalization */
	err(clFlush(command_queue));
	err(clFinish(command_queue));
	err(clReleaseKernel(kernel));
	err(clReleaseProgram(program));
	err(clReleaseMemObject(memobj));
	err(clReleaseCommandQueue(command_queue));
	err(clReleaseContext(context));

	free(source_str);

	return 0;
}

 

And in kernel.cl:

/* Local work-item id and work-group id along dimension N, cast to int. */
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
/* Enable the cl_khr_fp64 extension only when __OPENCL_C_VERSION__ is below 120. */
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif

__constant double const params[81] = { 0.826029585777618, 0.09277771524641742, 0.17882754585611516, 0.5063378202753985, 0.6152053087181504, 0.2880270862072688, 0.3129924612654047, 0.1914031592227472, 0.7102266835529006, 0.48932865515007107, 0.6061282135426413, 0.7596607644431532, 0.14443802597375788, 0.8116209377685191, 0.9594085162032434, 0.13075211361103556, 0.9224608320274585, 0.14604765433036915, 0.2596221225532682, 0.49918545558827154, 0.7450662624171099, 0.2667298203995915, 0.25658809473522426, 0.8326419218342502, 0.4342552237224352, 0.17536887526039147, 0.7307554279935198, 0.16662216310809286, 0.5729980215962235, 0.960525881776112, 0.6405413316234755, 0.3470773744166106, 0.8743972242812091, 0.30552499783741516, 0.3146807714222978, 0.7641117037190533, 0.4956119008256711, 0.9564385601232531, 0.0817308089707498, 0.5851026578901762, 0.09572537604291531, 0.7595279218060109, 0.3370657201439913, 0.09352025664655894, 0.352966288119304, 0.5307300151282943, 0.06732539048031061, 0.11708139095968984, 0.7255317496613602, 0.9816608694307325, 0.8171862183434712, 0.42590052091582375, 0.7227051679396143, 0.8383945203018864, 0.5021108846782305, 0.8536292405267636, 0.863285283964059, 0.18335701117563308, 0.4563413539390173, 0.7652079478016128, 0.431958947047663, 0.49298992135423214, 0.6306613411814528, 0.7182527828252896, 0.2918913305544274, 0.1922131983748544, 0.1473770002195013, 0.05404427061478689, 0.24071986186320615, 0.6771845487513621, 0.05844761644341512, 0.879924425441519, 0.17381661089494238, 0.475292639000336, 0.9467343353557718, 0.8799321075729781, 0.14852416386935496, 0.8957251952598398, 0.8342246883437114, 0.3828325906418706, 0.20051275899280996 };

/*
 * Copies the 81-entry params table into out.
 *
 * out: destination buffer in global memory; indices 0..80 are written.
 *
 * The work-group size is fixed to 4x1x1 by reqd_work_group_size. The
 * work-item with local id l handles elements l, l + 4, l + 8, ...
 * Only the local id is used for indexing, so every work-group writes the
 * same elements.
 */
__kernel void __attribute__ ((reqd_work_group_size(4, 1, 1))) loopy_kernel(__global double *restrict out)
{
  /*
   * The upper bound evaluates to 20 for local id 0 and to 19 for local ids
   * 1..3, so the four work-items together cover indices 0..80 exactly once.
   */
  for (int i_outer = 0; i_outer <= 20 + -1 * lid(0) + (3 * lid(0) / 4); ++i_outer)
    out[4 * i_outer + lid(0)] = params[4 * i_outer + lid(0)];
}

 

The program was compiled using:

gcc -std=c99 -c main.c -I/opt/opencl-headers/ -o main.o -O0 -g && gcc main.o -Wl,-rpath,/opt/intel/opencl/lib64/ -lOpenCL -o a.out

(Note that turning off debug info and/or compiling with -O3 gives the same result.)

Program output:

Stack dump:
0.      Running pass 'PrepareKernelArgs' on module 'main'.

gdb output:

Program received signal SIGSEGV, Segmentation fault.
0x00007ffff315cdec in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
Missing separate debuginfos, use: debuginfo-install glibc-2.17-157.el7.x86_64 libgcc-4.8.5-11.el7.x86_64 libstdc++-4.8.5-11.el7.x86_64 ncurses-libs-5.9-13.20130511.el7.x86_64 numactl-libs-2.0.9-6.el7_2.x86_64 opencl-1.2-base-6.4.0.25-1.x86_64 opencl-1.2-intel-cpu-6.4.0.25-1.x86_64 zlib-1.2.7-17.el7.x86_64
(gdb) bt
#0  0x00007ffff315cdec in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#1  0x00007ffff311d724 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#2  0x00007ffff311f610 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#3  0x00007ffff311f886 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#4  0x00007ffff311fcdb in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#5  0x00007ffff32b4557 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#6  0x00007ffff30404b8 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#7  0x00007ffff30214e7 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#8  0x00007ffff3039909 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#9  0x00007ffff301ead9 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so
#10 0x00007ffff44755b7 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so
#11 0x00007ffff55ba1d0 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#12 0x00007ffff52c9b59 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libtask_executor.so
#13 0x00007ffff4d57691 in tbb::interface7::internal::task_arena_base::internal_execute (this=0x0, d=...) at ../../src/tbb/arena.cpp:673
#14 0x00007ffff52c0e97 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libtask_executor.so
#15 0x00007ffff5539c48 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#16 0x00007ffff55bc550 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#17 0x00007ffff55bc629 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#18 0x00007ffff55dbdfc in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#19 0x00007ffff55dccbd in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#20 0x00007ffff55dcd5a in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#21 0x00007ffff55dbd48 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#22 0x00007ffff55bb888 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#23 0x00007ffff55ba7e1 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#24 0x00007ffff52c9b59 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libtask_executor.so
#25 0x00007ffff4d57691 in tbb::interface7::internal::task_arena_base::internal_execute (this=0x0, d=...) at ../../src/tbb/arena.cpp:673
#26 0x00007ffff52c0e97 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libtask_executor.so
#27 0x00007ffff5539c48 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#28 0x00007ffff55bc550 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#29 0x00007ffff55bc629 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#30 0x00007ffff55dbdfc in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#31 0x00007ffff55dccbd in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#32 0x00007ffff55dcd5a in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#33 0x00007ffff55dbd48 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#34 0x00007ffff55bb888 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#35 0x00007ffff55c096c in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#36 0x00007ffff52c9b59 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libtask_executor.so
#37 0x00007ffff4d575d5 in tbb::interface7::internal::task_arena_base::internal_execute (this=0x0, d=...) at ../../src/tbb/arena.cpp:676
#38 0x00007ffff52c0e97 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libtask_executor.so
#39 0x00007ffff5539c48 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#40 0x00007ffff55bc550 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#41 0x00007ffff55bfd95 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#42 0x00007ffff55ab890 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#43 0x00007ffff5587722 in ?? () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#44 0x00007ffff554b055 in clBuildProgram () from /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so
#45 0x0000000000400ffe in main ()

 

valgrind output:

==10701== Memcheck, a memory error detector
==10701== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
==10701== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==10701== Command: ./a.out
==10701== Parent PID: 5874
==10701== 
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x5D3559C: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5F086DE: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5DFFD65: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5F0AAAC: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5F0AE9B: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5E01320: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5DD45D1: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5CD34AC: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5CD3447: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x4E35811: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOpenCL.so.2.0)
==10701==    by 0x4E39CB1: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOpenCL.so.2.0)
==10701==    by 0x560EBAF: pthread_once (in /usr/lib64/libpthread-2.17.so)
==10701== 
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x5D352B2: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5F086EF: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5DFFD65: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5F0AAAC: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5F0AE9B: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5E01320: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5DD45D1: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5CD34AC: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x5CD3447: ??? (in /usr/lib64/nvidia/libnvidia-opencl.so.367.48)
==10701==    by 0x4E35811: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOpenCL.so.2.0)
==10701==    by 0x4E39CB1: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOpenCL.so.2.0)
==10701==    by 0x560EBAF: pthread_once (in /usr/lib64/libpthread-2.17.so)
==10701== 
==10701== Warning: set address range perms: large range [0x1000000000, 0x2900000000) (noaccess)
==10701== Warning: set address range perms: large range [0x2900000000, 0x3000000000) (noaccess)
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x4084A8F: __intel_sse2_strrchr (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libtbb.so.2)
==10701==    by 0x406D541: tbb::internal::init_dl_data() (dynamic_link.cpp:332)
==10701==    by 0x406D476: __sti__$E (dynamic_link.cpp:495)
==10701==    by 0x408F041: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libtbb.so.2)
==10701==    by 0x4068732: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libtbb.so.2)
==10701== 
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x8F0C876: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC198C: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2496: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2858: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC6D82: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC1462: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DE8D01: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x880F2DA: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x88213E1: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x8821809: clDevCreateDeviceInstance (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x7884FA4: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701==    by 0x7846B68: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701== 
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x8F0C888: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC198C: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2496: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2858: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC6D82: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC1462: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DE8D01: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x880F2DA: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x88213E1: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x8821809: clDevCreateDeviceInstance (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x7884FA4: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701==    by 0x7846B68: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701== 
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x8F0C89A: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC198C: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2496: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2858: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC6D82: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC1462: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DE8D01: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x880F2DA: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x88213E1: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x8821809: clDevCreateDeviceInstance (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x7884FA4: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701==    by 0x7846B68: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701== 
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x8F0C8AC: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC198C: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2496: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2858: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC6D82: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC1462: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DE8D01: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x880F2DA: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x88213E1: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x8821809: clDevCreateDeviceInstance (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x7884FA4: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701==    by 0x7846B68: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701== 
==10701== Conditional jump or move depends on uninitialised value(s)
==10701==    at 0x8F0C8BE: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC198C: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2496: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC2858: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC6D82: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DC1462: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DE8D01: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x880F2DA: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x88213E1: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x8821809: clDevCreateDeviceInstance (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x7884FA4: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701==    by 0x7846B68: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701== 
==10701== Invalid read of size 1
==10701==    at 0x8F0ADEC: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECB723: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECD60F: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECD885: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECDCDA: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x9062556: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DEE4B7: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DCF4E6: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DE7908: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DCCAD8: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x880F5B6: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x78521CF: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701==  Address 0x8 is not stack'd, malloc'd or (recently) free'd
==10701== 
==10701== 
==10701== Process terminating with default action of signal 11 (SIGSEGV)
==10701==  Access not within mapped region at address 0x8
==10701==    at 0x8F0ADEC: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECB723: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECD60F: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECD885: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8ECDCDA: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x9062556: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DEE4B7: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DCF4E6: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DE7908: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x8DCCAD8: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libOclCpuBackEnd.so)
==10701==    by 0x880F5B6: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libcpu_device.so)
==10701==    by 0x78521CF: ??? (in /opt/intel/opencl-1.2-6.4.0.25/lib64/libintelocl.so)
==10701==  If you believe this happened as a result of a stack
==10701==  overflow in your program's main thread (unlikely but
==10701==  possible), you can try to increase the size of the
==10701==  main thread stack using the --main-stacksize= flag.
==10701==  The main thread stack size used in this run was 8388608.
==10701== 
==10701== HEAP SUMMARY:
==10701==     in use at exit: 20,677,964 bytes in 133,880 blocks
==10701==   total heap usage: 162,203 allocs, 28,323 frees, 33,802,790 bytes allocated
==10701== 
==10701== LEAK SUMMARY:
==10701==    definitely lost: 312 bytes in 5 blocks
==10701==    indirectly lost: 0 bytes in 0 blocks
==10701==      possibly lost: 1,844,672 bytes in 13,733 blocks
==10701==    still reachable: 18,832,980 bytes in 120,142 blocks
==10701==                       of which reachable via heuristic:
==10701==                         stdstring          : 220,711 bytes in 4,022 blocks
==10701==                         newarray           : 37,008 bytes in 12 blocks
==10701==                         multipleinheritance: 928 bytes in 2 blocks
==10701==         suppressed: 0 bytes in 0 blocks
==10701== Rerun with --leak-check=full to see details of leaked memory
==10701== 
==10701== For counts of detected and suppressed errors, rerun with: -v
==10701== Use --track-origins=yes to see where uninitialised values come from
==10701== ERROR SUMMARY: 20 errors from 9 contexts (suppressed: 0

 

 

 

Thread Topic: 

Bug Report

device fission on gen9 GPU

$
0
0

Is device fission supported on Gen 9 GPUs?

I want to run multiple OpenCL kernels concurrently on the GPU. Is there any way I can do this?

Thanks,

Biren Doshi

 

 

Advanced Motion Estimation Extension and GPU

$
0
0

Hi

 

Can we use Intel advanced motion estimation accelerator and GPU computing power concurrently?

 

Thanks,

Biren Doshi

 

ioc64 installed on Fedora 25 throwing instance of 'std::string', aborting

$
0
0

I am working on getting the Intel OpenCL SDK up and running on a Fedora 25 Skylake machine (Core i7-6700).  So far I have:

  1. Installed the xorg-x11-drv-intel package from the Fedora (or maybe RPMFusion) repo
  2. Installed the Intel SDK for OpenCL Applications 2016 R3 for Linux with sudo permissions

Step 2 warned about Unsupported OS and Intel Debugger for Heterogeneous Compute Unsupported OS, and I selected to ignore those prereqs.

When I run 

ioc64 -version

I get the following output:

terminate called after throwing an instance of 'std::string'
Aborted (core dumped)

Same thing with 

ioc64 -help

Have I installed the correct components?  Could this error be related to using an unsupported OS?  Any suggestions on how to proceed?

EDIT: More info:

$ sudo lshw -c video
  *-display
       description: VGA compatible controller
       product: HD Graphics 530
       vendor: Intel Corporation
       physical id: 2
       bus info: pci@0000:00:02.0
       version: 06
       width: 64 bits
       clock: 33MHz
       capabilities: pciexpress msi pm vga_controller bus_master cap_list rom
       configuration: driver=i915 latency=0
       resources: irq:132 memory:f6000000-f6ffffff memory:e0000000-efffffff ioport:f000(size=64) memory:c0000-dffff
$ modinfo i915
filename:       /lib/modules/4.8.8-200.fc24.x86_64/kernel/drivers/gpu/drm/i915/i915.ko.xz
license:        GPL and additional rights
description:    Intel Graphics
author:         Intel Corporation
author:         Tungsten Graphics, Inc.
firmware:       i915/bxt_dmc_ver1_07.bin
firmware:       i915/skl_dmc_ver1_26.bin
firmware:       i915/kbl_dmc_ver1_01.bin
firmware:       i915/kbl_guc_ver9_14.bin
firmware:       i915/bxt_guc_ver8_7.bin
firmware:       i915/skl_guc_ver6_1.bin<snip>

 

Thread Topic: 

Question

HD Graphics 5500 is unavailable with CentOS7.2 + intel-opencl-r3.1 driver

$
0
0

Hi,

I am trying to enable the HD Graphics 5500 of i7-5600U on CentOS7.2. I posted a similar inquiry to the following Intel's community and had a suggestion to ask at this forum:

No "CL_DEVICE_TYPE_GPU" with Intel HD5600 on CentOS 7

Let me explain my issue again. First, I installed "Driver and library(runtime) packages" into my CentOS 7.2, i.e., CentOS Linux (3.10.0-327.36.3.el7.x86_64) 7 (Core), which is obtained by upgrading CentOS7 (1511). I followed the Installation instructions in the following page:

OpenCL™ Drivers and Runtimes for Intel® Architecture | Intel® Software

The installation completed without any problem. Then, using the new kernel "4.7.0.intel.r3.1", I tested the sample code "CapsBasic" obtained from

OpenCL™ Platform/Device Capabilities Viewer Sample | Intel® Software.

There was no problem in its compilation & linking to OpenCL.  However, the result is not what I was expecting. The output looks like

[ttaka@localhost CapsBasic]$ ./CapsBasic
Number of available platforms: 1
Platform names:
    [0] Intel(R) OpenCL [Selected]
Number of devices available for each type:
    CL_DEVICE_TYPE_CPU: 1
    CL_DEVICE_TYPE_GPU: 0

    CL_DEVICE_TYPE_ACCELERATOR: 0

*** Detailed information for each device ***

CL_DEVICE_TYPE_CPU[0]
    CL_DEVICE_NAME: Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
    CL_DEVICE_AVAILABLE: 1
    CL_DEVICE_VENDOR: Intel(R) Corporation
    CL_DEVICE_PROFILE: FULL_PROFILE
    CL_DEVICE_VERSION: OpenCL 2.0 (Build 330)
    CL_DRIVER_VERSION: 1.2.0.330
    CL_DEVICE_OPENCL_C_VERSION: OpenCL C 2.0
    CL_DEVICE_MAX_COMPUTE_UNITS: 4
    CL_DEVICE_MAX_CLOCK_FREQUENCY: 2600
    CL_DEVICE_MAX_WORK_GROUP_SIZE: 8192
    CL_DEVICE_ADDRESS_BITS: 64
    CL_DEVICE_MEM_BASE_ADDR_ALIGN: 1024
    CL_DEVICE_MAX_MEM_ALLOC_SIZE: 4174946304
    CL_DEVICE_GLOBAL_MEM_SIZE: 16699785216
    CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: 131072
    CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: 262144
    CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: 64
    CL_DEVICE_LOCAL_MEM_SIZE: 32768
    CL_DEVICE_PROFILING_TIMER_RESOLUTION: 1
    CL_DEVICE_IMAGE_SUPPORT: 1
    CL_DEVICE_ERROR_CORRECTION_SUPPORT: 0
    CL_DEVICE_HOST_UNIFIED_MEMORY: 1
    CL_DEVICE_EXTENSIONS: cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir cl_khr_fp64 cl_khr_image2d_from_buffer
    CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: 1
    CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: 1
    CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: 1
    CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: 1
    CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: 8
    CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: 4
    CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: 8
    CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: 4

Clearly, there is no available GPU as an OpenCL device. Is this possible? As far as I know, Intel HD 5500 is included in the 5th generation HD. The PC being used here is Lenovo Thinkpad x250. With windows8, the PC could program the HD 5500 via OpenCL 1.2.

Also, I installed Media Server 2017 on CentOS 7.2, which was reinstalled from scratch. However, at the verification step, I could not find /dev/dri and could not proceed any further.

I really appreciate any suggestions or information.

 

Best regards,

Toru Takahashi

Thread Topic: 

Question

Same instruction on all 8 EU?

$
0
0

To get peak performance, should all EUs in a single sub-slice issue the same instruction, or do we only need the same instruction within a single EU? At what granularity should I avoid branching?

 

Thanks and regards,

Biren Doshi

 

Uninstall Intel OpenCL Platform?

$
0
0

Machine Details:

Windows 10

CPU: Intel Core i5 4670K Quad Core

Integrated Graphics: HD 4600

I've been developing an AMD OpenCL application using their SDK.  I use VS2015 to compile, then I test on a different machine with a discrete AMD GPU.

Recently I installed an AMD RX 470 on my dev machine, which means I now have two OpenCL platforms: Intel and AMD.  Now, my application crashes when run from the IDE:

Unhandled exception at 0x00007FFB3F76A74A (igdrcl64.dll) in ethminer.exe: 0xC0000005: Access violation reading location 0x0000000000000994.

The error appears to be occurring in igdrcl64.dll when my application attempts to enumerate installed OpenCL platforms by calling clGetPlatformIDs.  Interestingly, my application seems to run fine when run from the command line, outside the IDE.

I don't really know the best way to solve this, but I was thinking it might help to remove the Intel OpenCL platform. How can I do that?  Before installing the new GPU, I uninstalled the Intel Graphics Driver from the control panel, but I don't think it removed everything.

Thread Topic: 

Question

Ubuntu: application detects 1 device while clinfo detects 2

$
0
0

Hi I've installed both CPU and GPU Intel OpenCL drivers on a x86_64 machine running Ubuntu 16.10 and I've encountered an unexplainable (yet) behaviour on my OpenCL project. My application only detects the Intel HD Graphics device but not the Core i7 device. The strangest thing is when I run the clinfo program, it lists both devices. Even stranger, I've downloaded the original source code of the clinfo program to check any difference in code and after building it with the same compile options (-g -O0 -DDEBUG=1) and debugging it under gdb, it does detect two devices with code that seems identical to my program. The only difference I see is that the clinfo program is written in C and compiles with gcc while my program is in C++, uses the C++ wrapper and uses g++.

Here is how clinfo detects the number of devices:

	error = clGetPlatformIDs(0, NULL, &num_platforms);
	if (error != CL_PLATFORM_NOT_FOUND_KHR)
		CHECK_ERROR("number of platforms");

	ALLOC(platform, num_platforms, "platform IDs", cl_platform_id);
	error = clGetPlatformIDs(num_platforms, platform, NULL);
	CHECK_ERROR("platform IDs");

	for (p = 0; p < num_platforms; ++p) {
		error = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &(pdata[p].ndevs));
	        if (error == CL_DEVICE_NOT_FOUND)
		      pdata[p].ndevs = 0;
	       else
		      CHECK_ERROR("number of devices");
        }

And here's how my program works:

/**
 * Enumerate the installed OpenCL platforms, log each one, and select one.
 *
 * The first platform returned by clGetPlatformIDs is the default choice.
 * When pPlatformRequest is a non-empty string, the last platform whose
 * CL_PLATFORM_NAME contains it as a substring is selected instead.
 *
 * @param pPlatformRequest  Optional substring of the wanted platform name;
 *                          may be nullptr or an empty string.
 * @param pPlatform         [out] Receives the selected platform.
 * @return ERR_NONE on success; ERR_FATAL when the platforms cannot be
 *         enumerated or none is installed.
 */
Error CLRuntime::createPlatform(const char* pPlatformRequest, cl::Platform& pPlatform)
{
    std::vector<cl_platform_id> lPlatforms;
    std::string                 lInfo;
    cl_uint                     lCount = 0;

    // Check the status code: on failure the runtime is not required to write
    // lCount, so relying on its value alone would read an indeterminate value.
    cl_int lStatus = clGetPlatformIDs(0, NULL, &lCount);
    if (lStatus != CL_SUCCESS)
    {
        HS_LOG(error) << "clGetPlatformIDs failed with error " << lStatus;
        return ERR_FATAL;
    }
    if (lCount == 0)
    {
        HS_LOG(error) << "OpenCL has zero platforms";
        return ERR_FATAL;
    }

    lPlatforms.resize(lCount);

    lStatus = clGetPlatformIDs(lCount, &lPlatforms.front(), NULL);
    if (lStatus != CL_SUCCESS)
    {
        HS_LOG(error) << "clGetPlatformIDs failed with error " << lStatus;
        return ERR_FATAL;
    }

    // Default is to use first platform found
    pPlatform = lPlatforms[0];

    for (std::size_t i = 0; i < lPlatforms.size(); i++)
    {
        cl::Platform    lPlatform(lPlatforms[i]);

        HS_LOG(debug) << "Platform #"<< i << " :";
        lInfo = lPlatform.getInfo<CL_PLATFORM_VERSION>();
        HS_LOG(debug) << '\t'<< lInfo;
        lInfo = lPlatform.getInfo<CL_PLATFORM_NAME>();
        HS_LOG(debug) << "\tname: "<< lInfo;

        // lInfo holds the platform name at this point.
        if (pPlatformRequest!=nullptr && *pPlatformRequest!='\0'&& lInfo.find(pPlatformRequest) != std::string::npos)
        {
            pPlatform = lPlatforms[i];
        }

        lInfo = lPlatform.getInfo<CL_PLATFORM_VENDOR>();
        HS_LOG(debug) << "\tvendor: "<< lInfo;

        // Use a dedicated counter so a failed query cannot leave a stale
        // value (the platform count or a previous platform's device count).
        cl_uint lDeviceCount = 0;
        lStatus = clGetDeviceIDs(lPlatforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &lDeviceCount);
        if (lStatus != CL_SUCCESS)
        {
            // Best effort: the count is informational only, so report the
            // status (e.g. CL_DEVICE_NOT_FOUND) and treat it as zero devices.
            HS_LOG(debug) << "\tclGetDeviceIDs returned error " << lStatus;
            lDeviceCount = 0;
        }
        HS_LOG(debug) << "\tDevice count: "<< lDeviceCount;
    }

    HS_LOG(info) << ">> Using platform "<< pPlatform.getInfo<CL_PLATFORM_NAME>();

    return ERR_NONE;
}

What other factor could possibly influence this difference in behaviour??

Thread Topic: 

Help Me

OpenCL lists two Intel (R) OpenCL platforms

$
0
0

On a laptop running 64-bit Windows 10 with an Intel HD GPU and an NVIDIA GPU, my OpenCL program and one of the Intel SDK OpenCL sample programs both list two "Intel (R) OpenCL" platforms with the same profile and version, alongside another vendor's platform and an extra "Experimental OpenCL 2.1 CPU Only Platform". Why is there a double entry for the Intel (R) OpenCL platform?
 

Thread Topic: 

Question

clCreateContext fails on CPU but not on GPU

$
0
0

On a laptop running under Windows 10 64 bits and dual Intel GPU / NVIDIA GPU configuration, my OpenCL program returns a CL_OUT_OF_HOST_MEMORY when creating an OpenCL context with the "Intel (R) OpenCL" platform and a "Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz - OpenCL 2.0 (Build 359)" device while the create context succeeds when I choose the "Intel(R) HD Graphics 530" device on the same platform and same code. Now the strange thing is that with one of the Intel OpenCL sample code, on the same machine and system, the context is created normally in both cases. Under VS 2015, at the time of context creation, heap allocation is only around 3 Mb so it feels weird to read that the host is out of memory!

Does anybody know what could be causing this error when choosing the CPU device under certain conditions?

Thread Topic: 

Help Me

sub_group_broadcast() broken on GEN9 (21.20.16.4552)

$
0
0

I have a kernel with a "required subgroup size" of 8.

My test is launching a grid of 24 global work items and 8 local work items (only for testing purposes).

After much debugging, the sub_group_broadcast() function was determined to be the culprit.

Replacing it with work_group_broadcast() resulted in a working kernel.

Is this a known bug?  

All of the other sub_group_XXX() functions appear to be working.

-Allan

Platform: Win10 x64, HD 530, 21.20.16.4552.

 

 

gdbserver-igfx "failed to initialize"

$
0
0

Hello,

I'm trying to use OpenCL™ Kernel Debugger as explained here: https://software.intel.com/en-us/node/671873
I have a target as follows:

  • Ubuntu 16.04LTS
  • Intel Celeron J3160
  • Intel® SDK for OpenCL™ Applications 2016 R3
  • Intel driver for OpenCL - intel-opencl-xxx-r3.0-57406.x86_64
  • intel graphic driver i915 v 4.7.0.intel.r3.0

My problem:
When I use following command:
/usr/bin/gdbserver-igfx :1234 --attach 123

I get error "Failed to initialize"

I tried several things like adding the HOST IP address, starting the program first to put a real PID, ... But nothing worked.

Please Help !

 

Thread Topic: 

Help Me

IGIL vs GEN

$
0
0

Hi,

I see that on OS X, Apple CL compiler has an IGIL target. Is IGIL code the same as GEN code?

Is there a way to inspect IGIL/GEN programs resulting from building a CL kernel on OS X? This has been possible on Windows and Linux for a while now.

Thank you in advance.

 

ioc32 build failure

$
0
0

ioc32.exe (6.3.0.1904) exited with code -1073741819 (0xc0000005) and a stack dump when compiling surf.cl from OpenCV 2.4.x.
See the attached log file.
Platform is VS 2013 on Windows 10.

I narrowed the failure to the following section of code in icvCalcOrientation.

if (tid < ORI_SAMPLES)
{
        const float margin = (float)(grad_wav_size - 1) / 2.0f;
        const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
        const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);

        if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
                x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
        {
                X = icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
                X = c_aptW[tid] * X;
                Y = icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
                Y = c_aptW[tid] * Y;
                angle = atan2(Y, X);

If you switch lines 11 and 12 in this snippet, the code will compile.

AttachmentSize
Downloadapplication/octet-streambuild.log5.8 KB
Downloadapplication/zipsurf.zip10.86 KB

__private memory, spills and loop unrolling on HD Graphics

$
0
0

I have a few questions:

  • If IOC reports that private memory is being used then does that always imply that it's being spilled or can it reside in the remaining registers?
  • How do I detect or analyze my kernel to see if spilling is occurring?
  • Should an auto struct multidimensional array of registers (some dimensions are 1) that is always indexed with constants be automatically fully unrolled (and not appear as private memory)?

I'm seeing a lot of mov and send operations in the .asm dump -- more than I would expect -- and would like to understand what's happening in the kernel and how to get the auto variable struct to be "stationary" in the register file since all the accesses are constant.

This is on GEN9 / Win10/x64 and the latest driver.

One high point:  half floats seem to work OK!

Thanks,

Allan


performance of half2 vector vs. half scalars per SIMD8 lane?

$
0
0

Basic question for GEN8+ experts:

In a SIMD8 kernel, does the GEN8+ EU achieve maximum fp16 throughput with half2 vectors per SIMD lane or are independent half scalars going to be better/worse/same? 

I am also wondering why assigning 4 half2 vectors with constants results in 8 scalar half MOVs?

Given a struct made up of 4 half2 vectors:

       a.x = 0;
       a.y = 0;
       a.z = 0;
       a.w = 1;

this is what gets generated:

         mov      (8|M0)         r79.0<1>:hf   0x3C00:hf
         mov      (8|M0)         r79.8<1>:hf   0x3C00:hf
         mov      (8|M0)         r78.0<1>:hf   0x0:hf
         mov      (8|M0)         r78.8<1>:hf   0x0:hf
         mov      (8|M0)         r77.0<1>:hf   0x0:hf
         mov      (8|M0)         r77.8<1>:hf   0x0:hf
         mov      (8|M0)         r76.0<1>:hf   0x0:hf
         mov      (8|M0)         r76.8<1>:hf   0x0:hf

I was expecting to see a 32-bit MOV initializing each half2 member.

 

Eclipse Code Builder - kernel arguments and configuration locked

$
0
0

Hello,

I'm currently trying to deploy Intel OpenCL on the GPU of my Intel(R) Xeon(R) CPU E3-1505M v5 @ 2.80GHz on CentOS 7.1. My configuration is:

- intel_sdk_for_opencl_2016_6.3.0.1904_x64.tgz

- driver r3.1.58620

- eclipse Oxygen, jre-8u111-linux-x64, mono-4.0.1-4.x86_64

I'm following the Code Builder tutorial. Everything works fine until this step: https://software.intel.com/en-us/node/671857

When I try to launch the kernel, I have an error message telling me that the buffers are not assigned:

execution analysis error: No variable assigned to kernel argument #0 (ptrInput)

So I go to the OpenCL Kernel Analysis Input view and the configuration as well as the assigned variables are all in grey. I cannot click on the configuration and the kernel variables fields are empty.

Can you please help me ?

Regards,

Mathieu

Here are the buffers I have declared :

<!--  Kernel Builder Variables Database  --><KernelBuilderBuffers><BufferDetails><Name>ptrInput</Name><DataType>uchar</DataType><MemoryFlags>CL_MEM_READ_WRITE</MemoryFlags><AccessQualifier>READ_ONLY</AccessQualifier><I_O_Mode>0</I_O_Mode><Source>/home/user/STI/THOREL/workspace_eclipse/Code Builder Sessions/session_0/bin_ptrInput.bin</Source><ReadOutput>true</ReadOutput><BufferSize>262144</BufferSize><InitByRandom>false</InitByRandom><InitByZero>true</InitByZero><UseAsSVM>false</UseAsSVM></BufferDetails><BufferDetails><Name>ptrOutput</Name><DataType>uchar</DataType><MemoryFlags>CL_MEM_READ_WRITE</MemoryFlags><AccessQualifier>READ_ONLY</AccessQualifier><I_O_Mode>1</I_O_Mode><Source></Source><ReadOutput>true</ReadOutput><BufferSize>262144</BufferSize><InitByRandom>false</InitByRandom><InitByZero>false</InitByZero><UseAsSVM>false</UseAsSVM></BufferDetails></KernelBuilderBuffers><KernelBuilderImages/><KernelBuilderSamplers/>

 

The kernel is :

 

#define FILTER_WIDTH ( 11 )
#define FILTER_SIZE  ( FILTER_WIDTH * FILTER_WIDTH )
#define FILTER_INF   ( 5 ) // Half-width of the filter: floor( FILTER_WIDTH / 2 ), kept in sync by hand

/*
 * 2D convolution of an 8-bit image with a fixed FILTER_WIDTH x FILTER_WIDTH
 * integer filter, reading directly from global memory.
 *
 * ptrInput  : source image, indexed as x + y * iWidth
 * ptrOutput : destination image, same indexing
 * iWidth    : image width in pixels
 * iHeight   : image height in pixels
 *
 * One work-item handles the pixel at (get_global_id(0), get_global_id(1)).
 * Work-items closer than FILTER_INF pixels to any image edge write nothing.
 */
__kernel void convolve_no_local( const __global unsigned char * restrict ptrInput,
                                       __global unsigned char * restrict ptrOutput,
                                 const          int                      iWidth,
                                 const          int                      iHeight )
{
    // Filter weights, row-major (original note: must be normalized)
    const int arrFilter[ FILTER_SIZE ] = { -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,  0,
                                           -2, -2, -2, -2, -2, -2, -2, -2, -2,  0,  2,
                                           -2, -2, -2, -2, -2, -2, -2, -2,  0,  2,  2,
                                           -2, -2, -2, -2, -2, -2, -2,  0,  2,  2,  2,
                                           -2, -2, -2, -2, -2, -2,  0,  2,  2,  2,  2,
                                           -2, -2, -2, -2, -2,  0,  2,  2,  2,  2,  2,
                                           -2, -2, -2, -2,  0,  2,  2,  2,  2,  2,  2,
                                           -2, -2, -2,  0,  2,  2,  2,  2,  2,  2,  2,
                                           -2, -2,  0,  2,  2,  2,  2,  2,  2,  2,  2,
                                           -2,  0,  2,  2,  2,  2,  2,  2,  2,  2,  2,
                                            0,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2 };

    const int gx = get_global_id( 0 );
    const int gy = get_global_id( 1 );

    // Skip the border band where the filter window would leave the image.
    if( gx < FILTER_INF || gx >= iWidth  - FILTER_INF ||
        gy < FILTER_INF || gy >= iHeight - FILTER_INF )
    {
        return;
    }

    // Top-left corner of the filter window in image coordinates.
    const int left = gx - FILTER_INF;
    const int top  = gy - FILTER_INF;

    int acc = 0;

#ifdef __UNROLL_3x3__
    // Hand-unrolled variant: only the first nine weights and a 3x3 window are used.
    const int row0 = left + top * iWidth;
    const int row1 = row0 + iWidth;
    const int row2 = row1 + iWidth;

    acc += arrFilter[ 0 ] * convert_int( ptrInput[ row0     ] );
    acc += arrFilter[ 1 ] * convert_int( ptrInput[ row0 + 1 ] );
    acc += arrFilter[ 2 ] * convert_int( ptrInput[ row0 + 2 ] );
    acc += arrFilter[ 3 ] * convert_int( ptrInput[ row1     ] );
    acc += arrFilter[ 4 ] * convert_int( ptrInput[ row1 + 1 ] );
    acc += arrFilter[ 5 ] * convert_int( ptrInput[ row1 + 2 ] );
    acc += arrFilter[ 6 ] * convert_int( ptrInput[ row2     ] );
    acc += arrFilter[ 7 ] * convert_int( ptrInput[ row2 + 1 ] );
    acc += arrFilter[ 8 ] * convert_int( ptrInput[ row2 + 2 ] );
#else
    for( int r = 0 ; r < FILTER_WIDTH ; ++r )
    {
        // First pixel of window row r.
        const int rowStart = left + ( top + r ) * iWidth;

        for( int c = 0 ; c < FILTER_WIDTH ; ++c )
        {
            acc += arrFilter[ r * FILTER_WIDTH + c ] * convert_int( ptrInput[ rowStart + c ] );
        }
    }
#endif

    // Saturate the accumulated sum into the 0..255 range of the output pixel.
    ptrOutput[ gx + gy * iWidth ] = convert_uchar_sat( acc );
}

 

Here are the view screenshots:

     

         

Thread Topic: 

Help Me

CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT on Intel HD Graphics 400

$
0
0

Hi,

I have a target as follows:

  • Ubuntu 16.04LTS
  • Intel Celeron J3160 (GPU Intel HD Graphics 400)
  • Intel® SDK for OpenCL™ Applications 2016 R3
  • Intel driver for OpenCL - intel-opencl-xxx-r3.0-57406.x86_64
  • intel graphic driver i915 v 4.7.0.intel.r3.0

I have tried to query the device's preferred vector widths and got the following result:

    Device Name = Intel(R) HD Graphics
    Device Vendor = Intel(R) Corporation
    Preferred vector width in chars: 16
    Preferred vector width in shorts: 8
    Preferred vector width in ints: 4
    Preferred vector width in longs: 1
    Preferred vector width in floats: 1
    Preferred vector width in doubles: 0
    Preferred vector width in halfs: 8

It clearly appears that there is 128bits vectorization, at least for chars,shorts,ints and halfs.
But strangely not for floats and longs?

1) Am I right ?
2) How to explain that ?
 

Also we plan to go to next generation with an Intel N3350 (HD Graphics 500).
3) So would it be the same ?

4) Where can I find documentation about that, on HD Graphics 400 and HD Graphics 500 ?

Thank you
Regards

Thread Topic: 

Help Me

CL_KERNEL_SPILL_MEM_SIZE_INTEL interpretation?

$
0
0

I'm trying to hunt down spills in my kernel.

Is there a more detailed description of the CL_KERNEL_SPILL_MEM_SIZE_INTEL query (link)?

Is this byte value per work item, sub_group or work_group?

I ask because I'm seeing a consistent "2048" returned by this query.

Also, what GEN instruction(s) indicate spillage? Is it SENDS?

 

Intel GEN SLM allocation granularity still 4KB per workgroup?

$
0
0

Given a kernel that uses no barriers, does this recommendation still hold for GEN8 and beyond?

https://software.intel.com/en-us/node/540442

NOTE

A bare minimum SLM allocation size is 4k per workgroup, so even if your kernel requires less bytes per work-group, the actual allocation still will be 4k. To accommodate many potential execution scenarios try to minimize local memory usage to fit the optimal value of 4K per workgroup. Also notice that the granularity of SLM allocation is 1K.

 

Viewing all 1182 articles
Browse latest View live


<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>