Hello,
Expected behavior (only occurs in the CPU, not in the IGPU):
1. The host creates an user event. Then, the host calls a EnqueueKernelNDRange (vector addition) and waits for the user event (WaitForEvents). When the kernel finishes it triggers the callback "callback_kernel".
2. This "callback_kernel" calls a EnqueueReadBuffer non-blocking, and when it finishes triggers the callback "callback_read".
3. The "callback_read" sets CL_COMPLETE the user event.
4. The host continues after the WaitForEvents with the content filled (buffer read).
If I select the Intel CPU as device:
./callback 0 1 0 1 1 0
platforms: 1 (0) platform name: Intel(R) OpenCL num devices: 2 (0) devices: 2 (0) context (0) device name: Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz device type: CL_DEVICE_TYPE_CPU command queue (0) mode CPU create buffer a (0) create buffer b (0) create buffer c (0) create program (0) build program (0) create kernel1 (0) gws {1024, 1, 1} lws {128, 1, 1} set arg 0 (0) set arg 1 (0) set arg 2 (0) set arg 3 (0) nd range kernel1 (0) create user event (0) c_v 0x1420030 set event callback (0) waiting for events -- BEGIN callback kernel executed (0) c_v 0x1420030 c_v[0] = 0 about to read the c buffer blocking 0 enqueue read buffer (0) set event callback (0) -- END -- BEGIN callback read executed (0) c_v 0x1420030 c_v[0] = 1 set user event status (0) -- END waited for events About to wait events: 1 3 = CL_QUEUED, 0 = CL_COMPLETE, 2 = CL_SUBMITTED, 1 = CL_RUNNING cl_event CPU init 0 [0] = status 0 (ref 0x7f7568000a90) CPU [0] = 1, [1] = 3, [2] = 5 about to check (first: 1)
It works.
Now, if I select the Intel IGPU (Intel Integrated GPU):
./callback 0 0 2 1 1 0
(gdb) r 0 0 2 1 1 0 Starting program: /callbacks/build/callback 0 0 2 1 1 0 [Thread debugging using libthread_db enabled] Using host libthread_db library "/usr/lib/libthread_db.so.1". [New Thread 0x7ffff4cd9700 (LWP 21291)] platforms: 1 (0) platform name: Intel(R) OpenCL num devices: 2 (0) devices: 2 (0) [New Thread 0x7fffeede2700 (LWP 21292)] [New Thread 0x7fffee5e0700 (LWP 21293)] [New Thread 0x7fffee9e1700 (LWP 21294)] context (0) device name: Intel(R) HD Graphics device type: CL_DEVICE_TYPE_GPU command queue (0) mode IGPU create buffer a (0) create buffer b (0) create buffer c (0) create program (0) build program (0) create kernel1 (0) gws {1024, 1, 1} lws {128, 1, 1} set arg 0 (0) set arg 1 (0) set arg 2 (0) set arg 3 (0) nd range kernel1 (0) create user event (0) c_v 0x607030 [New Thread 0x7fffec827700 (LWP 21295)] set event callback (0) waiting for events -- BEGIN callback kernel executed (0) c_v 0x607030 c_v[0] = 0 about to read the c buffer blocking 0 enqueue read buffer (0) set event callback (0) -- END ^C Thread 1 "callback" received signal SIGINT, Interrupt. 0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0 (gdb) bt #0 0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0 #1 0x00007ffff64c635b in ?? () from /opt/intel/opencl/libintelocl.so #2 0x00007ffff648c63a in ?? () from /opt/intel/opencl/libintelocl.so #3 0x00007ffff647b5d1 in ?? () from /opt/intel/opencl/libintelocl.so #4 0x00007ffff63f3e75 in clWaitForEvents () from /opt/intel/opencl/libintelocl.so #5 0x00007ffff6edca43 in ?? () from /opt/intel/opencl/libIntelOpenCL.so #6 0x000000000040237e in main (argc=7, argv=0x7fffffffdc58) at ./src/callback.c:532
It is freezes / hangs.
It works in the Intel CPU but not in the Intel Integrated GPU (Graphics).
(gdb shows that is freezed in the `pthread_cond_wait` of the intel opencl driver).
Can anyone explain really what is the behavior with the callbacks/events and the host thread? (best practices, how to avoid deadlocks)
I need fine grained control and the fastest performance, and it looks like is callbacks, but they have weird behaviors...
#include <CL/cl.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #define MAX_FILE_SIZE 1024000 #include <sys/stat.h> #include <sys/types.h> typedef enum ocl_type_e_t { OCL_TYPE_NULL = 0, OCL_TYPE_CPU = 1, OCL_TYPE_GPU = 2, OCL_TYPE_IGPU = 3, OCL_TYPE_ACC = 4 } ocl_type_e_t; const char* cl_device_type_to_str(cl_device_type type) { static char* strings[] = { "(invalid)", // invalid"CL_DEVICE_TYPE_CPU","CL_DEVICE_TYPE_GPU","CL_DEVICE_TYPE_ACCELERATOR","CL_DEVICE_TYPE_CUSTOM","CL_DEVICE_TYPE_DEFAULT","CL_DEVICE_TYPE_ALL", }; char* ret; switch (type) { case CL_DEVICE_TYPE_CPU: ret = strings[1]; break; case CL_DEVICE_TYPE_GPU: ret = strings[2]; break; case CL_DEVICE_TYPE_ACCELERATOR: ret = strings[3]; break; case CL_DEVICE_TYPE_CUSTOM: ret = strings[4]; break; case CL_DEVICE_TYPE_DEFAULT: ret = strings[5]; break; case CL_DEVICE_TYPE_ALL: ret = strings[6]; break; default: ret = strings[0]; break; } return ret; } const char* file_read(char* const path) { struct stat st; /* st = (struct stat*)malloc(sizeof(stat)); */ int error = stat(path, &st); if (error != 0) { printf("Invalid file %s\n", path); exit(EXIT_FAILURE); } int size_file = st.st_size; if (size_file > MAX_FILE_SIZE) { printf("File %s is bigger than the max allowed size (%d > %d bytes)\n", path, size_file, MAX_FILE_SIZE); exit(EXIT_FAILURE); } FILE* fp = fopen(path, "r"); if (fp == NULL) { printf("Error opening the file %s\n", path); exit(EXIT_FAILURE); } char* const buf = (char* const)malloc(size_file); if (buf == NULL) { printf("Error allocating %d bytes for the contents of the file %s\n", size_file, path); exit(EXIT_FAILURE); } int size_read; while ((size_read = fread(buf, sizeof(char), size_file, fp)) > 0) { ; } fclose(fp); return buf; } cl_event clb_events_waiting[100]; int clb_events_waiting_device[100]; int clb_events_init_read[100]; int clb_num_events_waiting = 0; void clbWaitEvents(int * c) { if (clb_num_events_waiting > 0){ printf("About to wait events: %d\n", clb_num_events_waiting); int i; int waiting = 0; cl_event ev_waiting[100]; printf("%d = CL_QUEUED, %d = CL_COMPLETE, %d = CL_SUBMITTED, %d = CL_RUNNING\n", CL_QUEUED, CL_COMPLETE, CL_SUBMITTED, CL_RUNNING); for (i=0; i<clb_num_events_waiting; i++){ cl_int ret; clGetEventInfo(clb_events_waiting[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL); int dev = clb_events_waiting_device[i]; int init = clb_events_init_read[i] / sizeof(int); printf("cl_event %s init %6d [%d] = status %d (ref %p)\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, i, ret, (void*)clb_events_waiting[i]); if (ret != CL_COMPLETE){ ev_waiting[waiting] = clb_events_waiting[i]; waiting++; } } for (i=0; i<clb_num_events_waiting; i++){ int dev = clb_events_waiting_device[i]; int init = clb_events_init_read[i] / sizeof(int); printf("%s [%d] = %d, [%d] = %d, [%d] = %d\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, c[init], init + 1, c[init + 1], init + 2, c[init + 2]); } if (waiting > 0){ printf("about to wait %d events\n", waiting); clWaitForEvents(waiting, ev_waiting); printf("wait events finished\n"); } /* clWaitForEvents(clb_num_events_waiting, clb_events_waiting); */ } } typedef struct callback_data { cl_command_queue* queue; cl_mem* buf_c; int* c_v; uint size; cl_event* end; bool nested_callbacks; bool blocking; } callback_data; void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status, void* user_data); void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data); int main(int argc, char* argv[]) { bool use_callbacks = true; bool use_nested_callbacks = true; bool use_blocking = false; int numSelPlatform = 0; int numSelDevice = 0; int doUseCallbacks = 0; int doUseNestedCallbacks = 0; int doUseBlocking = 0; int use_type = 0; if (argc != 7) { printf("./%s (platform) (device) (type cpu 0|gpu 1|igpu 2|acc 3) (use ""callbacks) (use nested callbacks) (use blocking)\n", argv[0]); exit(EXIT_FAILURE); } else { numSelPlatform = atoi(argv[1]); numSelDevice = atoi(argv[2]); use_type = atoi(argv[3]); doUseCallbacks = atoi(argv[4]); doUseNestedCallbacks = atoi(argv[5]); doUseBlocking = atoi(argv[6]); } cl_event end; uint size = 1024; int* a_v = (int*)malloc(size * sizeof(int)); int* b_v = (int*)malloc(size * sizeof(int)); int* c_v = (int*)malloc(size * sizeof(int)); for (size_t i = 0; i < size; i++) { a_v[i] = i; b_v[i] = i + 1; c_v[i] = 0; } const char* kernel_str = file_read("src/kernel.cl"); use_callbacks = doUseCallbacks; use_nested_callbacks = doUseNestedCallbacks; use_blocking = doUseBlocking ? CL_TRUE : CL_FALSE; cl_int st; cl_int err; int len = 256; char buflog[len]; cl_uint numPlatforms = 0; st = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id* platforms = NULL; platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); st = clGetPlatformIDs(numPlatforms, platforms, NULL); printf("platforms: %d (%d)\n", numPlatforms, st); cl_uint selPlatform = numSelPlatform; // 1; numPlatforms = 1; cl_platform_id platform = platforms[selPlatform]; clGetPlatformInfo(platform, CL_PLATFORM_NAME, len, &buflog, NULL); if (buflog != NULL) { printf("platform name: %s\n", buflog); } cl_uint numDevices = 0; st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); printf("num devices: %d (%d)\n", numDevices, st); if (st != CL_SUCCESS) { /* printf("explain error: %s\n", clErrorString(st)); */ printf("error: %d\n", st); } cl_device_id* devices = NULL; devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); printf("devices: %d (%d)\n", numDevices, st); // Context cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err); printf("context (%d)\n", err); // Select device cl_uint selDevice = numSelDevice; // 0; numDevices = 1; // clBuildProgram cl_device_id device = devices[selDevice]; // Device Info clGetDeviceInfo(device, CL_DEVICE_NAME, len, &buflog, NULL); if (buflog != NULL) { printf("device name: %s\n", buflog); } cl_device_type type; clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL); printf("device type: %s\n", cl_device_type_to_str(type)); // events cl_event ev_kernel; // CommandQueue /* cl_command_queue_properties props; */ cl_command_queue queue; queue = clCreateCommandQueue(context, device, 0, &err); printf("command queue (%d)\n", err); // CreateBuffer cl_mem buf_a; cl_mem buf_b; cl_mem buf_c; ocl_type_e_t ocl_type; if (use_type == 0) { ocl_type = OCL_TYPE_CPU; printf("mode CPU\n"); } else if (use_type == 1) { ocl_type = OCL_TYPE_GPU; printf("mode GPU\n"); } else if (use_type == 2) { ocl_type = OCL_TYPE_IGPU; printf("mode IGPU\n"); } else if (use_type == 3) { ocl_type = OCL_TYPE_ACC; printf("mode ACC\n"); } /* cl_mem buf_x; */ switch (ocl_type) { case OCL_TYPE_IGPU: buf_a = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int), a_v, &err); /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */ /* Acpy, &err); */ break; case OCL_TYPE_GPU: buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int), a_v, &err); break; case OCL_TYPE_ACC: buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size * sizeof(int), a_v, &err); break; case OCL_TYPE_CPU: buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size * sizeof(int), a_v, &err); break; default: printf("no ocl_type defined\n"); exit(EXIT_FAILURE); break; } printf("create buffer a (%d)\n", err); if (err != CL_SUCCESS) { /* printf("create buffer error: %s\n", clErrorString(err)); */ printf("create buffer error: %d\n", err); } switch (ocl_type) { case OCL_TYPE_IGPU: buf_b = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int), b_v, &err); break; case OCL_TYPE_GPU: buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int), b_v, &err); break; case OCL_TYPE_ACC: buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size * sizeof(int), b_v, &err); break; case OCL_TYPE_CPU: buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size * sizeof(int), b_v, &err); break; default: printf("no ocl_type defined\n"); exit(EXIT_FAILURE); break; } printf("create buffer b (%d)\n", err); if (err != CL_SUCCESS) { printf("create buffer error: %d\n", err); /* printf("create buffer error: %s\n", clErrorString(err)); */ } switch (ocl_type) { case OCL_TYPE_IGPU: buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int), c_v, &err); /* buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, c_rows * c_cols * * sizeof(int), */ /* c_v, &err); */ /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */ /* Acpy, &err); */ break; case OCL_TYPE_GPU: buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int), c_v, &err); break; case OCL_TYPE_ACC: buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size * sizeof(int), c_v, &err); break; case OCL_TYPE_CPU: buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, /* buf_c = */ /* clCreateBuffer(context, CL_MEM_USE_HOST_PTR, */ /* buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, */ size * sizeof(int), c_v, &err); break; default: printf("no ocl_type defined\n"); exit(EXIT_FAILURE); break; } printf("create buffer c (%d)\n", err); if (err != CL_SUCCESS) { /* printf("create buffer error: %s\n", clErrorString(err)); */ printf("create buffer error: %d\n", err); } /* b_x = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), x, * &err); */ /* printf("create buffer x (%d)\n", err); */ // WriteBuffer /* st = clEnqueueWriteBuffer(queue, b_a, CL_FALSE, 0, n * n * sizeof(float), */ /* Acpy, 0, NULL, NULL); */ /* printf("write buffer Acpy - b_a (%d)\n", st); */ /* st = clEnqueueWriteBuffer(queue, b_b, CL_FALSE, 0, n * sizeof(float), bcpy, * 0, */ /* NULL, NULL); */ /* printf("write buffer bcpy - b_b (%d)\n", st); */ // Create Program cl_program program; program = clCreateProgramWithSource(context, 1, (const char**)&kernel_str, NULL, &err); printf("create program (%d)\n", err); // Build Program /* st = clBuildProgram(program, numDevices, (cl_device_id*)&device, NULL, * NULL, */ /* NULL); */ char* opts = "-Werror"; st = clBuildProgram(program, numDevices, (cl_device_id*)&device, opts, NULL, NULL); printf("build program (%d)\n", st); if (st != CL_SUCCESS) { /* printf("build status: %s\n", clErrorString(st)); */ printf("build status: %d\n", st); char log[512]; st = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 512, &log, NULL); printf("build info (%d)\n", st); if (st == CL_SUCCESS) { printf("%s\n", log); } } // Create Kernel cl_kernel kernel1; kernel1 = clCreateKernel(program, "kernel1", &st); printf("create kernel1 (%d)\n", st); /* cl_kernel kernel2; */ /* kernel2 = clCreateKernel(program, "ocl1_2", &st); */ /* printf("create kernel2 (%d)\n", st); */ // workgroup size size_t dims = 1; size_t gws[] = { 1, 1, 1 }; /* size_t gws[dims]; */ gws[0] = size; // a_rows; /* gws[0] = 32; */ /* size_t* lws = NULL; */ /* size_t lws[dims]; */ /* size_t lws[dims]; */ /* size_t lws[dims] = NULL; */ /* size_t lws[] = {0, 0, 0}; */ size_t lws[] = { 128, 1, 1 }; printf("gws {%lu, %lu, %lu}\n", gws[0], gws[1], gws[2]); if (lws != NULL) { printf("lws {%lu, %lu, %lu}\n", lws[0], lws[1], lws[2]); } else { printf("lws unspecified\n"); } // Set Kernel Args st = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_a); printf("set arg %d (%d)\n", 0, st); st = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &buf_b); printf("set arg %d (%d)\n", 1, st); /* printf("set kernel1 arg: %d (%d)\n", 0, st); */ st = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &buf_c); printf("set arg %d (%d)\n", 2, st); st = clSetKernelArg(kernel1, 3, sizeof(int), (int*)&size); printf("set arg %d (%d)\n", 3, st); // Execute kernel st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws, (const size_t*)lws, 0, NULL, &ev_kernel); /* (const size_t*)lws, 0, NULL, NULL); */ /* printf("nd range kernel1 (%d %s)\n", st, clErrorString(st)); */ printf("nd range kernel1 (%d)\n", st); end = clCreateUserEvent(context, &st); printf("create user event (%d)\n", st); callback_data* user_data = (callback_data*)malloc(sizeof(callback_data)); printf("c_v %p\n", (void*)c_v); user_data->queue = &queue; user_data->buf_c = &buf_c; user_data->c_v = c_v; user_data->size = size; user_data->end = &end; user_data->nested_callbacks = use_nested_callbacks; user_data->blocking = use_blocking; if (use_callbacks) { st = clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data); printf("set event callback (%d)\n", st); } /* printf("first: %2.5f\n", c_v[0]); */ /* print_matrix_float_s_t("c", c); */ // ReadBuffer /* float* ptr = (float*)clEnqueueMapBuffer(queue, buf_c, CL_TRUE, CL_MAP_READ, * 0, c_rows * c_cols * sizeof(float), 0, NULL, NULL, &st); */ /* printf("read buffer c_v - buf_c (%d)\n", st); */ /* printf("finish queue\n"); */ /* clFinish(queue); */ /* printf("finished queue\n"); */ if (use_callbacks) { /* clWaitForCompletion(context); */ printf("waiting for events\n"); /* /\* cl_event events[] = {ev_kernel}; *\/ */ cl_event events[] = { end }; clWaitForEvents(1, events); // ev_kernel); printf("waited for events\n"); clbWaitEvents(c_v); } else { printf("about to read the c buffer\n"); st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0, size * sizeof(int), c_v, 0, NULL, NULL); printf("read buffer c_v - buf_c (%d)\n", st); } /* print_matrix("c_v", c_v, c_rows, c_cols); */ /* printf("first: %2.5f\n", c_v[0]); */ /* print_matrix_float_s_t("c", c); */ free(user_data); clReleaseKernel(kernel1); /* clReleaseKernel(kernel2); */ clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseMemObject(buf_a); clReleaseMemObject(buf_b); clReleaseMemObject(buf_c); /* clReleaseMemObject(b_x); */ clReleaseContext(context); free(devices); free(platforms); #define THRESHOLD 0 // check printf("about to check (first: %d)\n", c_v[0]); for (size_t i = 0; i < size; i++) { if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD) { printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d\n", i, a_v[i], i, b_v[i], i, c_v[i]); exit(EXIT_FAILURE); } } return EXIT_SUCCESS; } void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status, void* user_data) { printf("-- BEGIN callback read executed (%d)\n", ev_status); callback_data* cb_data = (callback_data*)user_data; /* cl_command_queue queue = *(cb_data->queue); */ /* cl_mem buf_c = *(cb_data->buf_c); */ int* c_v = cb_data->c_v; cl_event end = *(cb_data->end); /* int size = cb_data->size; */ cl_int st; printf("c_v %p\n", (void*)c_v); printf("c_v[0] = %d\n", c_v[0]); /* c_v[1] = 1; */ st = clSetUserEventStatus(end, CL_COMPLETE); printf("set user event status (%d)\n", st); // haz que salga el finish printf("-- END\n"); } cl_event ev_read; void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data) { printf("-- BEGIN callback kernel executed (%d)\n", ev_status); callback_data* cb_data = (callback_data*)user_data; cl_command_queue queue = *(cb_data->queue); cl_mem buf_c = *(cb_data->buf_c); int* c_v = cb_data->c_v; int size = cb_data->size; bool nested_callbacks = cb_data->nested_callbacks; bool blocking = cb_data->blocking; cl_event end = *(cb_data->end); printf("c_v %p\n", (void*)c_v); printf("c_v[0] = %d\n", c_v[0]); cl_int st; /* printf("about to flush\n"); */ /* clFlush(queue); */ /* printf("flushed\n"); */ size_t offset = 0; /* size = size + 4; */ printf("about to read the c buffer\n"); printf("blocking %d\n", blocking); clb_events_waiting_device[clb_num_events_waiting] = 0; clb_events_init_read[clb_num_events_waiting] = 0; /* why it does not work? (blocking CL_TRUE) */ st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int), c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]); ev_read = clb_events_waiting[clb_num_events_waiting - 1]; printf("enqueue read buffer (%d)\n", st); /* size * sizeof(int), c_v, 0, NULL, NULL); */ if (nested_callbacks) { st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data); printf("set event callback (%d)\n", st); /* st = clSetUserEventStatus(end, CL_COMPLETE); */ /* printf("set user event status (%d)\n", st); */ } /* c_v[1] = 1; */ /* st = clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, ); */ /* printf("event info (%d)\n", st); */ /* int len = 512; */ /* char buflog[len]; */ /* cl_command_type; */ /* clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, len, &buflog, NULL); */ /* if (buflog != NULL) { */ /* printf("- event: %s\n", buflog); */ /* } */ if (!nested_callbacks) { st = clSetUserEventStatus(end, CL_COMPLETE); printf("set user event status (%d)\n", st); /* printf("read buffer c_v - buf_c (%d)\n", st); */ } printf("-- END\n"); }
Kernel:
__kernel void kernel1(__global int* a, __global int* b, __global int* c, int size) { int idx = get_global_id(0); if (idx >= 0 && idx < size){ c[idx] = a[idx] + b[idx]; } }