Quantcast
Channel: Intel® Software - OpenCL*
Viewing all articles
Browse latest Browse all 1182

OpenCL Callback freezed in Intel GPU (hangs) - deadlock? pthread_cond_wait

$
0
0

Hello,

Expected behavior (only occurs in the CPU, not in the IGPU):
1. The host creates an user event. Then, the host calls a EnqueueKernelNDRange (vector addition) and waits for the user event (WaitForEvents). When the kernel finishes it triggers the callback "callback_kernel".
2. This "callback_kernel" calls a EnqueueReadBuffer non-blocking, and when it finishes triggers the callback "callback_read".
3. The "callback_read" sets CL_COMPLETE the user event.
4. The host continues after the WaitForEvents with the content filled (buffer read).

If I select the Intel CPU as device:

./callback 0 1 0 1 1 0

 

platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
device type: CL_DEVICE_TYPE_CPU
command queue (0)
mode CPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x1420030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x1420030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
-- BEGIN callback read executed (0)
c_v 0x1420030
c_v[0] = 1
set user event status (0)
-- END
waited for events
About to wait events: 1
3 = CL_QUEUED, 0 = CL_COMPLETE, 2 = CL_SUBMITTED, 1 = CL_RUNNING
cl_event CPU init      0  [0] = status 0 (ref 0x7f7568000a90)
CPU [0] = 1, [1] = 3, [2] = 5
about to check (first: 1)

 

It works.

Now, if I select the Intel IGPU (Intel Integrated GPU):

 

./callback 0 0 2 1 1 0

 

(gdb) r 0 0 2 1 1 0
Starting program: /callbacks/build/callback 0 0 2 1 1 0
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
[New Thread 0x7ffff4cd9700 (LWP 21291)]
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
[New Thread 0x7fffeede2700 (LWP 21292)]
[New Thread 0x7fffee5e0700 (LWP 21293)]
[New Thread 0x7fffee9e1700 (LWP 21294)]
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x607030
[New Thread 0x7fffec827700 (LWP 21295)]
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x607030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
^C
Thread 1 "callback" received signal SIGINT, Interrupt.
0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
(gdb) bt
#0  0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
#1  0x00007ffff64c635b in ?? () from /opt/intel/opencl/libintelocl.so
#2  0x00007ffff648c63a in ?? () from /opt/intel/opencl/libintelocl.so
#3  0x00007ffff647b5d1 in ?? () from /opt/intel/opencl/libintelocl.so
#4  0x00007ffff63f3e75 in clWaitForEvents () from /opt/intel/opencl/libintelocl.so
#5  0x00007ffff6edca43 in ?? () from /opt/intel/opencl/libIntelOpenCL.so
#6  0x000000000040237e in main (argc=7, argv=0x7fffffffdc58) at ./src/callback.c:532

It is freezes / hangs.

It works in the Intel CPU but not in the Intel Integrated GPU (Graphics).

(gdb shows that is freezed in the `pthread_cond_wait` of the intel opencl driver).

Can anyone explain really what is the behavior with the callbacks/events and the host thread? (best practices, how to avoid deadlocks)

I need fine grained control and the fastest performance, and it looks like is callbacks, but they have weird behaviors...

    #include <CL/cl.h>

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_FILE_SIZE 1024000

    #include <sys/stat.h>
    #include <sys/types.h>

    typedef enum ocl_type_e_t {
      OCL_TYPE_NULL = 0,
      OCL_TYPE_CPU = 1,
      OCL_TYPE_GPU = 2,
      OCL_TYPE_IGPU = 3,
      OCL_TYPE_ACC = 4
    } ocl_type_e_t;


    const char*
    cl_device_type_to_str(cl_device_type type)
    {
      static char* strings[] = {
        "(invalid)", // invalid"CL_DEVICE_TYPE_CPU","CL_DEVICE_TYPE_GPU","CL_DEVICE_TYPE_ACCELERATOR","CL_DEVICE_TYPE_CUSTOM","CL_DEVICE_TYPE_DEFAULT","CL_DEVICE_TYPE_ALL",
      };

      char* ret;

      switch (type) {
        case CL_DEVICE_TYPE_CPU:
          ret = strings[1];
          break;
        case CL_DEVICE_TYPE_GPU:
          ret = strings[2];
          break;
        case CL_DEVICE_TYPE_ACCELERATOR:
          ret = strings[3];
          break;
        case CL_DEVICE_TYPE_CUSTOM:
          ret = strings[4];
          break;
        case CL_DEVICE_TYPE_DEFAULT:
          ret = strings[5];
          break;
        case CL_DEVICE_TYPE_ALL:
          ret = strings[6];
          break;
        default:
          ret = strings[0];
          break;
      }
      return ret;
    }

    const char*
    file_read(char* const path)
    {
      struct stat st;
      /* st = (struct stat*)malloc(sizeof(stat)); */
      int error = stat(path, &st);
      if (error != 0) {
        printf("Invalid file %s\n", path);
        exit(EXIT_FAILURE);
      }

      int size_file = st.st_size;

      if (size_file > MAX_FILE_SIZE) {
        printf("File %s is bigger than the max allowed size (%d > %d bytes)\n",
               path, size_file, MAX_FILE_SIZE);
        exit(EXIT_FAILURE);
      }

      FILE* fp = fopen(path, "r");
      if (fp == NULL) {
        printf("Error opening the file %s\n", path);
        exit(EXIT_FAILURE);
      }

      char* const buf = (char* const)malloc(size_file);
      if (buf == NULL) {
        printf("Error allocating %d bytes for the contents of the file %s\n",
               size_file, path);
        exit(EXIT_FAILURE);
      }

      int size_read;
      while ((size_read = fread(buf, sizeof(char), size_file, fp)) > 0) {
        ;
      }

      fclose(fp);

      return buf;
    }



    cl_event clb_events_waiting[100];
    int clb_events_waiting_device[100];
    int clb_events_init_read[100];
    int clb_num_events_waiting = 0;

    void
    clbWaitEvents(int * c)
    {
      if (clb_num_events_waiting > 0){
        printf("About to wait events: %d\n", clb_num_events_waiting);
        int i;
        int waiting = 0;
        cl_event ev_waiting[100];
        printf("%d = CL_QUEUED, %d = CL_COMPLETE, %d = CL_SUBMITTED, %d = CL_RUNNING\n", CL_QUEUED, CL_COMPLETE, CL_SUBMITTED, CL_RUNNING);
        for (i=0; i<clb_num_events_waiting; i++){
          cl_int ret;
          clGetEventInfo(clb_events_waiting[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);
          int dev = clb_events_waiting_device[i];
          int init = clb_events_init_read[i] / sizeof(int);
          printf("cl_event %s init %6d  [%d] = status %d (ref %p)\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, i, ret, (void*)clb_events_waiting[i]);

          if (ret != CL_COMPLETE){
            ev_waiting[waiting] = clb_events_waiting[i];
            waiting++;
          }
        }

        for (i=0; i<clb_num_events_waiting; i++){
          int dev = clb_events_waiting_device[i];
          int init = clb_events_init_read[i] / sizeof(int);
          printf("%s [%d] = %d, [%d] = %d, [%d] = %d\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, c[init], init + 1, c[init + 1], init + 2, c[init + 2]);
        }

        if (waiting > 0){
          printf("about to wait %d events\n", waiting);
          clWaitForEvents(waiting, ev_waiting);
          printf("wait events finished\n");
        }
        /* clWaitForEvents(clb_num_events_waiting, clb_events_waiting); */
      }
    }














    typedef struct callback_data
    {
      cl_command_queue* queue;
      cl_mem* buf_c;
      int* c_v;
      uint size;
      cl_event* end;
      bool nested_callbacks;
      bool blocking;
    } callback_data;

    void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status,
                                      void* user_data);

    void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status,
                                        void* user_data);

    int
    main(int argc, char* argv[])
    {

      bool use_callbacks = true;
      bool use_nested_callbacks = true;
      bool use_blocking = false;

      int numSelPlatform = 0;
      int numSelDevice = 0;
      int doUseCallbacks = 0;
      int doUseNestedCallbacks = 0;
      int doUseBlocking = 0;
      int use_type = 0;
      if (argc != 7) {
        printf("./%s (platform) (device) (type cpu 0|gpu 1|igpu 2|acc 3) (use ""callbacks) (use nested callbacks) (use blocking)\n",
               argv[0]);
        exit(EXIT_FAILURE);
      } else {
        numSelPlatform = atoi(argv[1]);
        numSelDevice = atoi(argv[2]);
        use_type = atoi(argv[3]);
        doUseCallbacks = atoi(argv[4]);
        doUseNestedCallbacks = atoi(argv[5]);
        doUseBlocking = atoi(argv[6]);
      }

      cl_event end;

      uint size = 1024;
      int* a_v = (int*)malloc(size * sizeof(int));
      int* b_v = (int*)malloc(size * sizeof(int));
      int* c_v = (int*)malloc(size * sizeof(int));
      for (size_t i = 0; i < size; i++) {
        a_v[i] = i;
        b_v[i] = i + 1;
        c_v[i] = 0;
      }

      const char* kernel_str = file_read("src/kernel.cl");

      use_callbacks = doUseCallbacks;
      use_nested_callbacks = doUseNestedCallbacks;
      use_blocking = doUseBlocking ? CL_TRUE : CL_FALSE;

      cl_int st;
      cl_int err;

      int len = 256;
      char buflog[len];

      cl_uint numPlatforms = 0;
      st = clGetPlatformIDs(0, NULL, &numPlatforms);
      cl_platform_id* platforms = NULL;
      platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));

      st = clGetPlatformIDs(numPlatforms, platforms, NULL);
      printf("platforms: %d (%d)\n", numPlatforms, st);

      cl_uint selPlatform = numSelPlatform; // 1;

      numPlatforms = 1;
      cl_platform_id platform = platforms[selPlatform];

      clGetPlatformInfo(platform, CL_PLATFORM_NAME, len, &buflog, NULL);
      if (buflog != NULL) {
        printf("platform name: %s\n", buflog);
      }

      cl_uint numDevices = 0;
      st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
      printf("num devices: %d (%d)\n", numDevices, st);
      if (st != CL_SUCCESS) {
        /* printf("explain error: %s\n", clErrorString(st)); */
        printf("error: %d\n", st);
      }
      cl_device_id* devices = NULL;
      devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

      st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
      printf("devices: %d (%d)\n", numDevices, st);

      // Context
      cl_context context;
      context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err);
      printf("context (%d)\n", err);

      // Select device
      cl_uint selDevice = numSelDevice; // 0;
      numDevices = 1;                   // clBuildProgram
      cl_device_id device = devices[selDevice];

      // Device Info
      clGetDeviceInfo(device, CL_DEVICE_NAME, len, &buflog, NULL);
      if (buflog != NULL) {
        printf("device name: %s\n", buflog);
      }

      cl_device_type type;
      clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
      printf("device type: %s\n", cl_device_type_to_str(type));

      // events
      cl_event ev_kernel;

      // CommandQueue
      /* cl_command_queue_properties props; */
      cl_command_queue queue;
      queue = clCreateCommandQueue(context, device, 0, &err);
      printf("command queue (%d)\n", err);

      // CreateBuffer
      cl_mem buf_a;
      cl_mem buf_b;
      cl_mem buf_c;

      ocl_type_e_t ocl_type;
      if (use_type == 0) {
        ocl_type = OCL_TYPE_CPU;
        printf("mode CPU\n");
      } else if (use_type == 1) {
        ocl_type = OCL_TYPE_GPU;
        printf("mode GPU\n");
      } else if (use_type == 2) {
        ocl_type = OCL_TYPE_IGPU;
        printf("mode IGPU\n");
      } else if (use_type == 3) {
        ocl_type = OCL_TYPE_ACC;
        printf("mode ACC\n");
      }

      /* cl_mem buf_x; */
      switch (ocl_type) {
        case OCL_TYPE_IGPU:
          buf_a = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                                 a_v, &err);
          /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
           * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
          /*                      Acpy, &err); */
          break;
        case OCL_TYPE_GPU:
          buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                                 a_v, &err);
          break;
        case OCL_TYPE_ACC:
          buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 size * sizeof(int), a_v, &err);
          break;
        case OCL_TYPE_CPU:
          buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 size * sizeof(int), a_v, &err);
          break;
        default:
          printf("no ocl_type defined\n");
          exit(EXIT_FAILURE);
          break;
      }

      printf("create buffer a (%d)\n", err);
      if (err != CL_SUCCESS) {
        /* printf("create buffer error: %s\n", clErrorString(err)); */
        printf("create buffer error: %d\n", err);
      }

      switch (ocl_type) {
        case OCL_TYPE_IGPU:
          buf_b = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                                 b_v, &err);
          break;
        case OCL_TYPE_GPU:
          buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                                 b_v, &err);
          break;
        case OCL_TYPE_ACC:
          buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 size * sizeof(int), b_v, &err);
          break;
        case OCL_TYPE_CPU:
          buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 size * sizeof(int), b_v, &err);
          break;
        default:
          printf("no ocl_type defined\n");
          exit(EXIT_FAILURE);
          break;
      }

      printf("create buffer b (%d)\n", err);
      if (err != CL_SUCCESS) {
        printf("create buffer error: %d\n", err);
        /* printf("create buffer error: %s\n", clErrorString(err)); */
      }

      switch (ocl_type) {
        case OCL_TYPE_IGPU:
          buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                                 c_v, &err);
          /* buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, c_rows * c_cols *
           * sizeof(int), */
          /*                        c_v, &err); */
          /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
           * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
          /*                      Acpy, &err); */
          break;
        case OCL_TYPE_GPU:
          buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                                 c_v, &err);
          break;
        case OCL_TYPE_ACC:
          buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 size * sizeof(int), c_v, &err);
          break;
        case OCL_TYPE_CPU:
          buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE |
                                 CL_MEM_USE_HOST_PTR,
          /* buf_c = */
            /* clCreateBuffer(context, CL_MEM_USE_HOST_PTR, */
                           /* buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, */
                           size * sizeof(int), c_v, &err);
          break;
        default:
          printf("no ocl_type defined\n");
          exit(EXIT_FAILURE);
          break;
      }

      printf("create buffer c (%d)\n", err);
      if (err != CL_SUCCESS) {
        /* printf("create buffer error: %s\n", clErrorString(err)); */
        printf("create buffer error: %d\n", err);
      }
      /* b_x = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), x,
       * &err); */
      /* printf("create buffer x (%d)\n", err); */

      // WriteBuffer
      /* st = clEnqueueWriteBuffer(queue, b_a, CL_FALSE, 0, n * n * sizeof(float),
       */
      /*                           Acpy, 0, NULL, NULL); */
      /* printf("write buffer Acpy - b_a (%d)\n", st); */
      /* st = clEnqueueWriteBuffer(queue, b_b, CL_FALSE, 0, n * sizeof(float), bcpy,
       * 0, */
      /*                           NULL, NULL); */
      /* printf("write buffer bcpy - b_b (%d)\n", st); */

      // Create Program
      cl_program program;
      program = clCreateProgramWithSource(context, 1, (const char**)&kernel_str,
                                          NULL, &err);
      printf("create program (%d)\n", err);

      // Build Program
      /* st = clBuildProgram(program, numDevices, (cl_device_id*)&device, NULL,
       * NULL, */
      /*                     NULL); */
      char* opts = "-Werror";
      st = clBuildProgram(program, numDevices, (cl_device_id*)&device, opts, NULL,
                          NULL);
      printf("build program (%d)\n", st);
      if (st != CL_SUCCESS) {
        /* printf("build status: %s\n", clErrorString(st)); */
        printf("build status: %d\n", st);
        char log[512];
        st = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 512, &log,
                                   NULL);
        printf("build info (%d)\n", st);
        if (st == CL_SUCCESS) {
          printf("%s\n", log);
        }
      }

      // Create Kernel
      cl_kernel kernel1;
      kernel1 = clCreateKernel(program, "kernel1", &st);
      printf("create kernel1 (%d)\n", st);
      /* cl_kernel kernel2; */
      /* kernel2 = clCreateKernel(program, "ocl1_2", &st); */
      /* printf("create kernel2 (%d)\n", st); */

      // workgroup size
      size_t dims = 1;
      size_t gws[] = { 1, 1, 1 };
      /* size_t gws[dims]; */
      gws[0] = size; // a_rows;
      /* gws[0] = 32; */
      /* size_t* lws = NULL; */
      /* size_t lws[dims]; */
      /* size_t lws[dims]; */
      /* size_t lws[dims] = NULL; */
      /* size_t lws[] = {0, 0, 0}; */
      size_t lws[] = { 128, 1, 1 };
      printf("gws {%lu, %lu, %lu}\n", gws[0], gws[1], gws[2]);
      if (lws != NULL) {
        printf("lws {%lu, %lu, %lu}\n", lws[0], lws[1], lws[2]);
      } else {
        printf("lws unspecified\n");
      }

      // Set Kernel Args
      st = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_a);
      printf("set arg %d (%d)\n", 0, st);
      st = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &buf_b);
      printf("set arg %d (%d)\n", 1, st);
      /* printf("set kernel1 arg: %d (%d)\n", 0, st); */
      st = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &buf_c);
      printf("set arg %d (%d)\n", 2, st);
      st = clSetKernelArg(kernel1, 3, sizeof(int), (int*)&size);
      printf("set arg %d (%d)\n", 3, st);

      // Execute kernel
      st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws,
                                  (const size_t*)lws, 0, NULL, &ev_kernel);
      /* (const size_t*)lws, 0, NULL, NULL); */
      /* printf("nd range kernel1 (%d %s)\n", st, clErrorString(st)); */
      printf("nd range kernel1 (%d)\n", st);

      end = clCreateUserEvent(context, &st);
      printf("create user event (%d)\n", st);

      callback_data* user_data = (callback_data*)malloc(sizeof(callback_data));

      printf("c_v %p\n", (void*)c_v);

      user_data->queue = &queue;
      user_data->buf_c = &buf_c;
      user_data->c_v = c_v;
      user_data->size = size;
      user_data->end = &end;
      user_data->nested_callbacks = use_nested_callbacks;
      user_data->blocking = use_blocking;

      if (use_callbacks) {
        st =
          clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data);
        printf("set event callback (%d)\n", st);
      }
      /* printf("first: %2.5f\n", c_v[0]); */
      /* print_matrix_float_s_t("c", c); */
      // ReadBuffer
      /* float* ptr = (float*)clEnqueueMapBuffer(queue, buf_c, CL_TRUE, CL_MAP_READ,
       * 0, c_rows * c_cols * sizeof(float), 0, NULL, NULL, &st); */
      /* printf("read buffer c_v - buf_c (%d)\n", st); */
      /* printf("finish queue\n"); */
      /* clFinish(queue); */
      /* printf("finished queue\n"); */

      if (use_callbacks) {
        /* clWaitForCompletion(context); */

        printf("waiting for events\n");
        /* /\* cl_event events[] = {ev_kernel}; *\/ */
        cl_event events[] = { end };
        clWaitForEvents(1, events); // ev_kernel);
        printf("waited for events\n");

        clbWaitEvents(c_v);

      } else {
        printf("about to read the c buffer\n");
        st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0, size * sizeof(int),
                                 c_v, 0, NULL, NULL);
        printf("read buffer c_v - buf_c (%d)\n", st);
      }

      /* print_matrix("c_v", c_v, c_rows, c_cols); */

      /* printf("first: %2.5f\n", c_v[0]); */
      /* print_matrix_float_s_t("c", c); */
      free(user_data);

      clReleaseKernel(kernel1);
      /* clReleaseKernel(kernel2); */
      clReleaseProgram(program);
      clReleaseCommandQueue(queue);
      clReleaseMemObject(buf_a);
      clReleaseMemObject(buf_b);
      clReleaseMemObject(buf_c);
      /* clReleaseMemObject(b_x); */
      clReleaseContext(context);
      free(devices);
      free(platforms);

    #define THRESHOLD 0
      // check
      printf("about to check (first: %d)\n", c_v[0]);
      for (size_t i = 0; i < size; i++) {
        if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD) {
          printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d\n", i,
                 a_v[i], i, b_v[i], i, c_v[i]);
          exit(EXIT_FAILURE);
        }
      }

      return EXIT_SUCCESS;
    }

    void CL_CALLBACK
    callback_read_fn(cl_event event, cl_int ev_status, void* user_data)
    {
      printf("-- BEGIN callback read executed (%d)\n", ev_status);
      callback_data* cb_data = (callback_data*)user_data;
      /* cl_command_queue queue = *(cb_data->queue); */
      /* cl_mem buf_c = *(cb_data->buf_c); */
      int* c_v = cb_data->c_v;
      cl_event end = *(cb_data->end);
      /* int size = cb_data->size; */

      cl_int st;

      printf("c_v %p\n", (void*)c_v);
      printf("c_v[0] = %d\n", c_v[0]);

      /* c_v[1] = 1; */

      st = clSetUserEventStatus(end, CL_COMPLETE);
      printf("set user event status (%d)\n", st);
      // haz que salga el finish
      printf("-- END\n");
    }

    cl_event ev_read;

    void CL_CALLBACK
    callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data)
    {
      printf("-- BEGIN callback kernel executed (%d)\n", ev_status);
      callback_data* cb_data = (callback_data*)user_data;
      cl_command_queue queue = *(cb_data->queue);
      cl_mem buf_c = *(cb_data->buf_c);
      int* c_v = cb_data->c_v;
      int size = cb_data->size;
      bool nested_callbacks = cb_data->nested_callbacks;
      bool blocking = cb_data->blocking;
      cl_event end = *(cb_data->end);


      printf("c_v %p\n", (void*)c_v);
      printf("c_v[0] = %d\n", c_v[0]);

      cl_int st;

      /* printf("about to flush\n"); */
      /* clFlush(queue); */
      /* printf("flushed\n"); */

      size_t offset = 0;
      /* size = size + 4; */
      printf("about to read the c buffer\n");
      printf("blocking %d\n", blocking);

      clb_events_waiting_device[clb_num_events_waiting] = 0;
      clb_events_init_read[clb_num_events_waiting] = 0;


      /* why it does not work? (blocking CL_TRUE) */
      st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),
                               c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
      ev_read = clb_events_waiting[clb_num_events_waiting - 1];
      printf("enqueue read buffer (%d)\n", st);
      /* size * sizeof(int), c_v, 0, NULL, NULL); */

      if (nested_callbacks) {
        st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data);
        printf("set event callback (%d)\n", st);
        /* st = clSetUserEventStatus(end, CL_COMPLETE); */
        /* printf("set user event status (%d)\n", st); */
      }
      /* c_v[1] = 1; */

      /* st = clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, ); */
      /* printf("event info (%d)\n", st); */

      /* int len = 512; */
      /* char buflog[len]; */
      /* cl_command_type; */
      /* clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, len, &buflog, NULL); */
      /* if (buflog != NULL) { */
      /*   printf("- event: %s\n", buflog); */
      /* } */

      if (!nested_callbacks) {
        st = clSetUserEventStatus(end, CL_COMPLETE);
        printf("set user event status (%d)\n", st);

        /* printf("read buffer c_v - buf_c (%d)\n", st); */
      }
      printf("-- END\n");
    }

Kernel:

 

__kernel void
kernel1(__global int* a, __global int* b, __global int* c, int size)
{

  int idx = get_global_id(0);

  if (idx >= 0 && idx < size){
    c[idx] = a[idx] + b[idx];
  }
}

 

Thread Topic: 

Question

Viewing all articles
Browse latest Browse all 1182

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>