clEnqueueNDRangeKernel may fail for a kernel that uses 2D local arrays, yet succeed when the same kernel uses 1D local arrays with manual index computation.
For example, the following matrix multiplication kernel fails with CL_OUT_OF_RESOURCES if USE_2D is defined and succeeds otherwise.
The matrices are [24, 72] * [24, 72]^T = [24, 24], and BLOCK_SIZE = 24.
/* Edge length of one square tile; also the required work-group size in dimensions 0 and 1. */
#define BLOCK_SIZE 24
/* Row stride used when storing into C. */
#define C_WIDTH 24
/* Row stride of both A and B, i.e. the length of the dimension the products are summed over. */
#define AB_COMMON 72

/*
 * Tiled matrix multiplication computing C = A * B^T.
 *
 * A and B are both indexed as row-major arrays with AB_COMMON elements per
 * row; C is indexed as row-major with C_WIDTH elements per row. Each
 * work-item produces a single element of C, located at row get_global_id(1)
 * and column get_global_id(0).
 *
 * The work-group size is fixed to BLOCK_SIZE x BLOCK_SIZE x 1 by the
 * reqd_work_group_size attribute.
 *
 * When USE_2D is defined, the two local tiles are declared as 2D arrays;
 * otherwise they are flat 1D arrays addressed with a manually computed
 * index (row * BLOCK_SIZE + column). Both variants address the same
 * elements in the same order.
 */
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))
void mx_mul(__global const float *A, __global const float *B, __global float *C) {
    /* Local tiles: one BLOCK_SIZE x BLOCK_SIZE tile of A and one of B. */
#ifdef USE_2D
    __local float AS[BLOCK_SIZE][BLOCK_SIZE];
    __local float BS[BLOCK_SIZE][BLOCK_SIZE];
#else
    __local float AS[BLOCK_SIZE * BLOCK_SIZE];
    __local float BS[BLOCK_SIZE * BLOCK_SIZE];
#endif
    /* Work-group coordinates (bx, by) and work-item coordinates within the group (tx, ty). */
    int bx = get_group_id(0);
    int by = get_group_id(1);
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    /* Element of A this work-item loads for the first tile: row (by * BLOCK_SIZE + ty), column tx. */
    int a_offs = (by * BLOCK_SIZE + ty) * AB_COMMON + tx;
    /* Element of B this work-item loads for the first tile: row (bx * BLOCK_SIZE + ty), column tx. */
    int b_offs = (bx * BLOCK_SIZE + ty) * AB_COMMON + tx;
    float sum = 0;
    /*
     * Walk along the common dimension one tile at a time; both offsets
     * advance by BLOCK_SIZE columns per iteration. The trip count uses
     * integer division (72 / 24 = 3 with the constants above).
     */
    for (int i = 0; i < AB_COMMON / BLOCK_SIZE; i++, a_offs += BLOCK_SIZE, b_offs += BLOCK_SIZE) {
        /* Each work-item copies one element of A and one element of B into the local tiles. */
#ifdef USE_2D
        AS[ty][tx] = A[a_offs];
        BS[ty][tx] = B[b_offs];
#else
        AS[ty * BLOCK_SIZE + tx] = A[a_offs];
        BS[ty * BLOCK_SIZE + tx] = B[b_offs];
#endif
        /* Wait until every work-item in the group has stored its tile elements before any are read. */
        barrier(CLK_LOCAL_MEM_FENCE);
        /*
         * Accumulate the product of row ty of the A tile with row tx of the
         * B tile. Reading B by rows here is what makes the result A * B^T.
         */
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; k++) {
#ifdef USE_2D
            sum += AS[ty][k] * BS[tx][k];
#else
            sum += AS[ty * BLOCK_SIZE + k] * BS[tx * BLOCK_SIZE + k];
#endif
        }
        /* Wait until every work-item has finished reading the tiles before the next iteration overwrites them. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    /* Store the accumulated value at row get_global_id(1), column get_global_id(0) of C. */
    C[get_global_id(1) * C_WIDTH + get_global_id(0)] = sum;
}
Tested on Ubuntu 14.10 with a Core i7-3770, using intel_sdk_for_ocl_applications_xe_2013_r3_sdk_3.2.1.16712_x64.