Hi,
We found that the performance of the same kernel varies dramatically depending on how the input image is created. With the attached test tool:
- If the input image is created directly from a host pointer, the performance is good, e.g. for an 8K x 8K input image:
- ./blockread
- Average kernel 2.033509 ms
- If the input image is created from a buffer object (which is itself created from the same host pointer), the performance drops significantly for the same 8K x 8K workload:
- ./blockread -b
- Average kernel 3.763424 ms
The buffer pitch and base address are both aligned to 4K, so we are not sure why the performance difference is so large.
The code snippet for image creation is listed below:
// Create the input image in one of two ways, selected by create_image_from_buf.
// Both paths start from the same host allocation (src_ptr) and the same
// format/desc; they differ only in whether a buffer object sits in between.
// NOTE(review): errNum is written by every call below but is never inspected
// in this snippet; only the returned handle is checked.
if (create_image_from_buf) {
// Path 1: wrap src_ptr in a buffer object (CL_MEM_USE_HOST_PTR), then build
// the image on top of that buffer. Failure here terminates the process.
buf_from_hostptr = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR, src_size, src_ptr, &errNum);if (buf_from_hostptr == 0) {
printf("clCreateBuffer failed \n");
exit(1);
}
// Point the image descriptor at the buffer; the image is then created with
// flags == 0 and a NULL host pointer.
desc.buffer = buf_from_hostptr;// flags inherited from buffer
img_from_buf = clCreateImage(context,0, &format, &desc,NULL,&errNum);if (img_from_buf == 0) {
printf("clCreateImage failed \n");
exit(1);
}
} else {
// Path 2: create the image directly over src_ptr with CL_MEM_USE_HOST_PTR.
// Unlike path 1, failure here is reported on std::cerr and returns false
// instead of calling exit(1).
img_from_hostptr = clCreateImage(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, &format, &desc, src_ptr, &errNum);
if (img_from_hostptr == NULL)
{
std::cerr << "Error creating memory objects."<< std::endl;
return false;
}
}
Thanks
-Austin