Commit 33b82420 authored by Ulrich Pegelow

opencl: use dt_opencl_local_buffer_opt() in globaltonemap/highlights/nlmeans

parent b382a773
......@@ -307,6 +307,7 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
cl_int err = -999;
cl_mem dev_m = NULL;
cl_mem dev_r = NULL;
float *maximum = NULL;
const int devid = piece->pipe->devid;
int gtkernel = -1;
......@@ -314,38 +315,6 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
const int height = roi_out->height;
float parameters[4] = { 0.0f };
// prepare local work group
size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group
size_t workgroupsize = 0; // the maximum number of items in a work group
unsigned long localmemsize = 0; // the maximum amount of local memory we can use
size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel
// make sure blocksize is not too large
int blocksize = BLOCKSIZE;
if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS
&& dt_opencl_get_kernel_work_group_size(devid, gd->kernel_pixelmax_first, &kernelworkgroupsize)
== CL_SUCCESS)
{
// reduce blocksize step by step until it fits to limits
while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize * blocksize > kernelworkgroupsize
|| blocksize * blocksize > workgroupsize || blocksize * blocksize * sizeof(float) > localmemsize)
{
if(blocksize == 1) break;
blocksize >>= 1;
}
}
else
{
blocksize = 1; // slow but safe
}
if(blocksize < 2)
{
// very small blocksize. this is really unlikely to happen, but let's be prepared: give up on opencl in
// that case
return FALSE;
}
switch(d->operator)
{
case OPERATOR_REINHARD:
......@@ -381,15 +350,31 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
if(isnan(tmp_lwmax))
{
size_t sizes[3];
size_t local[3];
dt_opencl_local_buffer_t flocopt
= (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
.cellsize = sizeof(float), .overhead = 0,
.sizex = 1 << 4, .sizey = 1 << 4 };
if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt))
goto error;
const size_t bwidth = ROUNDUP(width, flocopt.sizex);
const size_t bheight = ROUNDUP(height, flocopt.sizey);
const int bwidth = ROUNDUP(width, blocksize);
const int bheight = ROUNDUP(height, blocksize);
const int bufsize = (bwidth / flocopt.sizex) * (bheight / flocopt.sizey);
const int bufsize = (bwidth / blocksize) * (bheight / blocksize);
const int groupsize = maxsizes[0];
const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, groupsize) / groupsize);
dt_opencl_local_buffer_t slocopt
= (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
.cellsize = sizeof(float), .overhead = 0,
.sizex = 1 << 16, .sizey = 1 };
if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt))
goto error;
const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex);
size_t sizes[3];
size_t local[3];
dev_m = dt_opencl_alloc_device_buffer(devid, (size_t)bufsize * sizeof(float));
if(dev_m == NULL) goto error;
......@@ -400,31 +385,31 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
sizes[0] = bwidth;
sizes[1] = bheight;
sizes[2] = 1;
local[0] = blocksize;
local[1] = blocksize;
local[0] = flocopt.sizex;
local[1] = flocopt.sizey;
local[2] = 1;
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 0, sizeof(cl_mem), &dev_in);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 1, sizeof(int), &width);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 2, sizeof(int), &height);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 3, sizeof(cl_mem), &dev_m);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 4, blocksize * blocksize * sizeof(float), NULL);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_first, 4, flocopt.sizex * flocopt.sizey * sizeof(float), NULL);
err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_first, sizes, local);
if(err != CL_SUCCESS) goto error;
sizes[0] = reducesize * groupsize;
sizes[0] = reducesize * slocopt.sizex;
sizes[1] = 1;
sizes[2] = 1;
local[0] = groupsize;
local[0] = slocopt.sizex;
local[1] = 1;
local[2] = 1;
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 0, sizeof(cl_mem), &dev_m);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 1, sizeof(cl_mem), &dev_r);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 2, sizeof(int), &bufsize);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 3, groupsize * sizeof(float), NULL);
dt_opencl_set_kernel_arg(devid, gd->kernel_pixelmax_second, 3, slocopt.sizex * sizeof(float), NULL);
err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_pixelmax_second, sizes, local);
if(err != CL_SUCCESS) goto error;
float *maximum = malloc(reducesize * sizeof(float));
maximum = dt_alloc_align(16, reducesize * sizeof(float));
err = dt_opencl_read_buffer_from_device(devid, (void *)maximum, dev_r, 0,
(size_t)reducesize * sizeof(float), CL_TRUE);
if(err != CL_SUCCESS) goto error;
......@@ -442,7 +427,8 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
tmp_lwmax = MAX(eps, (maximum[0] * 0.01f));
free(maximum);
dt_free_align(maximum);
maximum = NULL;
}
const float lwmax = tmp_lwmax;
......@@ -504,6 +490,7 @@ error:
if(b) dt_bilateral_free_cl(b);
dt_opencl_release_mem_object(dev_m);
dt_opencl_release_mem_object(dev_r);
dt_free_align(maximum);
dt_print(DT_DEBUG_OPENCL, "[opencl_global_tonemap] couldn't enqueue kernel! %d\n", err);
return FALSE;
}
......
......@@ -166,37 +166,20 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
else if(d->mode == DT_IOP_HIGHLIGHTS_LCH && filters == 9u)
{
// xtrans sensor raws with LCH mode
int blocksizex, blocksizey;
// we use local buffering for speed reasons; determine suited work group size
size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group
size_t workgroupsize = 0; // the maximum number of items in a work group
unsigned long localmemsize = 0; // the maximum amount of local memory we can use
size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel
dt_opencl_local_buffer_t locopt
= (dt_opencl_local_buffer_t){ .xoffset = 2 * 2, .xfactor = 1, .yoffset = 2 * 2, .yfactor = 1,
.cellsize = sizeof(float), .overhead = 0,
.sizex = 1 << 8, .sizey = 1 << 8 };
int blocksizex = 1 << 8;
int blocksizey = 1 << 8;
if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS
&& dt_opencl_get_kernel_work_group_size(devid, gd->kernel_highlights_1f_lch_xtrans, &kernelworkgroupsize) == CL_SUCCESS)
if(dt_opencl_local_buffer_opt(devid, gd->kernel_highlights_1f_lch_xtrans, &locopt))
{
while(maxsizes[0] < blocksizex || maxsizes[1] < blocksizey
|| localmemsize < (blocksizex + 4) * (blocksizey + 4) * sizeof(float)
|| workgroupsize < blocksizex * blocksizey || kernelworkgroupsize < blocksizex * blocksizey)
{
if(blocksizex == 1 && blocksizey == 1) break;
if(blocksizex > blocksizey)
blocksizex >>= 1;
else
blocksizey >>= 1;
}
blocksizex = locopt.sizex;
blocksizey = locopt.sizey;
}
else
{
dt_print(DT_DEBUG_OPENCL,
"[opencl_highlights] can not identify resource limits for device %d\n", devid);
goto error;
}
blocksizex = blocksizey = 1;
dev_xtrans
= dt_opencl_copy_host_to_device_constant(devid, sizeof(piece->pipe->dsc.xtrans), piece->pipe->dsc.xtrans);
......
......@@ -34,9 +34,6 @@
#include <xmmintrin.h>
#endif
#define BLOCKSIZE \
2048 /* maximum blocksize. must be a power of 2 and will be automatically reduced if needed */
#define NUM_BUCKETS 4
// this is the version of the modules parameters,
......@@ -216,34 +213,31 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
if(buckets[k] == NULL) goto error;
}
// prepare local work group
size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group
size_t workgroupsize = 0; // the maximum number of items in a work group
unsigned long localmemsize = 0; // the maximum amount of local memory we can use
size_t kernelworkgroupsize = 0; // the maximum amount of items in work group of the kernel
// assuming this is the same for nlmeans_horiz and nlmeans_vert
// make sure blocksize is not too large
int blocksize = BLOCKSIZE;
if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS
&& dt_opencl_get_kernel_work_group_size(devid, gd->kernel_nlmeans_horiz, &kernelworkgroupsize)
== CL_SUCCESS)
{
// reduce blocksize step by step until it fits to limits
while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize > kernelworkgroupsize
|| blocksize > workgroupsize || (blocksize + 2 * P) * sizeof(float) > localmemsize)
{
if(blocksize == 1) break;
blocksize >>= 1;
}
}
int hblocksize;
dt_opencl_local_buffer_t hlocopt
= (dt_opencl_local_buffer_t){ .xoffset = 2 * P, .xfactor = 1, .yoffset = 0, .yfactor = 1,
.cellsize = sizeof(float), .overhead = 0,
.sizex = 1 << 16, .sizey = 1 };
if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_horiz, &hlocopt))
hblocksize = hlocopt.sizex;
else
{
blocksize = 1; // slow but safe
}
hblocksize = 1;
int vblocksize;
dt_opencl_local_buffer_t vlocopt
= (dt_opencl_local_buffer_t){ .xoffset = 1, .xfactor = 1, .yoffset = 2 * P, .yfactor = 1,
.cellsize = sizeof(float), .overhead = 0,
.sizex = 1, .sizey = 1 << 16 };
if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_vert, &vlocopt))
vblocksize = vlocopt.sizey;
else
vblocksize = 1;
const size_t bwidth = width % blocksize == 0 ? width : (width / blocksize + 1) * blocksize;
const size_t bheight = height % blocksize == 0 ? height : (height / blocksize + 1) * blocksize;
const size_t bwidth = ROUNDUP(width, hblocksize);
const size_t bheight = ROUNDUP(height, vblocksize);
size_t sizesl[3];
size_t local[3];
......@@ -276,7 +270,7 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
sizesl[0] = bwidth;
sizesl[1] = ROUNDUPHT(height);
sizesl[2] = 1;
local[0] = blocksize;
local[0] = hblocksize;
local[1] = 1;
local[2] = 1;
dev_U4_t = buckets[bucket_next(&state, NUM_BUCKETS)];
......@@ -286,7 +280,7 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 3, sizeof(int), (void *)&height);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 4, 2 * sizeof(int), (void *)&q);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 5, sizeof(int), (void *)&P);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 6, (blocksize + 2 * P) * sizeof(float), NULL);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 6, (hblocksize + 2 * P) * sizeof(float), NULL);
err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_nlmeans_horiz, sizesl, local);
if(err != CL_SUCCESS) goto error;
......@@ -295,7 +289,7 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
sizesl[1] = bheight;
sizesl[2] = 1;
local[0] = 1;
local[1] = blocksize;
local[1] = vblocksize;
local[2] = 1;
dev_U4_tt = buckets[bucket_next(&state, NUM_BUCKETS)];
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 0, sizeof(cl_mem), (void *)&dev_U4_t);
......@@ -305,7 +299,7 @@ int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_m
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 4, 2 * sizeof(int), (void *)&q);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 5, sizeof(int), (void *)&P);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 6, sizeof(float), (void *)&sharpness);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 7, (blocksize + 2 * P) * sizeof(float), NULL);
dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 7, (vblocksize + 2 * P) * sizeof(float), NULL);
err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_nlmeans_vert, sizesl, local);
if(err != CL_SUCCESS) goto error;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment