nlmeans.c 34.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
    This file is part of darktable,
    copyright (c) 2011 johannes hanika.

    darktable is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    darktable is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with darktable.  If not, see <http://www.gnu.org/licenses/>.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
21
#include "bauhaus/bauhaus.h"
22
#include "common/opencl.h"
23
#include "control/control.h"
24 25 26
#include "develop/imageop.h"
#include "develop/imageop_math.h"
#include "develop/tiling.h"
27
#include "gui/accelerators.h"
28
#include "gui/gtk.h"
29
#include "iop/iop_api.h"
30 31
#include <gtk/gtk.h>
#include <stdlib.h>
32

33
#if defined(__SSE__)
34
#include <xmmintrin.h>
35
#endif
36

37
#define NUM_BUCKETS 4
38

39 40
// this is the version of the modules parameters,
// and includes version information about compile-time dt
41
DT_MODULE_INTROSPECTION(2, dt_iop_nlmeans_params_t)
42 43 44 45 46

typedef struct dt_iop_nlmeans_params_v1_t
{
  float luma;
  float chroma;
47
} dt_iop_nlmeans_params_v1_t;
48 49 50 51

typedef struct dt_iop_nlmeans_params_t
{
  // these are stored in db.
52 53
  float radius;
  float strength;
54 55
  float luma;
  float chroma;
56
} dt_iop_nlmeans_params_t;
57 58 59

typedef struct dt_iop_nlmeans_gui_data_t
{
60 61
  GtkWidget *radius;
  GtkWidget *strength;
62 63
  GtkWidget *luma;
  GtkWidget *chroma;
64
} dt_iop_nlmeans_gui_data_t;
65

66
typedef dt_iop_nlmeans_params_t dt_iop_nlmeans_data_t;
67 68 69

typedef struct dt_iop_nlmeans_global_data_t
{
70 71 72 73
  int kernel_nlmeans_init;
  int kernel_nlmeans_dist;
  int kernel_nlmeans_horiz;
  int kernel_nlmeans_vert;
74
  int kernel_nlmeans_accu;
75
  int kernel_nlmeans_finish;
76
} dt_iop_nlmeans_global_data_t;
77 78 79

const char *name()
{
80
  return _("denoise (non-local means)");
81 82
}

83 84
int legacy_params(dt_iop_module_t *self, const void *const old_params, const int old_version,
                  void *new_params, const int new_version)
85 86 87 88 89
{
  if(old_version == 1 && new_version == 2)
  {
    dt_iop_nlmeans_params_v1_t *o = (dt_iop_nlmeans_params_v1_t *)old_params;
    dt_iop_nlmeans_params_t *n = (dt_iop_nlmeans_params_t *)new_params;
90
    n->luma = o->luma;
91
    n->chroma = o->chroma;
92 93
    n->strength = 100.0f;
    n->radius = 3;
94 95 96 97 98
    return 0;
  }
  return 1;
}

99
int groups()
100 101 102 103
{
  return IOP_GROUP_CORRECT;
}

104
int flags()
105
{
106
  return IOP_FLAGS_SUPPORTS_BLENDING | IOP_FLAGS_ALLOW_TILING;
107 108
}

109 110 111 112 113 114 115 116
void init_key_accels(dt_iop_module_so_t *self)
{
  dt_accel_register_slider_iop(self, FALSE, NC_("accel", "luma"));
  dt_accel_register_slider_iop(self, FALSE, NC_("accel", "chroma"));
}

void connect_key_accels(dt_iop_module_t *self)
{
117
  dt_iop_nlmeans_gui_data_t *g = (dt_iop_nlmeans_gui_data_t *)self->gui_data;
118 119 120 121 122

  dt_accel_connect_slider_iop(self, "luma", GTK_WIDGET(g->luma));
  dt_accel_connect_slider_iop(self, "chroma", GTK_WIDGET(g->chroma));
}

123
/** modify regions of interest (optional, per pixel ops don't need this) */
124 125 126 127
// void modify_roi_out(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece, dt_iop_roi_t
// *roi_out, const dt_iop_roi_t *roi_in);
// void modify_roi_in(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t
// *roi_out, dt_iop_roi_t *roi_in);
128

129 130 131 132
typedef union floatint_t
{
  float f;
  uint32_t i;
133
} floatint_t;
134

135
static inline float fast_mexp2f(const float x)
136
{
137 138
  const float i1 = (float)0x3f800000u; // 2^0
  const float i2 = (float)0x3f000000u; // 2^-1
139 140
  const float k0 = i1 + x * (i2 - i1);
  floatint_t k;
141
  k.i = k0 >= (float)0x800000u ? k0 : 0;
142 143 144 145
  return k.f;
}

static float gh(const float f, const float sharpness)
146
{
147
  const float f2 = f * sharpness;
148
  return fast_mexp2f(f2);
149 150 151
  // return 0.0001f + dt_fast_expf(-fabsf(f)*800.0f);
  // return 1.0f/(1.0f + f*f);
  // make spread bigger: less smoothing
152 153
  // const float spread = 100.f;
  // return 1.0f/(1.0f + fabsf(f)*spread);
154 155
}

156
#ifdef HAVE_OPENCL
157 158 159 160 161 162 163 164 165 166
static int bucket_next(unsigned int *state, unsigned int max)
{
  unsigned int current = *state;
  unsigned int next = (current >= max - 1 ? 0 : current + 1);

  *state = next;

  return next;
}

167
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out,
168
               const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
169 170 171
{
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;
  dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)self->data;
172

173

174
  const int devid = piece->pipe->devid;
175 176 177
  const int width = roi_in->width;
  const int height = roi_in->height;

178
  cl_mem dev_U2 = NULL;
179
  cl_mem dev_U4 = NULL;
180
  cl_mem dev_U4_t = NULL;
181 182 183 184
  cl_mem dev_U4_tt = NULL;

  unsigned int state = 0;
  cl_mem buckets[NUM_BUCKETS] = { NULL };
185

186
  cl_int err = -999;
187

188
  const int P = ceilf(d->radius * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f)); // pixel filter size
189 190
  const int K = ceilf(7 * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f));         // nbhood
  const float sharpness = 3000.0f / (1.0f + d->strength);
191

192
  if(P < 1)
193
  {
194 195
    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { width, height, 1 };
196
    err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
197
    if(err != CL_SUCCESS) goto error;
198 199
    return TRUE;
  }
200

201
  float max_L = 120.0f, max_C = 512.0f;
202 203 204
  float nL = 1.0f / max_L, nC = 1.0f / max_C;
  float nL2 = nL * nL, nC2 = nC * nC;
  // float weight[4] = { powf(d->luma, 0.6), powf(d->chroma, 0.6), powf(d->chroma, 0.6), 1.0f };
205
  float weight[4] = { d->luma, d->chroma, d->chroma, 1.0f };
206

207
  dev_U2 = dt_opencl_alloc_device_buffer(devid, (size_t)width * height * 4 * sizeof(float));
208 209
  if(dev_U2 == NULL) goto error;

210
  for(int k = 0; k < NUM_BUCKETS; k++)
211
  {
212
    buckets[k] = dt_opencl_alloc_device_buffer(devid, (size_t)width * height * sizeof(float));
213 214
    if(buckets[k] == NULL) goto error;
  }
Tobias Ellinghaus's avatar
Tobias Ellinghaus committed
215

216 217 218 219 220 221 222 223
  int hblocksize;
  dt_opencl_local_buffer_t hlocopt
    = (dt_opencl_local_buffer_t){ .xoffset = 2 * P, .xfactor = 1, .yoffset = 0, .yfactor = 1,
                                  .cellsize = sizeof(float), .overhead = 0,
                                  .sizex = 1 << 16, .sizey = 1 };

  if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_horiz, &hlocopt))
    hblocksize = hlocopt.sizex;
224
  else
225 226 227 228 229 230 231 232 233 234 235 236 237
    hblocksize = 1;

  int vblocksize;
  dt_opencl_local_buffer_t vlocopt
    = (dt_opencl_local_buffer_t){ .xoffset = 1, .xfactor = 1, .yoffset = 2 * P, .yfactor = 1,
                                  .cellsize = sizeof(float), .overhead = 0,
                                  .sizex = 1, .sizey = 1 << 16 };

  if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_vert, &vlocopt))
    vblocksize = vlocopt.sizey;
  else
    vblocksize = 1;

238

239 240
  const size_t bwidth = ROUNDUP(width, hblocksize);
  const size_t bheight = ROUNDUP(height, vblocksize);
241 242 243

  size_t sizesl[3];
  size_t local[3];
244
  size_t sizes[] = { ROUNDUPWD(width), ROUNDUPHT(height), 1 };
245

246
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_init, 0, sizeof(cl_mem), (void *)&dev_U2);
247 248 249 250 251 252 253 254 255
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_init, 1, sizeof(int), (void *)&width);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_init, 2, sizeof(int), (void *)&height);
  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_init, sizes);
  if(err != CL_SUCCESS) goto error;



  for(int j = -K; j <= 0; j++)
    for(int i = -K; i <= K; i++)
256
    {
257
      int q[2] = { i, j };
258

259
      dev_U4 = buckets[bucket_next(&state, NUM_BUCKETS)];
260 261 262 263
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 0, sizeof(cl_mem), (void *)&dev_in);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 1, sizeof(cl_mem), (void *)&dev_U4);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 2, sizeof(int), (void *)&width);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 3, sizeof(int), (void *)&height);
264
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 4, 2 * sizeof(int), (void *)&q);
265 266 267 268 269 270 271 272
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 5, sizeof(float), (void *)&nL2);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 6, sizeof(float), (void *)&nC2);
      err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_dist, sizes);
      if(err != CL_SUCCESS) goto error;

      sizesl[0] = bwidth;
      sizesl[1] = ROUNDUPHT(height);
      sizesl[2] = 1;
273
      local[0] = hblocksize;
274 275
      local[1] = 1;
      local[2] = 1;
276
      dev_U4_t = buckets[bucket_next(&state, NUM_BUCKETS)];
277 278 279 280
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 0, sizeof(cl_mem), (void *)&dev_U4);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 1, sizeof(cl_mem), (void *)&dev_U4_t);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 2, sizeof(int), (void *)&width);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 3, sizeof(int), (void *)&height);
281
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 4, 2 * sizeof(int), (void *)&q);
282
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 5, sizeof(int), (void *)&P);
283
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 6, (hblocksize + 2 * P) * sizeof(float), NULL);
284 285 286 287 288 289 290 291
      err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_nlmeans_horiz, sizesl, local);
      if(err != CL_SUCCESS) goto error;


      sizesl[0] = ROUNDUPWD(width);
      sizesl[1] = bheight;
      sizesl[2] = 1;
      local[0] = 1;
292
      local[1] = vblocksize;
293
      local[2] = 1;
294
      dev_U4_tt = buckets[bucket_next(&state, NUM_BUCKETS)];
295
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 0, sizeof(cl_mem), (void *)&dev_U4_t);
296
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 1, sizeof(cl_mem), (void *)&dev_U4_tt);
297 298
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 2, sizeof(int), (void *)&width);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 3, sizeof(int), (void *)&height);
299
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 4, 2 * sizeof(int), (void *)&q);
300 301
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 5, sizeof(int), (void *)&P);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 6, sizeof(float), (void *)&sharpness);
302
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 7, (vblocksize + 2 * P) * sizeof(float), NULL);
303 304 305 306
      err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_nlmeans_vert, sizesl, local);
      if(err != CL_SUCCESS) goto error;


307
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 0, sizeof(cl_mem), (void *)&dev_in);
308
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 1, sizeof(cl_mem), (void *)&dev_U2);
309
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 2, sizeof(cl_mem), (void *)&dev_U4_tt);
310 311
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 3, sizeof(int), (void *)&width);
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 4, sizeof(int), (void *)&height);
312
      dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 5, 2 * sizeof(int), (void *)&q);
313
      err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_accu, sizes);
314 315
      if(err != CL_SUCCESS) goto error;

316
      if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
317
        dt_opencl_finish(devid);
318 319

      // indirectly give gpu some air to breathe (and to do display related stuff)
320
      dt_iop_nap(darktable.opencl->micro_nap);
321
    }
322

323
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 0, sizeof(cl_mem), (void *)&dev_in);
324
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 1, sizeof(cl_mem), (void *)&dev_U2);
325 326 327
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 2, sizeof(cl_mem), (void *)&dev_out);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 3, sizeof(int), (void *)&width);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 4, sizeof(int), (void *)&height);
328
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 5, 4 * sizeof(float), (void *)&weight);
329 330 331
  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_finish, sizes);
  if(err != CL_SUCCESS) goto error;

332
  dt_opencl_release_mem_object(dev_U2);
333
  for(int k = 0; k < NUM_BUCKETS; k++)
334
  {
335
    dt_opencl_release_mem_object(buckets[k]);
336
  }
337 338 339
  return TRUE;

error:
340
  dt_opencl_release_mem_object(dev_U2);
341
  for(int k = 0; k < NUM_BUCKETS; k++)
342
  {
343
    dt_opencl_release_mem_object(buckets[k]);
344 345
  }

346 347 348 349
  dt_print(DT_DEBUG_OPENCL, "[opencl_nlmeans] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}
#endif
350

351

352 353 354
void tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
                     const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
                     struct dt_develop_tiling_t *tiling)
355
{
356
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;
357
  const int P = ceilf(d->radius * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f)); // pixel filter size
358
  const int K = ceilf(7 * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f));         // nbhood
359

360
  tiling->factor = 2.0f + 1.0f + 0.25 * NUM_BUCKETS; // in + out + tmp
361 362
  tiling->maxbuf = 1.0f;
  tiling->overhead = 0;
363
  tiling->overlap = P + K;
364 365 366 367 368
  tiling->xalign = 1;
  tiling->yalign = 1;
  return;
}

369 370 371 372 373
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
             void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  // get our data struct:
374 375 376
  const dt_iop_nlmeans_params_t *const d = (dt_iop_nlmeans_params_t *)piece->data;

  const int ch = piece->colors;
377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448

  // adjust to zoom size:
  const int P = ceilf(d->radius * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f)); // pixel filter size
  const int K = ceilf(7 * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f));         // nbhood
  const float sharpness = 3000.0f / (1.0f + d->strength);
  if(P < 1)
  {
    // nothing to do from this distance:
    memcpy(ovoid, ivoid, (size_t)sizeof(float) * 4 * roi_out->width * roi_out->height);
    return;
  }

  // adjust to Lab, make L more important
  // float max_L = 100.0f, max_C = 256.0f;
  // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
  float max_L = 120.0f, max_C = 512.0f;
  float nL = 1.0f / max_L, nC = 1.0f / max_C;
  const float norm2[4] = { nL * nL, nC * nC, nC * nC, 1.0f };

  float *Sa = dt_alloc_align(64, (size_t)sizeof(float) * roi_out->width * dt_get_num_threads());
  // we want to sum up weights in col[3], so need to init to 0:
  memset(ovoid, 0x0, (size_t)sizeof(float) * roi_out->width * roi_out->height * 4);

  // for each shift vector
  for(int kj = -K; kj <= K; kj++)
  {
    for(int ki = -K; ki <= K; ki++)
    {
      int inited_slide = 0;
// don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else
// we will add up errors)
// do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more
// memory
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, Sa)
#endif
      for(int j = 0; j < roi_out->height; j++)
      {
        if(j + kj < 0 || j + kj >= roi_out->height) continue;
        float *S = Sa + (size_t)dt_get_thread_num() * roi_out->width;
        const float *ins = ((float *)ivoid) + 4 * ((size_t)roi_in->width * (j + kj) + ki);
        float *out = ((float *)ovoid) + 4 * (size_t)roi_out->width * j;

        const int Pm = MIN(MIN(P, j + kj), j);
        const int PM = MIN(MIN(P, roi_out->height - 1 - j - kj), roi_out->height - 1 - j);
        // first line of every thread
        // TODO: also every once in a while to assert numerical precision!
        if(!inited_slide)
        {
          // sum up a line
          memset(S, 0x0, sizeof(float) * roi_out->width);
          for(int jj = -Pm; jj <= PM; jj++)
          {
            int i = MAX(0, -ki);
            float *s = S + i;
            const float *inp = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j + jj);
            const float *inps = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j + jj + kj) + ki);
            const int last = roi_out->width + MIN(0, -ki);
            for(; i < last; i++, inp += 4, inps += 4, s++)
            {
              for(int k = 0; k < 3; k++) s[0] += (inp[k] - inps[k]) * (inp[k] - inps[k]) * norm2[k];
            }
          }
          // only reuse this if we had a full stripe
          if(Pm == P && PM == P) inited_slide = 1;
        }

        // sliding window for this line:
        float *s = S;
        float slide = 0.0f;
        // sum up the first -P..P
        for(int i = 0; i < 2 * P + 1; i++) slide += s[i];
449
        for(int i = 0; i < roi_out->width; i++, s++, ins += 4, out += 4)
450 451 452 453
        {
          if(i - P > 0 && i + P < roi_out->width) slide += s[P] - s[-P - 1];
          if(i + ki >= 0 && i + ki < roi_out->width)
          {
454 455 456 457 458 459 460 461
            const float iv[4] = { ins[0], ins[1], ins[2], 1.0f };
#if defined(_OPENMP) && defined(OPENMP_SIMD_)
#pragma omp SIMD()
#endif
            for(size_t c = 0; c < 4; c++)
            {
              out[c] += iv[c] * gh(slide, sharpness);
            }
462 463 464 465 466 467
          }
        }
        if(inited_slide && j + P + 1 + MAX(0, kj) < roi_out->height)
        {
          // sliding window in j direction:
          int i = MAX(0, -ki);
468
          s = S + i;
469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
          const float *inp = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j + P + 1);
          const float *inps = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j + P + 1 + kj) + ki);
          const float *inm = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j - P);
          const float *inms = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j - P + kj) + ki);
          const int last = roi_out->width + MIN(0, -ki);
          for(; i < last; i++, inp += 4, inps += 4, inm += 4, inms += 4, s++)
          {
            float stmp = s[0];
            for(int k = 0; k < 3; k++)
              stmp += ((inp[k] - inps[k]) * (inp[k] - inps[k]) - (inm[k] - inms[k]) * (inm[k] - inms[k]))
                      * norm2[k];
            s[0] = stmp;
          }
        }
        else
          inited_slide = 0;
      }
    }
  }
488

489
  // normalize and apply chroma/luma blending
490 491 492 493 494 495
  const float weight[4] = { d->luma, d->chroma, d->chroma, 1.0f };
  const float invert[4] = { 1.0f - d->luma, 1.0f - d->chroma, 1.0f - d->chroma, 0.0f };

  const float *const in = ((const float *const)ivoid);
  float *const out = ((float *const)ovoid);

496
#ifdef _OPENMP
497
#pragma omp parallel for SIMD() default(none) schedule(static) collapse(2)
498
#endif
499
  for(size_t k = 0; k < (size_t)ch * roi_out->width * roi_out->height; k += ch)
500
  {
501
    for(size_t c = 0; c < 4; c++)
502
    {
503
      out[k + c] = (in[k + c] * invert[c]) + (out[k + c] * (weight[c] / out[k + 3]));
504 505
    }
  }
506

507 508 509
  // free shared tmp memory:
  dt_free_align(Sa);

510
  if(piece->pipe->mask_display & DT_DEV_PIXELPIPE_DISPLAY_MASK) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
511 512
}

513
#if defined(__SSE__)
514
/** process, all real work is done here. */
515 516
void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
517 518 519
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  // get our data struct:
520
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;
521

522
  // adjust to zoom size:
523
  const int P = ceilf(d->radius * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f)); // pixel filter size
524 525
  const int K = ceilf(7 * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f));         // nbhood
  const float sharpness = 3000.0f / (1.0f + d->strength);
526
  if(P < 1)
527 528
  {
    // nothing to do from this distance:
529
    memcpy(ovoid, ivoid, (size_t)sizeof(float) * 4 * roi_out->width * roi_out->height);
530 531
    return;
  }
532

533
  // adjust to Lab, make L more important
534 535 536
  // float max_L = 100.0f, max_C = 256.0f;
  // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
  float max_L = 120.0f, max_C = 512.0f;
537 538
  float nL = 1.0f / max_L, nC = 1.0f / max_C;
  const float norm2[4] = { nL * nL, nC * nC, nC * nC, 1.0f };
539

540
  float *Sa = dt_alloc_align(64, (size_t)sizeof(float) * roi_out->width * dt_get_num_threads());
541
  // we want to sum up weights in col[3], so need to init to 0:
542
  memset(ovoid, 0x0, (size_t)sizeof(float) * roi_out->width * roi_out->height * 4);
543 544

  // for each shift vector
545
  for(int kj = -K; kj <= K; kj++)
546
  {
547
    for(int ki = -K; ki <= K; ki++)
548
    {
549
      int inited_slide = 0;
550 551 552 553
// don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else
// we will add up errors)
// do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more
// memory
554
#ifdef _OPENMP
555
#pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, Sa)
556
#endif
557
      for(int j = 0; j < roi_out->height; j++)
558
      {
559
        if(j + kj < 0 || j + kj >= roi_out->height) continue;
560
        float *S = Sa + (size_t)dt_get_thread_num() * roi_out->width;
561 562
        const float *ins = ((float *)ivoid) + 4 * ((size_t)roi_in->width * (j + kj) + ki);
        float *out = ((float *)ovoid) + 4 * (size_t)roi_out->width * j;
563

564 565
        const int Pm = MIN(MIN(P, j + kj), j);
        const int PM = MIN(MIN(P, roi_out->height - 1 - j - kj), roi_out->height - 1 - j);
566 567 568 569
        // first line of every thread
        // TODO: also every once in a while to assert numerical precision!
        if(!inited_slide)
        {
570
          // sum up a line
571 572
          memset(S, 0x0, sizeof(float) * roi_out->width);
          for(int jj = -Pm; jj <= PM; jj++)
573
          {
574 575
            int i = MAX(0, -ki);
            float *s = S + i;
576 577
            const float *inp = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j + jj);
            const float *inps = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j + jj + kj) + ki);
578
            const int last = roi_out->width + MIN(0, -ki);
579
            for(; i < last; i++, inp += 4, inps += 4, s++)
580
            {
581
              for(int k = 0; k < 3; k++) s[0] += (inp[k] - inps[k]) * (inp[k] - inps[k]) * norm2[k];
582 583 584
            }
          }
          // only reuse this if we had a full stripe
585
          if(Pm == P && PM == P) inited_slide = 1;
586 587 588 589 590 591
        }

        // sliding window for this line:
        float *s = S;
        float slide = 0.0f;
        // sum up the first -P..P
592 593
        for(int i = 0; i < 2 * P + 1; i++) slide += s[i];
        for(int i = 0; i < roi_out->width; i++)
594
        {
595 596
          if(i - P > 0 && i + P < roi_out->width) slide += s[P] - s[-P - 1];
          if(i + ki >= 0 && i + ki < roi_out->width)
597
          {
598
            const __m128 iv = { ins[0], ins[1], ins[2], 1.0f };
599
            _mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide, sharpness)));
600
          }
601
          s++;
602 603 604
          ins += 4;
          out += 4;
        }
605
        if(inited_slide && j + P + 1 + MAX(0, kj) < roi_out->height)
606 607
        {
          // sliding window in j direction:
608
          int i = MAX(0, -ki);
609
          s = S + i;
610 611 612 613
          const float *inp = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j + P + 1);
          const float *inps = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j + P + 1 + kj) + ki);
          const float *inm = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j - P);
          const float *inms = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j - P + kj) + ki);
614
          const int last = roi_out->width + MIN(0, -ki);
615
          for(; ((intptr_t)s & 0xf) != 0 && i < last; i++, inp += 4, inps += 4, inm += 4, inms += 4, s++)
616
          {
617
            float stmp = s[0];
618 619 620
            for(int k = 0; k < 3; k++)
              stmp += ((inp[k] - inps[k]) * (inp[k] - inps[k]) - (inm[k] - inms[k]) * (inm[k] - inms[k]))
                      * norm2[k];
621
            s[0] = stmp;
622
          }
623
          /* Process most of the line 4 pixels at a time */
624
          for(; i < last - 4; i += 4, inp += 16, inps += 16, inm += 16, inms += 16, s += 4)
625
          {
626
            __m128 sv = _mm_load_ps(s);
627 628 629 630
            const __m128 inp1 = _mm_load_ps(inp) - _mm_load_ps(inps);
            const __m128 inp2 = _mm_load_ps(inp + 4) - _mm_load_ps(inps + 4);
            const __m128 inp3 = _mm_load_ps(inp + 8) - _mm_load_ps(inps + 8);
            const __m128 inp4 = _mm_load_ps(inp + 12) - _mm_load_ps(inps + 12);
631

632 633 634 635
            const __m128 inp12lo = _mm_unpacklo_ps(inp1, inp2);
            const __m128 inp34lo = _mm_unpacklo_ps(inp3, inp4);
            const __m128 inp12hi = _mm_unpackhi_ps(inp1, inp2);
            const __m128 inp34hi = _mm_unpackhi_ps(inp3, inp4);
636

637 638
            const __m128 inpv0 = _mm_movelh_ps(inp12lo, inp34lo);
            sv += inpv0 * inpv0 * _mm_set1_ps(norm2[0]);
639

640 641
            const __m128 inpv1 = _mm_movehl_ps(inp34lo, inp12lo);
            sv += inpv1 * inpv1 * _mm_set1_ps(norm2[1]);
642

643 644
            const __m128 inpv2 = _mm_movelh_ps(inp12hi, inp34hi);
            sv += inpv2 * inpv2 * _mm_set1_ps(norm2[2]);
645

646 647 648 649
            const __m128 inm1 = _mm_load_ps(inm) - _mm_load_ps(inms);
            const __m128 inm2 = _mm_load_ps(inm + 4) - _mm_load_ps(inms + 4);
            const __m128 inm3 = _mm_load_ps(inm + 8) - _mm_load_ps(inms + 8);
            const __m128 inm4 = _mm_load_ps(inm + 12) - _mm_load_ps(inms + 12);
650

651 652 653 654
            const __m128 inm12lo = _mm_unpacklo_ps(inm1, inm2);
            const __m128 inm34lo = _mm_unpacklo_ps(inm3, inm4);
            const __m128 inm12hi = _mm_unpackhi_ps(inm1, inm2);
            const __m128 inm34hi = _mm_unpackhi_ps(inm3, inm4);
655

656 657
            const __m128 inmv0 = _mm_movelh_ps(inm12lo, inm34lo);
            sv -= inmv0 * inmv0 * _mm_set1_ps(norm2[0]);
658

659 660
            const __m128 inmv1 = _mm_movehl_ps(inm34lo, inm12lo);
            sv -= inmv1 * inmv1 * _mm_set1_ps(norm2[1]);
661

662 663
            const __m128 inmv2 = _mm_movelh_ps(inm12hi, inm34hi);
            sv -= inmv2 * inmv2 * _mm_set1_ps(norm2[2]);
664 665

            _mm_store_ps(s, sv);
666
          }
667
          for(; i < last; i++, inp += 4, inps += 4, inm += 4, inms += 4, s++)
668
          {
669
            float stmp = s[0];
670 671 672
            for(int k = 0; k < 3; k++)
              stmp += ((inp[k] - inps[k]) * (inp[k] - inps[k]) - (inm[k] - inms[k]) * (inm[k] - inms[k]))
                      * norm2[k];
673
            s[0] = stmp;
674 675
          }
        }
676 677
        else
          inited_slide = 0;
678
      }
679
    }
680
  }
681 682
  // normalize and apply chroma/luma blending
  // bias a bit towards higher values for low input values:
683 684
  // const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6), powf(d->chroma, 0.6), powf(d->luma, 0.6));
  const __m128 weight = _mm_set_ps(1.0f, d->chroma, d->chroma, d->luma);
685
  const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);
686
#ifdef _OPENMP
687
#pragma omp parallel for default(none) schedule(static) shared(d)
688
#endif
689
  for(int j = 0; j < roi_out->height; j++)
690
  {
691 692 693
    float *out = ((float *)ovoid) + 4 * (size_t)roi_out->width * j;
    float *in = ((float *)ivoid) + 4 * (size_t)roi_out->width * j;
    for(int i = 0; i < roi_out->width; i++)
694
    {
695 696
      _mm_store_ps(out, _mm_add_ps(_mm_mul_ps(_mm_load_ps(in), invert),
                                   _mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));
697
      out += 4;
698
      in += 4;
699 700
    }
  }
701
  // free shared tmp memory:
702
  dt_free_align(Sa);
703

704
  if(piece->pipe->mask_display & DT_DEV_PIXELPIPE_DISPLAY_MASK) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
705
}
706
#endif
707

708
/** this will be called to init new defaults if a new image is loaded from film strip mode. */
709 710
void reload_defaults(dt_iop_module_t *module)
{
711 712 713
  // our module is disabled by default
  module->default_enabled = 0;
  // init defaults:
714
  dt_iop_nlmeans_params_t tmp = (dt_iop_nlmeans_params_t){ 2.0f, 50.0f, 0.5f, 1.0f };
715 716
  memcpy(module->params, &tmp, sizeof(dt_iop_nlmeans_params_t));
  memcpy(module->default_params, &tmp, sizeof(dt_iop_nlmeans_params_t));
717 718 719 720 721
}

/** init, cleanup, commit to pipeline */
void init(dt_iop_module_t *module)
{
722 723
  module->params = calloc(1, sizeof(dt_iop_nlmeans_params_t));
  module->default_params = calloc(1, sizeof(dt_iop_nlmeans_params_t));
724
  // about the first thing to do in Lab space:
Heiko Bauke's avatar
Heiko Bauke committed
725
  module->priority = 529; // module order created by iop_dependencies.py, do not edit!
726 727
  module->params_size = sizeof(dt_iop_nlmeans_params_t);
  module->gui_data = NULL;
728
  module->data = NULL;
729 730 731 732 733 734 735 736
}

void cleanup(dt_iop_module_t *module)
{
  free(module->params);
  module->params = NULL;
}

737 738 739
void init_global(dt_iop_module_so_t *module)
{
  const int program = 5; // nlmeans.cl, from programs.conf
740 741
  dt_iop_nlmeans_global_data_t *gd
      = (dt_iop_nlmeans_global_data_t *)malloc(sizeof(dt_iop_nlmeans_global_data_t));
742
  module->data = gd;
743 744 745 746 747
  gd->kernel_nlmeans_init = dt_opencl_create_kernel(program, "nlmeans_init");
  gd->kernel_nlmeans_dist = dt_opencl_create_kernel(program, "nlmeans_dist");
  gd->kernel_nlmeans_horiz = dt_opencl_create_kernel(program, "nlmeans_horiz");
  gd->kernel_nlmeans_vert = dt_opencl_create_kernel(program, "nlmeans_vert");
  gd->kernel_nlmeans_accu = dt_opencl_create_kernel(program, "nlmeans_accu");
748
  gd->kernel_nlmeans_finish = dt_opencl_create_kernel(program, "nlmeans_finish");
749 750 751 752 753
}

void cleanup_global(dt_iop_module_so_t *module)
{
  dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)module->data;
754 755 756 757
  dt_opencl_free_kernel(gd->kernel_nlmeans_init);
  dt_opencl_free_kernel(gd->kernel_nlmeans_dist);
  dt_opencl_free_kernel(gd->kernel_nlmeans_horiz);
  dt_opencl_free_kernel(gd->kernel_nlmeans_vert);
758
  dt_opencl_free_kernel(gd->kernel_nlmeans_accu);
759
  dt_opencl_free_kernel(gd->kernel_nlmeans_finish);
760 761 762 763
  free(module->data);
  module->data = NULL;
}

764
/** commit is the synch point between core and gui, so it copies params to pipe data. */
765 766
void commit_params(struct dt_iop_module_t *self, dt_iop_params_t *params, dt_dev_pixelpipe_t *pipe,
                   dt_dev_pixelpipe_iop_t *piece)
767 768 769
{
  dt_iop_nlmeans_params_t *p = (dt_iop_nlmeans_params_t *)params;
  dt_iop_nlmeans_data_t *d = (dt_iop_nlmeans_data_t *)piece->data;
770
  memcpy(d, p, sizeof(*d));
771
  d->luma = MAX(0.0001f, p->luma);
772
  d->chroma = MAX(0.0001f, p->chroma);
773 774
}

775
void init_pipe(struct dt_iop_module_t *self, dt_dev_pixelpipe_t *pipe, dt_dev_pixelpipe_iop_t *piece)
776 777 778 779 780
{
  piece->data = malloc(sizeof(dt_iop_nlmeans_data_t));
  self->commit_params(self, self->default_params, pipe, piece);
}

781
void cleanup_pipe(struct dt_iop_module_t *self, dt_dev_pixelpipe_t *pipe, dt_dev_pixelpipe_iop_t *piece)
782 783
{
  free(piece->data);
784
  piece->data = NULL;
785 786
}

787
static void radius_callback(GtkWidget *w, dt_iop_module_t *self)
788 789 790 791 792 793 794
{
  // this is important to avoid cycles!
  if(darktable.gui->reset) return;
  dt_iop_nlmeans_params_t *p = (dt_iop_nlmeans_params_t *)self->params;
  p->radius = (int)dt_bauhaus_slider_get(w);
  dt_dev_add_history_item(darktable.develop, self, TRUE);
}
795
static void strength_callback(GtkWidget *w, dt_iop_module_t *self)
796 797 798 799
{
  // this is important to avoid cycles!
  if(darktable.gui->reset) return;
  dt_iop_nlmeans_params_t *p = (dt_iop_nlmeans_params_t *)self->params;
800 801 802
  p->strength = dt_bauhaus_slider_get(w);
  dt_dev_add_history_item(darktable.develop, self, TRUE);
}
803
static void luma_callback(GtkWidget *w, dt_iop_module_t *self)
804 805 806 807
{
  // this is important to avoid cycles!
  if(darktable.gui->reset) return;
  dt_iop_nlmeans_params_t *p = (dt_iop_nlmeans_params_t *)self->params;
808
  p->luma = dt_bauhaus_slider_get(w) * (1.0f / 100.0f);
809 810 811
  dt_dev_add_history_item(darktable.develop, self, TRUE);
}

812
static void chroma_callback(GtkWidget *w, dt_iop_module_t *self)
813 814 815 816
{
  // this is important to avoid cycles!
  if(darktable.gui->reset) return;
  dt_iop_nlmeans_params_t *p = (dt_iop_nlmeans_params_t *)self->params;
817
  p->chroma = dt_bauhaus_slider_get(w) * (1.0f / 100.0f);
818 819 820 821
  dt_dev_add_history_item(darktable.develop, self, TRUE);
}

/** gui callbacks, these are needed. */
822
void gui_update(dt_iop_module_t *self)
823 824 825 826
{
  // let gui slider match current parameters:
  dt_iop_nlmeans_gui_data_t *g = (dt_iop_nlmeans_gui_data_t *)self->gui_data;
  dt_iop_nlmeans_params_t *p = (dt_iop_nlmeans_params_t *)self->params;
827
  dt_bauhaus_slider_set(g->radius, p->radius);
828
  dt_bauhaus_slider_set(g->strength, p->strength);
829 830
  dt_bauhaus_slider_set(g->luma, p->luma * 100.f);
  dt_bauhaus_slider_set(g->chroma, p->chroma * 100.f);
831 832
}

833
void gui_init(dt_iop_module_t *self)
834 835 836 837
{
  // init the slider (more sophisticated layouts are possible with gtk tables and boxes):
  self->gui_data = malloc(sizeof(dt_iop_nlmeans_gui_data_t));
  dt_iop_nlmeans_gui_data_t *g = (dt_iop_nlmeans_gui_data_t *)self->gui_data;
838
  self->widget = gtk_box_new(GTK_ORIENTATION_VERTICAL, DT_BAUHAUS_SPACE);
839
  g->radius = dt_bauhaus_slider_new_with_range(self, 1.0f, 4.0f, 1., 2.f, 0);
840
  g->strength = dt_bauhaus_slider_new_with_range(self, 0.0f, 100.0f, 1., 50.f, 0);
841 842
  g->luma = dt_bauhaus_slider_new_with_range(self, 0.0f, 100.0f, 1., 50.f, 0);
  g->chroma = dt_bauhaus_slider_new_with_range(self, 0.0f, 100.0f, 1., 100.f, 0);
843 844
  gtk_box_pack_start(GTK_BOX(self->widget), g->radius, TRUE, TRUE, 0);
  gtk_box_pack_start(GTK_BOX(self->widget), g->strength, TRUE, TRUE, 0);
845 846
  gtk_box_pack_start(GTK_BOX(self->widget), g->luma, TRUE, TRUE, 0);
  gtk_box_pack_start(GTK_BOX(self->widget), g->chroma, TRUE, TRUE, 0);
847
  dt_bauhaus_widget_set_label(g->radius, NULL, _("patch size"));
848
  dt_bauhaus_slider_set_format(g->radius, "%.0f");
849
  dt_bauhaus_widget_set_label(g->strength, NULL, _("strength"));
850
  dt_bauhaus_slider_set_format(g->strength, "%.0f%%");
851
  dt_bauhaus_widget_set_label(g->luma, NULL, _("luma"));
852
  dt_bauhaus_slider_set_format(g->luma, "%.0f%%");
853
  dt_bauhaus_widget_set_label(g->chroma, NULL, _("chroma"));
854
  dt_bauhaus_slider_set_format(g->chroma, "%.0f%%");
855 856 857 858
  gtk_widget_set_tooltip_text(g->radius, _("radius of the patches to match"));
  gtk_widget_set_tooltip_text(g->strength, _("strength of the effect"));
  gtk_widget_set_tooltip_text(g->luma, _("how much to smooth brightness"));
  gtk_widget_set_tooltip_text(g->chroma, _("how much to smooth colors"));
859 860 861 862
  g_signal_connect(G_OBJECT(g->radius), "value-changed", G_CALLBACK(radius_callback), self);
  g_signal_connect(G_OBJECT(g->strength), "value-changed", G_CALLBACK(strength_callback), self);
  g_signal_connect(G_OBJECT(g->luma), "value-changed", G_CALLBACK(luma_callback), self);
  g_signal_connect(G_OBJECT(g->chroma), "value-changed", G_CALLBACK(chroma_callback), self);
863 864
}

865
void gui_cleanup(dt_iop_module_t *self)
866 867 868 869 870 871
{
  // nothing else necessary, gtk will clean up the slider.
  free(self->gui_data);
  self->gui_data = NULL;
}

Richard Wonka's avatar
Richard Wonka committed
872
// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
873
// vim: shiftwidth=2 expandtab tabstop=2 cindent
Tobias Ellinghaus's avatar
Tobias Ellinghaus committed
874
// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;