interpolation.c 61.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/* --------------------------------------------------------------------------
    This file is part of darktable,
    copyright (c) 2012 Edouard Gomez <ed.gomez@free.fr>

    darktable is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    darktable is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with darktable.  If not, see <http://www.gnu.org/licenses/>.
* ------------------------------------------------------------------------*/

#include "common/interpolation.h"
20
#include "common/darktable.h"
21 22
#include "control/conf.h"

23 24 25
#include <assert.h>
#include <glib.h>
#include <inttypes.h>
26 27 28 29
#include <math.h>
#include <stddef.h>
#include <stdint.h>

30 31 32 33 34 35 36 37 38 39
/** Border extrapolation modes */
enum border_mode
{
  BORDER_REPLICATE, // aaaa|abcdefg|gggg
  BORDER_WRAP,      // defg|abcdefg|abcd
  BORDER_MIRROR,    // edcb|abcdefg|fedc
  BORDER_CLAMP      // ....|abcdefg|....
};

/* Supporting them all might be overkill, let the compiler trim all
40
 * unnecessary modes in clip for resampling codepath*/
41 42
#define RESAMPLING_BORDER_MODE BORDER_REPLICATE

43
/* Supporting them all might be overkill, let the compiler trim all
44
 * unnecessary modes in interpolation codepath */
45 46
#define INTERPOLATION_BORDER_MODE BORDER_MIRROR

47
// Defines minimum alignment requirement for critical SIMD code
48
#define SSE_ALIGNMENT 16
49 50 51 52 53 54

// Defines the maximum kernel half length
// !! Make sure to sync this with the filter array !!
#define MAX_HALF_FILTER_WIDTH 3

// Add code for timing resampling function
55
#define DEBUG_RESAMPLING_TIMING 0
56

57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
// Add debug info messages to stderr
#define DEBUG_PRINT_INFO 0

// Add *verbose* (like one msg per pixel out) debug message to stderr
#define DEBUG_PRINT_VERBOSE 0

/* --------------------------------------------------------------------------
 * Debug helpers
 * ------------------------------------------------------------------------*/

#if DEBUG_RESAMPLING_TIMING
#include <sys/time.h>
#endif

#if DEBUG_PRINT_INFO
72 73 74 75 76
#define debug_info(...)                                                                                      \
  do                                                                                                         \
  {                                                                                                          \
    fprintf(stderr, __VA_ARGS__);                                                                            \
  } while(0)
77 78 79 80 81
#else
#define debug_info(...)
#endif

#if DEBUG_PRINT_VERBOSE
82 83 84 85 86
#define debug_extra(...)                                                                                     \
  do                                                                                                         \
  {                                                                                                          \
    fprintf(stderr, __VA_ARGS__);                                                                            \
  } while(0)
87 88 89 90 91
#else
#define debug_extra(...)
#endif

#if DEBUG_RESAMPLING_TIMING
92
static inline int64_t getts()
93 94 95
{
  struct timeval t;
  gettimeofday(&t, NULL);
96
  return t.tv_sec * INT64_C(1000000) + t.tv_usec;
97 98 99 100 101 102 103 104 105 106 107 108
}
#endif

/* --------------------------------------------------------------------------
 * Generic helpers
 * ------------------------------------------------------------------------*/

/** Compute ceil value of a float
 * @remark Avoid libc ceil for now. Maybe we'll revert to libc later.
 * @param x Value to ceil
 * @return ceil value
 */
109
static inline float ceil_fast(float x)
110
{
111
  if(x <= 0.f)
112
  {
113
    return (float)(int)x;
114 115 116
  }
  else
  {
117 118 119 120
    return -((float)(int)-x) + 1.f;
  }
}

121
#if defined(__SSE2__)
122 123 124
/** Compute absolute value
 * @param t Vector of 4 floats
 * @return Vector of their absolute values
125
 */ static inline __m128 _mm_abs_ps(__m128 t)
126
{
127 128 129
  static const uint32_t signmask[4] __attribute__((aligned(SSE_ALIGNMENT)))
  = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
  return _mm_and_ps(*(__m128 *)signmask, t);
130
}
131
#endif
132 133 134 135 136

/** Clip into specified range
 * @param idx index to filter
 * @param length length of line
 */
137
static inline int clip(int i, int min, int max, enum border_mode mode)
138
{
139
  switch(mode)
140 141
  {
    case BORDER_REPLICATE:
142
      if(i < min)
143 144 145
      {
        i = min;
      }
146
      else if(i > max)
147 148 149 150 151
      {
        i = max;
      }
      break;
    case BORDER_MIRROR:
152
      if(i < min)
153 154 155
      {
        i = min - i;
      }
156
      else if(i > max)
157
      {
158
        i = 2 * max - i;
159 160 161
      }
      break;
    case BORDER_WRAP:
162
      if(i < min)
163 164 165
      {
        i = max - (min - i);
      }
166
      else if(i > max)
167 168 169 170 171
      {
        i = min + (i - max);
      }
      break;
    case BORDER_CLAMP:
172
      if(i < min || i > max)
173 174 175 176 177 178 179
      {
        /* Should not be used as is, we prevent -1 usage, filtering the taps
         * we clip the sample indexes for. So understand this function is
         * specific to its caller. */
        i = -1;
      }
      break;
180
  }
181

182 183 184
  return i;
}

185 186
static inline void prepare_tap_boundaries(int *tap_first, int *tap_last, const enum border_mode mode,
                                          const int filterwidth, const int t, const int max)
187 188 189 190
{
  /* Check lower bound pixel index and skip as many pixels as necessary to
   * fall into range */
  *tap_first = 0;
191
  if(mode == BORDER_CLAMP && t < 0)
192
  {
193 194 195 196 197
    *tap_first = -t;
  }

  // Same for upper bound pixel
  *tap_last = filterwidth;
198
  if(mode == BORDER_CLAMP && t + filterwidth >= max)
199
  {
200 201 202 203
    *tap_last = max - t;
  }
}

204 205 206 207 208 209 210 211
/** Make sure an aligned chunk will not misalign its following chunk
 * proposing an adapted length
 *
 * @param l Length required for current chunk
 * @param align Required alignment for next chunk
 *
 * @return Required length for keeping alignment ok if chaining data chunks
 */
212
static inline size_t increase_for_alignment(size_t l, size_t align)
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
{
  align -= 1;
  return (l + align) & (~align);
}

/** Compute an approximate sine.
 * This function behaves correctly for the range [-pi pi] only.
 * It has the following properties:
 * <ul>
 *   <li>It has exact values for 0, pi/2, pi, -pi/2, -pi</li>
 *   <li>It has matching derivatives to sine for these same points</li>
 *   <li>Its relative error margin is <= 1% iirc</li>
 *   <li>It computational cost is 5 mults + 3 adds + 2 abs</li>
 * </ul>
 * @param t Radian parameter
 * @return guess what
 */
230
static inline float sinf_fast(float t)
231
{
232
  static const float a = 4 / (M_PI * M_PI);
233
  static const float p = 0.225f;
234

235
  t = a * t * (M_PI - fabsf(t));
236

237
  return t * (p * (fabsf(t) - 1) + 1);
238 239
}

240
#if defined(__SSE2__)
241 242 243 244 245 246 247 248 249 250 251 252
/** Compute an approximate sine (SSE version, four sines a call).
 * This function behaves correctly for the range [-pi pi] only.
 * It has the following properties:
 * <ul>
 *   <li>It has exact values for 0, pi/2, pi, -pi/2, -pi</li>
 *   <li>It has matching derivatives to sine for these same points</li>
 *   <li>Its relative error margin is <= 1% iirc</li>
 *   <li>It computational cost is 5 mults + 3 adds + 2 abs</li>
 * </ul>
 * @param t Radian parameter
 * @return guess what
 */
253
static inline __m128 sinf_fast_sse(__m128 t)
254
{
255 256 257 258
  static const __m128 a
      = { 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI) };
  static const __m128 p = { 0.225f, 0.225f, 0.225f, 0.225f };
  static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
259 260 261 262 263 264 265 266 267 268 269 270 271 272

  // m4 = a*t*(M_PI - fabsf(t));
  __m128 m1 = _mm_abs_ps(t);
  __m128 m2 = _mm_sub_ps(pi, m1);
  __m128 m3 = _mm_mul_ps(t, m2);
  __m128 m4 = _mm_mul_ps(a, m3);

  // p*(m4*fabsf(m4) - m4) + m4;
  __m128 n1 = _mm_abs_ps(m4);
  __m128 n2 = _mm_mul_ps(m4, n1);
  __m128 n3 = _mm_sub_ps(n2, m4);
  __m128 n4 = _mm_mul_ps(p, n3);

  return _mm_add_ps(n4, m4);
273
}
274 275
#endif

276
/* --------------------------------------------------------------------------
277 278 279 280 281
 * Interpolation kernels
 * ------------------------------------------------------------------------*/

/* --------------------------------------------------------------------------
 * Bilinear interpolation
282 283
 * ------------------------------------------------------------------------*/

284
static inline float bilinear(float width, float t)
285 286 287
{
  float r;
  t = fabsf(t);
288
  if(t > 1.f)
289
  {
290
    r = 0.f;
291 292 293
  }
  else
  {
294 295 296 297 298
    r = 1.f - t;
  }
  return r;
}

299
#if defined(__SSE2__)
300
static inline __m128 bilinear_sse(__m128 width, __m128 t)
301
{
302
  static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
303
  return _mm_sub_ps(one, _mm_abs_ps(t));
304
}
305
#endif
306 307 308 309 310

/* --------------------------------------------------------------------------
 * Bicubic interpolation
 * ------------------------------------------------------------------------*/

311
static inline float bicubic(float width, float t)
312 313 314
{
  float r;
  t = fabsf(t);
315
  if(t >= 2.f)
316
  {
317
    r = 0.f;
318
  }
319
  else if(t > 1.f && t < 2.f)
320
  {
321 322
    float t2 = t * t;
    r = 0.5f * (t * (-t2 + 5.f * t - 8.f) + 4.f);
323 324 325
  }
  else
  {
326 327
    float t2 = t * t;
    r = 0.5f * (t * (3.f * t2 - 5.f * t) + 2.f);
328 329 330 331
  }
  return r;
}

332
#if defined(__SSE2__)
333
static inline __m128 bicubic_sse(__m128 width, __m128 t)
334
{
335 336 337 338 339 340 341
  static const __m128 half = { .5f, .5f, .5f, .5f };
  static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
  static const __m128 two = { 2.f, 2.f, 2.f, 2.f };
  static const __m128 three = { 3.f, 3.f, 3.f, 3.f };
  static const __m128 four = { 4.f, 4.f, 4.f, 4.f };
  static const __m128 five = { 5.f, 5.f, 5.f, 5.f };
  static const __m128 eight = { 8.f, 8.f, 8.f, 8.f };
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375

  t = _mm_abs_ps(t);
  __m128 t2 = _mm_mul_ps(t, t);

  /* Compute 1 < t < 2 case:
   * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f)
   * half*(t*(mt2 + t5 - eight) + four)
   * half*(t*(mt2 + t5_sub_8) + four)
   * half*(t*(mt2_add_t5_sub_8) + four) */
  __m128 t5 = _mm_mul_ps(five, t);
  __m128 t5_sub_8 = _mm_sub_ps(t5, eight);
  __m128 zero = _mm_setzero_ps();
  __m128 mt2 = _mm_sub_ps(zero, t2);
  __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8);
  __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8);
  __m128 b = _mm_add_ps(a, four);
  __m128 r12 = _mm_mul_ps(b, half);

  /* Compute case < 1
   * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */
  __m128 t23 = _mm_mul_ps(three, t2);
  __m128 c = _mm_sub_ps(t23, t5);
  __m128 d = _mm_mul_ps(t, c);
  __m128 e = _mm_add_ps(d, two);
  __m128 r01 = _mm_mul_ps(half, e);

  // Compute masks fr keeping correct components
  __m128 mask01 = _mm_cmple_ps(t, one);
  __m128 mask12 = _mm_cmpgt_ps(t, one);
  r01 = _mm_and_ps(mask01, r01);
  r12 = _mm_and_ps(mask12, r12);


  return _mm_or_ps(r01, r12);
376
}
377
#endif
378 379 380 381 382

/* --------------------------------------------------------------------------
 * Lanczos interpolation
 * ------------------------------------------------------------------------*/

383 384 385
#define DT_LANCZOS_EPSILON (1e-9f)

#if 0
386
// Reference version left here for ... documentation
387
static inline float
388
lanczos(float width, float t)
389 390 391
{
  float r;

392 393
  if (t<-width || t>width)
  {
394
    r = 0.f;
395 396 397
  }
  else if (t>-DT_LANCZOS_EPSILON && t<DT_LANCZOS_EPSILON)
  {
398
    r = 1.f;
399 400 401
  }
  else
  {
402 403 404 405
    r = width*sinf(M_PI*t)*sinf(M_PI*t/width)/(M_PI*M_PI*t*t);
  }
  return r;
}
406 407
#endif

408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
/* Fast lanczos version, no calls to math.h functions, too accurate, too slow
 *
 * Based on a forum entry at
 * http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/
 *
 * Apart the fast sine function approximation, the only trick is to compute:
 * sin(pi.t) = sin(a.pi + r.pi) where t = a + r = trunc(t) + r
 *           = sin(a.pi).cos(r.pi) + sin(r.pi).cos(a.pi)
 *           =         0*cos(r.pi) + sin(r.pi).cos(a.pi)
 *           = sign.sin(r.pi) where sign =  1 if the a is even
 *                                       = -1 if the a is odd
 *
 * Of course we know that lanczos func will only be called for
 * the range -width < t < width so we can additionally avoid the
 * range check.  */

424
static inline float lanczos(float width, float t)
425 426 427 428 429 430 431
{
  /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
   * correct */
  int a = (int)t;
  float r = t - (float)a;

  // Compute the correct sign for sinf(pi.r)
432 433 434 435 436
  union
  {
    float f;
    uint32_t i;
  } sign;
437
  sign.i = ((a & 1) << 31) | 0x3f800000;
438

439 440
  return (DT_LANCZOS_EPSILON + width * sign.f * sinf_fast(M_PI * r) * sinf_fast(M_PI * t / width))
         / (DT_LANCZOS_EPSILON + M_PI * M_PI * t * t);
441 442
}

443 444
#if defined(__SSE2__)
static inline __m128 lanczos_sse2(__m128 width, __m128 t)
445
{
446 447 448 449 450 451
  /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
   * correct */
  __m128i a = _mm_cvtps_epi32(t);
  __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));

  // Compute the correct sign for sinf(pi.r)
452 453 454 455 456 457 458 459 460
  static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT)))
  = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
  static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1 };
  static const __m128 eps
      = { DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON };
  static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
  static const __m128 pi2 = { M_PI * M_PI, M_PI * M_PI, M_PI * M_PI, M_PI * M_PI };

  __m128i isign = _mm_and_si128(*(__m128i *)ione, a);
461
  isign = _mm_slli_epi64(isign, 31);
462
  isign = _mm_or_si128(*(__m128i *)fone, isign);
463 464 465 466 467 468 469 470 471 472 473
  __m128 fsign = _mm_castsi128_ps(isign);

  __m128 num = _mm_mul_ps(width, fsign);
  num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
  num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
  num = _mm_add_ps(eps, num);

  __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
  den = _mm_add_ps(eps, den);

  return _mm_div_ps(num, den);
474
}
475
#endif
476 477 478 479

#undef DT_LANCZOS_EPSILON

/* --------------------------------------------------------------------------
480
 * All our known interpolators
481 482
 * ------------------------------------------------------------------------*/

483 484 485 486 487
/* !!! !!! !!!
 * Make sure MAX_HALF_FILTER_WIDTH is at least equal to the maximum width
 * of this filter list. Otherwise bad things will happen
 * !!! !!! !!!
 */
488
static const struct dt_interpolation dt_interpolator[] = {
489 490 491 492
  {.id = DT_INTERPOLATION_BILINEAR,
   .name = "bilinear",
   .width = 1,
   .func = &bilinear,
493
#if defined(__SSE2__)
494 495 496 497 498 499 500
   .funcsse = &bilinear_sse
#endif
  },
  {.id = DT_INTERPOLATION_BICUBIC,
   .name = "bicubic",
   .width = 2,
   .func = &bicubic,
501
#if defined(__SSE2__)
502 503 504 505 506 507 508
   .funcsse = &bicubic_sse
#endif
  },
  {.id = DT_INTERPOLATION_LANCZOS2,
   .name = "lanczos2",
   .width = 2,
   .func = &lanczos,
509 510
#if defined(__SSE2__)
   .funcsse = &lanczos_sse2
511 512 513 514 515 516
#endif
  },
  {.id = DT_INTERPOLATION_LANCZOS3,
   .name = "lanczos3",
   .width = 3,
   .func = &lanczos,
517 518
#if defined(__SSE2__)
   .funcsse = &lanczos_sse2
519 520
#endif
  },
521 522 523
};

/* --------------------------------------------------------------------------
524
 * Kernel utility methods
525 526
 * ------------------------------------------------------------------------*/

527 528 529 530
/** Computes an upsampling filtering kernel
 *
 * @param itor [in] Interpolator used
 * @param kernel [out] resulting itor->width*2 filter taps
531
 * @param norm [out] Kernel norm
532
 * @param first [out] first input sample index used
533
 * @param t [in] Interpolated coordinate */
534 535
static inline void compute_upsampling_kernel_plain(const struct dt_interpolation *itor, float *kernel,
                                                   float *norm, int *first, float t)
536
{
537
  int f = (int)t - itor->width + 1;
538
  if(first)
539
  {
540 541 542
    *first = f;
  }

543 544
  /* Find closest integer position and then offset that to match first
   * filtered sample position */
545
  t = t - (float)f;
546 547

  // Will hold kernel norm
548
  float n = 0.f;
549 550

  // Compute the raw kernel
551
  for(int i = 0; i < 2 * itor->width; i++)
552
  {
553
    float tap = itor->func((float)itor->width, t);
554
    n += tap;
555 556 557
    kernel[i] = tap;
    t -= 1.f;
  }
558
  if(norm)
559
  {
560 561
    *norm = n;
  }
562 563
}

564
#if defined(__SSE2__)
565 566 567
/** Computes an upsampling filtering kernel (SSE version, four taps per inner loop)
 *
 * @param itor [in] Interpolator used
568 569
 * @param kernel [out] resulting itor->width*2 filter taps (array must be at least (itor->width*2+3)/4*4
 *floats long)
570
 * @param norm [out] Kernel norm
571
 * @param first [out] first input sample index used
572 573 574 575
 * @param t [in] Interpolated coordinate
 *
 * @return kernel norm
 */
576 577
static inline void compute_upsampling_kernel_sse(const struct dt_interpolation *itor, float *kernel,
                                                 float *norm, int *first, float t)
578
{
579
  int f = (int)t - itor->width + 1;
580
  if(first)
581
  {
582 583 584
    *first = f;
  }

585 586
  /* Find closest integer position and then offset that to match first
   * filtered sample position */
587
  t = t - (float)f;
588 589

  // Prepare t vector to compute four values a loop
590 591
  static const __m128 bootstrap = { 0.f, -1.f, -2.f, -3.f };
  static const __m128 iter = { -4.f, -4.f, -4.f, -4.f };
592
  __m128 vt = _mm_add_ps(_mm_set_ps1(t), bootstrap);
593 594 595 596
  __m128 vw = _mm_set_ps1((float)itor->width);

  // Prepare counters (math kept stupid for understanding)
  int i = 0;
597
  int runs = (2 * itor->width + 3) / 4;
598

599
  while(i < runs)
600
  {
601 602 603
    // Compute the values
    __m128 vr = itor->funcsse(vw, vt);

604
    // Save result
605
    *(__m128 *)kernel = vr;
606

607
    // Prepare next iteration
608 609 610 611 612 613
    vt = _mm_add_ps(vt, iter);
    kernel += 4;
    i++;
  }

  // compute norm now
614
  if(norm)
615
  {
616 617
    float n = 0.f;
    i = 0;
618 619
    kernel -= 4 * runs;
    while(i < 2 * itor->width)
620
    {
621 622 623 624 625
      n += *kernel;
      kernel++;
      i++;
    }
    *norm = n;
626 627
  }
}
628 629 630 631 632 633
#endif

static inline void compute_upsampling_kernel(const struct dt_interpolation *itor, float *kernel, float *norm,
                                             int *first, float t)
{
  if(darktable.codepath.OPENMP_SIMD) return compute_upsampling_kernel_plain(itor, kernel, norm, first, t);
634
#if defined(__SSE2__)
635 636 637 638 639 640
  else if(darktable.codepath.SSE2)
    return compute_upsampling_kernel_sse(itor, kernel, norm, first, t);
#endif
  else
    dt_unreachable_codepath();
}
641

642 643 644 645 646
/** Computes a downsampling filtering kernel
 *
 * @param itor [in] Interpolator used
 * @param kernelsize [out] Number of taps
 * @param kernel [out] resulting taps (at least itor->width/inoout elements for no overflow)
647
 * @param norm [out] Kernel norm
648
 * @param first [out] index of the first sample for which the kernel is to be applied
649
 * @param outoinratio [in] "out samples" over "in samples" ratio
650
 * @param xout [in] Output coordinate */
651 652 653
static inline void compute_downsampling_kernel_plain(const struct dt_interpolation *itor, int *taps,
                                                     int *first, float *kernel, float *norm,
                                                     float outoinratio, int xout)
654 655 656 657 658 659
{
  // Keep this at hand
  float w = (float)itor->width;

  /* Compute the phase difference between output pixel and its
   * input corresponding input pixel */
660 661
  float xin = ceil_fast(((float)xout - w) / outoinratio);
  if(first)
662
  {
663 664
    *first = (int)xin;
  }
665 666

  // Compute first interpolator parameter
667
  float t = xin * outoinratio - (float)xout;
668 669

  // Will hold kernel norm
670
  float n = 0.f;
671 672

  // Compute all filter taps
673 674
  *taps = (int)((w - t) / outoinratio);
  for(int i = 0; i < *taps; i++)
675
  {
676
    *kernel = itor->func(w, t);
677
    n += *kernel;
678
    t += outoinratio;
679
    kernel++;
680 681
  }

682
  if(norm)
683
  {
684 685
    *norm = n;
  }
686 687
}

688

689
#if defined(__SSE2__)
690
/** Computes a downsampling filtering kernel (SSE version, four taps per inner loop iteration)
691 692 693
 *
 * @param itor [in] Interpolator used
 * @param kernelsize [out] Number of taps
694
 * @param kernel [out] resulting taps (at least itor->width/inoout + 4 elements for no overflow)
695
 * @param norm [out] Kernel norm
696 697
 * @param first [out] index of the first sample for which the kernel is to be applied
 * @param outoinratio [in] "out samples" over "in samples" ratio
698
 * @param xout [in] Output coordinate */
699 700
static inline void compute_downsampling_kernel_sse(const struct dt_interpolation *itor, int *taps, int *first,
                                                   float *kernel, float *norm, float outoinratio, int xout)
701 702 703 704 705 706
{
  // Keep this at hand
  float w = (float)itor->width;

  /* Compute the phase difference between output pixel and its
   * input corresponding input pixel */
707 708
  float xin = ceil_fast(((float)xout - w) / outoinratio);
  if(first)
709
  {
710 711 712 713
    *first = (int)xin;
  }

  // Compute first interpolator parameter
714
  float t = xin * outoinratio - (float)xout;
715 716

  // Compute all filter taps
717
  *taps = (int)((w - t) / outoinratio);
718 719

  // Bootstrap vector t
720 721
  static const __m128 bootstrap = { 0.f, 1.f, 2.f, 3.f };
  const __m128 iter = _mm_set_ps1(4.f * outoinratio);
722 723 724 725 726
  const __m128 vw = _mm_set_ps1(w);
  __m128 vt = _mm_add_ps(_mm_set_ps1(t), _mm_mul_ps(_mm_set_ps1(outoinratio), bootstrap));

  // Prepare counters (math kept stupid for understanding)
  int i = 0;
727
  int runs = (*taps + 3) / 4;
728

729
  while(i < runs)
730
  {
731 732 733 734
    // Compute the values
    __m128 vr = itor->funcsse(vw, vt);

    // Save result
735
    *(__m128 *)kernel = vr;
736 737 738 739 740 741 742 743

    // Prepare next iteration
    vt = _mm_add_ps(vt, iter);
    kernel += 4;
    i++;
  }

  // compute norm now
744
  if(norm)
745
  {
746 747
    float n = 0.f;
    i = 0;
748 749
    kernel -= 4 * runs;
    while(i < *taps)
750
    {
751 752 753 754 755
      n += *kernel;
      kernel++;
      i++;
    }
    *norm = n;
756 757
  }
}
758 759 760 761 762 763 764
#endif

static inline void compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first,
                                               float *kernel, float *norm, float outoinratio, int xout)
{
  if(darktable.codepath.OPENMP_SIMD)
    return compute_downsampling_kernel_plain(itor, taps, first, kernel, norm, outoinratio, xout);
765
#if defined(__SSE2__)
766 767 768 769 770 771
  else if(darktable.codepath.SSE2)
    return compute_downsampling_kernel_sse(itor, taps, first, kernel, norm, outoinratio, xout);
#endif
  else
    dt_unreachable_codepath();
}
772

773
/* --------------------------------------------------------------------------
774
 * Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)
775 776
 * ------------------------------------------------------------------------*/

777 778
#define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))

779 780 781
float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x,
                                      const float y, const int width, const int height,
                                      const int samplestride, const int linestride)
782
{
783
  assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
784

785 786
  float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
  float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
787 788

  // Compute both horizontal and vertical kernels
789 790
  float normh;
  float normv;
791 792
  compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
  compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
793

794 795
  int ix = (int)x;
  int iy = (int)y;
796

797 798 799 800 801
  /* Now 2 cases, the pixel + filter width goes outside the image
   * in that case we have to use index clipping to keep all reads
   * in the input image (slow path) or we are sure it won't fall
   * outside and can do more simple code */
  float r;
802 803
  if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
     && iy < (height - itor->width))
804
  {
805 806 807
    // Inside image boundary case

    // Go to top left pixel
808 809
    in = (float *)in + linestride * iy + ix * samplestride;
    in = in - (itor->width - 1) * (samplestride + linestride);
810 811 812

    // Apply the kernel
    float s = 0.f;
813
    for(int i = 0; i < 2 * itor->width; i++)
814
    {
815
      float h = 0.0f;
816
      for(int j = 0; j < 2 * itor->width; j++)
817
      {
818
        h += kernelh[j] * in[j * samplestride];
819
      }
820
      s += kernelv[i] * h;
821
      in += linestride;
822
    }
823
    r = s / (normh * normv);
824
  }
825
  else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
826
  {
827 828 829
    // At least a valid coordinate


830
    // Point to the upper left pixel index wise
831 832
    iy -= itor->width - 1;
    ix -= itor->width - 1;
833 834 835 836 837 838

    static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
    assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong

    int xtap_first;
    int xtap_last;
839
    prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
840 841 842

    int ytap_first;
    int ytap_last;
843
    prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
844 845 846

    // Apply the kernel
    float s = 0.f;
847
    for(int i = ytap_first; i < ytap_last; i++)
848
    {
849
      int clip_y = clip(iy + i, 0, height - 1, bordermode);
850
      float h = 0.0f;
851
      for(int j = xtap_first; j < xtap_last; j++)
852
      {
853 854 855
        int clip_x = clip(ix + j, 0, width - 1, bordermode);
        const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
        h += kernelh[j] * ipixel[0];
856
      }
857
      s += kernelv[i] * h;
858 859
    }

860
    r = s / (normh * normv);
861 862 863
  }
  else
  {
864 865
    // invalide coordinate
    r = 0.0f;
866
  }
867
  return r;
868 869
}

870 871 872 873
/* --------------------------------------------------------------------------
 * Pixel interpolation function (see usage in iop/lens.c and iop/clipping.c)
 * ------------------------------------------------------------------------*/

874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965
static void dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation *itor, const float *in,
                                                   float *out, const float x, const float y, const int width,
                                                   const int height, const int linestride)
{
  assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));

  // Quite a bit of space for kernels
  float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
  float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));

  // Compute both horizontal and vertical kernels
  float normh;
  float normv;
  compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
  compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);

  // Precompute the inverse of the filter norm for later use
  const float oonorm = (1.f / (normh * normv));

  /* Now 2 cases, the pixel + filter width goes outside the image
   * in that case we have to use index clipping to keep all reads
   * in the input image (slow path) or we are sure it won't fall
   * outside and can do more simple code */
  int ix = (int)x;
  int iy = (int)y;

  if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
     && iy < (height - itor->width))
  {
    // Inside image boundary case

    // Go to top left pixel
    in = (float *)in + linestride * iy + ix * 4;
    in = in - (itor->width - 1) * (4 + linestride);

    // Apply the kernel
    float pixel[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    for(int i = 0; i < 2 * itor->width; i++)
    {
      float h[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
      for(int j = 0; j < 2 * itor->width; j++)
      {
        for(int c = 0; c < 3; c++) h[c] += kernelh[j] * in[j * 4 + c];
      }
      for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
      in += linestride;
    }

    for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
  }
  else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
  {
    // At least a valid coordinate

    // Point to the upper left pixel index wise
    iy -= itor->width - 1;
    ix -= itor->width - 1;

    static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
    assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong

    int xtap_first;
    int xtap_last;
    prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);

    int ytap_first;
    int ytap_last;
    prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);

    // Apply the kernel
    float pixel[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    for(int i = ytap_first; i < ytap_last; i++)
    {
      int clip_y = clip(iy + i, 0, height - 1, bordermode);
      float h[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
      for(int j = xtap_first; j < xtap_last; j++)
      {
        int clip_x = clip(ix + j, 0, width - 1, bordermode);
        const float *ipixel = in + clip_y * linestride + clip_x * 4;
        for(int c = 0; c < 3; c++) h[c] += kernelh[j] * ipixel[c];
      }
      for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
    }

    for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
  }
  else
  {
    for(int c = 0; c < 3; c++) out[c] = 0.0f;
  }
}

966
#if defined(__SSE2__)
967 968 969
static void dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation *itor, const float *in,
                                                 float *out, const float x, const float y, const int width,
                                                 const int height, const int linestride)
970
{
971
  assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
972 973

  // Quite a bit of space for kernels
974 975
  float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
  float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
976 977
  __m128 vkernelh[2 * MAX_HALF_FILTER_WIDTH];
  __m128 vkernelv[2 * MAX_HALF_FILTER_WIDTH];
978 979

  // Compute both horizontal and vertical kernels
980 981
  float normh;
  float normv;
982 983
  compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
  compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
984 985

  // We will process four components a time, duplicate the information
986
  for(int i = 0; i < 2 * itor->width; i++)
987
  {
988 989 990 991 992
    vkernelh[i] = _mm_set_ps1(kernelh[i]);
    vkernelv[i] = _mm_set_ps1(kernelv[i]);
  }

  // Precompute the inverse of the filter norm for later use
993
  __m128 oonorm = _mm_set_ps1(1.f / (normh * normv));
994

995 996 997 998 999 1000 1001
  /* Now 2 cases, the pixel + filter width goes outside the image
   * in that case we have to use index clipping to keep all reads
   * in the input image (slow path) or we are sure it won't fall
   * outside and can do more simple code */
  int ix = (int)x;
  int iy = (int)y;

1002 1003
  if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
     && iy < (height - itor->width))
1004
  {
1005 1006 1007
    // Inside image boundary case

    // Go to top left pixel
1008 1009
    in = (float *)in + linestride * iy + ix * 4;
    in = in - (itor->width - 1) * (4 + linestride);
1010 1011 1012

    // Apply the kernel
    __m128 pixel = _mm_setzero_ps();
1013
    for(int i = 0; i < 2 * itor->width; i++)
1014
    {
1015
      __m128 h = _mm_setzero_ps();
1016
      for(int j = 0; j < 2 * itor->width; j++)
1017
      {
1018
        h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)&in[j * 4]));
1019
      }
1020
      pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
1021
      in += linestride;
1022 1023
    }

1024
    *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
1025
  }
1026
  else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
1027
  {
1028 1029
    // At least a valid coordinate

1030
    // Point to the upper left pixel index wise
1031 1032
    iy -= itor->width - 1;
    ix -= itor->width - 1;
1033 1034 1035 1036 1037 1038

    static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
    assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong

    int xtap_first;
    int xtap_last;
1039
    prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
1040 1041 1042

    int ytap_first;
    int ytap_last;
1043
    prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
1044 1045 1046

    // Apply the kernel
    __m128 pixel = _mm_setzero_ps();
1047
    for(int i = ytap_first; i < ytap_last; i++)
1048
    {
1049
      int clip_y = clip(iy + i, 0, height - 1, bordermode);
1050
      __m128 h = _mm_setzero_ps();
1051
      for(int j = xtap_first; j < xtap_last; j++)
1052
      {
1053 1054 1055
        int clip_x = clip(ix + j, 0, width - 1, bordermode);
        const float *ipixel = in + clip_y * linestride + clip_x * 4;
        h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)ipixel));
1056
      }
1057
      pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
1058 1059
    }

1060
    *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
1061 1062 1063
  }
  else
  {
1064
    *(__m128 *)out = _mm_set_ps1(0.0f);
1065
  }
1066
}
1067 1068 1069 1070 1071 1072 1073 1074
#endif

void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out,
                                      const float x, const float y, const int width, const int height,
                                      const int linestride)
{
  if(darktable.codepath.OPENMP_SIMD)
    return dt_interpolation_compute_pixel4c_plain(itor, in, out, x, y, width, height, linestride);
1075
#if defined(__SSE2__)
1076 1077 1078 1079 1080 1081
  else if(darktable.codepath.SSE2)
    return dt_interpolation_compute_pixel4c_sse(itor, in, out, x, y, width, height, linestride);
#endif
  else
    dt_unreachable_codepath();
}
1082

1083 1084 1085 1086
/* --------------------------------------------------------------------------
 * Interpolation factory
 * ------------------------------------------------------------------------*/

1087
const struct dt_interpolation *dt_interpolation_new(enum dt_interpolation_type type)
1088
{
1089
  const struct dt_interpolation *itor = NULL;
1090

1091
  if(type == DT_INTERPOLATION_USERPREF)
1092
  {
1093
    // Find user preferred interpolation method
1094 1095
    gchar *uipref = dt_conf_get_string("plugins/lighttable/export/pixel_interpolator");
    for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1096
    {
1097
      if(!strcmp(uipref, dt_interpolator[i].name))
1098
      {
1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
        // Found the one
        itor = &dt_interpolator[i];
        break;
      }
    }
    g_free(uipref);

    /* In the case the search failed (!uipref or name not found),
     * prepare later search pass with default fallback */
    type = DT_INTERPOLATION_DEFAULT;
  }
1110
  if(!itor)
1111
  {
1112
    // Did not find the userpref one or we've been asked for a specific one
1113
    for(int i = DT_INTERPOLATION_FIRST; i < DT_INTERPOLATION_LAST; i++)
1114
    {
1115
      if(dt_interpolator[i].id == type)
1116
      {
1117 1118 1119
        itor = &dt_interpolator[i];
        break;
      }
1120
      if(dt_interpolator[i].id == DT_INTERPOLATION_DEFAULT)
1121
      {
1122 1123 1124 1125 1126 1127 1128
        itor = &dt_interpolator[i];
      }
    }
  }

  return itor;
}
1129 1130 1131 1132 1133

/* --------------------------------------------------------------------------
 * Image resampling
 * ------------------------------------------------------------------------*/

1134
/** Prepares a 1D resampling plan
1135
 *
Unknown's avatar
Unknown committed
1136
 * This consists of the following information
1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
 * <ul>
 * <li>A list of lengths that tell how many pixels are relevant for the
 *    next output</li>
 * <li>A list of required filter kernels</li>
 * <li>A list of sample indexes</li>
 * </ul>
 *
 * How to apply the resampling plan:
 * <ol>
 * <li>Pick a length from the length array</li>
 * <li>until length is reached
 *     <ol>
 *     <li>pick a kernel tap></li>
 *     <li>pick the relevant sample according to the picked index</li>
 *     <li>multiply them and accumulate</li>
 *     </ol>
 * </li>
 * <li>here goes a single output sample</li>
 * </ol>
 *
 * This until you reach the number of output pixels
 *
 * @param itor interpolator used to resample
 * @param in [in] Number of input samples
 * @param out [in] Number of output samples
 * @param plength [out] Array of lengths for each pixel filtering (number
 * of taps/indexes to use). This array mus be freed with fre() when you're
 * done with the plan.
 * @param pkernel [out] Array of filter kernel taps
 * @param pindex [out] Array of sample indexes to be used for applying each kernel tap
Unknown's avatar
Unknown committed
1167
 * arrays of information
1168 1169
 * @param pmeta [out] Array of int triplets (length, kernel, index) telling where to start for an arbitrary
 *out position meta[3*out]
1170 1171
 * @return 0 for success, !0 for failure
 */
1172 1173 1174
static int prepare_resampling_plan(const struct dt_interpolation *itor, int in, const int in_x0, int out,
                                   const int out_x0, float scale, int **plength, float **pkernel,
                                   int **pindex, int **pmeta)
1175 1176 1177 1178 1179
{
  // Safe return values
  *plength = NULL;
  *pkernel = NULL;
  *pindex = NULL;
1180
  if(pmeta)
1181 1182
  {
    *pmeta = NULL;
1183
  }
1184

1185
  if(scale == 1.f)
1186
  {
1187 1188 1189 1190 1191
    // No resampling required
    return 0;
  }

  // Compute common upsampling/downsampling memory requirements
1192
  int maxtapsapixel;
1193
  if(scale > 1.f)
1194
  {
1195
    // Upscale... the easy one. The values are exact
1196
    maxtapsapixel = 2 * itor->width;
1197 1198 1199
  }
  else
  {
1200
    // Downscale... going for worst case values memory wise
1201
    maxtapsapixel = ceil_fast((float)2 * (float)itor->width / scale);
1202 1203
  }

1204
  int nlengths = out;
1205 1206 1207 1208 1209 1210
  int nindex = maxtapsapixel * out;
  int nkernel = maxtapsapixel * out;
  size_t lengthreq = increase_for_alignment(nlengths * sizeof(int), SSE_ALIGNMENT);
  size_t indexreq = increase_for_alignment(nindex * sizeof(int), SSE_ALIGNMENT);
  size_t kernelreq = increase_for_alignment(nkernel * sizeof(float), SSE_ALIGNMENT);
  size_t scratchreq = maxtapsapixel * sizeof(float) + 4 * sizeof(float);
1211
  // NB: because sse versions compute four taps a time
1212
  size_t metareq = pmeta ? 3 * sizeof(int) * out : 0;
1213

1214
  void *blob = NULL;
1215
  size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
1216
  blob = dt_alloc_align(SSE_ALIGNMENT, totalreq);
1217
  if(!blob)
1218
  {
1219 1220 1221
    return 1;
  }

1222 1223 1224 1225 1226 1227 1228 1229 1230
  int *lengths = (int *)blob;
  blob = (char *)blob + lengthreq;
  int *index = (int *)blob;
  blob = (char *)blob + indexreq;
  float *kernel = (float *)blob;
  blob = (char *)blob + kernelreq;
  float *scratchpad = scratchreq ? (float *)blob : NULL;
  blob = (char *)blob + scratchreq;
  int *meta = metareq ? (int *)blob : NULL;
1231
//   blob = (char *)blob + metareq;
1232

1233
  /* setting this as a const should help the compilers trim all unnecessary
1234 1235 1236 1237 1238 1239
   * codepaths */
  const enum border_mode bordermode = RESAMPLING_BORDER_MODE;

  /* Upscale and downscale differ in subtle points, getting rid of code
   * duplication might have been tricky and i prefer keeping the code
   * as straight as possible */
1240
  if(scale > 1.f)
1241
  {
1242 1243 1244
    int kidx = 0;
    int iidx = 0;
    int lidx = 0;
1245
    int midx = 0;
1246
    for(int x = 0; x < out; x++)
1247
    {
1248
      if(meta)
1249
      {
1250 1251 1252 1253
        meta[midx++] = lidx;
        meta[midx++] = kidx;
        meta[midx++] = iidx;
      }
1254 1255

      // Projected position in input samples
1256
      float fx = (float)(out_x0 + x) / scale;
1257 1258 1259

      // Compute the filter kernel at that position
      int first;
1260
      compute_upsampling_kernel(itor, scratchpad, NULL, &first, fx);
1261

1262