Commit d99df319 authored by Tobias Ellinghaus's avatar Tobias Ellinghaus

Use SSE in colorbalance + clean up some color code

This manually merges the work done in PR #948.
parent 31b9b401
......@@ -196,7 +196,7 @@ cairo_surface_t *cairo_surface_create_from_xyz_data(const float *const image, co
{
float sRGB[3];
int32_t pixel = 0;
dt_XYZ_to_sRGB(iter, sRGB);
dt_XYZ_to_sRGB_clipped(iter, sRGB);
for(int c = 0; c < 3; c++) pixel |= ((int)(sRGB[c] * 255) & 0xff) << (16 - c * 8);
*((int *)(&rgbbuf[(x + (size_t)y * width) * 4])) = pixel;
}
......
......@@ -118,7 +118,7 @@ void set_color(box_t *box, dt_colorspaces_color_profile_type_t color_space, floa
case DT_COLORSPACE_LAB:
dt_Lab_to_XYZ(Lab, XYZ);
case DT_COLORSPACE_XYZ:
dt_XYZ_to_sRGB(XYZ, box->rgb);
dt_XYZ_to_sRGB_clipped(XYZ, box->rgb);
break;
}
}
......
......@@ -19,6 +19,7 @@
#pragma once
#include "common/colorspaces.h"
#include "common/colorspaces_inline_conversions.h"
#include <glib.h>
......
/*
This file is part of darktable,
copyright (c) 2009--2010 johannes hanika.
copyright (c) 2011--2017 tobias ellinghaus.
darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -348,120 +349,6 @@ static cmsToneCurve *build_linear_gamma(void)
return cmsBuildParametricToneCurve(0, 1, Parameters);
}
static float cbrt_5f(float f)
{
uint32_t *p = (uint32_t *)&f;
*p = *p / 3 + 709921077;
return f;
}
static float cbrta_halleyf(const float a, const float R)
{
const float a3 = a * a * a;
const float b = a * (a3 + R + R) / (a3 + a3 + R);
return b;
}
static float lab_f(const float x)
{
const float epsilon = 216.0f / 24389.0f;
const float kappa = 24389.0f / 27.0f;
if(x > epsilon)
{
// approximate cbrtf(x):
const float a = cbrt_5f(x);
return cbrta_halleyf(a, x);
}
else
return (kappa * x + 16.0f) / 116.0f;
}
void dt_XYZ_to_Lab(const float *XYZ, float *Lab)
{
const float d50[3] = { 0.9642, 1.0, 0.8249 };
const float f[3] = { lab_f(XYZ[0] / d50[0]), lab_f(XYZ[1] / d50[1]), lab_f(XYZ[2] / d50[2]) };
Lab[0] = 116.0f * f[1] - 16.0f;
Lab[1] = 500.0f * (f[0] - f[1]);
Lab[2] = 200.0f * (f[1] - f[2]);
}
static float lab_f_inv(const float x)
{
const float epsilon = 0.20689655172413796; // cbrtf(216.0f/24389.0f);
const float kappa = 24389.0f / 27.0f;
if(x > epsilon)
return x * x * x;
else
return (116.0f * x - 16.0f) / kappa;
}
void dt_Lab_to_XYZ(const float *Lab, float *XYZ)
{
const float d50[3] = { 0.9642, 1.0, 0.8249 };
const float fy = (Lab[0] + 16.0f) / 116.0f;
const float fx = Lab[1] / 500.0f + fy;
const float fz = fy - Lab[2] / 200.0f;
XYZ[0] = d50[0] * lab_f_inv(fx);
XYZ[1] = d50[1] * lab_f_inv(fy);
XYZ[2] = d50[2] * lab_f_inv(fz);
}
void dt_XYZ_to_sRGB(const float * const XYZ, float *sRGB)
{
const float xyz_to_srgb_matrix[3][3] =
{
{3.1338561, -1.6168667, -0.4906146},
{-0.9787684, 1.9161415, 0.0334540},
{0.0719453, -0.2289914, 1.4052427}
};
// XYZ -> sRGB
float rgb[3] = {0, 0, 0};
for(int r = 0; r < 3; r++)
for(int c = 0; c < 3; c++)
rgb[r] += xyz_to_srgb_matrix[r][c] * XYZ[c];
// linear sRGB -> gamma corrected sRGB
for(int c = 0; c < 3; c++)
rgb[c] = rgb[c] <= 0.0031308 ? 12.92 * rgb[c] : (1.0 + 0.055) * powf(rgb[c], 1.0 / 2.4) - 0.055;
#define CLIP(a) ((a) < 0 ? 0 : (a) > 1 ? 1 : (a))
for(int i = 0; i < 3; i++)
sRGB[i] = CLIP(rgb[i]);
#undef CLIP
}
void dt_Lab_to_prophotorgb(const float *const Lab, float *rgb)
{
const float xyz_to_rgb[3][3] = { // prophoto rgb d50
{ 1.3459433, -0.2556075, -0.0511118},
{-0.5445989, 1.5081673, 0.0205351},
{ 0.0000000, 0.0000000, 1.2118128},
};
float XYZ[3];
dt_Lab_to_XYZ(Lab, XYZ);
rgb[0] = rgb[1] = rgb[2] = 0.0f;
for(int r = 0; r < 3; r++)
for(int c = 0; c < 3; c++)
rgb[r] += xyz_to_rgb[r][c] * XYZ[c];
}
void dt_prophotorgb_to_Lab(const float *const rgb, float *Lab)
{
const float rgb_to_xyz[3][3] = { // prophoto rgb
{0.7976749, 0.1351917, 0.0313534},
{0.2880402, 0.7118741, 0.0000857},
{0.0000000, 0.0000000, 0.8252100},
};
float XYZ[3] = {0.0f};
for(int r = 0; r < 3; r++)
for(int c = 0; c < 3; c++)
XYZ[r] += rgb_to_xyz[r][c] * rgb[c];
dt_XYZ_to_Lab(XYZ, Lab);
}
int dt_colorspaces_get_darktable_matrix(const char *makermodel, float *matrix)
{
dt_profiled_colormatrix_t *preset = NULL;
......
/*
This file is part of darktable,
copyright (c) 2009--2010 johannes hanika.
copyright (c) 2011--2017 tobias ellinghaus.
darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -19,6 +20,7 @@
#pragma once
#include "common/darktable.h"
#include <lcms2.h>
// this was removed from lcms2 in 2.4
......@@ -142,18 +144,6 @@ cmsHPROFILE dt_colorspaces_get_rgb_profile_from_mem(uint8_t *data, uint32_t size
/** free the resources of a profile created with the functions above. */
void dt_colorspaces_cleanup_profile(cmsHPROFILE p);
/** uses D50 white point. */
void dt_XYZ_to_Lab(const float *XYZ, float *Lab);
/** uses D50 white point. */
void dt_Lab_to_XYZ(const float *Lab, float *XYZ);
/** uses D50 white point and clips the output to [0..1]. */
void dt_XYZ_to_sRGB(const float * const XYZ, float *sRGB);
void dt_Lab_to_prophotorgb(const float * const Lab, float *rgb);
void dt_prophotorgb_to_Lab(const float * const rgb, float *Lab);
/** extracts tonecurves and color matrix prof to XYZ from a given input profile, returns 0 on success (curves
* and matrix are inverted for input) */
int dt_colorspaces_get_matrix_from_input_profile(cmsHPROFILE prof, float *matrix, float *lutr, float *lutg,
......
This diff is collapsed.
/*
This file is part of darktable,
copyright (c) 2015 Bruce Guenter
darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
darktable is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with darktable. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <xmmintrin.h>
/*
* Fast SSE2 implementation of special math functions.
*/
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
#define EXP_POLY_DEGREE 4
#define LOG_POLY_DEGREE 5
/**
* See http://www.devmaster.net/forums/showthread.php?p=43580
*/
static inline __m128 _mm_exp2_ps(__m128 x)
{
__m128i ipart;
__m128 fpart, expipart, expfpart;
x = _mm_min_ps(x, _mm_set1_ps(129.00000f));
x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
/* ipart = int(x - 0.5) */
ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
/* fpart = x - ipart */
fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
/* expipart = (float) (1 << ipart) */
expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
/* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
expfpart
= POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif
return _mm_mul_ps(expipart, expfpart);
}
/**
* See http://www.devmaster.net/forums/showthread.php?p=43580
*/
static inline __m128 _mm_log2_ps(__m128 x)
{
__m128i expmask = _mm_set1_epi32(0x7f800000);
__m128i mantmask = _mm_set1_epi32(0x007fffff);
__m128 one = _mm_set1_ps(1.0f);
__m128i i = _mm_castps_si128(x);
/* exp = (float) exponent(x) */
__m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
/* mant = (float) mantissa(x) */
__m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
__m128 logmant;
/* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
* These coefficients can be generate with
* http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
*/
#if LOG_POLY_DEGREE == 6
logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f,
-1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f,
-0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f,
-0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif
/* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
return _mm_add_ps(logmant, exp);
}
static inline __m128 _mm_pow_ps(__m128 x, __m128 y)
{
return _mm_exp2_ps(_mm_log2_ps(x) * y);
}
static inline __m128 _mm_pow_ps1(__m128 x, float y)
{
return _mm_exp2_ps(_mm_log2_ps(x) * y);
}
// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
// vim: shiftwidth=2 expandtab tabstop=2 cindent
// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-space on;
......@@ -21,7 +21,7 @@
#endif
#include "bauhaus/bauhaus.h"
#include "common/bilateral.h"
#include "common/colorspaces.h"
#include "common/colorspaces_inline_conversions.h"
#include "common/debug.h"
#include "common/interpolation.h"
#include "common/opencl.h"
......
......@@ -24,9 +24,10 @@ http://www.youtube.com/watch?v=JVoUgR6bhBc
#endif
// our includes go first:
#include "bauhaus/bauhaus.h"
#include "common/colorspaces.h"
#include "common/colorspaces_inline_conversions.h"
#include "common/opencl.h"
#include "develop/imageop.h"
#include "develop/imageop_math.h"
#include "dtgtk/drawingarea.h"
#include "gui/gtk.h"
#include "iop/iop_api.h"
......@@ -121,25 +122,6 @@ void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const
d->gain[CHANNEL_GREEN] * d->gain[CHANNEL_FACTOR],
d->gain[CHANNEL_BLUE] * d->gain[CHANNEL_FACTOR] };
// sRGB -> XYZ matrix, D65
const float srgb_to_xyz[3][3] = {
{ 0.4360747, 0.3850649, 0.1430804 },
{ 0.2225045, 0.7168786, 0.0606169 },
{ 0.0139322, 0.0971045, 0.7141733 }
// {0.4124564, 0.3575761, 0.1804375},
// {0.2126729, 0.7151522, 0.0721750},
// {0.0193339, 0.1191920, 0.9503041}
};
// XYZ -> sRGB matrix, D65
const float xyz_to_srgb[3][3] = {
{ 3.1338561, -1.6168667, -0.4906146 },
{ -0.9787684, 1.9161415, 0.0334540 },
{ 0.0719453, -0.2289914, 1.4052427 }
// {3.2404542, -1.5371385, -0.4985314},
// {-0.9692660, 1.8760108, 0.0415560},
// {0.0556434, -0.2040259, 1.0572252}
};
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
......@@ -155,11 +137,7 @@ void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const
dt_Lab_to_XYZ(in, XYZ);
// XYZ -> sRGB
float rgb[3] = { 0, 0, 0 };
for(int r = 0; r < 3; r++)
for(int c = 0; c < 3; c++) rgb[r] += xyz_to_srgb[r][c] * XYZ[c];
// linear sRGB -> gamma corrected sRGB
for(int c = 0; c < 3; c++)
rgb[c] = rgb[c] <= 0.0031308 ? 12.92 * rgb[c] : (1.0 + 0.055) * powf(rgb[c], 1.0 / 2.4) - 0.055;
dt_XYZ_to_sRGB(XYZ, rgb);
// do the calculation in RGB space
for(int c = 0; c < 3; c++)
......@@ -171,12 +149,8 @@ void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const
// transform the result back to Lab
// sRGB -> XYZ
XYZ[0] = XYZ[1] = XYZ[2] = 0.0;
// gamma corrected sRGB -> linear sRGB
for(int c = 0; c < 3; c++)
rgb[c] = rgb[c] <= 0.04045 ? rgb[c] / 12.92 : powf((rgb[c] + 0.055) / (1 + 0.055), 2.4);
for(int r = 0; r < 3; r++)
for(int c = 0; c < 3; c++) XYZ[r] += srgb_to_xyz[r][c] * rgb[c];
dt_sRGB_to_XYZ(rgb, XYZ);
// XYZ -> Lab
dt_XYZ_to_Lab(XYZ, out);
out[3] = in[3];
......@@ -187,6 +161,62 @@ void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const
}
}
#if defined(__SSE__)
void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
dt_iop_colorbalance_data_t *d = (dt_iop_colorbalance_data_t *)piece->data;
const int ch = piece->colors;
// these are RGB values!
const __m128 lift = { 2.0 - (d->lift[CHANNEL_RED] * d->lift[CHANNEL_FACTOR]),
2.0 - (d->lift[CHANNEL_GREEN] * d->lift[CHANNEL_FACTOR]),
2.0 - (d->lift[CHANNEL_BLUE] * d->lift[CHANNEL_FACTOR]) },
gamma = { d->gamma[CHANNEL_RED] * d->gamma[CHANNEL_FACTOR],
d->gamma[CHANNEL_GREEN] * d->gamma[CHANNEL_FACTOR],
d->gamma[CHANNEL_BLUE] * d->gamma[CHANNEL_FACTOR] },
gamma_inv = { (gamma[0] != 0.0) ? 1.0 / gamma[0] : 1000000.0,
(gamma[1] != 0.0) ? 1.0 / gamma[1] : 1000000.0,
(gamma[2] != 0.0) ? 1.0 / gamma[2] : 1000000.0 },
gain = { d->gain[CHANNEL_RED] * d->gain[CHANNEL_FACTOR],
d->gain[CHANNEL_GREEN] * d->gain[CHANNEL_FACTOR],
d->gain[CHANNEL_BLUE] * d->gain[CHANNEL_FACTOR] };
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
for(int j = 0; j < roi_out->height; j++)
{
float *in = ((float *)ivoid) + (size_t)ch * roi_in->width * j;
float *out = ((float *)ovoid) + (size_t)ch * roi_out->width * j;
for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
{
// transform the pixel to sRGB:
// Lab -> XYZ
__m128 Lab = _mm_load_ps(in);
__m128 XYZ = dt_Lab_to_XYZ_sse2(Lab);
// XYZ -> sRGB
__m128 rgb = dt_XYZ_to_sRGB_sse2(XYZ);
// do the calculation in RGB space
__m128 one = _mm_set1_ps(1.0);
__m128 tmp = (((rgb - one) * lift) + one) * gain;
tmp = _mm_max_ps(tmp, _mm_setzero_ps());
rgb = _mm_pow_ps(tmp, gamma_inv);
// transform the result back to Lab
// sRGB -> XYZ
XYZ = dt_sRGB_to_XYZ_sse2(rgb);
// XYZ -> Lab
__m128 outv = dt_XYZ_to_Lab_sse2(XYZ);
_mm_store_ps(out, outv);
}
}
if(piece->pipe->mask_display & DT_DEV_PIXELPIPE_DISPLAY_MASK) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
#endif
#ifdef HAVE_OPENCL
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out,
const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
......
......@@ -22,6 +22,7 @@
#include "bauhaus/bauhaus.h"
#include "common/colormatrices.c"
#include "common/colorspaces.h"
#include "common/colorspaces_inline_conversions.h"
#include "common/image_cache.h"
#include "common/opencl.h"
#include "control/control.h"
......@@ -448,114 +449,6 @@ error:
}
#endif
static inline float _cbrtf(const float x)
{
union convert {
float f;
int i;
} data, dataout;
data.f = x;
// approximate cbrtf(x):
dataout.i = (((int)(((float)(data.i)) / 3.0f)) + 709921077);
return dataout.f;
}
#if defined(__SSE2__)
static inline __m128 _cbrtf_sse2(const __m128 x)
{
return (_mm_castsi128_ps(
_mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))),
_mm_set1_epi32(709921077))));
}
#endif
static inline float lab_f_m(const float x)
{
const float epsilon = (216.0f / 24389.0f);
const float kappa = (24389.0f / 27.0f);
const float a = _cbrtf(x);
const float a3 = a * a * a;
// x > epsilon
const float res_big = (((a) * ((x + x) + a3)) / ((a3 + a3) + x));
// x <= epsilon
const float res_small = (((kappa * x) + (16.0f)) / (116.0f));
// blend results according to whether each component is > epsilon or not
return ((x > epsilon) ? res_big : res_small);
}
#if defined(__SSE2__)
static inline __m128 lab_f_m_sse2(const __m128 x)
{
const __m128 epsilon = _mm_set1_ps(216.0f / 24389.0f);
const __m128 kappa = _mm_set1_ps(24389.0f / 27.0f);
// calculate as if x > epsilon : result = cbrtf(x)
// approximate cbrtf(x):
const __m128 a = _cbrtf_sse2(x);
const __m128 a3 = _mm_mul_ps(_mm_mul_ps(a, a), a);
const __m128 res_big
= _mm_div_ps(_mm_mul_ps(a, _mm_add_ps(a3, _mm_add_ps(x, x))), _mm_add_ps(_mm_add_ps(a3, a3), x));
// calculate as if x <= epsilon : result = (kappa*x+16)/116
const __m128 res_small
= _mm_div_ps(_mm_add_ps(_mm_mul_ps(kappa, x), _mm_set1_ps(16.0f)), _mm_set1_ps(116.0f));
// blend results according to whether each component is > epsilon or not
const __m128 mask = _mm_cmpgt_ps(x, epsilon);
return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}
#endif
static inline void _dt_XYZ_to_Lab(const float *const XYZ, float *const Lab)
{
const float d50_inv[4] = { 1.0f / 0.9642f, 1.0f, 1.0f / 0.8249f, 0.0f };
const float coef[4] = { 116.0f, 500.0f, 200.0f, 0.0f };
float _xyz[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
for(int c = 0; c < 4; c++)
{
_xyz[c] = lab_f_m(d50_inv[c] * XYZ[c]);
}
// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3])
float sf1[4];
sf1[0] = _xyz[1];
sf1[1] = _xyz[0];
sf1[2] = _xyz[1];
sf1[3] = _xyz[3];
float sf2[4];
sf2[0] = _xyz[3];
sf2[1] = _xyz[1];
sf2[2] = _xyz[2];
sf2[3] = _xyz[3];
for(int c = 0; c < 4; c++)
{
Lab[c] = (sf1[c] - sf2[c]) * coef[c];
}
}
#if defined(__SSE2__)
static inline __m128 dt_XYZ_to_Lab_sse2(const __m128 XYZ)
{
const __m128 d50_inv = _mm_set_ps(0.0f, 1.0f / 0.8249f, 1.0f, 1.0f / 0.9642f);
const __m128 coef = _mm_set_ps(0.0f, 200.0f, 500.0f, 116.0f);
const __m128 f = lab_f_m_sse2(_mm_mul_ps(XYZ, d50_inv));
// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3])
return _mm_mul_ps(coef, _mm_sub_ps(_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 1, 0, 1)),
_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 2, 1, 3))));
}
#endif
static inline void apply_blue_mapping(const float *const in, float *const out)
{
out[0] = in[0];
......@@ -621,7 +514,7 @@ static void process_cmatrix_bm(struct dt_iop_module_t *self, dt_dev_pixelpipe_io
}
}
_dt_XYZ_to_Lab(_xyz, out);
dt_XYZ_to_Lab(_xyz, out);
}
else
{
......@@ -651,7 +544,7 @@ static void process_cmatrix_bm(struct dt_iop_module_t *self, dt_dev_pixelpipe_io
}
}
_dt_XYZ_to_Lab(XYZ, out);
dt_XYZ_to_Lab(XYZ, out);
}
}
}
......@@ -685,7 +578,7 @@ static void process_cmatrix_fastpath_simple(struct dt_iop_module_t *self, dt_dev
}
}
_dt_XYZ_to_Lab(_xyz, out);
dt_XYZ_to_Lab(_xyz, out);
}
}
......@@ -732,7 +625,7 @@ static void process_cmatrix_fastpath_clipping(struct dt_iop_module_t *self, dt_d
}
}
_dt_XYZ_to_Lab(XYZ, out);
dt_XYZ_to_Lab(XYZ, out);
}
}
......@@ -795,7 +688,7 @@ static void process_cmatrix_proper(struct dt_iop_module_t *self, dt_dev_pixelpip
}
}
_dt_XYZ_to_Lab(_xyz, out);
dt_XYZ_to_Lab(_xyz, out);
}
else
{
......@@ -825,7 +718,7 @@ static void process_cmatrix_proper(struct dt_iop_module_t *self, dt_dev_pixelpip
}
}
_dt_XYZ_to_Lab(XYZ, out);
dt_XYZ_to_Lab(XYZ, out);
}
}
}
......
......@@ -19,7 +19,7 @@
#include "config.h"
#endif
#include "bauhaus/bauhaus.h"
#include "common/colorspaces.h"
#include "common/colorspaces_inline_conversions.h"
#include "common/opencl.h"
#include "control/control.h"
#include "develop/develop.h"
......
......@@ -21,6 +21,7 @@
#endif
#include "bauhaus/bauhaus.h"
#include "common/colorspaces.h"
#include "common/colorspaces_inline_conversions.h"
#include "common/opencl.h"
#include "control/conf.h"
#include "control/control.h"
......@@ -322,87 +323,6 @@ static void process_fastpath_apply_tonecurves(struct dt_iop_module_t *self, dt_d
}
}
#if defined(_OPENMP) && defined(OPENMP_SIMD_)
#pragma omp declare SIMD()
#endif
static inline float lab_f_inv_m(const float x)
{
const float epsilon = (0.20689655172413796f); // cbrtf(216.0f/24389.0f);
const float kappa_rcp_x16 = (16.0f * 27.0f / 24389.0f);
const float kappa_rcp_x116 = (116.0f * 27.0f / 24389.0f);
// x > epsilon
float res_big = x * x * x;
// x <= epsilon
float res_small = ((kappa_rcp_x116 * x) - kappa_rcp_x16);
// blend results according to whether each component is > epsilon or not
return ((x > epsilon) ? res_big : res_small);
}
#if defined(__SSE__)
static inline __m128 lab_f_inv_m_SSE(const __m128 x)
{
const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f);
// x > epsilon
const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x);
// x <= epsilon
const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16);
// blend results according to whether each component is > epsilon or not
const __m128 mask = _mm_cmpgt_ps(x, epsilon);
return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}
#endif
static inline void _dt_Lab_to_XYZ(const float *const Lab, float *const xyz)
{
const float d50[] = { 0.9642f, 1.0f, 0.8249f };
const float coef[] = { 1.0f / 500.0f, 1.0f / 116.0f, -1.0f / 200.0f };
const float offset = (0.137931034f);
float _F[3];
_F[0] = Lab[1];
_F[1] = Lab[0];
_F[2] = Lab[2];
for(int c = 0; c < 3; c++)
{
_F[c] *= coef[c];
}
float _F1[3];
_F1[0] = _F[1];
_F1[1] = 0.0f;
_F1[2] = _F[1];
for(int c = 0; c < 3; c++)
{
const float f = _F[c] + _F1[c] + offset;
xyz[c] = d50[c] * lab_f_inv_m(f);
}
}