Commit 9bb5bc5b authored by Edouard Gomez's avatar Edouard Gomez

interpolation: add a SSE specialized 4 component pixel interpolator

This assumes the interpolation coordinates are the same for all components
of a pixel, so we can happily save 2/3 of the current kernel computation.
Plus, of course, we compute the 4 components in a single pass.

Ported clipping.c to this new API. lens.c cannot benefit from this
as it theoretically interpolates each color plane independently.

For my test image:
lanczos3: 0.164s -> 0.059s
parent e58d4367
......@@ -361,7 +361,7 @@ compute_kernel_sse(
}
/* --------------------------------------------------------------------------
* Interpolation function (see usage in iop/lens.c and iop/clipping.c)
* Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)
* ------------------------------------------------------------------------*/
float
......@@ -396,6 +396,57 @@ dt_interpolation_compute_sample(
return s/(normh*normv);
}
/* --------------------------------------------------------------------------
 * Pixel interpolation function (see usage in iop/clipping.c)
 * ------------------------------------------------------------------------*/

/* Interpolate one 4-float pixel, all four components in a single pass.
 *
 * One horizontal and one vertical kernel are computed from x and y and shared
 * by the four components. The kernels are applied to a square footprint of
 * (2*itor->width) x (2*itor->width) pixels that starts (itor->width-1) pixels
 * to the left of and (itor->width-1) lines above `in`; the caller has to
 * guarantee that this whole footprint is readable.
 *
 * itor        interpolator; itor->width must be below 4 so that the on-stack
 *             kernel buffers (8 taps each) are large enough
 * in          pixel the footprint is laid around; pixels are 4 floats apart.
 *             It is read with aligned SSE loads, so it has to be 16-byte
 *             aligned (and linestride a multiple of 4 floats)
 * out         destination pixel, 4 floats, written with an aligned SSE store.
 *             NOTE(review): declared `const float*` to match the published
 *             prototype even though the function writes through it; the
 *             caller must pass a pointer to writable memory.
 * x, y        coordinates handed to compute_kernel_sse() for the horizontal
 *             and vertical kernel respectively
 * linestride  distance between two lines of `in`, counted in floats (it is
 *             added to a float pointer)
 */
void
dt_interpolation_compute_pixel4c(
  const struct dt_interpolation* itor,
  const float* in,
  const float* out,
  const float x, const float y,
  const int linestride)
{
  // The kernel buffers below hold 8 entries
  assert(itor->width < 4);

  // Quite a bit of space for kernels
  float kernelh[8] __attribute__((aligned(16)));
  float kernelv[8] __attribute__((aligned(16)));
  __m128 vkernelh[8];
  __m128 vkernelv[8];

  // Compute both horizontal and vertical kernels; the returned values are
  // used as normalization factors at the very end
  const float normh = compute_kernel_sse(itor, kernelh, x);
  const float normv = compute_kernel_sse(itor, kernelv, y);

  // Number of taps applied in each direction
  const int taps = 2*itor->width;

  // We process four components at a time, so broadcast every tap to the four
  // lanes of an SSE register
  for (int i=0; i<taps; i++) {
    vkernelh[i] = _mm_set_ps1(kernelh[i]);
    vkernelv[i] = _mm_set_ps1(kernelv[i]);
  }

  // Precompute the inverse of the filter norm for later use
  const __m128 oonorm = _mm_set_ps1(1.f/(normh*normv));

  // Go to the top left pixel of the footprint: (width-1) pixels to the left
  // (4 floats each) and (width-1) lines up
  in = in - (itor->width-1)*(4 + linestride);

  // Apply the kernel: filter each line horizontally, then weight the line
  // result with the matching vertical tap
  __m128 pixel = _mm_setzero_ps();
  for (int i=0; i<taps; i++) {
    __m128 h = _mm_setzero_ps();
    for (int j=0; j<taps; j++) {
      h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], _mm_load_ps(in + 4*j)));
    }
    pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
    in += linestride;
  }

  // Normalize and store the four components at once. The cast drops the
  // const qualifier imposed by the prototype (see the note on `out` above).
  _mm_store_ps((float*)out, _mm_mul_ps(pixel, oonorm));
}
/* --------------------------------------------------------------------------
* Interpolation factory
* ------------------------------------------------------------------------*/
......
......@@ -77,6 +77,31 @@ dt_interpolation_compute_sample(
const int samplestride,
const int linestride);
/** Compute an interpolated 4 component pixel.
*
* This function computes a full 4 component pixel in one call. This helps a
* bit speedwise as the interpolation coordinates are supposed to be the same
* for all components. Thus we can share the horizontal and vertical
* interpolation kernels across all components.
*
* NB: a pixel is to be four floats big in stride
*
* NB: the implementation reads a square of (2*itor->width) pixels per side
* that starts (itor->width-1) pixels left of and above in, so the caller must
* make sure this whole area lies inside the input buffer.
*
* NB: in and out are accessed as whole SSE vectors; presumably both have to
* be 16 byte aligned -- TODO confirm against the buffer allocation.
*
* @param itor interpolator to be used
* @param in Input pixel the interpolation footprint is laid around
* @param out Output pixel (four floats are written here, even though the
*            pointer is declared const)
* @param x X-Coordinate of the requested pixel
* @param y Y-Coordinate of the requested pixel
* @param linestride Stride in floats (not bytes) for a complete line
*
*/
void
dt_interpolation_compute_pixel4c(
const struct dt_interpolation* itor,
const float* in,
const float* out,
const float x, const float y,
const int linestride);
/** Get an interpolator from type
* @param type Interpolator to search for
* @return requested interpolator or default if not found (this function can't fail)
......
......@@ -388,6 +388,8 @@ void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void
const int ch = piece->colors;
const int ch_width = ch*roi_in->width;
assert(ch == 4);
// only crop, no rot fast and sharp path:
if(!d->flags && d->angle == 0.0 && d->all_off && roi_in->width == roi_out->width && roi_in->height == roi_out->height)
{
......@@ -450,8 +452,7 @@ void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void
if(ii >= (interpolation->width-1) && jj >= (interpolation->width-1) && ii < roi_in->width-interpolation->width && jj < roi_in->height-interpolation->width)
{
const float *in = ((float *)ivoid) + ch*(roi_in->width*jj+ii);
for(int c=0; c<3; c++,in++)
out[c] = dt_interpolation_compute_sample(interpolation, in, po[0], po[1], ch, ch_width);
dt_interpolation_compute_pixel4c(interpolation, in, out, po[0], po[1], ch_width);
}
else for(int c=0; c<3; c++) out[c] = 0.0f;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment