8 #ifndef META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
9 #define META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
13 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
// Template header parameterized by the number of channels (tChannels).
// NOTE(review): the declaration line of the templated type itself is not visible in this listing.
40 template <
unsigned int tChannels>
// Declaration of the per-channel-count patch interpolation helper.
// tPatchSize is the edge length (in pixels) of the square patch.
//  - imageTopLeft: pointer to the top-left element of the patch inside the image
//  - imageStrideElements: elements between the starts of two consecutive image rows
//    (the definitions address the row below as imageTopLeft + imageStrideElements)
//  - buffer: destination receiving the interpolated patch
//  - factorRight, factorBottom: interpolation weights; the definitions assert a range of [0, 128]
//    and derive the left/top weights as 128 - factor
55 template <
unsigned int tPatchSize>
56 static inline void interpolateSquarePatch8BitPerChannel(
const uint8_t* imageTopLeft,
const unsigned int imageStrideElements, uint8_t* buffer,
const unsigned int factorRight,
const unsigned int factorBottom);
// Declaration of the position-based patch interpolation entry point.
//  - tChannels: number of channels (the definition static-asserts >= 1)
//  - tPatchSize: edge length of the square patch (the definition static-asserts an odd value)
//  - tPixelCenter: pixel center convention, defaults to PC_TOP_LEFT
//  - TScalar: scalar type of the position, defaults to Scalar
//  - image: pointer to the image data
//  - width: image width in pixels
//  - imagePaddingElements: padding at the end of each row; the definition computes the
//    row stride as width * tChannels + imagePaddingElements
//  - buffer: destination receiving the interpolated patch
//  - position: sub-pixel position of the patch (the definition subtracts tPatchSize / 2
//    to obtain the top-left corner, so this is the patch center)
74 template <
unsigned int tChannels,
unsigned int tPatchSize, PixelCenter tPixelCenter = PC_TOP_LEFT,
typename TScalar = Scalar>
75 static inline void interpolateSquarePatch8BitPerChannel(
const uint8_t*
const image,
const unsigned int width,
const unsigned int imagePaddingElements, uint8_t* buffer,
const VectorT2<TScalar>& position);
// Body of the patch interpolation that processes one byte per pixel (every pixel uses its
// direct right neighbor at offset +1), i.e. the single-channel case.
// NOTE(review): the function signature, braces and several control lines (e.g. `else`,
// `if (isLastBlock)`) are not visible in this listing; comments below flag such gaps.
79 template <
unsigned int tPatchSize>
// Preconditions: minimal patch size, valid pointers, sufficiently large stride, weights in [0, 128].
82 ocean_assert(tPatchSize >= 5u);
84 ocean_assert(imageTopLeft !=
nullptr && buffer !=
nullptr);
85 ocean_assert(imageStrideElements >= 1u * tPatchSize);
87 ocean_assert(factorRight <= 128u && factorBottom <= 128u);
// Complementary weights so that left + right == 128 and top + bottom == 128.
89 const unsigned int factorLeft = 128u - factorRight;
90 const unsigned int factorTop = 128u - factorBottom;
// Compile-time partition of one patch row:
//  - as many blocks of 15 pixels as fit,
//  - one "partial block 15" if more than 10 pixels remain (handles all of them at once),
//  - otherwise one block of 7 pixels if at least 7 remain,
//  - one "partial block 7" if at least 3 pixels remain afterwards,
//  - the rest (0..2 pixels) is handled one pixel at a time.
92 constexpr
unsigned int blocks15 = tPatchSize / 15u;
93 constexpr
unsigned int remainingAfterBlocks15 = tPatchSize % 15u;
95 constexpr
bool partialBlock15 = remainingAfterBlocks15 > 10u;
96 constexpr
unsigned int remainingAfterPartialBlock15 = partialBlock15 ? 0u : remainingAfterBlocks15;
98 constexpr
bool block7 = remainingAfterPartialBlock15 >= 7u;
99 constexpr
unsigned int remainingAfterBlock7 = remainingAfterPartialBlock15 % 7u;
101 constexpr
bool partialBlock7 = remainingAfterBlock7 >= 3u;
102 constexpr
unsigned int remainingAfterPartialBlock7 = partialBlock7 ? 0u : remainingAfterBlock7;
104 constexpr
unsigned int blocks1 = remainingAfterPartialBlock7;
// Combined 2D weights for the top row; each is at most 128 * 128 = 16384 and fits into 16 bits.
106 const unsigned int factorTopLeft = factorTop * factorLeft;
107 const unsigned int factorTopRight = factorTop * factorRight;
// Every 32-bit lane holds (left weight in the low 16 bits, right weight in the high 16 bits),
// matching the (pixel, right neighbor) 16-bit pairs fed to _mm_madd_epi16 below.
110 const __m128i factorsTop_u_16x8 = _mm_set1_epi32(
int(factorTopLeft) |
int(factorTopRight) << 16);
// Combined 2D weights for the bottom row, packed the same way.
112 const unsigned int factorBottomLeft = factorBottom * factorLeft;
113 const unsigned int factorBottomRight = factorBottom * factorRight;
116 const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(
int(factorBottomLeft) |
int(factorBottomRight) << 16);
// Iterate over all patch rows.
118 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full blocks: 16 source bytes per row yield 15 interpolated pixels.
123 for (
unsigned int x = 0u; x < blocks15; ++x)
// Unaligned loads of 16 bytes from the current row and the row below.
125 const __m128i top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
126 const __m128i bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
// Widen bytes 0..7 to 16 bit: pairs (p0,p1), (p2,p3), ... -> even output pixels 0, 2, 4, 6.
130 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
131 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
// Same after shifting by one byte: pairs (p1,p2), (p3,p4), ... -> odd output pixels 1, 3, 5, 7.
134 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
135 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
// Widen bytes 8..15 for the even output pixels of the upper half.
139 const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
140 const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
// Shifted upper half for the odd output pixels; the last pair contains a shifted-in zero byte,
// which is why only 15 of the 16 results are valid.
143 const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
144 const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
// pixel * leftWeight + neighbor * rightWeight, summed into one 32-bit lane per output pixel.
148 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
149 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
151 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
152 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
155 const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
156 const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
158 const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
159 const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
// Sum top and bottom contributions, add 8192 for rounding and divide by 16384 (>> 14).
162 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
163 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
// Re-interleave: even results stay in the low 16 bits, odd results move to the high 16 bits of each lane.
165 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
168 const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
169 const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
171 const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
// Narrow the sixteen 16-bit results to bytes (unsigned saturation).
176 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
// True only for the very last write of the whole patch (last row, last block, nothing following).
179 const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks15) && (!block7 && !partialBlock7 && blocks1 == 0u);
// Exactly 15 bytes are copied here versus a full 16-byte store below.
// NOTE(review): presumably selected by isLastBlock so the final write stays inside the buffer;
// the branching lines and the pointer advances for this block are not visible in this listing.
183 memcpy(buffer, &result_u_8x16, 15);
187 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
// Partial block: handles the remaining 11..14 pixels of the row with one 16-byte wide computation.
194 if constexpr (partialBlock15)
196 ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
// NOTE(review): the declaration of top_u_8x16 used below is not visible in this listing.
199 __m128i bottom_u_8x16;
// For all rows but the last one, a full 16-byte load is used.
201 if (y < tPatchSize - 1u)
203 top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
204 bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
// Copies only the bytes actually needed (remaining pixels plus one right neighbor).
// NOTE(review): presumably the fallback branch for the last row; the `else` line is not visible.
208 memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlocks15 + 1u);
209 memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlocks15 + 1u);
// Same widening / multiply-add / rounding / packing sequence as in the full 15-pixel block.
214 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
215 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
218 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
219 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
223 const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
224 const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
227 const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
228 const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
232 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
233 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
235 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
236 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
239 const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
240 const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
242 const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
243 const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
246 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
247 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
249 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
252 const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
253 const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
255 const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
260 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
// The partial block is always the last block of a row, so only the row index decides.
262 ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
263 const bool isLastBlock = y + 1u == tPatchSize;
// Exact-size copy versus full 16-byte store.
// NOTE(review): presumably selected by isLastBlock; the branching lines are not visible.
267 memcpy(buffer, &result_u_8x16, remainingAfterBlocks15);
271 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
// Advance source and destination by the number of pixels handled.
274 imageTopLeft += remainingAfterBlocks15;
275 buffer += remainingAfterBlocks15;
// Block of 7 pixels: 8 source bytes per row (64-bit load) yield 7 interpolated pixels.
278 if constexpr (block7)
280 const __m128i top_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft));
281 const __m128i bottom_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft + imageStrideElements));
// Only the lower 8 bytes carry data, so just the "low" half of the pipeline is needed.
285 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
286 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
289 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
290 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
294 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
295 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
297 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
298 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
301 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
302 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
304 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
// Pack to bytes; the upper 8 bytes of the result are zero.
307 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
// Last write of the patch only if nothing follows this block in the last row.
309 const bool isLastBlock = (y + 1u == tPatchSize) && (!partialBlock7 && blocks1 == 0u);
// Exactly 7 bytes are copied here versus an 8-byte store below.
// NOTE(review): presumably selected by isLastBlock; the branching lines and the pointer
// advances for this block are not visible in this listing.
313 memcpy(buffer, &result_u_8x16, 7);
317 _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
// Partial block of 7: handles the remaining 3..6 pixels with one 8-byte wide computation.
324 if constexpr (partialBlock7)
326 ocean_assert(blocks1 == 0u);
// NOTE(review): the declaration of top_u_8x16 used below is not visible in this listing.
329 __m128i bottom_u_8x16;
// For all rows but the last one, a full 8-byte load is used.
331 if (y < tPatchSize - 1u)
333 top_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft));
334 bottom_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft + imageStrideElements));
// Copies only the bytes actually needed (remaining pixels plus one right neighbor).
// NOTE(review): presumably the fallback branch for the last row; the `else` line is not visible.
338 memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlock7 + 1u);
339 memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlock7 + 1u);
// Same low-half pipeline as in the 7-pixel block.
344 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
345 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
348 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
349 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
353 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
354 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
356 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
357 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
360 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
361 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
363 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
366 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
// The partial block is always the last block of a row, so only the row index decides.
368 ocean_assert(blocks1 == 0u);
369 const bool isLastBlock = y + 1u == tPatchSize;
// Exact-size copy versus 8-byte store.
// NOTE(review): presumably selected by isLastBlock; the branching lines are not visible.
373 memcpy(buffer, &result_u_8x16, remainingAfterBlock7);
377 _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
// Advance source and destination by the number of pixels handled.
380 imageTopLeft += remainingAfterBlock7;
381 buffer += remainingAfterBlock7;
// Scalar fallback for the last 1..2 pixels of a row.
384 if constexpr (blocks1 != 0u)
386 const uint8_t*
const imageBottomLeft = imageTopLeft + imageStrideElements;
388 for (
unsigned int n = 0u; n < blocks1; ++n)
// Weighted sum of the 2x2 neighborhood, rounded (+8192) and normalized by 128 * 128 = 16384.
390 buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[1u + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[1u + n] * factorBottomRight + 8192u) / 16384u);
// NOTE(review): no matching advance of `buffer` is visible in this listing — confirm against the full file.
393 imageTopLeft += blocks1;
// Move from the end of the patch row to the start of the next patch row.
397 imageTopLeft += imageStrideElements - tPatchSize;
// Body of the patch interpolation for interleaved data with three bytes per pixel (every
// value uses the value three bytes to its right as neighbor), i.e. the 3-channel case.
// NOTE(review): the function signature, braces and several control lines (e.g. `else`,
// `if (isLastBlock)`, the condition opening the partial block) are not visible in this listing.
402 template <
unsigned int tPatchSize>
// Preconditions: valid pointers, stride check, weights in [0, 128].
405 ocean_assert(imageTopLeft !=
nullptr && buffer !=
nullptr);
// NOTE(review): this checks against 1u * tPatchSize although each patch row spans
// tPatchSize * 3u elements (see the row advance at the end of the loop) — confirm intent.
406 ocean_assert(imageStrideElements >= 1u * tPatchSize);
408 ocean_assert(factorRight <= 128u && factorBottom <= 128u);
// Complementary weights so that left + right == 128 and top + bottom == 128.
410 const unsigned int factorLeft = 128u - factorRight;
411 const unsigned int factorTop = 128u - factorBottom;
// Compile-time partition of one patch row:
//  - as many blocks of 4 pixels as fit,
//  - one partial block if 2 or 3 pixels remain,
//  - otherwise the rest (0 or 1 pixel) is handled with scalar code.
413 constexpr
unsigned int blocks4 = tPatchSize / 4u;
414 constexpr
unsigned int remainingAfterBlocks4 = tPatchSize % 4u;
416 constexpr
bool partialBlock4 = remainingAfterBlocks4 >= 2u;
417 constexpr
unsigned int remainingAfterPartialBlock4 = partialBlock4 ? 0u : remainingAfterBlocks4;
419 constexpr
unsigned int blocks1 = remainingAfterPartialBlock4;
// Combined 2D weights (each at most 128 * 128 = 16384), packed per 32-bit lane as
// (left weight in the low 16 bits, right weight in the high 16 bits) for _mm_madd_epi16.
421 const unsigned int factorTopLeft = factorTop * factorLeft;
422 const unsigned int factorTopRight = factorTop * factorRight;
425 const __m128i factorsTop_u_16x8 = _mm_set1_epi32(
int(factorTopLeft) |
int(factorTopRight) << 16);
427 const unsigned int factorBottomLeft = factorBottom * factorLeft;
428 const unsigned int factorBottomRight = factorBottom * factorRight;
431 const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(
int(factorBottomLeft) |
int(factorBottomRight) << 16);
// Iterate over all patch rows.
433 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full blocks: 15 source bytes (4 pixels plus one neighbor pixel) yield 4 interpolated pixels.
438 for (
unsigned int x = 0u; x < blocks4; ++x)
// A plain 16-byte load is used unless this is the last block of the last row.
440 const bool canReadLastElements = y < tPatchSize - 1u || x < blocks4 - 1u;
// NOTE(review): the declaration of top_u_8x16 used below is not visible in this listing.
443 __m128i bottom_u_8x16;
445 if (canReadLastElements)
447 top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
448 bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
// Loads starting one byte earlier and shifts the register right by one byte, so the load
// ends at the 15th needed byte instead of one byte behind it.
// NOTE(review): presumably the fallback branch; the `else` line is not visible.
452 top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft - 1)), 1);
453 bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements - 1u)), 1);
// De-interleave per channel and widen to 16 bit in one shuffle (0xFF selects a zero byte).
// Channel 0: byte pairs (0,3), (3,6), (6,9), (9,12) = (pixel, right neighbor) for 4 pixels.
461 const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16,
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
462 const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16,
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
// Channel 1: byte pairs (1,4), (4,7), (7,10), (10,13).
465 const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16,
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
466 const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16,
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
// Channel 2: byte pairs (2,5), (5,8), (8,11), (11,14).
469 const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16,
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
470 const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16,
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
// value * leftWeight + neighbor * rightWeight, one 32-bit lane per pixel and channel.
474 const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
475 const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
476 const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
478 const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
479 const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
480 const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
// Sum top and bottom contributions, add 8192 for rounding and divide by 16384 (>> 14).
483 const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
484 const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
485 const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
// Re-interleave: the lowest byte of each 32-bit result is moved to its output position
// (channel 0 -> bytes 0,3,6,9; channel 1 -> bytes 1,4,7,10; channel 2 -> bytes 2,5,8,11).
490 const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4,
SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
491 const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4,
SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
492 const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4,
SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
// Merge the three channels: 12 valid bytes, the upper 4 bytes are zero.
495 const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
// True for the last full block of the last row when no partial block follows
// (blocks1 is 0 or 1 here, so `blocks1 <= 1u` always holds).
497 const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks4) && (!partialBlock4 && blocks1 <= 1u);
// Exactly 12 bytes are copied via a temporary versus a full 16-byte store below.
// NOTE(review): presumably selected by isLastBlock so the write stays inside the buffer;
// the branching lines and the pointer advances for this block are not visible in this listing.
501 uint8_t tempBuffer[16];
502 _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
504 memcpy(buffer, tempBuffer, 12);
508 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
// Partial block: handles the remaining 2..3 pixels of the row.
// NOTE(review): the condition opening this section is not visible in this listing.
517 const bool canReadLastElements = y < tPatchSize - 1u;
// NOTE(review): the declaration of top_u_8x16 used below is not visible in this listing.
520 __m128i bottom_u_8x16;
522 if (canReadLastElements)
524 top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
525 bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
// Number of bytes by which a 16-byte load would exceed the needed data
// (remaining pixels plus one neighbor pixel, three bytes each).
529 constexpr
unsigned int overlappingElements = 16u - (remainingAfterBlocks4 * 3u + 3u);
530 ocean_assert(overlappingElements < 16u);
// Loads starting that many bytes earlier and shifts the register right accordingly,
// so the load ends exactly at the last needed byte.
// NOTE(review): presumably the fallback branch for the last row; the `else` line is not visible.
532 top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft - overlappingElements)), overlappingElements);
533 bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements - overlappingElements)), overlappingElements);
// Same de-interleave / multiply-add / rounding / re-interleave sequence as in the full block.
541 const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16,
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
542 const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16,
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
545 const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16,
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
546 const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16,
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
549 const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16,
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
550 const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16,
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
554 const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
555 const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
556 const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
558 const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
559 const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
560 const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
563 const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
564 const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
565 const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
570 const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4,
SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
571 const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4,
SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
572 const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4,
SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
575 const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
// The partial block is always the last block of a row, so only the row index decides.
577 ocean_assert(blocks1 == 0u);
579 const bool isLastBlock = y + 1u == tPatchSize;
// Exact-size copy via a temporary versus full 16-byte store.
// NOTE(review): presumably selected by isLastBlock; the branching lines are not visible.
583 uint8_t tempBuffer[16];
584 _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
586 memcpy(buffer, tempBuffer, remainingAfterBlocks4 * 3u);
590 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
// Advance source and destination by the number of elements handled.
593 imageTopLeft += remainingAfterBlocks4 * 3u;
594 buffer += remainingAfterBlocks4 * 3u;
// Scalar fallback for a single remaining pixel of a row.
597 if constexpr (blocks1 != 0u)
599 const uint8_t*
const imageBottomLeft = imageTopLeft + imageStrideElements;
601 for (
unsigned int n = 0u; n < blocks1; ++n)
603 for (
unsigned int c = 0u; c < 3u; ++c)
// Weighted sum of the 2x2 neighborhood (neighbor is 3 elements to the right),
// rounded (+8192) and normalized by 128 * 128 = 16384.
605 buffer[n * 3u + c] = uint8_t((imageTopLeft[n * 3u + c] * factorTopLeft + imageTopLeft[n * 3u + 3u + c] * factorTopRight + imageBottomLeft[n * 3u + c] * factorBottomLeft + imageBottomLeft[n * 3u + 3u + c] * factorBottomRight + 8192u) / 16384u);
609 imageTopLeft += blocks1 * 3u;
610 buffer += blocks1 * 3u;
// Move from the end of the patch row to the start of the next patch row.
613 imageTopLeft += imageStrideElements - tPatchSize * 3u;
// Body of the generic (scalar) patch interpolation for an arbitrary channel count tChannels.
// NOTE(review): the function signature and braces are not visible in this listing.
617 template <
unsigned int tChannels>
618 template <
unsigned int tPatchSize>
// Preconditions: valid pointers, stride check, weights in [0, 128].
621 ocean_assert(imageTopLeft !=
nullptr && buffer !=
nullptr);
// NOTE(review): this checks against 1u * tPatchSize although each patch row spans
// tChannels * tPatchSize elements (see the row advance below) — confirm intent.
622 ocean_assert(imageStrideElements >= 1u * tPatchSize);
624 ocean_assert(factorRight <= 128u && factorBottom <= 128u);
// Complementary weights so that left + right == 128 and top + bottom == 128.
626 const unsigned int factorLeft = 128u - factorRight;
627 const unsigned int factorTop = 128u - factorBottom;
// Combined 2D weights; the four products sum up to 128 * 128 = 16384.
629 const unsigned int factorTopLeft = factorTop * factorLeft;
630 const unsigned int factorTopRight = factorTop * factorRight;
632 const unsigned int factorBottomLeft = factorBottom * factorLeft;
633 const unsigned int factorBottomRight = factorBottom * factorRight;
// Pointer to the image row directly below the current one.
635 const uint8_t* imageBottomLeft = imageTopLeft + imageStrideElements;
// Iterate over rows, pixels and channels of the patch.
637 for (
unsigned int y = 0u; y < tPatchSize; ++y)
639 for (
unsigned int x = 0u; x < tPatchSize; ++x)
641 for (
unsigned int n = 0u; n < tChannels; ++n)
// Weighted sum of the 2x2 neighborhood (neighbor is tChannels elements to the right),
// rounded (+8192) and normalized by 16384.
643 buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[tChannels + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[tChannels + n] * factorBottomRight + 8192u) / 16384u);
// Advance both source rows to the next pixel.
// NOTE(review): no matching advance of `buffer` is visible in this listing — confirm against the full file.
646 imageTopLeft += tChannels;
647 imageBottomLeft += tChannels;
// Move both source rows from the end of the patch row to the start of the next patch row.
652 imageTopLeft += imageStrideElements - tChannels * tPatchSize;
653 imageBottomLeft += imageStrideElements - tChannels * tPatchSize;
// Body of the position-based entry point: converts a sub-pixel position into the top-left
// patch pointer and the two integer interpolation weights.
// NOTE(review): the function signature, the definition of `shiftedPosition` and whatever
// follows the last visible statement (presumably the call of the per-channel helper) are
// not visible in this listing.
657 template <
unsigned int tChannels,
unsigned int tPatchSize, PixelCenter tPixelCenter,
typename TScalar>
// Compile-time checks: at least one channel and an odd patch size.
660 static_assert(tChannels >= 1u,
"Invalid channel number!");
661 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
663 ocean_assert(image !=
nullptr && buffer !=
nullptr);
// The image must be at least one pixel wider than the patch (a right neighbor is always read).
664 ocean_assert(tPatchSize + 1u <= width);
// Half patch size (rounded down), i.e. the distance from the center pixel to the patch border.
666 constexpr
unsigned int tPatchSize_2 = tPatchSize / 2u;
// Row stride in elements, including the optional padding at the end of each row.
668 const unsigned int imageStrideElements = width * tChannels + imagePaddingElements;
// The patch must not leave the image at the left/top, and needs one extra pixel at the right.
// NOTE(review): presumably shiftedPosition is `position` adjusted for tPixelCenter — confirm.
672 ocean_assert(shiftedPosition.
x() >= TScalar(tPatchSize_2) && shiftedPosition.
y() >= TScalar(tPatchSize_2));
673 ocean_assert(shiftedPosition.
x() < TScalar(width - tPatchSize_2 - 1u));
// Integer pixel coordinates of the top-left corner of the patch.
675 const unsigned int left = (
unsigned int)(shiftedPosition.
x()) - tPatchSize_2;
676 const unsigned int top = (
unsigned int)(shiftedPosition.
y()) - tPatchSize_2;
678 ocean_assert(left + tPatchSize < width);
// Fractional horizontal offset, converted to a rounded weight in [0, 128].
680 const TScalar tx = shiftedPosition.
x() - TScalar(
int(shiftedPosition.
x()));
681 ocean_assert(tx >= TScalar(0) && tx <= TScalar(1));
682 const unsigned int factorRight = (
unsigned int)(tx * TScalar(128) + TScalar(0.5));
// Fractional vertical offset, converted to a rounded weight in [0, 128].
684 const TScalar ty = shiftedPosition.
y() - TScalar(
int(shiftedPosition.
y()));
685 ocean_assert(ty >= 0 && ty <= 1);
686 const unsigned int factorBottom = (
unsigned int)(ty * TScalar(128) + TScalar(0.5));
// Pointer to the first element of the top-left patch pixel.
688 const uint8_t*
const imageTopLeft = image + top * imageStrideElements + left * tChannels;
This class allows functions to be specialized for individual channel counts.
Definition: AdvancedFrameInterpolatorBilinearSSE.h:42
static void interpolateSquarePatch8BitPerChannel(const uint8_t *imageTopLeft, const unsigned int imageStrideElements, uint8_t *buffer, const unsigned int factorRight, const unsigned int factorBottom)
Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and sto...
Definition: AdvancedFrameInterpolatorBilinearSSE.h:619
This class implements advanced bilinear frame interpolation functions using SSE extensions.
Definition: AdvancedFrameInterpolatorBilinearSSE.h:33
static void interpolateSquarePatch8BitPerChannel(const uint8_t *const image, const unsigned int width, const unsigned int imagePaddingElements, uint8_t *buffer, const VectorT2< TScalar > &position)
Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and sto...
Definition: AdvancedFrameInterpolatorBilinearSSE.h:658
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128-bit integer value from two 64-bit values.
Definition: SSE.h:3770
This class implements a vector with two elements.
Definition: Vector2.h:96
const T & x() const noexcept
Returns the x value.
Definition: Vector2.h:698
const T & y() const noexcept
Returns the y value.
Definition: Vector2.h:710
@ PC_TOP_LEFT
The center of a pixel is in the upper-left corner of each pixel's square.
Definition: CV.h:133
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15