8 #ifndef META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
9 #define META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
15 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
37 template <
unsigned int tChannels>
/**
 * Declaration: determines the mean value(s) of an 8-bit buffer holding tPixels pixels.
 * NOTE(review): the enclosing class is not visible in this excerpt; presumably this is a member of a helper
 * that is parameterized by the channel count -- confirm.
 * @param buffer The buffer elements to average, must be valid
 * @param meanValues Receives the resulting mean value(s), must be valid
 * @tparam tPixels The number of pixels in the buffer
 */
49 template <
unsigned int tPixels>
50 static inline void mean8BitPerChannel(
const uint8_t*
const buffer, uint8_t*
const meanValues);
/**
 * Declaration: determines the mean value(s) of a square 8-bit image patch with tPatchSize rows and columns.
 * NOTE(review): the enclosing class is not visible in this excerpt; presumably this is a member of a helper
 * that is parameterized by the channel count -- confirm.
 * @param patch The first element of the patch's first row, must be valid
 * @param patchStrideElements The number of elements between the starts of two consecutive patch rows
 * @param meanValues Receives the resulting mean value(s), must be valid
 * @tparam tPatchSize The edge length of the patch, in pixels
 */
59 template <
unsigned int tPatchSize>
60 static inline void mean8BitPerChannel(
const uint8_t* patch,
const unsigned int patchStrideElements, uint8_t*
const meanValues);
/**
 * Declaration: compares two 8-bit buffers of tPixels pixels after subtracting the given mean values.
 * Presumably returns the zero-mean sum of square differences (the header guard names this file
 * ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE) -- confirm against the definition's return statement.
 * @param buffer0 The first buffer, must be valid
 * @param buffer1 The second buffer, must be valid
 * @param meanValues0 The mean value(s) of the first buffer, must be valid
 * @param meanValues1 The mean value(s) of the second buffer, must be valid
 * @tparam tPixels The number of pixels in each buffer
 */
71 template <
unsigned int tPixels>
72 static inline uint32_t
buffer8BitPerChannel(
const uint8_t*
const buffer0,
const uint8_t*
const buffer1,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1);
/**
 * Declaration: compares two square 8-bit image patches, each given by a row pointer and a stride, together with
 * mean values for both patches.
 * NOTE(review): the definition is not visible in this excerpt; presumably the result is the zero-mean sum of
 * square differences between both patches -- confirm.
 * @param patch0 The first patch
 * @param patch1 The second patch
 * @param patch0StrideElements The row stride of the first patch, in elements
 * @param patch1StrideElements The row stride of the second patch, in elements
 * @param meanValues0 The mean value(s) belonging to the first patch
 * @param meanValues1 The mean value(s) belonging to the second patch
 * @tparam tPatchSize The edge length of both patches, in pixels
 */
85 template <
unsigned int tPatchSize>
86 static inline uint32_t
patch8BitPerChannel(
const uint8_t* patch0,
const uint8_t* patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1);
/**
 * Declaration: compares two 8-bit buffers with tChannels channels and tPixels pixels each.
 * NOTE(review): the definition is not visible in this excerpt; presumably the mean values are determined
 * internally and the zero-mean sum of square differences is returned -- confirm.
 * @param buffer0 The first buffer
 * @param buffer1 The second buffer
 * @tparam tChannels The number of channels per pixel
 * @tparam tPixels The number of pixels in each buffer
 */
99 template <
unsigned int tChannels,
unsigned int tPixels>
100 static inline uint32_t
buffer8BitPerChannel(
const uint8_t*
const buffer0,
const uint8_t*
const buffer1);
/**
 * Declaration: compares two square 8-bit image patches with tChannels channels, each given by a pointer and a
 * row stride.
 * NOTE(review): the definition is not visible in this excerpt; presumably the mean values are determined
 * internally and the zero-mean sum of square differences is returned -- confirm.
 * @param patch0 The first patch
 * @param patch1 The second patch
 * @param patch0StrideElements The row stride of the first patch, in elements
 * @param patch1StrideElements The row stride of the second patch, in elements
 * @tparam tChannels The number of channels per pixel
 * @tparam tPatchSize The edge length of both patches, in pixels
 */
112 template <
unsigned int tChannels,
unsigned int tPatchSize>
113 static inline uint32_t
patch8BitPerChannel(
const uint8_t*
const patch0,
const uint8_t*
const patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements);
/**
 * Declaration: compares a square 8-bit image patch (pointer plus row stride) with a second input that has no
 * stride parameter.
 * NOTE(review): the definition is not visible in this excerpt; presumably buffer1 is a contiguous buffer holding
 * tPatchSize * tPatchSize pixels and the result is the zero-mean sum of square differences -- confirm.
 * @param patch0 The image patch
 * @param buffer1 The second input, without stride
 * @param patch0StrideElements The row stride of the patch, in elements
 * @tparam tChannels The number of channels per pixel
 * @tparam tPatchSize The edge length of the patch, in pixels
 */
124 template <
unsigned int tChannels,
unsigned int tPatchSize>
125 static inline uint32_t
patchBuffer8BitPerChannel(
const uint8_t*
const patch0,
const uint8_t*
const buffer1,
const unsigned int patch0StrideElements);
/**
 * Declaration: force-inlined mean computation for an 8-bit buffer, with the channel count and the pixel count
 * both given as template parameters.
 * NOTE(review): the definition is not visible in this excerpt; presumably it forwards to the channel-specific
 * implementation -- confirm.
 * @param buffer The buffer elements to average
 * @param meanValues Receives the resulting mean value(s)
 * @tparam tChannels The number of channels per pixel
 * @tparam tPixels The number of pixels in the buffer
 */
134 template <
unsigned int tChannels,
unsigned int tPixels>
135 static OCEAN_FORCE_INLINE
void mean8BitPerChannel(
const uint8_t*
const buffer, uint8_t*
const meanValues);
/**
 * Declaration: force-inlined mean computation for a square 8-bit image patch, with the channel count and the
 * patch size both given as template parameters.
 * NOTE(review): the definition is not visible in this excerpt; presumably it forwards to the channel-specific
 * implementation -- confirm.
 * @param patch The image patch
 * @param patchStrideElements The row stride of the patch, in elements
 * @param meanValues Receives the resulting mean value(s)
 * @tparam tChannels The number of channels per pixel
 * @tparam tPatchSize The edge length of the patch, in pixels
 */
145 template <
unsigned int tChannels,
unsigned int tPatchSize>
146 static OCEAN_FORCE_INLINE
void mean8BitPerChannel(
const uint8_t*
const patch,
const unsigned int patchStrideElements, uint8_t*
const meanValues);
// Mean value of a 1-channel 8-bit buffer with tPixels elements, computed with SSE.
// NOTE(review): in this excerpt the signature line, the braces, the pointer advances of the full-block steps and
// the statement defining `sum` are not visible; the comments below describe the visible statements only.
150 template <
unsigned int tPixels>
153 static_assert(tPixels >= 8u,
"Invalid buffer size!");
155 constexpr
unsigned int tChannels = 1u;
157 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
159 constexpr
unsigned int bufferElements = tChannels * tPixels;
// Compile-time decomposition of the buffer: full 16-element blocks, followed by exactly one tail strategy
// (partial 16-block for 9..15 leftover elements, full 8-block for 8, partial 8-block for 3..7, single elements for 0..2).
161 constexpr
unsigned int blocks16 = bufferElements / 16u;
162 constexpr
unsigned int remainingAfterBlocks16 = bufferElements % 16u;
164 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u;
166 constexpr
bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
168 constexpr
bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
170 constexpr
unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
172 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Vector accumulator for the block sums and a scalar accumulator for the single-element tail.
174 __m128i sum_128i = _mm_setzero_si128();
176 uint32_t sumIndividual = 0u;
178 for (
unsigned int n = 0u; n < blocks16; ++n)
180 const __m128i buffer_128i = _mm_lddqu_si128((
const __m128i*)buffer);
// _mm_sad_epu8 against zero yields the sum of the bytes of each 8-byte half.
182 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
187 if constexpr (partialBlock16)
189 constexpr
unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
191 static_assert(overlapElements < 8u,
"Invalid value!");
// The 16-byte load starts overlapElements bytes before `buffer`; the right shift removes those leading bytes
// so that only the remaining elements contribute to the sum.
// NOTE(review): for tPixels in 9..15 there is no full 16-block, so this load starts before the buffer's first
// element -- confirm that callers guarantee readable memory there.
193 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer - overlapElements)), overlapElements);
195 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
197 buffer += remainingAfterBlocks16;
200 if constexpr (fullBlock8)
// An 8-byte load leaves the upper half of the register zero, so the upper half adds nothing to the sum.
202 const __m128i buffer_128i = _mm_loadl_epi64((
const __m128i*)buffer);
204 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
209 if constexpr (partialBlock8)
211 constexpr
unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
213 static_assert(overlapElements < 8u,
"Invalid value!");
// Same overlap technique with an 8-byte load: start earlier and shift the leading bytes out.
215 const __m128i buffer_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer - overlapElements)), overlapElements);
217 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
219 buffer += remainingAfterBlocks16;
222 if constexpr (blocks1 != 0u)
224 for (
unsigned int n = 0u; n < blocks1; ++n)
226 sumIndividual += buffer[n];
// Rounded integer division: adding half of the divisor before dividing rounds to the nearest value.
232 meanValues[0] = uint8_t((sum + tPixels / 2u) / tPixels);
// Per-channel sums of a 3-channel (interleaved) 8-bit buffer with tPixels pixels, computed with SSE.
// NOTE(review): in this excerpt the signature line, the braces, the statements producing `channel0`, `channel1`,
// `channel2`, `channel01_128i` and `channel2_128i`, the pointer advances of the block loops and the final write to
// `meanValues` are not visible; the comments below describe the visible statements only.
236 template <
unsigned int tPixels>
239 static_assert(tPixels >= 8u,
"Invalid buffer size!");
241 constexpr
unsigned int tChannels = 3u;
243 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
245 constexpr
unsigned int bufferElements = tChannels * tPixels;
// Compile-time decomposition: full 48-element blocks (16 pixels) first. If more than 32 elements remain, one
// overlapping 48-element block covers all of them; otherwise the remainder is split into at most one 24-, one
// 21- and one 15-element block, and whatever is left is handled pixel by pixel.
247 constexpr
unsigned int blocks48 = bufferElements / 48u;
248 constexpr
unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
250 constexpr
bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
252 constexpr
unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
254 constexpr
unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
256 constexpr
unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
258 constexpr
unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
260 constexpr
unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
262 constexpr
unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
264 constexpr
unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
266 constexpr
unsigned int blocks1 = remainingAfterPartialBlock15;
// The scalar tail must consist of whole pixels.
268 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
// One vector accumulator per channel, plus scalar accumulators for the pixel-wise tail.
270 __m128i sumChannel0_128i = _mm_setzero_si128();
271 __m128i sumChannel1_128i = _mm_setzero_si128();
272 __m128i sumChannel2_128i = _mm_setzero_si128();
274 uint32_t sumIndividual[3] = {0u};
276 for (
unsigned int n = 0u; n < blocks48; ++n)
278 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 0));
279 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 16));
280 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 32));
// _mm_sad_epu8 against zero yields the sum of the bytes of each 8-byte half.
287 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
288 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
289 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
294 if constexpr (partialBlock48)
296 constexpr
int overlappingElements = int(48u - remainingAfterFullBlocks48);
// The three registers form a 48-byte window that starts overlappingElements bytes before `buffer`: the first
// register is loaded at `buffer` and shifted left (its lowest overlappingElements bytes become zero), the other
// two are loaded overlappingElements bytes earlier than their regular offsets.
298 const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(buffer)), overlappingElements);
299 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(buffer - overlappingElements + 16));
300 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(buffer - overlappingElements + 32));
307 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
308 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
309 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
311 buffer += remainingAfterFullBlocks48;
314 for (
unsigned int n = 0u; n < blocks24; ++n)
// 24 elements: one 16-byte load followed by one 8-byte load.
316 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 0));
317 const __m128i bufferB_128i = _mm_loadl_epi64((
const __m128i*)(buffer + 16));
319 __m128i channel01_128i;
320 __m128i channel2_128i;
// The SAD result holds one sum per 8-byte half. The left shift by 8 bytes keeps only the lower-half sum, the
// right shift by 8 bytes keeps only the upper-half sum (presumably channel 0 sits in the lower and channel 1 in
// the upper half of channel01_128i -- confirm against the statement that fills it).
323 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
325 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
326 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
327 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
332 for (
unsigned int n = 0u; n < blocks21; ++n)
// 21 elements: the 8-byte load starts 3 bytes early (at offset 13) and the shift removes those 3 bytes, leaving
// elements 16..20 followed by zeros.
334 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 0));
335 const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer + 16 - 3)), 3);
337 __m128i channel01_128i;
338 __m128i channel2_128i;
341 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
343 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
344 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
345 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
350 for (
unsigned int n = 0u; n < blocks15; ++n)
// 15 elements: the 16-byte load starts one byte early and the shift removes that byte.
352 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer - 1)), 1);
354 __m128i channel01_128i;
355 __m128i channel2_128i;
358 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
360 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
361 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
362 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
367 if constexpr (blocks1 != 0u)
// Scalar tail: accumulate the remaining whole pixels channel by channel.
369 constexpr
unsigned int pixels = blocks1 / 3u;
371 for (
unsigned int x = 0u; x < pixels; ++x)
373 for (
unsigned int n = 0u; n < 3u; ++n)
375 sumIndividual[n] += buffer[x * 3u + n];
// Generic (scalar) mean values of an interleaved 8-bit buffer with tChannels channels and tPixels pixels.
// NOTE(review): the signature line and the braces are not visible in this excerpt.
387 template <
unsigned int tChannels>
388 template <
unsigned int tPixels>
391 static_assert(tChannels >= 1u,
"Invalid channel number!");
392 static_assert(tPixels >= 1u,
"Invalid buffer size!");
394 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
// One 32-bit accumulator per channel, all initialized to zero.
396 uint32_t sum[tChannels] = {0u};
398 for (
unsigned int n = 0u; n < tPixels; ++n)
400 for (
unsigned int c = 0u; c < tChannels; ++c)
// Element c of pixel n in the interleaved layout.
402 sum[c] += buffer[n * tChannels + c];
406 for (
unsigned int c = 0u; c < tChannels; ++c)
// Rounded integer division: adding half of the divisor before dividing rounds to the nearest value.
408 meanValues[c] = uint8_t((sum[c] + tPixels / 2u) / tPixels);
// Mean value of a square 1-channel 8-bit image patch (tPatchSize x tPatchSize), computed row by row with SSE.
// NOTE(review): in this excerpt the signature line, the braces, the `else` keywords between the paired loads, the
// pointer advances of the full-block steps and the statement defining `sum` are not visible; the comments below
// describe the visible statements only.
413 template <
unsigned int tPatchSize>
416 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
418 constexpr
unsigned int tChannels = 1u;
420 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
422 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
424 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// Compile-time decomposition of one patch row: full 16-element blocks, followed by exactly one tail strategy
// (partial 16-block for 9..15 leftover elements, full 8-block for 8, partial 8-block for 3..7, single elements for 0..2).
426 constexpr
unsigned int blocks16 = patchWidthElements / 16u;
427 constexpr
unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
429 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u;
431 constexpr
bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
433 constexpr
bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
435 constexpr
unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
437 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Vector accumulator for the block sums and a scalar accumulator for the single-element tail.
439 __m128i sum_128i = _mm_setzero_si128();
441 uint32_t sumIndividual = 0u;
443 for (
unsigned int y = 0u; y < tPatchSize; ++y)
447 for (
unsigned int n = 0u; n < blocks16; ++n)
449 const __m128i buffer_128i = _mm_lddqu_si128((
const __m128i*)patch);
// _mm_sad_epu8 against zero yields the sum of the bytes of each 8-byte half.
451 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
456 if constexpr (fullBlock8)
458 const __m128i buffer_128i = _mm_loadl_epi64((
const __m128i*)patch);
460 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
465 if constexpr (partialBlock16)
467 constexpr
unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
469 static_assert(overlapElements < 8u,
"Invalid value!");
// All rows but the last: load 16 bytes starting at `patch` and shift left, which drops the overlapElements
// bytes located behind the row's remaining elements.
471 if (y < tPatchSize - 1u)
473 const __m128i buffer_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch), overlapElements);
475 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
// NOTE(review): the `else` belonging to the `if` above is not visible here. Presumably this is the last-row
// variant, which starts the load overlapElements bytes earlier and shifts right instead, so that no memory
// behind the patch is read -- confirm.
479 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch - overlapElements)), overlapElements);
481 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
484 patch += remainingAfterBlocks16;
487 if constexpr (partialBlock8)
489 constexpr
unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
491 static_assert(overlapElements < 8u,
"Invalid value!");
// All rows but the last: the 8 loaded bytes sit in the lower half of the register, so shifting left by
// overlapElements + 8 bytes keeps exactly the row's remaining elements.
493 if (y < tPatchSize - 1u)
495 const __m128i buffer_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch), overlapElements + 8);
497 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
// NOTE(review): the `else` belonging to the `if` above is not visible here. Presumably this is the last-row
// variant, which starts the 8-byte load overlapElements bytes earlier and shifts right -- confirm.
501 const __m128i buffer_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch - overlapElements)), overlapElements);
503 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
506 patch += remainingAfterBlocks16;
509 if constexpr (blocks1 != 0u)
511 for (
unsigned int n = 0u; n < blocks1; ++n)
513 sumIndividual += patch[n];
// Skip the gap between the end of this patch row and the start of the next one.
519 patch += patchStrideElements - patchWidthElements;
// Rounded integer division by the number of patch pixels.
524 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Per-channel mean values of a square 3-channel (interleaved) 8-bit image patch, computed row by row with SSE.
// NOTE(review): in this excerpt the signature line, the braces, the statements producing `channel0`, `channel1`,
// `channel2`, `channel01_128i` and `channel2_128i` and the pointer advances of the block loops are not visible;
// the comments below describe the visible statements only.
528 template <
unsigned int tPatchSize>
531 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
533 constexpr
unsigned int tChannels = 3u;
535 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
537 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
539 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// Compile-time decomposition of one patch row: full 48-element blocks (16 pixels) first. If more than 32 elements
// remain, one overlapping 48-element block covers all of them; otherwise the remainder is split into at most one
// 24-, one 21- and one 15-element block, and whatever is left is handled pixel by pixel.
541 constexpr
unsigned int blocks48 = patchWidthElements / 48u;
542 constexpr
unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
544 constexpr
bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
546 constexpr
unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
548 constexpr
unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
550 constexpr
unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
552 constexpr
unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
554 constexpr
unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
556 constexpr
unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
558 constexpr
unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
560 constexpr
unsigned int blocks1 = remainingAfterPartialBlock15;
// The scalar tail must consist of whole pixels.
562 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
// One vector accumulator per channel, plus scalar accumulators for the pixel-wise tail.
564 __m128i sumChannel0_128i = _mm_setzero_si128();
565 __m128i sumChannel1_128i = _mm_setzero_si128();
566 __m128i sumChannel2_128i = _mm_setzero_si128();
568 uint32_t sumIndividual[3] = {0u};
570 for (
unsigned int y = 0u; y < tPatchSize; ++y)
574 for (
unsigned int n = 0u; n < blocks48; ++n)
576 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(patch + 0));
577 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(patch + 16));
578 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(patch + 32));
// _mm_sad_epu8 against zero yields the sum of the bytes of each 8-byte half.
585 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
586 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
587 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
592 if constexpr (partialBlock48)
594 constexpr
int overlappingElements = int(48u - remainingAfterFullBlocks48);
// The three registers form a 48-byte window that starts overlappingElements bytes before `patch`: the first
// register is loaded at `patch` and shifted left (its lowest overlappingElements bytes become zero), the other
// two are loaded overlappingElements bytes earlier than their regular offsets.
596 const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(patch)), overlappingElements);
597 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(patch - overlappingElements + 16));
598 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(patch - overlappingElements + 32));
605 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
606 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
607 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
609 patch += remainingAfterFullBlocks48;
612 for (
unsigned int n = 0u; n < blocks24; ++n)
// 24 elements: one 16-byte load followed by one 8-byte load.
614 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(patch + 0));
615 const __m128i bufferB_128i = _mm_loadl_epi64((
const __m128i*)(patch + 16));
617 __m128i channel01_128i;
618 __m128i channel2_128i;
// The SAD result holds one sum per 8-byte half. The left shift by 8 bytes keeps only the lower-half sum, the
// right shift by 8 bytes keeps only the upper-half sum (presumably channel 0 sits in the lower and channel 1 in
// the upper half of channel01_128i -- confirm against the statement that fills it).
621 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
623 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
624 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
625 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
630 for (
unsigned int n = 0u; n < blocks21; ++n)
// 21 elements: the 8-byte load starts 3 bytes early (at offset 13) and the shift removes those 3 bytes, leaving
// elements 16..20 followed by zeros.
632 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(patch + 0));
633 const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch + 16 - 3)), 3);
635 __m128i channel01_128i;
636 __m128i channel2_128i;
639 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
641 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
642 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
643 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
648 for (
unsigned int n = 0u; n < blocks15; ++n)
// 15 elements: all rows but the last load 16 bytes starting at `patch` (one byte more than needed); the last
// row starts one byte earlier and shifts that byte out, presumably so that no memory behind the patch is read.
650 const __m128i buffer_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((
const __m128i*)(patch)) : _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch - 1)), 1);
652 __m128i channel01_128i;
653 __m128i channel2_128i;
656 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
658 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
659 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
660 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
665 if constexpr (blocks1 != 0u)
// Scalar tail: accumulate the remaining whole pixels channel by channel.
667 constexpr
unsigned int pixels = blocks1 / 3u;
669 for (
unsigned int x = 0u; x < pixels; ++x)
671 for (
unsigned int n = 0u; n < 3u; ++n)
673 sumIndividual[n] += patch[x * 3u + n];
// Skip the gap between the end of this patch row and the start of the next one.
680 patch += patchStrideElements - patchWidthElements;
// Combine vector and scalar sums per channel and divide by the number of patch pixels with rounding.
// NOTE(review): SSE::sum_u32_first_third is defined elsewhere; presumably it adds the first and third 32-bit
// lanes, which is where _mm_sad_epu8 places its two partial sums -- confirm.
683 meanValues[0] = uint8_t((
SSE::sum_u32_first_third(sumChannel0_128i) + sumIndividual[0] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
684 meanValues[1] = uint8_t((
SSE::sum_u32_first_third(sumChannel1_128i) + sumIndividual[1] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
685 meanValues[2] = uint8_t((
SSE::sum_u32_first_third(sumChannel2_128i) + sumIndividual[2] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Generic (scalar) mean values of a square interleaved 8-bit image patch with tChannels channels.
// NOTE(review): the signature line and the braces are not visible in this excerpt.
688 template <
unsigned int tChannels>
689 template <
unsigned int tPatchSize>
692 static_assert(tChannels >= 1u,
"Invalid channel number!");
693 static_assert(tPatchSize >= 1u,
"Invalid patch size!");
695 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
697 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
// One 32-bit accumulator per channel, all initialized to zero.
699 uint32_t sum[tChannels] = {0u};
701 for (
unsigned int y = 0u; y < tPatchSize; ++y)
703 for (
unsigned int x = 0u; x < tPatchSize; ++x)
705 for (
unsigned int n = 0u; n < tChannels; ++n)
// Element n of pixel x in the current (interleaved) row.
707 sum[n] += patch[x * tChannels + n];
// Advance to the start of the next patch row.
711 patch += patchStrideElements;
714 for (
unsigned int n = 0u; n < tChannels; ++n)
// Rounded integer division by the number of patch pixels.
716 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Accumulates the squared zero-mean differences between two 1-channel 8-bit buffers with tPixels elements:
// for every element, ((buffer0[i] - meanValues0[0]) - (buffer1[i] - meanValues1[0]))^2 is added to the sums.
// NOTE(review): in this excerpt the signature line, the braces, the pointer advances of the full-block steps and
// the return statement are not visible; the comments below describe the visible statements only.
//
// Fix compared to the previous version: the 8-byte tail paths (fullBlock8, partialBlock8) no longer evaluate the
// upper register half. After an 8-byte load that half is zero, so every upper lane evaluated to
// (0 - mean0) - (0 - mean1) = mean1 - mean0 and inflated the result by 8 * (mean1 - mean0)^2 whenever the two
// mean values differ. In partialBlock8 the zero-filled lanes of the lower half are now shifted out as well.
721 template <
unsigned int tPixels>
724 static_assert(tPixels >= 8u,
"Invalid pixel number!");
726 constexpr
unsigned int tChannels = 1u;
728 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
729 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
731 constexpr
unsigned int bufferElements = tChannels * tPixels;
// Compile-time decomposition of the buffers: full 16-element blocks, followed by exactly one tail strategy
// (partial 16-block for 9..15 leftover elements, full 8-block for 8, partial 8-block for 3..7, single elements for 0..2).
733 constexpr
unsigned int blocks16 = bufferElements / 16u;
734 constexpr
unsigned int remainingAfterBlocks16 = bufferElements % 16u;
736 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u;
738 constexpr
bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
740 constexpr
bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
742 constexpr
unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
744 static_assert(blocks1 <= 2u,
"Invalid block size!");
746 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Every 16-bit lane holds the byte pair (0xFF, 0x01), i.e. the signed factors (-1, +1). Applied with
// _mm_maddubs_epi16 to interleaved (mean, value) byte pairs this yields `value - mean` as a signed 16-bit lane.
748 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
750 const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
751 const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
// Two vector accumulators (lower and upper register halves) and a scalar accumulator for the single-element tail.
753 __m128i sum0_128i = _mm_setzero_si128();
754 __m128i sum1_128i = _mm_setzero_si128();
756 uint32_t sumIndividual = 0u;
758 for (
unsigned int n = 0u; n < blocks16; ++n)
760 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)buffer0);
761 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)buffer1);
// Signed differences (buffer0 - mean0) - (buffer1 - mean1) for the lower and the upper 8 elements
// (despite the variable names these are not absolute values).
763 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
764 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
// _mm_madd_epi16(x, x) squares every lane and adds neighboring squares into 32-bit lanes.
766 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
767 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
773 if constexpr (partialBlock16)
775 constexpr
unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
777 static_assert(overlapElements < 8u,
"Invalid value!");
// The 16-byte loads start overlapElements bytes before the current position; the right shift removes those
// leading bytes and zero-fills the top overlapElements bytes of the registers.
// NOTE(review): for tPixels in 9..15 there is no full 16-block, so these loads start before the buffers' first
// elements -- confirm that callers guarantee readable memory there.
779 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0 - overlapElements)), overlapElements);
780 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1 - overlapElements)), overlapElements);
782 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
// The zero-filled bytes would contribute (mean1 - mean0); shifting left by overlapElements 16-bit lanes
// discards exactly those top lanes.
783 const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
785 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
786 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
788 buffer0 += remainingAfterBlocks16;
789 buffer1 += remainingAfterBlocks16;
792 if constexpr (fullBlock8)
// Only 8 bytes are loaded, so only the lower register half holds data; the (zero) upper half must not be
// evaluated because its lanes would each contribute (mean1 - mean0)^2.
794 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)buffer0);
795 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)buffer1);
797 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
800 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
807 if constexpr (partialBlock8)
809 constexpr
unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
811 static_assert(overlapElements < 8u,
"Invalid value!");
// The 8-byte loads start overlapElements bytes early; after the right shift the valid elements occupy the
// lowest (8 - overlapElements) bytes and everything above is zero.
813 const __m128i buffer0_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer0 - overlapElements)), overlapElements);
814 const __m128i buffer1_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer1 - overlapElements)), overlapElements);
// Only the lower half holds data; shifting left by overlapElements 16-bit lanes discards the zero-filled
// top lanes of that half. The upper half is entirely zero-filled and therefore not evaluated.
816 const __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
819 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
822 buffer0 += remainingAfterBlocks16;
823 buffer1 += remainingAfterBlocks16;
826 if constexpr (blocks1 != 0u)
828 for (
unsigned int n = 0u; n < blocks1; ++n)
// Scalar tail, using the same zero-mean difference as the vector paths.
830 sumIndividual +=
sqrDistance(buffer0[n] - meanValues0[0], buffer1[n] - meanValues1[0]);
841 template <
unsigned int tPixels>
844 static_assert(tPixels >= 5u,
"Invalid pixel number!");
846 constexpr
unsigned int tChannels = 3u;
848 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
849 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
851 constexpr
unsigned int bufferElements = tChannels * tPixels;
853 constexpr
unsigned int blocks48 = bufferElements / 48u;
854 constexpr
unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
856 constexpr
bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
858 constexpr
unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
860 constexpr
unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
862 constexpr
unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
864 constexpr
unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
866 constexpr
unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
868 constexpr
unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
870 constexpr
unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
872 constexpr
unsigned int blocks1 = remainingAfterPartialBlock15;
874 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
876 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
878 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
880 const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
881 const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
882 const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
884 const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
885 const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
886 const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
888 __m128i sum0_128i = _mm_setzero_si128();
889 __m128i sum1_128i = _mm_setzero_si128();
891 uint32_t sumIndividual = 0u;
893 for (
unsigned int n = 0u; n < blocks48; ++n)
895 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 0));
896 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 16));
897 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 32));
899 __m128i channel0_0_128i;
900 __m128i channel0_1_128i;
901 __m128i channel0_2_128i;
904 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 0));
905 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 16));
906 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 32));
908 __m128i channel1_0_128i;
909 __m128i channel1_1_128i;
910 __m128i channel1_2_128i;
913 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
914 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
916 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
917 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
919 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
920 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
922 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
923 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
925 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
926 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
928 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
929 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
935 if constexpr (partialBlock48)
937 constexpr
int overlappingElements = int(48u - remainingAfterFullBlocks48);
938 constexpr
int overlappingPixels = overlappingElements / int(tChannels);
940 const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0)), overlappingElements);
941 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 - overlappingElements + 16));
942 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 - overlappingElements + 32));
944 __m128i channel0_0_128i;
945 __m128i channel0_1_128i;
946 __m128i channel0_2_128i;
949 const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1)), overlappingElements);
950 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 - overlappingElements + 16));
951 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 - overlappingElements + 32));
953 __m128i channel1_0_128i;
954 __m128i channel1_1_128i;
955 __m128i channel1_2_128i;
958 __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2);
959 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
961 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
962 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
964 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
965 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
967 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
968 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
970 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
971 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
973 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
974 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
976 buffer0 += remainingAfterFullBlocks48;
977 buffer1 += remainingAfterFullBlocks48;
980 for (
unsigned int n = 0u; n < blocks24; ++n)
982 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 0));
983 const __m128i buffer0B_128i = _mm_loadl_epi64((
const __m128i*)(buffer0 + 16));
985 __m128i channel0_01_128i;
986 __m128i channel0_2_128i;
989 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 0));
990 const __m128i buffer1B_128i = _mm_loadl_epi64((
const __m128i*)(buffer1 + 16));
992 __m128i channel1_01_128i;
993 __m128i channel1_2_128i;
996 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i));
997 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
999 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1000 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1002 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1004 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1010 for (
unsigned int n = 0u; n < blocks21; ++n)
1012 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 0));
1013 const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer0 + 16 - 3)), 3);
1015 __m128i channel0_01_128i;
1016 __m128i channel0_2_128i;
1019 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 0));
1020 const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer1 + 16 - 3)), 3);
1022 __m128i channel1_01_128i;
1023 __m128i channel1_2_128i;
1026 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2);
1027 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1029 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1030 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1032 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1034 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1040 for (
unsigned int n = 0u; n < blocks15; ++n)
1042 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0 - 1)), 1);
1044 __m128i channel0_01_128i;
1045 __m128i channel0_2_128i;
1048 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1 - 1)), 1);
1050 __m128i channel1_01_128i;
1051 __m128i channel1_2_128i;
1054 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6);
1055 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1057 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1058 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1060 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1062 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1068 if constexpr (blocks1 != 0u)
1070 constexpr
unsigned int pixels = blocks1 / 3u;
1072 for (
unsigned int x = 0u; x < pixels; ++x)
1074 for (
unsigned int n = 0u; n < 3u; ++n)
1076 sumIndividual +=
sqrDistance(buffer0[x * 3u + n] - meanValues0[n], buffer1[x * 3u + n] - meanValues1[n]);
// Generic fallback of buffer8BitPerChannel (variant taking explicit mean values) for an arbitrary
// channel count: visits both buffers element by element and accumulates in `ssd` the result of
// sqrDistance() applied to the mean-corrected elements (element minus the mean of its channel).
// NOTE(review): this listing omits the signature line, the braces and the declaration/return of
// `ssd`; confirm them against the full header.
1087 template <
unsigned int tChannels>
1088 template <
unsigned int tPixels>
1091 static_assert(tChannels >= 1u,
"Invalid channel number!");
1092 static_assert(tPixels >= 1u,
"Invalid patch size!");
1094 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
1095 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// The buffers are indexed as interleaved data: element index = pixel * tChannels + channel,
// while meanValues0/meanValues1 are indexed by channel only.
1099 for (
unsigned int n = 0u; n < tPixels; ++n)
1101 for (
unsigned int c = 0u; c < tChannels; ++c)
1103 ssd +=
sqrDistance(buffer0[n * tChannels + c] - meanValues0[c], buffer1[n * tChannels + c] - meanValues1[c]);
1111 template <
unsigned int tPatchSize>
1114 static_assert(tPatchSize >= 1u,
"Invalid patch size!");
1116 constexpr
unsigned int tChannels = 1u;
1118 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1119 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
1121 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1122 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1124 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
1126 constexpr
unsigned int blocks16 = patchWidthElements / 16u;
1127 constexpr
unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
1129 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u;
1131 constexpr
bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
1133 constexpr
bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
1135 constexpr
unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
1137 static_assert(blocks1 <= 2u,
"Invalid block size!");
1139 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
1141 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
1143 const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
1144 const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
1146 __m128i sum0_128i = _mm_setzero_si128();
1147 __m128i sum1_128i = _mm_setzero_si128();
1149 uint32_t sumIndividual = 0u;
1151 for (
unsigned int y = 0u; y < tPatchSize; ++y)
1156 for (
unsigned int n = 0u; n < blocks16; ++n)
1158 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)patch0);
1159 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)patch1);
1161 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1162 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1164 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1165 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1171 if constexpr (fullBlock8)
1173 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)patch0);
1174 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)patch1);
1176 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1177 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1179 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1180 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1186 if constexpr (partialBlock16)
1188 constexpr
unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
1190 static_assert(overlapElements < 8u,
"Invalid value!");
1192 if (y < tPatchSize - 1u)
1194 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)patch0);
1195 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)patch1);
1197 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1198 const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1200 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1201 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1205 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)(patch0 - overlapElements));
1206 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)(patch1 - overlapElements));
1208 const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1209 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1211 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1212 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1215 patch0 += remainingAfterBlocks16;
1216 patch1 += remainingAfterBlocks16;
1219 if constexpr (partialBlock8)
1221 constexpr
unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
1223 static_assert(overlapElements < 8u,
"Invalid value!");
1225 if (y < tPatchSize - 1u)
1227 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)patch0);
1228 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)patch1);
1230 const __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1232 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1236 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)(patch0 - overlapElements));
1237 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)(patch1 - overlapElements));
1239 const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1241 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1244 patch0 += remainingAfterBlocks16;
1245 patch1 += remainingAfterBlocks16;
1248 if constexpr (blocks1 != 0u)
1250 for (
unsigned int n = 0u; n < blocks1; ++n)
1252 sumIndividual +=
sqrDistance(patch0[n] - meanValues0[0], patch1[n] - meanValues1[0]);
1259 patch0 += patch0StrideElements - patchWidthElements;
1260 patch1 += patch1StrideElements - patchWidthElements;
// 3-channel variant of patch8BitPerChannel (tChannels is fixed to 3 below): accumulates, over the
// tPatchSize rows of two interleaved 3-channel patches, the squares of
// (patch0 - mean0[c]) - (patch1 - mean1[c]) with SSE, using one mean value per channel c.
// NOTE(review): this listing omits the signature line, the braces, the de-interleave calls that
// fill the channel registers, several pointer increments and the return statement; only the
// visible statements are documented here.
1267 template <
unsigned int tPatchSize>
1270 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1272 constexpr
unsigned int tChannels = 3u;
1274 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1275 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
1277 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1278 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1280 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// Compile-time split of one patch row: full 48-element blocks (16 pixels) first; if more than 32
// elements remain they are handled by one partial 48-element block, otherwise the remainder is
// split into 24-element blocks (8 pixels), a 21-element block (7 pixels), a 15-element block
// (5 pixels) and finally individually handled pixels.
1282 constexpr
unsigned int blocks48 = patchWidthElements / 48u;
1283 constexpr
unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
1285 constexpr
bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
1287 constexpr
unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
1289 constexpr
unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
1291 constexpr
unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
1293 constexpr
unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
1295 constexpr
unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
1297 constexpr
unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
1299 constexpr
unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
1301 constexpr
unsigned int blocks1 = remainingAfterPartialBlock15;
// The individually handled elements must form whole 3-channel pixels.
1303 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
1305 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Every 16-bit lane holds the byte pair (0xFF, 0x01), i.e. the signed bytes (-1, +1).
// _mm_maddubs_epi16 applied to interleaved (mean, value) byte pairs therefore yields
// value - mean as a signed 16-bit number per lane.
1307 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// Per-channel mean values of both patches, each replicated to all 16 bytes of a register.
1309 const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
1310 const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
1311 const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
1313 const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
1314 const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
1315 const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
// Two independent accumulators, each with four 32-bit partial sums.
1317 __m128i sum0_128i = _mm_setzero_si128();
1318 __m128i sum1_128i = _mm_setzero_si128();
// Accumulator for the pixels handled without SSE (the blocks1 case).
1320 uint32_t sumIndividual = 0u;
1322 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full blocks: three unaligned 16-byte loads (48 interleaved elements) per patch and iteration.
1327 for (
unsigned int n = 0u; n < blocks48; ++n)
1329 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 0));
1330 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 16));
1331 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 32));
// Declared without initializer; presumably filled by a de-interleave call that this listing
// omits (deInterleave3Channel8Bit48Elements is referenced by this file) — TODO confirm.
1333 __m128i channel0_0_128i;
1334 __m128i channel0_1_128i;
1335 __m128i channel0_2_128i;
1338 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 0));
1339 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 16));
1340 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 32));
1342 __m128i channel1_0_128i;
1343 __m128i channel1_1_128i;
1344 __m128i channel1_2_128i;
// Channel 0: despite the name these are signed differences (v0 - mean0) - (v1 - mean1), eight
// 16-bit lanes for the lower and eight for the upper half of the channel registers.
1347 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1348 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
// _mm_madd_epi16(d, d) squares each lane and adds neighboring pairs into 32-bit sums.
1350 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1351 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Channel 1.
1353 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1354 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1356 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1357 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Channel 2.
1359 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1360 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1362 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1363 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// More than 32 elements left: treated as one 48-element block whose first overlappingElements
// bytes are zero padding, so that no byte outside the row is read.
1369 if constexpr (partialBlock48)
1371 constexpr
int overlappingElements = int(48u - remainingAfterFullBlocks48);
1372 constexpr
int overlappingPixels = overlappingElements / int(tChannels);
// Register A: the next 16 row bytes shifted up by overlappingElements bytes (zeros enter at the
// bottom). Registers B and C are loaded overlappingElements bytes earlier, so C ends with the row.
1374 const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(patch0)), overlappingElements);
1375 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(patch0 - overlappingElements + 16));
1376 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(patch0 - overlappingElements + 32));
// Presumably filled by a de-interleave call omitted from this listing — TODO confirm.
1378 __m128i channel0_0_128i;
1379 __m128i channel0_1_128i;
1380 __m128i channel0_2_128i;
1383 const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(patch1)), overlappingElements);
1384 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(patch1 - overlappingElements + 16));
1385 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(patch1 - overlappingElements + 32));
1387 __m128i channel1_0_128i;
1388 __m128i channel1_1_128i;
1389 __m128i channel1_2_128i;
// For each channel the first overlappingPixels 16-bit lanes of the low half are shifted out before
// squaring; presumably these are the lanes that stem from the zero padding (this depends on the
// lane order produced by the de-interleave call, which is not visible here).
1392 __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2);
1393 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1395 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1396 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1398 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
1399 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1401 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1402 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1404 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
1405 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1407 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1408 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1410 patch0 += remainingAfterFullBlocks48;
1411 patch1 += remainingAfterFullBlocks48;
// 24-element blocks (8 pixels): one 16-byte and one 8-byte load per patch.
1414 for (
unsigned int n = 0u; n < blocks24; ++n)
1416 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 0));
1417 const __m128i buffer0B_128i = _mm_loadl_epi64((
const __m128i*)(patch0 + 16));
// Presumably filled by a de-interleave call omitted from this listing
// (deInterleave3Channel8Bit24Elements is referenced by this file) — TODO confirm.
// Judging by the unpacklo/unpackhi usage below, channelX_01 carries channel 0 in its lower and
// channel 1 in its upper half.
1419 __m128i channel0_01_128i;
1420 __m128i channel0_2_128i;
1423 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 0));
1424 const __m128i buffer1B_128i = _mm_loadl_epi64((
const __m128i*)(patch1 + 16));
1426 __m128i channel1_01_128i;
1427 __m128i channel1_2_128i;
// Lower half paired with the channel-0 means, upper half with the channel-1 means.
1430 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i));
1431 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
1433 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1434 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Channel 2: only the lower half of the channel register is evaluated.
1436 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1438 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// 21-element block (7 pixels): the second load starts 3 bytes earlier and is shifted down by
// 3 bytes, so it does not read beyond the 21 elements.
1444 for (
unsigned int n = 0u; n < blocks21; ++n)
1446 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 0));
1447 const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch0 + 16 - 3)), 3);
// Presumably filled by a de-interleave call omitted from this listing — TODO confirm.
1449 __m128i channel0_01_128i;
1450 __m128i channel0_2_128i;
1453 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 0));
1454 const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch1 + 16 - 3)), 3);
1456 __m128i channel1_01_128i;
1457 __m128i channel1_2_128i;
// The byte shift by 2 drops the topmost 16-bit lane of each difference vector, so 7 lanes remain.
1460 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2);
1461 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1463 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1464 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1466 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1468 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// 15-element block (5 pixels) with a single 16-byte load: all rows but the last load forward
// (one byte more than needed); the last row loads one byte earlier and shifts it out, presumably
// to avoid reading behind the patch memory.
1474 for (
unsigned int n = 0u; n < blocks15; ++n)
1476 const __m128i buffer0_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((
const __m128i*)(patch0)) : _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch0 - 1)), 1);
// Presumably filled by a de-interleave call omitted from this listing
// (deInterleave3Channel8Bit15Elements is referenced by this file) — TODO confirm.
1478 __m128i channel0_01_128i;
1479 __m128i channel0_2_128i;
1482 const __m128i buffer1_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((
const __m128i*)(patch1)) : _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch1 - 1)), 1);
1484 __m128i channel1_01_128i;
1485 __m128i channel1_2_128i;
// The byte shift by 6 drops the three topmost 16-bit lanes of each difference vector, so 5 remain.
1488 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6);
1489 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1491 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1492 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1494 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1496 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Remaining whole pixels are handled one element at a time without SSE.
1502 if constexpr (blocks1 != 0u)
1504 constexpr
unsigned int pixels = blocks1 / 3u;
1506 for (
unsigned int x = 0u; x < pixels; ++x)
1508 for (
unsigned int n = 0u; n < 3u; ++n)
1510 sumIndividual +=
sqrDistance(patch0[x * 3u + n] - meanValues0[n], patch1[x * 3u + n] - meanValues1[n]);
// Skip the padding between the end of this patch row and the start of the next one.
1518 patch0 += patch0StrideElements - patchWidthElements;
1519 patch1 += patch1StrideElements - patchWidthElements;
// Generic fallback of patch8BitPerChannel (variant taking explicit mean values) for an arbitrary
// channel count: visits a tPatchSize x tPatchSize patch element by element and accumulates in
// `ssd` the result of sqrDistance() applied to the mean-corrected elements of both patches.
// NOTE(review): this listing omits the signature line, the braces and the declaration/return of
// `ssd`; confirm them against the full header.
1525 template <
unsigned int tChannels>
1526 template <
unsigned int tPatchSize>
1529 static_assert(tChannels >= 1u,
"Invalid channel number!");
1530 static_assert(tPatchSize >= 1u,
"Invalid patch size!");
1532 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1533 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Each row must be at least as long as the patch is wide.
1535 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1536 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1540 for (
unsigned int y = 0u; y < tPatchSize; ++y)
1542 for (
unsigned int x = 0u; x < tPatchSize; ++x)
1544 for (
unsigned int n = 0u; n < tChannels; ++n)
1546 ssd +=
sqrDistance(patch0[x * tChannels + n] - meanValues0[n], patch1[x * tChannels + n] - meanValues1[n]);
// The row pointers were not moved inside the row, so a full stride leads to the next row.
1550 patch0 += patch0StrideElements;
1551 patch1 += patch1StrideElements;
// buffer8BitPerChannel variant without mean arguments: determines the per-channel mean values of
// both buffers itself via mean8BitPerChannel<tChannels, tPixels>() and stores them in local arrays.
// NOTE(review): the signature line and the final statement are not visible in this listing;
// presumably the function forwards to the variant taking mean values — TODO confirm.
1557 template <
unsigned int tChannels,
unsigned int tPixels>
1560 static_assert(tChannels >= 1u,
"Invalid channel number!");
1561 static_assert(tPixels >= 8u,
"Invalid patch size!");
1563 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
// One mean value per channel for each buffer.
1565 uint8_t meanValues0[tChannels];
1566 mean8BitPerChannel<tChannels, tPixels>(buffer0, meanValues0);
1568 uint8_t meanValues1[tChannels];
1569 mean8BitPerChannel<tChannels, tPixels>(buffer1, meanValues1);
// patch8BitPerChannel variant without mean arguments: determines the per-channel mean values of
// both patches itself via mean8BitPerChannel<tChannels, tPatchSize>(), honoring each patch's
// row stride, and stores them in local arrays.
// NOTE(review): the signature line and the final statement are not visible in this listing;
// presumably the function forwards to the variant taking mean values — TODO confirm.
1574 template <
unsigned int tChannels,
unsigned int tPatchSize>
1577 static_assert(tChannels >= 1u,
"Invalid channel number!");
1578 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1580 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
// Each row must be at least as long as the patch is wide.
1582 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1583 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// One mean value per channel for each patch.
1585 uint8_t meanValues0[tChannels];
1586 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
1588 uint8_t meanValues1[tChannels];
1589 mean8BitPerChannel<tChannels, tPatchSize>(patch1, patch1StrideElements, meanValues1);
// patchBuffer8BitPerChannel: compares an image patch (patch0, with a row stride) against a
// buffer (buffer1). The mean of patch0 is determined with the strided mean function, the mean of
// buffer1 with the buffer mean function over tPatchSize * tPatchSize pixels.
// NOTE(review): the signature line and the final statement are not visible in this listing;
// presumably the function forwards to the patch variant taking mean values, using the
// patch1StrideElements constant defined at the end — TODO confirm.
1594 template <
unsigned int tChannels,
unsigned int tPatchSize>
1597 static_assert(tChannels >= 1u,
"Invalid channel number!");
1598 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1600 ocean_assert(patch0 !=
nullptr && buffer1 !=
nullptr);
1602 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1604 uint8_t meanValues0[tChannels];
1605 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
1607 uint8_t meanValues1[tChannels];
1608 mean8BitPerChannel<tChannels, tPatchSize * tPatchSize>(buffer1, meanValues1);
// The buffer is treated as a patch without padding: its row stride equals the patch width.
1610 constexpr
unsigned int patch1StrideElements = tChannels * tPatchSize;
// mean8BitPerChannel for a buffer (template arguments tChannels, tPixels).
// Only the compile-time checks are visible in this listing: at least one channel and at least
// 8 pixels are required. NOTE(review): the signature line and the remaining body are not visible
// here; confirm against the full header.
1615 template <
unsigned int tChannels,
unsigned int tPixels>
1618 static_assert(tChannels >= 1u,
"Invalid channel number!");
1619 static_assert(tPixels >= 8u,
"Invalid patch size!");
// mean8BitPerChannel for an image patch (template arguments tChannels, tPatchSize).
// Only the compile-time checks are visible in this listing: at least one channel and a patch size
// of at least 5 are required. NOTE(review): the signature line and the remaining body are not
// visible here; confirm against the full header.
1624 template <
unsigned int tChannels,
unsigned int tPatchSize>
1627 static_assert(tChannels >= 1u,
"Invalid channel number!");
1628 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of, e.g., an image with 3 channels and 8 bits per element.
Definition: SSE.h:3277
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
Definition: SSE.h:1340
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of, e.g., an image with 3 channels and 8 bits per element.
Definition: SSE.h:3289
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of, e.g., an image with 3 channels and 8 bits per element.
Definition: SSE.h:3304
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition: SSE.h:1322
This class allows to specialize functions for individual channels.
Definition: ZeroMeanSumSquareDifferencesSSE.h:39
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1527
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1089
static void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesSSE.h:389
This class implements functions to calculate zero-mean sum of square differences using SSE instructions.
Definition: ZeroMeanSumSquareDifferencesSSE.h:30
static uint32_t patchBuffer8BitPerChannel(const uint8_t *const patch0, const uint8_t *const buffer1, const unsigned int patch0StrideElements)
Returns the zero-mean sum of square differences between an image patch and a buffer.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1595
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1558
static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1616
static uint32_t patch8BitPerChannel(const uint8_t *const patch0, const uint8_t *const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1575
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15