239 static_assert(tPixels >= 8u,
"Invalid buffer size!");
241 constexpr unsigned int tChannels = 3u;
243 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
245 constexpr unsigned int bufferElements = tChannels * tPixels;
247 constexpr unsigned int blocks48 = bufferElements / 48u;
248 constexpr unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
250 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
252 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
254 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
256 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
258 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
260 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
262 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
264 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
266 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
268 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
270 __m128i sumChannel0_128i = _mm_setzero_si128();
271 __m128i sumChannel1_128i = _mm_setzero_si128();
272 __m128i sumChannel2_128i = _mm_setzero_si128();
274 uint32_t sumIndividual[3] = {0u};
276 for (
unsigned int n = 0u; n < blocks48; ++n)
278 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 0));
279 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 16));
280 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 32));
287 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
288 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
289 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
294 if constexpr (partialBlock48)
296 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
298 const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(buffer)), overlappingElements);
299 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(buffer - overlappingElements + 16));
300 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(buffer - overlappingElements + 32));
307 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
308 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
309 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
311 buffer += remainingAfterFullBlocks48;
314 for (
unsigned int n = 0u; n < blocks24; ++n)
316 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 0));
317 const __m128i bufferB_128i = _mm_loadl_epi64((
const __m128i*)(buffer + 16));
319 __m128i channel01_128i;
320 __m128i channel2_128i;
323 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
325 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
326 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
327 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
332 for (
unsigned int n = 0u; n < blocks21; ++n)
334 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(buffer + 0));
335 const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer + 16 - 3)), 3);
337 __m128i channel01_128i;
338 __m128i channel2_128i;
341 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
343 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
344 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
345 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
350 for (
unsigned int n = 0u; n < blocks15; ++n)
352 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer - 1)), 1);
354 __m128i channel01_128i;
355 __m128i channel2_128i;
358 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
360 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
361 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
362 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
367 if constexpr (blocks1 != 0u)
369 constexpr unsigned int pixels = blocks1 / 3u;
371 for (
unsigned int x = 0u; x < pixels; ++x)
373 for (
unsigned int n = 0u; n < 3u; ++n)
375 sumIndividual[n] += buffer[x * 3u + n];
// Fragment of a single-channel (tChannels = 1) patch mean computation: the bytes of a
// tPatchSize x tPatchSize patch are summed and the rounded mean is written to meanValues[0].
// NOTE(review): the function signature, braces, `else` keywords, pointer advances after full
// blocks and the definition of `sum` are not part of this listing; the comments below describe
// only the statements that are visible.
416 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
418 constexpr unsigned int tChannels = 1u;
420 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
// Each row must provide at least one full patch width of elements.
422 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
424 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// Compile-time split of one patch row: full 16-byte blocks first, then exactly one strategy for
// the remaining 0..15 elements.
426 constexpr unsigned int blocks16 = patchWidthElements / 16u;
427 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
// 9..15 remaining elements: one 16-byte load that overlaps neighboring bytes.
429 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
// Exactly 8 remaining elements: one 8-byte load.
431 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
// 3..7 remaining elements: one 8-byte load that overlaps neighboring bytes.
433 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
// 0..2 remaining elements: summed one by one without SIMD.
435 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
437 static_assert(blocks1 <= 2u,
"Invalid block size!");
// SIMD accumulator; _mm_sad_epu8 against zero yields the byte sums of the lower and the upper
// 8 bytes in the two 64-bit halves of the register.
439 __m128i sum_128i = _mm_setzero_si128();
// Scalar accumulator for the elements handled one by one.
441 uint32_t sumIndividual = 0u;
443 for (
unsigned int y = 0u; y < tPatchSize; ++y)
447 for (
unsigned int n = 0u; n < blocks16; ++n)
449 const __m128i buffer_128i = _mm_lddqu_si128((
const __m128i*)patch);
451 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
456 if constexpr (fullBlock8)
458 const __m128i buffer_128i = _mm_loadl_epi64((
const __m128i*)patch);
460 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
465 if constexpr (partialBlock16)
// Number of bytes in the 16-byte load that do not belong to the remaining row elements.
467 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
469 static_assert(overlapElements < 8u,
"Invalid value!");
// All rows but the last: load forward from `patch` and shift the register left so that the
// upper overlapElements bytes (those beyond the row's remaining elements) are dropped.
471 if (y < tPatchSize - 1u)
473 const __m128i buffer_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch), overlapElements);
475 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
// NOTE(review): no `else` is visible in this listing; presumably the next two statements are the
// last-row alternative. The load starts overlapElements bytes before `patch` and the right shift
// drops those leading bytes, so the 16-byte read ends with the row's remaining elements.
479 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch - overlapElements)), overlapElements);
481 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
484 patch += remainingAfterBlocks16;
487 if constexpr (partialBlock8)
// Number of bytes in the 8-byte load that do not belong to the remaining row elements.
489 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
491 static_assert(overlapElements < 8u,
"Invalid value!");
// All rows but the last: the 8 loaded bytes sit in the lower register half; shifting left by
// overlapElements + 8 bytes pushes the unwanted upper bytes of that load out of the register.
493 if (y < tPatchSize - 1u)
495 const __m128i buffer_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch), overlapElements + 8);
497 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
// NOTE(review): as above, the `else` is not visible; presumably the last-row alternative, loading
// overlapElements bytes earlier and shifting those leading bytes out.
501 const __m128i buffer_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch - overlapElements)), overlapElements);
503 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
506 patch += remainingAfterBlocks16;
509 if constexpr (blocks1 != 0u)
511 for (
unsigned int n = 0u; n < blocks1; ++n)
513 sumIndividual += patch[n];
// Skip the padding between the end of this row and the start of the next row.
519 patch += patchStrideElements - patchWidthElements;
// Rounded integer mean: adding half of the pixel count before dividing rounds to nearest.
// NOTE(review): `sum` is defined in a statement that is not part of this listing; presumably it
// combines sum_128i and sumIndividual — confirm.
524 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Fragment of a three-channel (tChannels = 3) patch mean computation: per-channel byte sums of a
// tPatchSize x tPatchSize patch are accumulated and the rounded means are written to
// meanValues[0..2].
// NOTE(review): the function signature, braces, pointer advances after full blocks and the
// de-interleave statements that fill the channel registers are not part of this listing; the
// comments below describe only the statements that are visible.
531 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
533 constexpr unsigned int tChannels = 3u;
535 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
// Each row must provide at least one full patch width of elements.
537 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
539 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// Compile-time split of one patch row into blocks of 48, 24, 21 and 15 elements (16, 8, 7 and
// 5 pixels); whatever is left is summed element by element.
541 constexpr unsigned int blocks48 = patchWidthElements / 48u;
542 constexpr unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
// More than 32 remaining elements are covered by one additional, overlapping 48-element block.
544 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
546 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
548 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
550 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
552 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
554 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
556 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
558 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
560 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
// The scalar tail must consist of whole pixels.
562 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
// One SIMD accumulator per channel.
564 __m128i sumChannel0_128i = _mm_setzero_si128();
565 __m128i sumChannel1_128i = _mm_setzero_si128();
566 __m128i sumChannel2_128i = _mm_setzero_si128();
// Scalar per-channel accumulators for the pixels handled one by one.
568 uint32_t sumIndividual[3] = {0u};
570 for (
unsigned int y = 0u; y < tPatchSize; ++y)
574 for (
unsigned int n = 0u; n < blocks48; ++n)
576 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(patch + 0));
577 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(patch + 16));
578 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(patch + 32));
// NOTE(review): channel0/channel1/channel2 are produced by statements that are not part of this
// listing; presumably a de-interleave of the three loaded registers.
585 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
586 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
587 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
592 if constexpr (partialBlock48)
// Number of elements by which a full 48-element read exceeds the remaining row elements.
594 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
// The first register is loaded at `patch` and shifted left, which zeroes its lowest
// overlappingElements bytes; the other two registers are loaded overlappingElements bytes
// earlier, so the three registers together end exactly with the row's remaining elements.
596 const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(patch)), overlappingElements);
597 const __m128i bufferB_128i = _mm_lddqu_si128((
const __m128i*)(patch - overlappingElements + 16));
598 const __m128i bufferC_128i = _mm_lddqu_si128((
const __m128i*)(patch - overlappingElements + 32));
605 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
606 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
607 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
609 patch += remainingAfterFullBlocks48;
612 for (
unsigned int n = 0u; n < blocks24; ++n)
614 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(patch + 0));
615 const __m128i bufferB_128i = _mm_loadl_epi64((
const __m128i*)(patch + 16));
// NOTE(review): both registers are declared without initializer and used below; the statement
// that fills them is not part of this listing. Presumably channel 0 ends up in the lower and
// channel 1 in the upper 8 bytes of channel01_128i — confirm.
617 __m128i channel01_128i;
618 __m128i channel2_128i;
621 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
// The 8-byte shifts move the lower-half sum into the upper 64 bits (left shift) and the
// upper-half sum into the lower 64 bits (right shift) before accumulating.
623 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
624 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
625 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
630 for (
unsigned int n = 0u; n < blocks21; ++n)
632 const __m128i bufferA_128i = _mm_lddqu_si128((
const __m128i*)(patch + 0));
// The 8-byte load starts 3 bytes early (at offset 13) so that it ends at element 20; the right
// shift by 3 bytes drops the bytes already contained in bufferA_128i.
633 const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch + 16 - 3)), 3);
635 __m128i channel01_128i;
636 __m128i channel2_128i;
639 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
641 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
642 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
643 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
648 for (
unsigned int n = 0u; n < blocks15; ++n)
// 15 elements are read with a 16-byte load: all rows but the last read forward from `patch`
// (one byte beyond the 15 elements); the last row reads one byte earlier and shifts that
// leading byte out.
// NOTE(review): in the forward case the 16th byte stays in the register; presumably the
// de-interleave step (not visible here) ignores it — confirm.
650 const __m128i buffer_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((
const __m128i*)(patch)) : _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch - 1)), 1);
652 __m128i channel01_128i;
653 __m128i channel2_128i;
656 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
658 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
659 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
660 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
665 if constexpr (blocks1 != 0u)
// Remaining whole pixels are summed per channel without SIMD.
667 constexpr unsigned int pixels = blocks1 / 3u;
669 for (
unsigned int x = 0u; x < pixels; ++x)
671 for (
unsigned int n = 0u; n < 3u; ++n)
673 sumIndividual[n] += patch[x * 3u + n];
// Skip the padding between the end of this row and the start of the next row.
680 patch += patchStrideElements - patchWidthElements;
// Rounded integer mean per channel: SIMD sum plus scalar sum, plus half of the pixel count so
// that the division rounds to nearest.
// NOTE(review): SSE::sum_u32_first_third is defined elsewhere; presumably it adds the first and
// third 32-bit lanes, which is where the accumulators above hold their partial sums — confirm.
683 meanValues[0] = uint8_t((
SSE::sum_u32_first_third(sumChannel0_128i) + sumIndividual[0] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
684 meanValues[1] = uint8_t((
SSE::sum_u32_first_third(sumChannel1_128i) + sumIndividual[1] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
685 meanValues[2] = uint8_t((
SSE::sum_u32_first_third(sumChannel2_128i) + sumIndividual[2] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Fragment of a single-channel (tChannels = 1) zero-mean sum of square differences between two
// buffers: sum over all elements of ((buffer0[i] - mean0) - (buffer1[i] - mean1))^2.
// NOTE(review): the function signature, braces, the pointer advances after full blocks and the
// final reduction/return are not part of this listing; the comments below describe only the
// statements that are visible.
724 static_assert(tPixels >= 8u,
"Invalid pixel number!");
726 constexpr unsigned int tChannels = 1u;
728 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
729 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
731 constexpr unsigned int bufferElements = tChannels * tPixels;
// Compile-time split of the buffer: full 16-byte blocks first, then exactly one strategy for the
// remaining 0..15 elements.
733 constexpr unsigned int blocks16 = bufferElements / 16u;
734 constexpr unsigned int remainingAfterBlocks16 = bufferElements % 16u;
// 9..15 remaining elements: one overlapping 16-byte load.
736 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
// Exactly 8 remaining elements: one 8-byte load.
738 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
// 3..7 remaining elements: one overlapping 8-byte load.
740 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
// 0..2 remaining elements: handled one by one without SIMD.
742 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
744 static_assert(blocks1 <= 2u,
"Invalid block size!");
746 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Per 16-bit lane the bytes are (0xFF, 0x01), i.e. (-1, +1) as signed bytes. Applied with
// _mm_maddubs_epi16 to interleaved (mean, value) byte pairs this yields `value - mean` as a
// signed 16-bit number.
748 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
750 const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
751 const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
// sum0 accumulates the squared differences of the lower 8 bytes, sum1 those of the upper 8 bytes.
753 __m128i sum0_128i = _mm_setzero_si128();
754 __m128i sum1_128i = _mm_setzero_si128();
756 uint32_t sumIndividual = 0u;
758 for (
unsigned int n = 0u; n < blocks16; ++n)
760 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)buffer0);
761 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)buffer1);
// Signed zero-mean differences (despite the variable name, no absolute value is taken; the sign
// vanishes when squaring below).
763 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
764 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
// _mm_madd_epi16(x, x) squares each 16-bit lane and adds neighboring pairs into 32-bit lanes.
766 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
767 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
773 if constexpr (partialBlock16)
// Number of bytes in the 16-byte load that do not belong to the remaining elements.
775 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
777 static_assert(overlapElements < 8u,
"Invalid value!");
// Load overlapElements bytes early and shift them out; the upper overlapElements bytes of the
// register are zero padding afterwards.
779 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0 - overlapElements)), overlapElements);
780 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1 - overlapElements)), overlapElements);
782 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
// The padding bytes would contribute (mean1 - mean0); shifting the 16-bit lanes left by
// overlapElements lanes removes exactly those top lanes.
783 const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
785 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
786 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
788 buffer0 += remainingAfterBlocks16;
789 buffer1 += remainingAfterBlocks16;
792 if constexpr (fullBlock8)
// _mm_loadl_epi64 fills only the lower 8 bytes; the upper 8 bytes are zero.
794 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)buffer0);
795 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)buffer1);
// Only the lower half carries data. The upper half is zero padding and would evaluate to
// (mean1 - mean0) in every lane, adding a spurious 8 * (mean1 - mean0)^2 to the result, so it is
// intentionally not accumulated.
797 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
800 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
807 if constexpr (partialBlock8)
// Number of bytes in the 8-byte load that do not belong to the remaining elements.
809 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
811 static_assert(overlapElements < 8u,
"Invalid value!");
// Load overlapElements bytes early and shift them out; valid data then occupies the lowest
// (8 - overlapElements) bytes, everything above is zero padding.
813 const __m128i buffer0_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer0 - overlapElements)), overlapElements);
814 const __m128i buffer1_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer1 - overlapElements)), overlapElements);
// All valid elements are in the lower half. Its top overlapElements lanes stem from padding and
// are removed by shifting the 16-bit lanes left; the upper half is padding only and is
// intentionally not accumulated (it would add (mean1 - mean0)^2 per padding lane).
816 const __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
819 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
822 buffer0 += remainingAfterBlocks16;
823 buffer1 += remainingAfterBlocks16;
826 if constexpr (blocks1 != 0u)
// At most two trailing elements are processed without SIMD.
828 for (
unsigned int n = 0u; n < blocks1; ++n)
830 sumIndividual +=
sqrDistance(buffer0[n] - meanValues0[0], buffer1[n] - meanValues1[0]);
// Fragment of a three-channel (tChannels = 3) zero-mean sum of square differences between two
// buffers: for every element, ((buffer0 - mean0[c]) - (buffer1 - mean1[c]))^2 is accumulated,
// with c being the element's channel.
// NOTE(review): the function signature, braces, the pointer advances after full blocks, the
// de-interleave statements that fill the channel registers and the final reduction/return are
// not part of this listing; the comments below describe only the statements that are visible.
// NOTE(review): for tPixels of 5 or 6 (15 or 18 elements) the only SIMD path taken is the
// 15-element block below, which loads from `buffer0 - 1` / `buffer1 - 1`, i.e. one byte before
// the pointers passed in. Confirm that callers guarantee readable memory there, or raise the
// lower bound of tPixels.
844 static_assert(tPixels >= 5u,
"Invalid pixel number!");
846 constexpr unsigned int tChannels = 3u;
848 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
849 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
851 constexpr unsigned int bufferElements = tChannels * tPixels;
// Compile-time split of the buffer into blocks of 48, 24, 21 and 15 elements (16, 8, 7 and
// 5 pixels); whatever is left is processed element by element.
853 constexpr unsigned int blocks48 = bufferElements / 48u;
854 constexpr unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
// More than 32 remaining elements are covered by one additional, overlapping 48-element block.
856 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
858 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
860 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
862 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
864 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
866 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
868 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
870 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
872 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
// The scalar tail must consist of whole pixels.
874 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
876 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Per 16-bit lane the bytes are (0xFF, 0x01), i.e. (-1, +1) as signed bytes. Applied with
// _mm_maddubs_epi16 to interleaved (mean, value) byte pairs this yields `value - mean` as a
// signed 16-bit number.
878 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// Per-channel means of both buffers, broadcast to all 16 bytes.
880 const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
881 const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
882 const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
884 const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
885 const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
886 const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
// Two independent SIMD accumulators (fed from lower and upper register halves respectively).
888 __m128i sum0_128i = _mm_setzero_si128();
889 __m128i sum1_128i = _mm_setzero_si128();
891 uint32_t sumIndividual = 0u;
893 for (
unsigned int n = 0u; n < blocks48; ++n)
895 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 0));
896 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 16));
897 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 32));
// NOTE(review): the channel registers are declared without initializer and used below; the
// statement that fills them (presumably a de-interleave of the three loads into one register
// per channel) is not part of this listing.
899 __m128i channel0_0_128i;
900 __m128i channel0_1_128i;
901 __m128i channel0_2_128i;
904 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 0));
905 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 16));
906 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 32));
908 __m128i channel1_0_128i;
909 __m128i channel1_1_128i;
910 __m128i channel1_2_128i;
// Channel 0: signed zero-mean differences (despite the variable name, no absolute value is
// taken), then squared and pairwise added into 32-bit lanes by _mm_madd_epi16(x, x).
913 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
914 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
916 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
917 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Channel 1.
919 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
920 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
922 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
923 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Channel 2.
925 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
926 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
928 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
929 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
935 if constexpr (partialBlock48)
// Number of elements (and whole pixels) by which a full 48-element read exceeds the remaining
// elements.
937 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
938 constexpr int overlappingPixels = overlappingElements / int(tChannels);
// The first register is loaded at the current position and shifted left, which zeroes its lowest
// overlappingElements bytes; the other two registers are loaded overlappingElements bytes
// earlier, so the three registers together end exactly with the remaining elements.
940 const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0)), overlappingElements);
941 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 - overlappingElements + 16));
942 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 - overlappingElements + 32));
944 __m128i channel0_0_128i;
945 __m128i channel0_1_128i;
946 __m128i channel0_2_128i;
949 const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1)), overlappingElements);
950 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 - overlappingElements + 16));
951 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 - overlappingElements + 32));
953 __m128i channel1_0_128i;
954 __m128i channel1_1_128i;
955 __m128i channel1_2_128i;
// For every channel the lowest overlappingPixels 16-bit lanes of the lower half stem from the
// zeroed padding and are shifted out (right shift by 2 bytes per pixel) before squaring.
958 __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2);
959 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
961 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
962 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
964 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
965 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
967 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
968 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
970 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
971 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
973 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
974 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
976 buffer0 += remainingAfterFullBlocks48;
977 buffer1 += remainingAfterFullBlocks48;
980 for (
unsigned int n = 0u; n < blocks24; ++n)
982 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 0));
983 const __m128i buffer0B_128i = _mm_loadl_epi64((
const __m128i*)(buffer0 + 16));
// NOTE(review): filled by a statement that is not part of this listing. The usage below (channel-0
// means with the lower half, channel-1 means with the upper half) implies that channel 0 sits in
// the lower and channel 1 in the upper 8 bytes of the *_01 registers.
985 __m128i channel0_01_128i;
986 __m128i channel0_2_128i;
989 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 0));
990 const __m128i buffer1B_128i = _mm_loadl_epi64((
const __m128i*)(buffer1 + 16));
992 __m128i channel1_01_128i;
993 __m128i channel1_2_128i;
// Lower half paired with the channel-0 means, upper half with the channel-1 means.
996 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i));
997 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
999 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1000 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Channel 2: only the lower half of the register is evaluated.
1002 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1004 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1010 for (
unsigned int n = 0u; n < blocks21; ++n)
1012 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(buffer0 + 0));
// The 8-byte load starts 3 bytes early (at offset 13) so that it ends at element 20; the right
// shift by 3 bytes drops the bytes already contained in the first register.
1013 const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer0 + 16 - 3)), 3);
1015 __m128i channel0_01_128i;
1016 __m128i channel0_2_128i;
1019 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(buffer1 + 0));
1020 const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(buffer1 + 16 - 3)), 3);
1022 __m128i channel1_01_128i;
1023 __m128i channel1_2_128i;
// Shifting the differences left by 2 bytes discards the top 16-bit lane of each half.
// NOTE(review): presumably that lane belongs to the unused 8th pixel slot (only 7 pixels are
// loaded) — confirm against the de-interleave helper.
1026 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2);
1027 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1029 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1030 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1032 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1034 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1040 for (
unsigned int n = 0u; n < blocks15; ++n)
// 15 elements are read with a 16-byte load that starts one byte early; the right shift drops
// that leading byte. See the note at the top of this fragment regarding tPixels of 5 or 6.
1042 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0 - 1)), 1);
1044 __m128i channel0_01_128i;
1045 __m128i channel0_2_128i;
1048 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1 - 1)), 1);
1050 __m128i channel1_01_128i;
1051 __m128i channel1_2_128i;
// Shifting the differences left by 6 bytes discards the top three 16-bit lanes of each half.
// NOTE(review): presumably those lanes belong to the three unused pixel slots (only 5 pixels
// are loaded) — confirm against the de-interleave helper.
1054 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6);
1055 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1057 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1058 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1060 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1062 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1068 if constexpr (blocks1 != 0u)
// Remaining whole pixels are processed without SIMD, channel by channel.
1070 constexpr unsigned int pixels = blocks1 / 3u;
1072 for (
unsigned int x = 0u; x < pixels; ++x)
1074 for (
unsigned int n = 0u; n < 3u; ++n)
1076 sumIndividual +=
sqrDistance(buffer0[x * 3u + n] - meanValues0[n], buffer1[x * 3u + n] - meanValues1[n]);
// Body of a single-channel (tChannels = 1) patch routine; the function signature is not part of this excerpt.
// This part checks the inputs, derives the compile-time block layout of one patch row, prepares the SSE
// constants and accumulators, opens the loop over the tPatchSize rows and processes the full 16-element blocks.
1114 static_assert(tPatchSize >= 1u,
"Invalid patch size!");
1116 constexpr unsigned int tChannels = 1u;
1118 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1119 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Each stride has to cover at least one complete patch row.
1121 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1122 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1124 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// A row is split into full 16-element blocks plus exactly one strategy for the 0..15 elements left over:
//   9..15 left -> partialBlock16 (one 16-byte load, surplus lanes discarded)
//   8 left     -> fullBlock8     (one 8-byte load)
//   3..7 left  -> partialBlock8  (one 8-byte load, surplus lanes discarded)
//   0..2 left  -> blocks1        (element-wise handling)
1126 constexpr unsigned int blocks16 = patchWidthElements / 16u;
1127 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
1129 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
1131 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
1133 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
1135 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
1137 static_assert(blocks1 <= 2u,
"Invalid block size!");
1139 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Every 16-bit lane holds 0x01FF: low byte 0xFF (-1 as signed 8-bit), high byte 0x01 (+1).
// Used with _mm_maddubs_epi16 on interleaved (mean, pixel) byte pairs this produces pixel - mean per 16-bit lane.
1141 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// Broadcast the single mean value of each patch to all 16 bytes.
1143 const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
1144 const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
// Two vector accumulators (four 32-bit partial sums each) and one scalar accumulator for element-wise leftovers.
1146 __m128i sum0_128i = _mm_setzero_si128();
1147 __m128i sum1_128i = _mm_setzero_si128();
1149 uint32_t sumIndividual = 0u;
// Iterate over the rows of the patch.
1151 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full blocks of 16 elements.
// NOTE(review): no advance of patch0/patch1 inside this loop is visible in this excerpt - confirm against the complete file.
1156 for (
unsigned int n = 0u; n < blocks16; ++n)
1158 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)patch0);
1159 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)patch1);
// Lower and upper eight elements, per 16-bit lane: (patch0 - mean0) - (patch1 - mean1).
// Despite the 'absDifferences' name the lanes are signed; the sign vanishes when they are squared below.
1161 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1162 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
// _mm_madd_epi16(d, d) squares each lane and adds neighbouring squares into 32-bit sums.
1164 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1165 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Exactly eight elements are left in the row: handle them with one 64-bit load per patch.
// _mm_loadl_epi64 zeroes the upper eight bytes of the register, so only the lower unpack contains pixel data.
// The upper half must not be accumulated: with all pixel bytes zero, each of its 16-bit lanes would evaluate to
// (0 - mean0) - (0 - mean1), adding eight spurious (mean1 - mean0)^2 terms per row whenever the two means differ.
// NOTE(review): no advance of patch0/patch1 for this branch is visible in this excerpt - confirm against the complete file.
1171 if constexpr (fullBlock8)
1173 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)patch0);
1174 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)patch1);
// Per 16-bit lane: (patch0 - mean0) - (patch1 - mean1) for the eight loaded elements.
1176 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
// Square each lane and add neighbouring squares into the four 32-bit partial sums.
1179 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Row tail of 9..15 elements: one 16-byte load per patch, with the lanes that do not belong to the tail discarded.
1186 if constexpr (partialBlock16)
// Number of loaded bytes that are not part of the row tail.
1188 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
1190 static_assert(overlapElements < 8u,
"Invalid value!");
// Not the last row: load starting at the tail; the surplus bytes are the topmost overlapElements bytes of the register.
1192 if (y < tPatchSize - 1u)
1194 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)patch0);
1195 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)patch1);
// Per 16-bit lane: (patch0 - mean0) - (patch1 - mean1). The byte-wise left shift of the upper half pushes the
// topmost overlapElements lanes out of the register and fills with zeros, which contribute nothing when squared.
1197 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1198 const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1200 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1201 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Second variant - presumably the else-branch taken for the last row (the 'else' keyword is not visible in this
// excerpt; confirm). The load starts overlapElements bytes earlier so that it ends with the last element of the
// tail; the surplus bytes are then the lowest overlapElements bytes of the register.
1205 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)(patch0 - overlapElements));
1206 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)(patch1 - overlapElements));
// The byte-wise right shift of the lower half removes the lowest overlapElements lanes.
1208 const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1209 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1211 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1212 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Step over the tail elements that were just processed.
1215 patch0 += remainingAfterBlocks16;
1216 patch1 += remainingAfterBlocks16;
// Row tail of 3..7 elements: one 8-byte load per patch, with the lanes that do not belong to the tail discarded.
// Only the lower unpack is used because _mm_loadl_epi64 leaves the upper eight bytes zero.
1219 if constexpr (partialBlock8)
// Number of loaded bytes that are not part of the row tail.
1221 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
1223 static_assert(overlapElements < 8u,
"Invalid value!");
// Not the last row: load starting at the tail and shift the topmost overlapElements lanes out of the register.
1225 if (y < tPatchSize - 1u)
1227 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)patch0);
1228 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)patch1);
1230 const __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1232 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Second variant - presumably the else-branch taken for the last row (the 'else' keyword is not visible in this
// excerpt; confirm). The load starts overlapElements bytes earlier so that it ends with the last element of the
// tail; the lowest overlapElements lanes are removed by the right shift.
1236 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)(patch0 - overlapElements));
1237 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)(patch1 - overlapElements));
1239 const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1241 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Step over the tail elements that were just processed.
1244 patch0 += remainingAfterBlocks16;
1245 patch1 += remainingAfterBlocks16;
// At most two elements are left: handle them one by one.
// sqrDistance is defined outside this excerpt; presumably it returns the squared difference of its two arguments - confirm.
// NOTE(review): no advance of patch0/patch1 for these elements is visible in this excerpt - confirm against the complete file.
1248 if constexpr (blocks1 != 0u)
1250 for (
unsigned int n = 0u; n < blocks1; ++n)
1252 sumIndividual +=
sqrDistance(patch0[n] - meanValues0[0], patch1[n] - meanValues1[0]);
// Skip the gap between the end of this patch row and the start of the next one; this arithmetic relies on
// patch0/patch1 having been moved forward by patchWidthElements while the row was processed.
1259 patch0 += patch0StrideElements - patchWidthElements;
1260 patch1 += patch1StrideElements - patchWidthElements;
// Body of a three-channel (tChannels = 3) patch routine; the function signature is not part of this excerpt.
// This part checks the inputs, derives the compile-time block layout of one patch row, prepares the SSE
// constants and accumulators and opens the loop over the tPatchSize rows.
1270 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1272 constexpr unsigned int tChannels = 3u;
1274 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1275 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Each stride has to cover at least one complete patch row.
1277 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1278 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1280 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// A row is split into full 48-element blocks (16 pixels). What is left over is handled as follows:
//   more than 32 elements left -> one partial 48-element block covers the entire remainder
//   otherwise                  -> at most one 24-element block, then at most one 21-element block,
//                                 then at most one 15-element block, then element-wise handling of the rest
1282 constexpr unsigned int blocks48 = patchWidthElements / 48u;
1283 constexpr unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
1285 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
1287 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
1289 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
1291 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
1293 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
1295 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
1297 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
1299 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
1301 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
// The element-wise remainder must consist of whole three-channel pixels.
1303 static_assert(blocks1 % 3u == 0u,
"Invalid number of single blocks");
1305 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Every 16-bit lane holds 0x01FF: low byte 0xFF (-1 as signed 8-bit), high byte 0x01 (+1).
// Used with _mm_maddubs_epi16 on interleaved (mean, pixel) byte pairs this produces pixel - mean per 16-bit lane.
1307 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// Broadcast the mean of each channel of the first patch to all 16 bytes.
1309 const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
1310 const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
1311 const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
// Broadcast the mean of each channel of the second patch to all 16 bytes.
1313 const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
1314 const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
1315 const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
// Two vector accumulators (four 32-bit partial sums each) and one scalar accumulator for element-wise leftovers.
1317 __m128i sum0_128i = _mm_setzero_si128();
1318 __m128i sum1_128i = _mm_setzero_si128();
1320 uint32_t sumIndividual = 0u;
// Iterate over the rows of the patch.
1322 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full blocks of 48 elements (16 three-channel pixels), read with three 16-byte loads per patch.
// NOTE(review): the channelX_* registers are declared without initializer and no statement filling them from the
// loaded buffers is visible in this excerpt; presumably a de-interleaving step sits between the declarations and
// their first use. No advance of patch0/patch1 inside this loop is visible either - confirm against the complete file.
1327 for (
unsigned int n = 0u; n < blocks48; ++n)
1329 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 0));
1330 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 16));
1331 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 32));
1333 __m128i channel0_0_128i;
1334 __m128i channel0_1_128i;
1335 __m128i channel0_2_128i;
1338 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 0));
1339 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 16));
1340 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 32));
1342 __m128i channel1_0_128i;
1343 __m128i channel1_1_128i;
1344 __m128i channel1_2_128i;
// First channel register pair, lower and upper eight values.
// Per 16-bit lane: (patch0 - mean0) - (patch1 - mean1); signed values that are squared below.
1347 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1348 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
// _mm_madd_epi16(d, d) squares each lane and adds neighbouring squares into 32-bit sums.
1350 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1351 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Second channel register pair.
1353 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1354 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1356 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1357 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Third channel register pair.
1359 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1360 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1362 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1363 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Row tail of more than 32 (but fewer than 48) elements: treated as one 48-element block whose leading
// overlappingElements bytes are padding.
// NOTE(review): the channelX_* registers are declared without initializer and no statement filling them from the
// loaded buffers is visible in this excerpt; presumably a de-interleaving step sits between - confirm.
1369 if constexpr (partialBlock48)
// Bytes (and whole pixels) by which the tail falls short of 48 elements.
1371 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
1372 constexpr int overlappingPixels = overlappingElements / int(tChannels);
// Register A is loaded at patch0 and shifted left, which zero-fills its lowest overlappingElements bytes.
// Registers B and C are loaded from addresses moved back by overlappingElements. Together the three registers hold
// overlappingElements zero bytes followed by the tail; no load starts before patch0 and the last load ends at
// patch0 + remainingAfterFullBlocks48.
1374 const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(patch0)), overlappingElements);
1375 const __m128i buffer0B_128i = _mm_lddqu_si128((
const __m128i*)(patch0 - overlappingElements + 16));
1376 const __m128i buffer0C_128i = _mm_lddqu_si128((
const __m128i*)(patch0 - overlappingElements + 32));
1378 __m128i channel0_0_128i;
1379 __m128i channel0_1_128i;
1380 __m128i channel0_2_128i;
// Same loading scheme for the second patch.
1383 const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)(patch1)), overlappingElements);
1384 const __m128i buffer1B_128i = _mm_lddqu_si128((
const __m128i*)(patch1 - overlappingElements + 16));
1385 const __m128i buffer1C_128i = _mm_lddqu_si128((
const __m128i*)(patch1 - overlappingElements + 32));
1387 __m128i channel1_0_128i;
1388 __m128i channel1_1_128i;
1389 __m128i channel1_2_128i;
// For every channel the lowest overlappingPixels 16-bit lanes of the lower half are shifted out before squaring;
// presumably these are the lanes of the zero-filled padding pixels (depends on the lane order produced by the
// step that fills the channel registers - confirm). The upper half is used unchanged.
1392 __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2);
1393 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1395 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1396 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Second channel register pair.
1398 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
1399 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1401 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1402 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Third channel register pair.
1404 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
1405 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1407 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1408 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Step over the tail elements that were just processed.
1410 patch0 += remainingAfterFullBlocks48;
1411 patch1 += remainingAfterFullBlocks48;
// Block of 24 elements (8 three-channel pixels): one 16-byte load plus one 8-byte load at offset 16 per patch.
// The lower half of channelX_01 is compared against the channel-0 means, its upper half against the channel-1
// means, and the lower half of channelX_2 against the channel-2 means.
// NOTE(review): the channelX_* registers are declared without initializer and no statement filling them, nor any
// advance of patch0/patch1, is visible in this excerpt - confirm against the complete file.
1414 for (
unsigned int n = 0u; n < blocks24; ++n)
1416 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 0));
1417 const __m128i buffer0B_128i = _mm_loadl_epi64((
const __m128i*)(patch0 + 16));
1419 __m128i channel0_01_128i;
1420 __m128i channel0_2_128i;
1423 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 0));
1424 const __m128i buffer1B_128i = _mm_loadl_epi64((
const __m128i*)(patch1 + 16));
1426 __m128i channel1_01_128i;
1427 __m128i channel1_2_128i;
// Per 16-bit lane: (patch0 - mean0) - (patch1 - mean1); signed values that are squared below.
1430 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i));
1431 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
1433 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1434 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Only the lower half of the channel-2 register is evaluated.
1436 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1438 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Block of 21 elements (7 three-channel pixels). The 8-byte load starts three bytes early (offset 13) so that it
// ends exactly at element 21; the right shift by three bytes drops the bytes already covered by the 16-byte load.
// NOTE(review): the channelX_* registers are declared without initializer and no statement filling them, nor any
// advance of patch0/patch1, is visible in this excerpt - confirm against the complete file.
1444 for (
unsigned int n = 0u; n < blocks21; ++n)
1446 const __m128i buffer0A_128i = _mm_lddqu_si128((
const __m128i*)(patch0 + 0));
1447 const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch0 + 16 - 3)), 3);
1449 __m128i channel0_01_128i;
1450 __m128i channel0_2_128i;
1453 const __m128i buffer1A_128i = _mm_lddqu_si128((
const __m128i*)(patch1 + 0));
1454 const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch1 + 16 - 3)), 3);
1456 __m128i channel1_01_128i;
1457 __m128i channel1_2_128i;
// The left shift by two bytes pushes the topmost 16-bit lane out of each result, so seven lanes per channel
// remain; presumably the dropped lane is the eighth pixel position, which holds no patch data - confirm.
1460 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2);
1461 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1463 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1464 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1466 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1468 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Block of 15 elements (5 three-channel pixels), read with a single 16-byte load per patch.
// For all rows but the last the load starts at patch0/patch1 and covers one byte beyond the 15 elements. For the
// last row it starts one byte earlier and is shifted right by one byte, so the load ends with the 15th element.
// NOTE(review): the channelX_* registers are declared without initializer and no statement filling them, nor any
// advance of patch0/patch1, is visible in this excerpt - confirm against the complete file.
1474 for (
unsigned int n = 0u; n < blocks15; ++n)
1476 const __m128i buffer0_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((
const __m128i*)(patch0)) : _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch0 - 1)), 1);
1478 __m128i channel0_01_128i;
1479 __m128i channel0_2_128i;
1482 const __m128i buffer1_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((
const __m128i*)(patch1)) : _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch1 - 1)), 1);
1484 __m128i channel1_01_128i;
1485 __m128i channel1_2_128i;
// Per 16-bit lane: (patch0 - mean0) - (patch1 - mean1). The left shift by six bytes pushes the topmost three
// lanes out of each result, so five lanes per channel remain.
1488 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6);
1489 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1491 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1492 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1494 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1496 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Remaining whole pixels are handled element by element, channel by channel.
// sqrDistance is defined outside this excerpt; presumably it returns the squared difference of its two arguments - confirm.
// NOTE(review): no advance of patch0/patch1 for these elements is visible in this excerpt - confirm against the complete file.
1502 if constexpr (blocks1 != 0u)
1504 constexpr unsigned int pixels = blocks1 / 3u;
1506 for (
unsigned int x = 0u; x < pixels; ++x)
1508 for (
unsigned int n = 0u; n < 3u; ++n)
1510 sumIndividual +=
sqrDistance(patch0[x * 3u + n] - meanValues0[n], patch1[x * 3u + n] - meanValues1[n]);
// Skip the gap between the end of this patch row and the start of the next one; this arithmetic relies on
// patch0/patch1 having been moved forward by patchWidthElements while the row was processed.
1518 patch0 += patch0StrideElements - patchWidthElements;
1519 patch1 += patch1StrideElements - patchWidthElements;