// NEON mean for an interleaved 3-channel (e.g. RGB) buffer of tPixels pixels:
// per-channel sums are accumulated in 16-, 8- and 1-pixel blocks, the rounded
// per-channel means are written to meanValues[0..2].
333 static_assert(tPixels >= 8u,
"Invalid pixels!");
335 constexpr unsigned int tChannels = 3u;
337 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
// Partition tPixels into full 16-pixel NEON blocks plus a remainder.
339 constexpr unsigned int blocks16 = tPixels / 16u;
340 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
// A partial (overlapping, re-reading) 16-pixel block is only worthwhile for
// large remainders (> 10), and only safe when at least one full 16-block
// precedes it so the backwards-shifted 16-pixel load stays in bounds.
342 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u && blocks16 >= 1u;
343 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
345 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
346 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
// Remainders of 3..7 pixels use a masked 8-pixel block; 1..2 leftovers are
// summed scalar-wise below.
348 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
349 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
351 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
353 static_assert(blocks1 <= 2u,
"Invalid block size!");
// One 4-lane 32-bit accumulator per channel.
355 uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
356 uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
357 uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
359 uint32_t sumIndividual[3] = {0u};
// Full 16-pixel blocks: de-interleaving load (vld3q), then pairwise widening
// adds u8 -> u16 (vpaddlq) accumulated into u32 lanes (vpadalq).
361 for (
unsigned int n = 0u; n < blocks16; ++n)
363 const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer);
365 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[0]));
366 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[1]));
367 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[2]));
369 buffer += 16u * tChannels;
// Partial 16-block: re-read the final 16 pixels (overlapping pixels already
// summed above) and zero the overlapping low lanes with a byte mask.
372 if constexpr (partialBlock16)
374 static_assert(tPixels >= 16u,
"We need to guarantee that loading 16 pixels of worth of data preceding the end boundary cannot cause memory access violation");
376 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
377 ocean_assert(overlappingElements < 8u);
// Zeros in the first 'overlappingElements' bytes, ones everywhere else.
381 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));
383 const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer - overlappingElements * tChannels);
385 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[0], mask_u_8x16)));
386 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[1], mask_u_8x16)));
387 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[2], mask_u_8x16)));
389 buffer += remainingAfterBlocks16 * tChannels;
// Full 8-pixel blocks.
392 for (
unsigned int n = 0u; n < blocks8; ++n)
394 const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer);
396 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[0]));
397 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[1]));
398 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[2]));
400 buffer += 8u * tChannels;
// Partial 8-block: same overlapping re-read trick with an 8-byte mask.
403 if constexpr (partialBlock8)
405 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
406 ocean_assert(overlappingElements < 8u);
408 const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);
410 const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer - overlappingElements * tChannels);
412 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[0], mask_u_8x8)));
413 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[1], mask_u_8x8)));
414 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[2], mask_u_8x8)));
416 buffer += remainingAfterBlocks8 * tChannels;
// Remaining 1..2 pixels are summed scalar-wise per channel.
419 for (
unsigned int n = 0u; n < blocks1; ++n)
421 sumIndividual[0] += buffer[tChannels * n + 0u];
422 sumIndividual[1] += buffer[tChannels * n + 1u];
423 sumIndividual[2] += buffer[tChannels * n + 2u];
// sum0/sum1/sum2 presumably combine the vector lanes with sumIndividual[]
// (the horizontal reduction is not visible in this excerpt — confirm against
// the full source). Adding tPixels/2 before dividing rounds to nearest.
427 meanValues[0] = uint8_t((sum0 + tPixels / 2u) / tPixels);
430 meanValues[1] = uint8_t((sum1 + tPixels / 2u) / tPixels);
433 meanValues[2] = uint8_t((sum2 + tPixels / 2u) / tPixels);
// NEON mean for an interleaved 3-channel patch of tPatchSize x tPatchSize
// pixels with row stride 'patchStrideElements'; rounded per-channel means are
// written to meanValues[0..2]. Each row is processed in 16-, 8- and 1-pixel
// blocks; partial blocks either read forward into the next row (safe while
// y < tPatchSize - 1) or re-read backwards with the overlap masked out.
//
// FIX: the two 16-byte lane masks below were built with vcombine_u16()
// although both operands are uint8x8_t values produced by vcreate_u8() and the
// result is used as a uint8x16_t — vcombine_u8() is the matching intrinsic
// (cf. the correct usage in the buffer-based variant of this function).
591 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
593 constexpr unsigned int tChannels = 3u;
595 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
597 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
// Partition one patch row into full 16-pixel blocks plus a remainder.
599 constexpr unsigned int blocks16 = tPatchSize / 16u;
600 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
602 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
603 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
605 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
606 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
608 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
609 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
611 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
613 static_assert(blocks1 <= 2u,
"Invalid block size!");
// One 4-lane 32-bit accumulator per channel.
615 uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
616 uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
617 uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
619 uint32_t sumIndividual[3] = {0u};
621 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full 16-pixel blocks: de-interleave, widen u8 -> u16 -> u32 and accumulate.
623 for (
unsigned int n = 0u; n < blocks16; ++n)
625 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
627 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[0]));
628 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[1]));
629 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[2]));
631 patch += 16u * tChannels;
634 if constexpr (partialBlock16)
636 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
637 ocean_assert(overlappingElements < 8u);
// Not the last row: load forward (spilling into the next row, which exists)
// and zero the trailing 'overlappingElements' lanes.
639 if (y < tPatchSize - 1u)
643 constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
644 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
646 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
648 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
649 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
650 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
// Last row: load backwards (re-reading already-summed pixels) and zero the
// leading 'overlappingElements' lanes instead.
656 constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
657 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
659 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch - overlappingElements * tChannels);
661 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
662 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
663 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
666 patch += remainingAfterBlocks16 * tChannels;
// Full 8-pixel blocks.
669 for (
unsigned int n = 0u; n < blocks8; ++n)
671 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
673 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(patch_u_8x8x3.val[0]));
674 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(patch_u_8x8x3.val[1]));
675 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(patch_u_8x8x3.val[2]));
677 patch += 8u * tChannels;
// Partial 8-block: same forward/backward strategy with an 8-byte mask.
680 if constexpr (partialBlock8)
682 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
683 ocean_assert(overlappingElements < 8u);
685 if (y < tPatchSize - 1u)
687 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
688 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
690 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
692 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
693 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
694 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
698 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
699 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
701 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch - overlappingElements * tChannels);
703 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
704 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
705 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
708 patch += remainingAfterBlocks8 * tChannels;
// Remaining 1..2 pixels are summed scalar-wise per channel.
711 if constexpr (blocks1 != 0u)
713 for (
unsigned int n = 0u; n < blocks1; ++n)
715 sumIndividual[0] += patch[tChannels * n + 0u];
716 sumIndividual[1] += patch[tChannels * n + 1u];
717 sumIndividual[2] += patch[tChannels * n + 2u];
720 patch += blocks1 * tChannels;
// Skip the row padding to reach the start of the next patch row.
723 patch += patchStrideElements - tChannels * tPatchSize;
// sum0/sum1/sum2 presumably combine the vector lanes with sumIndividual[]
// (the horizontal reduction is not visible in this excerpt — confirm against
// the full source). Adding half the divisor rounds to nearest.
727 meanValues[0] = uint8_t((sum0 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
730 meanValues[1] = uint8_t((sum1 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
733 meanValues[2] = uint8_t((sum2 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// NEON mean for a 1-channel tPatchSize x tPatchSize patch centered at
// (centerX, centerY), with out-of-frame pixels handled by mirroring via the
// loadMirrored_u_8x16/_u_8x8 helpers (declared elsewhere in this file).
772 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
774 ocean_assert(image !=
nullptr && meanValues !=
nullptr);
775 ocean_assert(centerX < width && centerY < height);
777 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
779 const unsigned int imageStrideElements = width + imagePaddingElements;
// Partition one patch row into full 16-pixel blocks plus a remainder.
781 constexpr unsigned int blocks16 = tPatchSize / 16u;
782 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
784 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
785 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
787 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
788 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
790 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
791 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
793 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
// NOTE(review): the other variants assert blocks1 <= 2u; this mirrored
// variant allows up to 7 — presumably intentional for this code path, verify
// against the full source.
795 static_assert(blocks1 <= 7u,
"Invalid block size!");
797 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
799 uint32_t sumIndividual = 0u;
// Scratch buffer the loadMirrored helpers can assemble mirrored pixels into.
801 uint8_t intermediate[16];
// Iterate the patch rows in signed coordinates so mirroring below 0 works.
803 for (
int y =
int(centerY) -
int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
807 int x = int(centerX) - int(tPatchSize_2);
// Full 16-pixel blocks; mirroredRow is set up in lines elided from this
// excerpt (presumably the mirrored source row — confirm).
809 for (
unsigned int n = 0u; n < blocks16; ++n)
811 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow, x, width, intermediate);
813 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
// Partial 16-block: the first template flag selects the forward (true) or
// backward (false) load variant, depending on whether rows remain below.
818 if constexpr (partialBlock16)
820 if (y <
int(centerY) + int(tPatchSize_2))
822 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);
824 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
828 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);
830 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
833 x += remainingAfterBlocks16;
// Full 8-pixel blocks.
836 for (
unsigned int n = 0u; n < blocks8; ++n)
838 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow, x, width, intermediate);
840 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
// Partial 8-block, same forward/backward selection as above.
845 if constexpr (partialBlock8)
847 if (y <
int(centerY) + int(tPatchSize_2))
849 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);
851 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
855 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);
857 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
860 x += remainingAfterBlocks8;
// Remaining pixels scalar-wise; 'index' is computed in lines elided from this
// excerpt (presumably the mirrored column index — confirm).
863 if constexpr (blocks1 != 0u)
865 for (
unsigned int n = 0u; n < blocks1; ++n)
869 sumIndividual += mirroredRow[index];
// 'sum' combines the vector lanes with sumIndividual (horizontal reduction
// elided from this excerpt); adding half the divisor rounds to nearest.
878 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// NEON zero-mean sum of squared differences between two 1-channel buffers.
// Uses the identity ((b0 - m0) - (b1 - m1))^2 == ((b0 - b1) - (m0 - m1))^2,
// so only the scalar mean difference m0 - m1 has to be broadcast.
922 static_assert(tPixels >= 8u,
"Invalid pixels!");
924 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
925 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Partition tPixels into full 16-pixel NEON blocks plus a remainder.
927 constexpr unsigned int blocks16 = tPixels / 16u;
928 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
930 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
931 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
933 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
934 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
936 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
937 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
939 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
941 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Broadcast mean difference m0 - m1 (fits int16).
946 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
// Two independent accumulators to shorten the dependency chain.
948 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
949 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
951 uint32_t sumIndividual = 0u;
// Full 16-pixel blocks: widen the pixel difference to int16, take the
// absolute difference against the mean difference, square and accumulate.
953 for (
unsigned int n = 0u; n < blocks16; ++n)
955 const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0);
956 const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1);
958 const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16)));
959 const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16)));
961 const uint16x8_t buffer_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8));
962 const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));
964 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
965 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));
967 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
968 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));
// Partial 16-block: re-read the final 16 pixels and zero the overlapping
// low 16-bit lanes (mask is per 16-bit lane, hence the * 2u in the shifts).
974 if constexpr (partialBlock16)
976 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
977 ocean_assert(overlappingElements < 8u);
979 const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0 - overlappingElements);
980 const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1 - overlappingElements);
982 const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16)));
983 const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16)));
985 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
986 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
988 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
// Only the low half can contain overlapping lanes (overlap < 8), so the
// high half stays unmasked.
990 const uint16x8_t buffer_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
991 const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));
993 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
994 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));
996 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
997 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));
999 buffer0 += remainingAfterBlocks16;
1000 buffer1 += remainingAfterBlocks16;
// Full 8-pixel blocks.
1003 for (
unsigned int n = 0u; n < blocks8; ++n)
1005 const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0);
1006 const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1);
1008 const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8));
1010 const uint16x8_t buffer_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8));
1012 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
1013 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));
// Partial 8-block: overlapping re-read with a 16-bit-lane mask.
1019 if constexpr (partialBlock8)
1021 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1022 ocean_assert(overlappingElements < 8u);
1024 const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0 - overlappingElements);
1025 const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1 - overlappingElements);
1027 const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8));
1029 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1030 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1032 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1034 const uint16x8_t buffer_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1036 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
1037 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));
1039 buffer0 += remainingAfterBlocks8;
1040 buffer1 += remainingAfterBlocks8;
// Remaining 1..2 pixels computed scalar-wise.
1043 if constexpr (blocks1 != 0u)
1045 for (
unsigned int n = 0u; n < blocks1; ++n)
1047 sumIndividual +=
sqrDistance(int16_t(buffer0[n] - meanValues0[0]), int16_t(buffer1[n] - meanValues1[0]));
// Combine both accumulators; the final horizontal reduction with
// sumIndividual is not visible in this excerpt.
1051 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// NEON zero-mean sum of squared differences between two interleaved 3-channel
// buffers. Same identity as the 1-channel variant — per channel:
// ((b0 - m0) - (b1 - m1))^2 == ((b0 - b1) - (m0 - m1))^2 — with one broadcast
// mean difference per channel and vld3q/vld3 de-interleaving loads.
1060 static_assert(tPixels >= 8u,
"Invalid pixels!");
1062 constexpr unsigned int tChannels = 3u;
1064 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
1065 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Partition tPixels into full 16-pixel NEON blocks plus a remainder.
1067 constexpr unsigned int blocks16 = tPixels / 16u;
1068 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
1070 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1071 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1073 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1074 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1076 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1077 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1079 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1081 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Broadcast per-channel mean differences m0 - m1 (each fits int16).
1086 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1087 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1088 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
// Two independent accumulators to shorten the dependency chain.
1090 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1091 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1093 uint32_t sumIndividual = 0u;
// Full 16-pixel blocks: per channel, widen the pixel difference to int16,
// take the absolute difference against the channel's mean difference, square
// via vmlal and accumulate.
1095 for (
unsigned int n = 0u; n < blocks16; ++n)
1097 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0);
1098 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1);
1100 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0])));
1101 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0])));
1103 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1104 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1106 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1107 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1110 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8));
1111 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1113 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1114 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1116 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1117 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1120 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1121 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1122 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1123 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1125 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1126 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1127 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1128 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1130 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1131 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1132 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1133 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1136 buffer0 += 16u * tChannels;
1137 buffer1 += 16u * tChannels;
// Partial 16-block: re-read the final 16 pixels; the mask is per 16-bit lane
// (hence * 2u * 8u in the shifts) and only the low half needs masking since
// overlappingElements < 8.
1140 if constexpr (partialBlock16)
1142 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1143 ocean_assert(overlappingElements < 8u);
1145 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0 - overlappingElements * tChannels);
1146 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1 - overlappingElements * tChannels);
1149 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0])));
1150 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0])));
1152 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1153 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1155 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1156 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1159 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1160 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1162 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1165 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1166 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1168 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1169 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1171 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1172 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1175 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1176 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1177 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1178 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1180 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1181 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1182 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1183 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1185 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1186 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1187 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1188 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1190 buffer0 += remainingAfterBlocks16 * tChannels;
1191 buffer1 += remainingAfterBlocks16 * tChannels;
// Full 8-pixel blocks.
1194 for (
unsigned int n = 0u; n < blocks8; ++n)
1196 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0);
1197 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1);
1199 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0]));
1200 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1201 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1203 const uint16x8_t bufferChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8));
1204 const uint16x8_t bufferChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1205 const uint16x8_t bufferChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1207 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1208 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1210 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1211 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1213 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1214 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1216 buffer0 += 8u * tChannels;
1217 buffer1 += 8u * tChannels;
// Partial 8-block: overlapping re-read with a 16-bit-lane mask on all lanes.
1220 if constexpr (partialBlock8)
1222 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1223 ocean_assert(overlappingElements < 8u);
1225 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0 - overlappingElements * tChannels);
1226 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1 - overlappingElements * tChannels);
1228 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0]));
1229 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1230 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1232 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1233 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1235 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1237 const uint16x8_t bufferChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1238 const uint16x8_t bufferChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1239 const uint16x8_t bufferChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1241 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1242 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1244 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1245 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1247 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1248 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1250 buffer0 += remainingAfterBlocks8 * tChannels;
1251 buffer1 += remainingAfterBlocks8 * tChannels;
// Remaining 1..2 pixels computed scalar-wise per channel.
1254 if constexpr (blocks1 != 0u)
1256 for (
unsigned int n = 0u; n < blocks1; ++n)
1258 for (
unsigned int c = 0u; c < tChannels; ++c)
1260 sumIndividual +=
sqrDistance(int16_t(buffer0[n * tChannels + c] - meanValues0[c]), int16_t(buffer1[n * tChannels + c] - meanValues1[c]));
1264 buffer0 += blocks1 * tChannels;
1265 buffer1 += blocks1 * tChannels;
// Combine both accumulators; the final horizontal reduction with
// sumIndividual is not visible in this excerpt.
1268 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// NEON implementation of the zero-mean sum of squared differences (ZM-SSD)
// between two tPatchSize x tPatchSize patches of an 8-bit, 1-channel image.
// NOTE(review): the enclosing function signature lies outside this excerpt;
// from the identifiers it presumably receives the two patch top-left pointers
// 'patch0'/'patch1', their row strides in elements, and the per-patch mean
// values 'meanValues0'/'meanValues1' -- confirm against the full file.
1300 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
// Both patch pointers and both mean-value pointers must be valid.
1302 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1303 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// A row stride can never be smaller than the patch width.
1305 ocean_assert(patch0StrideElements >= tPatchSize);
1306 ocean_assert(patch1StrideElements >= tPatchSize);
// Compile-time decomposition of one patch row of tPatchSize pixels into:
//  - blocks16 full 16-pixel NEON blocks,
//  - optionally one partial (overlapping) 16-pixel block, only worthwhile if
//    more than 10 pixels remain,
//  - blocks8 full 8-pixel NEON blocks,
//  - optionally one partial (overlapping) 8-pixel block if at least 3 pixels remain,
//  - at most two leftover pixels handled with scalar code ('blocks1').
1308 constexpr unsigned int blocks16 = tPatchSize / 16u;
1309 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1311 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1312 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1314 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1315 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1317 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1318 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1320 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1322 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Broadcast the difference of the two patch means once; for pixels p0, p1:
// |(p0 - p1) - (mean0 - mean1)| == |(p0 - mean0) - (p1 - mean1)|, i.e. the
// zero-mean difference, evaluated below with a single vabdq_s16 per vector.
1327 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
// Two independent 32-bit x 4 accumulators (summed at the end) so the two
// vmlal_u16 chains per block do not serialize on one register.
1329 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1330 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// Scalar accumulator for the up-to-two leftover pixels per row.
1332 uint32_t sumIndividual = 0u;
1334 for (
unsigned int y = 0u; y < tPatchSize; ++y)
1336 for (
unsigned int n = 0u; n < blocks16; ++n)
// Full 16-pixel block: widen to signed 16-bit differences, subtract the
// mean difference (vabdq), then square-accumulate via vmlal_u16.
1338 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
1339 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);
1341 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1342 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1344 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1345 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1347 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1348 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1350 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1351 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Partial 16-pixel block: remainingAfterBlocks16 (11..15) pixels remain, so
// a 16-wide load overlaps by 'overlappingElements' (1..5) pixels; the assert
// below guarantees the overlap stays within the high 8-lane half.
1357 if constexpr (partialBlock16)
1359 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1360 ocean_assert(overlappingElements < 8u);
1362 if (y < tPatchSize - 1u)
// Not the last row: load 16 pixels forward (the bytes past the row's
// remainder presumably still belong to the image via the row stride --
// guaranteed by the caller, TODO confirm) and zero the trailing
// 'overlappingElements' 16-bit lanes of the high half before accumulation.
// The masks are built as the two 64-bit halves of an 8x16-bit lane mask
// (vcreate_u16 places lane 0 in the lowest bits), hence the right shifts
// clear the topmost lanes.
1364 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
1365 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);
1367 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1368 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1373 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1374 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1376 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
// Only the high half needs masking; the low 8 pixels are all valid.
1378 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1379 const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1381 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1382 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1384 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1385 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Last row: no valid memory follows, so load shifted backwards by the
// overlap (those leading pixels were already processed) and zero the
// leading 'overlappingElements' lanes instead (left-shifted masks).
1389 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0 - overlappingElements);
1390 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1 - overlappingElements);
1392 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1393 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1395 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1396 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1398 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1400 const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1401 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1403 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1404 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1406 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1407 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Advance by the valid remainder only (not 16), in both branches.
1410 patch0 += remainingAfterBlocks16;
1411 patch1 += remainingAfterBlocks16;
// Full 8-pixel blocks: same zero-mean square-accumulate at half width.
1414 for (
unsigned int n = 0u; n < blocks8; ++n)
1416 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
1417 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);
1419 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
1421 const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8));
1423 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1424 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
// Partial 8-pixel block: same forward/backward overlap strategy as the
// partial 16-block, with an 8-lane mask covering all lanes this time.
1430 if constexpr (partialBlock8)
1432 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1433 ocean_assert(overlappingElements < 8u);
1435 if (y < tPatchSize - 1u)
1437 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
1438 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);
1440 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
1442 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1443 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1445 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1447 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1449 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1450 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
// Last row: backwards-overlapping load, mask the leading lanes.
1454 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0 - overlappingElements);
1455 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1 - overlappingElements);
1457 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
1459 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1460 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1462 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1464 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1466 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1467 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
1470 patch0 += remainingAfterBlocks8;
1471 patch1 += remainingAfterBlocks8;
// Scalar tail: at most two pixels, squared zero-mean difference via sqrDistance.
1474 if constexpr (blocks1 != 0u)
1476 for (
unsigned int n = 0u; n < blocks1; ++n)
1478 sumIndividual +=
sqrDistance(int16_t(patch0[n] - meanValues0[0]), int16_t(patch1[n] - meanValues1[0]));
// Skip the row padding to reach the next patch row.
1485 patch0 += patch0StrideElements - tPatchSize;
1486 patch1 += patch1StrideElements - tPatchSize;
// Combine the two vector accumulators; the final horizontal reduction and the
// addition of 'sumIndividual' follow outside this excerpt.
1489 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// NEON implementation of the zero-mean sum of squared differences (ZM-SSD)
// between two tPatchSize x tPatchSize patches of an 8-bit, 3-channel
// (interleaved, e.g. RGB) image. Same per-row block decomposition as the
// 1-channel variant, but each load de-interleaves the three channels
// (vld3q_u8 / vld3_u8) and each channel is compared against its own
// broadcast mean difference.
// NOTE(review): the enclosing function signature lies outside this excerpt;
// presumably patch0/patch1 top-left pointers, row strides in elements, and
// 3-element mean arrays meanValues0/meanValues1 -- confirm against the full file.
1498 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1500 constexpr unsigned int tChannels = 3u;
1502 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1503 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Strides are measured in elements, so a row holds tChannels * tPatchSize of them.
1505 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1506 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// Block decomposition of one row of tPatchSize *pixels* (not elements):
// full 16-pixel blocks, optional overlapping partial 16-block (> 10 pixels
// left), full 8-pixel blocks, optional overlapping partial 8-block (>= 3
// pixels left), and at most two scalar pixels ('blocks1').
1508 constexpr unsigned int blocks16 = tPatchSize / 16u;
1509 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1511 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1512 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1514 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1515 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1517 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1518 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1520 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1522 static_assert(blocks1 <= 2u,
"Invalid block size!");
// One broadcast mean difference per channel: |(p0 - p1) - (m0 - m1)| equals
// the zero-mean difference |(p0 - m0) - (p1 - m1)|, evaluated via vabdq_s16.
1527 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1528 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1529 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
// Two independent vector accumulators (combined at the end) plus a scalar
// accumulator for the leftover pixels.
1531 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1532 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1534 uint32_t sumIndividual = 0u;
1536 for (
unsigned int y = 0u; y < tPatchSize; ++y)
1538 for (
unsigned int n = 0u; n < blocks16; ++n)
// Full 16-pixel block: vld3q_u8 de-interleaves 48 bytes into one 16-lane
// vector per channel; each channel is widened, mean-corrected and
// square-accumulated independently.
1540 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1541 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1543 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0])));
1544 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0])));
1546 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1547 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1549 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1550 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1553 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8));
1554 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1556 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1557 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1559 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1560 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1563 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1564 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1565 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1566 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1568 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1569 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1570 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1571 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1573 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1574 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1575 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1576 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
// 16 pixels x 3 interleaved channels consumed.
1579 patch0 += 16u * tChannels;
1580 patch1 += 16u * tChannels;
// Partial 16-pixel block: 11..15 pixels remain, so a 16-pixel load overlaps
// by 1..5 pixels; the assert keeps the overlap within the high 8-lane half.
1583 if constexpr (partialBlock16)
1585 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1586 ocean_assert(overlappingElements < 8u);
1588 if (y < tPatchSize - 1u)
// Not the last row: load forward (the trailing bytes presumably still lie
// inside the image via the row stride -- caller contract, TODO confirm)
// and zero the trailing 'overlappingElements' 16-bit lanes of each
// channel's high half. Masks are the two 64-bit halves of an 8x16-bit
// lane mask (vcreate_u16: lane 0 = lowest bits), so right shifts clear
// the top lanes.
1590 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1591 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1594 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0])));
1595 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0])));
1597 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1598 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1600 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1601 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1607 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1608 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1610 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
// Only the high halves need masking; the low 8 pixels are all valid.
1613 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8));
1614 const uint16x8_t patchChannel0_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1616 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1617 const uint16x8_t patchChannel1_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1619 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1620 const uint16x8_t patchChannel2_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1623 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1624 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1625 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1626 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1628 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1629 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1630 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1631 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1633 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1634 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1635 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1636 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
// Last row: load shifted backwards by the overlap (in elements:
// overlappingElements pixels x tChannels) so the read stays inside the
// patch memory, and zero the leading lanes instead (left-shifted masks).
1640 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0 - overlappingElements * tChannels);
1641 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1 - overlappingElements * tChannels);
1644 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0])));
1645 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0])));
1647 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1648 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1650 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1651 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1654 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1655 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1657 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
// Here the already-processed pixels sit in the low halves, so only the
// low halves are masked.
1660 const uint16x8_t patchChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1661 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1663 const uint16x8_t patchChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1664 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1666 const uint16x8_t patchChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1667 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1670 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1671 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1672 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1673 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1675 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1676 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1677 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1678 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1680 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1681 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1682 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1683 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
// Advance by the valid pixel remainder only, in elements.
1686 patch0 += remainingAfterBlocks16 * tChannels;
1687 patch1 += remainingAfterBlocks16 * tChannels;
// Full 8-pixel blocks: vld3_u8 de-interleaves 24 bytes into 8-lane channels.
1690 for (
unsigned int n = 0u; n < blocks8; ++n)
1692 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1693 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1695 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0]));
1696 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1697 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1699 const uint16x8_t patchChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8));
1700 const uint16x8_t patchChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1701 const uint16x8_t patchChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1703 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1704 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1706 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1707 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1709 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1710 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1712 patch0 += 8u * tChannels;
1713 patch1 += 8u * tChannels;
// Partial 8-pixel block: same forward/backward overlap strategy as the
// partial 16-block, with an 8-lane mask applied to all three channels.
1716 if constexpr (partialBlock8)
1718 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1719 ocean_assert(overlappingElements < 8u);
1721 if (y < tPatchSize - 1u)
// Not the last row: forward load, mask the trailing lanes.
1723 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1724 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1726 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0]));
1727 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1728 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1730 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1731 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1733 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1735 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1736 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1737 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1739 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1740 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1742 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1743 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1745 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1746 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
// Last row: backwards-overlapping load, mask the leading lanes.
1750 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0 - overlappingElements * tChannels);
1751 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1 - overlappingElements * tChannels);
1753 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0]));
1754 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1755 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1757 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1758 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1760 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1762 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1763 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1764 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1766 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1767 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1769 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1770 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1772 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1773 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1776 patch0 += remainingAfterBlocks8 * tChannels;
1777 patch1 += remainingAfterBlocks8 * tChannels;
// Scalar tail: at most two pixels, all channels, via sqrDistance on the
// zero-mean per-channel differences.
1780 if constexpr (blocks1 != 0u)
1782 for (
unsigned int n = 0u; n < blocks1; ++n)
1784 for (
unsigned int c = 0u; c < tChannels; ++c)
1786 sumIndividual +=
sqrDistance(int16_t(patch0[n * tChannels + c] - meanValues0[c]), int16_t(patch1[n * tChannels + c] - meanValues1[c]));
1790 patch0 += blocks1 * tChannels;
1791 patch1 += blocks1 * tChannels;
// Skip the row padding (strides are in elements) to reach the next patch row.
1794 patch0 += patch0StrideElements - tPatchSize * tChannels;
1795 patch1 += patch1StrideElements - tPatchSize * tChannels;
// Combine the two vector accumulators; the final horizontal reduction and the
// addition of 'sumIndividual' follow outside this excerpt.
1798 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1837inline uint32_t
ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patchMirroredBorder8BitPerChannel(
const uint8_t*
const image0,
const uint8_t*
const image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1)
1839 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
1840 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1842 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
1844 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
1845 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
1847 ocean_assert(centerX0 < width0 && centerY0 < height0);
1848 ocean_assert(centerX1 < width1 && centerY1 < height1);
1850 const unsigned int image0StrideElements = width0 + image0PaddingElements;
1851 const unsigned int image1StrideElements = width1 + image1PaddingElements;
1853 constexpr unsigned int blocks16 = tPatchSize / 16u;
1854 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1856 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1857 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1859 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1860 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1862 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1863 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1865 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1867 static_assert(blocks1 <= 2u,
"Invalid block size!");
1872 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1874 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1875 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1877 uint32_t sumIndividual = 0u;
1879 uint8_t intermediate[16];
1881 int y1 = int(centerY1) - int(tPatchSize_2);
1882 for (
int y0 =
int(centerY0) -
int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
1887 int x0 = int(centerX0) - int(tPatchSize_2);
1888 int x1 = int(centerX1) - int(tPatchSize_2);
1890 for (
unsigned int n = 0u; n < blocks16; ++n)
1892 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow0, x0, width0, intermediate);
1893 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow1, x1, width1, intermediate);
1895 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1896 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1898 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1899 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1901 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1902 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1904 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1905 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1911 if constexpr (partialBlock16)
1913 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1914 ocean_assert(overlappingElements < 8u);
1916 if (y0 <
int(centerY0) +
int(tPatchSize_2))
1918 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
1919 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);
1921 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1922 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1927 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1928 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1930 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1932 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1933 const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1935 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1936 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1938 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1939 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1943 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
1944 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);
1946 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1947 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1949 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1950 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1952 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1954 const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1955 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1957 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1958 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1960 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1961 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1964 x0 += remainingAfterBlocks16;
1965 x1 += remainingAfterBlocks16;
1968 for (
unsigned int n = 0u; n < blocks8; ++n)
1970 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow0, x0, width0, intermediate);
1971 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow1, x1, width1, intermediate);
1973 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
1975 const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8));
1977 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1978 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
1984 if constexpr (partialBlock8)
1986 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1987 ocean_assert(overlappingElements < 8u);
1989 if (y0 <
int(centerY0) +
int(tPatchSize_2))
1991 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
1992 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);
1994 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
1996 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1997 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1999 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
2001 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
2003 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2004 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
2008 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
2009 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);
2011 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
2013 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
2014 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
2016 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
2018 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
2020 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2021 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
2024 x0 += remainingAfterBlocks8;
2025 x1 += remainingAfterBlocks8;
2028 if constexpr (blocks1 != 0u)
2030 for (
unsigned int n = 0u; n < blocks1; ++n)
2035 sumIndividual +=
sqrDistance(int16_t(mirroredRow0[index0] - meanValues0[0]), int16_t(mirroredRow1[index1] - meanValues1[0]));
2042 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);