8 #ifndef META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
9 #define META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
15 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// NOTE(review): this region is extraction-garbled -- stray leading numbers are
// fused original line numbers, declarations are split across lines, and the
// header of the nested helper class that this first tChannels template clause
// introduces is missing. Restore formatting from the original header.
template <
unsigned int tChannels>
// Computes the per-channel mean of tPixels pixels in a linear buffer.
template <
unsigned int tPixels>
50 static inline void mean8BitPerChannel(
const uint8_t*
const buffer, uint8_t*
const meanValues);
// Computes the per-channel mean of a tPatchSize x tPatchSize patch with
// row stride 'patchStrideElements'.
59 template <
unsigned int tPatchSize>
60 static inline void mean8BitPerChannel(
const uint8_t* patch,
const unsigned int patchStrideElements, uint8_t*
const meanValues);
// Computes the per-channel mean of a patch centered at (centerX, centerY),
// mirroring pixels that fall outside the image border.
73 template <
unsigned int tPatchSize>
74 static inline void mean8BitPerChannelMirroredBorder(
const uint8_t*
const image,
const unsigned int width,
const unsigned int height,
const unsigned int centerX,
const unsigned int centerY,
const unsigned int imagePaddingElements, uint8_t*
const meanValues);
// Returns the zero-mean SSD between two pixel buffers, given their
// precomputed per-channel mean values.
85 template <
unsigned int tPixels>
86 static inline uint32_t
buffer8BitPerChannel(
const uint8_t*
const buffer0,
const uint8_t*
const buffer1,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1);
// Returns the zero-mean SSD between two image patches with individual
// row strides, given precomputed mean values.
99 template <
unsigned int tPatchSize>
100 static inline uint32_t
patch8BitPerChannel(
const uint8_t* patch0,
const uint8_t* patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1);
// Returns the zero-mean SSD between two patches whose pixels may be
// mirrored at the respective image borders.
121 template <
unsigned int tPatchSize>
122 static inline uint32_t
patchMirroredBorder8BitPerChannel(
const uint8_t*
const image0,
const uint8_t*
const image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1);
// Convenience overload: zero-mean SSD between two buffers, means computed
// internally.
135 template <
unsigned int tChannels,
unsigned int tPixels>
136 static inline uint32_t
buffer8BitPerChannel(
const uint8_t*
const buffer0,
const uint8_t*
const buffer1);
// Convenience overload: zero-mean SSD between two strided patches, means
// computed internally.
148 template <
unsigned int tChannels,
unsigned int tPatchSize>
149 static inline uint32_t
patch8BitPerChannel(
const uint8_t*
const patch0,
const uint8_t*
const patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements);
// Zero-mean SSD between a strided patch and a contiguous buffer.
160 template <
unsigned int tChannels,
unsigned int tPatchSize>
161 static inline uint32_t
patchBuffer8BitPerChannel(
const uint8_t* patch0,
const uint8_t* buffer1,
const unsigned int patch0StrideElements);
// Zero-mean SSD between two mirrored-border patches, means computed
// internally.
181 template <
unsigned int tChannels,
unsigned int tPatchSize>
182 static uint32_t
patchMirroredBorder8BitPerChannel(
const uint8_t*
const image0,
const uint8_t*
const image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements);
// Dispatching front-end: per-channel mean of a buffer (forwards to the
// channel-specialized implementation above).
191 template <
unsigned int tChannels,
unsigned int tPixels>
192 static OCEAN_FORCE_INLINE
void mean8BitPerChannel(
const uint8_t*
const buffer, uint8_t*
const meanValues);
// Dispatching front-end: per-channel mean of a strided patch.
202 template <
unsigned int tChannels,
unsigned int tPatchSize>
203 static OCEAN_FORCE_INLINE
void mean8BitPerChannel(
const uint8_t* patch,
const unsigned int patchStrideElements, uint8_t*
const meanValues);
// Loads 8 uint8 values around 'x', mirroring reads that fall outside
// [0, width); tFront selects leading vs. trailing placement and
// tOverlappingToZero zeroes out-of-block lanes. Uses 'intermediateBuffer'
// as scratch memory.
218 template <
bool tFront,
unsigned int tPixels,
bool tOverlappingToZero>
219 static OCEAN_FORCE_INLINE uint8x8_t
loadMirrored_u_8x8(
const uint8_t*
const row,
const int x,
const unsigned int width, uint8_t*
const intermediateBuffer);
// 16-lane variant of loadMirrored_u_8x8.
232 template <
bool tFront,
unsigned int tPixels,
bool tOverlappingToZero>
233 static OCEAN_FORCE_INLINE uint8x16_t
loadMirrored_u_8x16(
const uint8_t*
const row,
const int x,
const unsigned int width, uint8_t*
const intermediateBuffer);
// Computes the rounded mean of tPixels uint8 pixels (1 channel) with NEON.
// NOTE(review): extraction-garbled -- the function signature (between the
// template clause and the first static_assert), the scoping braces, and the
// 'uint32_t results[4];' declaration used below are missing from this copy;
// restore them from the original header before compiling.
237 template <
unsigned int tPixels>
240 static_assert(tPixels >= 8u,
"Invalid pixels!");
242 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
// Partition tPixels into full 16-pixel blocks, an optional overlapping
// partial 16-block, 8-pixel blocks, an optional partial 8-block, and at
// most two individually summed pixels.
244 constexpr
unsigned int blocks16 = tPixels / 16u;
245 constexpr
unsigned int remainingAfterBlocks16 = tPixels % 16u;
247 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u && tPixels >= 16u;
248 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
250 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
251 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
253 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u && tPixels >= 8u;
254 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
256 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
258 static_assert(blocks1 <= 2u,
"Invalid block size!");
260 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
262 uint32_t sumIndividual = 0u;
// Full 16-pixel blocks: pairwise widen u8 -> u16, accumulate into u32x4.
264 for (
unsigned int n = 0u; n < blocks16; ++n)
266 const uint8x16_t buffer_u_8x16 = vld1q_u8(buffer);
268 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));
// Partial 16-block: re-read the last 16 bytes (overlapping already-summed
// pixels) and zero the overlapped leading bytes via the mask.
273 if constexpr (partialBlock16)
275 static_assert(tPixels >= 16u,
"We need to guarantee that loading 16 pixels of worth of data preceding the end boundary cannot cause memory access violation");
277 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
278 ocean_assert(overlappingElements < 8u);
282 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));
284 const uint8x16_t buffer_u_8x16 = vandq_u8(vld1q_u8(buffer - overlappingElements), mask_u_8x16);
286 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));
288 buffer += remainingAfterBlocks16;
291 for (
unsigned int n = 0u; n < blocks8; ++n)
293 const uint8x8_t buffer_u_8x8 = vld1_u8(buffer);
295 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));
// Partial 8-block: same overlapping-load + mask trick on a 64-bit vector.
300 if constexpr (partialBlock8)
302 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
303 ocean_assert(overlappingElements < 8u);
305 const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);
307 const uint8x8_t buffer_u_8x8 = vand_u8(vld1_u8(buffer - overlappingElements), mask_u_8x8);
309 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));
311 buffer += remainingAfterBlocks8;
// At most two leftover pixels are summed scalar-wise.
314 if constexpr (blocks1 != 0u)
316 for (
unsigned int n = 0u; n < blocks1; ++n)
318 sumIndividual += buffer[n];
// Horizontal reduction of the four accumulator lanes; 'results' is a
// uint32_t[4] declared in the missing lines -- TODO restore.
325 vst1q_u32(results, sum_u_32x4);
327 const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;
// Rounded integer mean.
329 meanValues[0] = uint8_t((sum + tPixels / 2u) / tPixels);
// Computes the rounded per-channel mean of tPixels interleaved 3-channel
// (e.g. RGB) uint8 pixels with NEON; vld3q_u8/vld3_u8 de-interleave the
// three channels into separate registers.
// NOTE(review): extraction-garbled -- the function signature, scoping braces,
// and the 'uint32_t results[4];' declaration are missing from this copy.
333 template <
unsigned int tPixels>
336 static_assert(tPixels >= 8u,
"Invalid pixels!");
338 constexpr
unsigned int tChannels = 3u;
340 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
// Same block partitioning scheme as the 1-channel variant; the partial
// 16-block needs remaining > 10 so the overlapping vld3q stays in bounds.
342 constexpr
unsigned int blocks16 = tPixels / 16u;
343 constexpr
unsigned int remainingAfterBlocks16 = tPixels % 16u;
345 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u && blocks16 >= 1u;
346 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
348 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
349 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
351 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
352 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
354 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
356 static_assert(blocks1 <= 2u,
"Invalid block size!");
// One widening accumulator per channel.
358 uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
359 uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
360 uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
362 uint32_t sumIndividual[3] = {0u};
364 for (
unsigned int n = 0u; n < blocks16; ++n)
366 const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer);
368 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[0]));
369 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[1]));
370 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[2]));
372 buffer += 16u * tChannels;
// Partial 16-block: overlapping backward load, leading lanes zeroed by mask.
375 if constexpr (partialBlock16)
377 static_assert(tPixels >= 16u,
"We need to guarantee that loading 16 pixels of worth of data preceding the end boundary cannot cause memory access violation");
379 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
380 ocean_assert(overlappingElements < 8u);
384 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));
386 const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer - overlappingElements * tChannels);
388 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[0], mask_u_8x16)));
389 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[1], mask_u_8x16)));
390 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[2], mask_u_8x16)));
392 buffer += remainingAfterBlocks16 * tChannels;
395 for (
unsigned int n = 0u; n < blocks8; ++n)
397 const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer);
399 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[0]));
400 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[1]));
401 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[2]));
403 buffer += 8u * tChannels;
// Partial 8-block: same overlapping-load + mask trick.
406 if constexpr (partialBlock8)
408 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
409 ocean_assert(overlappingElements < 8u);
411 const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);
413 const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer - overlappingElements * tChannels);
415 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[0], mask_u_8x8)));
416 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[1], mask_u_8x8)));
417 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[2], mask_u_8x8)));
419 buffer += remainingAfterBlocks8 * tChannels;
// At most two leftover pixels, summed per channel scalar-wise.
422 for (
unsigned int n = 0u; n < blocks1; ++n)
424 sumIndividual[0] += buffer[tChannels * n + 0u];
425 sumIndividual[1] += buffer[tChannels * n + 1u];
426 sumIndividual[2] += buffer[tChannels * n + 2u];
// Horizontal reductions; 'results' is a uint32_t[4] declared in the missing
// lines -- TODO restore.
430 vst1q_u32(results, sumChannel0_u_32x4);
432 const uint32_t sum0 = results[0] + results[1] + results[2] + results[3] + sumIndividual[0];
433 meanValues[0] = uint8_t((sum0 + tPixels / 2u) / tPixels);
435 vst1q_u32(results, sumChannel1_u_32x4);
437 const uint32_t sum1 = results[0] + results[1] + results[2] + results[3] + sumIndividual[1];
438 meanValues[1] = uint8_t((sum1 + tPixels / 2u) / tPixels);
440 vst1q_u32(results, sumChannel2_u_32x4);
442 const uint32_t sum2 = results[0] + results[1] + results[2] + results[3] + sumIndividual[2];
443 meanValues[2] = uint8_t((sum2 + tPixels / 2u) / tPixels);
// Generic (scalar) fallback: rounded per-channel mean of tPixels interleaved
// tChannels-channel uint8 pixels.
// NOTE(review): extraction-garbled -- the function signature and scoping
// braces are missing from this copy.
446 template <
unsigned int tChannels>
447 template <
unsigned int tPixels>
450 static_assert(tChannels >= 1u,
"Invalid channel number!");
451 static_assert(tPixels >= 1u,
"Invalid buffer size!");
453 ocean_assert(buffer !=
nullptr && meanValues !=
nullptr);
455 uint32_t sum[tChannels] = {0u};
// Accumulate each channel over all interleaved pixels.
457 for (
unsigned int n = 0u; n < tPixels; ++n)
459 for (
unsigned int c = 0u; c < tChannels; ++c)
461 sum[c] += buffer[n * tChannels + c];
// Rounded integer mean per channel.
465 for (
unsigned int c = 0u; c < tChannels; ++c)
467 meanValues[c] = uint8_t((sum[c] + tPixels / 2u) / tPixels);
// Computes the rounded mean of a tPatchSize x tPatchSize 1-channel uint8
// patch (row stride 'patchStrideElements') with NEON.
// NOTE(review): extraction-garbled -- the function signature and the scoping
// braces are missing from this copy; restore from the original header.
// FIX: the two masks below were built with vcombine_u16 applied to
// vcreate_u8 (uint8x8_t) operands and assigned to uint8x16_t -- a type
// mismatch; the correct intrinsic is vcombine_u8.
472 template <
unsigned int tPatchSize>
475 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
477 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
479 ocean_assert(patchStrideElements >= tPatchSize);
// Per-row partitioning: full 16-blocks, optional overlapping partial
// 16-block, 8-blocks, optional partial 8-block, up to two scalar pixels.
481 constexpr
unsigned int blocks16 = tPatchSize / 16u;
482 constexpr
unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
484 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u;
485 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
487 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
488 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
490 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
491 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
493 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
495 static_assert(blocks1 <= 2u,
"Invalid block size!");
497 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
499 uint32_t sumIndividual = 0u;
501 for (
unsigned int y = 0u; y < tPatchSize; ++y)
503 for (
unsigned int n = 0u; n < blocks16; ++n)
505 const uint8x16_t patch_u_8x16 = vld1q_u8(patch);
507 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
512 if constexpr (partialBlock16)
514 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
515 ocean_assert(overlappingElements < 8u);
// Not the last row: read forward past the row end (into the next row,
// valid memory) and mask off the trailing overlapping bytes.
517 if (y < tPatchSize - 1u)
521 constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
522 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
524 const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch), mask_u_8x16);
526 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
// Last row: read backward over already-summed bytes and mask off the
// leading overlapping bytes instead.
532 constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
533 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
535 const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch - overlappingElements), mask_u_8x16);
537 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
540 patch += remainingAfterBlocks16;
543 for (
unsigned int n = 0u; n < blocks8; ++n)
545 const uint8x8_t patch_u_8x8 = vld1_u8(patch);
547 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
// Partial 8-block: forward masked load mid-patch, backward on the last row.
552 if constexpr (partialBlock8)
554 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
555 ocean_assert(overlappingElements < 8u);
557 if (y < tPatchSize - 1u)
559 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
560 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
562 const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch), mask_u_8x8);
564 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
568 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
569 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
571 const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch - overlappingElements), mask_u_8x8);
573 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
576 patch += remainingAfterBlocks8;
// Up to two leftover pixels summed scalar-wise.
579 if constexpr (blocks1 != 0u)
581 for (
unsigned int n = 0u; n < blocks1; ++n)
583 sumIndividual += patch[n];
// FIX: restore the per-row advance over the scalar pixels (mirrors
// 'patch += blocks1 * tChannels' in the intact 3-channel variant below);
// without it every row after the first starts blocks1 pixels early.
patch += blocks1;
// Advance to the next row (skip the stride padding).
589 patch += patchStrideElements - tPatchSize;
// FIX: restore the horizontal-reduction buffer dropped by the extraction
// (same pattern as the intact SSD function below).
uint32_t results[4];
593 vst1q_u32(results, sum_u_32x4);
595 const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;
// Rounded mean over tPatchSize^2 pixels.
597 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Computes the rounded per-channel mean of a tPatchSize x tPatchSize
// 3-channel interleaved uint8 patch (row stride 'patchStrideElements')
// with NEON; vld3q_u8/vld3_u8 de-interleave the channels.
// NOTE(review): extraction-garbled -- the function signature and the scoping
// braces are missing from this copy; restore from the original header.
// FIX: the two masks below were built with vcombine_u16 applied to
// vcreate_u8 (uint8x8_t) operands and assigned to uint8x16_t -- a type
// mismatch; the correct intrinsic is vcombine_u8.
601 template <
unsigned int tPatchSize>
604 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
606 constexpr
unsigned int tChannels = 3u;
608 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
610 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
// Per-row partitioning; partial 16-block requires remaining > 10 so the
// overlapping vld3q stays within the row's valid memory.
612 constexpr
unsigned int blocks16 = tPatchSize / 16u;
613 constexpr
unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
615 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
616 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
618 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
619 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
621 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
622 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
624 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
626 static_assert(blocks1 <= 2u,
"Invalid block size!");
// One widening accumulator per channel.
628 uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
629 uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
630 uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
632 uint32_t sumIndividual[3] = {0u};
634 for (
unsigned int y = 0u; y < tPatchSize; ++y)
636 for (
unsigned int n = 0u; n < blocks16; ++n)
638 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
640 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[0]));
641 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[1]));
642 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[2]));
644 patch += 16u * tChannels;
647 if constexpr (partialBlock16)
649 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
650 ocean_assert(overlappingElements < 8u);
// Not the last row: read forward past the row end (into the next row,
// valid memory) and mask off the trailing overlapping pixels.
652 if (y < tPatchSize - 1u)
656 constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
657 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
659 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
661 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
662 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
663 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
// Last row: read backward over already-summed pixels and mask off the
// leading overlapping pixels instead.
669 constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
670 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
672 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch - overlappingElements * tChannels);
674 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
675 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
676 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
679 patch += remainingAfterBlocks16 * tChannels;
682 for (
unsigned int n = 0u; n < blocks8; ++n)
684 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
686 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(patch_u_8x8x3.val[0]));
687 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(patch_u_8x8x3.val[1]));
688 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(patch_u_8x8x3.val[2]));
690 patch += 8u * tChannels;
// Partial 8-block: forward masked load mid-patch, backward on the last row.
693 if constexpr (partialBlock8)
695 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
696 ocean_assert(overlappingElements < 8u);
698 if (y < tPatchSize - 1u)
700 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
701 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
703 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
705 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
706 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
707 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
711 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
712 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
714 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch - overlappingElements * tChannels);
716 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
717 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
718 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
721 patch += remainingAfterBlocks8 * tChannels;
// Up to two leftover pixels summed per channel scalar-wise.
724 if constexpr (blocks1 != 0u)
726 for (
unsigned int n = 0u; n < blocks1; ++n)
728 sumIndividual[0] += patch[tChannels * n + 0u];
729 sumIndividual[1] += patch[tChannels * n + 1u];
730 sumIndividual[2] += patch[tChannels * n + 2u];
733 patch += blocks1 * tChannels;
// Advance to the next row (skip the stride padding).
736 patch += patchStrideElements - tChannels * tPatchSize;
// FIX: restore the horizontal-reduction buffer dropped by the extraction
// (same pattern as the intact SSD function below).
uint32_t results[4];
740 vst1q_u32(results, sumChannel0_u_32x4);
742 const uint32_t sum0 = results[0] + results[1] + results[2] + results[3] + sumIndividual[0];
743 meanValues[0] = uint8_t((sum0 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
745 vst1q_u32(results, sumChannel1_u_32x4);
747 const uint32_t sum1 = results[0] + results[1] + results[2] + results[3] + sumIndividual[1];
748 meanValues[1] = uint8_t((sum1 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
750 vst1q_u32(results, sumChannel2_u_32x4);
752 const uint32_t sum2 = results[0] + results[1] + results[2] + results[3] + sumIndividual[2];
753 meanValues[2] = uint8_t((sum2 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Generic (scalar) fallback: rounded per-channel mean of a tPatchSize x
// tPatchSize interleaved tChannels-channel uint8 patch with row stride
// 'patchStrideElements'.
// NOTE(review): extraction-garbled -- the function signature and scoping
// braces are missing from this copy.
756 template <
unsigned int tChannels>
757 template <
unsigned int tPatchSize>
760 static_assert(tChannels >= 1u,
"Invalid channel number!");
761 static_assert(tPatchSize >= 1u,
"Invalid patch size!");
763 ocean_assert(patch !=
nullptr && meanValues !=
nullptr);
765 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
767 uint32_t sum[tChannels] = {0u};
// Accumulate each channel over all rows and columns.
769 for (
unsigned int y = 0u; y < tPatchSize; ++y)
771 for (
unsigned int x = 0u; x < tPatchSize; ++x)
773 for (
unsigned int n = 0u; n < tChannels; ++n)
775 sum[n] += patch[x * tChannels + n];
// Advance to the next row (stride includes padding).
779 patch += patchStrideElements;
// Rounded integer mean per channel over tPatchSize^2 pixels.
782 for (
unsigned int n = 0u; n < tChannels; ++n)
784 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Computes the rounded mean of a 1-channel tPatchSize x tPatchSize patch
// centered at (centerX, centerY), mirroring pixels outside the image via
// loadMirrored_u_8x16 / loadMirrored_u_8x8.
// NOTE(review): extraction-garbled -- the function signature, the scoping
// braces, the 'mirroredRow' row-pointer setup inside the y-loop, the
// mirrored-index computation in the blocks1 loop, and the 'uint32_t
// results[4];' declaration are all missing from this copy.
789 template <
unsigned int tPatchSize>
792 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
794 ocean_assert(image !=
nullptr && meanValues !=
nullptr);
795 ocean_assert(centerX < width && centerY < height);
797 constexpr
unsigned int tPatchSize_2 = tPatchSize / 2u;
799 const unsigned int imageStrideElements = width + imagePaddingElements;
// Same block partitioning scheme as the non-mirrored variants.
801 constexpr
unsigned int blocks16 = tPatchSize / 16u;
802 constexpr
unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
804 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
805 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
807 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
808 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
810 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
811 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
813 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
815 static_assert(blocks1 <= 7u,
"Invalid block size!");
817 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
819 uint32_t sumIndividual = 0u;
// Scratch buffer handed to the mirrored-load helpers.
821 uint8_t intermediate[16];
// y iterates over patch rows in image coordinates; may be negative or
// >= height (the row pointer is mirrored in the missing setup line).
823 for (
int y =
int(centerY) -
int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
827 int x = int(centerX) - int(tPatchSize_2);
829 for (
unsigned int n = 0u; n < blocks16; ++n)
831 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow, x, width, intermediate);
833 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
// Partial 16-block: tFront=true (forward overlap) except on the last row,
// where tFront=false avoids reading past the patch; overlapping lanes are
// zeroed by the helper (tOverlappingToZero=true).
838 if constexpr (partialBlock16)
840 if (y <
int(centerY) +
int(tPatchSize_2))
842 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);
844 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
848 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);
850 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
853 x += remainingAfterBlocks16;
856 for (
unsigned int n = 0u; n < blocks8; ++n)
858 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow, x, width, intermediate);
860 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
865 if constexpr (partialBlock8)
867 if (y <
int(centerY) +
int(tPatchSize_2))
869 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);
871 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
875 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);
877 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
880 x += remainingAfterBlocks8;
// Leftover pixels, read individually; 'index' is the mirrored column,
// computed in the missing lines -- TODO restore.
883 if constexpr (blocks1 != 0u)
885 for (
unsigned int n = 0u; n < blocks1; ++n)
889 sumIndividual += mirroredRow[index];
// Horizontal reduction; 'results' is a uint32_t[4] declared in the missing
// lines -- TODO restore.
897 vst1q_u32(results, sum_u_32x4);
899 const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;
// Rounded mean over tPatchSize^2 pixels.
901 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Generic (scalar) fallback: rounded per-channel mean of a mirrored-border
// patch centered at (centerX, centerY).
// NOTE(review): extraction-garbled -- the function signature, scoping braces,
// the mirrored row/pixel address computations inside the loops, and the
// per-channel accumulation statement are missing from this copy.
904 template <
unsigned int tChannels>
905 template <
unsigned int tPatchSize>
908 static_assert(tChannels >= 1u,
"Invalid channel number!");
909 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
911 ocean_assert(image !=
nullptr && meanValues !=
nullptr);
912 ocean_assert(centerX < width && centerY < height);
914 constexpr
unsigned int tPatchSize_2 = tPatchSize / 2u;
916 const unsigned int imageStrideElements = width * tChannels + imagePaddingElements;
918 uint32_t sum[tChannels] = {0u};
// y/x iterate in image coordinates and may leave the image; the mirrored
// addresses are computed in the missing lines -- TODO restore.
920 for (
int y =
int(centerY) -
int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
924 for (
int x =
int(centerX) -
int(tPatchSize_2); x <= int(centerX) + int(tPatchSize_2); ++x)
928 for (
unsigned int c = 0u; c < tChannels; ++c)
// Rounded integer mean per channel over tPatchSize^2 pixels.
935 for (
unsigned int n = 0u; n < tChannels; ++n)
937 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
// Computes the zero-mean SSD between two 1-channel uint8 buffers of tPixels
// pixels: sum over |(buffer0[i] - buffer1[i]) - (mean0 - mean1)|^2. The
// identity (a - m0) - (b - m1) == (a - b) - (m0 - m1) lets the code subtract
// the single broadcast mean difference instead of two per-buffer means.
// NOTE(review): extraction-garbled -- the function signature and scoping
// braces are missing from this copy.
942 template <
unsigned int tPixels>
945 static_assert(tPixels >= 8u,
"Invalid pixels!");
947 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
948 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Same block partitioning scheme as the mean functions above.
950 constexpr
unsigned int blocks16 = tPixels / 16u;
951 constexpr
unsigned int remainingAfterBlocks16 = tPixels % 16u;
953 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
954 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
956 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
957 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
959 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
960 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
962 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
964 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Broadcast mean difference (mean0 - mean1), fits in int16.
969 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
// Two accumulators to shorten the dependency chain on vmlal_u16.
971 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
972 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
974 uint32_t sumIndividual = 0u;
976 for (
unsigned int n = 0u; n < blocks16; ++n)
978 const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0);
979 const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1);
// Widening u8 subtraction; reinterpreted as signed 16-bit differences.
981 const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16)));
982 const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16)));
// |diff - meanDiff| via vabdq_s16; squared and accumulated by vmlal_u16.
984 const uint16x8_t buffer_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8));
985 const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));
987 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
988 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));
990 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
991 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));
// Partial 16-block: backward overlapping loads; the overlapped u8 elements
// all land in the low 8 lanes (overlap < 6 here), so only the low-halves'
// u16 difference lanes need masking -- the masks below zero exactly
// 'overlappingElements' leading 16-bit lanes.
997 if constexpr (partialBlock16)
999 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1000 ocean_assert(overlappingElements < 8u);
1002 const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0 - overlappingElements);
1003 const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1 - overlappingElements);
1005 const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16)));
1006 const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16)));
1008 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1009 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1011 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1013 const uint16x8_t buffer_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1014 const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));
1016 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
1017 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));
1019 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
1020 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));
1022 buffer0 += remainingAfterBlocks16;
1023 buffer1 += remainingAfterBlocks16;
1026 for (
unsigned int n = 0u; n < blocks8; ++n)
1028 const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0);
1029 const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1);
1031 const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8));
1033 const uint16x8_t buffer_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8));
1035 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
1036 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));
// Partial 8-block: same backward overlapping load + 16-bit lane mask.
1042 if constexpr (partialBlock8)
1044 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1045 ocean_assert(overlappingElements < 8u);
1047 const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0 - overlappingElements);
1048 const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1 - overlappingElements);
1050 const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8));
1052 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1053 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1055 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1057 const uint16x8_t buffer_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1059 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
1060 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));
1062 buffer0 += remainingAfterBlocks8;
1063 buffer1 += remainingAfterBlocks8;
// Up to two leftover pixels: scalar squared distance of zero-mean values
// (sqrDistance is a project helper defined elsewhere).
1066 if constexpr (blocks1 != 0u)
1068 for (
unsigned int n = 0u; n < blocks1; ++n)
1070 sumIndividual +=
sqrDistance(int16_t(buffer0[n] - meanValues0[0]), int16_t(buffer1[n] - meanValues1[0]));
// Combine both accumulators and reduce horizontally.
1074 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1076 uint32_t results[4];
1077 vst1q_u32(results, sum_u_32x4);
1079 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1083 template <
unsigned int tPixels>
// Computes the zero-mean sum of square differences (ZMSSD) between two
// interleaved 3-channel pixel buffers with NEON:
//   sum over pixels/channels of ((b0 - mean0) - (b1 - mean1))^2
// evaluated per lane as |(b0 - b1) - (mean0 - mean1)|^2, since
// (a - ma) - (b - mb) == (a - b) - (ma - mb).
// The tPixels pixels are partitioned at compile time into: full 16-pixel NEON
// blocks, optionally one masked overlapping 16-pixel block, full 8-pixel
// blocks, optionally one masked overlapping 8-pixel block, and at most two
// scalar tail pixels.
1086 static_assert(tPixels >= 8u,
"Invalid pixels!");
// This overload is specialized for exactly three interleaved channels (e.g. RGB).
1088 constexpr
unsigned int tChannels = 3u;
1090 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
1091 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Number of full 16-pixel blocks, and how many pixels remain afterwards.
1093 constexpr
unsigned int blocks16 = tPixels / 16u;
1094 constexpr
unsigned int remainingAfterBlocks16 = tPixels % 16u;
// A remainder of 11..15 pixels is handled with one extra 16-pixel load that
// re-reads already-processed pixels and masks their lanes to zero.
1096 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
1097 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
// The rest (0..10 pixels) is split into full 8-pixel blocks ...
1099 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1100 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
// ... an optional masked overlapping 8-pixel block for a remainder of 3..7 ...
1102 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1103 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
// ... and at most two scalar pixels handled without NEON.
1105 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
1107 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Per-channel difference of the two mean values, broadcast across all lanes.
1112 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1113 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1114 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
// Two independent 32-bit accumulators (low/high lane halves) to shorten the
// dependency chain; they are added together at the end.
1116 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1117 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// Scalar accumulator for the <= 2 tail pixels.
1119 uint32_t sumIndividual = 0u;
// Full 16-pixel blocks: vld3q_u8 de-interleaves the 3 channels into separate registers.
1121 for (
unsigned int n = 0u; n < blocks16; ++n)
1123 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0);
1124 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1);
// Widen to 16 bit and subtract; the u16 wrap-around reinterpreted as s16
// yields the correct signed difference in [-255, 255].
1126 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0])))
1127 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0])));
1129 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1130 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1132 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1133 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
// |difference - meanDifference| per lane; the result is <= 510 and fits u16.
1136 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8));
1137 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1139 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1140 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1142 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1143 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
// Square each absolute difference and accumulate into 32-bit lanes.
1146 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1147 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1148 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1149 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1151 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1152 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1153 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1154 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1156 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1157 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1158 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1159 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1162 buffer0 += 16u * tChannels;
1163 buffer1 += 16u * tChannels;
// Masked overlapping 16-pixel block for an 11..15 pixel remainder: the load
// is shifted back so it re-reads 'overlappingElements' pixels already handled
// above; their lanes are zeroed via the mask below.
1166 if constexpr (partialBlock16)
1168 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1169 ocean_assert(overlappingElements < 8u);
1171 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0 - overlappingElements * tChannels);
1172 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1 - overlappingElements * tChannels);
1175 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0])));
1176 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0])));
1178 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1179 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1181 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1182 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
// Build a 16-bit lane mask that zeroes the first 'overlappingElements' lanes
// (little-endian: a left shift of the 64-bit half clears its low lanes).
1185 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1186 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1188 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
// Only the low half of each channel can contain overlapped lanes
// (overlappingElements < 8), so only the low half is masked.
1191 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1192 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1194 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1195 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1197 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1198 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1201 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1202 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1203 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1204 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1206 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1207 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1208 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1209 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1211 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1212 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1213 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1214 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1216 buffer0 += remainingAfterBlocks16 * tChannels;
1217 buffer1 += remainingAfterBlocks16 * tChannels;
// Full 8-pixel blocks (same scheme as above with 64-bit channel registers).
1220 for (
unsigned int n = 0u; n < blocks8; ++n)
1222 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0);
1223 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1);
1225 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0]));
1226 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1227 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1229 const uint16x8_t bufferChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8));
1230 const uint16x8_t bufferChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1231 const uint16x8_t bufferChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1233 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1234 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1236 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1237 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1239 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1240 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1242 buffer0 += 8u * tChannels;
1243 buffer1 += 8u * tChannels;
// Masked overlapping 8-pixel block for a 3..7 pixel remainder.
1246 if constexpr (partialBlock8)
1248 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1249 ocean_assert(overlappingElements < 8u);
1251 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0 - overlappingElements * tChannels);
1252 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1 - overlappingElements * tChannels);
1254 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0]));
1255 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1256 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
// Zero the first 'overlappingElements' 16-bit lanes (already processed pixels).
1258 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1259 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1261 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1263 const uint16x8_t bufferChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1264 const uint16x8_t bufferChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1265 const uint16x8_t bufferChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1267 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1268 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1270 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1271 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1273 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1274 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1276 buffer0 += remainingAfterBlocks8 * tChannels;
1277 buffer1 += remainingAfterBlocks8 * tChannels;
// Scalar tail: at most two remaining pixels, all channels.
// sqrDistance() is a project helper — presumably (a - b)^2; confirm in Ocean base.
1280 if constexpr (blocks1 != 0u)
1282 for (
unsigned int n = 0u; n < blocks1; ++n)
1284 for (
unsigned int c = 0u; c < tChannels; ++c)
1286 sumIndividual +=
sqrDistance(int16_t(buffer0[n * tChannels + c] - meanValues0[c]), int16_t(buffer1[n * tChannels + c] - meanValues1[c]));
1290 buffer0 += blocks1 * tChannels;
1291 buffer1 += blocks1 * tChannels;
// Horizontal reduction of both vector accumulators plus the scalar tail.
1294 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1296 uint32_t results[4];
1297 vst1q_u32(results, sum_u_32x4);
1299 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1302 template <
unsigned int tChannels>
1303 template <
unsigned int tPixels>
// Generic scalar fallback: zero-mean sum of square differences between two
// interleaved buffers with tChannels channels and tPixels pixels.
// For each pixel x and channel c it accumulates
//   sqrDistance(buffer0[x*tChannels+c] - meanValues0[c],
//               buffer1[x*tChannels+c] - meanValues1[c]).
// sqrDistance() is a project helper — presumably the squared difference of
// its two arguments; confirm in Ocean base.
1306 static_assert(tChannels >= 1u,
"Invalid channel number!");
1307 static_assert(tPixels >= 1u,
"Invalid pixels!");
1309 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
1310 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Running ZMSSD accumulator; uint32_t matches the NEON overloads' return type.
1312 uint32_t zmssd = 0u;
1314 for (
unsigned int x = 0u; x < tPixels; ++x)
1316 for (
unsigned int c = 0u; c < tChannels; ++c)
// NOTE(review): unlike the NEON tails, the operands are not cast to int16_t
// here; integer promotion of the uint8_t operands makes the subtraction
// signed, so the result should be equivalent — verify against sqrDistance's
// parameter types.
1318 zmssd +=
sqrDistance(buffer0[x * tChannels + c] - meanValues0[c], buffer1[x * tChannels + c] - meanValues1[c]);
1326 template <
unsigned int tPatchSize>
// Computes the ZMSSD between two single-channel square patches of
// tPatchSize x tPatchSize pixels, row by row, using NEON.
// Each row is partitioned like the buffer variant: full 16-pixel blocks,
// optional masked overlapping 16-pixel block, full 8-pixel blocks, optional
// masked overlapping 8-pixel block, and at most two scalar pixels.
// Rows may be non-contiguous: patch*StrideElements gives the distance
// (in elements) between consecutive rows.
1329 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1331 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1332 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Strides must cover at least one full row of the patch.
1334 ocean_assert(patch0StrideElements >= tPatchSize);
1335 ocean_assert(patch1StrideElements >= tPatchSize);
// Compile-time partition of one row (tPatchSize pixels), same scheme as the
// buffer variants above.
1337 constexpr
unsigned int blocks16 = tPatchSize / 16u;
1338 constexpr
unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1340 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
1341 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1343 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1344 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1346 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1347 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1349 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
1351 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Difference of the two patch means, broadcast to all lanes; the per-lane
// result |(p0 - p1) - (mean0 - mean1)| equals |(p0 - mean0) - (p1 - mean1)|.
1356 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
// Two independent 32-bit accumulators (shorter dependency chain), added at the end.
1358 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1359 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1361 uint32_t sumIndividual = 0u;
1363 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full 16-pixel blocks of the current row.
1365 for (
unsigned int n = 0u; n < blocks16; ++n)
1367 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
1368 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);
// Widen to 16 bit and subtract; u16 wrap-around reinterpreted as s16 yields
// the correct signed difference in [-255, 255].
1370 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1371 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1373 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1374 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1376 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1377 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1379 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1380 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Masked overlapping 16-pixel block for an 11..15 pixel row remainder.
1386 if constexpr (partialBlock16)
1388 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1389 ocean_assert(overlappingElements < 8u);
// For all rows but the last, the over-read extends forward into the next
// row's memory (safe within the image), and the surplus HIGH lanes are
// masked out with right-shift masks.
1391 if (y < tPatchSize - 1u)
1393 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
1394 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);
1396 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1397 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
// Zero the top 'overlappingElements' 16-bit lanes (a right shift of the
// 64-bit half clears its high lanes on little-endian).
1402 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1403 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1405 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1407 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1408 const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1410 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1411 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1413 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1414 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Last row: no next row to read into, so the load is shifted back instead and
// the already-processed LOW lanes are masked out with left-shift masks.
1418 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0 - overlappingElements);
1419 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1 - overlappingElements);
1421 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1422 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1424 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1425 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1427 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1429 const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1430 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1432 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1433 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1435 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1436 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1439 patch0 += remainingAfterBlocks16;
1440 patch1 += remainingAfterBlocks16;
// Full 8-pixel blocks of the current row.
1443 for (
unsigned int n = 0u; n < blocks8; ++n)
1445 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
1446 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);
1448 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
1450 const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8));
1452 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1453 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
// Masked overlapping 8-pixel block for a 3..7 pixel row remainder; the same
// forward-read-for-inner-rows / backward-read-for-last-row strategy as above.
1459 if constexpr (partialBlock8)
1461 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1462 ocean_assert(overlappingElements < 8u);
1464 if (y < tPatchSize - 1u)
1466 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0)
1467 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);
1469 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
// Zero the top 'overlappingElements' lanes (pixels that belong to the next row).
1471 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1472 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1474 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1476 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1478 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1479 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
// Last row: shifted-back load, zero the already-processed low lanes.
1483 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0 - overlappingElements);
1484 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1 - overlappingElements);
1486 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
1488 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1489 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1491 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1493 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1495 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1496 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
1499 patch0 += remainingAfterBlocks8;
1500 patch1 += remainingAfterBlocks8;
// Scalar tail: at most two remaining pixels of the row.
// sqrDistance() is a project helper — presumably (a - b)^2; confirm in Ocean base.
1503 if constexpr (blocks1 != 0u)
1505 for (
unsigned int n = 0u; n < blocks1; ++n)
1507 sumIndividual +=
sqrDistance(int16_t(patch0[n] - meanValues0[0]), int16_t(patch1[n] - meanValues1[0]));
// Advance both pointers over the row padding to the start of the next row.
1514 patch0 += patch0StrideElements - tPatchSize;
1515 patch1 += patch1StrideElements - tPatchSize;
// Horizontal reduction of both vector accumulators plus the scalar tail.
1518 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1520 uint32_t results[4];
1521 vst1q_u32(results, sum_u_32x4);
1523 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1527 template <
unsigned int tPatchSize>
1530 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1532 constexpr
unsigned int tChannels = 3u;
1534 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1535 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
1537 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1538 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1540 constexpr
unsigned int blocks16 = tPatchSize / 16u;
1541 constexpr
unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1543 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
1544 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1546 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1547 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1549 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1550 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1552 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
1554 static_assert(blocks1 <= 2u,
"Invalid block size!");
1559 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1560 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1561 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
1563 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1564 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1566 uint32_t sumIndividual = 0u;
1568 for (
unsigned int y = 0u; y < tPatchSize; ++y)
1570 for (
unsigned int n = 0u; n < blocks16; ++n)
1572 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1573 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1575 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0])));
1576 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0])));
1578 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1579 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1581 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1582 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1585 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8));
1586 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1588 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1589 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1591 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1592 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1595 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1596 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1597 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1598 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1600 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1601 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1602 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1603 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1605 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1606 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1607 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1608 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1611 patch0 += 16u * tChannels;
1612 patch1 += 16u * tChannels;
1615 if constexpr (partialBlock16)
1617 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1618 ocean_assert(overlappingElements < 8u);
1620 if (y < tPatchSize - 1u)
1622 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1623 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1626 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0])));
1627 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0])));
1629 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1630 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1632 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1633 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1639 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1640 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1642 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1645 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8));
1646 const uint16x8_t patchChannel0_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1648 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1649 const uint16x8_t patchChannel1_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1651 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1652 const uint16x8_t patchChannel2_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1655 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1656 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1657 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1658 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1660 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1661 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1662 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1663 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1665 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1666 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1667 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1668 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1672 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0 - overlappingElements * tChannels);
1673 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1 - overlappingElements * tChannels);
1676 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0])));
1677 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0])));
1679 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1680 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1682 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1683 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1686 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1687 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1689 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1692 const uint16x8_t patchChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1693 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1695 const uint16x8_t patchChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1696 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1698 const uint16x8_t patchChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1699 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1702 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1703 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1704 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1705 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1707 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1708 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1709 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1710 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1712 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1713 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1714 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1715 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1718 patch0 += remainingAfterBlocks16 * tChannels;
1719 patch1 += remainingAfterBlocks16 * tChannels;
1722 for (
unsigned int n = 0u; n < blocks8; ++n)
1724 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1725 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1727 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0]));
1728 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1729 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1731 const uint16x8_t patchChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8));
1732 const uint16x8_t patchChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1733 const uint16x8_t patchChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1735 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1736 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1738 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1739 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1741 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1742 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1744 patch0 += 8u * tChannels;
1745 patch1 += 8u * tChannels;
1748 if constexpr (partialBlock8)
1750 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1751 ocean_assert(overlappingElements < 8u);
1753 if (y < tPatchSize - 1u)
1755 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1756 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1758 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0]));
1759 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1760 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1762 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1763 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1765 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1767 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1768 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1769 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1771 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1772 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1774 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1775 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1777 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1778 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1782 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0 - overlappingElements * tChannels);
1783 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1 - overlappingElements * tChannels);
1785 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0]));
1786 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1787 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1789 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1790 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1792 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1794 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1795 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1796 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1798 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1799 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1801 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1802 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1804 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1805 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1808 patch0 += remainingAfterBlocks8 * tChannels;
1809 patch1 += remainingAfterBlocks8 * tChannels;
1812 if constexpr (blocks1 != 0u)
1814 for (
unsigned int n = 0u; n < blocks1; ++n)
1816 for (
unsigned int c = 0u; c < tChannels; ++c)
1818 sumIndividual +=
sqrDistance(int16_t(patch0[n * tChannels + c] - meanValues0[c]), int16_t(patch1[n * tChannels + c] - meanValues1[c]));
1822 patch0 += blocks1 * tChannels;
1823 patch1 += blocks1 * tChannels;
1826 patch0 += patch0StrideElements - tPatchSize * tChannels;
1827 patch1 += patch1StrideElements - tPatchSize * tChannels;
1830 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1832 uint32_t results[4];
1833 vst1q_u32(results, sum_u_32x4);
1835 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1838 template <
unsigned int tChannels>
1839 template <
unsigned int tPatchSize>
// Scalar (non-NEON) fallback computing the zero-mean sum of square differences (ZMSSD)
// between two patches of tPatchSize x tPatchSize pixels with tChannels channels each:
// for every pixel/channel the per-channel mean is subtracted before squaring the difference.
// NOTE(review): the function signature line is not visible in this fragment; per the class
// declaration it is patch8BitPerChannel(patch0, patch1, patch0StrideElements,
// patch1StrideElements, meanValues0, meanValues1) -- confirm against the original file.
1842 static_assert(tChannels >= 1u,
"Invalid channel number!");
1843 static_assert(tPatchSize >= 1u,
"Invalid patch size!");
1845 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
1846 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
// Each patch row must provide at least tChannels * tPatchSize elements.
1848 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1849 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// Running ZMSSD accumulator over all rows, columns, and channels.
1851 uint32_t zmssd = 0u;
1853 for (
unsigned int y = 0u; y < tPatchSize; ++y)
1855 for (
unsigned int x = 0u; x < tPatchSize; ++x)
1857 for (
unsigned int n = 0u; n < tChannels; ++n)
// sqrDistance() squares the difference of the two mean-adjusted pixel values.
1859 zmssd +=
sqrDistance(patch0[x * tChannels + n] - meanValues0[n], patch1[x * tChannels + n] - meanValues1[n]);
// Advance both patches to the next row; the stride includes any padding elements.
// NOTE(review): the trailing 'return zmssd;' lies outside the visible lines of this fragment.
1863 patch0 += patch0StrideElements;
1864 patch1 += patch1StrideElements;
// NEON-accelerated ZMSSD between two single-channel patches whose centers may lie close to
// the image borders; out-of-image coordinates are handled by mirroring (see the
// loadMirrored_* helpers used below).
1871 template <
unsigned int tPatchSize>
1872 inline uint32_t
ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patchMirroredBorder8BitPerChannel(
const uint8_t*
const image0,
const uint8_t*
const image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1)
// Odd patch sizes only (a unique center pixel is required), and at least 5 pixels wide.
1874 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
1875 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
1877 constexpr
unsigned int tPatchSize_2 = tPatchSize / 2u;
1879 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
1880 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
1882 ocean_assert(centerX0 < width0 && centerY0 < height0);
1883 ocean_assert(centerX1 < width1 && centerY1 < height1);
// Row strides in elements (single channel, so width plus padding).
1885 const unsigned int image0StrideElements = width0 + image0PaddingElements;
1886 const unsigned int image1StrideElements = width1 + image1PaddingElements;
// Compile-time decomposition of each tPatchSize-pixel row into full 16-pixel NEON blocks,
// an optional masked partial 16-block, full 8-pixel blocks, an optional masked partial
// 8-block, and at most two leftover pixels handled with scalar code.
1888 constexpr
unsigned int blocks16 = tPatchSize / 16u;
1889 constexpr
unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1891 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
1892 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1894 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1895 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1897 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1898 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1900 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
1902 static_assert(blocks1 <= 2u,
"Invalid block size!");
// Constant difference of the two patch means, broadcast to all eight 16-bit lanes.
1907 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
// Two 32-bit accumulators (combined at the end) for the squared, mean-adjusted differences.
1909 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1910 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// Scalar accumulator for the leftover pixels not covered by the vectorized blocks.
1912 uint32_t sumIndividual = 0u;
// Scratch buffer used by loadMirrored_* when pixels must be gathered across the mirrored border.
1914 uint8_t intermediate[16];
// y1 tracks the current row in image1 while y0 iterates the rows of image0.
// NOTE(review): the per-row pointers mirroredRow0/mirroredRow1 referenced below are set up
// in lines not visible in this fragment -- presumably from image*StrideElements and the
// (mirrored) row indices; confirm against the original file.
1916 int y1 = int(centerY1) - int(tPatchSize_2);
1917 for (
int y0 =
int(centerY0) -
int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
1922 int x0 = int(centerX0) - int(tPatchSize_2);
1923 int x1 = int(centerX1) - int(tPatchSize_2);
// Full 16-pixel blocks: load mirrored pixels, widen the pixel differences to 16 bit,
// take |diff - meanDiff| via vabdq_s16, and accumulate the squares with vmlal_u16.
1925 for (
unsigned int n = 0u; n < blocks16; ++n)
1927 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow0, x0, width0, intermediate);
1928 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow1, x1, width1, intermediate);
1930 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1931 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1933 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1934 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1936 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1937 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1939 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1940 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Partial 16-block: a full 16-pixel vector is processed although only
// remainingAfterBlocks16 pixels are new; the duplicated 16-bit lanes are zeroed via a
// compile-time mask so they do not contribute to the sums.
1946 if constexpr (partialBlock16)
1948 constexpr
unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1949 ocean_assert(overlappingElements < 8u);
// Not the last row: load forward from the current position and mask off the trailing
// overlapping lanes (masks shift right, zeroing the high end).
1951 if (y0 <
int(centerY0) +
int(tPatchSize_2))
1953 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
1954 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);
1956 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1957 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1962 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1963 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1965 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1967 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8));
1968 const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1970 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1971 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1973 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1974 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Last row: load shifted back by overlappingElements (tFront == false) and mask off the
// leading overlapping lanes instead (masks shift left, zeroing the low end).
1978 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
1979 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);
1981 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16)));
1982 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16)));
1984 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1985 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1987 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1989 const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1990 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1992 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1993 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1995 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1996 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
// Only the genuinely new pixels advance the positions.
1999 x0 += remainingAfterBlocks16;
2000 x1 += remainingAfterBlocks16;
// Full 8-pixel blocks, same scheme as the 16-pixel blocks above.
2003 for (
unsigned int n = 0u; n < blocks8; ++n)
2005 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow0, x0, width0, intermediate);
2006 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow1, x1, width1, intermediate);
2008 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
2010 const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8));
2012 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2013 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
// Partial 8-block with overlap masking, mirroring the partial 16-block logic above.
2019 if constexpr (partialBlock8)
2021 constexpr
unsigned int overlappingElements = 8u - remainingAfterBlocks8;
2022 ocean_assert(overlappingElements < 8u);
2024 if (y0 <
int(centerY0) +
int(tPatchSize_2))
2026 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
2027 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);
2029 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
2031 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
2032 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
2034 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
2036 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
2038 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2039 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
2043 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
2044 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);
2046 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8));
2048 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
2049 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
2051 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
2053 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
2055 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2056 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
2059 x0 += remainingAfterBlocks8;
2060 x1 += remainingAfterBlocks8;
// Scalar tail: at most two leftover pixels per row.
// NOTE(review): the computation of index0/index1 (presumably the mirrored in-row indices
// for x0 + n and x1 + n) is in lines not visible in this fragment -- confirm.
2063 if constexpr (blocks1 != 0u)
2065 for (
unsigned int n = 0u; n < blocks1; ++n)
2070 sumIndividual +=
sqrDistance(int16_t(mirroredRow0[index0] - meanValues0[0]), int16_t(mirroredRow1[index1] - meanValues1[0]));
// Horizontal reduction: combine both vector accumulators and add the scalar remainder.
2077 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
2079 uint32_t results[4];
2080 vst1q_u32(results, sum_u_32x4);
2082 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
// Generic (scalar) ZMSSD between two multi-channel patches whose centers may lie near the
// image borders; coordinates outside the image are mirrored back inside.
2085 template <
unsigned int tChannels>
2086 template <
unsigned int tPatchSize>
2087 inline uint32_t
ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::patchMirroredBorder8BitPerChannel(
const uint8_t*
const image0,
const uint8_t*
const image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements,
const uint8_t*
const meanValues0,
const uint8_t*
const meanValues1)
2089 static_assert(tChannels >= 1u,
"Invalid channel number!");
// Odd patch size required so the patch has a unique center pixel.
2090 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
2092 constexpr
unsigned int tPatchSize_2 = tPatchSize / 2u;
2094 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
2095 ocean_assert(meanValues0 !=
nullptr && meanValues1 !=
nullptr);
2097 ocean_assert(centerX0 < width0 && centerY0 < height0);
2098 ocean_assert(centerX1 < width1 && centerY1 < height1);
// Row strides in elements; each pixel occupies tChannels elements.
2100 const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
2101 const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
2103 uint32_t zmssd = 0u;
// Iterate the (possibly out-of-image) patch coordinates; y1/x1 track image1 while
// y0/x0 iterate image0.
// NOTE(review): the lines deriving the per-pixel pointers pixel0/pixel1 (presumably via
// mirrored row/column indices and the strides above) and the final 'return zmssd;' are not
// visible in this fragment -- confirm against the original file.
2105 int y1 = int(centerY1) - int(tPatchSize_2);
2106 for (
int y0 =
int(centerY0) -
int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
2111 int x1 = int(centerX1) - int(tPatchSize_2);
2112 for (
int x0 =
int(centerX0) -
int(tPatchSize_2); x0 <= int(centerX0) + int(tPatchSize_2); ++x0)
// Accumulate the squared, mean-adjusted difference for every channel of the pixel pair.
2117 for (
unsigned int c = 0u; c < tChannels; ++c)
2119 zmssd +=
sqrDistance(pixel0[c] - meanValues0[c], pixel1[c] - meanValues1[c]);
// Public entry point: ZMSSD between two contiguous buffers of tPixels pixels with tChannels
// channels. Computes the per-channel mean of each buffer first.
// NOTE(review): the signature line and the delegating return statement (presumably
// forwarding to SpecializedForChannels<tChannels>::buffer8BitPerChannel<tPixels> with the
// two mean arrays) are not visible in this fragment -- confirm against the original file.
2131 template <
unsigned int tChannels,
unsigned int tPixels>
2134 static_assert(tChannels >= 1u,
"Invalid channel number!");
2135 static_assert(tPixels >= 8u,
"Invalid pixels!");
2137 ocean_assert(buffer0 !=
nullptr && buffer1 !=
nullptr);
// Per-channel means of both buffers, consumed by the zero-mean difference computation.
2139 uint8_t meanValues0[tChannels];
2140 mean8BitPerChannel<tChannels, tPixels>(buffer0, meanValues0);
2142 uint8_t meanValues1[tChannels];
2143 mean8BitPerChannel<tChannels, tPixels>(buffer1, meanValues1);
// Public entry point: ZMSSD between two tPatchSize x tPatchSize patches with row strides.
// Computes the per-channel mean of each patch first.
// NOTE(review): the signature line and the delegating return statement (presumably
// forwarding to SpecializedForChannels<tChannels>::patch8BitPerChannel<tPatchSize>) are not
// visible in this fragment -- confirm against the original file.
2148 template <
unsigned int tChannels,
unsigned int tPatchSize>
2151 static_assert(tChannels >= 1u,
"Invalid channel number!");
2152 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
2154 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
// Each patch row must provide at least tChannels * tPatchSize elements.
2156 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
2157 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// Per-channel means of both patches, consumed by the zero-mean difference computation.
2159 uint8_t meanValues0[tChannels];
2160 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2162 uint8_t meanValues1[tChannels];
2163 mean8BitPerChannel<tChannels, tPatchSize>(patch1, patch1StrideElements, meanValues1);
// Public entry point: ZMSSD between a strided patch (patch0) and a contiguous buffer
// (buffer1) holding tPatchSize * tPatchSize pixels; the buffer is treated as a patch with
// the compact stride defined below.
// NOTE(review): the signature line and the delegating return statement are not visible in
// this fragment -- confirm against the original file.
2168 template <
unsigned int tChannels,
unsigned int tPatchSize>
2171 static_assert(tChannels >= 1u,
"Invalid channel number!");
2172 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
2174 ocean_assert(patch0 !=
nullptr && buffer1 !=
nullptr);
2176 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
// Mean of the strided patch vs. mean of the contiguous buffer (tPatchSize^2 pixels).
2178 uint8_t meanValues0[tChannels];
2179 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2181 uint8_t meanValues1[tChannels];
2182 mean8BitPerChannel<tChannels, tPatchSize * tPatchSize>(buffer1, meanValues1);
// The buffer has no padding, so its row stride is exactly one patch row.
2184 constexpr
unsigned int patch1StrideElements = tChannels * tPatchSize;
// Public entry point: ZMSSD between two patches centered at (centerX, centerY) in each
// image, mirroring pixel coordinates that fall outside the images, then delegating to the
// channel-specialized implementation.
2189 template <
unsigned int tChannels,
unsigned int tPatchSize>
2190 uint32_t
ZeroMeanSumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(
const uint8_t*
const image0,
const uint8_t*
const image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements)
2192 static_assert(tChannels >= 1u,
"Invalid channel number!");
2193 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
2195 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
// Per-channel means of both mirrored patches.
// NOTE(review): the calls filling meanValues0/meanValues1 (presumably
// mean8BitPerChannelMirroredBorder<...> per the declaration above) are in lines not visible
// in this fragment -- confirm against the original file.
2197 uint8_t meanValues0[tChannels];
2200 uint8_t meanValues1[tChannels];
2203 return SpecializedForChannels<tChannels>::template patchMirroredBorder8BitPerChannel<tPatchSize>(image0, image1, width0, height0, width1, height1, centerX0, centerY0, centerX1, centerY1, image0PaddingElements, image1PaddingElements, meanValues0, meanValues1);
// NOTE(review): only the compile-time parameter checks of this function are visible in this
// fragment; its signature and body lie outside the extracted lines. Judging by the template
// parameters (tChannels, tPixels) it is likely the buffer variant of mean8BitPerChannel
// declared above -- confirm against the original file.
2206 template <
unsigned int tChannels,
unsigned int tPixels>
2209 static_assert(tChannels >= 1u,
"Invalid channel number!");
// NOTE(review): the message says "patch size" although the check is on tPixels (the buffer
// overload above uses "Invalid pixels!") -- likely a copy/paste slip in the assert text.
2210 static_assert(tPixels >= 8u,
"Invalid patch size!");
// NOTE(review): only the compile-time parameter checks of this function are visible in this
// fragment; its signature and body lie outside the extracted lines. Judging by the template
// parameters (tChannels, tPatchSize) it is likely the patch variant of mean8BitPerChannel
// declared above -- confirm against the original file.
2215 template <
unsigned int tChannels,
unsigned int tPatchSize>
2218 static_assert(tChannels >= 1u,
"Invalid channel number!");
2219 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
// Loads 8 uint8 pixels starting at the (signed) position x of a single-channel row of
// 'width' pixels, mirroring any coordinate outside [0, width) back into the row.
// tFront: if true the tPixels valid pixels occupy the front lanes of the result, otherwise
// the back lanes. tOverlappingToZero: if true the 8 - tPixels unused lanes are zeroed.
// NOTE(review): the signature line itself (circa original line 2226) is not visible in this
// fragment; the call sites above name this function loadMirrored_u_8x8 -- confirm.
2224 template <
bool tFront,
unsigned int tPixels,
bool tOverlappingToZero>
2227 ocean_assert(tPixels >= 1u && tPixels <= 8u);
2229 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
2231 constexpr
unsigned int tOverlappingElements = 8u - tPixels;
// Fast path: the whole 8-byte load lies inside the row, so no mirroring is needed.
2233 if (x >= 0 && x <=
int(width) -
int(tPixels))
2235 if constexpr (tPixels == 8u)
2237 return vld1_u8(row + x);
2241 if constexpr (tFront)
2243 if constexpr (tOverlappingToZero)
// Keep the first tPixels bytes, zero the trailing 8 - tPixels bytes via the mask.
2245 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
2246 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
2248 return vand_u8(vld1_u8(row + x), mask_u_8x8);
2252 return vld1_u8(row + x);
2257 if constexpr (tOverlappingToZero)
// Back-aligned variant: shift the load left so the valid pixels land in the high
// lanes and zero the leading 8 - tPixels bytes.
2259 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
2260 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
2262 return vand_u8(vld1_u8(row + x -
int(tOverlappingElements)), mask_u_8x8);
2266 return vld1_u8(row + x -
int(tOverlappingElements));
// Slow path: at least one pixel lies outside the row; gather the mirrored pixels one by
// one into intermediateBuffer and load the vector from there.
// NOTE(review): the computation of mirroredIndex (the mirrored in-row index for x + n) is
// in lines not visible in this fragment -- confirm against the original file.
2272 if constexpr (tFront)
2274 for (
unsigned int n = 0u; n < tPixels; ++n)
2277 ocean_assert(mirroredIndex < width);
2279 intermediateBuffer[n] = row[mirroredIndex];
2282 if constexpr (tOverlappingToZero)
2284 for (
unsigned int n = tPixels; n < 8u; ++n)
2286 intermediateBuffer[n] = 0u;
2292 if constexpr (tOverlappingToZero)
2294 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
2296 intermediateBuffer[n] = 0u;
2300 for (
unsigned int n = 0u; n < tPixels; ++n)
2303 ocean_assert(mirroredIndex < width);
2305 intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
2309 return vld1_u8(intermediateBuffer);
2312 template <
bool tFront,
unsigned int tPixels,
bool tOverlappingToZero>
2315 ocean_assert(tPixels > 8u && tPixels <= 16u);
2317 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
2319 constexpr
unsigned int tOverlappingElements = 16u - tPixels;
2321 if (x >= 0 && x <=
int(width) -
int(tPixels))
2323 if constexpr (tPixels == 16u)
2325 return vld1q_u8(row + x);
2329 if constexpr (tFront)
2331 if constexpr (tOverlappingToZero)
2333 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
2334 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
2336 return vandq_u8(vld1q_u8(row + x), mask_u_8x16);
2340 return vld1q_u8(row + x);
2345 if constexpr (tOverlappingToZero)
2347 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
2348 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
2350 return vandq_u8(vld1q_u8(row + x -
int(tOverlappingElements)), mask_u_8x16);
2354 return vld1q_u8(row + x -
int(tOverlappingElements));
2360 if constexpr (tFront)
2362 for (
unsigned int n = 0u; n < tPixels; ++n)
2365 ocean_assert(mirroredIndex < width);
2367 intermediateBuffer[n] = row[mirroredIndex];
2370 if constexpr (tOverlappingToZero)
2372 for (
unsigned int n = tPixels; n < 16u; ++n)
2374 intermediateBuffer[n] = 0u;
2380 if constexpr (tOverlappingToZero)
2382 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
2384 intermediateBuffer[n] = 0u;
2388 for (
unsigned int n = 0u; n < tPixels; ++n)
2391 ocean_assert(mirroredIndex < width);
2393 intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
2397 return vld1q_u8(intermediateBuffer);
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int index, const unsigned int elements)
Returns the mirrored index for a given index.
Definition: CVUtilities.h:456
This class allows to specialize functions for individual channels.
Definition: ZeroMeanSumSquareDifferencesNEON.h:39
static void mean8BitPerChannelMirroredBorder(const uint8_t *const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t *const meanValues)
Determines the mean value for an image patch, one value for each channel, patch pixels outside the im...
Definition: ZeroMeanSumSquareDifferencesNEON.h:906
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesNEON.h:1840
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image,...
Definition: ZeroMeanSumSquareDifferencesNEON.h:2087
static void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesNEON.h:448
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesNEON.h:1304
This class implements functions to calculate the zero-mean sum of square differences using NEON instructions.
Definition: ZeroMeanSumSquareDifferencesNEON.h:30
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the zero-mean sum of square differences between two patches within an image,...
Definition: ZeroMeanSumSquareDifferencesNEON.h:2190
static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2207
static uint32_t patch8BitPerChannel(const uint8_t *const patch0, const uint8_t *const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2149
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2132
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2225
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the zero-mean sum of square differences between an image patch and a buffer.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2169
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2313
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15