139 static_assert(tSize >= 1u,
"Invalid buffer size!");
141 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
142 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
146 constexpr unsigned int blocks16 = tSize / 16u;
148 for (
unsigned int n = 0u; n < blocks16; ++n)
151 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
153 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
154 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
157 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
158 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
160 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
161 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
169 constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
170 static_assert(blocks8 <= 1u,
"Invalid number of blocks!");
175 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(buffer0), vld1_u8(buffer1));
178 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
180 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
186 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
188 constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
189 static_assert(remainingElements < 8u,
"Invalid number of remaining elements!");
195 for (
unsigned int n = 0u; n < remainingElements; ++n)
// Fragment (extraction artifact: original file line numbers are fused into the text and
// braces/blank lines are stripped): body of a NEON-accelerated sum-of-square-differences
// (SSD) over two tChannels x tPatchSize 8-bit patches. The enclosing template header,
// signature (patch0/patch1 pointers and their element strides) and the final horizontal
// reduction/return lie outside this view.
206 static_assert(tChannels >= 1u,
"Invalid channel number!");
207 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
209 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
// Strides are in elements (not bytes) and must cover at least one full patch row.
211 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
212 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
214 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// Decompose each patch row into 16-element NEON blocks, then 8-element blocks,
// then up to 7 leftover elements (blocks1) handled with masked loads / scalar code.
216 constexpr unsigned int blocks16 = patchWidthElements / 16u;
217 constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
218 constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
220 static_assert(blocks1 <= 7u,
"Invalid block size!");
// Lane masks for the leftover elements: maskRight keeps the low `blocks1` bytes of an
// 8-byte vector, maskLeft keeps the high `blocks1` bytes.
// NOTE(review): when blocks1 == 0 the shift amount is (8 - 0) * 8 == 64, which is
// undefined behavior for a 64-bit shift. The masks are only *used* inside the
// `if constexpr (blocks1 != 0u)` branch below, but they are *created* unconditionally —
// compare the guarded ternary (`tOverlappingElements < 8u ? … : 0`) used by the
// loadMirrored_* helpers later in this file. Verify blocks1 == 0 cannot be instantiated,
// or guard these shifts the same way.
222 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
223 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
// Two independent 32-bit lane accumulators (combined once at the end) plus a scalar
// accumulator for elements handled outside the vector paths.
225 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
226 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
228 uint32_t sumIndividual = 0u;
// One iteration per patch row.
230 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full 16-element blocks: |a-b| via vabdq, squared via widening vmull on each half,
// then pairwise-accumulated into the 32-bit sums. (The pointer advances between
// iterations are on lines dropped by the extraction.)
232 for (
unsigned int n = 0u; n < blocks16; ++n)
235 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
237 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
238 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
// vmull_u8(x, x) widens to 16 bit, so squares of 8-bit values (max 255^2) cannot overflow.
241 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
242 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
244 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
245 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
// Remaining full 8-element block (at most one per row by construction).
251 for (
unsigned int n = 0u; n < blocks8; ++n)
254 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(patch0), vld1_u8(patch1));
257 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
259 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Leftover 1..7 elements of the row.
265 if constexpr (blocks1 != 0u)
// On every row but the last, an 8-byte forward load is used and the bytes past the
// row's end are masked off — presumably safe because subsequent patch data follows
// in memory; TODO confirm against the enclosing class's documentation.
271 if (y < tPatchSize - 1u)
273 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
274 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
276 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
279 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
281 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Last row: load backwards (starting `overlapElements` before the leftover data) and
// mask off the low, already-accumulated bytes, so the load never reads past the
// final patch element.
285 constexpr unsigned int overlapElements = 8u - blocks1;
286 static_assert(overlapElements >= 1u && overlapElements < 8u,
"Invalid number!");
288 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
289 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
291 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
294 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
296 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Scalar fallback accumulating the leftover elements individually (the branch
// structure enclosing this loop is not visible in this extraction).
301 for (
unsigned int n = 0u; n < blocks1; ++n)
303 sumIndividual +=
sqrDistance(patch0[n], patch1[n]);
// Advance both pointers over the padding to the start of the next patch row.
311 patch0 += patch0StrideElements - patchWidthElements;
312 patch1 += patch1StrideElements - patchWidthElements;
// Combine both lane accumulators; the horizontal reduction to a scalar (plus
// sumIndividual) happens on lines beyond this view.
315 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// Computes the sum of square differences (SSD) between a tPatchSize x tPatchSize patch
// centered at (centerX0, centerY0) in image0 and one centered at (centerX1, centerY1)
// in image1, mirroring pixel accesses that fall outside the image borders.
// The template header (tChannels, tPatchSize) and the final horizontal reduction/return
// lie outside this view; braces/blank lines are stripped and original file line numbers
// are fused into the text (extraction artifact).
//
// Parameters:
//   image0/image1                  - top-left pointers of both 8-bit images, must not be nullptr
//   width0/height0, width1/height1 - image dimensions in pixels
//   centerX0/centerY0, centerX1/centerY1 - patch centers, must lie inside their images
//   image0PaddingElements/image1PaddingElements - extra elements at the end of each row
327uint32_t
SumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements)
329 static_assert(tChannels >= 1u,
"Invalid channel number!");
// The patch size must be odd so the patch has a well-defined center pixel.
330 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
332 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
334 ocean_assert(centerX0 < width0 && centerY0 < height0);
335 ocean_assert(centerX1 < width1 && centerY1 < height1);
337 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
339 const unsigned int width0Elements = width0 * tChannels;
340 const unsigned int width1Elements = width1 * tChannels;
342 const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
343 const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
345 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// Row decomposition: full 16-element blocks, then either one partial 16-element
// masked block (only worthwhile when more than 10 elements remain), or 8-element
// blocks, then either one partial 8-element masked block (when at least 3 elements
// remain), or up to 7 individually mirrored scalar elements.
347 constexpr unsigned int blocks16 = patchWidthElements / 16u;
348 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
350 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
351 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
353 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
354 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
356 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
357 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
359 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
361 static_assert(blocks1 <= 7u,
"Invalid block size!");
// Two independent 32-bit lane accumulators plus a scalar accumulator for the
// individually handled elements.
363 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
364 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
366 uint32_t sumIndividual = 0u;
// Scratch buffer used by loadMirrored_* when a gather of mirrored elements is needed.
368 uint8_t intermediate[16];
// y1 tracks image1's patch row in lock-step with y0 (its increment is on a line
// dropped by the extraction).
370 int y1 = int(centerY1) - int(tPatchSize_2);
371 for (
int y0 =
int(centerY0) -
int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
// mirrorOffset() maps an out-of-range row index to its mirrored in-range counterpart,
// so rows above/below the image read from reflected rows instead.
373 const uint8_t*
const mirroredRow0 = image0 + (
unsigned int)(y0 +
CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
374 const uint8_t*
const mirroredRow1 = image1 + (
unsigned int)(y1 +
CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
// Signed element indices of the patch row start; negative near the left border —
// horizontal mirroring is handled per load by loadMirrored_* / mirrorIndex.
376 int x0 = (int(centerX0) - int(tPatchSize_2)) *
int(tChannels);
377 int x1 = (int(centerX1) - int(tPatchSize_2)) *
int(tChannels);
// Full 16-element blocks, same abs-diff / widening-square / pairwise-accumulate
// pattern as the other SSD kernels in this file. (x0/x1 advances between iterations
// are on lines dropped by the extraction.)
379 for (
unsigned int n = 0u; n < blocks16; ++n)
382 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate));
384 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
385 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
388 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
389 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
391 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
392 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
// Partial 16-element tail: zero-masked lanes contribute |0-0|^2 = 0 to the sum.
398 if constexpr (partialBlock16)
// Front-aligned (tFront=true) loads on all rows except the last patch row; the last
// row uses back-aligned (tFront=false) loads whose read window ends at the needed
// elements — presumably to avoid reading past the end of the image buffer;
// TODO confirm against loadMirrored_u_8x16's contract.
400 if (y0 <
int(centerY0) + int(tPatchSize_2))
403 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
405 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
406 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
409 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
410 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
412 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
413 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
// Last patch row: back-aligned variant of the same computation.
418 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
420 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
421 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
424 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
425 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
427 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
428 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
431 x0 += remainingAfterBlocks16;
432 x1 += remainingAfterBlocks16;
// Full 8-element blocks.
435 for (
unsigned int n = 0u; n < blocks8; ++n)
438 const uint8x8_t absDifference_u_8x8 = vabd_u8(loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate));
441 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
443 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Partial 8-element tail, same front/back alignment strategy as the partial 16 block.
449 if constexpr (partialBlock8)
453 if (y0 <
int(centerY0) + int(tPatchSize_2))
455 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
456 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
458 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
461 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
463 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
467 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
468 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
470 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
473 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
475 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
478 x0 += remainingAfterBlocks8;
479 x1 += remainingAfterBlocks8;
// Up to 7 leftover elements, each mirrored individually via mirrorIndex and
// accumulated in scalar code.
482 if constexpr (blocks1 != 0u)
484 for (
unsigned int n = 0u; n < blocks1; ++n)
486 sumIndividual +=
sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 +
int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 +
int(n), width1Elements)]);
// Combine both lane accumulators; the horizontal reduction to the returned uint32_t
// (plus sumIndividual) happens on lines beyond this view.
493 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// Fragment: apparently the body of loadMirrored_u_8x8 (its signature lies outside this
// view; the name matches the call sites earlier in this file). Loads tSize consecutive
// 8-bit elements starting at the signed index `elementIndex` from a row of `elements`
// elements, mirroring indices that fall outside [0, elements). With tFront == true the
// data occupies the low tSize lanes of the result (high lanes zeroed); with
// tFront == false it occupies the high tSize lanes (low lanes zeroed).
537 static_assert(tChannels >= 1u,
"Invalid channel number!");
539 ocean_assert(tSize >= 1u && tSize <= 8u);
541 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
// Number of unused (zeroed) lanes in the 8-lane result.
543 constexpr unsigned int tOverlappingElements = 8u - tSize;
// Fast path: the entire [elementIndex, elementIndex + tSize) window lies inside the row,
// so no mirroring is needed.
545 if (elementIndex >= 0 && elementIndex <=
int(elements) -
int(tSize))
547 if constexpr (tSize == 8u)
549 return vld1_u8(row + elementIndex);
553 if constexpr (tFront)
// Front-aligned: 8-byte load starting at elementIndex, tail lanes masked to zero.
// Note the load still reads 8 bytes — the extra bytes past the tSize window must be
// readable; presumably the caller guarantees this — TODO confirm.
// The `tOverlappingElements < 8u` ternary avoids an undefined shift by 64.
555 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
556 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
558 return vand_u8(vld1_u8(row + elementIndex), mask_u_8x8);
// Back-aligned: load begins tOverlappingElements before the window so the read ends
// exactly at elementIndex + tSize; the head lanes are masked to zero.
562 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
563 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
565 return vand_u8(vld1_u8(row + elementIndex -
int(tOverlappingElements)), mask_u_8x8);
// Slow path: at least one index needs mirroring — gather element-by-element through
// mirrorIndex into the intermediate buffer, zero-pad the unused lanes, then load once.
570 if constexpr (tFront)
572 for (
unsigned int n = 0u; n < tSize; ++n)
574 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
575 ocean_assert(index < elements);
577 intermediateBuffer[n] = row[index];
// Zero the unused tail lanes so they contribute nothing downstream.
580 for (
unsigned int n = tSize; n < 8u; ++n)
582 intermediateBuffer[n] = 0u;
// Back-aligned gather: zero the head lanes, then place the mirrored data behind them.
587 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
589 intermediateBuffer[n] = 0u;
592 for (
unsigned int n = 0u; n < tSize; ++n)
594 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
595 ocean_assert(index < elements);
597 intermediateBuffer[tOverlappingElements + n] = row[index];
601 return vld1_u8(intermediateBuffer);
// Fragment: apparently the body of loadMirrored_u_8x16 (signature outside this view;
// the name matches the call sites earlier in this file). 16-lane counterpart of
// loadMirrored_u_8x8: loads tSize (9..16) consecutive 8-bit elements starting at the
// signed index `elementIndex`, mirroring out-of-range indices; tFront selects whether
// the data is front-aligned (low lanes, zeroed tail) or back-aligned (high lanes,
// zeroed head).
607 static_assert(tChannels >= 1u,
"Invalid channel number!");
609 ocean_assert(tSize > 8u && tSize <= 16u);
611 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
// Number of unused (zeroed) lanes in the 16-lane result; always < 8 since tSize > 8,
// so only one half of the vector ever needs masking.
613 constexpr unsigned int tOverlappingElements = 16u - tSize;
// Fast path: the entire window lies inside the row, no mirroring needed.
615 if (elementIndex >= 0 && elementIndex <=
int(elements) -
int(tSize))
617 if constexpr (tSize == 16u)
619 return vld1q_u8(row + elementIndex);
623 if constexpr (tFront)
// Front-aligned: keep the full low half, mask the tail of the high half.
// The `tOverlappingElements < 8u` ternary avoids an undefined shift by 64
// (defensive here, as tOverlappingElements < 8 always holds for tSize > 8).
625 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
626 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
628 return vandq_u8(vld1q_u8(row + elementIndex), mask_u_8x16);
// Back-aligned: start the load tOverlappingElements early so the read ends exactly at
// elementIndex + tSize; mask the head of the low half, keep the full high half.
632 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
633 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
635 return vandq_u8(vld1q_u8(row + elementIndex -
int(tOverlappingElements)), mask_u_8x16);
// Slow path: gather mirrored elements one-by-one into the intermediate buffer,
// zero-pad the unused lanes, then load the vector once.
640 if constexpr (tFront)
642 for (
unsigned int n = 0u; n < tSize; ++n)
644 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
645 ocean_assert(index < elements);
647 intermediateBuffer[n] = row[index];
// Zero the unused tail lanes.
650 for (
unsigned int n = tSize; n < 16u; ++n)
652 intermediateBuffer[n] = 0u;
// Back-aligned gather: zeroed head lanes, mirrored data behind them.
657 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
659 intermediateBuffer[n] = 0u;
662 for (
unsigned int n = 0u; n < tSize; ++n)
664 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
665 ocean_assert(index < elements);
667 intermediateBuffer[tOverlappingElements + n] = row[index];
671 return vld1q_u8(intermediateBuffer);