// NOTE(review): interior fragment of a templated NEON sum-of-squared-differences
// (SSD) over two uint8 buffers of compile-time length tSize; the enclosing
// signature, the loop braces, the per-iteration pointer advances and the final
// horizontal reduction/return are elided from this excerpt — confirm against
// the full file.
139 static_assert(tSize >= 1u,
"Invalid buffer size!");
// Two independent 128-bit accumulators (low/high halves of each 16-byte block)
// so the two vpadalq_u16 chains can run in parallel; they are combined below.
141 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
142 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// Number of full 16-byte blocks that fit into the buffer.
146 constexpr unsigned int blocks16 = tSize / 16u;
148 for (
unsigned int n = 0u; n < blocks16; ++n)
// |buffer0[i] - buffer1[i]| for 16 elements at once.
151 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
153 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
154 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
// Widening multiply: a squared 8-bit difference fits into 16 bits (max 255^2).
157 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
158 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
// Pairwise add-and-accumulate widens the 16-bit squares into the 32-bit sums.
160 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
161 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
// After the 16-byte blocks at most one 8-byte block can remain.
169 constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
170 static_assert(blocks8 <= 1u,
"Invalid number of blocks!");
175 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(buffer0), vld1_u8(buffer1));
178 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
// The single 8-byte tail block only feeds accumulator A.
180 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Combine both partial vector sums before the (elided) horizontal reduction.
186 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// Up to 7 trailing elements are handled scalar-wise in the loop below
// (loop body elided in this excerpt).
188 constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
189 static_assert(remainingElements < 8u,
"Invalid number of remaining elements!");
195 for (
unsigned int n = 0u; n < remainingElements; ++n)
// NOTE(review): interior fragment of a templated NEON patch SSD (tChannels
// interleaved channels, tPatchSize x tPatchSize pixels); the enclosing
// signature, braces, in-row pointer advances and the final reduction/return
// are elided from this excerpt.
206 static_assert(tChannels >= 1u,
"Invalid channel number!");
207 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
209 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
211 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
212 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// Number of interleaved elements per patch row.
214 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// Row decomposition: full 16-byte blocks, then full 8-byte blocks, then up to
// 7 leftover elements handled via masked 8-byte loads or scalar code.
216 constexpr unsigned int blocks16 = patchWidthElements / 16u;
217 constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
218 constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
220 static_assert(blocks1 <= 7u,
"Invalid block size!");
// maskRight keeps the first blocks1 lanes of an 8-byte load, maskLeft the
// last blocks1 lanes (vcreate_u8: lane 0 is the least significant byte).
// NOTE(review): when blocks1 == 0 (patch row width a multiple of 8) the shift
// count is (8u - 0u) * 8u == 64, which is undefined behavior for a 64-bit
// shift; the loadMirrored helpers below guard the identical expression with a
// "< 8u ? ... : 0" ternary — confirm blocks1 >= 1u for all instantiations or
// add the same guard here.
222 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
223 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
225 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
226 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// Scalar accumulator for elements not covered by the vector paths.
228 uint32_t sumIndividual = 0u;
230 for (
unsigned int y = 0u; y < tPatchSize; ++y)
232 for (
unsigned int n = 0u; n < blocks16; ++n)
235 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
237 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
238 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
// Squared differences fit into 16 bit; widen while multiplying.
241 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
242 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
244 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
245 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
251 for (
unsigned int n = 0u; n < blocks8; ++n)
254 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(patch0), vld1_u8(patch1));
257 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
259 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Handle the 1-7 leftover elements of the row with a masked 8-byte load.
265 if constexpr (blocks1 != 0u)
// Not the last patch row: an 8-byte load starting at the leftover elements
// reads past the row tail but — presumably — stays inside the patch memory
// (the next row follows); the surplus lanes are zeroed via maskRight.
// TODO(review): confirm the over-read is always backed by valid memory.
271 if (y < tPatchSize - 1u)
273 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
274 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
276 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
279 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
281 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Last row: no memory may exist behind the row, so read shifted backwards by
// overlapElements (the load ends exactly at the patch tail) and keep only the
// wanted high lanes via maskLeft.
285 constexpr unsigned int overlapElements = 8u - blocks1;
286 static_assert(overlapElements >= 1u && overlapElements < 8u,
"Invalid number!");
288 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
289 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
291 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
294 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
296 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// Scalar fallback over the leftover elements.
// NOTE(review): lines between the masked path above and this loop are elided;
// presumably an (elided) compile-time branch selects either the masked vector
// path or this scalar loop, otherwise the tail would be counted twice —
// verify against the full file.
301 for (
unsigned int n = 0u; n < blocks1; ++n)
303 sumIndividual +=
sqrDistance(patch0[n], patch1[n]);
// Skip the padding between consecutive patch rows (in-row advances are in
// elided lines, leaving patch0/patch1 at the row tail here).
311 patch0 += patch0StrideElements - patchWidthElements;
312 patch1 += patch1StrideElements - patchWidthElements;
// Combine both partial vector sums; the horizontal reduction and the addition
// of sumIndividual happen in elided lines below.
315 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// Sum of squared differences between two square patches (tChannels interleaved
// channels, tPatchSize x tPatchSize with tPatchSize odd) whose centers may lie
// close to the frame borders: out-of-frame samples are mapped back into the
// frame via CVUtilities::mirrorOffset / mirrorIndex.
// NOTE(review): the template header, the opening brace, the increments of
// y1/x0/x1 inside the blocks16 loop and the final horizontal reduction/return
// are elided from this excerpt.
//
// image0/image1:                the two frames, must be valid
// width0/height0, width1/height1: frame sizes in pixels
// centerX0/centerY0, centerX1/centerY1: patch centers, must lie inside the frames
// image0PaddingElements/image1PaddingElements: padding elements at each row end
327uint32_t
SumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements)
329 static_assert(tChannels >= 1u,
"Invalid channel number!");
330 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
332 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
334 ocean_assert(centerX0 < width0 && centerY0 < height0);
335 ocean_assert(centerX1 < width1 && centerY1 < height1);
// Half patch size; the patch spans center +/- tPatchSize_2.
337 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
339 const unsigned int width0Elements = width0 * tChannels;
340 const unsigned int width1Elements = width1 * tChannels;
342 const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
343 const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
345 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// Row decomposition: full 16-element blocks first.
347 constexpr unsigned int blocks16 = patchWidthElements / 16u;
348 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
// A large remainder (> 10 elements) is handled with one masked partial
// 16-element load; smaller remainders fall through to 8-element blocks.
// NOTE(review): the thresholds 10u and 3u below look like performance
// heuristics (masked vector load vs. scalar tail) — confirm intent.
350 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
351 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
353 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
354 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
// At least 3 remaining elements justify one masked partial 8-element load;
// 1-2 elements are accumulated scalar-wise (blocks1).
356 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
357 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
359 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
361 static_assert(blocks1 <= 7u,
"Invalid block size!");
// Two parallel 128-bit accumulators, combined at the end.
363 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
364 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// Scalar accumulator for the 1-2 element tail.
366 uint32_t sumIndividual = 0u;
// Scratch buffer for the loadMirrored_* slow path (mirrored gather).
368 uint8_t intermediate[16];
// y1 tracks the second patch's row; its increment is in elided lines.
370 int y1 = int(centerY1) - int(tPatchSize_2);
371 for (
int y0 =
int(centerY0) -
int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
// Row start with vertical mirroring applied (mirrorOffset folds an
// out-of-range row index back into [0, height)).
373 const uint8_t*
const mirroredRow0 = image0 + (
unsigned int)(y0 +
CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
374 const uint8_t*
const mirroredRow1 = image1 + (
unsigned int)(y1 +
CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
// Horizontal element indices; may be negative near the left border —
// loadMirrored_* resolves them.
376 int x0 = (int(centerX0) - int(tPatchSize_2)) *
int(tChannels);
377 int x1 = (int(centerX1) - int(tPatchSize_2)) *
int(tChannels);
379 for (
unsigned int n = 0u; n < blocks16; ++n)
382 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate);
383 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate);
384 const uint8x16_t absDifference_u_8x16 = vabdq_u8(patch0_u_8x16, patch1_u_8x16);
386 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
387 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
// Widening multiply: squared 8-bit differences fit into 16 bits.
390 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
391 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
393 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
394 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
400 if constexpr (partialBlock16)
// For all but the last row the partial load may read forward past the wanted
// elements (tFront == true); for the last row a front over-read could leave
// the allocated image memory, so the back-anchored variant (tFront == false)
// is used instead — presumably; confirm against loadMirrored_u_8x16.
402 if (y0 <
int(centerY0) + int(tPatchSize_2))
405 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate);
406 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate);
407 const uint8x16_t absDifference_u_8x16 = vabdq_u8(patch0_u_8x16, patch1_u_8x16);
409 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
410 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
413 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
414 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
416 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
417 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
422 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate);
423 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate);
424 const uint8x16_t absDifference_u_8x16 = vabdq_u8(patch0_u_8x16, patch1_u_8x16);
426 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
427 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
430 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
431 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
433 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
434 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
437 x0 += remainingAfterBlocks16;
438 x1 += remainingAfterBlocks16;
441 for (
unsigned int n = 0u; n < blocks8; ++n)
444 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate);
445 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate);
446 const uint8x8_t absDifference_u_8x8 = vabd_u8(patch0_u_8x8, patch1_u_8x8);
449 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
451 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
457 if constexpr (partialBlock8)
// Same last-row distinction as for the partial 16-block above.
461 if (y0 <
int(centerY0) + int(tPatchSize_2))
463 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
464 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
466 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
469 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
471 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
475 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
476 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
478 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
481 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
483 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
486 x0 += remainingAfterBlocks8;
487 x1 += remainingAfterBlocks8;
// 1-2 leftover elements: scalar accumulation with per-element mirroring.
490 if constexpr (blocks1 != 0u)
492 for (
unsigned int n = 0u; n < blocks1; ++n)
494 sumIndividual +=
sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 +
int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 +
int(n), width1Elements)]);
// Combine partial vector sums; horizontal reduction, the addition of
// sumIndividual and the return are in elided lines below.
501 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
// NOTE(review): interior fragment of loadMirrored_u_8x8<tChannels, tFront,
// tSize> (name inferred from the call sites above); template header, braces
// and closing lines are elided. Loads tSize (1..8) elements starting at a
// possibly out-of-range elementIndex, mirroring out-of-range indices, and
// returns them as a uint8x8_t. tFront selects whether the valid data occupies
// the front (low) lanes or the back (high) lanes; unused lanes are zero.
545 static_assert(tChannels >= 1u,
"Invalid channel number!");
547 ocean_assert(tSize >= 1u && tSize <= 8u);
549 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
// Number of lanes of the 8-byte vector that are not requested.
551 constexpr unsigned int tOverlappingElements = 8u - tSize;
// Fast path: the requested range lies entirely inside the row, no mirroring
// needed — a direct (possibly masked) 8-byte load suffices.
553 if (elementIndex >= 0 && elementIndex <=
int(elements) -
int(tSize))
555 if constexpr (tSize == 8u)
557 return vld1_u8(row + elementIndex);
561 if constexpr (tFront)
// Keep the tSize low lanes; the "< 8u" ternary avoids an undefined 64-bit
// shift for the tSize == 0 instantiation (lane 0 of vcreate_u8 is the least
// significant byte of the constant).
// NOTE(review): this front load reads up to tOverlappingElements bytes past
// elementIndex + tSize — presumably callers guarantee readable memory there
// (they pass tFront == false on the last row); confirm.
563 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
564 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
566 return vand_u8(vld1_u8(row + elementIndex), mask_u_8x8);
// Back-anchored variant: shift the load backwards so it ends exactly at
// elementIndex + tSize (no forward over-read) and keep the tSize high lanes.
570 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
571 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
573 return vand_u8(vld1_u8(row + elementIndex -
int(tOverlappingElements)), mask_u_8x8);
// Slow path: at least one index is out of range — gather the elements one by
// one with channel-aware mirroring into the scratch buffer, zero-pad the
// unused lanes, and load the vector from there.
578 if constexpr (tFront)
580 for (
unsigned int n = 0u; n < tSize; ++n)
582 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
583 ocean_assert(index < elements);
585 intermediateBuffer[n] = row[index];
// Zero the trailing (unused) lanes.
588 for (
unsigned int n = tSize; n < 8u; ++n)
590 intermediateBuffer[n] = 0u;
// tFront == false: valid data goes into the high lanes, zeros in front.
595 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
597 intermediateBuffer[n] = 0u;
600 for (
unsigned int n = 0u; n < tSize; ++n)
602 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
603 ocean_assert(index < elements);
605 intermediateBuffer[tOverlappingElements + n] = row[index];
609 return vld1_u8(intermediateBuffer);
// NOTE(review): interior fragment of loadMirrored_u_8x16<tChannels, tFront,
// tSize> (name inferred from the call sites above); template header, braces
// and closing lines are elided. 16-byte counterpart of loadMirrored_u_8x8:
// loads tSize (9..16) elements starting at a possibly out-of-range
// elementIndex with mirroring, returning a uint8x16_t; tFront selects
// front (low) vs. back (high) lane placement, unused lanes are zero.
615 static_assert(tChannels >= 1u,
"Invalid channel number!");
617 ocean_assert(tSize > 8u && tSize <= 16u);
619 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
// Number of lanes of the 16-byte vector that are not requested (0..7).
621 constexpr unsigned int tOverlappingElements = 16u - tSize;
// Fast path: the whole requested range is inside the row — direct (possibly
// masked) 16-byte load, no mirroring needed.
623 if (elementIndex >= 0 && elementIndex <=
int(elements) -
int(tSize))
625 if constexpr (tSize == 16u)
627 return vld1q_u8(row + elementIndex);
631 if constexpr (tFront)
// Low 8 lanes are always valid (tSize > 8); mask only the high half. The
// "< 8u" ternary guards the shift (undefined for a count of 64).
// NOTE(review): the front load reads up to tOverlappingElements bytes past
// elementIndex + tSize — presumably callers guarantee readable memory there
// (they pass tFront == false on the last row); confirm.
633 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
634 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
636 return vandq_u8(vld1q_u8(row + elementIndex), mask_u_8x16);
// Back-anchored variant: shift the load backwards so it ends exactly at
// elementIndex + tSize and keep the tSize high lanes (high half all valid).
640 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
641 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
643 return vandq_u8(vld1q_u8(row + elementIndex -
int(tOverlappingElements)), mask_u_8x16);
// Slow path: gather element-wise with channel-aware mirroring into the
// scratch buffer (16 bytes), zero-pad unused lanes, then load the vector.
648 if constexpr (tFront)
650 for (
unsigned int n = 0u; n < tSize; ++n)
652 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
653 ocean_assert(index < elements);
655 intermediateBuffer[n] = row[index];
// Zero the trailing (unused) lanes.
658 for (
unsigned int n = tSize; n < 16u; ++n)
660 intermediateBuffer[n] = 0u;
// tFront == false: zeros in front, valid data in the high lanes.
665 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
667 intermediateBuffer[n] = 0u;
670 for (
unsigned int n = 0u; n < tSize; ++n)
672 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
673 ocean_assert(index < elements);
675 intermediateBuffer[tOverlappingElements + n] = row[index];
679 return vld1q_u8(intermediateBuffer);