8 #ifndef META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
9 #define META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
13 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// Template header of a member declaration whose signature line is not present in this copy of the file.
// NOTE(review): presumably this belongs to buffer8BitPerChannel(), the only definition below that takes a single tSize parameter -- confirm against the original header.
38 template <
unsigned int tSize>
/**
 * Returns the sum of square differences between two square patches with 8 bit per channel.
 * The definition asserts that both pointers are valid and that each stride covers at least one patch row.
 * @param patch0 The first element of the first patch, must be valid
 * @param patch1 The first element of the second patch, must be valid
 * @param patch0StrideElements The row stride of the first patch in elements, with range [tChannels * tPatchSize, infinity)
 * @param patch1StrideElements The row stride of the second patch in elements, with range [tChannels * tPatchSize, infinity)
 * @return The accumulated sum of square differences
 * @tparam tChannels The number of channels per pixel, with range [1, infinity)
 * @tparam tPatchSize The edge length of the patch in pixels, with range [5, infinity)
 */
51 template <
unsigned int tChannels,
unsigned int tPatchSize>
52 static inline uint32_t
patch8BitPerChannel(
const uint8_t* patch0,
const uint8_t* patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements);
/**
 * Returns the sum of square differences between an image patch and a buffer.
 * The definition forwards to patch8BitPerChannel() and uses tChannels * tPatchSize as the row stride of the buffer.
 * @param patch0 The first element of the image patch
 * @param buffer1 The buffer holding the second patch, read with a row stride of tChannels * tPatchSize elements
 * @param patch0StrideElements The row stride of the image patch in elements
 * @return The accumulated sum of square differences
 * @tparam tChannels The number of channels per pixel
 * @tparam tPatchSize The edge length of the patch in pixels
 */
63 template <
unsigned int tChannels,
unsigned int tPatchSize>
64 static inline uint32_t
patchBuffer8BitPerChannel(
const uint8_t* patch0,
const uint8_t* buffer1,
const unsigned int patch0StrideElements);
/**
 * Returns the sum of square differences between two patches which are defined by their center pixels.
 * Rows and elements of a patch lying outside the image are fetched through mirrored indices.
 * @param image0 The first image, must be valid
 * @param image1 The second image, must be valid
 * @param width0 The width of the first image in pixels
 * @param height0 The height of the first image in pixels
 * @param width1 The width of the second image in pixels
 * @param height1 The height of the second image in pixels
 * @param centerX0 The horizontal patch center in the first image, with range [0, width0 - 1]
 * @param centerY0 The vertical patch center in the first image, with range [0, height0 - 1]
 * @param centerX1 The horizontal patch center in the second image, with range [0, width1 - 1]
 * @param centerY1 The vertical patch center in the second image, with range [0, height1 - 1]
 * @param image0PaddingElements The number of elements added to width0 * tChannels to form the row stride of the first image
 * @param image1PaddingElements The number of elements added to width1 * tChannels to form the row stride of the second image
 * @return The accumulated sum of square differences
 * @tparam tChannels The number of channels per pixel, with range [1, infinity)
 * @tparam tPatchSize The edge length of the patch in pixels, must be odd
 */
84 template <
unsigned int tChannels,
unsigned int tPatchSize>
85 static uint32_t
patchMirroredBorder8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements);
/**
 * Returns the mirrored element index for a given element index.
 * For indices outside the row, the definition asserts that the mirrored result is smaller than `elements`.
 * @param elementIndex The element index to map, may be negative or beyond the last element of the row
 * @param elements The number of elements in the row
 * @return The mirrored element index
 * @tparam tChannels The number of channels per pixel, with range [1, infinity)
 */
104 template <
unsigned int tChannels>
105 static OCEAN_FORCE_INLINE
unsigned int mirrorIndex(
const int elementIndex,
const unsigned int elements);
/**
 * Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
 * Only tSize of the 8 returned values carry row data, the others are set to zero.
 * @param row The row to read from, must be valid
 * @param elementIndex The index of the first element to load, may lie outside the row
 * @param elements The number of elements in the row
 * @param intermediateBuffer A scratch buffer with room for at least 8 elements, must be valid; it is filled and loaded when the elements cannot be read directly
 * @return The 8 loaded values
 * @tparam tChannels The number of channels per pixel, with range [1, infinity)
 * @tparam tFront True, to place the row data at the start of the intermediate buffer (zeros behind); false, to place it at the end (zeros in front)
 * @tparam tSize The number of row elements to load, with range [1, 8]
 */
118 template <
unsigned int tChannels,
bool tFront,
unsigned int tSize>
119 static OCEAN_FORCE_INLINE uint8x8_t
loadMirrored_u_8x8(
const uint8_t*
const row,
const int elementIndex,
const unsigned int elements, uint8_t*
const intermediateBuffer);
/**
 * Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
 * Only tSize of the 16 returned values carry row data, the others are set to zero.
 * @param row The row to read from, must be valid
 * @param elementIndex The index of the first element to load, may lie outside the row
 * @param elements The number of elements in the row
 * @param intermediateBuffer A scratch buffer with room for at least 16 elements, must be valid; it is filled and loaded when the elements cannot be read directly
 * @return The 16 loaded values
 * @tparam tChannels The number of channels per pixel, with range [1, infinity)
 * @tparam tFront True, to place the row data at the start of the intermediate buffer (zeros behind); false, to place it at the end (zeros in front)
 * @tparam tSize The number of row elements to load, with range [9, 16]
 */
132 template <
unsigned int tChannels,
bool tFront,
unsigned int tSize>
133 static OCEAN_FORCE_INLINE uint8x16_t
loadMirrored_u_8x16(
const uint8_t*
const row,
const int elementIndex,
const unsigned int elements, uint8_t*
const intermediateBuffer);
// Body of a function template with the single parameter tSize: accumulates the squared differences between buffer0 and buffer1.
// NOTE(review): the signature line and several brace/statement lines (e.g., the declaration of `results`) are absent from this copy;
// presumably this is buffer8BitPerChannel() -- confirm against the original header.
136 template <
unsigned int tSize>
139 static_assert(tSize >= 1u,
"Invalid buffer size!");
// two independent accumulators, each holding four 32 bit partial sums
141 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
142 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// number of complete 16-element blocks
146 constexpr
unsigned int blocks16 = tSize / 16u;
148 for (
unsigned int n = 0u; n < blocks16; ++n)
// absolute differences of 16 element pairs
151 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
// split into the lower and upper 8 differences
153 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
154 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
// widening multiplication: every difference is multiplied with itself, yielding 16 bit squares
157 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
158 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
// pairwise add the 16 bit squares and accumulate them in the 32 bit sums
160 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
161 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
// after the 16-element blocks at most one 8-element block can be left
169 constexpr
unsigned int blocks8 = (tSize % 16u) / 8u;
170 static_assert(blocks8 <= 1u,
"Invalid number of blocks!");
// same scheme as above for 8 element pairs, accumulated in the first sum only
175 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(buffer0), vld1_u8(buffer1));
178 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
180 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// merge both accumulators and store the four partial sums
186 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
189 vst1q_u32(results, sum_u_32x4);
// elements which fit neither into a 16-element nor into an 8-element block
191 constexpr
unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
192 static_assert(remainingElements < 8u,
"Invalid number of remaining elements!");
// horizontal sum of the four partial sums
194 uint32_t result = results[0] + results[1] + results[2] + results[3];
// loop over the remaining elements (the loop body is not visible in this copy)
198 for (
unsigned int n = 0u; n < remainingElements; ++n)
// Body computing the sum of square differences between two tPatchSize x tPatchSize patches, row by row.
// Each row is split into 16-element blocks, 8-element blocks, and a rest of up to 7 elements.
// NOTE(review): the signature line and several brace/else/statement lines (e.g., the per-block pointer increments and the
// declaration of `results`) are absent from this copy -- confirm against the original header.
206 template <
unsigned int tChannels,
unsigned int tPatchSize>
209 static_assert(tChannels >= 1u,
"Invalid channel number!");
210 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
212 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
// each stride must cover at least one complete patch row
214 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
215 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// number of elements in one patch row
217 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// partition of one patch row: complete 16-element blocks, complete 8-element blocks, and the rest
219 constexpr
unsigned int blocks16 = patchWidthElements / 16u;
220 constexpr
unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
221 constexpr
unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
223 static_assert(blocks1 <= 7u,
"Invalid block size!");
// maskRight has 0xFF in the lowest blocks1 bytes of the 64 bit constant, maskLeft in the highest blocks1 bytes;
// the shift count is (8 - blocks1) * 8 because the multiplication binds tighter than the shift
// NOTE(review): for blocks1 == 0 the shift count is 64, which is undefined for a 64 bit operand; the static_assert above
// does not exclude this case -- consider guarding the shifts
225 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
226 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
// two independent accumulators, each holding four 32 bit partial sums
228 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
229 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// scalar accumulator for elements handled without NEON
231 uint32_t sumIndividual = 0u;
233 for (
unsigned int y = 0u; y < tPatchSize; ++y)
235 for (
unsigned int n = 0u; n < blocks16; ++n)
// absolute differences of 16 element pairs
238 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
240 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
241 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
// widening multiplication: every difference is multiplied with itself, yielding 16 bit squares
244 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
245 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
// pairwise add the 16 bit squares and accumulate them in the 32 bit sums
247 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
248 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
254 for (
unsigned int n = 0u; n < blocks8; ++n)
// same scheme for 8 element pairs
257 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(patch0), vld1_u8(patch1));
260 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
262 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
268 if constexpr (blocks1 != 0u)
// all rows but the last one: load 8 elements starting at the current position and zero everything outside maskRight
274 if (y < tPatchSize - 1u)
276 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
277 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
279 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
282 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
284 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// variant starting the 8-element load overlapElements earlier and zeroing everything outside maskLeft;
// presumably used for the last row so that the load does not extend beyond the patch's last element -- TODO confirm
288 constexpr
unsigned int overlapElements = 8u - blocks1;
289 static_assert(overlapElements >= 1u && overlapElements < 8u,
"Invalid number!");
291 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
292 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
294 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
297 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
299 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// scalar handling of the remaining elements, one square distance per element pair
304 for (
unsigned int n = 0u; n < blocks1; ++n)
306 sumIndividual +=
sqrDistance(patch0[n], patch1[n]);
// skip the part of the stride which does not belong to the patch row
314 patch0 += patch0StrideElements - patchWidthElements;
315 patch1 += patch1StrideElements - patchWidthElements;
// merge both accumulators and store the four partial sums
318 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
321 vst1q_u32(results, sum_u_32x4);
// horizontal sum of the NEON partial sums plus the scalar part
323 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
// Body of the patch-versus-buffer variant: forwards to patch8BitPerChannel() and treats buffer1 as a second patch
// whose row stride is exactly one patch row (tChannels * tPatchSize elements).
// NOTE(review): the signature line is absent from this copy.
326 template <
unsigned int tChannels,
unsigned int tPatchSize>
329 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
// Computes the sum of square differences between two patches given by their center pixels.
// Rows are selected via CVUtilities::mirrorOffset(), elements within a row via loadMirrored_u_8x16(), loadMirrored_u_8x8(), and mirrorIndex().
// NOTE(review): several brace/else/statement lines (e.g., the increment of y1, the per-block increments of x0/x1, and the
// declaration of `results`) are absent from this copy -- confirm against the original header.
332 template <
unsigned int tChannels,
unsigned int tPatchSize>
333 uint32_t
SumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements)
335 static_assert(tChannels >= 1u,
"Invalid channel number!");
// the patch needs a center pixel, so the size must be odd
336 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
338 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
// both centers must lie inside their image
340 ocean_assert(centerX0 < width0 && centerY0 < height0);
341 ocean_assert(centerX1 < width1 && centerY1 < height1);
// number of pixels between the center and the patch border
343 constexpr
unsigned int tPatchSize_2 = tPatchSize / 2u;
345 const unsigned int width0Elements = width0 * tChannels;
346 const unsigned int width1Elements = width1 * tChannels;
// row stride: image row plus padding elements
348 const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
349 const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
// number of elements in one patch row
351 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// partition of one patch row, starting with the complete 16-element blocks
353 constexpr
unsigned int blocks16 = patchWidthElements / 16u;
354 constexpr
unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
// a rest of more than 10 elements is handled by one partially used 16-element load
356 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 10u;
357 constexpr
unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
359 constexpr
unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
360 constexpr
unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
// a rest of at least 3 elements is handled by one partially used 8-element load
362 constexpr
bool partialBlock8 = remainingAfterBlocks8 >= 3u;
363 constexpr
unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
// whatever is left is handled element by element
365 constexpr
unsigned int blocks1 = remainingAfterPartialBlock8;
367 static_assert(blocks1 <= 7u,
"Invalid block size!");
// two independent accumulators, each holding four 32 bit partial sums
369 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
370 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
// scalar accumulator for elements handled without NEON
372 uint32_t sumIndividual = 0u;
// scratch buffer handed to the loadMirrored functions
374 uint8_t intermediate[16];
// patch rows as signed values, they may lie above or below the image
376 int y1 = int(centerY1) - int(tPatchSize_2);
377 for (
int y0 =
int(centerY0) -
int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
// row pointers: the row index is corrected by the offset from CVUtilities::mirrorOffset()
379 const uint8_t*
const mirroredRow0 = image0 + (
unsigned int)(y0 +
CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
380 const uint8_t*
const mirroredRow1 = image1 + (
unsigned int)(y1 +
CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
// element index of the left patch border, negative if the patch starts left of the image
382 int x0 = (int(centerX0) - int(tPatchSize_2)) *
int(tChannels);
383 int x1 = (int(centerX1) - int(tPatchSize_2)) *
int(tChannels);
385 for (
unsigned int n = 0u; n < blocks16; ++n)
// both loads share the scratch buffer; each load returns its vector by value
388 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate));
390 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
391 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
// widening multiplication: every difference is multiplied with itself, yielding 16 bit squares
394 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
395 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
// pairwise add the 16 bit squares and accumulate them in the 32 bit sums
397 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
398 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
404 if constexpr (partialBlock16)
// all patch rows but the last one use the front variant (tFront = true) of the partial load
// NOTE(review): the choice depends on the patch row, not on the mirrored image row; if a non-final patch row maps to the
// last image row, the front variant's full 16-element load may read beyond the end of the image -- verify
406 if (y0 <
int(centerY0) +
int(tPatchSize_2))
409 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
411 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
412 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
415 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
416 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
418 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
419 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
// back variant (tFront = false) of the partial 16-element load
424 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
426 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
427 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
430 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
431 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
433 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
434 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
// advance by the number of elements covered by the partial block
437 x0 += remainingAfterBlocks16;
438 x1 += remainingAfterBlocks16;
441 for (
unsigned int n = 0u; n < blocks8; ++n)
// complete 8-element block
444 const uint8x8_t absDifference_u_8x8 = vabd_u8(loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate));
447 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
449 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
455 if constexpr (partialBlock8)
// same front/back selection as for the partial 16-element block
459 if (y0 <
int(centerY0) +
int(tPatchSize_2))
461 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
462 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
464 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
467 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
469 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// back variant (tFront = false) of the partial 8-element load
473 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
474 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
476 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
479 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
481 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
// advance by the number of elements covered by the partial block
484 x0 += remainingAfterBlocks8;
485 x1 += remainingAfterBlocks8;
488 if constexpr (blocks1 != 0u)
// scalar handling: every element is addressed through its mirrored index
490 for (
unsigned int n = 0u; n < blocks1; ++n)
492 sumIndividual +=
sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 +
int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 +
int(n), width1Elements)]);
// merge both accumulators and store the four partial sums
499 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
502 vst1q_u32(results, sum_u_32x4);
// horizontal sum of the NEON partial sums plus the scalar part
504 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
// Body mapping an element index which may lie outside [0, elements) to an index inside the row while keeping the channel.
// NOTE(review): the signature line, the statement executed for in-range indices, and brace/else lines are absent from this copy.
507 template <
unsigned int tChannels>
510 static_assert(tChannels >= 1u,
"Invalid channel number!");
// the unsigned cast turns negative indices into large values, so one comparison covers 0 <= elementIndex < elements
512 if ((
unsigned int)(elementIndex) < elements)
// index left of the row
517 if (elementIndex < 0)
// number of elements between the index and the row start, 0 for elementIndex == -1
519 const unsigned int leftElements = (
unsigned int)(-elementIndex) - 1u;
// pixels are counted away from the row start while the channel order inside a pixel is kept
521 const unsigned int pixelIndex = leftElements / tChannels;
522 const unsigned int channelIndex = tChannels - (leftElements % tChannels) - 1u;
523 ocean_assert(channelIndex < tChannels);
525 ocean_assert(pixelIndex * tChannels + channelIndex < elements);
526 return pixelIndex * tChannels + channelIndex;
// index right of the row
530 ocean_assert(elementIndex >= elements);
// number of elements beyond the row end, 0 for elementIndex == elements
532 const unsigned int rightElements = elementIndex - elements;
534 const unsigned int rightPixels = rightElements / tChannels;
535 const unsigned int channelIndex = rightElements % tChannels;
536 ocean_assert(channelIndex < tChannels);
// pixels are counted backwards from the last pixel of the row
538 ocean_assert(elements - (rightPixels + 1u) * tChannels + channelIndex < elements);
539 return elements - (rightPixels + 1u) * tChannels + channelIndex;
// Body loading tSize (at most 8) row elements into an 8-element vector, all other values are zero.
// If the requested elements lie completely inside the row, they are loaded directly; otherwise they are gathered one by one
// through mirrorIndex() into the intermediate buffer.
// NOTE(review): the signature line and brace/else lines are absent from this copy.
543 template <
unsigned int tChannels,
bool tFront,
unsigned int tSize>
546 static_assert(tChannels >= 1u,
"Invalid channel number!");
548 ocean_assert(tSize >= 1u && tSize <= 8u);
550 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
// number of vector elements which do not carry row data
552 constexpr
unsigned int tOverlappingElements = 8u - tSize;
// direct path: all tSize elements lie inside the row
554 if (elementIndex >= 0 && elementIndex <=
int(elements) -
int(tSize))
556 if constexpr (tSize == 8u)
558 return vld1_u8(row + elementIndex);
562 if constexpr (tFront)
// front variant: load 8 elements starting at elementIndex, the mask has 0xFF in its lowest tSize bytes
564 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
565 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
567 return vand_u8(vld1_u8(row + elementIndex), mask_u_8x8);
// back variant: start the load tOverlappingElements earlier, the mask has 0xFF in its highest tSize bytes
571 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
572 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
574 return vand_u8(vld1_u8(row + elementIndex -
int(tOverlappingElements)), mask_u_8x8);
// gather path: copy the elements one by one via their mirrored index
579 if constexpr (tFront)
// front variant: row data first, zeros behind
581 for (
unsigned int n = 0u; n < tSize; ++n)
583 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
584 ocean_assert(index < elements);
586 intermediateBuffer[n] = row[index];
589 for (
unsigned int n = tSize; n < 8u; ++n)
591 intermediateBuffer[n] = 0u;
// back variant: zeros first, row data behind
596 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
598 intermediateBuffer[n] = 0u;
601 for (
unsigned int n = 0u; n < tSize; ++n)
603 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
604 ocean_assert(index < elements);
606 intermediateBuffer[tOverlappingElements + n] = row[index];
// load the 8 prepared elements from the intermediate buffer
610 return vld1_u8(intermediateBuffer);
// Body loading tSize (9 to 16) row elements into a 16-element vector, all other values are zero.
// If the requested elements lie completely inside the row, they are loaded directly; otherwise they are gathered one by one
// through mirrorIndex() into the intermediate buffer.
// NOTE(review): the signature line and brace/else lines are absent from this copy.
613 template <
unsigned int tChannels,
bool tFront,
unsigned int tSize>
616 static_assert(tChannels >= 1u,
"Invalid channel number!");
618 ocean_assert(tSize > 8u && tSize <= 16u);
620 ocean_assert(row !=
nullptr && intermediateBuffer !=
nullptr);
// number of vector elements which do not carry row data
622 constexpr
unsigned int tOverlappingElements = 16u - tSize;
// direct path: all tSize elements lie inside the row
624 if (elementIndex >= 0 && elementIndex <=
int(elements) -
int(tSize))
626 if constexpr (tSize == 16u)
628 return vld1q_u8(row + elementIndex);
632 if constexpr (tFront)
// front variant: the lower half of the mask is all 0xFF, only the upper half is partially cleared
634 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
635 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
637 return vandq_u8(vld1q_u8(row + elementIndex), mask_u_8x16);
// back variant: start the load tOverlappingElements earlier; the upper half of the mask is all 0xFF, only the lower half is partially cleared
641 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
642 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
644 return vandq_u8(vld1q_u8(row + elementIndex -
int(tOverlappingElements)), mask_u_8x16);
// gather path: copy the elements one by one via their mirrored index
649 if constexpr (tFront)
// front variant: row data first, zeros behind
651 for (
unsigned int n = 0u; n < tSize; ++n)
653 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
654 ocean_assert(index < elements);
656 intermediateBuffer[n] = row[index];
659 for (
unsigned int n = tSize; n < 16u; ++n)
661 intermediateBuffer[n] = 0u;
// back variant: zeros first, row data behind
666 for (
unsigned int n = 0u; n < tOverlappingElements; ++n)
668 intermediateBuffer[n] = 0u;
671 for (
unsigned int n = 0u; n < tSize; ++n)
673 const unsigned int index = mirrorIndex<tChannels>(elementIndex +
int(n), elements);
674 ocean_assert(index < elements);
676 intermediateBuffer[tOverlappingElements + n] = row[index];
// load the 16 prepared elements from the intermediate buffer
680 return vld1q_u8(intermediateBuffer);
static int mirrorOffset(const unsigned int index, const unsigned int elements)
Deprecated.
Definition: CVUtilities.h:446
This class implements functions to calculate sum square differences using NEON instructions.
Definition: SumSquareDifferencesNEON.h:28
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements)
Returns the mirrored element index for a given element index.
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences between two patches within an image, patch pixels outside the i...
Definition: SumSquareDifferencesNEON.h:333
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of square differences between two memory buffers.
Definition: SumSquareDifferencesNEON.h:137
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
Definition: SumSquareDifferencesNEON.h:544
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of square differences between two patches within an image.
Definition: SumSquareDifferencesNEON.h:207
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
Definition: SumSquareDifferencesNEON.h:614
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of square differences between an image patch and a buffer.
Definition: SumSquareDifferencesNEON.h:327
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15