8 #ifndef META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
9 #define META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
15 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// NOTE(review): only the template header of this declaration is visible in this excerpt; the
// declaration it introduces is missing. Presumably it belongs to buffer8BitPerChannel(), the only
// function in this listing with a single `tSize` template parameter -- confirm against the header.
40 template <
unsigned int tSize>
/**
 * Returns the sum of absolute differences between two patches within an image.
 * @param patch0 The first patch, must not be nullptr (asserted in the definition)
 * @param patch1 The second patch, must not be nullptr (asserted in the definition)
 * @param patch0StrideElements Stride of the first patch in elements, must be >= tChannels * tPatchSize (asserted in the definition)
 * @param patch1StrideElements Stride of the second patch in elements, must be >= tChannels * tPatchSize (asserted in the definition)
 * @return The sum of absolute differences accumulated over all tPatchSize rows
 * @tparam tChannels Number of channels, must be >= 1 (static_assert in the definition)
 * @tparam tPatchSize Number of rows of the patch and, multiplied by tChannels, the number of elements per row; must be >= 5 (static_assert in the definition)
 */
53 template <
unsigned int tChannels,
unsigned int tPatchSize>
54 static inline uint32_t
patch8BitPerChannel(
const uint8_t* patch0,
const uint8_t* patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements);
/**
 * Returns the sum of absolute differences between an image patch and a buffer.
 * The definition forwards to patch8BitPerChannel() and passes tChannels * tPatchSize as the stride of the buffer,
 * i.e., the buffer is read as tPatchSize rows that follow one another without padding.
 * @param patch0 The image patch, forwarded as first patch
 * @param buffer1 The memory buffer, forwarded as second patch
 * @param patch0StrideElements Stride of the image patch in elements
 * @return The sum of absolute differences
 * @tparam tChannels Number of channels
 * @tparam tPatchSize Size of the patch, see patch8BitPerChannel()
 */
65 template <
unsigned int tChannels,
unsigned int tPatchSize>
66 static inline uint32_t
patchBuffer8BitPerChannel(
const uint8_t* patch0,
const uint8_t* buffer1,
const unsigned int patch0StrideElements);
// Definition fragment of the function documented in this listing as
// "Returns the sum of absolute differences between two memory buffers".
// NOTE(review): the signature line, the braces, the declaration of `results`, any pointer
// increments, the guard around the 8-element block and the return statement are not visible
// in this excerpt -- confirm against the original header before changing anything here.
69 template <
unsigned int tSize>
// tSize is the number of 8-bit elements compared; a size of zero is rejected at compile time.
72 static_assert(tSize >= 1u,
"Invalid buffer size!");
// Four 32-bit accumulator lanes, all starting at zero.
74 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
// Number of complete 16-element blocks.
78 constexpr
unsigned int blocks16 = tSize / 16u;
80 for (
unsigned int n = 0u; n < blocks16; ++n)
// Per-byte absolute difference of 16 elements from each buffer.
83 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
// Widening add of the low and the high half; each 16-bit lane holds at most 255 + 255 = 510.
85 const uint16x8_t absDifference_u_16x8 = vaddl_u8(vget_low_u8(absDifference_u_8x16), vget_high_u8(absDifference_u_8x16));
// Pairwise add of neighboring 16-bit lanes, accumulated into the 32-bit sums.
87 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
// NOTE(review): as shown, the loop does not advance buffer0/buffer1; presumably the increments
// by 16 elements were lost in this excerpt -- confirm.
// Number of complete 8-element blocks left behind the 16-element blocks (0 or 1).
95 constexpr
unsigned int blocks8 = (tSize % 16u) / 8u;
96 static_assert(blocks8 <= 1u,
"Invalid number of blocks!");
// Widening absolute difference of 8 elements, giving eight 16-bit values, accumulated as above.
101 const uint16x8_t absDifference_u_16x8 = vabdl_u8(vld1_u8(buffer0), vld1_u8(buffer1));
103 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
// Write the four partial sums to `results` (declared outside the visible lines).
110 vst1q_u32(results, sum_u_32x4);
// Elements that fit in neither a 16-element nor an 8-element block (at most 7).
112 constexpr
unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
113 static_assert(remainingElements < 8u,
"Invalid number of remaining elements!");
// Horizontal sum of the four vector lanes.
115 uint32_t result = results[0] + results[1] + results[2] + results[3];
// Scalar handling of the remaining elements.
119 for (
unsigned int n = 0u; n < remainingElements; ++n)
121 result += uint32_t(abs(int32_t(buffer0[n]) - int32_t(buffer1[n])));
// Definition fragment of the function documented in this listing as
// "Returns the sum of absolute differences between two patches within an image".
// NOTE(review): the signature line, the braces, the declaration of `results`, the pointer
// increments inside a row and parts of the branch structure are not visible in this excerpt --
// confirm against the original header before changing anything here.
127 template <
unsigned int tChannels,
unsigned int tPatchSize>
130 static_assert(tChannels >= 1u,
"Invalid channel number!");
131 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
133 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
// Each stride must cover at least one full patch row.
135 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
136 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// Number of 8-bit elements in one patch row.
138 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// A row is split into 16-element blocks, 8-element blocks and a tail of blocks1 single elements.
140 constexpr
unsigned int blocks16 = patchWidthElements / 16u;
141 constexpr
unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
142 constexpr
unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
144 static_assert(blocks1 <= 7u,
"Invalid block size!");
// Byte masks for the partial 8-byte tail load: maskRight keeps the lowest blocks1 bytes of the
// 64-bit value, maskLeft keeps the highest blocks1 bytes.
// NOTE(review): for blocks1 == 0 the shift count is 64, which is undefined for a 64-bit operand.
// The masks appear to be used only in the `blocks1 != 0u` part below, but the shift expressions
// themselves are not guarded -- worth guarding.
146 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
147 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
// Vector accumulator (four 32-bit lanes) and a separate scalar accumulator for the scalar tail loop.
149 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
151 uint32_t sumIndividual = 0u;
// One iteration per patch row.
153 for (
unsigned int y = 0u; y < tPatchSize; ++y)
155 for (
unsigned int n = 0u; n < blocks16; ++n)
// Per-byte absolute difference of 16 elements, widened to 16 bit (at most 255 + 255 = 510 per
// lane) and pairwise accumulated into the 32-bit sums.
158 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
160 const uint16x8_t absDifference_u_16x8 = vaddl_u8(vget_low_u8(absDifference_u_8x16), vget_high_u8(absDifference_u_8x16));
162 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
168 for (
unsigned int n = 0u; n < blocks8; ++n)
// Widening absolute difference of 8 elements, accumulated as above.
171 const uint16x8_t absDifference_u_16x8 = vabdl_u8(vld1_u8(patch0), vld1_u8(patch1));
173 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
// Tail handling, compiled only when a row does not split evenly into 8-element blocks.
179 if constexpr (blocks1 != 0u)
// Rows before the last one: load 8 bytes starting at the current position and mask both inputs
// with maskRight; masked-out lanes are zero in both operands and contribute |0 - 0| = 0.
// Presumably this relies on vcreate_u8 mapping the least significant byte to lane 0, so the
// first blocks1 elements are kept -- confirm.
185 if (y < tPatchSize - 1u)
187 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
188 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
190 const uint16x8_t absDifference_u_16x8 = vabdl_u8(remaining0_u_8x8, remaining1_u_8x8);
192 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
// Alternative load (presumably the else-branch for the last row; the `else` is not visible
// here): the 8-byte load starts overlapElements before the current position and maskLeft keeps
// the highest blocks1 bytes. Presumably this avoids reading beyond the end of the patch memory
// -- confirm.
196 constexpr
unsigned int overlapElements = 8u - blocks1;
197 static_assert(overlapElements >= 1u && overlapElements < 8u,
"Invalid number!");
199 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
200 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
202 const uint16x8_t absDifference_u_16x8 = vabdl_u8(remaining0_u_8x8, remaining1_u_8x8);
204 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
// Scalar handling of the blocks1 tail elements.
// NOTE(review): the condition that selects between the masked NEON path above and this scalar
// loop is not visible in this excerpt.
209 for (
unsigned int n = 0u; n < blocks1; ++n)
211 sumIndividual += uint32_t(abs(int32_t(patch0[n]) - int32_t(patch1[n])));
// Skip the elements between the end of this patch row and the start of the next one.
// Assumes both pointers were advanced by patchWidthElements while processing the row; those
// increments are not visible here -- confirm.
219 patch0 += patch0StrideElements - patchWidthElements;
220 patch1 += patch1StrideElements - patchWidthElements;
// Write the four partial sums to `results` (declared outside the visible lines) and combine
// them with the scalar accumulator.
224 vst1q_u32(results, sum_u_32x4);
226 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
// Definition fragment of the function documented in this listing as
// "Returns the sum of absolute differences between an image patch and a buffer".
// NOTE(review): the signature line and the braces are not visible in this excerpt.
229 template <
unsigned int tChannels,
unsigned int tPatchSize>
// The buffer is handled as a second patch whose stride equals the row width
// (tChannels * tPatchSize elements), i.e., its rows follow one another without padding.
232 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
This class implements functions calculating the sum of absolute differences with NEON instructions.
Definition: SumAbsoluteDifferencesNEON.h:30
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of absolute differences between an image patch and a buffer.
Definition: SumAbsoluteDifferencesNEON.h:230
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of absolute differences between two patches within an image.
Definition: SumAbsoluteDifferencesNEON.h:128
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of absolute differences between two memory buffers.
Definition: SumAbsoluteDifferencesNEON.h:70
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15