Ocean
Loading...
Searching...
No Matches
SumSquareDifferencesNEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
9#define META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
10
11#include "ocean/cv/CV.h"
12
13#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
14
15#include "ocean/cv/NEON.h"
16
17namespace Ocean
18{
19
20namespace CV
21{
22
23/**
24 * This class implements function to calculate sum square differences using NEON instructions.
25 * @ingroup cv
26 */
28{
29 public:
30
31 /**
32 * Returns the sum of square differences between two memory buffers.
33 * @param buffer0 The first memory buffer, must be valid
34 * @param buffer1 The second memory buffer, must be valid
35 * @return The resulting sum of square differences
36 * @tparam tSize The size of the buffers in elements, with range [1, infinity)
37 */
38 template <unsigned int tSize>
39 static inline uint32_t buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1);
40
41 /**
42 * Returns the sum of square differences between two patches within an image.
43 * @param patch0 The top left start position of the first image patch, must be valid
44 * @param patch1 The top left start position of the second image patch, must be valid
45 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels, tPatchSize, infinity)
46 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels, tPatchSize, infinity)
47 * @return The resulting sum of square differences
48 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
49 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
50 */
51 template <unsigned int tChannels, unsigned int tPatchSize>
52 static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
53
54 /**
55 * Returns the sum of square differences between an image patch and a buffer.
56 * @param patch0 The top left start position of the image patch, must be valid
57 * @param buffer1 The memory buffer, must be valid
58 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels, tPatchSize, infinity)
59 * @return The resulting sum of square differences
60 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
61 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
62 */
63 template <unsigned int tChannels, unsigned int tPatchSize>
64 static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
65
66 /**
67 * Returns the sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
68 * @param image0 The image in which the first patch is located, must be valid
69 * @param image1 The image in which the second patch is located, must be valid
70 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
71 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
72 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
73 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
74 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
75 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
76 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
77 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
78 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
79 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
80 * @return The resulting sum of square differences, with range [0, infinity)
81 * @tparam tChannels The number of frame channels, with range [1, infinity)
82 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
83 */
84 template <unsigned int tChannels, unsigned int tPatchSize>
85 static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
86
87 protected:
88
89 /**
90 * Returns the mirrored element index for a given element index.
91 * The mirrored index is calculated as follows:
92 * <pre>
93 * |<----------------------- valid value range -------------------------->|
94 *
95 * elementIndex: -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, ... elements - 3, elements - 2, elements - 1, elements + 0, elements + 1
96 * result: 2 1 0 0 1 2 3 4 5 6 7 ... elements - 3 elements - 2 elements - 1 elements - 1 elements - 2
97 * </pre>
98 * The resulting mirrored index is adjusted to support several channels.
99 * @param elementIndex The index for which the mirrored index will be returned, with range [-elements/2, elements + elements/2]
100 * @param elements The number of maximal elements, with range [1, infinity)
101 * @return The mirrored index, with range [0, elements)
102 * @tparam tChannels The number of channels the elements have, with range [1, infinity)
103 */
104 template <unsigned int tChannels>
105 static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements);
106
107 /**
108 * Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
109 * @param row The row from which the values will be loaded, must be valid
110 * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
111 * @param elements The number of elements in the row, with range [4, infinity)
112 * @param intermediateBuffer An intermediate buffer with 8 elements, must be valid
113 * @return The uint8x8_t object with the loaded values
114 * @tparam tChannels The number of channels the row has, with range [1, infinity)
115 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
116 * @tparam tSize The number of uint8_t values to be read, with range [1, 8]
117 */
118 template <unsigned int tChannels, bool tFront, unsigned int tSize>
119 static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
120
121 /**
122 * Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
123 * @param row The row from which the values will be loaded, must be valid
124 * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
125 * @param elements The number of elements in the row, with range [8, infinity)
126 * @param intermediateBuffer An intermediate buffer with 16 elements, must be valid
127 * @return The uint8x16_t object with the loaded values
128 * @tparam tChannels The number of channels the row has, with range [1, infinity)
129 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
130 * @tparam tSize The number of uint8_t values to be read, with range [1, 16]
131 */
132 template <unsigned int tChannels, bool tFront, unsigned int tSize>
133 static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
134};
135
136template <unsigned int tSize>
137inline uint32_t SumSquareDifferencesNEON::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1)
138{
139 static_assert(tSize >= 1u, "Invalid buffer size!");
140
141 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
142 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
143
144 // first, we handle blocks with 16 elements
145
146 constexpr unsigned int blocks16 = tSize / 16u;
147
148 for (unsigned int n = 0u; n < blocks16; ++n)
149 {
150 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
151 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
152
153 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
154 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
155
156 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
157 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
158 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
159
160 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
161 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
162
163 buffer0 += 16;
164 buffer1 += 16;
165 }
166
167 // we may handle at most one block with 8 elements
168
169 constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
170 static_assert(blocks8 <= 1u, "Invalid number of blocks!");
171
172 if (blocks8 == 1u)
173 {
174 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
175 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(buffer0), vld1_u8(buffer1));
176
177 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
178 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
179
180 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
181
182 buffer0 += 8;
183 buffer1 += 8;
184 }
185
186 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
187
188 constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
189 static_assert(remainingElements < 8u, "Invalid number of remaining elements!");
190
191 uint32_t result = NEON::sumHorizontal_u_32x4(sum_u_32x4);
192
193 // we apply the remaining elements (at most 7)
194
195 for (unsigned int n = 0u; n < remainingElements; ++n)
196 {
197 result += sqrDistance(buffer0[n], buffer1[n]);
198 }
199
200 return result;
201}
202
203template <unsigned int tChannels, unsigned int tPatchSize>
204inline uint32_t SumSquareDifferencesNEON::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
205{
206 static_assert(tChannels >= 1u, "Invalid channel number!");
207 static_assert(tPatchSize >= 5u, "Invalid patch size!");
208
209 ocean_assert(patch0 != nullptr && patch1 != nullptr);
210
211 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
212 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
213
214 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
215
216 constexpr unsigned int blocks16 = patchWidthElements / 16u;
217 constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
218 constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
219
220 static_assert(blocks1 <= 7u, "Invalid block size!");
221
222 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
223 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
224
225 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
226 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
227
228 uint32_t sumIndividual = 0u;
229
230 for (unsigned int y = 0u; y < tPatchSize; ++y)
231 {
232 for (unsigned int n = 0u; n < blocks16; ++n)
233 {
234 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
235 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
236
237 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
238 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
239
240 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
241 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
242 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
243
244 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
245 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
246
247 patch0 += 16;
248 patch1 += 16;
249 }
250
251 for (unsigned int n = 0u; n < blocks8; ++n)
252 {
253 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
254 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(patch0), vld1_u8(patch1));
255
256 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
257 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
258
259 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
260
261 patch0 += 8;
262 patch1 += 8;
263 }
264
265 if constexpr (blocks1 != 0u)
266 {
267 if (blocks1 >= 3u)
268 {
269 // we have enough elements left so that using NEON is still faster than handling each element individually
270
271 if (y < tPatchSize - 1u)
272 {
273 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
274 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
275
276 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
277
278 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
279 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
280
281 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
282 }
283 else
284 {
285 constexpr unsigned int overlapElements = 8u - blocks1;
286 static_assert(overlapElements >= 1u && overlapElements < 8u, "Invalid number!");
287
288 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
289 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
290
291 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
292
293 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
294 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
295
296 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
297 }
298 }
299 else
300 {
301 for (unsigned int n = 0u; n < blocks1; ++n)
302 {
303 sumIndividual += sqrDistance(patch0[n], patch1[n]);
304 }
305 }
306
307 patch0 += blocks1;
308 patch1 += blocks1;
309 }
310
311 patch0 += patch0StrideElements - patchWidthElements;
312 patch1 += patch1StrideElements - patchWidthElements;
313 }
314
315 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
316
317 return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
318}
319
320template <unsigned int tChannels, unsigned int tPatchSize>
321inline uint32_t SumSquareDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
322{
323 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
324}
325
326template <unsigned int tChannels, unsigned int tPatchSize>
327uint32_t SumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
328{
329 static_assert(tChannels >= 1u, "Invalid channel number!");
330 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
331
332 ocean_assert(image0 != nullptr && image1 != nullptr);
333
334 ocean_assert(centerX0 < width0 && centerY0 < height0);
335 ocean_assert(centerX1 < width1 && centerY1 < height1);
336
337 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
338
339 const unsigned int width0Elements = width0 * tChannels;
340 const unsigned int width1Elements = width1 * tChannels;
341
342 const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
343 const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
344
345 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
346
347 constexpr unsigned int blocks16 = patchWidthElements / 16u;
348 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
349
350 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
351 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
352
353 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
354 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
355
356 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
357 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
358
359 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
360
361 static_assert(blocks1 <= 7u, "Invalid block size!");
362
363 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
364 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
365
366 uint32_t sumIndividual = 0u;
367
368 uint8_t intermediate[16];
369
370 int y1 = int(centerY1) - int(tPatchSize_2);
371 for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
372 {
373 const uint8_t* const mirroredRow0 = image0 + (unsigned int)(y0 + CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
374 const uint8_t* const mirroredRow1 = image1 + (unsigned int)(y1 + CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
375
376 int x0 = (int(centerX0) - int(tPatchSize_2)) * int(tChannels);
377 int x1 = (int(centerX1) - int(tPatchSize_2)) * int(tChannels);
378
379 for (unsigned int n = 0u; n < blocks16; ++n)
380 {
381 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
382 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate));
383
384 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
385 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
386
387 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
388 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
389 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
390
391 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
392 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
393
394 x0 += 16;
395 x1 += 16;
396 }
397
398 if constexpr (partialBlock16)
399 {
400 if (y0 < int(centerY0) + int(tPatchSize_2))
401 {
402 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
403 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
404
405 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
406 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
407
408 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
409 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
410 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
411
412 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
413 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
414 }
415 else
416 {
417 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
418 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
419
420 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
421 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
422
423 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
424 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
425 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
426
427 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
428 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
429 }
430
431 x0 += remainingAfterBlocks16;
432 x1 += remainingAfterBlocks16;
433 }
434
435 for (unsigned int n = 0u; n < blocks8; ++n)
436 {
437 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
438 const uint8x8_t absDifference_u_8x8 = vabd_u8(loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate));
439
440 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
441 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
442
443 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
444
445 x0 += 8;
446 x1 += 8;
447 }
448
449 if constexpr (partialBlock8)
450 {
451 // we have enough elements left so that using NEON is still faster than handling each element individually
452
453 if (y0 < int(centerY0) + int(tPatchSize_2))
454 {
455 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
456 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
457
458 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
459
460 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
461 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
462
463 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
464 }
465 else
466 {
467 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
468 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
469
470 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
471
472 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
473 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
474
475 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
476 }
477
478 x0 += remainingAfterBlocks8;
479 x1 += remainingAfterBlocks8;
480 }
481
482 if constexpr (blocks1 != 0u)
483 {
484 for (unsigned int n = 0u; n < blocks1; ++n)
485 {
486 sumIndividual += sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 + int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 + int(n), width1Elements)]);
487 }
488 }
489
490 ++y1;
491 }
492
493 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
494
495 return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
496}
497
498template <unsigned int tChannels>
499inline unsigned int SumSquareDifferencesNEON::mirrorIndex(const int elementIndex, const unsigned int elements)
500{
501 static_assert(tChannels >= 1u, "Invalid channel number!");
502
503 if ((unsigned int)(elementIndex) < elements)
504 {
505 return elementIndex;
506 }
507
508 if (elementIndex < 0)
509 {
510 const unsigned int leftElements = (unsigned int)(-elementIndex) - 1u;
511
512 const unsigned int pixelIndex = leftElements / tChannels;
513 const unsigned int channelIndex = tChannels - (leftElements % tChannels) - 1u;
514 ocean_assert(channelIndex < tChannels);
515
516 ocean_assert(pixelIndex * tChannels + channelIndex < elements);
517 return pixelIndex * tChannels + channelIndex;
518 }
519 else
520 {
521 ocean_assert((unsigned int)(elementIndex) >= elements);
522
523 const unsigned int rightElements = elementIndex - elements;
524
525 const unsigned int rightPixels = rightElements / tChannels;
526 const unsigned int channelIndex = rightElements % tChannels;
527 ocean_assert(channelIndex < tChannels);
528
529 ocean_assert(elements - (rightPixels + 1u) * tChannels + channelIndex < elements);
530 return elements - (rightPixels + 1u) * tChannels + channelIndex;
531 }
532}
533
534template <unsigned int tChannels, bool tFront, unsigned int tSize>
535OCEAN_FORCE_INLINE uint8x8_t SumSquareDifferencesNEON::loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
536{
537 static_assert(tChannels >= 1u, "Invalid channel number!");
538
539 ocean_assert(tSize >= 1u && tSize <= 8u);
540
541 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
542
543 constexpr unsigned int tOverlappingElements = 8u - tSize;
544
545 if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
546 {
547 if constexpr (tSize == 8u)
548 {
549 return vld1_u8(row + elementIndex);
550 }
551 else
552 {
553 if constexpr (tFront)
554 {
555 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
556 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
557
558 return vand_u8(vld1_u8(row + elementIndex), mask_u_8x8);
559 }
560 else
561 {
562 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
563 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
564
565 return vand_u8(vld1_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x8);
566 }
567 }
568 }
569
570 if constexpr (tFront)
571 {
572 for (unsigned int n = 0u; n < tSize; ++n)
573 {
574 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
575 ocean_assert(index < elements);
576
577 intermediateBuffer[n] = row[index];
578 }
579
580 for (unsigned int n = tSize; n < 8u; ++n)
581 {
582 intermediateBuffer[n] = 0u;
583 }
584 }
585 else
586 {
587 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
588 {
589 intermediateBuffer[n] = 0u;
590 }
591
592 for (unsigned int n = 0u; n < tSize; ++n)
593 {
594 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
595 ocean_assert(index < elements);
596
597 intermediateBuffer[tOverlappingElements + n] = row[index];
598 }
599 }
600
601 return vld1_u8(intermediateBuffer);
602}
603
604template <unsigned int tChannels, bool tFront, unsigned int tSize>
605OCEAN_FORCE_INLINE uint8x16_t SumSquareDifferencesNEON::loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
606{
607 static_assert(tChannels >= 1u, "Invalid channel number!");
608
609 ocean_assert(tSize > 8u && tSize <= 16u);
610
611 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
612
613 constexpr unsigned int tOverlappingElements = 16u - tSize;
614
615 if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
616 {
617 if constexpr (tSize == 16u)
618 {
619 return vld1q_u8(row + elementIndex);
620 }
621 else
622 {
623 if constexpr (tFront)
624 {
625 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
626 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
627
628 return vandq_u8(vld1q_u8(row + elementIndex), mask_u_8x16);
629 }
630 else
631 {
632 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
633 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
634
635 return vandq_u8(vld1q_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x16);
636 }
637 }
638 }
639
640 if constexpr (tFront)
641 {
642 for (unsigned int n = 0u; n < tSize; ++n)
643 {
644 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
645 ocean_assert(index < elements);
646
647 intermediateBuffer[n] = row[index];
648 }
649
650 for (unsigned int n = tSize; n < 16u; ++n)
651 {
652 intermediateBuffer[n] = 0u;
653 }
654 }
655 else
656 {
657 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
658 {
659 intermediateBuffer[n] = 0u;
660 }
661
662 for (unsigned int n = 0u; n < tSize; ++n)
663 {
664 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
665 ocean_assert(index < elements);
666
667 intermediateBuffer[tOverlappingElements + n] = row[index];
668 }
669 }
670
671 return vld1q_u8(intermediateBuffer);
672}
673
674}
675
676}
677
678#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
679
680#endif // META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
static int mirrorOffset(const unsigned int index, const unsigned int elements)
Deprecated.
Definition CVUtilities.h:449
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1084
This class implements functions to calculate sum square differences using NEON instructions.
Definition SumSquareDifferencesNEON.h:28
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements)
Returns the mirrored element index for a given element index.
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
Definition SumSquareDifferencesNEON.h:327
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of square differences between two memory buffers.
Definition SumSquareDifferencesNEON.h:137
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
Definition SumSquareDifferencesNEON.h:535
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of square differences between two patches within an image.
Definition SumSquareDifferencesNEON.h:204
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
Definition SumSquareDifferencesNEON.h:605
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of square differences between an image patch and a buffer.
Definition SumSquareDifferencesNEON.h:321
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15