Ocean
Loading...
Searching...
No Matches
SumSquareDifferencesNEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
9#define META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
10
11#include "ocean/cv/CV.h"
12
13#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
14
15#include "ocean/cv/NEON.h"
16
17namespace Ocean
18{
19
20namespace CV
21{
22
23/**
24 * This class implements function to calculate sum square differences using NEON instructions.
25 * @ingroup cv
26 */
28{
29 public:
30
31 /**
32 * Returns the sum of square differences between two memory buffers.
33 * @param buffer0 The first memory buffer, must be valid
34 * @param buffer1 The second memory buffer, must be valid
35 * @return The resulting sum of square differences
36 * @tparam tSize The size of the buffers in elements, with range [1, infinity)
37 */
38 template <unsigned int tSize>
39 static inline uint32_t buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1);
40
41 /**
42 * Returns the sum of square differences between two patches within an image.
43 * @param patch0 The top left start position of the first image patch, must be valid
44 * @param patch1 The top left start position of the second image patch, must be valid
45 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels, tPatchSize, infinity)
46 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels, tPatchSize, infinity)
47 * @return The resulting sum of square differences
48 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
49 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
50 */
51 template <unsigned int tChannels, unsigned int tPatchSize>
52 static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
53
54 /**
55 * Returns the sum of square differences between an image patch and a buffer.
56 * @param patch0 The top left start position of the image patch, must be valid
57 * @param buffer1 The memory buffer, must be valid
58 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels, tPatchSize, infinity)
59 * @return The resulting sum of square differences
60 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
61 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
62 */
63 template <unsigned int tChannels, unsigned int tPatchSize>
64 static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
65
66 /**
67 * Returns the sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
68 * @param image0 The image in which the first patch is located, must be valid
69 * @param image1 The image in which the second patch is located, must be valid
70 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
71 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
72 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
73 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
74 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
75 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
76 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
77 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
78 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
79 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
80 * @return The resulting sum of square differences, with range [0, infinity)
81 * @tparam tChannels The number of frame channels, with range [1, infinity)
82 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
83 */
84 template <unsigned int tChannels, unsigned int tPatchSize>
85 static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
86
87 protected:
88
89 /**
90 * Returns the mirrored element index for a given element index.
91 * The mirrored index is calculated as follows:
92 * <pre>
93 * |<----------------------- valid value range -------------------------->|
94 *
95 * elementIndex: -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, ... elements - 3, elements - 2, elements - 1, elements + 0, elements + 1
96 * result: 2 1 0 0 1 2 3 4 5 6 7 ... elements - 3 elements - 2 elements - 1 elements - 1 elements - 2
97 * </pre>
98 * The resulting mirrored index is adjusted to support several channels.
99 * @param elementIndex The index for which the mirrored index will be returned, with range [-elements/2, elements + elements/2]
100 * @param elements The number of maximal elements, with range [1, infinity)
101 * @return The mirrored index, with range [0, elements)
102 * @tparam tChannels The number of channels the elements have, with range [1, infinity)
103 */
104 template <unsigned int tChannels>
105 static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements);
106
107 /**
108 * Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
109 * @param row The row from which the values will be loaded, must be valid
110 * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
111 * @param elements The number of elements in the row, with range [4, infinity)
112 * @param intermediateBuffer An intermediate buffer with 8 elements, must be valid
113 * @return The uint8x8_t object with the loaded values
114 * @tparam tChannels The number of channels the row has, with range [1, infinity)
115 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
116 * @tparam tSize The number of uint8_t values to be read, with range [1, 8]
117 */
118 template <unsigned int tChannels, bool tFront, unsigned int tSize>
119 static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
120
121 /**
122 * Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
123 * @param row The row from which the values will be loaded, must be valid
124 * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
125 * @param elements The number of elements in the row, with range [8, infinity)
126 * @param intermediateBuffer An intermediate buffer with 16 elements, must be valid
127 * @return The uint8x16_t object with the loaded values
128 * @tparam tChannels The number of channels the row has, with range [1, infinity)
129 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
130 * @tparam tSize The number of uint8_t values to be read, with range [1, 16]
131 */
132 template <unsigned int tChannels, bool tFront, unsigned int tSize>
133 static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
134};
135
136template <unsigned int tSize>
137inline uint32_t SumSquareDifferencesNEON::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1)
138{
139 static_assert(tSize >= 1u, "Invalid buffer size!");
140
141 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
142 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
143
144 // first, we handle blocks with 16 elements
145
146 constexpr unsigned int blocks16 = tSize / 16u;
147
148 for (unsigned int n = 0u; n < blocks16; ++n)
149 {
150 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
151 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
152
153 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
154 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
155
156 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
157 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
158 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
159
160 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
161 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
162
163 buffer0 += 16;
164 buffer1 += 16;
165 }
166
167 // we may handle at most one block with 8 elements
168
169 constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
170 static_assert(blocks8 <= 1u, "Invalid number of blocks!");
171
172 if (blocks8 == 1u)
173 {
174 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
175 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(buffer0), vld1_u8(buffer1));
176
177 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
178 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
179
180 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
181
182 buffer0 += 8;
183 buffer1 += 8;
184 }
185
186 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
187
188 uint32_t results[4];
189 vst1q_u32(results, sum_u_32x4);
190
191 constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
192 static_assert(remainingElements < 8u, "Invalid number of remaining elements!");
193
194 uint32_t result = results[0] + results[1] + results[2] + results[3];
195
196 // we apply the remaining elements (at most 7)
197
198 for (unsigned int n = 0u; n < remainingElements; ++n)
199 {
200 result += sqrDistance(buffer0[n], buffer1[n]);
201 }
202
203 return result;
204}
205
206template <unsigned int tChannels, unsigned int tPatchSize>
207inline uint32_t SumSquareDifferencesNEON::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
208{
209 static_assert(tChannels >= 1u, "Invalid channel number!");
210 static_assert(tPatchSize >= 5u, "Invalid patch size!");
211
212 ocean_assert(patch0 != nullptr && patch1 != nullptr);
213
214 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
215 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
216
217 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
218
219 constexpr unsigned int blocks16 = patchWidthElements / 16u;
220 constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
221 constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
222
223 static_assert(blocks1 <= 7u, "Invalid block size!");
224
225 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
226 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
227
228 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
229 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
230
231 uint32_t sumIndividual = 0u;
232
233 for (unsigned int y = 0u; y < tPatchSize; ++y)
234 {
235 for (unsigned int n = 0u; n < blocks16; ++n)
236 {
237 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
238 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
239
240 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
241 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
242
243 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
244 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
245 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
246
247 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
248 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
249
250 patch0 += 16;
251 patch1 += 16;
252 }
253
254 for (unsigned int n = 0u; n < blocks8; ++n)
255 {
256 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
257 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(patch0), vld1_u8(patch1));
258
259 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
260 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
261
262 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
263
264 patch0 += 8;
265 patch1 += 8;
266 }
267
268 if constexpr (blocks1 != 0u)
269 {
270 if (blocks1 >= 3u)
271 {
272 // we have enough elements left so that using NEON is still faster than handling each element individually
273
274 if (y < tPatchSize - 1u)
275 {
276 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
277 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
278
279 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
280
281 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
282 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
283
284 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
285 }
286 else
287 {
288 constexpr unsigned int overlapElements = 8u - blocks1;
289 static_assert(overlapElements >= 1u && overlapElements < 8u, "Invalid number!");
290
291 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
292 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
293
294 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
295
296 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
297 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
298
299 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
300 }
301 }
302 else
303 {
304 for (unsigned int n = 0u; n < blocks1; ++n)
305 {
306 sumIndividual += sqrDistance(patch0[n], patch1[n]);
307 }
308 }
309
310 patch0 += blocks1;
311 patch1 += blocks1;
312 }
313
314 patch0 += patch0StrideElements - patchWidthElements;
315 patch1 += patch1StrideElements - patchWidthElements;
316 }
317
318 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
319
320 uint32_t results[4];
321 vst1q_u32(results, sum_u_32x4);
322
323 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
324}
325
326template <unsigned int tChannels, unsigned int tPatchSize>
327inline uint32_t SumSquareDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
328{
329 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
330}
331
332template <unsigned int tChannels, unsigned int tPatchSize>
333uint32_t SumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
334{
335 static_assert(tChannels >= 1u, "Invalid channel number!");
336 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
337
338 ocean_assert(image0 != nullptr && image1 != nullptr);
339
340 ocean_assert(centerX0 < width0 && centerY0 < height0);
341 ocean_assert(centerX1 < width1 && centerY1 < height1);
342
343 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
344
345 const unsigned int width0Elements = width0 * tChannels;
346 const unsigned int width1Elements = width1 * tChannels;
347
348 const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
349 const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
350
351 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
352
353 constexpr unsigned int blocks16 = patchWidthElements / 16u;
354 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
355
356 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
357 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
358
359 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
360 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
361
362 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
363 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
364
365 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
366
367 static_assert(blocks1 <= 7u, "Invalid block size!");
368
369 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
370 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
371
372 uint32_t sumIndividual = 0u;
373
374 uint8_t intermediate[16];
375
376 int y1 = int(centerY1) - int(tPatchSize_2);
377 for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
378 {
379 const uint8_t* const mirroredRow0 = image0 + (unsigned int)(y0 + CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
380 const uint8_t* const mirroredRow1 = image1 + (unsigned int)(y1 + CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
381
382 int x0 = (int(centerX0) - int(tPatchSize_2)) * int(tChannels);
383 int x1 = (int(centerX1) - int(tPatchSize_2)) * int(tChannels);
384
385 for (unsigned int n = 0u; n < blocks16; ++n)
386 {
387 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
388 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate));
389
390 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
391 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
392
393 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
394 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
395 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
396
397 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
398 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
399
400 x0 += 16;
401 x1 += 16;
402 }
403
404 if constexpr (partialBlock16)
405 {
406 if (y0 < int(centerY0) + int(tPatchSize_2))
407 {
408 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
409 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
410
411 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
412 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
413
414 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
415 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
416 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
417
418 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
419 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
420 }
421 else
422 {
423 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
424 const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
425
426 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
427 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
428
429 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
430 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
431 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
432
433 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
434 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
435 }
436
437 x0 += remainingAfterBlocks16;
438 x1 += remainingAfterBlocks16;
439 }
440
441 for (unsigned int n = 0u; n < blocks8; ++n)
442 {
443 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
444 const uint8x8_t absDifference_u_8x8 = vabd_u8(loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate));
445
446 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
447 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
448
449 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
450
451 x0 += 8;
452 x1 += 8;
453 }
454
455 if constexpr (partialBlock8)
456 {
457 // we have enough elements left so that using NEON is still faster than handling each element individually
458
459 if (y0 < int(centerY0) + int(tPatchSize_2))
460 {
461 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
462 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
463
464 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
465
466 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
467 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
468
469 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
470 }
471 else
472 {
473 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
474 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
475
476 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
477
478 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
479 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
480
481 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
482 }
483
484 x0 += remainingAfterBlocks8;
485 x1 += remainingAfterBlocks8;
486 }
487
488 if constexpr (blocks1 != 0u)
489 {
490 for (unsigned int n = 0u; n < blocks1; ++n)
491 {
492 sumIndividual += sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 + int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 + int(n), width1Elements)]);
493 }
494 }
495
496 ++y1;
497 }
498
499 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
500
501 uint32_t results[4];
502 vst1q_u32(results, sum_u_32x4);
503
504 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
505}
506
507template <unsigned int tChannels>
508inline unsigned int SumSquareDifferencesNEON::mirrorIndex(const int elementIndex, const unsigned int elements)
509{
510 static_assert(tChannels >= 1u, "Invalid channel number!");
511
512 if ((unsigned int)(elementIndex) < elements)
513 {
514 return elementIndex;
515 }
516
517 if (elementIndex < 0)
518 {
519 const unsigned int leftElements = (unsigned int)(-elementIndex) - 1u;
520
521 const unsigned int pixelIndex = leftElements / tChannels;
522 const unsigned int channelIndex = tChannels - (leftElements % tChannels) - 1u;
523 ocean_assert(channelIndex < tChannels);
524
525 ocean_assert(pixelIndex * tChannels + channelIndex < elements);
526 return pixelIndex * tChannels + channelIndex;
527 }
528 else
529 {
530 ocean_assert(elementIndex >= elements);
531
532 const unsigned int rightElements = elementIndex - elements;
533
534 const unsigned int rightPixels = rightElements / tChannels;
535 const unsigned int channelIndex = rightElements % tChannels;
536 ocean_assert(channelIndex < tChannels);
537
538 ocean_assert(elements - (rightPixels + 1u) * tChannels + channelIndex < elements);
539 return elements - (rightPixels + 1u) * tChannels + channelIndex;
540 }
541}
542
543template <unsigned int tChannels, bool tFront, unsigned int tSize>
544OCEAN_FORCE_INLINE uint8x8_t SumSquareDifferencesNEON::loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
545{
546 static_assert(tChannels >= 1u, "Invalid channel number!");
547
548 ocean_assert(tSize >= 1u && tSize <= 8u);
549
550 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
551
552 constexpr unsigned int tOverlappingElements = 8u - tSize;
553
554 if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
555 {
556 if constexpr (tSize == 8u)
557 {
558 return vld1_u8(row + elementIndex);
559 }
560 else
561 {
562 if constexpr (tFront)
563 {
564 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
565 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
566
567 return vand_u8(vld1_u8(row + elementIndex), mask_u_8x8);
568 }
569 else
570 {
571 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
572 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
573
574 return vand_u8(vld1_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x8);
575 }
576 }
577 }
578
579 if constexpr (tFront)
580 {
581 for (unsigned int n = 0u; n < tSize; ++n)
582 {
583 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
584 ocean_assert(index < elements);
585
586 intermediateBuffer[n] = row[index];
587 }
588
589 for (unsigned int n = tSize; n < 8u; ++n)
590 {
591 intermediateBuffer[n] = 0u;
592 }
593 }
594 else
595 {
596 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
597 {
598 intermediateBuffer[n] = 0u;
599 }
600
601 for (unsigned int n = 0u; n < tSize; ++n)
602 {
603 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
604 ocean_assert(index < elements);
605
606 intermediateBuffer[tOverlappingElements + n] = row[index];
607 }
608 }
609
610 return vld1_u8(intermediateBuffer);
611}
612
613template <unsigned int tChannels, bool tFront, unsigned int tSize>
614OCEAN_FORCE_INLINE uint8x16_t SumSquareDifferencesNEON::loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
615{
616 static_assert(tChannels >= 1u, "Invalid channel number!");
617
618 ocean_assert(tSize > 8u && tSize <= 16u);
619
620 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
621
622 constexpr unsigned int tOverlappingElements = 16u - tSize;
623
624 if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
625 {
626 if constexpr (tSize == 16u)
627 {
628 return vld1q_u8(row + elementIndex);
629 }
630 else
631 {
632 if constexpr (tFront)
633 {
634 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
635 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
636
637 return vandq_u8(vld1q_u8(row + elementIndex), mask_u_8x16);
638 }
639 else
640 {
641 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
642 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
643
644 return vandq_u8(vld1q_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x16);
645 }
646 }
647 }
648
649 if constexpr (tFront)
650 {
651 for (unsigned int n = 0u; n < tSize; ++n)
652 {
653 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
654 ocean_assert(index < elements);
655
656 intermediateBuffer[n] = row[index];
657 }
658
659 for (unsigned int n = tSize; n < 16u; ++n)
660 {
661 intermediateBuffer[n] = 0u;
662 }
663 }
664 else
665 {
666 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
667 {
668 intermediateBuffer[n] = 0u;
669 }
670
671 for (unsigned int n = 0u; n < tSize; ++n)
672 {
673 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
674 ocean_assert(index < elements);
675
676 intermediateBuffer[tOverlappingElements + n] = row[index];
677 }
678 }
679
680 return vld1q_u8(intermediateBuffer);
681}
682
683}
684
685}
686
687#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
688
689#endif // META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
static int mirrorOffset(const unsigned int index, const unsigned int elements)
Deprecated.
Definition CVUtilities.h:446
This class implements function to calculate sum square differences using NEON instructions.
Definition SumSquareDifferencesNEON.h:28
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements)
Returns the mirrored element index for a given element index.
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences between two patches within an image, patch pixels outside the i...
Definition SumSquareDifferencesNEON.h:333
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of square differences between two memory buffers.
Definition SumSquareDifferencesNEON.h:137
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
Definition SumSquareDifferencesNEON.h:544
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of square differences between two patches within an image.
Definition SumSquareDifferencesNEON.h:207
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
Definition SumSquareDifferencesNEON.h:614
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of square differences between an image patch and a buffer.
Definition SumSquareDifferencesNEON.h:327
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition Accessor.h:15