Ocean
Loading...
Searching...
No Matches
SumSquareDifferencesNEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
9#define META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
10
11#include "ocean/cv/CV.h"
12
13#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
14
15#include "ocean/cv/NEON.h"
16
17namespace Ocean
18{
19
20namespace CV
21{
22
23/**
24 * This class implements function to calculate sum square differences using NEON instructions.
25 * @ingroup cv
26 */
28{
29 public:
30
31 /**
32 * Returns the sum of square differences between two memory buffers.
33 * @param buffer0 The first memory buffer, must be valid
34 * @param buffer1 The second memory buffer, must be valid
35 * @return The resulting sum of square differences
36 * @tparam tSize The size of the buffers in elements, with range [1, infinity)
37 */
38 template <unsigned int tSize>
39 static inline uint32_t buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1);
40
41 /**
42 * Returns the sum of square differences between two patches within an image.
43 * @param patch0 The top left start position of the first image patch, must be valid
44 * @param patch1 The top left start position of the second image patch, must be valid
45 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels, tPatchSize, infinity)
46 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels, tPatchSize, infinity)
47 * @return The resulting sum of square differences
48 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
49 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
50 */
51 template <unsigned int tChannels, unsigned int tPatchSize>
52 static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
53
54 /**
55 * Returns the sum of square differences between an image patch and a buffer.
56 * @param patch0 The top left start position of the image patch, must be valid
57 * @param buffer1 The memory buffer, must be valid
58 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels, tPatchSize, infinity)
59 * @return The resulting sum of square differences
60 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
61 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
62 */
63 template <unsigned int tChannels, unsigned int tPatchSize>
64 static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
65
66 /**
67 * Returns the sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
68 * @param image0 The image in which the first patch is located, must be valid
69 * @param image1 The image in which the second patch is located, must be valid
70 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
71 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
72 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
73 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
74 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
75 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
76 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
77 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
78 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
79 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
80 * @return The resulting sum of square differences, with range [0, infinity)
81 * @tparam tChannels The number of frame channels, with range [1, infinity)
82 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
83 */
84 template <unsigned int tChannels, unsigned int tPatchSize>
85 static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
86
87 protected:
88
89 /**
90 * Returns the mirrored element index for a given element index.
91 * The mirrored index is calculated as follows:
92 * <pre>
93 * |<----------------------- valid value range -------------------------->|
94 *
95 * elementIndex: -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, ... elements - 3, elements - 2, elements - 1, elements + 0, elements + 1
96 * result: 2 1 0 0 1 2 3 4 5 6 7 ... elements - 3 elements - 2 elements - 1 elements - 1 elements - 2
97 * </pre>
98 * The resulting mirrored index is adjusted to support several channels.
99 * @param elementIndex The index for which the mirrored index will be returned, with range [-elements/2, elements + elements/2]
100 * @param elements The number of maximal elements, with range [1, infinity)
101 * @return The mirrored index, with range [0, elements)
102 * @tparam tChannels The number of channels the elements have, with range [1, infinity)
103 */
104 template <unsigned int tChannels>
105 static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements);
106
107 /**
108 * Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
109 * @param row The row from which the values will be loaded, must be valid
110 * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
111 * @param elements The number of elements in the row, with range [4, infinity)
112 * @param intermediateBuffer An intermediate buffer with 8 elements, must be valid
113 * @return The uint8x8_t object with the loaded values
114 * @tparam tChannels The number of channels the row has, with range [1, infinity)
115 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
116 * @tparam tSize The number of uint8_t values to be read, with range [1, 8]
117 */
118 template <unsigned int tChannels, bool tFront, unsigned int tSize>
119 static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
120
121 /**
122 * Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
123 * @param row The row from which the values will be loaded, must be valid
124 * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
125 * @param elements The number of elements in the row, with range [8, infinity)
126 * @param intermediateBuffer An intermediate buffer with 16 elements, must be valid
127 * @return The uint8x16_t object with the loaded values
128 * @tparam tChannels The number of channels the row has, with range [1, infinity)
129 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
130 * @tparam tSize The number of uint8_t values to be read, with range [1, 16]
131 */
132 template <unsigned int tChannels, bool tFront, unsigned int tSize>
133 static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
134};
135
136template <unsigned int tSize>
137inline uint32_t SumSquareDifferencesNEON::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1)
138{
139 static_assert(tSize >= 1u, "Invalid buffer size!");
140
141 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
142 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
143
144 // first, we handle blocks with 16 elements
145
146 constexpr unsigned int blocks16 = tSize / 16u;
147
148 for (unsigned int n = 0u; n < blocks16; ++n)
149 {
150 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
151 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
152
153 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
154 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
155
156 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
157 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
158 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
159
160 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
161 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
162
163 buffer0 += 16;
164 buffer1 += 16;
165 }
166
167 // we may handle at most one block with 8 elements
168
169 constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
170 static_assert(blocks8 <= 1u, "Invalid number of blocks!");
171
172 if (blocks8 == 1u)
173 {
174 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
175 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(buffer0), vld1_u8(buffer1));
176
177 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
178 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
179
180 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
181
182 buffer0 += 8;
183 buffer1 += 8;
184 }
185
186 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
187
188 constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
189 static_assert(remainingElements < 8u, "Invalid number of remaining elements!");
190
191 uint32_t result = NEON::sumHorizontal_u_32x4(sum_u_32x4);
192
193 // we apply the remaining elements (at most 7)
194
195 for (unsigned int n = 0u; n < remainingElements; ++n)
196 {
197 result += sqrDistance(buffer0[n], buffer1[n]);
198 }
199
200 return result;
201}
202
203template <unsigned int tChannels, unsigned int tPatchSize>
204inline uint32_t SumSquareDifferencesNEON::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
205{
206 static_assert(tChannels >= 1u, "Invalid channel number!");
207 static_assert(tPatchSize >= 5u, "Invalid patch size!");
208
209 ocean_assert(patch0 != nullptr && patch1 != nullptr);
210
211 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
212 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
213
214 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
215
216 constexpr unsigned int blocks16 = patchWidthElements / 16u;
217 constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
218 constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
219
220 static_assert(blocks1 <= 7u, "Invalid block size!");
221
222 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
223 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
224
225 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
226 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
227
228 uint32_t sumIndividual = 0u;
229
230 for (unsigned int y = 0u; y < tPatchSize; ++y)
231 {
232 for (unsigned int n = 0u; n < blocks16; ++n)
233 {
234 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
235 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
236
237 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
238 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
239
240 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
241 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
242 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
243
244 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
245 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
246
247 patch0 += 16;
248 patch1 += 16;
249 }
250
251 for (unsigned int n = 0u; n < blocks8; ++n)
252 {
253 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
254 const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(patch0), vld1_u8(patch1));
255
256 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
257 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
258
259 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
260
261 patch0 += 8;
262 patch1 += 8;
263 }
264
265 if constexpr (blocks1 != 0u)
266 {
267 if (blocks1 >= 3u)
268 {
269 // we have enough elements left so that using NEON is still faster than handling each element individually
270
271 if (y < tPatchSize - 1u)
272 {
273 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
274 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
275
276 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
277
278 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
279 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
280
281 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
282 }
283 else
284 {
285 constexpr unsigned int overlapElements = 8u - blocks1;
286 static_assert(overlapElements >= 1u && overlapElements < 8u, "Invalid number!");
287
288 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
289 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
290
291 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
292
293 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
294 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
295
296 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
297 }
298 }
299 else
300 {
301 for (unsigned int n = 0u; n < blocks1; ++n)
302 {
303 sumIndividual += sqrDistance(patch0[n], patch1[n]);
304 }
305 }
306
307 patch0 += blocks1;
308 patch1 += blocks1;
309 }
310
311 patch0 += patch0StrideElements - patchWidthElements;
312 patch1 += patch1StrideElements - patchWidthElements;
313 }
314
315 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
316
317 return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
318}
319
320template <unsigned int tChannels, unsigned int tPatchSize>
321inline uint32_t SumSquareDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
322{
323 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
324}
325
326template <unsigned int tChannels, unsigned int tPatchSize>
327uint32_t SumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
328{
329 static_assert(tChannels >= 1u, "Invalid channel number!");
330 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
331
332 ocean_assert(image0 != nullptr && image1 != nullptr);
333
334 ocean_assert(centerX0 < width0 && centerY0 < height0);
335 ocean_assert(centerX1 < width1 && centerY1 < height1);
336
337 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
338
339 const unsigned int width0Elements = width0 * tChannels;
340 const unsigned int width1Elements = width1 * tChannels;
341
342 const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
343 const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
344
345 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
346
347 constexpr unsigned int blocks16 = patchWidthElements / 16u;
348 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
349
350 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
351 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
352
353 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
354 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
355
356 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
357 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
358
359 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
360
361 static_assert(blocks1 <= 7u, "Invalid block size!");
362
363 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
364 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
365
366 uint32_t sumIndividual = 0u;
367
368 uint8_t intermediate[16];
369
370 int y1 = int(centerY1) - int(tPatchSize_2);
371 for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
372 {
373 const uint8_t* const mirroredRow0 = image0 + (unsigned int)(y0 + CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
374 const uint8_t* const mirroredRow1 = image1 + (unsigned int)(y1 + CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
375
376 int x0 = (int(centerX0) - int(tPatchSize_2)) * int(tChannels);
377 int x1 = (int(centerX1) - int(tPatchSize_2)) * int(tChannels);
378
379 for (unsigned int n = 0u; n < blocks16; ++n)
380 {
381 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
382 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate);
383 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate);
384 const uint8x16_t absDifference_u_8x16 = vabdq_u8(patch0_u_8x16, patch1_u_8x16);
385
386 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
387 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
388
389 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
390 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
391 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
392
393 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
394 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
395
396 x0 += 16;
397 x1 += 16;
398 }
399
400 if constexpr (partialBlock16)
401 {
402 if (y0 < int(centerY0) + int(tPatchSize_2))
403 {
404 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
405 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate);
406 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate);
407 const uint8x16_t absDifference_u_8x16 = vabdq_u8(patch0_u_8x16, patch1_u_8x16);
408
409 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
410 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
411
412 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
413 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
414 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
415
416 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
417 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
418 }
419 else
420 {
421 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
422 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate);
423 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate);
424 const uint8x16_t absDifference_u_8x16 = vabdq_u8(patch0_u_8x16, patch1_u_8x16);
425
426 const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
427 const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
428
429 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
430 const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
431 const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
432
433 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
434 sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
435 }
436
437 x0 += remainingAfterBlocks16;
438 x1 += remainingAfterBlocks16;
439 }
440
441 for (unsigned int n = 0u; n < blocks8; ++n)
442 {
443 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
444 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate);
445 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate);
446 const uint8x8_t absDifference_u_8x8 = vabd_u8(patch0_u_8x8, patch1_u_8x8);
447
448 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
449 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
450
451 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
452
453 x0 += 8;
454 x1 += 8;
455 }
456
457 if constexpr (partialBlock8)
458 {
459 // we have enough elements left so that using NEON is still faster than handling each element individually
460
461 if (y0 < int(centerY0) + int(tPatchSize_2))
462 {
463 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
464 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
465
466 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
467
468 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
469 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
470
471 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
472 }
473 else
474 {
475 const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
476 const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
477
478 const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
479
480 // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
481 const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
482
483 sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
484 }
485
486 x0 += remainingAfterBlocks8;
487 x1 += remainingAfterBlocks8;
488 }
489
490 if constexpr (blocks1 != 0u)
491 {
492 for (unsigned int n = 0u; n < blocks1; ++n)
493 {
494 sumIndividual += sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 + int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 + int(n), width1Elements)]);
495 }
496 }
497
498 ++y1;
499 }
500
501 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
502
503 return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
504}
505
506template <unsigned int tChannels>
507inline unsigned int SumSquareDifferencesNEON::mirrorIndex(const int elementIndex, const unsigned int elements)
508{
509 static_assert(tChannels >= 1u, "Invalid channel number!");
510
511 if ((unsigned int)(elementIndex) < elements)
512 {
513 return elementIndex;
514 }
515
516 if (elementIndex < 0)
517 {
518 const unsigned int leftElements = (unsigned int)(-elementIndex) - 1u;
519
520 const unsigned int pixelIndex = leftElements / tChannels;
521 const unsigned int channelIndex = tChannels - (leftElements % tChannels) - 1u;
522 ocean_assert(channelIndex < tChannels);
523
524 ocean_assert(pixelIndex * tChannels + channelIndex < elements);
525 return pixelIndex * tChannels + channelIndex;
526 }
527 else
528 {
529 ocean_assert((unsigned int)(elementIndex) >= elements);
530
531 const unsigned int rightElements = elementIndex - elements;
532
533 const unsigned int rightPixels = rightElements / tChannels;
534 const unsigned int channelIndex = rightElements % tChannels;
535 ocean_assert(channelIndex < tChannels);
536
537 ocean_assert(elements - (rightPixels + 1u) * tChannels + channelIndex < elements);
538 return elements - (rightPixels + 1u) * tChannels + channelIndex;
539 }
540}
541
542template <unsigned int tChannels, bool tFront, unsigned int tSize>
543OCEAN_FORCE_INLINE uint8x8_t SumSquareDifferencesNEON::loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
544{
545 static_assert(tChannels >= 1u, "Invalid channel number!");
546
547 ocean_assert(tSize >= 1u && tSize <= 8u);
548
549 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
550
551 constexpr unsigned int tOverlappingElements = 8u - tSize;
552
553 if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
554 {
555 if constexpr (tSize == 8u)
556 {
557 return vld1_u8(row + elementIndex);
558 }
559 else
560 {
561 if constexpr (tFront)
562 {
563 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
564 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
565
566 return vand_u8(vld1_u8(row + elementIndex), mask_u_8x8);
567 }
568 else
569 {
570 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
571 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
572
573 return vand_u8(vld1_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x8);
574 }
575 }
576 }
577
578 if constexpr (tFront)
579 {
580 for (unsigned int n = 0u; n < tSize; ++n)
581 {
582 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
583 ocean_assert(index < elements);
584
585 intermediateBuffer[n] = row[index];
586 }
587
588 for (unsigned int n = tSize; n < 8u; ++n)
589 {
590 intermediateBuffer[n] = 0u;
591 }
592 }
593 else
594 {
595 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
596 {
597 intermediateBuffer[n] = 0u;
598 }
599
600 for (unsigned int n = 0u; n < tSize; ++n)
601 {
602 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
603 ocean_assert(index < elements);
604
605 intermediateBuffer[tOverlappingElements + n] = row[index];
606 }
607 }
608
609 return vld1_u8(intermediateBuffer);
610}
611
612template <unsigned int tChannels, bool tFront, unsigned int tSize>
613OCEAN_FORCE_INLINE uint8x16_t SumSquareDifferencesNEON::loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
614{
615 static_assert(tChannels >= 1u, "Invalid channel number!");
616
617 ocean_assert(tSize > 8u && tSize <= 16u);
618
619 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
620
621 constexpr unsigned int tOverlappingElements = 16u - tSize;
622
623 if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
624 {
625 if constexpr (tSize == 16u)
626 {
627 return vld1q_u8(row + elementIndex);
628 }
629 else
630 {
631 if constexpr (tFront)
632 {
633 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
634 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
635
636 return vandq_u8(vld1q_u8(row + elementIndex), mask_u_8x16);
637 }
638 else
639 {
640 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
641 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
642
643 return vandq_u8(vld1q_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x16);
644 }
645 }
646 }
647
648 if constexpr (tFront)
649 {
650 for (unsigned int n = 0u; n < tSize; ++n)
651 {
652 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
653 ocean_assert(index < elements);
654
655 intermediateBuffer[n] = row[index];
656 }
657
658 for (unsigned int n = tSize; n < 16u; ++n)
659 {
660 intermediateBuffer[n] = 0u;
661 }
662 }
663 else
664 {
665 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
666 {
667 intermediateBuffer[n] = 0u;
668 }
669
670 for (unsigned int n = 0u; n < tSize; ++n)
671 {
672 const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
673 ocean_assert(index < elements);
674
675 intermediateBuffer[tOverlappingElements + n] = row[index];
676 }
677 }
678
679 return vld1q_u8(intermediateBuffer);
680}
681
682}
683
684}
685
686#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
687
688#endif // META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
static int mirrorOffset(const unsigned int index, const unsigned int elements)
Deprecated.
Definition CVUtilities.h:449
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1196
This class implements functions to calculate sum square differences using NEON instructions.
Definition SumSquareDifferencesNEON.h:28
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements)
Returns the mirrored element index for a given element index.
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences between two patches within an image, patch pixels outside the i...
Definition SumSquareDifferencesNEON.h:327
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of square differences between two memory buffers.
Definition SumSquareDifferencesNEON.h:137
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
Definition SumSquareDifferencesNEON.h:543
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of square differences between two patches within an image.
Definition SumSquareDifferencesNEON.h:204
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
Definition SumSquareDifferencesNEON.h:613
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of square differences between an image patch and a buffer.
Definition SumSquareDifferencesNEON.h:321
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15