Ocean
SumSquareDifferencesNEON.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
9 #define META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
10 
11 #include "ocean/cv/CV.h"
12 
13 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
14 
15 #include "ocean/cv/NEON.h"
16 
17 namespace Ocean
18 {
19 
20 namespace CV
21 {
22 
23 /**
24  * This class implements function to calculate sum square differences using NEON instructions.
25  * @ingroup cv
26  */
28 {
29  public:
30 
31  /**
32  * Returns the sum of square differences between two memory buffers.
33  * @param buffer0 The first memory buffer, must be valid
34  * @param buffer1 The second memory buffer, must be valid
35  * @return The resulting sum of square differences
36  * @tparam tSize The size of the buffers in elements, with range [1, infinity)
37  */
38  template <unsigned int tSize>
39  static inline uint32_t buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1);
40 
41  /**
42  * Returns the sum of square differences between two patches within an image.
43  * @param patch0 The top left start position of the first image patch, must be valid
44  * @param patch1 The top left start position of the second image patch, must be valid
45  * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels, tPatchSize, infinity)
46  * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels, tPatchSize, infinity)
47  * @return The resulting sum of square differences
48  * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
49  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
50  */
51  template <unsigned int tChannels, unsigned int tPatchSize>
52  static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
53 
54  /**
55  * Returns the sum of square differences between an image patch and a buffer.
56  * @param patch0 The top left start position of the image patch, must be valid
57  * @param buffer1 The memory buffer, must be valid
58  * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels, tPatchSize, infinity)
59  * @return The resulting sum of square differences
60  * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
61  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
62  */
63  template <unsigned int tChannels, unsigned int tPatchSize>
64  static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
65 
66  /**
67  * Returns the sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
68  * @param image0 The image in which the first patch is located, must be valid
69  * @param image1 The image in which the second patch is located, must be valid
70  * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
71  * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
72  * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
73  * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
74  * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
75  * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
76  * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
77  * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
78  * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
79  * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
80  * @return The resulting sum of square differences, with range [0, infinity)
81  * @tparam tChannels The number of frame channels, with range [1, infinity)
82  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
83  */
84  template <unsigned int tChannels, unsigned int tPatchSize>
85  static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
86 
87  protected:
88 
89  /**
90  * Returns the mirrored element index for a given element index.
91  * The mirrored index is calculated as follows:
92  * <pre>
93  * |<----------------------- valid value range -------------------------->|
94  *
95  * elementIndex: -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, ... elements - 3, elements - 2, elements - 1, elements + 0, elements + 1
96  * result: 2 1 0 0 1 2 3 4 5 6 7 ... elements - 3 elements - 2 elements - 1 elements - 1 elements - 2
97  * </pre>
98  * The resulting mirrored index is adjusted to support several channels.
99  * @param elementIndex The index for which the mirrored index will be returned, with range [-elements/2, elements + elements/2]
100  * @param elements The number of maximal elements, with range [1, infinity)
101  * @return The mirrored index, with range [0, elements)
102  * @tparam tChannels The number of channels the elements have, with range [1, infinity)
103  */
104  template <unsigned int tChannels>
105  static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements);
106 
107  /**
108  * Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
109  * @param row The row from which the values will be loaded, must be valid
110  * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
111  * @param elements The number of elements in the row, with range [4, infinity)
112  * @param intermediateBuffer An intermediate buffer with 8 elements, must be valid
113  * @return The uint8x8_t object with the loaded values
114  * @tparam tChannels The number of channels the row has, with range [1, infinity)
115  * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
116  * @tparam tSize The number of uint8_t values to be read, with range [1, 8]
117  */
118  template <unsigned int tChannels, bool tFront, unsigned int tSize>
119  static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
120 
121  /**
122  * Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
123  * @param row The row from which the values will be loaded, must be valid
124  * @param elementIndex The index of the first elements to load, with range [-elements/2, elements + elements/2]
125  * @param elements The number of elements in the row, with range [8, infinity)
126  * @param intermediateBuffer An intermediate buffer with 16 elements, must be valid
127  * @return The uint8x16_t object with the loaded values
128  * @tparam tChannels The number of channels the row has, with range [1, infinity)
129  * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
130  * @tparam tSize The number of uint8_t values to be read, with range [1, 16]
131  */
132  template <unsigned int tChannels, bool tFront, unsigned int tSize>
133  static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer);
134 };
135 
136 template <unsigned int tSize>
137 inline uint32_t SumSquareDifferencesNEON::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1)
138 {
139  static_assert(tSize >= 1u, "Invalid buffer size!");
140 
141  uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
142  uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
143 
144  // first, we handle blocks with 16 elements
145 
146  constexpr unsigned int blocks16 = tSize / 16u;
147 
148  for (unsigned int n = 0u; n < blocks16; ++n)
149  {
150  // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
151  const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
152 
153  const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
154  const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
155 
156  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
157  const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
158  const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
159 
160  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
161  sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
162 
163  buffer0 += 16;
164  buffer1 += 16;
165  }
166 
167  // we may handle at most one block with 8 elements
168 
169  constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
170  static_assert(blocks8 <= 1u, "Invalid number of blocks!");
171 
172  if (blocks8 == 1u)
173  {
174  // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
175  const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(buffer0), vld1_u8(buffer1));
176 
177  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
178  const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
179 
180  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
181 
182  buffer0 += 8;
183  buffer1 += 8;
184  }
185 
186  const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
187 
188  uint32_t results[4];
189  vst1q_u32(results, sum_u_32x4);
190 
191  constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
192  static_assert(remainingElements < 8u, "Invalid number of remaining elements!");
193 
194  uint32_t result = results[0] + results[1] + results[2] + results[3];
195 
196  // we apply the remaining elements (at most 7)
197 
198  for (unsigned int n = 0u; n < remainingElements; ++n)
199  {
200  result += sqrDistance(buffer0[n], buffer1[n]);
201  }
202 
203  return result;
204 }
205 
206 template <unsigned int tChannels, unsigned int tPatchSize>
207 inline uint32_t SumSquareDifferencesNEON::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
208 {
209  static_assert(tChannels >= 1u, "Invalid channel number!");
210  static_assert(tPatchSize >= 5u, "Invalid patch size!");
211 
212  ocean_assert(patch0 != nullptr && patch1 != nullptr);
213 
214  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
215  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
216 
217  constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
218 
219  constexpr unsigned int blocks16 = patchWidthElements / 16u;
220  constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
221  constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
222 
223  static_assert(blocks1 <= 7u, "Invalid block size!");
224 
225  const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
226  const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
227 
228  uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
229  uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
230 
231  uint32_t sumIndividual = 0u;
232 
233  for (unsigned int y = 0u; y < tPatchSize; ++y)
234  {
235  for (unsigned int n = 0u; n < blocks16; ++n)
236  {
237  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
238  const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
239 
240  const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
241  const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
242 
243  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
244  const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
245  const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
246 
247  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
248  sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
249 
250  patch0 += 16;
251  patch1 += 16;
252  }
253 
254  for (unsigned int n = 0u; n < blocks8; ++n)
255  {
256  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
257  const uint8x8_t absDifference_u_8x8 = vabd_u8(vld1_u8(patch0), vld1_u8(patch1));
258 
259  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
260  const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
261 
262  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
263 
264  patch0 += 8;
265  patch1 += 8;
266  }
267 
268  if constexpr (blocks1 != 0u)
269  {
270  if (blocks1 >= 3u)
271  {
272  // we have enough elements left so that using NEON is still faster than handling each element individually
273 
274  if (y < tPatchSize - 1u)
275  {
276  const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
277  const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
278 
279  const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
280 
281  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
282  const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
283 
284  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
285  }
286  else
287  {
288  constexpr unsigned int overlapElements = 8u - blocks1;
289  static_assert(overlapElements >= 1u && overlapElements < 8u, "Invalid number!");
290 
291  const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
292  const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
293 
294  const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
295 
296  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
297  const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
298 
299  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
300  }
301  }
302  else
303  {
304  for (unsigned int n = 0u; n < blocks1; ++n)
305  {
306  sumIndividual += sqrDistance(patch0[n], patch1[n]);
307  }
308  }
309 
310  patch0 += blocks1;
311  patch1 += blocks1;
312  }
313 
314  patch0 += patch0StrideElements - patchWidthElements;
315  patch1 += patch1StrideElements - patchWidthElements;
316  }
317 
318  const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
319 
320  uint32_t results[4];
321  vst1q_u32(results, sum_u_32x4);
322 
323  return results[0] + results[1] + results[2] + results[3] + sumIndividual;
324 }
325 
326 template <unsigned int tChannels, unsigned int tPatchSize>
327 inline uint32_t SumSquareDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
328 {
329  return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
330 }
331 
332 template <unsigned int tChannels, unsigned int tPatchSize>
333 uint32_t SumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
334 {
335  static_assert(tChannels >= 1u, "Invalid channel number!");
336  static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
337 
338  ocean_assert(image0 != nullptr && image1 != nullptr);
339 
340  ocean_assert(centerX0 < width0 && centerY0 < height0);
341  ocean_assert(centerX1 < width1 && centerY1 < height1);
342 
343  constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
344 
345  const unsigned int width0Elements = width0 * tChannels;
346  const unsigned int width1Elements = width1 * tChannels;
347 
348  const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
349  const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
350 
351  constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
352 
353  constexpr unsigned int blocks16 = patchWidthElements / 16u;
354  constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
355 
356  constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
357  constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
358 
359  constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
360  constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
361 
362  constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
363  constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
364 
365  constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
366 
367  static_assert(blocks1 <= 7u, "Invalid block size!");
368 
369  uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
370  uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
371 
372  uint32_t sumIndividual = 0u;
373 
374  uint8_t intermediate[16];
375 
376  int y1 = int(centerY1) - int(tPatchSize_2);
377  for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
378  {
379  const uint8_t* const mirroredRow0 = image0 + (unsigned int)(y0 + CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
380  const uint8_t* const mirroredRow1 = image1 + (unsigned int)(y1 + CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
381 
382  int x0 = (int(centerX0) - int(tPatchSize_2)) * int(tChannels);
383  int x1 = (int(centerX1) - int(tPatchSize_2)) * int(tChannels);
384 
385  for (unsigned int n = 0u; n < blocks16; ++n)
386  {
387  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
388  const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate));
389 
390  const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
391  const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
392 
393  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
394  const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
395  const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
396 
397  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
398  sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
399 
400  x0 += 16;
401  x1 += 16;
402  }
403 
404  if constexpr (partialBlock16)
405  {
406  if (y0 < int(centerY0) + int(tPatchSize_2))
407  {
408  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
409  const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
410 
411  const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
412  const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
413 
414  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
415  const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
416  const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
417 
418  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
419  sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
420  }
421  else
422  {
423  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
424  const uint8x16_t absDifference_u_8x16 = vabdq_u8(loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate));
425 
426  const uint8x8_t absDifferenceA_u_8x8 = vget_low_u8(absDifference_u_8x16);
427  const uint8x8_t absDifferenceB_u_8x8 = vget_high_u8(absDifference_u_8x16);
428 
429  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
430  const uint16x8_t sqrDifferenceA_u_16x8 = vmull_u8(absDifferenceA_u_8x8, absDifferenceA_u_8x8);
431  const uint16x8_t sqrDifferenceB_u_16x8 = vmull_u8(absDifferenceB_u_8x8, absDifferenceB_u_8x8);
432 
433  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifferenceA_u_16x8);
434  sumB_u_32x4 = vpadalq_u16(sumB_u_32x4, sqrDifferenceB_u_16x8);
435  }
436 
437  x0 += remainingAfterBlocks16;
438  x1 += remainingAfterBlocks16;
439  }
440 
441  for (unsigned int n = 0u; n < blocks8; ++n)
442  {
443  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
444  const uint8x8_t absDifference_u_8x8 = vabd_u8(loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate), loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate));
445 
446  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
447  const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
448 
449  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
450 
451  x0 += 8;
452  x1 += 8;
453  }
454 
455  if constexpr (partialBlock8)
456  {
457  // we have enough elements left so that using NEON is still faster than handling each element individually
458 
459  if (y0 < int(centerY0) + int(tPatchSize_2))
460  {
461  const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
462  const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
463 
464  const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
465 
466  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
467  const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
468 
469  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
470  }
471  else
472  {
473  const uint8x8_t remaining0_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
474  const uint8x8_t remaining1_u_8x8 = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
475 
476  const uint8x8_t absDifference_u_8x8 = vabd_u8(remaining0_u_8x8, remaining1_u_8x8);
477 
478  // sqrDifferenceA_u_16x8 = absDifferenceA_u_8x8 ^ 2
479  const uint16x8_t sqrDifference_u_16x8 = vmull_u8(absDifference_u_8x8, absDifference_u_8x8);
480 
481  sumA_u_32x4 = vpadalq_u16(sumA_u_32x4, sqrDifference_u_16x8);
482  }
483 
484  x0 += remainingAfterBlocks8;
485  x1 += remainingAfterBlocks8;
486  }
487 
488  if constexpr (blocks1 != 0u)
489  {
490  for (unsigned int n = 0u; n < blocks1; ++n)
491  {
492  sumIndividual += sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 + int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 + int(n), width1Elements)]);
493  }
494  }
495 
496  ++y1;
497  }
498 
499  const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
500 
501  uint32_t results[4];
502  vst1q_u32(results, sum_u_32x4);
503 
504  return results[0] + results[1] + results[2] + results[3] + sumIndividual;
505 }
506 
507 template <unsigned int tChannels>
508 inline unsigned int SumSquareDifferencesNEON::mirrorIndex(const int elementIndex, const unsigned int elements)
509 {
510  static_assert(tChannels >= 1u, "Invalid channel number!");
511 
512  if ((unsigned int)(elementIndex) < elements)
513  {
514  return elementIndex;
515  }
516 
517  if (elementIndex < 0)
518  {
519  const unsigned int leftElements = (unsigned int)(-elementIndex) - 1u;
520 
521  const unsigned int pixelIndex = leftElements / tChannels;
522  const unsigned int channelIndex = tChannels - (leftElements % tChannels) - 1u;
523  ocean_assert(channelIndex < tChannels);
524 
525  ocean_assert(pixelIndex * tChannels + channelIndex < elements);
526  return pixelIndex * tChannels + channelIndex;
527  }
528  else
529  {
530  ocean_assert(elementIndex >= elements);
531 
532  const unsigned int rightElements = elementIndex - elements;
533 
534  const unsigned int rightPixels = rightElements / tChannels;
535  const unsigned int channelIndex = rightElements % tChannels;
536  ocean_assert(channelIndex < tChannels);
537 
538  ocean_assert(elements - (rightPixels + 1u) * tChannels + channelIndex < elements);
539  return elements - (rightPixels + 1u) * tChannels + channelIndex;
540  }
541 }
542 
543 template <unsigned int tChannels, bool tFront, unsigned int tSize>
544 OCEAN_FORCE_INLINE uint8x8_t SumSquareDifferencesNEON::loadMirrored_u_8x8(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
545 {
546  static_assert(tChannels >= 1u, "Invalid channel number!");
547 
548  ocean_assert(tSize >= 1u && tSize <= 8u);
549 
550  ocean_assert(row != nullptr && intermediateBuffer != nullptr);
551 
552  constexpr unsigned int tOverlappingElements = 8u - tSize;
553 
554  if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
555  {
556  if constexpr (tSize == 8u)
557  {
558  return vld1_u8(row + elementIndex);
559  }
560  else
561  {
562  if constexpr (tFront)
563  {
564  constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
565  const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
566 
567  return vand_u8(vld1_u8(row + elementIndex), mask_u_8x8);
568  }
569  else
570  {
571  constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
572  const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
573 
574  return vand_u8(vld1_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x8);
575  }
576  }
577  }
578 
579  if constexpr (tFront)
580  {
581  for (unsigned int n = 0u; n < tSize; ++n)
582  {
583  const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
584  ocean_assert(index < elements);
585 
586  intermediateBuffer[n] = row[index];
587  }
588 
589  for (unsigned int n = tSize; n < 8u; ++n)
590  {
591  intermediateBuffer[n] = 0u;
592  }
593  }
594  else
595  {
596  for (unsigned int n = 0u; n < tOverlappingElements; ++n)
597  {
598  intermediateBuffer[n] = 0u;
599  }
600 
601  for (unsigned int n = 0u; n < tSize; ++n)
602  {
603  const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
604  ocean_assert(index < elements);
605 
606  intermediateBuffer[tOverlappingElements + n] = row[index];
607  }
608  }
609 
610  return vld1_u8(intermediateBuffer);
611 }
612 
613 template <unsigned int tChannels, bool tFront, unsigned int tSize>
614 OCEAN_FORCE_INLINE uint8x16_t SumSquareDifferencesNEON::loadMirrored_u_8x16(const uint8_t* const row, const int elementIndex, const unsigned int elements, uint8_t* const intermediateBuffer)
615 {
616  static_assert(tChannels >= 1u, "Invalid channel number!");
617 
618  ocean_assert(tSize > 8u && tSize <= 16u);
619 
620  ocean_assert(row != nullptr && intermediateBuffer != nullptr);
621 
622  constexpr unsigned int tOverlappingElements = 16u - tSize;
623 
624  if (elementIndex >= 0 && elementIndex <= int(elements) - int(tSize))
625  {
626  if constexpr (tSize == 16u)
627  {
628  return vld1q_u8(row + elementIndex);
629  }
630  else
631  {
632  if constexpr (tFront)
633  {
634  constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
635  const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
636 
637  return vandq_u8(vld1q_u8(row + elementIndex), mask_u_8x16);
638  }
639  else
640  {
641  constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
642  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
643 
644  return vandq_u8(vld1q_u8(row + elementIndex - int(tOverlappingElements)), mask_u_8x16);
645  }
646  }
647  }
648 
649  if constexpr (tFront)
650  {
651  for (unsigned int n = 0u; n < tSize; ++n)
652  {
653  const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
654  ocean_assert(index < elements);
655 
656  intermediateBuffer[n] = row[index];
657  }
658 
659  for (unsigned int n = tSize; n < 16u; ++n)
660  {
661  intermediateBuffer[n] = 0u;
662  }
663  }
664  else
665  {
666  for (unsigned int n = 0u; n < tOverlappingElements; ++n)
667  {
668  intermediateBuffer[n] = 0u;
669  }
670 
671  for (unsigned int n = 0u; n < tSize; ++n)
672  {
673  const unsigned int index = mirrorIndex<tChannels>(elementIndex + int(n), elements);
674  ocean_assert(index < elements);
675 
676  intermediateBuffer[tOverlappingElements + n] = row[index];
677  }
678  }
679 
680  return vld1q_u8(intermediateBuffer);
681 }
682 
683 }
684 
685 }
686 
687 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
688 
689 #endif // META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_NEON_H
static int mirrorOffset(const unsigned int index, const unsigned int elements)
Deprecated.
Definition: CVUtilities.h:446
This class implements function to calculate sum square differences using NEON instructions.
Definition: SumSquareDifferencesNEON.h:28
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int elementIndex, const unsigned int elements)
Returns the mirrored element index for a given element index.
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences between two patches within an image, patch pixels outside the i...
Definition: SumSquareDifferencesNEON.h:333
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of square differences between two memory buffers.
Definition: SumSquareDifferencesNEON.h:137
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a row with mirroring pixels if necessary.
Definition: SumSquareDifferencesNEON.h:544
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of square differences between two patches within an image.
Definition: SumSquareDifferencesNEON.h:207
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int elementIndex, const unsigned int elements, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a row with mirroring pixels if necessary.
Definition: SumSquareDifferencesNEON.h:614
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of square differences between an image patch and a buffer.
Definition: SumSquareDifferencesNEON.h:327
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15