Ocean
ZeroMeanSumSquareDifferencesNEON.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
9 #define META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
10 
11 #include "ocean/cv/CV.h"
12 
13 #include "ocean/base/Utilities.h"
14 
15 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
16 
17 #include "ocean/cv/NEON.h"
18 
19 namespace Ocean
20 {
21 
22 namespace CV
23 {
24 
/**
 * This class implements functions to calculate zero-mean sum of square differences using NEON instructions.
 * @ingroup cv
 */
class ZeroMeanSumSquareDifferencesNEON
{
	protected:

		/**
		 * This class allows to specialize functions for individual channels.
		 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
		 */
		template <unsigned int tChannels>
		class SpecializedForChannels
		{
			public:

				/**
				 * Determines the mean value for a buffer, one value for each channel.
				 * @param buffer The memory buffer to be handled, must be valid
				 * @param meanValues The resulting mean values, one for each channel
				 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
				 */
				template <unsigned int tPixels>
				static inline void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);

				/**
				 * Determines the mean value for an image patch, one value for each channel.
				 * @param patch The top left start position of the image patch, must be valid
				 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
				 * @param meanValues The resulting mean values, one for each channel
				 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity)
				 */
				template <unsigned int tPatchSize>
				static inline void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);

				/**
				 * Determines the mean value for an image patch, one value for each channel, patch pixels outside the image will be mirrored back into the image.
				 * @param image The image in which the patch is located, must be valid
				 * @param width The width of the image, in pixels, with range [tPatchSize, infinity)
				 * @param height The height of the image, in pixels, with range [tPatchSize, infinity)
				 * @param centerX Horizontal center position of the (tPatchSize x tPatchSize) block in the frame, with range [0, width - 1]
				 * @param centerY Vertical center position of the (tPatchSize x tPatchSize) block in the frame, with range [0, height - 1]
				 * @param imagePaddingElements The number of padding elements at the end of each row of the image, in elements, with range [0, infinity)
				 * @param meanValues The resulting mean values, one for each channel
				 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
				 */
				template <unsigned int tPatchSize>
				static inline void mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues);

				/**
				 * Returns the zero-mean sum of square differences between two memory buffers.
				 * @param buffer0 The first memory buffer, must be valid
				 * @param buffer1 The second memory buffer, must be valid
				 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
				 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
				 * @return The resulting sum of square differences
				 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
				 */
				template <unsigned int tPixels>
				static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1);

				/**
				 * Returns the zero-mean sum of square differences between two patches within an image.
				 * @param patch0 The top left start position of the first image patch, must be valid
				 * @param patch1 The top left start position of the second image patch, must be valid
				 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
				 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
				 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
				 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
				 * @return The resulting sum of square differences
				 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
				 */
				template <unsigned int tPatchSize>
				static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);

				/**
				 * Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
				 * @param image0 The image in which the first patch is located, must be valid
				 * @param image1 The image in which the second patch is located, must be valid
				 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
				 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
				 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
				 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
				 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [0, width0 - 1]
				 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [0, height0 - 1]
				 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [0, width1 - 1]
				 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [0, height1 - 1]
				 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
				 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
				 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
				 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
				 * @return The resulting zero-mean sum of square differences, with range [0, infinity)
				 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
				 */
				template <unsigned int tPatchSize>
				static inline uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
		};

	public:

		/**
		 * Returns the zero-mean sum of square differences between two memory buffers.
		 * @param buffer0 The first memory buffer, must be valid
		 * @param buffer1 The second memory buffer, must be valid
		 * @return The resulting sum of square differences
		 * @tparam tChannels Specifies the number of channels for the given buffers, with range [1, infinity)
		 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
		 */
		template <unsigned int tChannels, unsigned int tPixels>
		static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1);

		/**
		 * Returns the zero-mean sum of square differences between two patches within an image.
		 * @param patch0 The top left start position of the first image patch, must be valid
		 * @param patch1 The top left start position of the second image patch, must be valid
		 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @return The resulting sum of square differences
		 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static inline uint32_t patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);

		/**
		 * Returns the zero-mean sum of square differences between an image patch and a buffer.
		 * @param patch0 The top left start position of the image patch, must be valid
		 * @param buffer1 The memory buffer, must be valid
		 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @return The resulting sum of square differences
		 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);

		/**
		 * Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
		 * @param image0 The image in which the first patch is located, must be valid
		 * @param image1 The image in which the second patch is located, must be valid
		 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
		 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
		 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
		 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
		 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [0, width0 - 1]
		 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [0, height0 - 1]
		 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [0, width1 - 1]
		 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [0, height1 - 1]
		 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
		 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
		 * @return The resulting zero-mean sum of square differences, with range [0, infinity)
		 * @tparam tChannels The number of frame channels, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);

		/**
		 * Determines the mean value for a buffer, one value for each channel.
		 * @param buffer The memory buffer to be handled, must be valid
		 * @param meanValues The resulting mean values, one for each channel
		 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
		 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
		 */
		template <unsigned int tChannels, unsigned int tPixels>
		static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);

		/**
		 * Determines the mean value for an image patch, one value for each channel.
		 * @param patch The top left start position of the image patch, must be valid
		 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @param meanValues The resulting mean values, one for each channel
		 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);

	protected:

		/**
		 * Loads up to 8 uint8_t values from a 1-channel row with mirroring pixels if necessary.
		 * @param row The row from which the values will be loaded, must be valid
		 * @param x The index of the first pixel to load, with range [-elements/2, elements + elements/2]
		 * @param width The width of the row, in pixels, with range [4, infinity)
		 * @param intermediateBuffer An intermediate buffer with 8 elements, must be valid
		 * @return The uint8x8_t object with the loaded values
		 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to place the uint8_t values at the end
		 * @tparam tPixels The number of uint8_t pixels to be read, with range [1, 8]
		 * @tparam tOverlappingToZero True, to set overlapping pixels to zero; False, to get overlapping pixels with random values
		 */
		template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
		static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer);

		/**
		 * Loads up to 16 uint8_t values from a 1-channel row with mirroring pixels if necessary.
		 * @param row The row from which the values will be loaded, must be valid
		 * @param x The index of the first pixel to load, with range [-elements/2, elements + elements/2]
		 * @param width The width of the row in pixels, with range [8, infinity)
		 * @param intermediateBuffer An intermediate buffer with 16 elements, must be valid
		 * @return The uint8x16_t object with the loaded values
		 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x16_t object; False, to place the uint8_t values at the end
		 * @tparam tPixels The number of uint8_t pixels to be read, with range [1, 16]
		 * @tparam tOverlappingToZero True, to set overlapping pixels to zero; False, to get overlapping pixels with random values
		 */
		template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
		static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer);
};
235 
template <>
template <unsigned int tPixels>
inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
{
	static_assert(tPixels >= 8u, "Invalid pixels!");

	ocean_assert(buffer != nullptr && meanValues != nullptr);

	// the buffer is consumed in chunks, all sizes known at compile time:
	// full 16-pixel blocks, an optional partial (overlapping) 16-pixel block,
	// full 8-pixel blocks, an optional partial 8-pixel block, and at most two individual pixels

	constexpr unsigned int blocks16 = tPixels / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u && tPixels >= 16u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u && tPixels >= 8u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);

	uint32_t sumIndividual = 0u;

	for (unsigned int n = 0u; n < blocks16; ++n)
	{
		const uint8x16_t buffer_u_8x16 = vld1q_u8(buffer);

		// pairwise widen 8-bit -> 16-bit, then pairwise accumulate into the 32-bit sums
		sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));

		buffer += 16;
	}

	if constexpr (partialBlock16)
	{
		static_assert(tPixels >= 16u, "We need to guarantee that loading 16 pixels of worth of data preceding the end boundary cannot cause memory access violation");

		constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
		ocean_assert(overlappingElements < 8u);

		// load the last 16 pixels of the buffer (overlapping already processed pixels) and zero the overlap via mask
		// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
		//        00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
		const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));

		const uint8x16_t buffer_u_8x16 = vandq_u8(vld1q_u8(buffer - overlappingElements), mask_u_8x16);

		sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));

		buffer += remainingAfterBlocks16;
	}

	for (unsigned int n = 0u; n < blocks8; ++n)
	{
		const uint8x8_t buffer_u_8x8 = vld1_u8(buffer);

		sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));

		buffer += 8;
	}

	if constexpr (partialBlock8)
	{
		constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
		ocean_assert(overlappingElements < 8u);

		// overlapping 8-pixel load with the already processed pixels masked to zero
		const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);

		const uint8x8_t buffer_u_8x8 = vand_u8(vld1_u8(buffer - overlappingElements), mask_u_8x8);

		sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));

		buffer += remainingAfterBlocks8;
	}

	if constexpr (blocks1 != 0u)
	{
		// at most two trailing pixels are summed individually
		for (unsigned int n = 0u; n < blocks1; ++n)
		{
			sumIndividual += buffer[n];
		}

		buffer += blocks1;
	}

	// horizontal reduction of the four partial sums, then rounded division by the pixel count
	uint32_t results[4];
	vst1q_u32(results, sum_u_32x4);

	const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;

	meanValues[0] = uint8_t((sum + tPixels / 2u) / tPixels);
}
331 
template <>
template <unsigned int tPixels>
inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
{
	static_assert(tPixels >= 8u, "Invalid pixels!");

	constexpr unsigned int tChannels = 3u;

	ocean_assert(buffer != nullptr && meanValues != nullptr);

	// the buffer is consumed in chunks, all sizes known at compile time:
	// full 16-pixel blocks, an optional partial (overlapping) 16-pixel block,
	// full 8-pixel blocks, an optional partial 8-pixel block, and at most two individual pixels

	constexpr unsigned int blocks16 = tPixels / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u && blocks16 >= 1u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);

	uint32_t sumIndividual[3] = {0u};

	for (unsigned int n = 0u; n < blocks16; ++n)
	{
		// de-interleaving load: val[0]/val[1]/val[2] hold 16 pixels of channel 0/1/2
		const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer);

		sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[0]));
		sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[1]));
		sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[2]));

		buffer += 16u * tChannels;
	}

	if constexpr (partialBlock16)
	{
		static_assert(tPixels >= 16u, "We need to guarantee that loading 16 pixels of worth of data preceding the end boundary cannot cause memory access violation");

		constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
		ocean_assert(overlappingElements < 8u);

		// load the last 16 pixels (overlapping already processed pixels) and zero the overlap via mask
		// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
		//        00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
		const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));

		const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer - overlappingElements * tChannels);

		sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[0], mask_u_8x16)));
		sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[1], mask_u_8x16)));
		sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[2], mask_u_8x16)));

		buffer += remainingAfterBlocks16 * tChannels;
	}

	for (unsigned int n = 0u; n < blocks8; ++n)
	{
		const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer);

		sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[0]));
		sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[1]));
		sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[2]));

		buffer += 8u * tChannels;
	}

	if constexpr (partialBlock8)
	{
		constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
		ocean_assert(overlappingElements < 8u);

		// overlapping 8-pixel load with the already processed pixels masked to zero
		const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);

		const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer - overlappingElements * tChannels);

		sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[0], mask_u_8x8)));
		sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[1], mask_u_8x8)));
		sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[2], mask_u_8x8)));

		buffer += remainingAfterBlocks8 * tChannels;
	}

	// at most two trailing pixels are summed individually, per channel
	for (unsigned int n = 0u; n < blocks1; ++n)
	{
		sumIndividual[0] += buffer[tChannels * n + 0u];
		sumIndividual[1] += buffer[tChannels * n + 1u];
		sumIndividual[2] += buffer[tChannels * n + 2u];
	}

	// horizontal reduction per channel, then rounded division by the pixel count
	uint32_t results[4];
	vst1q_u32(results, sumChannel0_u_32x4);

	const uint32_t sum0 = results[0] + results[1] + results[2] + results[3] + sumIndividual[0];
	meanValues[0] = uint8_t((sum0 + tPixels / 2u) / tPixels);

	vst1q_u32(results, sumChannel1_u_32x4);

	const uint32_t sum1 = results[0] + results[1] + results[2] + results[3] + sumIndividual[1];
	meanValues[1] = uint8_t((sum1 + tPixels / 2u) / tPixels);

	vst1q_u32(results, sumChannel2_u_32x4);

	const uint32_t sum2 = results[0] + results[1] + results[2] + results[3] + sumIndividual[2];
	meanValues[2] = uint8_t((sum2 + tPixels / 2u) / tPixels);
}
445 
446 template <unsigned int tChannels>
447 template <unsigned int tPixels>
448 inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
449 {
450  static_assert(tChannels >= 1u, "Invalid channel number!");
451  static_assert(tPixels >= 1u, "Invalid buffer size!");
452 
453  ocean_assert(buffer != nullptr && meanValues != nullptr);
454 
455  uint32_t sum[tChannels] = {0u};
456 
457  for (unsigned int n = 0u; n < tPixels; ++n)
458  {
459  for (unsigned int c = 0u; c < tChannels; ++c)
460  {
461  sum[c] += buffer[n * tChannels + c];
462  }
463  }
464 
465  for (unsigned int c = 0u; c < tChannels; ++c)
466  {
467  meanValues[c] = uint8_t((sum[c] + tPixels / 2u) / tPixels);
468  }
469 }
470 
template <>
template <unsigned int tPatchSize>
inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
{
	static_assert(tPatchSize >= 5u, "Invalid patch size!");

	ocean_assert(patch != nullptr && meanValues != nullptr);

	ocean_assert(patchStrideElements >= tPatchSize);

	// each patch row is consumed in chunks, all sizes known at compile time:
	// full 16-pixel blocks, an optional partial (overlapping) 16-pixel block,
	// full 8-pixel blocks, an optional partial 8-pixel block, and at most two individual pixels

	constexpr unsigned int blocks16 = tPatchSize / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);

	uint32_t sumIndividual = 0u;

	for (unsigned int y = 0u; y < tPatchSize; ++y)
	{
		for (unsigned int n = 0u; n < blocks16; ++n)
		{
			const uint8x16_t patch_u_8x16 = vld1q_u8(patch);

			// pairwise widen 8-bit -> 16-bit, then pairwise accumulate into the 32-bit sums
			sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));

			patch += 16;
		}

		if constexpr (partialBlock16)
		{
			constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
			ocean_assert(overlappingElements < 8u);

			if (y < tPatchSize - 1u)
			{
				// not the last row: read forward into the next row (safe) and zero the trailing overlap
				// mask: |<- remainingAfterBlocks16 ->|<- overlapping ->|
				//        FF FF FF FF FF FF FF FF FF FF 00 00 00 00 00 00
				constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
				const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));

				const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch), mask_u_8x16);

				sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
			}
			else
			{
				// last row: read backward into already processed pixels and zero the leading overlap
				// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
				//        00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
				constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
				const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));

				const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch - overlappingElements), mask_u_8x16);

				sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
			}

			patch += remainingAfterBlocks16;
		}

		for (unsigned int n = 0u; n < blocks8; ++n)
		{
			const uint8x8_t patch_u_8x8 = vld1_u8(patch);

			sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));

			patch += 8;
		}

		if constexpr (partialBlock8)
		{
			constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
			ocean_assert(overlappingElements < 8u);

			if (y < tPatchSize - 1u)
			{
				// not the last row: read forward, zero the trailing overlap
				constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
				const uint8x8_t mask_u_8x8 = vcreate_u8(mask);

				const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch), mask_u_8x8);

				sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
			}
			else
			{
				// last row: read backward, zero the leading overlap
				constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
				const uint8x8_t mask_u_8x8 = vcreate_u8(mask);

				const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch - overlappingElements), mask_u_8x8);

				sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
			}

			patch += remainingAfterBlocks8;
		}

		if constexpr (blocks1 != 0u)
		{
			// at most two trailing pixels per row are summed individually
			for (unsigned int n = 0u; n < blocks1; ++n)
			{
				sumIndividual += patch[n];
			}

			patch += blocks1;
		}

		// advance over the row padding to the start of the next patch row
		patch += patchStrideElements - tPatchSize;
	}

	// horizontal reduction of the four partial sums, then rounded division by the patch area
	uint32_t results[4];
	vst1q_u32(results, sum_u_32x4);

	const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;

	meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
}
599 
600 template <>
601 template <unsigned int tPatchSize>
602 inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
603 {
604  static_assert(tPatchSize >= 5u, "Invalid patch size!");
605 
606  constexpr unsigned int tChannels = 3u;
607 
608  ocean_assert(patch != nullptr && meanValues != nullptr);
609 
610  ocean_assert(patchStrideElements >= tChannels * tPatchSize);
611 
612  constexpr unsigned int blocks16 = tPatchSize / 16u;
613  constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
614 
615  constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
616  constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
617 
618  constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
619  constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
620 
621  constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
622  constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
623 
624  constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
625 
626  static_assert(blocks1 <= 2u, "Invalid block size!");
627 
628  uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
629  uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
630  uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
631 
632  uint32_t sumIndividual[3] = {0u};
633 
634  for (unsigned int y = 0u; y < tPatchSize; ++y)
635  {
636  for (unsigned int n = 0u; n < blocks16; ++n)
637  {
638  const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
639 
640  sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[0]));
641  sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[1]));
642  sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[2]));
643 
644  patch += 16u * tChannels;
645  }
646 
647  if constexpr (partialBlock16)
648  {
649  constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
650  ocean_assert(overlappingElements < 8u);
651 
652  if (y < tPatchSize - 1u)
653  {
654  // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
655  // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
656  constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
657  const uint8x16_t mask_u_8x16 = vcombine_u16(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
658 
659  const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
660 
661  sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
662  sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
663  sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
664  }
665  else
666  {
667  // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
668  // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
669  constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
670  const uint8x16_t mask_u_8x16 = vcombine_u16(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
671 
672  const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch - overlappingElements * tChannels);
673 
674  sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
675  sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
676  sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
677  }
678 
679  patch += remainingAfterBlocks16 * tChannels;
680  }
681 
682  for (unsigned int n = 0u; n < blocks8; ++n)
683  {
684  const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
685 
686  sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(patch_u_8x8x3.val[0]));
687  sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(patch_u_8x8x3.val[1]));
688  sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(patch_u_8x8x3.val[2]));
689 
690  patch += 8u * tChannels;
691  }
692 
693  if constexpr (partialBlock8)
694  {
695  constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
696  ocean_assert(overlappingElements < 8u);
697 
698  if (y < tPatchSize - 1u)
699  {
700  constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
701  const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
702 
703  const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
704 
705  sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
706  sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
707  sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
708  }
709  else
710  {
711  constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
712  const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
713 
714  const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch - overlappingElements * tChannels);
715 
716  sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
717  sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
718  sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
719  }
720 
721  patch += remainingAfterBlocks8 * tChannels;
722  }
723 
724  if constexpr (blocks1 != 0u)
725  {
726  for (unsigned int n = 0u; n < blocks1; ++n)
727  {
728  sumIndividual[0] += patch[tChannels * n + 0u];
729  sumIndividual[1] += patch[tChannels * n + 1u];
730  sumIndividual[2] += patch[tChannels * n + 2u];
731  }
732 
733  patch += blocks1 * tChannels;
734  }
735 
736  patch += patchStrideElements - tChannels * tPatchSize;
737  }
738 
739  uint32_t results[4];
740  vst1q_u32(results, sumChannel0_u_32x4);
741 
742  const uint32_t sum0 = results[0] + results[1] + results[2] + results[3] + sumIndividual[0];
743  meanValues[0] = uint8_t((sum0 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
744 
745  vst1q_u32(results, sumChannel1_u_32x4);
746 
747  const uint32_t sum1 = results[0] + results[1] + results[2] + results[3] + sumIndividual[1];
748  meanValues[1] = uint8_t((sum1 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
749 
750  vst1q_u32(results, sumChannel2_u_32x4);
751 
752  const uint32_t sum2 = results[0] + results[1] + results[2] + results[3] + sumIndividual[2];
753  meanValues[2] = uint8_t((sum2 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
754 }
755 
756 template <unsigned int tChannels>
757 template <unsigned int tPatchSize>
758 inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
759 {
760  static_assert(tChannels >= 1u, "Invalid channel number!");
761  static_assert(tPatchSize >= 1u, "Invalid patch size!");
762 
763  ocean_assert(patch != nullptr && meanValues != nullptr);
764 
765  ocean_assert(patchStrideElements >= tChannels * tPatchSize);
766 
767  uint32_t sum[tChannels] = {0u};
768 
769  for (unsigned int y = 0u; y < tPatchSize; ++y)
770  {
771  for (unsigned int x = 0u; x < tPatchSize; ++x)
772  {
773  for (unsigned int n = 0u; n < tChannels; ++n)
774  {
775  sum[n] += patch[x * tChannels + n];
776  }
777  }
778 
779  patch += patchStrideElements;
780  }
781 
782  for (unsigned int n = 0u; n < tChannels; ++n)
783  {
784  meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
785  }
786 }
787 
template <>
template <unsigned int tPatchSize>
inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues)
{
	// Computes the rounded mean value of a 1-channel tPatchSize x tPatchSize patch centered at
	// (centerX, centerY); patch pixels outside the image are mapped back via mirrored indices
	// (CVUtilities::mirrorIndex), so the patch may extend over the image border.

	static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");

	ocean_assert(image != nullptr && meanValues != nullptr);
	ocean_assert(centerX < width && centerY < height);

	constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;

	const unsigned int imageStrideElements = width + imagePaddingElements;

	// block decomposition of one patch row: 16-pixel blocks, then 8-pixel blocks, then single pixels
	constexpr unsigned int blocks16 = tPatchSize / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;

	// a partial 16-pixel block is only used when most lanes would be occupied (> 10 remaining pixels)
	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	// a partial 8-pixel block is only used for at least 3 remaining pixels
	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 7u, "Invalid block size!");

	// four 32-bit partial sums for the NEON-processed pixels
	uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);

	// scalar sum for the leftover (blocks1) pixels
	uint32_t sumIndividual = 0u;

	// scratch buffer for the loadMirrored_* helpers (presumably used to assemble
	// border-mirrored pixel runs before the vector load — defined elsewhere in this class)
	uint8_t intermediate[16];

	for (int y = int(centerY) - int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
	{
		// y may be negative or >= height; mirrorIndex folds it back into [0, height)
		const uint8_t* const mirroredRow = image + CVUtilities::mirrorIndex(y, height) * imageStrideElements;

		int x = int(centerX) - int(tPatchSize_2);

		for (unsigned int n = 0u; n < blocks16; ++n)
		{
			const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow, x, width, intermediate);

			// pairwise widen uint8 -> uint16, then accumulate pairwise into uint32 lanes
			sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));

			x += 16;
		}

		if constexpr (partialBlock16)
		{
			// the first template flag selects the overlap direction of the partial load;
			// the last patch row uses the alternate variant (see loadMirrored_u_8x16)
			if (y < int(centerY) + int(tPatchSize_2))
			{
				const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
			}
			else
			{
				const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
			}

			x += remainingAfterBlocks16;
		}

		for (unsigned int n = 0u; n < blocks8; ++n)
		{
			const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow, x, width, intermediate);

			sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));

			x += 8;
		}

		if constexpr (partialBlock8)
		{
			if (y < int(centerY) + int(tPatchSize_2))
			{
				const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
			}
			else
			{
				const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
			}

			x += remainingAfterBlocks8;
		}

		if constexpr (blocks1 != 0u)
		{
			// scalar tail: each remaining pixel is mirrored individually
			for (unsigned int n = 0u; n < blocks1; ++n)
			{
				const unsigned int index = CVUtilities::mirrorIndex(x, width);

				sumIndividual += mirroredRow[index];

				x++;
			}
		}
	}

	// horizontal reduction of the four partial sums
	uint32_t results[4];
	vst1q_u32(results, sum_u_32x4);

	const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;

	// rounded integer division by the number of patch pixels
	meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
}
903 
904 template <unsigned int tChannels>
905 template <unsigned int tPatchSize>
906 inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues)
907 {
908  static_assert(tChannels >= 1u, "Invalid channel number!");
909  static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
910 
911  ocean_assert(image != nullptr && meanValues != nullptr);
912  ocean_assert(centerX < width && centerY < height);
913 
914  constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
915 
916  const unsigned int imageStrideElements = width * tChannels + imagePaddingElements;
917 
918  uint32_t sum[tChannels] = {0u};
919 
920  for (int y = int(centerY) - int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
921  {
922  const uint8_t* const mirroredRow = image + CVUtilities::mirrorIndex(y, height) * imageStrideElements;
923 
924  for (int x = int(centerX) - int(tPatchSize_2); x <= int(centerX) + int(tPatchSize_2); ++x)
925  {
926  const uint8_t* const pixel = mirroredRow + CVUtilities::mirrorIndex(x, width) * tChannels;
927 
928  for (unsigned int c = 0u; c < tChannels; ++c)
929  {
930  sum[c] += pixel[c];
931  }
932  }
933  }
934 
935  for (unsigned int n = 0u; n < tChannels; ++n)
936  {
937  meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
938  }
939 }
940 
template <>
template <unsigned int tPixels>
inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
{
	// Returns the zero-mean sum of squared differences between two 1-channel buffers of tPixels pixels:
	// sum over all pixels of [(buffer0[i] - mean0) - (buffer1[i] - mean1)]^2.
	// The buffers are processed in 16-pixel NEON blocks, then 8-pixel blocks, then individual pixels;
	// partial blocks read backwards over already-processed pixels and mask those lanes to zero.

	static_assert(tPixels >= 8u, "Invalid pixels!");

	ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
	ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);

	constexpr unsigned int blocks16 = tPixels / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;

	// a partial 16-pixel block is only used when most lanes would be occupied (> 10 remaining pixels)
	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	// a partial 8-pixel block is only used for at least 3 remaining pixels
	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	// [(buffer0 - mean0) - (buffer1 - mean1)]^2
	// [buffer0 - buffer1 - mean0 + mean1]^2

	// the constant (mean0 - mean1) term, broadcast to all lanes
	const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));

	// two independent accumulators to shorten the dependency chain
	uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);

	// scalar accumulator for the leftover (blocks1) pixels
	uint32_t sumIndividual = 0u;

	for (unsigned int n = 0u; n < blocks16; ++n)
	{
		const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0);
		const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1);

		const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16))); // low 8 bytes: buffer0 - buffer1
		const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16))); // high 8 bytes: buffer0 - buffer1

		const uint16x8_t buffer_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
		const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));

		// multiply-accumulate: square each absolute difference and add into the 32-bit sums
		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));

		buffer0 += 16;
		buffer1 += 16;
	}

	if constexpr (partialBlock16)
	{
		// shift the load back by 'overlappingElements' so it stays inside the buffer,
		// then zero the uint16 lanes that duplicate already-processed pixels
		constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
		ocean_assert(overlappingElements < 8u);

		const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0 - overlappingElements);
		const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1 - overlappingElements);

		const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16))); // low 8 bytes: buffer0 - buffer1
		const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16))); // high 8 bytes: buffer0 - buffer1

		// each uint16 lane spans 2 bytes, hence the '* 2u * 8u' bit shifts;
		// maskLow clears the first min(overlapping, 4) lanes, maskHigh the next (overlapping - 4) lanes
		constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
		constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

		const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

		const uint16x8_t buffer_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
		const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));

		buffer0 += remainingAfterBlocks16;
		buffer1 += remainingAfterBlocks16;
	}

	for (unsigned int n = 0u; n < blocks8; ++n)
	{
		const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0);
		const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1);

		const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8)); // buffer0 - buffer1

		const uint16x8_t buffer_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));

		buffer0 += 8;
		buffer1 += 8;
	}

	if constexpr (partialBlock8)
	{
		// same back-shifted load + lane-masking scheme as the partial 16-pixel block above
		constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
		ocean_assert(overlappingElements < 8u);

		const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0 - overlappingElements);
		const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1 - overlappingElements);

		const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8)); // buffer0 - buffer1

		constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
		constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

		const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

		const uint16x8_t buffer_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));

		buffer0 += remainingAfterBlocks8;
		buffer1 += remainingAfterBlocks8;
	}

	if constexpr (blocks1 != 0u)
	{
		// scalar tail: at most two remaining pixels
		for (unsigned int n = 0u; n < blocks1; ++n)
		{
			sumIndividual += sqrDistance(int16_t(buffer0[n] - meanValues0[0]), int16_t(buffer1[n] - meanValues1[0]));
		}
	}

	// merge the two accumulators and reduce the four 32-bit lanes horizontally
	const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);

	uint32_t results[4];
	vst1q_u32(results, sum_u_32x4);

	return results[0] + results[1] + results[2] + results[3] + sumIndividual;
}
1081 
1082 template <>
1083 template <unsigned int tPixels>
1084 inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1085 {
1086  static_assert(tPixels >= 8u, "Invalid pixels!");
1087 
1088  constexpr unsigned int tChannels = 3u;
1089 
1090  ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1091  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1092 
1093  constexpr unsigned int blocks16 = tPixels / 16u;
1094  constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
1095 
1096  constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1097  constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1098 
1099  constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1100  constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1101 
1102  constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1103  constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1104 
1105  constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1106 
1107  static_assert(blocks1 <= 2u, "Invalid block size!");
1108 
1109  // [(buffer0 - mean0) - (buffer1 - mean1)]^2
1110  // [buffer0 - buffer1 - mean0 + mean1]^2
1111 
1112  const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1113  const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1114  const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
1115 
1116  uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1117  uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1118 
1119  uint32_t sumIndividual = 0u;
1120 
1121  for (unsigned int n = 0u; n < blocks16; ++n)
1122  {
1123  const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0);
1124  const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1);
1125 
1126  const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0]))); // low 8 bytes: buffer0 - buffer1
1127  const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0]))); // high 8 bytes: buffer0 - buffer1
1128 
1129  const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1130  const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1131 
1132  const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1133  const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1134 
1135 
1136  const uint16x8_t bufferChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1137  const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1138 
1139  const uint16x8_t bufferChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1140  const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1141 
1142  const uint16x8_t bufferChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1143  const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1144 
1145 
1146  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1147  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1148  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1149  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1150 
1151  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1152  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1153  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1154  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1155 
1156  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1157  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1158  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1159  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1160 
1161 
1162  buffer0 += 16u * tChannels;
1163  buffer1 += 16u * tChannels;
1164  }
1165 
1166  if constexpr (partialBlock16)
1167  {
1168  constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1169  ocean_assert(overlappingElements < 8u);
1170 
1171  const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0 - overlappingElements * tChannels);
1172  const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1 - overlappingElements * tChannels);
1173 
1174 
1175  const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0]))); // low 8 bytes: buffer0 - buffer1
1176  const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0]))); // high 8 bytes: buffer0 - buffer1
1177 
1178  const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1179  const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1180 
1181  const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1182  const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1183 
1184 
1185  constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1186  constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1187 
1188  const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1189 
1190 
1191  const uint16x8_t bufferChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1192  const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1193 
1194  const uint16x8_t bufferChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1195  const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1196 
1197  const uint16x8_t bufferChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1198  const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1199 
1200 
1201  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1202  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1203  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1204  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1205 
1206  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1207  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1208  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1209  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1210 
1211  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1212  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1213  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1214  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1215 
1216  buffer0 += remainingAfterBlocks16 * tChannels;
1217  buffer1 += remainingAfterBlocks16 * tChannels;
1218  }
1219 
1220  for (unsigned int n = 0u; n < blocks8; ++n)
1221  {
1222  const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0);
1223  const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1);
1224 
1225  const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0])); // buffer0 - buffer1
1226  const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1227  const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1228 
1229  const uint16x8_t bufferChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1230  const uint16x8_t bufferChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1231  const uint16x8_t bufferChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1232 
1233  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1234  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1235 
1236  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1237  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1238 
1239  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1240  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1241 
1242  buffer0 += 8u * tChannels;
1243  buffer1 += 8u * tChannels;
1244  }
1245 
1246  if constexpr (partialBlock8)
1247  {
1248  constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1249  ocean_assert(overlappingElements < 8u);
1250 
1251  const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0 - overlappingElements * tChannels);
1252  const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1 - overlappingElements * tChannels);
1253 
1254  const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0])); // buffer0 - buffer1
1255  const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1256  const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1257 
1258  constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1259  constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1260 
1261  const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1262 
1263  const uint16x8_t bufferChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1264  const uint16x8_t bufferChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1265  const uint16x8_t bufferChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1266 
1267  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1268  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1269 
1270  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1271  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1272 
1273  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1274  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1275 
1276  buffer0 += remainingAfterBlocks8 * tChannels;
1277  buffer1 += remainingAfterBlocks8 * tChannels;
1278  }
1279 
1280  if constexpr (blocks1 != 0u)
1281  {
1282  for (unsigned int n = 0u; n < blocks1; ++n)
1283  {
1284  for (unsigned int c = 0u; c < tChannels; ++c)
1285  {
1286  sumIndividual += sqrDistance(int16_t(buffer0[n * tChannels + c] - meanValues0[c]), int16_t(buffer1[n * tChannels + c] - meanValues1[c]));
1287  }
1288  }
1289 
1290  buffer0 += blocks1 * tChannels;
1291  buffer1 += blocks1 * tChannels;
1292  }
1293 
1294  const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1295 
1296  uint32_t results[4];
1297  vst1q_u32(results, sum_u_32x4);
1298 
1299  return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1300 }
1301 
1302 template <unsigned int tChannels>
1303 template <unsigned int tPixels>
1304 inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1305 {
1306  static_assert(tChannels >= 1u, "Invalid channel number!");
1307  static_assert(tPixels >= 1u, "Invalid pixels!");
1308 
1309  ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1310  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1311 
1312  uint32_t zmssd = 0u;
1313 
1314  for (unsigned int x = 0u; x < tPixels; ++x)
1315  {
1316  for (unsigned int c = 0u; c < tChannels; ++c)
1317  {
1318  zmssd += sqrDistance(buffer0[x * tChannels + c] - meanValues0[c], buffer1[x * tChannels + c] - meanValues1[c]);
1319  }
1320  }
1321 
1322  return zmssd;
1323 }
1324 
template <>
template <unsigned int tPatchSize>
inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
{
	static_assert(tPatchSize >= 5u, "Invalid patch size!");

	ocean_assert(patch0 != nullptr && patch1 != nullptr);
	ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);

	ocean_assert(patch0StrideElements >= tPatchSize);
	ocean_assert(patch1StrideElements >= tPatchSize);

	// Compile-time decomposition of each patch row into SIMD blocks:
	// as many full 16-pixel blocks as fit, then (depending on the remainder)
	// one masked/overlapped 16-pixel block, full 8-pixel blocks, one masked/overlapped
	// 8-pixel block, and at most two individually handled pixels
	constexpr unsigned int blocks16 = tPatchSize / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;

	// A partial 16-block is only worthwhile if more than 10 pixels remain (overlap stays below 8)
	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	// A partial 8-block is only worthwhile for at least 3 remaining pixels
	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	// [(patch0 - mean0) - (patch1 - mean1)]^2
	// [patch0 - patch1 - mean0 + mean1]^2

	// The constant (mean0 - mean1) is folded into one vector so the inner loops only need a subtract and an absolute difference
	const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));

	// Two independent 32-bit accumulators to reduce dependency chains; they are added at the end
	uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);

	// Accumulator for the up-to-two pixels handled without SIMD
	uint32_t sumIndividual = 0u;

	for (unsigned int y = 0u; y < tPatchSize; ++y)
	{
		for (unsigned int n = 0u; n < blocks16; ++n)
		{
			const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
			const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);

			const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
			const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

			const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
			const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

			// multiply-accumulate squares the absolute differences and widens to 32 bit
			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));

			patch0 += 16;
			patch1 += 16;
		}

		if constexpr (partialBlock16)
		{
			constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
			ocean_assert(overlappingElements < 8u);

			if (y < tPatchSize - 1u)
			{
				// Not the last row: read a full 16 pixels forward (the extra pixels belong to the
				// following stride region) and zero the surplus lanes via the mask below
				const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
				const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
				// 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF

				// masks are built per 16-bit lane (2 bytes per widened pixel), clearing the top 'overlappingElements' lanes
				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}
			else
			{
				// Last row: shift the load backwards so it cannot read past the end of the buffer;
				// the re-read (already accumulated) leading pixels are zeroed via the mask below
				const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0 - overlappingElements);
				const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1 - overlappingElements);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				// mirror image of the forward-read masks: the bottom 'overlappingElements' lanes are cleared
				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}

			patch0 += remainingAfterBlocks16;
			patch1 += remainingAfterBlocks16;
		}

		for (unsigned int n = 0u; n < blocks8; ++n)
		{
			const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
			const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);

			const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

			const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));

			patch0 += 8;
			patch1 += 8;
		}

		if constexpr (partialBlock8)
		{
			constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
			ocean_assert(overlappingElements < 8u);

			// same forward/backward masked-load strategy as the partial 16-block, applied to one 8-pixel load
			if (y < tPatchSize - 1u)
			{
				const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
				const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}
			else
			{
				const uint8x8_t patch0_u_8x8 = vld1_u8(patch0 - overlappingElements);
				const uint8x8_t patch1_u_8x8 = vld1_u8(patch1 - overlappingElements);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}

			patch0 += remainingAfterBlocks8;
			patch1 += remainingAfterBlocks8;
		}

		if constexpr (blocks1 != 0u)
		{
			// at most two leftover pixels, handled with scalar code
			for (unsigned int n = 0u; n < blocks1; ++n)
			{
				sumIndividual += sqrDistance(int16_t(patch0[n] - meanValues0[0]), int16_t(patch1[n] - meanValues1[0]));
			}

			patch0 += blocks1;
			patch1 += blocks1;
		}

		// advance both pointers to the start of the next patch row
		patch0 += patch0StrideElements - tPatchSize;
		patch1 += patch1StrideElements - tPatchSize;
	}

	// horizontal reduction: combine both vector accumulators and the scalar remainder
	const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);

	uint32_t results[4];
	vst1q_u32(results, sum_u_32x4);

	return results[0] + results[1] + results[2] + results[3] + sumIndividual;
}
1525 
1526 template <>
1527 template <unsigned int tPatchSize>
1528 inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1529 {
1530  static_assert(tPatchSize >= 5u, "Invalid patch size!");
1531 
1532  constexpr unsigned int tChannels = 3u;
1533 
1534  ocean_assert(patch0 != nullptr && patch1 != nullptr);
1535  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1536 
1537  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1538  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1539 
1540  constexpr unsigned int blocks16 = tPatchSize / 16u;
1541  constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1542 
1543  constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1544  constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1545 
1546  constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1547  constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1548 
1549  constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1550  constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1551 
1552  constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1553 
1554  static_assert(blocks1 <= 2u, "Invalid block size!");
1555 
1556  // [(patch0 - mean0) - (patch1 - mean1)]^2
1557  // [patch0 - patch1 - mean0 + mean1]^2
1558 
1559  const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1560  const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1561  const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
1562 
1563  uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1564  uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1565 
1566  uint32_t sumIndividual = 0u;
1567 
1568  for (unsigned int y = 0u; y < tPatchSize; ++y)
1569  {
1570  for (unsigned int n = 0u; n < blocks16; ++n)
1571  {
1572  const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1573  const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1574 
1575  const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1576  const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1577 
1578  const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1579  const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1580 
1581  const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1582  const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1583 
1584 
1585  const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1586  const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1587 
1588  const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1589  const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1590 
1591  const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1592  const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1593 
1594 
1595  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1596  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1597  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1598  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1599 
1600  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1601  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1602  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1603  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1604 
1605  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1606  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1607  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1608  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1609 
1610 
1611  patch0 += 16u * tChannels;
1612  patch1 += 16u * tChannels;
1613  }
1614 
1615  if constexpr (partialBlock16)
1616  {
1617  constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1618  ocean_assert(overlappingElements < 8u);
1619 
1620  if (y < tPatchSize - 1u)
1621  {
1622  const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1623  const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1624 
1625 
1626  const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1627  const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1628 
1629  const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1630  const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1631 
1632  const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1633  const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1634 
1635 
1636  // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
1637  // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
1638 
1639  constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1640  constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1641 
1642  const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1643 
1644 
1645  const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1646  const uint16x8_t patchChannel0_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1647 
1648  const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1649  const uint16x8_t patchChannel1_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1650 
1651  const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1652  const uint16x8_t patchChannel2_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1653 
1654 
1655  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1656  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1657  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1658  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1659 
1660  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1661  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1662  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1663  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1664 
1665  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1666  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1667  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1668  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1669  }
1670  else
1671  {
1672  const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0 - overlappingElements * tChannels);
1673  const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1 - overlappingElements * tChannels);
1674 
1675 
1676  const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1677  const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1678 
1679  const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1680  const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1681 
1682  const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1683  const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1684 
1685 
1686  constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1687  constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1688 
1689  const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1690 
1691 
1692  const uint16x8_t patchChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1693  const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1694 
1695  const uint16x8_t patchChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1696  const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1697 
1698  const uint16x8_t patchChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1699  const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1700 
1701 
1702  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1703  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1704  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1705  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1706 
1707  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1708  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1709  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1710  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1711 
1712  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1713  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1714  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1715  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1716  }
1717 
1718  patch0 += remainingAfterBlocks16 * tChannels;
1719  patch1 += remainingAfterBlocks16 * tChannels;
1720  }
1721 
1722  for (unsigned int n = 0u; n < blocks8; ++n)
1723  {
1724  const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1725  const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1726 
1727  const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1728  const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1729  const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1730 
1731  const uint16x8_t patchChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1732  const uint16x8_t patchChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1733  const uint16x8_t patchChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1734 
1735  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1736  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1737 
1738  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1739  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1740 
1741  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1742  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1743 
1744  patch0 += 8u * tChannels;
1745  patch1 += 8u * tChannels;
1746  }
1747 
1748  if constexpr (partialBlock8)
1749  {
1750  constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1751  ocean_assert(overlappingElements < 8u);
1752 
1753  if (y < tPatchSize - 1u)
1754  {
1755  const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1756  const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1757 
1758  const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1759  const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1760  const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1761 
1762  constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1763  constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1764 
1765  const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1766 
1767  const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1768  const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1769  const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1770 
1771  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1772  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1773 
1774  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1775  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1776 
1777  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1778  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1779  }
1780  else
1781  {
1782  const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0 - overlappingElements * tChannels);
1783  const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1 - overlappingElements * tChannels);
1784 
1785  const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1786  const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1787  const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1788 
1789  constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1790  constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1791 
1792  const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1793 
1794  const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1795  const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1796  const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1797 
1798  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1799  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1800 
1801  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1802  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1803 
1804  sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1805  sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1806  }
1807 
1808  patch0 += remainingAfterBlocks8 * tChannels;
1809  patch1 += remainingAfterBlocks8 * tChannels;
1810  }
1811 
1812  if constexpr (blocks1 != 0u)
1813  {
1814  for (unsigned int n = 0u; n < blocks1; ++n)
1815  {
1816  for (unsigned int c = 0u; c < tChannels; ++c)
1817  {
1818  sumIndividual += sqrDistance(int16_t(patch0[n * tChannels + c] - meanValues0[c]), int16_t(patch1[n * tChannels + c] - meanValues1[c]));
1819  }
1820  }
1821 
1822  patch0 += blocks1 * tChannels;
1823  patch1 += blocks1 * tChannels;
1824  }
1825 
1826  patch0 += patch0StrideElements - tPatchSize * tChannels;
1827  patch1 += patch1StrideElements - tPatchSize * tChannels;
1828  }
1829 
1830  const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1831 
1832  uint32_t results[4];
1833  vst1q_u32(results, sum_u_32x4);
1834 
1835  return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1836 }
1837 
1838 template <unsigned int tChannels>
1839 template <unsigned int tPatchSize>
1840 inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1841 {
1842  static_assert(tChannels >= 1u, "Invalid channel number!");
1843  static_assert(tPatchSize >= 1u, "Invalid patch size!");
1844 
1845  ocean_assert(patch0 != nullptr && patch1 != nullptr);
1846  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1847 
1848  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1849  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1850 
1851  uint32_t zmssd = 0u;
1852 
1853  for (unsigned int y = 0u; y < tPatchSize; ++y)
1854  {
1855  for (unsigned int x = 0u; x < tPatchSize; ++x)
1856  {
1857  for (unsigned int n = 0u; n < tChannels; ++n)
1858  {
1859  zmssd += sqrDistance(patch0[x * tChannels + n] - meanValues0[n], patch1[x * tChannels + n] - meanValues1[n]);
1860  }
1861  }
1862 
1863  patch0 += patch0StrideElements;
1864  patch1 += patch1StrideElements;
1865  }
1866 
1867  return zmssd;
1868 }
1869 
template <>
template <unsigned int tPatchSize>
inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
{
	// NEON specialization for 1-channel frames: zero-mean SSD between two patches
	// whose pixels outside the image are mirrored back into the image.

	static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
	static_assert(tPatchSize >= 5u, "Invalid patch size!");

	constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;

	ocean_assert(image0 != nullptr && image1 != nullptr);
	ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);

	ocean_assert(centerX0 < width0 && centerY0 < height0);
	ocean_assert(centerX1 < width1 && centerY1 < height1);

	// one channel, so one element per pixel
	const unsigned int image0StrideElements = width0 + image0PaddingElements;
	const unsigned int image1StrideElements = width1 + image1PaddingElements;

	// each patch row is decomposed into: full 16-pixel blocks, at most one partial 16-pixel
	// block (only when more than 10 pixels remain), full 8-pixel blocks, at most one partial
	// 8-pixel block (only when at least 3 pixels remain), and up to two individual pixels
	constexpr unsigned int blocks16 = tPatchSize / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	// [(patch0 - mean0) - (patch1 - mean1)]^2
	// [patch0 - patch1 - mean0 + mean1]^2

	// (mean0 - mean1) replicated across all eight 16-bit lanes
	const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));

	// two independent 32-bit accumulators, combined at the end
	uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);

	// scalar accumulator for the up to two individual pixels per row
	uint32_t sumIndividual = 0u;

	// scratch buffer used by loadMirrored_* whenever a load crosses the image border
	uint8_t intermediate[16];

	int y1 = int(centerY1) - int(tPatchSize_2);
	for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
	{
		// out-of-range row coordinates are mirrored back into the image
		const uint8_t* const mirroredRow0 = image0 + CVUtilities::mirrorIndex(y0, height0) * image0StrideElements;
		const uint8_t* const mirroredRow1 = image1 + CVUtilities::mirrorIndex(y1, height1) * image1StrideElements;

		int x0 = int(centerX0) - int(tPatchSize_2);
		int x1 = int(centerX1) - int(tPatchSize_2);

		for (unsigned int n = 0u; n < blocks16; ++n)
		{
			const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow0, x0, width0, intermediate);
			const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow1, x1, width1, intermediate);

			const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
			const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

			const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
			const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

			// multiply-accumulate: sum += diff * diff, widened to 32 bit
			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));

			x0 += 16;
			x1 += 16;
		}

		if constexpr (partialBlock16)
		{
			// the partial block still loads 16 pixels although only remainingAfterBlocks16 are
			// needed; the redundant (overlapping) lanes are zeroed so they do not contribute
			constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
			ocean_assert(overlappingElements < 8u);

			if (y0 < int(centerY0) + int(tPatchSize_2))
			{
				// not the last patch row: load forward and zero the trailing lanes
				const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
				// 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF

				// maskLow/maskHigh cover the low/high four 16-bit lanes of the upper register half;
				// since overlappingElements < 8 only the high half needs masking below
				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}
			else
			{
				// last patch row: the load is shifted backwards (leading lanes masked instead) —
				// presumably to avoid reading beyond the accessible image memory; confirm with loadMirrored_u_8x16
				const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				// mirrored mask: the leading lanes of the lower register half are zeroed
				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}

			x0 += remainingAfterBlocks16;
			x1 += remainingAfterBlocks16;
		}

		for (unsigned int n = 0u; n < blocks8; ++n)
		{
			const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow0, x0, width0, intermediate);
			const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow1, x1, width1, intermediate);

			const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

			const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));

			x0 += 8;
			x1 += 8;
		}

		if constexpr (partialBlock8)
		{
			// same over-read/masking strategy as the partial 16-pixel block, for an 8-lane register
			constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
			ocean_assert(overlappingElements < 8u);

			if (y0 < int(centerY0) + int(tPatchSize_2))
			{
				// not the last patch row: load forward, zero the trailing lanes
				const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}
			else
			{
				// last patch row: back-shifted load, leading lanes masked
				const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}

			x0 += remainingAfterBlocks8;
			x1 += remainingAfterBlocks8;
		}

		if constexpr (blocks1 != 0u)
		{
			// up to two remaining pixels are handled with scalar code
			for (unsigned int n = 0u; n < blocks1; ++n)
			{
				const unsigned int index0 = CVUtilities::mirrorIndex(x0 + int(n), width0);
				const unsigned int index1 = CVUtilities::mirrorIndex(x1 + int(n), width1);

				sumIndividual += sqrDistance(int16_t(mirroredRow0[index0] - meanValues0[0]), int16_t(mirroredRow1[index1] - meanValues1[0]));
			}
		}

		++y1;
	}

	// horizontal reduction of both vector accumulators plus the scalar remainder
	const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);

	uint32_t results[4];
	vst1q_u32(results, sum_u_32x4);

	return results[0] + results[1] + results[2] + results[3] + sumIndividual;
}
2084 
2085 template <unsigned int tChannels>
2086 template <unsigned int tPatchSize>
2087 inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
2088 {
2089  static_assert(tChannels >= 1u, "Invalid channel number!");
2090  static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
2091 
2092  constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
2093 
2094  ocean_assert(image0 != nullptr && image1 != nullptr);
2095  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
2096 
2097  ocean_assert(centerX0 < width0 && centerY0 < height0);
2098  ocean_assert(centerX1 < width1 && centerY1 < height1);
2099 
2100  const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
2101  const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
2102 
2103  uint32_t zmssd = 0u;
2104 
2105  int y1 = int(centerY1) - int(tPatchSize_2);
2106  for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
2107  {
2108  const uint8_t* const mirroredRow0 = image0 + CVUtilities::mirrorIndex(y0, height0) * image0StrideElements;
2109  const uint8_t* const mirroredRow1 = image1 + CVUtilities::mirrorIndex(y1, height1) * image1StrideElements;
2110 
2111  int x1 = int(centerX1) - int(tPatchSize_2);
2112  for (int x0 = int(centerX0) - int(tPatchSize_2); x0 <= int(centerX0) + int(tPatchSize_2); ++x0)
2113  {
2114  const uint8_t* const pixel0 = mirroredRow0 + CVUtilities::mirrorIndex(x0, width0) * tChannels;
2115  const uint8_t* const pixel1 = mirroredRow1 + CVUtilities::mirrorIndex(x1, width1) * tChannels;
2116 
2117  for (unsigned int c = 0u; c < tChannels; ++c)
2118  {
2119  zmssd += sqrDistance(pixel0[c] - meanValues0[c], pixel1[c] - meanValues1[c]);
2120  }
2121 
2122  ++x1;
2123  }
2124 
2125  ++y1;
2126  }
2127 
2128  return zmssd;
2129 }
2130 
2131 template <unsigned int tChannels, unsigned int tPixels>
2132 inline uint32_t ZeroMeanSumSquareDifferencesNEON::buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1)
2133 {
2134  static_assert(tChannels >= 1u, "Invalid channel number!");
2135  static_assert(tPixels >= 8u, "Invalid pixels!");
2136 
2137  ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
2138 
2139  uint8_t meanValues0[tChannels];
2140  mean8BitPerChannel<tChannels, tPixels>(buffer0, meanValues0);
2141 
2142  uint8_t meanValues1[tChannels];
2143  mean8BitPerChannel<tChannels, tPixels>(buffer1, meanValues1);
2144 
2145  return SpecializedForChannels<tChannels>::template buffer8BitPerChannel<tPixels>(buffer0, buffer1, meanValues0, meanValues1);
2146 }
2147 
2148 template <unsigned int tChannels, unsigned int tPatchSize>
2149 inline uint32_t ZeroMeanSumSquareDifferencesNEON::patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
2150 {
2151  static_assert(tChannels >= 1u, "Invalid channel number!");
2152  static_assert(tPatchSize >= 5u, "Invalid patch size!");
2153 
2154  ocean_assert(patch0 != nullptr && patch1 != nullptr);
2155 
2156  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
2157  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
2158 
2159  uint8_t meanValues0[tChannels];
2160  mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2161 
2162  uint8_t meanValues1[tChannels];
2163  mean8BitPerChannel<tChannels, tPatchSize>(patch1, patch1StrideElements, meanValues1);
2164 
2165  return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, patch1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
2166 }
2167 
2168 template <unsigned int tChannels, unsigned int tPatchSize>
2169 inline uint32_t ZeroMeanSumSquareDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
2170 {
2171  static_assert(tChannels >= 1u, "Invalid channel number!");
2172  static_assert(tPatchSize >= 5u, "Invalid patch size!");
2173 
2174  ocean_assert(patch0 != nullptr && buffer1 != nullptr);
2175 
2176  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
2177 
2178  uint8_t meanValues0[tChannels];
2179  mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2180 
2181  uint8_t meanValues1[tChannels];
2182  mean8BitPerChannel<tChannels, tPatchSize * tPatchSize>(buffer1, meanValues1);
2183 
2184  constexpr unsigned int patch1StrideElements = tChannels * tPatchSize;
2185 
2186  return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, buffer1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
2187 }
2188 
2189 template <unsigned int tChannels, unsigned int tPatchSize>
2190 uint32_t ZeroMeanSumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
2191 {
2192  static_assert(tChannels >= 1u, "Invalid channel number!");
2193  static_assert(tPatchSize >= 5u, "Invalid patch size!");
2194 
2195  ocean_assert(image0 != nullptr && image1 != nullptr);
2196 
2197  uint8_t meanValues0[tChannels];
2198  SpecializedForChannels<tChannels>::template mean8BitPerChannelMirroredBorder<tPatchSize>(image0, width0, height0, centerX0, centerY0, image0PaddingElements, meanValues0);
2199 
2200  uint8_t meanValues1[tChannels];
2201  SpecializedForChannels<tChannels>::template mean8BitPerChannelMirroredBorder<tPatchSize>(image1, width1, height1, centerX1, centerY1, image1PaddingElements, meanValues1);
2202 
2203  return SpecializedForChannels<tChannels>::template patchMirroredBorder8BitPerChannel<tPatchSize>(image0, image1, width0, height0, width1, height1, centerX0, centerY0, centerX1, centerY1, image0PaddingElements, image1PaddingElements, meanValues0, meanValues1);
2204 }
2205 
2206 template <unsigned int tChannels, unsigned int tPixels>
2207 OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesNEON::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
2208 {
2209  static_assert(tChannels >= 1u, "Invalid channel number!");
2210  static_assert(tPixels >= 8u, "Invalid patch size!");
2211 
2212  SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPixels>(buffer, meanValues);
2213 }
2214 
2215 template <unsigned int tChannels, unsigned int tPatchSize>
2216 OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesNEON::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
2217 {
2218  static_assert(tChannels >= 1u, "Invalid channel number!");
2219  static_assert(tPatchSize >= 5u, "Invalid patch size!");
2220 
2221  SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPatchSize>(patch, patchStrideElements, meanValues);
2222 }
2223 
template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
OCEAN_FORCE_INLINE uint8x8_t ZeroMeanSumSquareDifferencesNEON::loadMirrored_u_8x8(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer)
{
	// Loads up to 8 uint8_t values starting at (possibly negative/out-of-range) x,
	// mirroring out-of-range indices back into the row when necessary.

	// tPixels is a compile-time constant, so enforce its range at compile time
	// (the previous runtime ocean_assert was a no-op in release builds)
	static_assert(tPixels >= 1u && tPixels <= 8u, "Invalid pixel number!");

	ocean_assert(row != nullptr && intermediateBuffer != nullptr);

	// number of register lanes which do not carry requested pixels
	constexpr unsigned int tOverlappingElements = 8u - tPixels;

	if (x >= 0 && x <= int(width) - int(tPixels))
	{
		// fast path: all requested pixels lie inside the row, no mirroring necessary

		if constexpr (tPixels == 8u)
		{
			return vld1_u8(row + x);
		}
		else
		{
			if constexpr (tFront)
			{
				// the requested pixels occupy the front (low) lanes of the register

				if constexpr (tOverlappingToZero)
				{
					// zero the trailing lanes which hold pixels beyond the requested range
					constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
					const uint8x8_t mask_u_8x8 = vcreate_u8(mask);

					return vand_u8(vld1_u8(row + x), mask_u_8x8);
				}
				else
				{
					return vld1_u8(row + x);
				}
			}
			else
			{
				// the requested pixels occupy the back (high) lanes, the load is shifted backwards

				if constexpr (tOverlappingToZero)
				{
					// zero the leading lanes which hold pixels before the requested range
					constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
					const uint8x8_t mask_u_8x8 = vcreate_u8(mask);

					return vand_u8(vld1_u8(row + x - int(tOverlappingElements)), mask_u_8x8);
				}
				else
				{
					return vld1_u8(row + x - int(tOverlappingElements));
				}
			}
		}
	}

	// slow path: at least one pixel lies outside the row, gather the pixels
	// individually with mirrored indices via the intermediate buffer

	if constexpr (tFront)
	{
		for (unsigned int n = 0u; n < tPixels; ++n)
		{
			const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
			ocean_assert(mirroredIndex < width);

			intermediateBuffer[n] = row[mirroredIndex];
		}

		if constexpr (tOverlappingToZero)
		{
			for (unsigned int n = tPixels; n < 8u; ++n)
			{
				intermediateBuffer[n] = 0u;
			}
		}
	}
	else
	{
		if constexpr (tOverlappingToZero)
		{
			for (unsigned int n = 0u; n < tOverlappingElements; ++n)
			{
				intermediateBuffer[n] = 0u;
			}
		}

		for (unsigned int n = 0u; n < tPixels; ++n)
		{
			const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
			ocean_assert(mirroredIndex < width);

			intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
		}
	}

	return vld1_u8(intermediateBuffer);
}
2311 
template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
OCEAN_FORCE_INLINE uint8x16_t ZeroMeanSumSquareDifferencesNEON::loadMirrored_u_8x16(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer)
{
	// Loads up to 16 uint8_t values starting at (possibly negative/out-of-range) x,
	// mirroring out-of-range indices back into the row when necessary.

	// tPixels is a compile-time constant, so enforce its range at compile time
	// (the previous runtime ocean_assert was a no-op in release builds)
	static_assert(tPixels > 8u && tPixels <= 16u, "Invalid pixel number!");

	ocean_assert(row != nullptr && intermediateBuffer != nullptr);

	// number of register lanes which do not carry requested pixels;
	// tPixels > 8 guarantees tOverlappingElements < 8, so masking only affects one register half
	constexpr unsigned int tOverlappingElements = 16u - tPixels;

	if (x >= 0 && x <= int(width) - int(tPixels))
	{
		// fast path: all requested pixels lie inside the row, no mirroring necessary

		if constexpr (tPixels == 16u)
		{
			return vld1q_u8(row + x);
		}
		else
		{
			if constexpr (tFront)
			{
				// the requested pixels occupy the front (low) lanes of the register

				if constexpr (tOverlappingToZero)
				{
					// zero the trailing lanes (always within the upper half) which hold pixels beyond the requested range
					constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
					const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));

					return vandq_u8(vld1q_u8(row + x), mask_u_8x16);
				}
				else
				{
					return vld1q_u8(row + x);
				}
			}
			else
			{
				// the requested pixels occupy the back (high) lanes, the load is shifted backwards

				if constexpr (tOverlappingToZero)
				{
					// zero the leading lanes (always within the lower half) which hold pixels before the requested range
					constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
					const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));

					return vandq_u8(vld1q_u8(row + x - int(tOverlappingElements)), mask_u_8x16);
				}
				else
				{
					return vld1q_u8(row + x - int(tOverlappingElements));
				}
			}
		}
	}

	// slow path: at least one pixel lies outside the row, gather the pixels
	// individually with mirrored indices via the intermediate buffer

	if constexpr (tFront)
	{
		for (unsigned int n = 0u; n < tPixels; ++n)
		{
			const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
			ocean_assert(mirroredIndex < width);

			intermediateBuffer[n] = row[mirroredIndex];
		}

		if constexpr (tOverlappingToZero)
		{
			for (unsigned int n = tPixels; n < 16u; ++n)
			{
				intermediateBuffer[n] = 0u;
			}
		}
	}
	else
	{
		if constexpr (tOverlappingToZero)
		{
			for (unsigned int n = 0u; n < tOverlappingElements; ++n)
			{
				intermediateBuffer[n] = 0u;
			}
		}

		for (unsigned int n = 0u; n < tPixels; ++n)
		{
			const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
			ocean_assert(mirroredIndex < width);

			intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
		}
	}

	return vld1q_u8(intermediateBuffer);
}
2399 
2400 }
2401 
2402 }
2403 
2404 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
2405 
2406 #endif // META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int index, const unsigned int elements)
Returns the mirrored index for a given index.
Definition: CVUtilities.h:456
This class allows to specialize functions for individual channels.
Definition: ZeroMeanSumSquareDifferencesNEON.h:39
static void mean8BitPerChannelMirroredBorder(const uint8_t *const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t *const meanValues)
Determines the mean value for an image patch, one value for each channel, patch pixels outside the im...
Definition: ZeroMeanSumSquareDifferencesNEON.h:906
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesNEON.h:1840
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image,...
Definition: ZeroMeanSumSquareDifferencesNEON.h:2087
static void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesNEON.h:448
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesNEON.h:1304
This class implements functions to calculate zero-mean sum square differences using NEON instructions.
Definition: ZeroMeanSumSquareDifferencesNEON.h:30
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the zero-mean sum of square differences between two patches within an image,...
Definition: ZeroMeanSumSquareDifferencesNEON.h:2190
static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2207
static uint32_t patch8BitPerChannel(const uint8_t *const patch0, const uint8_t *const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2149
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2132
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2225
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the zero-mean sum of square differences between an image patch and a buffer.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2169
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition: ZeroMeanSumSquareDifferencesNEON.h:2313
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15