Ocean
Loading...
Searching...
No Matches
ZeroMeanSumSquareDifferencesNEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
9#define META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
10
11#include "ocean/cv/CV.h"
12
14
15#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
16
17#include "ocean/cv/NEON.h"
18
19namespace Ocean
20{
21
22namespace CV
23{
24
25/**
26 * This class implements functions to calculate zero-mean sum square differences using NEON instructions.
27 * @ingroup cv
28 */
30{
31 protected:
32
33 /**
34 * This class allows to specialize functions for individual channels.
35 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
36 */
37 template <unsigned int tChannels>
39 {
40 public:
41
42 /**
43 * Determines the mean value for a buffer, one value for each channel.
44 * @param buffer The memory buffer to be handled, must be valid
45 * @param meanValues The resulting mean values, one for each channel
46 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
47 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
48 */
49 template <unsigned int tPixels>
50 static inline void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
51
52 /**
53 * Determines the mean value for an image patch, one value for each channel.
54 * @param patch The top left start position of the image patch, must be valid
55 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels, tPatchSize, infinity)
56 * @param meanValues The resulting mean values, one for each channel
57 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity)
58 */
59 template <unsigned int tPatchSize>
60 static inline void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
61
62 /**
63 * Determines the mean value for an image patch, one value for each channel, patch pixels outside the image will be mirrored back into the image.
64 * @param image The image in which the patch is located, must be valid
65 * @param width The width of the image, in pixels, with range [tPatchSize, infinity)
66 * @param height The height of the image, in pixels, with range [tPatchSize, infinity)
67 * @param centerX Horizontal center position of the (tPatchSize x tPatchSize) block in the frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
68 * @param centerY Vertical center position of the (tPatchSize x tPatchSize) block in the frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
69 * @param imagePaddingElements The number of padding elements at the end of each row of the image, in elements, with range [0, infinity)
70 * @param meanValues The resulting mean values, one for each channel
71 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
72 */
73 template <unsigned int tPatchSize>
74 static inline void mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues);
75
76 /**
77 * Returns the zero-mean sum of square differences between two memory buffers.
78 * @param buffer0 The first memory buffer, must be valid
79 * @param buffer1 The second memory buffer, must be valid
80 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
81 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
82 * @return The resulting sum of square differences
83 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
84 */
85 template <unsigned int tPixels>
86 static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
87
88 /**
89 * Returns the zero-mean sum of square differences between two patches within an image.
90 * @param patch0 The top left start position of the first image patch, must be valid
91 * @param patch1 The top left start position of the second image patch, must be valid
92 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels, tPatchSize, infinity)
93 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels, tPatchSize, infinity)
94 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
95 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
96 * @return The resulting sum of square differences
97 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
98 */
99 template <unsigned int tPatchSize>
100 static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
101
102 /**
103 * Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
104 * @param image0 The image in which the first patch is located, must be valid
105 * @param image1 The image in which the second patch is located, must be valid
106 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
107 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
108 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
109 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
110 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
111 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
112 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
113 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
114 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
115 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
116 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
117 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
118 * @return The resulting zero-mean sum of square differences, with range [0, infinity)
119 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
120 */
121 template <unsigned int tPatchSize>
122 static inline uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
123 };
124
125 public:
126
127 /**
128 * Returns the zero-mean sum of square differences between two memory buffers.
129 * @param buffer0 The first memory buffer, must be valid
130 * @param buffer1 The second memory buffer, must be valid
131 * @return The resulting sum of square differences
132 * @tparam tChannels Specifies the number of channels for the given buffers, with range [1, infinity)
133 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
134 */
135 template <unsigned int tChannels, unsigned int tPixels>
136 static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1);
137
138 /**
139 * Returns the zero-mean sum of square differences between two patches within an image.
140 * @param patch0 The top left start position of the first image patch, must be valid
141 * @param patch1 The top left start position of the second image patch, must be valid
142 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels, tPatchSize, infinity)
143 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels, tPatchSize, infinity)
144 * @return The resulting sum of square differences
145 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
146 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
147 */
148 template <unsigned int tChannels, unsigned int tPatchSize>
149 static inline uint32_t patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
150
151 /**
152 * Returns the zero-mean sum of square differences between an image patch and a buffer.
153 * @param patch0 The top left start position of the image patch, must be valid
154 * @param buffer1 The memory buffer, must be valid
155 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels, tPatchSize, infinity)
156 * @return The resulting sum of square differences
157 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
158 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
159 */
160 template <unsigned int tChannels, unsigned int tPatchSize>
161 static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
162
163 /**
164 * Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
165 * @param image0 The image in which the first patch is located, must be valid
166 * @param image1 The image in which the second patch is located, must be valid
167 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
168 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
169 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
170 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
171 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
172 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
173 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
174 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
175 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
176 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
177 * @return The resulting zero-mean sum of square differences, with range [0, infinity)
178 * @tparam tChannels The number of frame channels, with range [1, infinity)
179 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
180 */
181 template <unsigned int tChannels, unsigned int tPatchSize>
182 static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
183
184 /**
185 * Determines the mean value for a buffer, one value for each channel.
186 * @param buffer The memory buffer to be handled, must be valid
187 * @param meanValues The resulting mean values, one for each channel
188 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
189 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
190 */
191 template <unsigned int tChannels, unsigned int tPixels>
192 static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
193
194 /**
195 * Determines the mean value for an image patch, one value for each channel.
196 * @param patch The top left start position of the image patch, must be valid
197 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels, tPatchSize, infinity)
198 * @param meanValues The resulting mean values, one for each channel
199 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
200 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
201 */
202 template <unsigned int tChannels, unsigned int tPatchSize>
203 static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
204
205 protected:
206
207 /**
208 * Loads up to 8 uint8_t values from a 1-channel row with mirroring pixels if necessary.
209 * @param row The row from which the values will be loaded, must be valid
210 * @param x The index of the first pixel to load, with range [-elements/2, elements + elements/2]
211 * @param width The width of the row, in pixels, with range [4, infinity)
212 * @param intermediateBuffer An intermediate buffer with 8 elements, must be valid
213 * @return The uint8x8_t object with the loaded values
214 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
215 * @tparam tPixels The number of uint8_t pixels to be read, with range [1, 8]
216 * @tparam tOverlappingToZero True, to set overlapping pixels to zero; False, to get overlapping pixels with random values
217 */
218 template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
219 static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer);
220
221 /**
222 * Loads up to 16 uint8_t values from a 1-channel row with mirroring pixels if necessary.
223 * @param row The row from which the values will be loaded, must be valid
224 * @param x The index of the first pixel to load, with range [-elements/2, elements + elements/2]
225 * @param width The width of the row in pixels, with range [8, infinity)
226 * @param intermediateBuffer An intermediate buffer with 16 elements, must be valid
227 * @return The uint8x16_t object with the loaded values
228 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to placed the uint8_t values at the end
229 * @tparam tSize The number of uint8_t pixels to be read, with range [1, 16]
230 * @tparam tOverlappingToZero True, to set overlapping pixels to zero; False, to get overlapping pixels with random values
231 */
232 template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
233 static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer);
234};
235
template <>
template <unsigned int tPixels>
inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
{
	static_assert(tPixels >= 8u, "Invalid pixels!");

	ocean_assert(buffer != nullptr && meanValues != nullptr);

	// decompose the tPixels pixels at compile time into: full 16-pixel blocks, at most one masked
	// (overlapping) 16-pixel block, full 8-pixel blocks, at most one masked 8-pixel block, and up
	// to two leftover pixels which are summed individually

	constexpr unsigned int blocks16 = tPixels / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u && tPixels >= 16u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u && tPixels >= 8u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);

	uint32_t sumIndividual = 0u;

	for (unsigned int n = 0u; n < blocks16; ++n)
	{
		const uint8x16_t buffer_u_8x16 = vld1q_u8(buffer);

		// pairwise widen the 8-bit lanes to 16 bit, then accumulate into the four 32-bit partial sums
		sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));

		buffer += 16;
	}

	if constexpr (partialBlock16)
	{
		static_assert(tPixels >= 16u, "We need to guarantee that loading 16 pixels of worth of data preceding the end boundary cannot cause memory access violation");

		constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
		ocean_assert(overlappingElements < 8u);

		// the load is shifted backwards so it stays inside the buffer; the mask zeroes the
		// already-processed (overlapping) bytes at the front
		// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
		// 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
		const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));

		const uint8x16_t buffer_u_8x16 = vandq_u8(vld1q_u8(buffer - overlappingElements), mask_u_8x16);

		sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));

		buffer += remainingAfterBlocks16;
	}

	for (unsigned int n = 0u; n < blocks8; ++n)
	{
		const uint8x8_t buffer_u_8x8 = vld1_u8(buffer);

		sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));

		buffer += 8;
	}

	if constexpr (partialBlock8)
	{
		// same backwards-shifted load strategy as above, for an 8-byte register
		constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
		ocean_assert(overlappingElements < 8u);

		const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);

		const uint8x8_t buffer_u_8x8 = vand_u8(vld1_u8(buffer - overlappingElements), mask_u_8x8);

		sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));

		buffer += remainingAfterBlocks8;
	}

	if constexpr (blocks1 != 0u)
	{
		// at most two leftover pixels, handled without NEON
		for (unsigned int n = 0u; n < blocks1; ++n)
		{
			sumIndividual += buffer[n];
		}

		buffer += blocks1;
	}

	const uint32_t sum = NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;

	// rounded integer division by the number of pixels
	meanValues[0] = uint8_t((sum + tPixels / 2u) / tPixels);
}
328
template <>
template <unsigned int tPixels>
inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
{
	static_assert(tPixels >= 8u, "Invalid pixels!");

	constexpr unsigned int tChannels = 3u;

	ocean_assert(buffer != nullptr && meanValues != nullptr);

	// decompose the tPixels pixels at compile time into: full 16-pixel blocks, at most one masked
	// (overlapping) 16-pixel block, full 8-pixel blocks, at most one masked 8-pixel block, and up
	// to two leftover pixels which are summed individually

	constexpr unsigned int blocks16 = tPixels / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u && blocks16 >= 1u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	// one 4-lane 32-bit accumulator per color channel
	uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);

	uint32_t sumIndividual[3] = {0u};

	for (unsigned int n = 0u; n < blocks16; ++n)
	{
		// vld3q_u8 de-interleaves 16 RGB pixels into one 16-byte register per channel
		const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer);

		sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[0]));
		sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[1]));
		sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[2]));

		buffer += 16u * tChannels;
	}

	if constexpr (partialBlock16)
	{
		static_assert(tPixels >= 16u, "We need to guarantee that loading 16 pixels of worth of data preceding the end boundary cannot cause memory access violation");

		constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
		ocean_assert(overlappingElements < 8u);

		// the load is shifted backwards so it stays inside the buffer; the mask zeroes the
		// already-processed (overlapping) pixels at the front
		// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
		// 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
		const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));

		const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer - overlappingElements * tChannels);

		sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[0], mask_u_8x16)));
		sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[1], mask_u_8x16)));
		sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[2], mask_u_8x16)));

		buffer += remainingAfterBlocks16 * tChannels;
	}

	for (unsigned int n = 0u; n < blocks8; ++n)
	{
		const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer);

		sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[0]));
		sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[1]));
		sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[2]));

		buffer += 8u * tChannels;
	}

	if constexpr (partialBlock8)
	{
		// same backwards-shifted load strategy as above, for 8-byte registers
		constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
		ocean_assert(overlappingElements < 8u);

		const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);

		const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer - overlappingElements * tChannels);

		sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[0], mask_u_8x8)));
		sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[1], mask_u_8x8)));
		sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[2], mask_u_8x8)));

		buffer += remainingAfterBlocks8 * tChannels;
	}

	// at most two leftover pixels, handled without NEON
	for (unsigned int n = 0u; n < blocks1; ++n)
	{
		sumIndividual[0] += buffer[tChannels * n + 0u];
		sumIndividual[1] += buffer[tChannels * n + 1u];
		sumIndividual[2] += buffer[tChannels * n + 2u];
	}

	// rounded integer division by the number of pixels, per channel
	const uint32_t sum0 = NEON::sumHorizontal_u_32x4(sumChannel0_u_32x4) + sumIndividual[0];
	meanValues[0] = uint8_t((sum0 + tPixels / 2u) / tPixels);

	const uint32_t sum1 = NEON::sumHorizontal_u_32x4(sumChannel1_u_32x4) + sumIndividual[1];
	meanValues[1] = uint8_t((sum1 + tPixels / 2u) / tPixels);

	const uint32_t sum2 = NEON::sumHorizontal_u_32x4(sumChannel2_u_32x4) + sumIndividual[2];
	meanValues[2] = uint8_t((sum2 + tPixels / 2u) / tPixels);
}
435
436template <unsigned int tChannels>
437template <unsigned int tPixels>
438inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
439{
440 static_assert(tChannels >= 1u, "Invalid channel number!");
441 static_assert(tPixels >= 1u, "Invalid buffer size!");
442
443 ocean_assert(buffer != nullptr && meanValues != nullptr);
444
445 uint32_t sum[tChannels] = {0u};
446
447 for (unsigned int n = 0u; n < tPixels; ++n)
448 {
449 for (unsigned int c = 0u; c < tChannels; ++c)
450 {
451 sum[c] += buffer[n * tChannels + c];
452 }
453 }
454
455 for (unsigned int c = 0u; c < tChannels; ++c)
456 {
457 meanValues[c] = uint8_t((sum[c] + tPixels / 2u) / tPixels);
458 }
459}
460
461template <>
462template <unsigned int tPatchSize>
463inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
464{
465 static_assert(tPatchSize >= 5u, "Invalid patch size!");
466
467 ocean_assert(patch != nullptr && meanValues != nullptr);
468
469 ocean_assert(patchStrideElements >= tPatchSize);
470
471 constexpr unsigned int blocks16 = tPatchSize / 16u;
472 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
473
474 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
475 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
476
477 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
478 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
479
480 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
481 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
482
483 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
484
485 static_assert(blocks1 <= 2u, "Invalid block size!");
486
487 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
488
489 uint32_t sumIndividual = 0u;
490
491 for (unsigned int y = 0u; y < tPatchSize; ++y)
492 {
493 for (unsigned int n = 0u; n < blocks16; ++n)
494 {
495 const uint8x16_t patch_u_8x16 = vld1q_u8(patch);
496
497 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
498
499 patch += 16;
500 }
501
502 if constexpr (partialBlock16)
503 {
504 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
505 ocean_assert(overlappingElements < 8u);
506
507 if (y < tPatchSize - 1u)
508 {
509 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
510 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
511 constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
512 const uint8x16_t mask_u_8x16 = vcombine_u16(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
513
514 const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch), mask_u_8x16);
515
516 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
517 }
518 else
519 {
520 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
521 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
522 constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
523 const uint8x16_t mask_u_8x16 = vcombine_u16(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
524
525 const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch - overlappingElements), mask_u_8x16);
526
527 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
528 }
529
530 patch += remainingAfterBlocks16;
531 }
532
533 for (unsigned int n = 0u; n < blocks8; ++n)
534 {
535 const uint8x8_t patch_u_8x8 = vld1_u8(patch);
536
537 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
538
539 patch += 8;
540 }
541
542 if constexpr (partialBlock8)
543 {
544 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
545 ocean_assert(overlappingElements < 8u);
546
547 if (y < tPatchSize - 1u)
548 {
549 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
550 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
551
552 const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch), mask_u_8x8);
553
554 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
555 }
556 else
557 {
558 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
559 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
560
561 const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch - overlappingElements), mask_u_8x8);
562
563 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
564 }
565
566 patch += remainingAfterBlocks8;
567 }
568
569 if constexpr (blocks1 != 0u)
570 {
571 for (unsigned int n = 0u; n < blocks1; ++n)
572 {
573 sumIndividual += patch[n];
574 }
575
576 patch += blocks1;
577 }
578
579 patch += patchStrideElements - tPatchSize;
580 }
581
582 const uint32_t sum = NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
583
584 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
585}
586
587template <>
588template <unsigned int tPatchSize>
589inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
590{
591 static_assert(tPatchSize >= 5u, "Invalid patch size!");
592
593 constexpr unsigned int tChannels = 3u;
594
595 ocean_assert(patch != nullptr && meanValues != nullptr);
596
597 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
598
599 constexpr unsigned int blocks16 = tPatchSize / 16u;
600 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
601
602 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
603 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
604
605 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
606 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
607
608 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
609 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
610
611 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
612
613 static_assert(blocks1 <= 2u, "Invalid block size!");
614
615 uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
616 uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
617 uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
618
619 uint32_t sumIndividual[3] = {0u};
620
621 for (unsigned int y = 0u; y < tPatchSize; ++y)
622 {
623 for (unsigned int n = 0u; n < blocks16; ++n)
624 {
625 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
626
627 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[0]));
628 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[1]));
629 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[2]));
630
631 patch += 16u * tChannels;
632 }
633
634 if constexpr (partialBlock16)
635 {
636 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
637 ocean_assert(overlappingElements < 8u);
638
639 if (y < tPatchSize - 1u)
640 {
641 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
642 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
643 constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
644 const uint8x16_t mask_u_8x16 = vcombine_u16(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
645
646 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
647
648 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
649 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
650 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
651 }
652 else
653 {
654 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
655 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
656 constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
657 const uint8x16_t mask_u_8x16 = vcombine_u16(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
658
659 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch - overlappingElements * tChannels);
660
661 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
662 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
663 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
664 }
665
666 patch += remainingAfterBlocks16 * tChannels;
667 }
668
669 for (unsigned int n = 0u; n < blocks8; ++n)
670 {
671 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
672
673 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(patch_u_8x8x3.val[0]));
674 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(patch_u_8x8x3.val[1]));
675 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(patch_u_8x8x3.val[2]));
676
677 patch += 8u * tChannels;
678 }
679
680 if constexpr (partialBlock8)
681 {
682 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
683 ocean_assert(overlappingElements < 8u);
684
685 if (y < tPatchSize - 1u)
686 {
687 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
688 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
689
690 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
691
692 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
693 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
694 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
695 }
696 else
697 {
698 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
699 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
700
701 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch - overlappingElements * tChannels);
702
703 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
704 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
705 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
706 }
707
708 patch += remainingAfterBlocks8 * tChannels;
709 }
710
711 if constexpr (blocks1 != 0u)
712 {
713 for (unsigned int n = 0u; n < blocks1; ++n)
714 {
715 sumIndividual[0] += patch[tChannels * n + 0u];
716 sumIndividual[1] += patch[tChannels * n + 1u];
717 sumIndividual[2] += patch[tChannels * n + 2u];
718 }
719
720 patch += blocks1 * tChannels;
721 }
722
723 patch += patchStrideElements - tChannels * tPatchSize;
724 }
725
726 const uint32_t sum0 = NEON::sumHorizontal_u_32x4(sumChannel0_u_32x4) + sumIndividual[0];
727 meanValues[0] = uint8_t((sum0 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
728
729 const uint32_t sum1 = NEON::sumHorizontal_u_32x4(sumChannel1_u_32x4) + sumIndividual[1];
730 meanValues[1] = uint8_t((sum1 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
731
732 const uint32_t sum2 = NEON::sumHorizontal_u_32x4(sumChannel2_u_32x4) + sumIndividual[2];
733 meanValues[2] = uint8_t((sum2 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
734}
735
736template <unsigned int tChannels>
737template <unsigned int tPatchSize>
738inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
739{
740 static_assert(tChannels >= 1u, "Invalid channel number!");
741 static_assert(tPatchSize >= 1u, "Invalid patch size!");
742
743 ocean_assert(patch != nullptr && meanValues != nullptr);
744
745 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
746
747 uint32_t sum[tChannels] = {0u};
748
749 for (unsigned int y = 0u; y < tPatchSize; ++y)
750 {
751 for (unsigned int x = 0u; x < tPatchSize; ++x)
752 {
753 for (unsigned int n = 0u; n < tChannels; ++n)
754 {
755 sum[n] += patch[x * tChannels + n];
756 }
757 }
758
759 patch += patchStrideElements;
760 }
761
762 for (unsigned int n = 0u; n < tChannels; ++n)
763 {
764 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
765 }
766}
767
// Specialization for 1 channel: NEON-accelerated mean of a square patch whose coordinates are mirrored at the image border.
template <>
template <unsigned int tPatchSize>
inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues)
{
	static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");

	ocean_assert(image != nullptr && meanValues != nullptr);
	ocean_assert(centerX < width && centerY < height);

	constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;

	const unsigned int imageStrideElements = width + imagePaddingElements;

	// each patch row (tPatchSize pixels) is partitioned into full 16-pixel NEON blocks, an optional partial 16-pixel block,
	// full 8-pixel blocks, an optional partial 8-pixel block, and finally individually handled pixels

	constexpr unsigned int blocks16 = tPatchSize / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;

	// a partial 16-pixel block is only used when more than 10 pixels remain; otherwise 8-pixel blocks follow
	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 7u, "Invalid block size!");

	uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);

	// sum of the pixels which are handled individually (without NEON)
	uint32_t sumIndividual = 0u;

	// scratch buffer for the loadMirrored_* helpers to assemble mirrored pixels before the NEON load
	uint8_t intermediate[16];

	for (int y = int(centerY) - int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
	{
		// the row index is mirrored at the image border; x may still be negative or >= width and is mirrored per load below
		const uint8_t* const mirroredRow = image + CVUtilities::mirrorIndex(y, height) * imageStrideElements;

		int x = int(centerX) - int(tPatchSize_2);

		for (unsigned int n = 0u; n < blocks16; ++n)
		{
			const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow, x, width, intermediate);

			// pairwise add to 16 bit, then accumulate into the 32-bit sums
			sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));

			x += 16;
		}

		if constexpr (partialBlock16)
		{
			// NOTE(review): the first template argument of loadMirrored_u_8x16 presumably selects a forward (over-reading)
			// vs. backward load strategy, with the backward variant required on the last patch row — confirm against the
			// helper's documentation
			if (y < int(centerY) + int(tPatchSize_2))
			{
				const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
			}
			else
			{
				const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
			}

			x += remainingAfterBlocks16;
		}

		for (unsigned int n = 0u; n < blocks8; ++n)
		{
			const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow, x, width, intermediate);

			sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));

			x += 8;
		}

		if constexpr (partialBlock8)
		{
			if (y < int(centerY) + int(tPatchSize_2))
			{
				const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
			}
			else
			{
				const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);

				sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
			}

			x += remainingAfterBlocks8;
		}

		if constexpr (blocks1 != 0u)
		{
			// remaining pixels are mirrored and summed without NEON
			for (unsigned int n = 0u; n < blocks1; ++n)
			{
				const unsigned int index = CVUtilities::mirrorIndex(x, width);

				sumIndividual += mirroredRow[index];

				x++;
			}
		}
	}

	const uint32_t sum = NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;

	// mean with rounding to the nearest integer
	meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
}
880
881template <unsigned int tChannels>
882template <unsigned int tPatchSize>
883inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues)
884{
885 static_assert(tChannels >= 1u, "Invalid channel number!");
886 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
887
888 ocean_assert(image != nullptr && meanValues != nullptr);
889 ocean_assert(centerX < width && centerY < height);
890
891 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
892
893 const unsigned int imageStrideElements = width * tChannels + imagePaddingElements;
894
895 uint32_t sum[tChannels] = {0u};
896
897 for (int y = int(centerY) - int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
898 {
899 const uint8_t* const mirroredRow = image + CVUtilities::mirrorIndex(y, height) * imageStrideElements;
900
901 for (int x = int(centerX) - int(tPatchSize_2); x <= int(centerX) + int(tPatchSize_2); ++x)
902 {
903 const uint8_t* const pixel = mirroredRow + CVUtilities::mirrorIndex(x, width) * tChannels;
904
905 for (unsigned int c = 0u; c < tChannels; ++c)
906 {
907 sum[c] += pixel[c];
908 }
909 }
910 }
911
912 for (unsigned int n = 0u; n < tChannels; ++n)
913 {
914 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
915 }
916}
917
// Specialization for 1 channel: NEON-accelerated zero-mean sum of squared differences between two pixel buffers.
template <>
template <unsigned int tPixels>
inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
{
	static_assert(tPixels >= 8u, "Invalid pixels!");

	ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
	ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);

	// the buffer (tPixels pixels) is partitioned into full 16-pixel NEON blocks, an optional partial 16-pixel block,
	// full 8-pixel blocks, an optional partial 8-pixel block, and finally up to two individually handled pixels

	constexpr unsigned int blocks16 = tPixels / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;

	// a partial (overlapping) 16-pixel block is only used when more than 10 pixels remain
	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	// [(buffer0 - mean0) - (buffer1 - mean1)]^2
	// [buffer0 - buffer1 - mean0 + mean1]^2

	// the constant (mean0 - mean1) is broadcast once and reused for every block
	const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));

	// two accumulators to shorten the dependency chain of the multiply-accumulate instructions
	uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);

	// sum of the pixels which are handled individually (without NEON)
	uint32_t sumIndividual = 0u;

	for (unsigned int n = 0u; n < blocks16; ++n)
	{
		const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0);
		const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1);

		const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16))); // low 8 bytes: buffer0 - buffer1
		const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16))); // high 8 bytes: buffer0 - buffer1

		const uint16x8_t buffer_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
		const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));

		// square the absolute differences and accumulate into 32-bit lanes
		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));

		buffer0 += 16;
		buffer1 += 16;
	}

	if constexpr (partialBlock16)
	{
		constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
		ocean_assert(overlappingElements < 8u);

		// read backwards to stay within the buffers; the first 'overlappingElements' values were already processed above
		const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0 - overlappingElements);
		const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1 - overlappingElements);

		const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16))); // low 8 bytes: buffer0 - buffer1
		const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16))); // high 8 bytes: buffer0 - buffer1

		// 16-bit lane mask zeroing the leading overlapping lanes so they do not contribute twice
		// (overlappingElements < 8, so only the low 8 lanes can be affected)
		constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
		constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

		const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

		const uint16x8_t buffer_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
		const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));

		buffer0 += remainingAfterBlocks16;
		buffer1 += remainingAfterBlocks16;
	}

	for (unsigned int n = 0u; n < blocks8; ++n)
	{
		const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0);
		const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1);

		const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8)); // buffer0 - buffer1

		const uint16x8_t buffer_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));

		buffer0 += 8;
		buffer1 += 8;
	}

	if constexpr (partialBlock8)
	{
		constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
		ocean_assert(overlappingElements < 8u);

		// read backwards; the leading overlapping values were already processed and are masked out below
		const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0 - overlappingElements);
		const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1 - overlappingElements);

		const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8)); // buffer0 - buffer1

		// 16-bit lane mask zeroing the leading overlapping lanes
		constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
		constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

		const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

		const uint16x8_t buffer_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]

		sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
		sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));

		buffer0 += remainingAfterBlocks8;
		buffer1 += remainingAfterBlocks8;
	}

	if constexpr (blocks1 != 0u)
	{
		// remaining 1-2 pixels are handled without NEON
		for (unsigned int n = 0u; n < blocks1; ++n)
		{
			sumIndividual += sqrDistance(int16_t(buffer0[n] - meanValues0[0]), int16_t(buffer1[n] - meanValues1[0]));
		}
	}

	const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);

	// horizontal reduction plus the individually handled pixels
	return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
}
1055
1056template <>
1057template <unsigned int tPixels>
1058inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1059{
1060 static_assert(tPixels >= 8u, "Invalid pixels!");
1061
1062 constexpr unsigned int tChannels = 3u;
1063
1064 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1065 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1066
1067 constexpr unsigned int blocks16 = tPixels / 16u;
1068 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
1069
1070 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1071 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1072
1073 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1074 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1075
1076 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1077 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1078
1079 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1080
1081 static_assert(blocks1 <= 2u, "Invalid block size!");
1082
1083 // [(buffer0 - mean0) - (buffer1 - mean1)]^2
1084 // [buffer0 - buffer1 - mean0 + mean1]^2
1085
1086 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1087 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1088 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
1089
1090 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1091 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1092
1093 uint32_t sumIndividual = 0u;
1094
1095 for (unsigned int n = 0u; n < blocks16; ++n)
1096 {
1097 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0);
1098 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1);
1099
1100 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0]))); // low 8 bytes: buffer0 - buffer1
1101 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0]))); // high 8 bytes: buffer0 - buffer1
1102
1103 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1104 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1105
1106 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1107 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1108
1109
1110 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1111 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1112
1113 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1114 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1115
1116 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1117 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1118
1119
1120 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1121 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1122 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1123 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1124
1125 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1126 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1127 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1128 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1129
1130 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1131 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1132 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1133 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1134
1135
1136 buffer0 += 16u * tChannels;
1137 buffer1 += 16u * tChannels;
1138 }
1139
1140 if constexpr (partialBlock16)
1141 {
1142 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1143 ocean_assert(overlappingElements < 8u);
1144
1145 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0 - overlappingElements * tChannels);
1146 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1 - overlappingElements * tChannels);
1147
1148
1149 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0]))); // low 8 bytes: buffer0 - buffer1
1150 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0]))); // high 8 bytes: buffer0 - buffer1
1151
1152 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1153 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1154
1155 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1156 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1157
1158
1159 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1160 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1161
1162 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1163
1164
1165 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1166 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1167
1168 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1169 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1170
1171 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1172 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1173
1174
1175 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1176 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1177 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1178 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1179
1180 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1181 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1182 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1183 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1184
1185 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1186 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1187 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1188 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1189
1190 buffer0 += remainingAfterBlocks16 * tChannels;
1191 buffer1 += remainingAfterBlocks16 * tChannels;
1192 }
1193
1194 for (unsigned int n = 0u; n < blocks8; ++n)
1195 {
1196 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0);
1197 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1);
1198
1199 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0])); // buffer0 - buffer1
1200 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1201 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1202
1203 const uint16x8_t bufferChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1204 const uint16x8_t bufferChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1205 const uint16x8_t bufferChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1206
1207 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1208 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1209
1210 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1211 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1212
1213 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1214 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1215
1216 buffer0 += 8u * tChannels;
1217 buffer1 += 8u * tChannels;
1218 }
1219
1220 if constexpr (partialBlock8)
1221 {
1222 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1223 ocean_assert(overlappingElements < 8u);
1224
1225 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0 - overlappingElements * tChannels);
1226 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1 - overlappingElements * tChannels);
1227
1228 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0])); // buffer0 - buffer1
1229 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1230 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1231
1232 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1233 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1234
1235 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1236
1237 const uint16x8_t bufferChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1238 const uint16x8_t bufferChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1239 const uint16x8_t bufferChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1240
1241 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1242 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1243
1244 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1245 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1246
1247 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1248 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1249
1250 buffer0 += remainingAfterBlocks8 * tChannels;
1251 buffer1 += remainingAfterBlocks8 * tChannels;
1252 }
1253
1254 if constexpr (blocks1 != 0u)
1255 {
1256 for (unsigned int n = 0u; n < blocks1; ++n)
1257 {
1258 for (unsigned int c = 0u; c < tChannels; ++c)
1259 {
1260 sumIndividual += sqrDistance(int16_t(buffer0[n * tChannels + c] - meanValues0[c]), int16_t(buffer1[n * tChannels + c] - meanValues1[c]));
1261 }
1262 }
1263
1264 buffer0 += blocks1 * tChannels;
1265 buffer1 += blocks1 * tChannels;
1266 }
1267
1268 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1269
1270 return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
1271}
1272
1273template <unsigned int tChannels>
1274template <unsigned int tPixels>
1275inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1276{
1277 static_assert(tChannels >= 1u, "Invalid channel number!");
1278 static_assert(tPixels >= 1u, "Invalid pixels!");
1279
1280 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1281 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1282
1283 uint32_t zmssd = 0u;
1284
1285 for (unsigned int x = 0u; x < tPixels; ++x)
1286 {
1287 for (unsigned int c = 0u; c < tChannels; ++c)
1288 {
1289 zmssd += sqrDistance(buffer0[x * tChannels + c] - meanValues0[c], buffer1[x * tChannels + c] - meanValues1[c]);
1290 }
1291 }
1292
1293 return zmssd;
1294}
1295
template <>
template <unsigned int tPatchSize>
inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
{
	static_assert(tPatchSize >= 5u, "Invalid patch size!");

	ocean_assert(patch0 != nullptr && patch1 != nullptr);
	ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);

	ocean_assert(patch0StrideElements >= tPatchSize);
	ocean_assert(patch1StrideElements >= tPatchSize);

	// Each row of tPatchSize pixels is decomposed (at compile time) into:
	// full 16-pixel NEON blocks, an optional partial 16-pixel block (only when more than
	// 10 pixels remain, so that masking pays off), full 8-pixel blocks, an optional
	// partial 8-pixel block (when at least 3 pixels remain), and up to 2 scalar pixels.

	constexpr unsigned int blocks16 = tPatchSize / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	// [(patch0 - mean0) - (patch1 - mean1)]^2
	// [patch0 - patch1 - mean0 + mean1]^2

	// the constant difference of the two mean values, broadcast to all eight 16-bit lanes
	const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));

	// two independent 32-bit accumulators (merged after the loop)
	uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);

	// scalar accumulator for the up-to-2 leftover pixels per row
	uint32_t sumIndividual = 0u;

	for (unsigned int y = 0u; y < tPatchSize; ++y)
	{
		for (unsigned int n = 0u; n < blocks16; ++n)
		{
			const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
			const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);

			const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
			const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

			// absolute difference so the square can be accumulated with an unsigned widening multiply-add
			const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
			const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));

			patch0 += 16;
			patch1 += 16;
		}

		if constexpr (partialBlock16)
		{
			// the partial block still loads a full 16-byte vector; 'overlappingElements'
			// of its lanes do not belong to this row's remaining pixels and must be ignored
			constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
			ocean_assert(overlappingElements < 8u);

			if (y < tPatchSize - 1u)
			{
				// not the last row: the load may read past the row's patch pixels into the
				// following image memory; the excess (high) lanes are zeroed via the mask below
				const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
				const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
				// 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF

				// the masks address 16-bit lanes, hence the '* 2u * 8u' bits per masked element
				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}
			else
			{
				// last row: shift the load window back by 'overlappingElements' so no memory
				// behind the patch is read; the excess (low) lanes are zeroed via the mask below
				const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0 - overlappingElements);
				const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1 - overlappingElements);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				// mirrored masks: now the leading 'overlappingElements' 16-bit lanes are zeroed
				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}

			// advance by the number of actually-consumed pixels (not by the load width)
			patch0 += remainingAfterBlocks16;
			patch1 += remainingAfterBlocks16;
		}

		for (unsigned int n = 0u; n < blocks8; ++n)
		{
			const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
			const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);

			const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

			const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));

			patch0 += 8;
			patch1 += 8;
		}

		if constexpr (partialBlock8)
		{
			// same over-read/shift-back strategy as the partial 16-block, on an 8-byte vector
			constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
			ocean_assert(overlappingElements < 8u);

			if (y < tPatchSize - 1u)
			{
				// not the last row: over-read into following memory, zero the excess high lanes
				const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
				const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}
			else
			{
				// last row: shift the load window back, zero the excess low lanes
				const uint8x8_t patch0_u_8x8 = vld1_u8(patch0 - overlappingElements);
				const uint8x8_t patch1_u_8x8 = vld1_u8(patch1 - overlappingElements);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}

			patch0 += remainingAfterBlocks8;
			patch1 += remainingAfterBlocks8;
		}

		if constexpr (blocks1 != 0u)
		{
			// at most 2 trailing pixels per row, handled with plain scalar arithmetic
			for (unsigned int n = 0u; n < blocks1; ++n)
			{
				sumIndividual += sqrDistance(int16_t(patch0[n] - meanValues0[0]), int16_t(patch1[n] - meanValues1[0]));
			}

			patch0 += blocks1;
			patch1 += blocks1;
		}

		// jump over the remaining stride elements to the start of the next patch row
		patch0 += patch0StrideElements - tPatchSize;
		patch1 += patch1StrideElements - tPatchSize;
	}

	// merge both vector accumulators, reduce horizontally, and add the scalar remainder
	const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);

	return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
}
1493
1494template <>
1495template <unsigned int tPatchSize>
1496inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1497{
1498 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1499
1500 constexpr unsigned int tChannels = 3u;
1501
1502 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1503 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1504
1505 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1506 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1507
1508 constexpr unsigned int blocks16 = tPatchSize / 16u;
1509 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1510
1511 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1512 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1513
1514 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1515 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1516
1517 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1518 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1519
1520 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1521
1522 static_assert(blocks1 <= 2u, "Invalid block size!");
1523
1524 // [(patch0 - mean0) - (patch1 - mean1)]^2
1525 // [patch0 - patch1 - mean0 + mean1]^2
1526
1527 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1528 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1529 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
1530
1531 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1532 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1533
1534 uint32_t sumIndividual = 0u;
1535
1536 for (unsigned int y = 0u; y < tPatchSize; ++y)
1537 {
1538 for (unsigned int n = 0u; n < blocks16; ++n)
1539 {
1540 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1541 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1542
1543 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1544 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1545
1546 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1547 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1548
1549 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1550 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1551
1552
1553 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1554 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1555
1556 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1557 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1558
1559 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1560 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1561
1562
1563 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1564 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1565 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1566 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1567
1568 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1569 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1570 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1571 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1572
1573 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1574 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1575 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1576 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1577
1578
1579 patch0 += 16u * tChannels;
1580 patch1 += 16u * tChannels;
1581 }
1582
1583 if constexpr (partialBlock16)
1584 {
1585 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1586 ocean_assert(overlappingElements < 8u);
1587
1588 if (y < tPatchSize - 1u)
1589 {
1590 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1591 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1592
1593
1594 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1595 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1596
1597 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1598 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1599
1600 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1601 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1602
1603
1604 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
1605 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
1606
1607 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1608 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1609
1610 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1611
1612
1613 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1614 const uint16x8_t patchChannel0_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1615
1616 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1617 const uint16x8_t patchChannel1_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1618
1619 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1620 const uint16x8_t patchChannel2_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1621
1622
1623 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1624 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1625 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1626 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1627
1628 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1629 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1630 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1631 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1632
1633 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1634 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1635 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1636 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1637 }
1638 else
1639 {
1640 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0 - overlappingElements * tChannels);
1641 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1 - overlappingElements * tChannels);
1642
1643
1644 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1645 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1646
1647 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1648 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1649
1650 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1651 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1652
1653
1654 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1655 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1656
1657 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1658
1659
1660 const uint16x8_t patchChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1661 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1662
1663 const uint16x8_t patchChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1664 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1665
1666 const uint16x8_t patchChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1667 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1668
1669
1670 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1671 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1672 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1673 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1674
1675 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1676 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1677 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1678 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1679
1680 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1681 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1682 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1683 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1684 }
1685
1686 patch0 += remainingAfterBlocks16 * tChannels;
1687 patch1 += remainingAfterBlocks16 * tChannels;
1688 }
1689
1690 for (unsigned int n = 0u; n < blocks8; ++n)
1691 {
1692 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1693 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1694
1695 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1696 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1697 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1698
1699 const uint16x8_t patchChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1700 const uint16x8_t patchChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1701 const uint16x8_t patchChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1702
1703 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1704 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1705
1706 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1707 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1708
1709 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1710 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1711
1712 patch0 += 8u * tChannels;
1713 patch1 += 8u * tChannels;
1714 }
1715
1716 if constexpr (partialBlock8)
1717 {
1718 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1719 ocean_assert(overlappingElements < 8u);
1720
1721 if (y < tPatchSize - 1u)
1722 {
1723 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1724 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1725
1726 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1727 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1728 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1729
1730 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1731 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1732
1733 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1734
1735 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1736 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1737 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1738
1739 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1740 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1741
1742 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1743 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1744
1745 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1746 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1747 }
1748 else
1749 {
1750 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0 - overlappingElements * tChannels);
1751 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1 - overlappingElements * tChannels);
1752
1753 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1754 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1755 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1756
1757 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1758 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1759
1760 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1761
1762 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
1763 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1764 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1765
1766 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1767 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1768
1769 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1770 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1771
1772 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1773 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1774 }
1775
1776 patch0 += remainingAfterBlocks8 * tChannels;
1777 patch1 += remainingAfterBlocks8 * tChannels;
1778 }
1779
1780 if constexpr (blocks1 != 0u)
1781 {
1782 for (unsigned int n = 0u; n < blocks1; ++n)
1783 {
1784 for (unsigned int c = 0u; c < tChannels; ++c)
1785 {
1786 sumIndividual += sqrDistance(int16_t(patch0[n * tChannels + c] - meanValues0[c]), int16_t(patch1[n * tChannels + c] - meanValues1[c]));
1787 }
1788 }
1789
1790 patch0 += blocks1 * tChannels;
1791 patch1 += blocks1 * tChannels;
1792 }
1793
1794 patch0 += patch0StrideElements - tPatchSize * tChannels;
1795 patch1 += patch1StrideElements - tPatchSize * tChannels;
1796 }
1797
1798 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1799
1800 return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
1801}
1802
1803template <unsigned int tChannels>
1804template <unsigned int tPatchSize>
1805inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1806{
1807 static_assert(tChannels >= 1u, "Invalid channel number!");
1808 static_assert(tPatchSize >= 1u, "Invalid patch size!");
1809
1810 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1811 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1812
1813 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1814 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1815
1816 uint32_t zmssd = 0u;
1817
1818 for (unsigned int y = 0u; y < tPatchSize; ++y)
1819 {
1820 for (unsigned int x = 0u; x < tPatchSize; ++x)
1821 {
1822 for (unsigned int n = 0u; n < tChannels; ++n)
1823 {
1824 zmssd += sqrDistance(patch0[x * tChannels + n] - meanValues0[n], patch1[x * tChannels + n] - meanValues1[n]);
1825 }
1826 }
1827
1828 patch0 += patch0StrideElements;
1829 patch1 += patch1StrideElements;
1830 }
1831
1832 return zmssd;
1833}
1834
template <>
template <unsigned int tPatchSize>
inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
{
	// NEON-optimized zero-mean SSD for 1-channel patches; patch pixels lying outside the image are mirrored back into the image
	static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
	static_assert(tPatchSize >= 5u, "Invalid patch size!");

	constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;

	ocean_assert(image0 != nullptr && image1 != nullptr);
	ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);

	ocean_assert(centerX0 < width0 && centerY0 < height0);
	ocean_assert(centerX1 < width1 && centerY1 < height1);

	const unsigned int image0StrideElements = width0 + image0PaddingElements;
	const unsigned int image1StrideElements = width1 + image1PaddingElements;

	// each row of tPatchSize pixels is partitioned into: full 16-pixel blocks, optionally one partial 16-pixel
	// block (only if more than 10 pixels remain), full 8-pixel blocks, optionally one partial 8-pixel block
	// (only if at least 3 pixels remain), and up to two individually handled pixels

	constexpr unsigned int blocks16 = tPatchSize / 16u;
	constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;

	constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
	constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;

	constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
	constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;

	constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
	constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;

	constexpr unsigned int blocks1 = remainingAfterPartialBlock8;

	static_assert(blocks1 <= 2u, "Invalid block size!");

	// [(patch0 - mean0) - (patch1 - mean1)]^2
	// [patch0 - patch1 - mean0 + mean1]^2

	// broadcast the constant mean difference once; per pixel only (patch0 - patch1) has to be computed
	const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));

	// two interleaved 4-lane accumulators (combined at the very end)
	uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
	uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);

	// accumulator for the up to two pixels per row which are handled without NEON
	uint32_t sumIndividual = 0u;

	// scratch buffer used by loadMirrored_u_8x16()/loadMirrored_u_8x8() whenever mirrored pixels need to be gathered
	uint8_t intermediate[16];

	int y1 = int(centerY1) - int(tPatchSize_2);
	for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
	{
		const uint8_t* const mirroredRow0 = image0 + CVUtilities::mirrorIndex(y0, height0) * image0StrideElements;
		const uint8_t* const mirroredRow1 = image1 + CVUtilities::mirrorIndex(y1, height1) * image1StrideElements;

		int x0 = int(centerX0) - int(tPatchSize_2);
		int x1 = int(centerX1) - int(tPatchSize_2);

		for (unsigned int n = 0u; n < blocks16; ++n)
		{
			const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow0, x0, width0, intermediate);
			const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow1, x1, width1, intermediate);

			const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
			const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

			const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
			const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

			// square the absolute differences and accumulate (vmlal_u16 widens 16 -> 32 bit)
			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));

			x0 += 16;
			x1 += 16;
		}

		if constexpr (partialBlock16)
		{
			constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
			ocean_assert(overlappingElements < 8u);

			// NOTE(review): rows before the last use a forward load (which may read up to overlappingElements
			// bytes beyond the partial block, masked to zero), while the last row uses a backward-shifted load —
			// presumably to avoid reading beyond the image buffer on the final row; verify against loadMirrored_u_8x16()
			if (y0 < int(centerY0) + int(tPatchSize_2))
			{
				const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				// mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
				// 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF

				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}
			else
			{
				// last row: load the 16 pixels ending at the last pixel of the partial block (shifted backwards),
				// the leading (overlapping) lanes are zeroed via the mask below
				const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
				const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1

				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]
				const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
			}

			x0 += remainingAfterBlocks16;
			x1 += remainingAfterBlocks16;
		}

		for (unsigned int n = 0u; n < blocks8; ++n)
		{
			const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow0, x0, width0, intermediate);
			const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow1, x1, width1, intermediate);

			const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

			const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

			sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
			sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));

			x0 += 8;
			x1 += 8;
		}

		if constexpr (partialBlock8)
		{
			constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
			ocean_assert(overlappingElements < 8u);

			// same forward/backward load strategy as for the partial 16-pixel block above
			if (y0 < int(centerY0) + int(tPatchSize_2))
			{
				const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
				constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}
			else
			{
				const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
				const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);

				const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1

				constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
				constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);

				const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));

				const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 255 * 255]

				sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
				sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
			}

			x0 += remainingAfterBlocks8;
			x1 += remainingAfterBlocks8;
		}

		if constexpr (blocks1 != 0u)
		{
			// remaining up to two pixels per row, handled scalar with explicitly mirrored indices
			for (unsigned int n = 0u; n < blocks1; ++n)
			{
				const unsigned int index0 = CVUtilities::mirrorIndex(x0 + int(n), width0);
				const unsigned int index1 = CVUtilities::mirrorIndex(x1 + int(n), width1);

				sumIndividual += sqrDistance(int16_t(mirroredRow0[index0] - meanValues0[0]), int16_t(mirroredRow1[index1] - meanValues1[0]));
			}
		}

		++y1;
	}

	// combine the eight 32-bit lanes and the scalar accumulator into the final sum
	const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);

	return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
}
2046
2047template <unsigned int tChannels>
2048template <unsigned int tPatchSize>
2049inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
2050{
2051 static_assert(tChannels >= 1u, "Invalid channel number!");
2052 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
2053
2054 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
2055
2056 ocean_assert(image0 != nullptr && image1 != nullptr);
2057 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
2058
2059 ocean_assert(centerX0 < width0 && centerY0 < height0);
2060 ocean_assert(centerX1 < width1 && centerY1 < height1);
2061
2062 const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
2063 const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
2064
2065 uint32_t zmssd = 0u;
2066
2067 int y1 = int(centerY1) - int(tPatchSize_2);
2068 for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
2069 {
2070 const uint8_t* const mirroredRow0 = image0 + CVUtilities::mirrorIndex(y0, height0) * image0StrideElements;
2071 const uint8_t* const mirroredRow1 = image1 + CVUtilities::mirrorIndex(y1, height1) * image1StrideElements;
2072
2073 int x1 = int(centerX1) - int(tPatchSize_2);
2074 for (int x0 = int(centerX0) - int(tPatchSize_2); x0 <= int(centerX0) + int(tPatchSize_2); ++x0)
2075 {
2076 const uint8_t* const pixel0 = mirroredRow0 + CVUtilities::mirrorIndex(x0, width0) * tChannels;
2077 const uint8_t* const pixel1 = mirroredRow1 + CVUtilities::mirrorIndex(x1, width1) * tChannels;
2078
2079 for (unsigned int c = 0u; c < tChannels; ++c)
2080 {
2081 zmssd += sqrDistance(pixel0[c] - meanValues0[c], pixel1[c] - meanValues1[c]);
2082 }
2083
2084 ++x1;
2085 }
2086
2087 ++y1;
2088 }
2089
2090 return zmssd;
2091}
2092
2093template <unsigned int tChannels, unsigned int tPixels>
2094inline uint32_t ZeroMeanSumSquareDifferencesNEON::buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1)
2095{
2096 static_assert(tChannels >= 1u, "Invalid channel number!");
2097 static_assert(tPixels >= 8u, "Invalid pixels!");
2098
2099 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
2100
2101 uint8_t meanValues0[tChannels];
2102 mean8BitPerChannel<tChannels, tPixels>(buffer0, meanValues0);
2103
2104 uint8_t meanValues1[tChannels];
2105 mean8BitPerChannel<tChannels, tPixels>(buffer1, meanValues1);
2106
2107 return SpecializedForChannels<tChannels>::template buffer8BitPerChannel<tPixels>(buffer0, buffer1, meanValues0, meanValues1);
2108}
2109
2110template <unsigned int tChannels, unsigned int tPatchSize>
2111inline uint32_t ZeroMeanSumSquareDifferencesNEON::patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
2112{
2113 static_assert(tChannels >= 1u, "Invalid channel number!");
2114 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2115
2116 ocean_assert(patch0 != nullptr && patch1 != nullptr);
2117
2118 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
2119 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
2120
2121 uint8_t meanValues0[tChannels];
2122 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2123
2124 uint8_t meanValues1[tChannels];
2125 mean8BitPerChannel<tChannels, tPatchSize>(patch1, patch1StrideElements, meanValues1);
2126
2127 return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, patch1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
2128}
2129
2130template <unsigned int tChannels, unsigned int tPatchSize>
2131inline uint32_t ZeroMeanSumSquareDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
2132{
2133 static_assert(tChannels >= 1u, "Invalid channel number!");
2134 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2135
2136 ocean_assert(patch0 != nullptr && buffer1 != nullptr);
2137
2138 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
2139
2140 uint8_t meanValues0[tChannels];
2141 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2142
2143 uint8_t meanValues1[tChannels];
2144 mean8BitPerChannel<tChannels, tPatchSize * tPatchSize>(buffer1, meanValues1);
2145
2146 constexpr unsigned int patch1StrideElements = tChannels * tPatchSize;
2147
2148 return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, buffer1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
2149}
2150
2151template <unsigned int tChannels, unsigned int tPatchSize>
2152uint32_t ZeroMeanSumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
2153{
2154 static_assert(tChannels >= 1u, "Invalid channel number!");
2155 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2156
2157 ocean_assert(image0 != nullptr && image1 != nullptr);
2158
2159 uint8_t meanValues0[tChannels];
2160 SpecializedForChannels<tChannels>::template mean8BitPerChannelMirroredBorder<tPatchSize>(image0, width0, height0, centerX0, centerY0, image0PaddingElements, meanValues0);
2161
2162 uint8_t meanValues1[tChannels];
2163 SpecializedForChannels<tChannels>::template mean8BitPerChannelMirroredBorder<tPatchSize>(image1, width1, height1, centerX1, centerY1, image1PaddingElements, meanValues1);
2164
2165 return SpecializedForChannels<tChannels>::template patchMirroredBorder8BitPerChannel<tPatchSize>(image0, image1, width0, height0, width1, height1, centerX0, centerY0, centerX1, centerY1, image0PaddingElements, image1PaddingElements, meanValues0, meanValues1);
2166}
2167
2168template <unsigned int tChannels, unsigned int tPixels>
2169OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesNEON::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
2170{
2171 static_assert(tChannels >= 1u, "Invalid channel number!");
2172 static_assert(tPixels >= 8u, "Invalid patch size!");
2173
2174 SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPixels>(buffer, meanValues);
2175}
2176
2177template <unsigned int tChannels, unsigned int tPatchSize>
2178OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesNEON::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
2179{
2180 static_assert(tChannels >= 1u, "Invalid channel number!");
2181 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2182
2183 SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPatchSize>(patch, patchStrideElements, meanValues);
2184}
2185
2186template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
2187OCEAN_FORCE_INLINE uint8x8_t ZeroMeanSumSquareDifferencesNEON::loadMirrored_u_8x8(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer)
2188{
2189 ocean_assert(tPixels >= 1u && tPixels <= 8u);
2190
2191 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
2192
2193 constexpr unsigned int tOverlappingElements = 8u - tPixels;
2194
2195 if (x >= 0 && x <= int(width) - int(tPixels))
2196 {
2197 if constexpr (tPixels == 8u)
2198 {
2199 return vld1_u8(row + x);
2200 }
2201 else
2202 {
2203 if constexpr (tFront)
2204 {
2205 if constexpr (tOverlappingToZero)
2206 {
2207 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
2208 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
2209
2210 return vand_u8(vld1_u8(row + x), mask_u_8x8);
2211 }
2212 else
2213 {
2214 return vld1_u8(row + x);
2215 }
2216 }
2217 else
2218 {
2219 if constexpr (tOverlappingToZero)
2220 {
2221 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
2222 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
2223
2224 return vand_u8(vld1_u8(row + x - int(tOverlappingElements)), mask_u_8x8);
2225 }
2226 else
2227 {
2228 return vld1_u8(row + x - int(tOverlappingElements));
2229 }
2230 }
2231 }
2232 }
2233
2234 if constexpr (tFront)
2235 {
2236 for (unsigned int n = 0u; n < tPixels; ++n)
2237 {
2238 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2239 ocean_assert(mirroredIndex < width);
2240
2241 intermediateBuffer[n] = row[mirroredIndex];
2242 }
2243
2244 if constexpr (tOverlappingToZero)
2245 {
2246 for (unsigned int n = tPixels; n < 8u; ++n)
2247 {
2248 intermediateBuffer[n] = 0u;
2249 }
2250 }
2251 }
2252 else
2253 {
2254 if constexpr (tOverlappingToZero)
2255 {
2256 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
2257 {
2258 intermediateBuffer[n] = 0u;
2259 }
2260 }
2261
2262 for (unsigned int n = 0u; n < tPixels; ++n)
2263 {
2264 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2265 ocean_assert(mirroredIndex < width);
2266
2267 intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
2268 }
2269 }
2270
2271 return vld1_u8(intermediateBuffer);
2272}
2273
2274template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
2275OCEAN_FORCE_INLINE uint8x16_t ZeroMeanSumSquareDifferencesNEON::loadMirrored_u_8x16(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer)
2276{
2277 ocean_assert(tPixels > 8u && tPixels <= 16u);
2278
2279 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
2280
2281 constexpr unsigned int tOverlappingElements = 16u - tPixels;
2282
2283 if (x >= 0 && x <= int(width) - int(tPixels))
2284 {
2285 if constexpr (tPixels == 16u)
2286 {
2287 return vld1q_u8(row + x);
2288 }
2289 else
2290 {
2291 if constexpr (tFront)
2292 {
2293 if constexpr (tOverlappingToZero)
2294 {
2295 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
2296 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
2297
2298 return vandq_u8(vld1q_u8(row + x), mask_u_8x16);
2299 }
2300 else
2301 {
2302 return vld1q_u8(row + x);
2303 }
2304 }
2305 else
2306 {
2307 if constexpr (tOverlappingToZero)
2308 {
2309 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
2310 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
2311
2312 return vandq_u8(vld1q_u8(row + x - int(tOverlappingElements)), mask_u_8x16);
2313 }
2314 else
2315 {
2316 return vld1q_u8(row + x - int(tOverlappingElements));
2317 }
2318 }
2319 }
2320 }
2321
2322 if constexpr (tFront)
2323 {
2324 for (unsigned int n = 0u; n < tPixels; ++n)
2325 {
2326 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2327 ocean_assert(mirroredIndex < width);
2328
2329 intermediateBuffer[n] = row[mirroredIndex];
2330 }
2331
2332 if constexpr (tOverlappingToZero)
2333 {
2334 for (unsigned int n = tPixels; n < 16u; ++n)
2335 {
2336 intermediateBuffer[n] = 0u;
2337 }
2338 }
2339 }
2340 else
2341 {
2342 if constexpr (tOverlappingToZero)
2343 {
2344 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
2345 {
2346 intermediateBuffer[n] = 0u;
2347 }
2348 }
2349
2350 for (unsigned int n = 0u; n < tPixels; ++n)
2351 {
2352 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2353 ocean_assert(mirroredIndex < width);
2354
2355 intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
2356 }
2357 }
2358
2359 return vld1q_u8(intermediateBuffer);
2360}
2361
2362}
2363
2364}
2365
2366#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
2367
2368#endif // META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int index, const unsigned int elements)
Returns the mirrored index for a given index.
Definition CVUtilities.h:459
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1084
This class allows to specialize functions for individual channels.
Definition ZeroMeanSumSquareDifferencesNEON.h:39
static void mean8BitPerChannelMirroredBorder(const uint8_t *const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t *const meanValues)
Determines the mean value for an image patch, one value for each channel, patch pixels outside the im...
Definition ZeroMeanSumSquareDifferencesNEON.h:883
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image.
Definition ZeroMeanSumSquareDifferencesNEON.h:1805
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
Definition ZeroMeanSumSquareDifferencesNEON.h:2049
static void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition ZeroMeanSumSquareDifferencesNEON.h:438
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition ZeroMeanSumSquareDifferencesNEON.h:1275
This class implements functions to calculate zero-mean sum square differences using NEON instructions.
Definition ZeroMeanSumSquareDifferencesNEON.h:30
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
Definition ZeroMeanSumSquareDifferencesNEON.h:2152
static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition ZeroMeanSumSquareDifferencesNEON.h:2169
static uint32_t patch8BitPerChannel(const uint8_t *const patch0, const uint8_t *const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the zero-mean sum of square differences between two patches within an image.
Definition ZeroMeanSumSquareDifferencesNEON.h:2111
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition ZeroMeanSumSquareDifferencesNEON.h:2094
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition ZeroMeanSumSquareDifferencesNEON.h:2187
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the zero-mean sum of square differences between an image patch and a buffer.
Definition ZeroMeanSumSquareDifferencesNEON.h:2131
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition ZeroMeanSumSquareDifferencesNEON.h:2275
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15