ZeroMeanSumSquareDifferencesNEON.h
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
9#define META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
10
11#include "ocean/cv/CV.h"
12
13#include "ocean/cv/CVUtilities.h"
14
15#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
16
17#include "ocean/cv/NEON.h"
18
19namespace Ocean
20{
21
22namespace CV
23{
24
25/**
26 * This class implements functions to calculate the zero-mean sum of square differences using NEON instructions.
27 * @ingroup cv
28 */
29class ZeroMeanSumSquareDifferencesNEON
30{
31 protected:
32
33 /**
34 * This class allows functions to be specialized for individual channel numbers.
35 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
36 */
37 template <unsigned int tChannels>
38 class SpecializedForChannels
39 {
40 public:
41
42 /**
43 * Determines the mean value for a buffer, one value for each channel.
44 * @param buffer The memory buffer to be handled, must be valid
45 * @param meanValues The resulting mean values, one for each channel
46 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
47 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
48 */
49 template <unsigned int tPixels>
50 static inline void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
51
52 /**
53 * Determines the mean value for an image patch, one value for each channel.
54 * @param patch The top left start position of the image patch, must be valid
55 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
56 * @param meanValues The resulting mean values, one for each channel
57 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity)
58 */
59 template <unsigned int tPatchSize>
60 static inline void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
61
62 /**
63 * Determines the mean value for an image patch, one value for each channel, patch pixels outside the image will be mirrored back into the image.
64 * @param image The image in which the patch is located, must be valid
65 * @param width The width of the image, in pixels, with range [tPatchSize, infinity)
66 * @param height The height of the image, in pixels, with range [tPatchSize, infinity)
67 * @param centerX Horizontal center position of the (tPatchSize x tPatchSize) block in the frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
68 * @param centerY Vertical center position of the (tPatchSize x tPatchSize) block in the frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
69 * @param imagePaddingElements The number of padding elements at the end of each row of the image, in elements, with range [0, infinity)
70 * @param meanValues The resulting mean values, one for each channel
71 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
72 */
73 template <unsigned int tPatchSize>
74 static inline void mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues);
75
76 /**
77 * Returns the zero-mean sum of square differences between two memory buffers.
78 * @param buffer0 The first memory buffer, must be valid
79 * @param buffer1 The second memory buffer, must be valid
80 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
81 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
82 * @return The resulting sum of square differences
83 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
84 */
85 template <unsigned int tPixels>
86 static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
87
88 /**
89 * Returns the zero-mean sum of square differences between two patches within an image.
90 * @param patch0 The top left start position of the first image patch, must be valid
91 * @param patch1 The top left start position of the second image patch, must be valid
92 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
93 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
94 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
95 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
96 * @return The resulting sum of square differences
97 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
98 */
99 template <unsigned int tPatchSize>
100 static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
101
102 /**
103 * Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
104 * @param image0 The image in which the first patch is located, must be valid
105 * @param image1 The image in which the second patch is located, must be valid
106 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
107 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
108 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
109 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
110 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
111 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
112 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
113 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
114 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
115 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
116 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
117 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
118 * @return The resulting zero-mean sum of square differences, with range [0, infinity)
119 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
120 */
121 template <unsigned int tPatchSize>
122 static inline uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
123 };
124
125 public:
126
127 /**
128 * Returns the zero-mean sum of square differences between two memory buffers.
129 * @param buffer0 The first memory buffer, must be valid
130 * @param buffer1 The second memory buffer, must be valid
131 * @return The resulting sum of square differences
132 * @tparam tChannels Specifies the number of channels for the given buffers, with range [1, infinity)
133 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
134 */
135 template <unsigned int tChannels, unsigned int tPixels>
136 static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1);
137
138 /**
139 * Returns the zero-mean sum of square differences between two patches within an image.
140 * @param patch0 The top left start position of the first image patch, must be valid
141 * @param patch1 The top left start position of the second image patch, must be valid
142 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
143 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
144 * @return The resulting sum of square differences
145 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
146 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
147 */
148 template <unsigned int tChannels, unsigned int tPatchSize>
149 static inline uint32_t patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
150
151 /**
152 * Returns the zero-mean sum of square differences between an image patch and a buffer.
153 * @param patch0 The top left start position of the image patch, must be valid
154 * @param buffer1 The memory buffer, must be valid
155 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
156 * @return The resulting sum of square differences
157 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
158 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
159 */
160 template <unsigned int tChannels, unsigned int tPatchSize>
161 static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
162
163 /**
164 * Returns the zero-mean sum of square differences between two patches within an image, patch pixels outside the image will be mirrored back into the image.
165 * @param image0 The image in which the first patch is located, must be valid
166 * @param image1 The image in which the second patch is located, must be valid
167 * @param width0 The width of the first image, in pixels, with range [tPatchSize, infinity)
168 * @param height0 The height of the first image, in pixels, with range [tPatchSize, infinity)
169 * @param width1 The width of the second image, in pixels, with range [tPatchSize, infinity)
170 * @param height1 The height of the second image, in pixels, with range [tPatchSize, infinity)
171 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
172 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
173 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width - tPatchSize/2 - 1]
174 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1]
175 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
176 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
177 * @return The resulting zero-mean sum of square differences, with range [0, infinity)
178 * @tparam tChannels The number of frame channels, with range [1, infinity)
179 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
180 */
181 template <unsigned int tChannels, unsigned int tPatchSize>
182 static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
183
184 /**
185 * Determines the mean value for a buffer, one value for each channel.
186 * @param buffer The memory buffer to be handled, must be valid
187 * @param meanValues The resulting mean values, one for each channel
188 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
189 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
190 */
191 template <unsigned int tChannels, unsigned int tPixels>
192 static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
193
194 /**
195 * Determines the mean value for an image patch, one value for each channel.
196 * @param patch The top left start position of the image patch, must be valid
197 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
198 * @param meanValues The resulting mean values, one for each channel
199 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
200 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
201 */
202 template <unsigned int tChannels, unsigned int tPatchSize>
203 static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
204
205 protected:
206
207 /**
208 * Loads up to 8 uint8_t values from a 1-channel row with mirroring pixels if necessary.
209 * @param row The row from which the values will be loaded, must be valid
210 * @param x The index of the first pixel to load, with range [-elements/2, elements + elements/2]
211 * @param width The width of the row, in pixels, with range [4, infinity)
212 * @param intermediateBuffer An intermediate buffer with 8 elements, must be valid
213 * @return The uint8x8_t object with the loaded values
214 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x8_t object; False, to place the uint8_t values at the end
215 * @tparam tPixels The number of uint8_t pixels to be read, with range [1, 8]
216 * @tparam tOverlappingToZero True, to set overlapping pixels to zero; False, to get overlapping pixels with random values
217 */
218 template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
219 static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer);
220
221 /**
222 * Loads up to 16 uint8_t values from a 1-channel row with mirroring pixels if necessary.
223 * @param row The row from which the values will be loaded, must be valid
224 * @param x The index of the first pixel to load, with range [-elements/2, elements + elements/2]
225 * @param width The width of the row in pixels, with range [8, infinity)
226 * @param intermediateBuffer An intermediate buffer with 16 elements, must be valid
227 * @return The uint8x16_t object with the loaded values
228 * @tparam tFront True, if the uint8_t values will be placed at the front of the resulting uint8x16_t object; False, to place the uint8_t values at the end
229 * @tparam tPixels The number of uint8_t pixels to be read, with range [1, 16]
230 * @tparam tOverlappingToZero True, to set overlapping pixels to zero; False, to get overlapping pixels with random values
231 */
232 template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
233 static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer);
234};
235
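// Usage sketch (illustrative only; identifiers such as patch0TopLeft, patch1TopLeft, rgbBuffer and the
// stride values are hypothetical and must refer to valid 8-bit frame data):
//
//   // zero-mean SSD between two 5x5 grayscale patches; the patch means are determined internally
//   const uint32_t zmssd = ZeroMeanSumSquareDifferencesNEON::patch8BitPerChannel<1u, 5u>(patch0TopLeft, patch1TopLeft, frame0StrideElements, frame1StrideElements);
//
//   // per-channel mean values of a contiguous buffer holding 16 RGB pixels
//   uint8_t meanValues[3];
//   ZeroMeanSumSquareDifferencesNEON::mean8BitPerChannel<3u, 16u>(rgbBuffer, meanValues);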
236template <>
237template <unsigned int tPixels>
238inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
239{
240 static_assert(tPixels >= 8u, "Invalid pixels!");
241
242 ocean_assert(buffer != nullptr && meanValues != nullptr);
243
244 constexpr unsigned int blocks16 = tPixels / 16u;
245 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
246
247 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u && tPixels >= 16u;
248 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
249
250 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
251 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
252
253 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u && tPixels >= 8u;
254 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
255
256 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
257
258 static_assert(blocks1 <= 2u, "Invalid block size!");
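 // example decomposition for tPixels == 21: blocks16 == 1, remainingAfterBlocks16 == 5, no full 8-pixel block,
 // one partial 8-pixel block which re-reads the three already processed pixels and masks them to zero, blocks1 == 0;
 // for tPixels == 18 the two trailing pixels are instead handled individually (blocks1 == 2)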
259
260 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
261
262 uint32_t sumIndividual = 0u;
263
264 for (unsigned int n = 0u; n < blocks16; ++n)
265 {
266 const uint8x16_t buffer_u_8x16 = vld1q_u8(buffer);
267
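 // vpaddlq_u8 pairwise-adds the 16 uint8 values to eight uint16 values; vpadalq_u16 then pairwise-adds and accumulates them into the four uint32 lanes of the sum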
268 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));
269
270 buffer += 16;
271 }
272
273 if constexpr (partialBlock16)
274 {
275 static_assert(tPixels >= 16u, "We need to guarantee that loading 16 pixels' worth of data preceding the end boundary cannot cause a memory access violation");
276
277 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
278 ocean_assert(overlappingElements < 8u);
279
280 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
281 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
282 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));
283
284 const uint8x16_t buffer_u_8x16 = vandq_u8(vld1q_u8(buffer - overlappingElements), mask_u_8x16);
285
286 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(buffer_u_8x16));
287
288 buffer += remainingAfterBlocks16;
289 }
290
291 for (unsigned int n = 0u; n < blocks8; ++n)
292 {
293 const uint8x8_t buffer_u_8x8 = vld1_u8(buffer);
294
295 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));
296
297 buffer += 8;
298 }
299
300 if constexpr (partialBlock8)
301 {
302 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
303 ocean_assert(overlappingElements < 8u);
304
305 const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);
306
307 const uint8x8_t buffer_u_8x8 = vand_u8(vld1_u8(buffer - overlappingElements), mask_u_8x8);
308
309 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(buffer_u_8x8));
310
311 buffer += remainingAfterBlocks8;
312 }
313
314 if constexpr (blocks1 != 0u)
315 {
316 for (unsigned int n = 0u; n < blocks1; ++n)
317 {
318 sumIndividual += buffer[n];
319 }
320
321 buffer += blocks1;
322 }
323
324 uint32_t results[4];
325 vst1q_u32(results, sum_u_32x4);
326
327 const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;
328
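 // adding tPixels / 2 before the division rounds the mean to the nearest integer instead of truncating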
329 meanValues[0] = uint8_t((sum + tPixels / 2u) / tPixels);
330}
331
332template <>
333template <unsigned int tPixels>
334inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
335{
336 static_assert(tPixels >= 8u, "Invalid pixels!");
337
338 constexpr unsigned int tChannels = 3u;
339
340 ocean_assert(buffer != nullptr && meanValues != nullptr);
341
342 constexpr unsigned int blocks16 = tPixels / 16u;
343 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
344
345 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u && blocks16 >= 1u;
346 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
347
348 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
349 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
350
351 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
352 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
353
354 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
355
356 static_assert(blocks1 <= 2u, "Invalid block size!");
357
358 uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
359 uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
360 uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
361
362 uint32_t sumIndividual[3] = {0u};
363
364 for (unsigned int n = 0u; n < blocks16; ++n)
365 {
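 // vld3q_u8 de-interleaves 48 bytes (16 RGB pixels) into three 16-lane registers, one register per channel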
366 const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer);
367
368 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[0]));
369 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[1]));
370 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(buffer_u_8x16x3.val[2]));
371
372 buffer += 16u * tChannels;
373 }
374
375 if constexpr (partialBlock16)
376 {
377 static_assert(tPixels >= 16u, "We need to guarantee that loading 16 pixels' worth of data preceding the end boundary cannot cause a memory access violation");
378
379 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
380 ocean_assert(overlappingElements < 8u);
381
382 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
383 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
384 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1) << overlappingElements * 8u), vcreate_u8(uint64_t(-1)));
385
386 const uint8x16x3_t buffer_u_8x16x3 = vld3q_u8(buffer - overlappingElements * tChannels);
387
388 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[0], mask_u_8x16)));
389 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[1], mask_u_8x16)));
390 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(buffer_u_8x16x3.val[2], mask_u_8x16)));
391
392 buffer += remainingAfterBlocks16 * tChannels;
393 }
394
395 for (unsigned int n = 0u; n < blocks8; ++n)
396 {
397 const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer);
398
399 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[0]));
400 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[1]));
401 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(buffer_u_8x8x3.val[2]));
402
403 buffer += 8u * tChannels;
404 }
405
406 if constexpr (partialBlock8)
407 {
408 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
409 ocean_assert(overlappingElements < 8u);
410
411 const uint8x8_t mask_u_8x8 = vcreate_u8(uint64_t(-1) << overlappingElements * 8u);
412
413 const uint8x8x3_t buffer_u_8x8x3 = vld3_u8(buffer - overlappingElements * tChannels);
414
415 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[0], mask_u_8x8)));
416 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[1], mask_u_8x8)));
417 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(buffer_u_8x8x3.val[2], mask_u_8x8)));
418
419 buffer += remainingAfterBlocks8 * tChannels;
420 }
421
422 for (unsigned int n = 0u; n < blocks1; ++n)
423 {
424 sumIndividual[0] += buffer[tChannels * n + 0u];
425 sumIndividual[1] += buffer[tChannels * n + 1u];
426 sumIndividual[2] += buffer[tChannels * n + 2u];
427 }
428
429 uint32_t results[4];
430 vst1q_u32(results, sumChannel0_u_32x4);
431
432 const uint32_t sum0 = results[0] + results[1] + results[2] + results[3] + sumIndividual[0];
433 meanValues[0] = uint8_t((sum0 + tPixels / 2u) / tPixels);
434
435 vst1q_u32(results, sumChannel1_u_32x4);
436
437 const uint32_t sum1 = results[0] + results[1] + results[2] + results[3] + sumIndividual[1];
438 meanValues[1] = uint8_t((sum1 + tPixels / 2u) / tPixels);
439
440 vst1q_u32(results, sumChannel2_u_32x4);
441
442 const uint32_t sum2 = results[0] + results[1] + results[2] + results[3] + sumIndividual[2];
443 meanValues[2] = uint8_t((sum2 + tPixels / 2u) / tPixels);
444}
445
446template <unsigned int tChannels>
447template <unsigned int tPixels>
448inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
449{
450 static_assert(tChannels >= 1u, "Invalid channel number!");
451 static_assert(tPixels >= 1u, "Invalid buffer size!");
452
453 ocean_assert(buffer != nullptr && meanValues != nullptr);
454
455 uint32_t sum[tChannels] = {0u};
456
457 for (unsigned int n = 0u; n < tPixels; ++n)
458 {
459 for (unsigned int c = 0u; c < tChannels; ++c)
460 {
461 sum[c] += buffer[n * tChannels + c];
462 }
463 }
464
465 for (unsigned int c = 0u; c < tChannels; ++c)
466 {
467 meanValues[c] = uint8_t((sum[c] + tPixels / 2u) / tPixels);
468 }
469}
470
471template <>
472template <unsigned int tPatchSize>
473inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
474{
475 static_assert(tPatchSize >= 5u, "Invalid patch size!");
476
477 ocean_assert(patch != nullptr && meanValues != nullptr);
478
479 ocean_assert(patchStrideElements >= tPatchSize);
480
481 constexpr unsigned int blocks16 = tPatchSize / 16u;
482 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
483
484 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
485 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
486
487 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
488 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
489
490 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
491 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
492
493 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
494
495 static_assert(blocks1 <= 2u, "Invalid block size!");
496
497 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
498
499 uint32_t sumIndividual = 0u;
500
501 for (unsigned int y = 0u; y < tPatchSize; ++y)
502 {
503 for (unsigned int n = 0u; n < blocks16; ++n)
504 {
505 const uint8x16_t patch_u_8x16 = vld1q_u8(patch);
506
507 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
508
509 patch += 16;
510 }
511
512 if constexpr (partialBlock16)
513 {
514 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
515 ocean_assert(overlappingElements < 8u);
516
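 // for all rows but the last one, the 16-byte load may run past the end of the current patch row, as the
 // following rows of the same patch (and any padding between them) lie behind it in memory, so the trailing
 // overlapping lanes are masked out; for the last row, reading ahead could leave the image buffer, so the
 // load starts overlappingElements earlier and the leading (already processed) lanes are masked out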
517 if (y < tPatchSize - 1u)
518 {
519 // mask: |<- remainingAfterBlocks16 ->|<- overlapping ->|
520 // FF FF FF FF FF FF FF FF FF FF 00 00 00 00 00 00
521 constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
522 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
523
524 const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch), mask_u_8x16);
525
526 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
527 }
528 else
529 {
530 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
531 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
532 constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
533 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
534
535 const uint8x16_t patch_u_8x16 = vandq_u8(vld1q_u8(patch - overlappingElements), mask_u_8x16);
536
537 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
538 }
539
540 patch += remainingAfterBlocks16;
541 }
542
543 for (unsigned int n = 0u; n < blocks8; ++n)
544 {
545 const uint8x8_t patch_u_8x8 = vld1_u8(patch);
546
547 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
548
549 patch += 8;
550 }
551
552 if constexpr (partialBlock8)
553 {
554 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
555 ocean_assert(overlappingElements < 8u);
556
557 if (y < tPatchSize - 1u)
558 {
559 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
560 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
561
562 const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch), mask_u_8x8);
563
564 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
565 }
566 else
567 {
568 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
569 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
570
571 const uint8x8_t patch_u_8x8 = vand_u8(vld1_u8(patch - overlappingElements), mask_u_8x8);
572
573 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
574 }
575
576 patch += remainingAfterBlocks8;
577 }
578
579 if constexpr (blocks1 != 0u)
580 {
581 for (unsigned int n = 0u; n < blocks1; ++n)
582 {
583 sumIndividual += patch[n];
584 }
585
586 patch += blocks1;
587 }
588
589 patch += patchStrideElements - tPatchSize;
590 }
591
592 uint32_t results[4];
593 vst1q_u32(results, sum_u_32x4);
594
595 const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;
596
597 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
598}
599
600template <>
601template <unsigned int tPatchSize>
602inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
603{
604 static_assert(tPatchSize >= 5u, "Invalid patch size!");
605
606 constexpr unsigned int tChannels = 3u;
607
608 ocean_assert(patch != nullptr && meanValues != nullptr);
609
610 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
611
612 constexpr unsigned int blocks16 = tPatchSize / 16u;
613 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
614
615 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
616 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
617
618 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
619 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
620
621 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
622 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
623
624 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
625
626 static_assert(blocks1 <= 2u, "Invalid block size!");
627
628 uint32x4_t sumChannel0_u_32x4 = vdupq_n_u32(0u);
629 uint32x4_t sumChannel1_u_32x4 = vdupq_n_u32(0u);
630 uint32x4_t sumChannel2_u_32x4 = vdupq_n_u32(0u);
631
632 uint32_t sumIndividual[3] = {0u};
633
634 for (unsigned int y = 0u; y < tPatchSize; ++y)
635 {
636 for (unsigned int n = 0u; n < blocks16; ++n)
637 {
638 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
639
640 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[0]));
641 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[1]));
642 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(patch_u_8x16x3.val[2]));
643
644 patch += 16u * tChannels;
645 }
646
647 if constexpr (partialBlock16)
648 {
649 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
650 ocean_assert(overlappingElements < 8u);
651
652 if (y < tPatchSize - 1u)
653 {
654 // mask: |<- remainingAfterBlocks16 ->|<- overlapping ->|
655 // FF FF FF FF FF FF FF FF FF FF 00 00 00 00 00 00
656 constexpr uint64_t maskHigh = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
657 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(uint64_t(-1)), vcreate_u8(maskHigh));
658
659 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch);
660
661 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
662 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
663 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
664 }
665 else
666 {
667 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
668 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
669 constexpr uint64_t maskLow = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
670 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vcreate_u8(uint64_t(-1)));
671
672 const uint8x16x3_t patch_u_8x16x3 = vld3q_u8(patch - overlappingElements * tChannels);
673
674 sumChannel0_u_32x4 = vpadalq_u16(sumChannel0_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[0], mask_u_8x16)));
675 sumChannel1_u_32x4 = vpadalq_u16(sumChannel1_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[1], mask_u_8x16)));
676 sumChannel2_u_32x4 = vpadalq_u16(sumChannel2_u_32x4, vpaddlq_u8(vandq_u8(patch_u_8x16x3.val[2], mask_u_8x16)));
677 }
678
679 patch += remainingAfterBlocks16 * tChannels;
680 }
681
682 for (unsigned int n = 0u; n < blocks8; ++n)
683 {
684 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
685
686 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(patch_u_8x8x3.val[0]));
687 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(patch_u_8x8x3.val[1]));
688 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(patch_u_8x8x3.val[2]));
689
690 patch += 8u * tChannels;
691 }
692
693 if constexpr (partialBlock8)
694 {
695 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
696 ocean_assert(overlappingElements < 8u);
697
698 if (y < tPatchSize - 1u)
699 {
700 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 8u);
701 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
702
703 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch);
704
705 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
706 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
707 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
708 }
709 else
710 {
711 constexpr uint64_t mask = overlappingElements >= 8u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 8u);
712 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
713
714 const uint8x8x3_t patch_u_8x8x3 = vld3_u8(patch - overlappingElements * tChannels);
715
716 sumChannel0_u_32x4 = vaddw_u16(sumChannel0_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[0], mask_u_8x8)));
717 sumChannel1_u_32x4 = vaddw_u16(sumChannel1_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[1], mask_u_8x8)));
718 sumChannel2_u_32x4 = vaddw_u16(sumChannel2_u_32x4, vpaddl_u8(vand_u8(patch_u_8x8x3.val[2], mask_u_8x8)));
719 }
720
721 patch += remainingAfterBlocks8 * tChannels;
722 }
723
724 if constexpr (blocks1 != 0u)
725 {
726 for (unsigned int n = 0u; n < blocks1; ++n)
727 {
728 sumIndividual[0] += patch[tChannels * n + 0u];
729 sumIndividual[1] += patch[tChannels * n + 1u];
730 sumIndividual[2] += patch[tChannels * n + 2u];
731 }
732
733 patch += blocks1 * tChannels;
734 }
735
736 patch += patchStrideElements - tChannels * tPatchSize;
737 }
738
739 uint32_t results[4];
740 vst1q_u32(results, sumChannel0_u_32x4);
741
742 const uint32_t sum0 = results[0] + results[1] + results[2] + results[3] + sumIndividual[0];
743 meanValues[0] = uint8_t((sum0 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
744
745 vst1q_u32(results, sumChannel1_u_32x4);
746
747 const uint32_t sum1 = results[0] + results[1] + results[2] + results[3] + sumIndividual[1];
748 meanValues[1] = uint8_t((sum1 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
749
750 vst1q_u32(results, sumChannel2_u_32x4);
751
752 const uint32_t sum2 = results[0] + results[1] + results[2] + results[3] + sumIndividual[2];
753 meanValues[2] = uint8_t((sum2 + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
754}
755
756template <unsigned int tChannels>
757template <unsigned int tPatchSize>
758inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
759{
760 static_assert(tChannels >= 1u, "Invalid channel number!");
761 static_assert(tPatchSize >= 1u, "Invalid patch size!");
762
763 ocean_assert(patch != nullptr && meanValues != nullptr);
764
765 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
766
767 uint32_t sum[tChannels] = {0u};
768
769 for (unsigned int y = 0u; y < tPatchSize; ++y)
770 {
771 for (unsigned int x = 0u; x < tPatchSize; ++x)
772 {
773 for (unsigned int n = 0u; n < tChannels; ++n)
774 {
775 sum[n] += patch[x * tChannels + n];
776 }
777 }
778
779 patch += patchStrideElements;
780 }
781
782 for (unsigned int n = 0u; n < tChannels; ++n)
783 {
784 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
785 }
786}
787
788template <>
789template <unsigned int tPatchSize>
790inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues)
791{
792 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
793
794 ocean_assert(image != nullptr && meanValues != nullptr);
795 ocean_assert(centerX < width && centerY < height);
796
797 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
798
799 const unsigned int imageStrideElements = width + imagePaddingElements;
800
801 constexpr unsigned int blocks16 = tPatchSize / 16u;
802 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
803
804 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
805 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
806
807 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
808 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
809
810 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
811 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
812
813 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
814
815 static_assert(blocks1 <= 7u, "Invalid block size!");
816
817 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
818
819 uint32_t sumIndividual = 0u;
820
821 uint8_t intermediate[16];
822
823 for (int y = int(centerY) - int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
824 {
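 // CVUtilities::mirrorIndex() maps a row index outside [0, height) back into the image by mirroring at the border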
825 const uint8_t* const mirroredRow = image + CVUtilities::mirrorIndex(y, height) * imageStrideElements;
826
827 int x = int(centerX) - int(tPatchSize_2);
828
829 for (unsigned int n = 0u; n < blocks16; ++n)
830 {
831 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow, x, width, intermediate);
832
833 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
834
835 x += 16;
836 }
837
838 if constexpr (partialBlock16)
839 {
840 if (y < int(centerY) + int(tPatchSize_2))
841 {
842 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);
843
844 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
845 }
846 else
847 {
848 const uint8x16_t patch_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, true>(mirroredRow, x, width, intermediate);
849
850 sum_u_32x4 = vpadalq_u16(sum_u_32x4, vpaddlq_u8(patch_u_8x16));
851 }
852
853 x += remainingAfterBlocks16;
854 }
855
856 for (unsigned int n = 0u; n < blocks8; ++n)
857 {
858 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow, x, width, intermediate);
859
860 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
861
862 x += 8;
863 }
864
865 if constexpr (partialBlock8)
866 {
867 if (y < int(centerY) + int(tPatchSize_2))
868 {
869 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);
870
871 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
872 }
873 else
874 {
875 const uint8x8_t patch_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, true>(mirroredRow, x, width, intermediate);
876
877 sum_u_32x4 = vaddw_u16(sum_u_32x4, vpaddl_u8(patch_u_8x8));
878 }
879
880 x += remainingAfterBlocks8;
881 }
882
883 if constexpr (blocks1 != 0u)
884 {
885 for (unsigned int n = 0u; n < blocks1; ++n)
886 {
887 const unsigned int index = CVUtilities::mirrorIndex(x, width);
888
889 sumIndividual += mirroredRow[index];
890
891 x++;
892 }
893 }
894 }
895
896 uint32_t results[4];
897 vst1q_u32(results, sum_u_32x4);
898
899 const uint32_t sum = results[0] + results[1] + results[2] + results[3] + sumIndividual;
900
901 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
902}
903
904template <unsigned int tChannels>
905template <unsigned int tPatchSize>
906inline void ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::mean8BitPerChannelMirroredBorder(const uint8_t* const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t* const meanValues)
907{
908 static_assert(tChannels >= 1u, "Invalid channel number!");
909 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
910
911 ocean_assert(image != nullptr && meanValues != nullptr);
912 ocean_assert(centerX < width && centerY < height);
913
914 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
915
916 const unsigned int imageStrideElements = width * tChannels + imagePaddingElements;
917
918 uint32_t sum[tChannels] = {0u};
919
920 for (int y = int(centerY) - int(tPatchSize_2); y <= int(centerY) + int(tPatchSize_2); ++y)
921 {
922 const uint8_t* const mirroredRow = image + CVUtilities::mirrorIndex(y, height) * imageStrideElements;
923
924 for (int x = int(centerX) - int(tPatchSize_2); x <= int(centerX) + int(tPatchSize_2); ++x)
925 {
926 const uint8_t* const pixel = mirroredRow + CVUtilities::mirrorIndex(x, width) * tChannels;
927
928 for (unsigned int c = 0u; c < tChannels; ++c)
929 {
930 sum[c] += pixel[c];
931 }
932 }
933 }
934
935 for (unsigned int n = 0u; n < tChannels; ++n)
936 {
937 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
938 }
939}
940
941template <>
942template <unsigned int tPixels>
943inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
944{
945 static_assert(tPixels >= 8u, "Invalid pixels!");
946
947 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
948 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
949
950 constexpr unsigned int blocks16 = tPixels / 16u;
951 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
952
953 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
954 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
955
956 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
957 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
958
959 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
960 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
961
962 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
963
964 static_assert(blocks1 <= 2u, "Invalid block size!");
965
966 // [(buffer0 - mean0) - (buffer1 - mean1)]^2
967 // [buffer0 - buffer1 - mean0 + mean1]^2
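 // the rearranged form needs only one per-pixel subtraction (buffer0 - buffer1, fitting into int16) which is
 // combined with the constant (mean0 - mean1) via vabdq_s16; the absolute difference is then squared and
 // accumulated with the unsigned widening multiply-accumulate vmlal_u16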
968
969 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
970
971 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
972 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
973
974 uint32_t sumIndividual = 0u;
975
976 for (unsigned int n = 0u; n < blocks16; ++n)
977 {
978 const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0);
979 const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1);
980
981 const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16))); // low 8 bytes: buffer0 - buffer1
982 const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16))); // high 8 bytes: buffer0 - buffer1
983
984 const uint16x8_t buffer_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
985 const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));
986
987 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
988 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));
989
990 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
991 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));
992
993 buffer0 += 16;
994 buffer1 += 16;
995 }
996
997 if constexpr (partialBlock16)
998 {
999 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1000 ocean_assert(overlappingElements < 8u);
1001
1002 const uint8x16_t buffer0_u_8x16 = vld1q_u8(buffer0 - overlappingElements);
1003 const uint8x16_t buffer1_u_8x16 = vld1q_u8(buffer1 - overlappingElements);
1004
1005 const int16x8_t bufferLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16), vget_low_u8(buffer1_u_8x16))); // low 8 bytes: buffer0 - buffer1
1006 const int16x8_t bufferHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16), vget_high_u8(buffer1_u_8x16))); // high 8 bytes: buffer0 - buffer1
1007
1008 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1009 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1010
1011 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1012
1013 const uint16x8_t buffer_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1014 const uint16x8_t buffer_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferHigh0_1_s_16x8, mean0_1_s_16x8));
1015
1016 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_low_u_16x8), vget_low_u16(buffer_mean_low_u_16x8));
1017 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_low_u_16x8), vget_high_u16(buffer_mean_low_u_16x8));
1018
1019 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_high_u_16x8), vget_low_u16(buffer_mean_high_u_16x8));
1020 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_high_u_16x8), vget_high_u16(buffer_mean_high_u_16x8));
1021
1022 buffer0 += remainingAfterBlocks16;
1023 buffer1 += remainingAfterBlocks16;
1024 }
1025
1026 for (unsigned int n = 0u; n < blocks8; ++n)
1027 {
1028 const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0);
1029 const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1);
1030
1031 const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8)); // buffer0 - buffer1
1032
1033 const uint16x8_t buffer_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1034
1035 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
1036 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));
1037
1038 buffer0 += 8;
1039 buffer1 += 8;
1040 }
1041
1042 if constexpr (partialBlock8)
1043 {
1044 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1045 ocean_assert(overlappingElements < 8u);
1046
1047 const uint8x8_t buffer0_u_8x8 = vld1_u8(buffer0 - overlappingElements);
1048 const uint8x8_t buffer1_u_8x8 = vld1_u8(buffer1 - overlappingElements);
1049
1050 const int16x8_t buffer0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8, buffer1_u_8x8)); // buffer0 - buffer1
1051
1052 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1053 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1054
1055 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1056
1057 const uint16x8_t buffer_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(buffer0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1058
1059 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(buffer_mean_u_16x8), vget_low_u16(buffer_mean_u_16x8));
1060 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(buffer_mean_u_16x8), vget_high_u16(buffer_mean_u_16x8));
1061
1062 buffer0 += remainingAfterBlocks8;
1063 buffer1 += remainingAfterBlocks8;
1064 }
1065
1066 if constexpr (blocks1 != 0u)
1067 {
1068 for (unsigned int n = 0u; n < blocks1; ++n)
1069 {
1070 sumIndividual += sqrDistance(int16_t(buffer0[n] - meanValues0[0]), int16_t(buffer1[n] - meanValues1[0]));
1071 }
1072 }
1073
1074 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1075
1076 uint32_t results[4];
1077 vst1q_u32(results, sum_u_32x4);
1078
1079 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1080}
1081
1082template <>
1083template <unsigned int tPixels>
1084inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1085{
1086 static_assert(tPixels >= 8u, "Invalid pixels!");
1087
1088 constexpr unsigned int tChannels = 3u;
1089
1090 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1091 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1092
1093 constexpr unsigned int blocks16 = tPixels / 16u;
1094 constexpr unsigned int remainingAfterBlocks16 = tPixels % 16u;
1095
1096 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1097 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1098
1099 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1100 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1101
1102 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1103 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1104
1105 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1106
1107 static_assert(blocks1 <= 2u, "Invalid block size!");
1108
1109 // [(buffer0 - mean0) - (buffer1 - mean1)]^2
1110 // [buffer0 - buffer1 - mean0 + mean1]^2
1111
1112 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1113 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1114 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
1115
1116 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1117 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1118
1119 uint32_t sumIndividual = 0u;
1120
1121 for (unsigned int n = 0u; n < blocks16; ++n)
1122 {
1123 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0);
1124 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1);
1125
1126 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0]))); // low 8 bytes: buffer0 - buffer1
1127 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0]))); // high 8 bytes: buffer0 - buffer1
1128
1129 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1130 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1131
1132 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1133 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1134
1135
1136 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(buffer0 - buffer1) - (mean1 - mean0)|, with range [0, 255 * 255]
1137 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1138
1139 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1140 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1141
1142 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1143 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1144
1145
1146 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1147 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1148 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1149 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1150
1151 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1152 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1153 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1154 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1155
1156 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1157 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1158 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1159 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1160
1161
1162 buffer0 += 16u * tChannels;
1163 buffer1 += 16u * tChannels;
1164 }
1165
1166 if constexpr (partialBlock16)
1167 {
1168 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1169 ocean_assert(overlappingElements < 8u);
1170
1171 const uint8x16x3_t buffer0_u_8x16x3 = vld3q_u8(buffer0 - overlappingElements * tChannels);
1172 const uint8x16x3_t buffer1_u_8x16x3 = vld3q_u8(buffer1 - overlappingElements * tChannels);
1173
1174
1175 const int16x8_t bufferChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[0]), vget_low_u8(buffer1_u_8x16x3.val[0]))); // low 8 bytes: buffer0 - buffer1
1176 const int16x8_t bufferChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[0]), vget_high_u8(buffer1_u_8x16x3.val[0]))); // high 8 bytes: buffer0 - buffer1
1177
1178 const int16x8_t bufferChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[1]), vget_low_u8(buffer1_u_8x16x3.val[1])));
1179 const int16x8_t bufferChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[1]), vget_high_u8(buffer1_u_8x16x3.val[1])));
1180
1181 const int16x8_t bufferChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(buffer0_u_8x16x3.val[2]), vget_low_u8(buffer1_u_8x16x3.val[2])));
1182 const int16x8_t bufferChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(buffer0_u_8x16x3.val[2]), vget_high_u8(buffer1_u_8x16x3.val[2])));
1183
1184
1185 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1186 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
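// Each 16-bit lane of the mask corresponds to one widened pixel difference: maskLow covers lanes 0-3, maskHigh lanes 4-7.
// Because the loads above were shifted back by overlappingElements pixels, the first overlappingElements lanes repeat
// elements before the current position, so they are zeroed out before the squared values are accumulated.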
1187
1188 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1189
1190
1191 const uint16x8_t bufferChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(buffer0 - mean0) - (buffer1 - mean1)|, with range [0, 510]
1192 const uint16x8_t bufferChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1193
1194 const uint16x8_t bufferChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1195 const uint16x8_t bufferChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1196
1197 const uint16x8_t bufferChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1198 const uint16x8_t bufferChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1199
1200
1201 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_low_u_16x8), vget_low_u16(bufferChannel0_mean_low_u_16x8));
1202 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_low_u_16x8), vget_high_u16(bufferChannel0_mean_low_u_16x8));
1203 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_high_u_16x8), vget_low_u16(bufferChannel0_mean_high_u_16x8));
1204 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_high_u_16x8), vget_high_u16(bufferChannel0_mean_high_u_16x8));
1205
1206 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_low_u_16x8), vget_low_u16(bufferChannel1_mean_low_u_16x8));
1207 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_low_u_16x8), vget_high_u16(bufferChannel1_mean_low_u_16x8));
1208 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_high_u_16x8), vget_low_u16(bufferChannel1_mean_high_u_16x8));
1209 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_high_u_16x8), vget_high_u16(bufferChannel1_mean_high_u_16x8));
1210
1211 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_low_u_16x8), vget_low_u16(bufferChannel2_mean_low_u_16x8));
1212 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_low_u_16x8), vget_high_u16(bufferChannel2_mean_low_u_16x8));
1213 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_high_u_16x8), vget_low_u16(bufferChannel2_mean_high_u_16x8));
1214 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_high_u_16x8), vget_high_u16(bufferChannel2_mean_high_u_16x8));
1215
1216 buffer0 += remainingAfterBlocks16 * tChannels;
1217 buffer1 += remainingAfterBlocks16 * tChannels;
1218 }
1219
1220 for (unsigned int n = 0u; n < blocks8; ++n)
1221 {
1222 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0);
1223 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1);
1224
1225 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0])); // buffer0 - buffer1
1226 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1227 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1228
1229 const uint16x8_t bufferChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(buffer0 - mean0) - (buffer1 - mean1)|, with range [0, 510]
1230 const uint16x8_t bufferChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1231 const uint16x8_t bufferChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1232
1233 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1234 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1235
1236 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1237 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1238
1239 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1240 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1241
1242 buffer0 += 8u * tChannels;
1243 buffer1 += 8u * tChannels;
1244 }
1245
1246 if constexpr (partialBlock8)
1247 {
1248 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1249 ocean_assert(overlappingElements < 8u);
1250
1251 const uint8x8x3_t buffer0_u_8x8x3 = vld3_u8(buffer0 - overlappingElements * tChannels);
1252 const uint8x8x3_t buffer1_u_8x8x3 = vld3_u8(buffer1 - overlappingElements * tChannels);
1253
1254 const int16x8_t bufferChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[0], buffer1_u_8x8x3.val[0])); // buffer0 - buffer1
1255 const int16x8_t bufferChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[1], buffer1_u_8x8x3.val[1]));
1256 const int16x8_t bufferChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(buffer0_u_8x8x3.val[2], buffer1_u_8x8x3.val[2]));
1257
1258 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1259 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1260
1261 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1262
1263 const uint16x8_t bufferChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(buffer0 - mean0) - (buffer1 - mean1)|, with range [0, 510]
1264 const uint16x8_t bufferChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1265 const uint16x8_t bufferChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(bufferChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1266
1267 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel0_mean_u_16x8), vget_low_u16(bufferChannel0_mean_u_16x8));
1268 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel0_mean_u_16x8), vget_high_u16(bufferChannel0_mean_u_16x8));
1269
1270 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel1_mean_u_16x8), vget_low_u16(bufferChannel1_mean_u_16x8));
1271 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel1_mean_u_16x8), vget_high_u16(bufferChannel1_mean_u_16x8));
1272
1273 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(bufferChannel2_mean_u_16x8), vget_low_u16(bufferChannel2_mean_u_16x8));
1274 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(bufferChannel2_mean_u_16x8), vget_high_u16(bufferChannel2_mean_u_16x8));
1275
1276 buffer0 += remainingAfterBlocks8 * tChannels;
1277 buffer1 += remainingAfterBlocks8 * tChannels;
1278 }
1279
1280 if constexpr (blocks1 != 0u)
1281 {
1282 for (unsigned int n = 0u; n < blocks1; ++n)
1283 {
1284 for (unsigned int c = 0u; c < tChannels; ++c)
1285 {
1286 sumIndividual += sqrDistance(int16_t(buffer0[n * tChannels + c] - meanValues0[c]), int16_t(buffer1[n * tChannels + c] - meanValues1[c]));
1287 }
1288 }
1289
1290 buffer0 += blocks1 * tChannels;
1291 buffer1 += blocks1 * tChannels;
1292 }
1293
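// Horizontal reduction: the two partial accumulators are combined, the four 32-bit lanes are written to memory and
// summed, and the scalar sum of the leftover pixels is added to obtain the final zero-mean SSD.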
1294 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1295
1296 uint32_t results[4];
1297 vst1q_u32(results, sum_u_32x4);
1298
1299 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1300}
1301
1302template <unsigned int tChannels>
1303template <unsigned int tPixels>
1304inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1305{
1306 static_assert(tChannels >= 1u, "Invalid channel number!");
1307 static_assert(tPixels >= 1u, "Invalid pixels!");
1308
1309 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1310 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1311
1312 uint32_t zmssd = 0u;
1313
1314 for (unsigned int x = 0u; x < tPixels; ++x)
1315 {
1316 for (unsigned int c = 0u; c < tChannels; ++c)
1317 {
1318 zmssd += sqrDistance(buffer0[x * tChannels + c] - meanValues0[c], buffer1[x * tChannels + c] - meanValues1[c]);
1319 }
1320 }
1321
1322 return zmssd;
1323}
1324
1325template <>
1326template <unsigned int tPatchSize>
1327inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1328{
1329 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1330
1331 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1332 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1333
1334 ocean_assert(patch0StrideElements >= tPatchSize);
1335 ocean_assert(patch1StrideElements >= tPatchSize);
1336
1337 constexpr unsigned int blocks16 = tPatchSize / 16u;
1338 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1339
1340 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1341 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1342
1343 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1344 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1345
1346 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1347 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1348
1349 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1350
1351 static_assert(blocks1 <= 2u, "Invalid block size!");
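// Each patch row of tPatchSize pixels is processed as: full 16-pixel NEON blocks, optionally one partial 16-pixel
// block (handled with a shifted and/or masked load), full 8-pixel blocks, optionally one partial 8-pixel block, and
// at most two leftover pixels which are handled with scalar code and accumulated in sumIndividual.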
1352
1353 // [(patch0 - mean0) - (patch1 - mean1)]^2
1354 // [patch0 - patch1 - mean0 + mean1]^2
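// Rewriting the term this way lets the constant (mean0 - mean1) be broadcast into a single vector once, so each block
// only needs a widening subtraction patch0 - patch1, an absolute difference against that constant vector, and a
// multiply-accumulate of the result with itself.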
1355
1356 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1357
1358 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1359 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1360
1361 uint32_t sumIndividual = 0u;
1362
1363 for (unsigned int y = 0u; y < tPatchSize; ++y)
1364 {
1365 for (unsigned int n = 0u; n < blocks16; ++n)
1366 {
1367 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
1368 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);
1369
1370 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
1371 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1
1372
1373 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1374 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1375
1376 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1377 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1378
1379 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1380 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1381
1382 patch0 += 16;
1383 patch1 += 16;
1384 }
1385
1386 if constexpr (partialBlock16)
1387 {
1388 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1389 ocean_assert(overlappingElements < 8u);
1390
1391 if (y < tPatchSize - 1u)
1392 {
1393 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0);
1394 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1);
1395
1396 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
1397 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1
1398
1399 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
1400 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
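// The diagram shows the 128-bit mask with the most significant byte on the left: the remainingAfterBlocks16 valid
// lanes are kept, the trailing overlappingElements lanes are zeroed. For every row except the last one, the 16-element
// load may run past the end of the current patch row; the read stays inside the frame because at least one more row
// follows, and the surplus lanes are simply masked out. Only the last row uses the shifted-back load in the else branch.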
1401
1402 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1403 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1404
1405 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1406
1407 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1408 const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1409
1410 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1411 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1412
1413 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1414 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1415 }
1416 else
1417 {
1418 const uint8x16_t patch0_u_8x16 = vld1q_u8(patch0 - overlappingElements);
1419 const uint8x16_t patch1_u_8x16 = vld1q_u8(patch1 - overlappingElements);
1420
1421 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
1422 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1
1423
1424 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1425 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1426
1427 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1428
1429 const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1430 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1431
1432 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1433 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1434
1435 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1436 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1437 }
1438
1439 patch0 += remainingAfterBlocks16;
1440 patch1 += remainingAfterBlocks16;
1441 }
1442
1443 for (unsigned int n = 0u; n < blocks8; ++n)
1444 {
1445 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
1446 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);
1447
1448 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1
1449
1450 const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1451
1452 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1453 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
1454
1455 patch0 += 8;
1456 patch1 += 8;
1457 }
1458
1459 if constexpr (partialBlock8)
1460 {
1461 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1462 ocean_assert(overlappingElements < 8u);
1463
1464 if (y < tPatchSize - 1u)
1465 {
1466 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0);
1467 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1);
1468
1469 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1
1470
1471 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1472 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1473
1474 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1475
1476 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1477
1478 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1479 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
1480 }
1481 else
1482 {
1483 const uint8x8_t patch0_u_8x8 = vld1_u8(patch0 - overlappingElements);
1484 const uint8x8_t patch1_u_8x8 = vld1_u8(patch1 - overlappingElements);
1485
1486 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1
1487
1488 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1489 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1490
1491 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1492
1493 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1494
1495 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
1496 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
1497 }
1498
1499 patch0 += remainingAfterBlocks8;
1500 patch1 += remainingAfterBlocks8;
1501 }
1502
1503 if constexpr (blocks1 != 0u)
1504 {
1505 for (unsigned int n = 0u; n < blocks1; ++n)
1506 {
1507 sumIndividual += sqrDistance(int16_t(patch0[n] - meanValues0[0]), int16_t(patch1[n] - meanValues1[0]));
1508 }
1509
1510 patch0 += blocks1;
1511 patch1 += blocks1;
1512 }
1513
1514 patch0 += patch0StrideElements - tPatchSize;
1515 patch1 += patch1StrideElements - tPatchSize;
1516 }
1517
1518 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1519
1520 uint32_t results[4];
1521 vst1q_u32(results, sum_u_32x4);
1522
1523 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1524}
1525
1526template <>
1527template <unsigned int tPatchSize>
1528inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<3u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1529{
1530 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1531
1532 constexpr unsigned int tChannels = 3u;
1533
1534 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1535 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1536
1537 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1538 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1539
1540 constexpr unsigned int blocks16 = tPatchSize / 16u;
1541 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1542
1543 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1544 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1545
1546 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1547 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1548
1549 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1550 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1551
1552 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1553
1554 static_assert(blocks1 <= 2u, "Invalid block size!");
1555
1556 // [(patch0 - mean0) - (patch1 - mean1)]^2
1557 // [patch0 - patch1 - mean0 + mean1]^2
1558
1559 const int16x8_t meanChannel0_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1560 const int16x8_t meanChannel1_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[1]) - int16_t(meanValues1[1]));
1561 const int16x8_t meanChannel2_0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[2]) - int16_t(meanValues1[2]));
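// vld3q_u8 de-interleaves 48 bytes into three 16-byte registers, one per channel, so each channel can be compared
// against its own constant mean-difference vector.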
1562
1563 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1564 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1565
1566 uint32_t sumIndividual = 0u;
1567
1568 for (unsigned int y = 0u; y < tPatchSize; ++y)
1569 {
1570 for (unsigned int n = 0u; n < blocks16; ++n)
1571 {
1572 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1573 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1574
1575 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1576 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1577
1578 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1579 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1580
1581 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1582 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1583
1584
1585 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1586 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1587
1588 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1589 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1590
1591 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1592 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1593
1594
1595 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1596 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1597 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1598 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1599
1600 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1601 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1602 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1603 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1604
1605 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1606 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1607 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1608 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1609
1610
1611 patch0 += 16u * tChannels;
1612 patch1 += 16u * tChannels;
1613 }
1614
1615 if constexpr (partialBlock16)
1616 {
1617 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1618 ocean_assert(overlappingElements < 8u);
1619
1620 if (y < tPatchSize - 1u)
1621 {
1622 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0);
1623 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1);
1624
1625
1626 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1627 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1628
1629 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1630 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1631
1632 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1633 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1634
1635
1636 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
1637 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
1638
1639 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1640 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1641
1642 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1643
1644
1645 const uint16x8_t patchChannel0_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1646 const uint16x8_t patchChannel0_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8);
1647
1648 const uint16x8_t patchChannel1_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8));
1649 const uint16x8_t patchChannel1_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1650
1651 const uint16x8_t patchChannel2_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8));
1652 const uint16x8_t patchChannel2_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1653
1654
1655 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1656 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1657 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1658 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1659
1660 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1661 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1662 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1663 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1664
1665 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1666 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1667 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1668 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1669 }
1670 else
1671 {
1672 const uint8x16x3_t patch0_u_8x16x3 = vld3q_u8(patch0 - overlappingElements * tChannels);
1673 const uint8x16x3_t patch1_u_8x16x3 = vld3q_u8(patch1 - overlappingElements * tChannels);
1674
1675
1676 const int16x8_t patchChannel0Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[0]), vget_low_u8(patch1_u_8x16x3.val[0]))); // low 8 bytes: patch0 - patch1
1677 const int16x8_t patchChannel0High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[0]), vget_high_u8(patch1_u_8x16x3.val[0]))); // high 8 bytes: patch0 - patch1
1678
1679 const int16x8_t patchChannel1Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[1]), vget_low_u8(patch1_u_8x16x3.val[1])));
1680 const int16x8_t patchChannel1High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[1]), vget_high_u8(patch1_u_8x16x3.val[1])));
1681
1682 const int16x8_t patchChannel2Low0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16x3.val[2]), vget_low_u8(patch1_u_8x16x3.val[2])));
1683 const int16x8_t patchChannel2High0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16x3.val[2]), vget_high_u8(patch1_u_8x16x3.val[2])));
1684
1685
1686 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1687 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1688
1689 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1690
1691
1692 const uint16x8_t patchChannel0_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0Low0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1693 const uint16x8_t patchChannel0_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0High0_1_s_16x8, meanChannel0_0_1_s_16x8));
1694
1695 const uint16x8_t patchChannel1_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1Low0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1696 const uint16x8_t patchChannel1_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1High0_1_s_16x8, meanChannel1_0_1_s_16x8));
1697
1698 const uint16x8_t patchChannel2_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2Low0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1699 const uint16x8_t patchChannel2_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2High0_1_s_16x8, meanChannel2_0_1_s_16x8));
1700
1701
1702 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_low_u_16x8), vget_low_u16(patchChannel0_mean_low_u_16x8));
1703 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_low_u_16x8), vget_high_u16(patchChannel0_mean_low_u_16x8));
1704 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_high_u_16x8), vget_low_u16(patchChannel0_mean_high_u_16x8));
1705 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_high_u_16x8), vget_high_u16(patchChannel0_mean_high_u_16x8));
1706
1707 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_low_u_16x8), vget_low_u16(patchChannel1_mean_low_u_16x8));
1708 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_low_u_16x8), vget_high_u16(patchChannel1_mean_low_u_16x8));
1709 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_high_u_16x8), vget_low_u16(patchChannel1_mean_high_u_16x8));
1710 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_high_u_16x8), vget_high_u16(patchChannel1_mean_high_u_16x8));
1711
1712 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_low_u_16x8), vget_low_u16(patchChannel2_mean_low_u_16x8));
1713 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_low_u_16x8), vget_high_u16(patchChannel2_mean_low_u_16x8));
1714 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_high_u_16x8), vget_low_u16(patchChannel2_mean_high_u_16x8));
1715 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_high_u_16x8), vget_high_u16(patchChannel2_mean_high_u_16x8));
1716 }
1717
1718 patch0 += remainingAfterBlocks16 * tChannels;
1719 patch1 += remainingAfterBlocks16 * tChannels;
1720 }
1721
1722 for (unsigned int n = 0u; n < blocks8; ++n)
1723 {
1724 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1725 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1726
1727 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1728 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1729 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1730
1731 const uint16x8_t patchChannel0_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1732 const uint16x8_t patchChannel1_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8));
1733 const uint16x8_t patchChannel2_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8));
1734
1735 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1736 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1737
1738 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1739 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1740
1741 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1742 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1743
1744 patch0 += 8u * tChannels;
1745 patch1 += 8u * tChannels;
1746 }
1747
1748 if constexpr (partialBlock8)
1749 {
1750 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
1751 ocean_assert(overlappingElements < 8u);
1752
1753 if (y < tPatchSize - 1u)
1754 {
1755 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0);
1756 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1);
1757
1758 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1759 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1760 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1761
1762 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1763 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1764
1765 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1766
1767 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1768 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1769 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1770
1771 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1772 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1773
1774 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1775 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1776
1777 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1778 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1779 }
1780 else
1781 {
1782 const uint8x8x3_t patch0_u_8x8x3 = vld3_u8(patch0 - overlappingElements * tChannels);
1783 const uint8x8x3_t patch1_u_8x8x3 = vld3_u8(patch1 - overlappingElements * tChannels);
1784
1785 const int16x8_t patchChannel0_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[0], patch1_u_8x8x3.val[0])); // patch0 - patch1
1786 const int16x8_t patchChannel1_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[1], patch1_u_8x8x3.val[1]));
1787 const int16x8_t patchChannel2_0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8x3.val[2], patch1_u_8x8x3.val[2]));
1788
1789 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1790 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1791
1792 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1793
1794 const uint16x8_t patchChannel0_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel0_0_1_s_16x8, meanChannel0_0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1795 const uint16x8_t patchChannel1_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel1_0_1_s_16x8, meanChannel1_0_1_s_16x8)), mask_u_16x8);
1796 const uint16x8_t patchChannel2_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchChannel2_0_1_s_16x8, meanChannel2_0_1_s_16x8)), mask_u_16x8);
1797
1798 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel0_mean_u_16x8), vget_low_u16(patchChannel0_mean_u_16x8));
1799 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel0_mean_u_16x8), vget_high_u16(patchChannel0_mean_u_16x8));
1800
1801 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel1_mean_u_16x8), vget_low_u16(patchChannel1_mean_u_16x8));
1802 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel1_mean_u_16x8), vget_high_u16(patchChannel1_mean_u_16x8));
1803
1804 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patchChannel2_mean_u_16x8), vget_low_u16(patchChannel2_mean_u_16x8));
1805 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patchChannel2_mean_u_16x8), vget_high_u16(patchChannel2_mean_u_16x8));
1806 }
1807
1808 patch0 += remainingAfterBlocks8 * tChannels;
1809 patch1 += remainingAfterBlocks8 * tChannels;
1810 }
1811
1812 if constexpr (blocks1 != 0u)
1813 {
1814 for (unsigned int n = 0u; n < blocks1; ++n)
1815 {
1816 for (unsigned int c = 0u; c < tChannels; ++c)
1817 {
1818 sumIndividual += sqrDistance(int16_t(patch0[n * tChannels + c] - meanValues0[c]), int16_t(patch1[n * tChannels + c] - meanValues1[c]));
1819 }
1820 }
1821
1822 patch0 += blocks1 * tChannels;
1823 patch1 += blocks1 * tChannels;
1824 }
1825
1826 patch0 += patch0StrideElements - tPatchSize * tChannels;
1827 patch1 += patch1StrideElements - tPatchSize * tChannels;
1828 }
1829
1830 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
1831
1832 uint32_t results[4];
1833 vst1q_u32(results, sum_u_32x4);
1834
1835 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
1836}
1837
1838template <unsigned int tChannels>
1839template <unsigned int tPatchSize>
1840inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1841{
1842 static_assert(tChannels >= 1u, "Invalid channel number!");
1843 static_assert(tPatchSize >= 1u, "Invalid patch size!");
1844
1845 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1846 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1847
1848 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1849 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1850
1851 uint32_t zmssd = 0u;
1852
1853 for (unsigned int y = 0u; y < tPatchSize; ++y)
1854 {
1855 for (unsigned int x = 0u; x < tPatchSize; ++x)
1856 {
1857 for (unsigned int n = 0u; n < tChannels; ++n)
1858 {
1859 zmssd += sqrDistance(patch0[x * tChannels + n] - meanValues0[n], patch1[x * tChannels + n] - meanValues1[n]);
1860 }
1861 }
1862
1863 patch0 += patch0StrideElements;
1864 patch1 += patch1StrideElements;
1865 }
1866
1867 return zmssd;
1868}
1869
1870template <>
1871template <unsigned int tPatchSize>
1872inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<1u>::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1873{
1874 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
1875 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1876
1877 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
1878
1879 ocean_assert(image0 != nullptr && image1 != nullptr);
1880 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1881
1882 ocean_assert(centerX0 < width0 && centerY0 < height0);
1883 ocean_assert(centerX1 < width1 && centerY1 < height1);
1884
1885 const unsigned int image0StrideElements = width0 + image0PaddingElements;
1886 const unsigned int image1StrideElements = width1 + image1PaddingElements;
1887
1888 constexpr unsigned int blocks16 = tPatchSize / 16u;
1889 constexpr unsigned int remainingAfterBlocks16 = tPatchSize % 16u;
1890
1891 constexpr bool partialBlock16 = remainingAfterBlocks16 > 10u;
1892 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
1893
1894 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
1895 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
1896
1897 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
1898 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
1899
1900 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
1901
1902 static_assert(blocks1 <= 2u, "Invalid block size!");
1903
1904 // [(patch0 - mean0) - (patch1 - mean1)]^2
1905 // [patch0 - patch1 - mean0 + mean1]^2
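// Same decomposition as above; the only difference is that rows and columns outside the image are mapped back into
// the image via CVUtilities::mirrorIndex(), and the loadMirrored_u_8x16/_u_8x8 helpers gather the (possibly mirrored)
// pixels before the vector arithmetic.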
1906
1907 const int16x8_t mean0_1_s_16x8 = vdupq_n_s16(int16_t(meanValues0[0]) - int16_t(meanValues1[0]));
1908
1909 uint32x4_t sumA_u_32x4 = vdupq_n_u32(0u);
1910 uint32x4_t sumB_u_32x4 = vdupq_n_u32(0u);
1911
1912 uint32_t sumIndividual = 0u;
1913
1914 uint8_t intermediate[16];
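// Scratch memory handed to the loadMirrored_u_8x16/_u_8x8 helpers as intermediate storage for border cases.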
1915
1916 int y1 = int(centerY1) - int(tPatchSize_2);
1917 for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
1918 {
1919 const uint8_t* const mirroredRow0 = image0 + CVUtilities::mirrorIndex(y0, height0) * image0StrideElements;
1920 const uint8_t* const mirroredRow1 = image1 + CVUtilities::mirrorIndex(y1, height1) * image1StrideElements;
1921
1922 int x0 = int(centerX0) - int(tPatchSize_2);
1923 int x1 = int(centerX1) - int(tPatchSize_2);
1924
1925 for (unsigned int n = 0u; n < blocks16; ++n)
1926 {
1927 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow0, x0, width0, intermediate);
1928 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, 16u, true>(mirroredRow1, x1, width1, intermediate);
1929
1930 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
1931 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1
1932
1933 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1934 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1935
1936 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1937 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1938
1939 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1940 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1941
1942 x0 += 16;
1943 x1 += 16;
1944 }
1945
1946 if constexpr (partialBlock16)
1947 {
1948 constexpr unsigned int overlappingElements = 16u - remainingAfterBlocks16;
1949 ocean_assert(overlappingElements < 8u);
1950
1951 if (y0 < int(centerY0) + int(tPatchSize_2))
1952 {
1953 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
1954 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<true, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);
1955
1956 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
1957 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1
1958
1959 // mask: |<- overlapping ->|<- remainingAfterBlocks16 ->|
1960 // 00 00 00 00 00 00 FF FF FF FF FF FF FF FF FF FF
1961
1962 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
1963 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
1964
1965 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1966
1967 const uint16x8_t patch_mean_low_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1968 const uint16x8_t patch_mean_high_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8);
1969
1970 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1971 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1972
1973 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1974 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1975 }
1976 else
1977 {
1978 const uint8x16_t patch0_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow0, x0, width0, intermediate);
1979 const uint8x16_t patch1_u_8x16 = loadMirrored_u_8x16<false, remainingAfterBlocks16, false>(mirroredRow1, x1, width1, intermediate);
1980
1981 const int16x8_t patchLow0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(patch0_u_8x16), vget_low_u8(patch1_u_8x16))); // low 8 bytes: patch0 - patch1
1982 const int16x8_t patchHigh0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(patch0_u_8x16), vget_high_u8(patch1_u_8x16))); // high 8 bytes: patch0 - patch1
1983
1984 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
1985 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
1986
1987 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
1988
1989 const uint16x8_t patch_mean_low_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patchLow0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
1990 const uint16x8_t patch_mean_high_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patchHigh0_1_s_16x8, mean0_1_s_16x8));
1991
1992 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_low_u_16x8), vget_low_u16(patch_mean_low_u_16x8));
1993 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_low_u_16x8), vget_high_u16(patch_mean_low_u_16x8));
1994
1995 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_high_u_16x8), vget_low_u16(patch_mean_high_u_16x8));
1996 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_high_u_16x8), vget_high_u16(patch_mean_high_u_16x8));
1997 }
1998
1999 x0 += remainingAfterBlocks16;
2000 x1 += remainingAfterBlocks16;
2001 }
2002
2003 for (unsigned int n = 0u; n < blocks8; ++n)
2004 {
2005 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow0, x0, width0, intermediate);
2006 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, 8u, true>(mirroredRow1, x1, width1, intermediate);
2007
2008 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1
2009
2010 const uint16x8_t patch_mean_u_16x8 = vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
2011
2012 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2013 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
2014
2015 x0 += 8;
2016 x1 += 8;
2017 }
2018
2019 if constexpr (partialBlock8)
2020 {
2021 constexpr unsigned int overlappingElements = 8u - remainingAfterBlocks8;
2022 ocean_assert(overlappingElements < 8u);
2023
2024 if (y0 < int(centerY0) + int(tPatchSize_2))
2025 {
2026 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
2027 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<true, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);
2028
2029 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1
2030
2031 constexpr uint64_t maskLow = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) >> (overlappingElements - 4u) * 2u * 8u);
2032 constexpr uint64_t maskHigh = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) >> overlappingElements * 2u * 8u);
2033
2034 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
2035
2036 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - mean0) - (patch1 - mean1)|, with range [0, 510]
2037
2038 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2039 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
2040 }
2041 else
2042 {
2043 const uint8x8_t patch0_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow0, x0, width0, intermediate);
2044 const uint8x8_t patch1_u_8x8 = loadMirrored_u_8x8<false, remainingAfterBlocks8, false>(mirroredRow1, x1, width1, intermediate);
2045
2046 const int16x8_t patch0_1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(patch0_u_8x8, patch1_u_8x8)); // patch0 - patch1
2047
2048 constexpr uint64_t maskLow = overlappingElements >= 4u ? uint64_t(0) : (uint64_t(-1) << overlappingElements * 2u * 8u);
2049 constexpr uint64_t maskHigh = (overlappingElements <= 4u || overlappingElements >= 8u) ? uint64_t(-1) : (uint64_t(-1) << (overlappingElements - 4u) * 2u * 8u);
2050
2051 const uint16x8_t mask_u_16x8 = vcombine_u16(vcreate_u16(maskLow), vcreate_u16(maskHigh));
2052
2053 const uint16x8_t patch_mean_u_16x8 = vandq_u16(vreinterpretq_u16_s16(vabdq_s16(patch0_1_s_16x8, mean0_1_s_16x8)), mask_u_16x8); // |(patch0 - patch1) - (mean1 - mean0)|, with range [0, 510]
2054
2055 sumA_u_32x4 = vmlal_u16(sumA_u_32x4, vget_low_u16(patch_mean_u_16x8), vget_low_u16(patch_mean_u_16x8));
2056 sumB_u_32x4 = vmlal_u16(sumB_u_32x4, vget_high_u16(patch_mean_u_16x8), vget_high_u16(patch_mean_u_16x8));
2057 }
2058
2059 x0 += remainingAfterBlocks8;
2060 x1 += remainingAfterBlocks8;
2061 }
2062
2063 if constexpr (blocks1 != 0u)
2064 {
2065 for (unsigned int n = 0u; n < blocks1; ++n)
2066 {
2067 const unsigned int index0 = CVUtilities::mirrorIndex(x0 + int(n), width0);
2068 const unsigned int index1 = CVUtilities::mirrorIndex(x1 + int(n), width1);
2069
2070 sumIndividual += sqrDistance(int16_t(mirroredRow0[index0] - meanValues0[0]), int16_t(mirroredRow1[index1] - meanValues1[0]));
2071 }
2072 }
2073
2074 ++y1;
2075 }
2076
2077 const uint32x4_t sum_u_32x4 = vaddq_u32(sumA_u_32x4, sumB_u_32x4);
2078
2079 uint32_t results[4];
2080 vst1q_u32(results, sum_u_32x4);
2081
2082 return results[0] + results[1] + results[2] + results[3] + sumIndividual;
2083}
2084
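// Note on the partial 8-pixel block above: the lanes are masked after vabdq_s16 rather than
// zeroing the unused input pixels at load time (tOverlappingToZero == false), because a zeroed
// input lane would still yield |0 - (mean difference)| != 0 after the subtraction; the unused
// lanes therefore have to be cleared in the 16-bit difference vector itself. A worked example
// of the lane masks, assuming remainingAfterBlocks8 == 5 (overlappingElements == 3, front case):
//
//   maskLow  = uint64_t(-1)        -> lanes 0..3 kept
//   maskHigh = uint64_t(-1) >> 48  -> lane 4 kept, lanes 5..7 cleared
//
// so exactly the five valid differences survive before being squared and accumulated by
// vmlal_u16; the back case mirrors this and keeps the upper five lanes instead.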
2085template <unsigned int tChannels>
2086template <unsigned int tPatchSize>
2087inline uint32_t ZeroMeanSumSquareDifferencesNEON::SpecializedForChannels<tChannels>::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
2088{
2089 static_assert(tChannels >= 1u, "Invalid channel number!");
2090 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
2091
2092 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
2093
2094 ocean_assert(image0 != nullptr && image1 != nullptr);
2095 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
2096
2097 ocean_assert(centerX0 < width0 && centerY0 < height0);
2098 ocean_assert(centerX1 < width1 && centerY1 < height1);
2099
2100 const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
2101 const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
2102
2103 uint32_t zmssd = 0u;
2104
2105 int y1 = int(centerY1) - int(tPatchSize_2);
2106 for (int y0 = int(centerY0) - int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
2107 {
2108 const uint8_t* const mirroredRow0 = image0 + CVUtilities::mirrorIndex(y0, height0) * image0StrideElements;
2109 const uint8_t* const mirroredRow1 = image1 + CVUtilities::mirrorIndex(y1, height1) * image1StrideElements;
2110
2111 int x1 = int(centerX1) - int(tPatchSize_2);
2112 for (int x0 = int(centerX0) - int(tPatchSize_2); x0 <= int(centerX0) + int(tPatchSize_2); ++x0)
2113 {
2114 const uint8_t* const pixel0 = mirroredRow0 + CVUtilities::mirrorIndex(x0, width0) * tChannels;
2115 const uint8_t* const pixel1 = mirroredRow1 + CVUtilities::mirrorIndex(x1, width1) * tChannels;
2116
2117 for (unsigned int c = 0u; c < tChannels; ++c)
2118 {
2119 zmssd += sqrDistance(pixel0[c] - meanValues0[c], pixel1[c] - meanValues1[c]);
2120 }
2121
2122 ++x1;
2123 }
2124
2125 ++y1;
2126 }
2127
2128 return zmssd;
2129}
2130
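// The scalar fallback above evaluates the zero-mean sum of square differences directly,
// i.e. (in pseudo-notation) zmssd = sum over all patch pixels p and channels c of
// ((image0[p][c] - mean0[c]) - (image1[p][c] - mean1[c]))^2, which is the quantity the
// NEON paths in this file accumulate block-wise.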
2131template <unsigned int tChannels, unsigned int tPixels>
2132inline uint32_t ZeroMeanSumSquareDifferencesNEON::buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1)
2133{
2134 static_assert(tChannels >= 1u, "Invalid channel number!");
2135 static_assert(tPixels >= 8u, "Invalid pixels!");
2136
2137 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
2138
2139 uint8_t meanValues0[tChannels];
2140 mean8BitPerChannel<tChannels, tPixels>(buffer0, meanValues0);
2141
2142 uint8_t meanValues1[tChannels];
2143 mean8BitPerChannel<tChannels, tPixels>(buffer1, meanValues1);
2144
2145 return SpecializedForChannels<tChannels>::template buffer8BitPerChannel<tPixels>(buffer0, buffer1, meanValues0, meanValues1);
2146}
2147
2148template <unsigned int tChannels, unsigned int tPatchSize>
2149inline uint32_t ZeroMeanSumSquareDifferencesNEON::patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
2150{
2151 static_assert(tChannels >= 1u, "Invalid channel number!");
2152 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2153
2154 ocean_assert(patch0 != nullptr && patch1 != nullptr);
2155
2156 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
2157 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
2158
2159 uint8_t meanValues0[tChannels];
2160 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2161
2162 uint8_t meanValues1[tChannels];
2163 mean8BitPerChannel<tChannels, tPatchSize>(patch1, patch1StrideElements, meanValues1);
2164
2165 return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, patch1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
2166}
2167
2168template <unsigned int tChannels, unsigned int tPatchSize>
2169inline uint32_t ZeroMeanSumSquareDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
2170{
2171 static_assert(tChannels >= 1u, "Invalid channel number!");
2172 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2173
2174 ocean_assert(patch0 != nullptr && buffer1 != nullptr);
2175
2176 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
2177
2178 uint8_t meanValues0[tChannels];
2179 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
2180
2181 uint8_t meanValues1[tChannels];
2182 mean8BitPerChannel<tChannels, tPatchSize * tPatchSize>(buffer1, meanValues1);
2183
2184 constexpr unsigned int patch1StrideElements = tChannels * tPatchSize;
2185
2186 return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, buffer1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
2187}
2188
2189template <unsigned int tChannels, unsigned int tPatchSize>
2190uint32_t ZeroMeanSumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel(const uint8_t* const image0, const uint8_t* const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
2191{
2192 static_assert(tChannels >= 1u, "Invalid channel number!");
2193 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2194
2195 ocean_assert(image0 != nullptr && image1 != nullptr);
2196
2197 uint8_t meanValues0[tChannels];
2198 SpecializedForChannels<tChannels>::template mean8BitPerChannelMirroredBorder<tPatchSize>(image0, width0, height0, centerX0, centerY0, image0PaddingElements, meanValues0);
2199
2200 uint8_t meanValues1[tChannels];
2201 SpecializedForChannels<tChannels>::template mean8BitPerChannelMirroredBorder<tPatchSize>(image1, width1, height1, centerX1, centerY1, image1PaddingElements, meanValues1);
2202
2203 return SpecializedForChannels<tChannels>::template patchMirroredBorder8BitPerChannel<tPatchSize>(image0, image1, width0, height0, width1, height1, centerX0, centerY0, centerX1, centerY1, image0PaddingElements, image1PaddingElements, meanValues0, meanValues1);
2204}
2205
2206template <unsigned int tChannels, unsigned int tPixels>
2207OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesNEON::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
2208{
2209 static_assert(tChannels >= 1u, "Invalid channel number!");
2210 static_assert(tPixels >= 8u, "Invalid pixels!");
2211
2212 SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPixels>(buffer, meanValues);
2213}
2214
2215template <unsigned int tChannels, unsigned int tPatchSize>
2216OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesNEON::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
2217{
2218 static_assert(tChannels >= 1u, "Invalid channel number!");
2219 static_assert(tPatchSize >= 5u, "Invalid patch size!");
2220
2221 SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPatchSize>(patch, patchStrideElements, meanValues);
2222}
2223
2224template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
2225OCEAN_FORCE_INLINE uint8x8_t ZeroMeanSumSquareDifferencesNEON::loadMirrored_u_8x8(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer)
2226{
2227 ocean_assert(tPixels >= 1u && tPixels <= 8u);
2228
2229 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
2230
2231 constexpr unsigned int tOverlappingElements = 8u - tPixels;
2232
2233 if (x >= 0 && x <= int(width) - int(tPixels))
2234 {
2235 if constexpr (tPixels == 8u)
2236 {
2237 return vld1_u8(row + x);
2238 }
2239 else
2240 {
2241 if constexpr (tFront)
2242 {
2243 if constexpr (tOverlappingToZero)
2244 {
2245 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
2246 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
2247
2248 return vand_u8(vld1_u8(row + x), mask_u_8x8);
2249 }
2250 else
2251 {
2252 return vld1_u8(row + x);
2253 }
2254 }
2255 else
2256 {
2257 if constexpr (tOverlappingToZero)
2258 {
2259 constexpr uint64_t mask = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
2260 const uint8x8_t mask_u_8x8 = vcreate_u8(mask);
2261
2262 return vand_u8(vld1_u8(row + x - int(tOverlappingElements)), mask_u_8x8);
2263 }
2264 else
2265 {
2266 return vld1_u8(row + x - int(tOverlappingElements));
2267 }
2268 }
2269 }
2270 }
2271
2272 if constexpr (tFront)
2273 {
2274 for (unsigned int n = 0u; n < tPixels; ++n)
2275 {
2276 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2277 ocean_assert(mirroredIndex < width);
2278
2279 intermediateBuffer[n] = row[mirroredIndex];
2280 }
2281
2282 if constexpr (tOverlappingToZero)
2283 {
2284 for (unsigned int n = tPixels; n < 8u; ++n)
2285 {
2286 intermediateBuffer[n] = 0u;
2287 }
2288 }
2289 }
2290 else
2291 {
2292 if constexpr (tOverlappingToZero)
2293 {
2294 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
2295 {
2296 intermediateBuffer[n] = 0u;
2297 }
2298 }
2299
2300 for (unsigned int n = 0u; n < tPixels; ++n)
2301 {
2302 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2303 ocean_assert(mirroredIndex < width);
2304
2305 intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
2306 }
2307 }
2308
2309 return vld1_u8(intermediateBuffer);
2310}
2311
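// Illustration of loadMirrored_u_8x8 (the 16-pixel variant below behaves analogously): when the
// requested span lies fully inside the row, the pixels are loaded directly with vld1_u8; otherwise
// each pixel index is reflected via CVUtilities::mirrorIndex and the values are gathered through
// intermediateBuffer. Assuming the mirroring convention maps index -1 to 0 and index -2 to 1, a
// call such as
//
//   loadMirrored_u_8x8<true, 8u, true>(row, -2, width, intermediate);
//
// would return the lanes row[1], row[0], row[0], row[1], row[2], row[3], row[4], row[5].
// With tPixels < 8 and tOverlappingToZero == true, the unused lanes (trailing lanes for
// tFront == true, leading lanes otherwise) are forced to zero.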
2312template <bool tFront, unsigned int tPixels, bool tOverlappingToZero>
2313OCEAN_FORCE_INLINE uint8x16_t ZeroMeanSumSquareDifferencesNEON::loadMirrored_u_8x16(const uint8_t* const row, const int x, const unsigned int width, uint8_t* const intermediateBuffer)
2314{
2315 ocean_assert(tPixels > 8u && tPixels <= 16u);
2316
2317 ocean_assert(row != nullptr && intermediateBuffer != nullptr);
2318
2319 constexpr unsigned int tOverlappingElements = 16u - tPixels;
2320
2321 if (x >= 0 && x <= int(width) - int(tPixels))
2322 {
2323 if constexpr (tPixels == 16u)
2324 {
2325 return vld1q_u8(row + x);
2326 }
2327 else
2328 {
2329 if constexpr (tFront)
2330 {
2331 if constexpr (tOverlappingToZero)
2332 {
2333 constexpr uint64_t maskHigh = tOverlappingElements < 8u ? (uint64_t(-1) >> tOverlappingElements * 8u) : uint64_t(0);
2334 const uint8x16_t mask_u_8x16 = vcombine_u8(vdup_n_u8(uint8_t(0xFFu)), vcreate_u8(maskHigh));
2335
2336 return vandq_u8(vld1q_u8(row + x), mask_u_8x16);
2337 }
2338 else
2339 {
2340 return vld1q_u8(row + x);
2341 }
2342 }
2343 else
2344 {
2345 if constexpr (tOverlappingToZero)
2346 {
2347 constexpr uint64_t maskLow = tOverlappingElements < 8u ? (uint64_t(-1) << tOverlappingElements * 8u) : uint64_t(0);
2348 const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(maskLow), vdup_n_u8(uint8_t(0xFFu)));
2349
2350 return vandq_u8(vld1q_u8(row + x - int(tOverlappingElements)), mask_u_8x16);
2351 }
2352 else
2353 {
2354 return vld1q_u8(row + x - int(tOverlappingElements));
2355 }
2356 }
2357 }
2358 }
2359
2360 if constexpr (tFront)
2361 {
2362 for (unsigned int n = 0u; n < tPixels; ++n)
2363 {
2364 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2365 ocean_assert(mirroredIndex < width);
2366
2367 intermediateBuffer[n] = row[mirroredIndex];
2368 }
2369
2370 if constexpr (tOverlappingToZero)
2371 {
2372 for (unsigned int n = tPixels; n < 16u; ++n)
2373 {
2374 intermediateBuffer[n] = 0u;
2375 }
2376 }
2377 }
2378 else
2379 {
2380 if constexpr (tOverlappingToZero)
2381 {
2382 for (unsigned int n = 0u; n < tOverlappingElements; ++n)
2383 {
2384 intermediateBuffer[n] = 0u;
2385 }
2386 }
2387
2388 for (unsigned int n = 0u; n < tPixels; ++n)
2389 {
2390 const unsigned int mirroredIndex = CVUtilities::mirrorIndex(x + int(n), width);
2391 ocean_assert(mirroredIndex < width);
2392
2393 intermediateBuffer[tOverlappingElements + n] = row[mirroredIndex];
2394 }
2395 }
2396
2397 return vld1q_u8(intermediateBuffer);
2398}
2399
2400}
2401
2402}
2403
2404#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
2405
2406#endif // META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_NEON_H
static OCEAN_FORCE_INLINE unsigned int mirrorIndex(const int index, const unsigned int elements)
Returns the mirrored index for a given index.
Definition CVUtilities.h:456
This class allows specializing functions for individual channels.
Definition ZeroMeanSumSquareDifferencesNEON.h:39
static void mean8BitPerChannelMirroredBorder(const uint8_t *const image, const unsigned int width, const unsigned int height, const unsigned int centerX, const unsigned int centerY, const unsigned int imagePaddingElements, uint8_t *const meanValues)
Determines the mean value for an image patch, one value for each channel, patch pixels outside the im...
Definition ZeroMeanSumSquareDifferencesNEON.h:906
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image.
Definition ZeroMeanSumSquareDifferencesNEON.h:1840
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image,...
Definition ZeroMeanSumSquareDifferencesNEON.h:2087
static void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition ZeroMeanSumSquareDifferencesNEON.h:448
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition ZeroMeanSumSquareDifferencesNEON.h:1304
This class implements functions to calculate zero-mean sum of square differences using NEON instructions.
Definition ZeroMeanSumSquareDifferencesNEON.h:30
static uint32_t patchMirroredBorder8BitPerChannel(const uint8_t *const image0, const uint8_t *const image1, const unsigned int width0, const unsigned int height0, const unsigned int width1, const unsigned int height1, const unsigned int centerX0, const unsigned int centerY0, const unsigned int centerX1, const unsigned int centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the zero-mean sum of square differences between two patches within an image,...
Definition ZeroMeanSumSquareDifferencesNEON.h:2190
static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition ZeroMeanSumSquareDifferencesNEON.h:2207
static uint32_t patch8BitPerChannel(const uint8_t *const patch0, const uint8_t *const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the zero-mean sum of square differences between two patches within an image.
Definition ZeroMeanSumSquareDifferencesNEON.h:2149
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition ZeroMeanSumSquareDifferencesNEON.h:2132
static OCEAN_FORCE_INLINE uint8x8_t loadMirrored_u_8x8(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 8 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition ZeroMeanSumSquareDifferencesNEON.h:2225
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the zero-mean sum of square differences between an image patch and a buffer.
Definition ZeroMeanSumSquareDifferencesNEON.h:2169
static OCEAN_FORCE_INLINE uint8x16_t loadMirrored_u_8x16(const uint8_t *const row, const int x, const unsigned int width, uint8_t *const intermediateBuffer)
Loads up to 16 uint8_t values from a 1-channel row with mirroring pixels if necessary.
Definition ZeroMeanSumSquareDifferencesNEON.h:2313
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
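A minimal usage sketch (not part of this header), assuming a NEON-enabled build (OCEAN_HARDWARE_NEON_VERSION >= 10) and two sufficiently large 1-channel uint8_t frames; the helper name exampleZmssd and all concrete coordinates are illustrative only:

#include "ocean/cv/ZeroMeanSumSquareDifferencesNEON.h"

using namespace Ocean;

// Compares two 5x5 patches of two 1-channel frames: once with both patches fully inside
// the frames, once with patch centers close to the frame corners (mirrored border access).
uint32_t exampleZmssd(const uint8_t* frame0, const uint8_t* frame1, const unsigned int width, const unsigned int height, const unsigned int paddingElements)
{
	ocean_assert(frame0 != nullptr && frame1 != nullptr);
	ocean_assert(width >= 64u && height >= 64u); // keeps the illustrative coordinates valid

	const unsigned int strideElements = width + paddingElements; // 1 channel

	// top-left pixels of two 5x5 patches lying fully inside the frames
	const uint8_t* patch0 = frame0 + 10u * strideElements + 10u;
	const uint8_t* patch1 = frame1 + 20u * strideElements + 20u;

	const uint32_t insideZmssd = CV::ZeroMeanSumSquareDifferencesNEON::patch8BitPerChannel<1u, 5u>(patch0, patch1, strideElements, strideElements);

	// patch centers near the top-left and bottom-right corners; out-of-frame pixels are mirrored
	const uint32_t borderZmssd = CV::ZeroMeanSumSquareDifferencesNEON::patchMirroredBorder8BitPerChannel<1u, 5u>(frame0, frame1, width, height, width, height, 1u, 1u, width - 2u, height - 2u, paddingElements, paddingElements);

	return insideZmssd + borderZmssd;
}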