Ocean
ZeroMeanSumSquareDifferencesSSE.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
9 #define META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
10 
11 #include "ocean/cv/CV.h"
12 
13 #include "ocean/base/Utilities.h"
14 
15 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
16 
17 #include "ocean/cv/SSE.h"
18 
19 namespace Ocean
20 {
21 
22 namespace CV
23 {
24 
25 /**
26  * This class implements functions to calculate zero-mean sum square differences using SSE instructions.
27  * @ingroup cv
28  */
30 {
31  protected:
32 
33  /**
34  * This class allows to specialize functions for individual channels.
35  * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
36  */
37  template <unsigned int tChannels>
39  {
40  public:
41 
42  /**
43  * Determines the mean value for a buffer, one value for each channel.
44  * @param buffer The memory buffer to be handled, must be valid
45  * @param meanValues The resulting mean values, one for each channel
46  * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
47  * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
48  */
49  template <unsigned int tPixels>
50  static inline void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
51 
52  /**
53  * Determines the mean value for an image patch, one value for each channel.
54  * @param patch The top left start position of the image patch, must be valid
55  * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
56  * @param meanValues The resulting mean values, one for each channel
57  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
58  */
59  template <unsigned int tPatchSize>
60  static inline void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
61 
62  /**
63  * Returns the zero-mean sum of square differences between two memory buffers.
64  * @param buffer0 The first memory buffer, must be valid
65  * @param buffer1 The second memory buffer, must be valid
66  * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
67  * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
68  * @return The resulting sum of square differences
69  * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
70  */
71  template <unsigned int tPixels>
72  static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
73 
74  /**
75  * Returns the zero-mean sum of square differences between two patches within an image.
76  * @param patch0 The top left start position of the first image patch, must be valid
77  * @param patch1 The top left start position of the second image patch, must be valid
78  * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
79  * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
80  * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
81  * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
82  * @return The resulting sum of square differences
83  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
84  */
85  template <unsigned int tPatchSize>
86  static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
87  };
88 
89  public:
90 
91  /**
92  * Returns the zero-mean sum of square differences between two memory buffers.
93  * @param buffer0 The first memory buffer, must be valid
94  * @param buffer1 The second memory buffer, must be valid
95  * @return The resulting sum of square differences
96  * @tparam tChannels Specifies the number of channels for the given buffers, with range [1, infinity)
97  * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
98  */
99  template <unsigned int tChannels, unsigned int tPixels>
100  static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1);
101 
102  /**
103  * Returns the zero-mean sum of square differences between two patches within an image.
104  * @param patch0 The top left start position of the first image patch, must be valid
105  * @param patch1 The top left start position of the second image patch, must be valid
106  * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
107  * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
108  * @return The resulting sum of square differences
109  * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
110  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
111  */
112  template <unsigned int tChannels, unsigned int tPatchSize>
113  static inline uint32_t patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
114 
115  /**
116  * Returns the zero-mean sum of square differences between an image patch and a buffer.
117  * @param patch0 The top left start position of the image patch, must be valid
118  * @param buffer1 The memory buffer, must be valid
119  * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
120  * @return The resulting sum of square differences
121  * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
122  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
123  */
124  template <unsigned int tChannels, unsigned int tPatchSize>
125  static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* const patch0, const uint8_t* const buffer1, const unsigned int patch0StrideElements);
126 
127  /**
128  * Determines the mean value for a buffer, one value for each channel.
129  * @param buffer The memory buffer to be handled, must be valid
130  * @param meanValues The resulting mean values, one for each channel
131  * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
132  * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
133  */
134  template <unsigned int tChannels, unsigned int tPixels>
135  static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
136 
137  /**
138  * Determines the mean value for an image patch, one value for each channel.
139  * @param patch The top left start position of the image patch, must be valid
140  * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
141  * @param meanValues The resulting mean values, one for each channel
142  * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
143  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
144  */
145  template <unsigned int tChannels, unsigned int tPatchSize>
146  static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* const patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
147 };
148 
149 template <>
150 template <unsigned int tPixels>
151 inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
152 {
153  static_assert(tPixels >= 8u, "Invalid buffer size!");
154 
155  constexpr unsigned int tChannels = 1u;
156 
157  ocean_assert(buffer != nullptr && meanValues != nullptr);
158 
159  constexpr unsigned int bufferElements = tChannels * tPixels;
160 
161  constexpr unsigned int blocks16 = bufferElements / 16u;
162  constexpr unsigned int remainingAfterBlocks16 = bufferElements % 16u;
163 
164  constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
165 
166  constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
167 
168  constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
169 
170  constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
171 
172  static_assert(blocks1 <= 2u, "Invalid block size!");
173 
174  __m128i sum_128i = _mm_setzero_si128();
175 
176  uint32_t sumIndividual = 0u;
177 
178  for (unsigned int n = 0u; n < blocks16; ++n)
179  {
180  const __m128i buffer_128i = _mm_lddqu_si128((const __m128i*)buffer);
181 
182  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
183 
184  buffer += 16;
185  }
186 
187  if constexpr (partialBlock16)
188  {
189  constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
190 
191  static_assert(overlapElements < 8u, "Invalid value!");
192 
193  const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer - overlapElements)), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the right
194 
195  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
196 
197  buffer += remainingAfterBlocks16;
198  }
199 
200  if constexpr (fullBlock8)
201  {
202  const __m128i buffer_128i = _mm_loadl_epi64((const __m128i*)buffer); // load for unaligned 64 bit memory
203 
204  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
205 
206  buffer += 8;
207  }
208 
209  if constexpr (partialBlock8)
210  {
211  constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
212 
213  static_assert(overlapElements < 8u, "Invalid value!");
214 
215  const __m128i buffer_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer - overlapElements)), overlapElements); // loading 8 elements, but shifting `overlapElements` zeros to the right
216 
217  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
218 
219  buffer += remainingAfterBlocks16;
220  }
221 
222  if constexpr (blocks1 != 0u)
223  {
224  for (unsigned int n = 0u; n < blocks1; ++n)
225  {
226  sumIndividual += buffer[n];
227  }
228  }
229 
230  const uint32_t sum = SSE::sum_u32_first_third(sum_128i) + sumIndividual;
231 
232  meanValues[0] = uint8_t((sum + tPixels / 2u) / tPixels);
233 }
234 
// Specialization for 3 interleaved channels: determines the rounded mean of each channel of a buffer holding `tPixels` pixels.
// The elements are consumed in compile-time sized chunks of 48, 24, 21 and 15 elements; left-over pixels are summed individually.
235 template <>
236 template <unsigned int tPixels>
237 inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
238 {
239  static_assert(tPixels >= 8u, "Invalid buffer size!");
240 
241  constexpr unsigned int tChannels = 3u;
242 
243  ocean_assert(buffer != nullptr && meanValues != nullptr);
244 
245  constexpr unsigned int bufferElements = tChannels * tPixels;
246 
247  constexpr unsigned int blocks48 = bufferElements / 48u; // number of complete 48-element chunks (16 pixels each)
248  constexpr unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
249 
250  constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u; // a tail of more than 32 elements is handled with one zero-padded 48-element chunk
251 
252  constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48; // the partial 48-element chunk consumes the entire tail
253 
254  constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
255 
256  constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
257 
258  constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
259 
260  constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
261 
262  constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
263 
264  constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
265 
266  constexpr unsigned int blocks1 = remainingAfterPartialBlock15; // elements (not pixels) which are summed one by one
267 
268  static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
269 
270  __m128i sumChannel0_128i = _mm_setzero_si128();
271  __m128i sumChannel1_128i = _mm_setzero_si128();
272  __m128i sumChannel2_128i = _mm_setzero_si128();
273 
274  uint32_t sumIndividual[3] = {0u};
275 
276  for (unsigned int n = 0u; n < blocks48; ++n)
277  {
278  const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(buffer + 0));
279  const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(buffer + 16));
280  const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(buffer + 32));
281 
282  __m128i channel0;
283  __m128i channel1;
284  __m128i channel2;
285  SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2); // presumably separates the 48 interleaved bytes into 16 bytes per channel - helper is defined outside this file
286 
 // _mm_sad_epu8 against zero sums the bytes of each 8-byte half into a separate 64 bit lane
287  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
288  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
289  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
290 
291  buffer += 48;
292  }
293 
294  if constexpr (partialBlock48)
295  {
296  constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48); // a multiple of 3 because the remaining element count is a multiple of 3
297 
 // bufferA is loaded at the current position and shifted left, so that its lowest `overlappingElements` bytes are zero
 // bufferB and bufferC are loaded `overlappingElements` bytes earlier than in a full chunk
 // together, the three registers hold `overlappingElements` zero bytes followed by all remaining elements, without reading behind the buffer's end
298  const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(buffer)), overlappingElements);
299  const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(buffer - overlappingElements + 16));
300  const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(buffer - overlappingElements + 32));
301 
302  __m128i channel0;
303  __m128i channel1;
304  __m128i channel2;
305  SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2);
306 
307  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
308  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
309  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
310 
311  buffer += remainingAfterFullBlocks48;
312  }
313 
314  for (unsigned int n = 0u; n < blocks24; ++n)
315  {
316  const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(buffer + 0));
317  const __m128i bufferB_128i = _mm_loadl_epi64((const __m128i*)(buffer + 16)); // load for unaligned 64 bit memory
318 
319  __m128i channel01_128i;
320  __m128i channel2_128i;
321  SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i); // presumably channel 0 ends up in the lower and channel 1 in the upper 8 bytes of `channel01_128i` - TODO confirm against SSE.h
322 
323  const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128()); // lower 64 bit lane: sum of the lower 8 bytes, upper lane: sum of the upper 8 bytes
324 
325  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8)); // keeps the lower lane's sum only (moved to the upper lane)
326  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8)); // keeps the upper lane's sum only (moved to the lower lane)
327  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
328 
329  buffer += 24;
330  }
331 
332  for (unsigned int n = 0u; n < blocks21; ++n)
333  {
334  const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(buffer + 0));
335  const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer + 16 - 3)), 3); // load for unaligned 64 bit memory, starting 3 bytes earlier and shifting them out, so that only elements [16, 21) remain
336 
337  __m128i channel01_128i;
338  __m128i channel2_128i;
339  SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i);
340 
341  const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
342 
343  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
344  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
345  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
346 
347  buffer += 21;
348  }
349 
350  for (unsigned int n = 0u; n < blocks15; ++n)
351  {
352  const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer - 1)), 1); // loading 16 bytes starting one byte earlier and shifting that byte out: 15 elements followed by one zero byte
353 
354  __m128i channel01_128i;
355  __m128i channel2_128i;
356  SSE::deInterleave3Channel8Bit15Elements(buffer_128i, channel01_128i, channel2_128i);
357 
358  const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
359 
360  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
361  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
362  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
363 
364  buffer += 15;
365  }
366 
367  if constexpr (blocks1 != 0u)
368  {
369  constexpr unsigned int pixels = blocks1 / 3u;
370 
371  for (unsigned int x = 0u; x < pixels; ++x)
372  {
373  for (unsigned int n = 0u; n < 3u; ++n)
374  {
375  sumIndividual[n] += buffer[x * 3u + n];
376  }
377  }
378 
379  buffer += blocks1;
380  }
381 
 // rounded integer division; SSE::sum_u32_first_third presumably adds the 1st and 3rd 32 bit lane, which is where the two 64 bit partial sums are located
382  meanValues[0] = uint8_t((SSE::sum_u32_first_third(sumChannel0_128i) + sumIndividual[0] + tPixels / 2u) / tPixels);
383  meanValues[1] = uint8_t((SSE::sum_u32_first_third(sumChannel1_128i) + sumIndividual[1] + tPixels / 2u) / tPixels);
384  meanValues[2] = uint8_t((SSE::sum_u32_first_third(sumChannel2_128i) + sumIndividual[2] + tPixels / 2u) / tPixels);
385 }
386 
387 template <unsigned int tChannels>
388 template <unsigned int tPixels>
389 inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
390 {
391  static_assert(tChannels >= 1u, "Invalid channel number!");
392  static_assert(tPixels >= 1u, "Invalid buffer size!");
393 
394  ocean_assert(buffer != nullptr && meanValues != nullptr);
395 
396  uint32_t sum[tChannels] = {0u};
397 
398  for (unsigned int n = 0u; n < tPixels; ++n)
399  {
400  for (unsigned int c = 0u; c < tChannels; ++c)
401  {
402  sum[c] += buffer[n * tChannels + c];
403  }
404  }
405 
406  for (unsigned int c = 0u; c < tChannels; ++c)
407  {
408  meanValues[c] = uint8_t((sum[c] + tPixels / 2u) / tPixels);
409  }
410 }
411 
// Specialization for 1 channel: determines the rounded mean value of a square image patch with edge length `tPatchSize`.
// Each patch row is consumed in 16-element blocks, followed by one 8-element block, one partial 16- or 8-element block, or up to two individual elements.
412 template <>
413 template <unsigned int tPatchSize>
414 inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
415 {
416  static_assert(tPatchSize >= 5u, "Invalid patch size!");
417 
418  constexpr unsigned int tChannels = 1u;
419 
420  ocean_assert(patch != nullptr && meanValues != nullptr);
421 
422  ocean_assert(patchStrideElements >= tChannels * tPatchSize);
423 
424  constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
425 
426  constexpr unsigned int blocks16 = patchWidthElements / 16u;
427  constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
428 
429  constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u; // row tail of 9..15 elements
430 
431  constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u; // row tail of exactly 8 elements
432 
433  constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u; // row tail of 3..7 elements
434 
435  constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u; // row tail of 1..2 elements, summed individually
436 
437  static_assert(blocks1 <= 2u, "Invalid block size!");
438 
439  __m128i sum_128i = _mm_setzero_si128();
440 
441  uint32_t sumIndividual = 0u;
442 
443  for (unsigned int y = 0u; y < tPatchSize; ++y)
444  {
445  SSE::prefetchT0(patch + patchStrideElements); // presumably a cache prefetch hint for the next row - helper is defined outside this file
446 
447  for (unsigned int n = 0u; n < blocks16; ++n)
448  {
449  const __m128i buffer_128i = _mm_lddqu_si128((const __m128i*)patch);
450 
451  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128())); // _mm_sad_epu8 against zero sums the bytes of each 8-byte half into a separate 64 bit lane
452 
453  patch += 16;
454  }
455 
456  if constexpr (fullBlock8)
457  {
458  const __m128i buffer_128i = _mm_loadl_epi64((const __m128i*)patch); // load for unaligned 64 bit memory
459 
460  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
461 
462  patch += 8;
463  }
464 
465  if constexpr (partialBlock16)
466  {
467  constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
468 
469  static_assert(overlapElements < 8u, "Invalid value!");
470 
 // all rows but the last one: the load starts at the current position and reads `overlapElements` bytes beyond the row's end, these bytes are shifted out
 // last row: the load starts `overlapElements` bytes earlier instead - presumably to avoid reading memory behind the end of the last patch row
471  if (y < tPatchSize - 1u)
472  {
473  const __m128i buffer_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)patch), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the left
474 
475  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
476  }
477  else
478  {
479  const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch - overlapElements)), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the right
480 
481  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
482  }
483 
484  patch += remainingAfterBlocks16;
485  }
486 
487  if constexpr (partialBlock8)
488  {
489  constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
490 
491  static_assert(overlapElements < 8u, "Invalid value!");
492 
 // same forward/backward strategy as for the partial 16-element block, but with a 64 bit load
493  if (y < tPatchSize - 1u)
494  {
495  const __m128i buffer_128i = _mm_slli_si128(_mm_loadl_epi64((const __m128i*)patch), overlapElements + 8); // loading 8 elements into the lower half, then shifting by 8 + `overlapElements` bytes to the left, so that the `overlapElements` bytes beyond the row are dropped
496 
497  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
498  }
499  else
500  {
501  const __m128i buffer_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch - overlapElements)), overlapElements); // loading 8 elements, but shifting `overlapElements` zeros to the right
502 
503  sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
504  }
505 
506  patch += remainingAfterBlocks16;
507  }
508 
509  if constexpr (blocks1 != 0u)
510  {
511  for (unsigned int n = 0u; n < blocks1; ++n)
512  {
513  sumIndividual += patch[n];
514  }
515 
516  patch += blocks1;
517  }
518 
519  patch += patchStrideElements - patchWidthElements; // skipping the elements between the end of this patch row and the start of the next one
520  }
521 
522  const uint32_t sum = SSE::sum_u32_first_third(sum_128i) + sumIndividual; // presumably adds the 1st and 3rd 32 bit lane, which is where the two 64 bit partial sums are located
523 
524  meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize)); // rounded integer division by the number of patch pixels
525 }
526 
// Specialization for 3 interleaved channels: determines the rounded mean of each channel of a square image patch with edge length `tPatchSize`.
// Each patch row is consumed in compile-time sized chunks of 48, 24, 21 and 15 elements; left-over pixels are summed individually.
527 template <>
528 template <unsigned int tPatchSize>
529 inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
530 {
531  static_assert(tPatchSize >= 5u, "Invalid patch size!");
532 
533  constexpr unsigned int tChannels = 3u;
534 
535  ocean_assert(patch != nullptr && meanValues != nullptr);
536 
537  ocean_assert(patchStrideElements >= tChannels * tPatchSize);
538 
539  constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
540 
541  constexpr unsigned int blocks48 = patchWidthElements / 48u; // number of complete 48-element chunks (16 pixels each) per row
542  constexpr unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
543 
544  constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u; // a row tail of more than 32 elements is handled with one zero-padded 48-element chunk
545 
546  constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48; // the partial 48-element chunk consumes the entire row tail
547 
548  constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
549 
550  constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
551 
552  constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
553 
554  constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
555 
556  constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
557 
558  constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
559 
560  constexpr unsigned int blocks1 = remainingAfterPartialBlock15; // elements (not pixels) which are summed one by one
561 
562  static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
563 
564  __m128i sumChannel0_128i = _mm_setzero_si128();
565  __m128i sumChannel1_128i = _mm_setzero_si128();
566  __m128i sumChannel2_128i = _mm_setzero_si128();
567 
568  uint32_t sumIndividual[3] = {0u};
569 
570  for (unsigned int y = 0u; y < tPatchSize; ++y)
571  {
572  SSE::prefetchT0(patch + patchStrideElements); // presumably a cache prefetch hint for the next row - helper is defined outside this file
573 
574  for (unsigned int n = 0u; n < blocks48; ++n)
575  {
576  const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(patch + 0));
577  const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(patch + 16));
578  const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(patch + 32));
579 
580  __m128i channel0;
581  __m128i channel1;
582  __m128i channel2;
583  SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2); // presumably separates the 48 interleaved bytes into 16 bytes per channel - helper is defined outside this file
584 
 // _mm_sad_epu8 against zero sums the bytes of each 8-byte half into a separate 64 bit lane
585  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
586  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
587  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
588 
589  patch += 48;
590  }
591 
592  if constexpr (partialBlock48)
593  {
594  constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48); // a multiple of 3 because the remaining element count is a multiple of 3
595 
 // bufferA is loaded at the current position and shifted left, so that its lowest `overlappingElements` bytes are zero
 // bufferB and bufferC are loaded `overlappingElements` bytes earlier than in a full chunk
 // together, the three registers hold `overlappingElements` zero bytes followed by all remaining elements of the row
596  const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(patch)), overlappingElements);
597  const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(patch - overlappingElements + 16));
598  const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(patch - overlappingElements + 32));
599 
600  __m128i channel0;
601  __m128i channel1;
602  __m128i channel2;
603  SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2);
604 
605  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
606  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
607  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
608 
609  patch += remainingAfterFullBlocks48;
610  }
611 
612  for (unsigned int n = 0u; n < blocks24; ++n)
613  {
614  const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(patch + 0));
615  const __m128i bufferB_128i = _mm_loadl_epi64((const __m128i*)(patch + 16)); // load for unaligned 64 bit memory
616 
617  __m128i channel01_128i;
618  __m128i channel2_128i;
619  SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i); // presumably channel 0 ends up in the lower and channel 1 in the upper 8 bytes of `channel01_128i` - TODO confirm against SSE.h
620 
621  const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128()); // lower 64 bit lane: sum of the lower 8 bytes, upper lane: sum of the upper 8 bytes
622 
623  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8)); // keeps the lower lane's sum only (moved to the upper lane)
624  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8)); // keeps the upper lane's sum only (moved to the lower lane)
625  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
626 
627  patch += 24;
628  }
629 
630  for (unsigned int n = 0u; n < blocks21; ++n)
631  {
632  const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(patch + 0));
633  const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch + 16 - 3)), 3); // load for unaligned 64 bit memory, starting 3 bytes earlier and shifting them out, so that only elements [16, 21) remain
634 
635  __m128i channel01_128i;
636  __m128i channel2_128i;
637  SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i);
638 
639  const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
640 
641  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
642  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
643  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
644 
645  patch += 21;
646  }
647 
648  for (unsigned int n = 0u; n < blocks15; ++n)
649  {
 // all rows but the last one: plain 16 byte load, the 16th byte lies beyond the 15 elements (presumably ignored by the 15-element de-interleave helper - TODO confirm)
 // last row: the load starts one byte earlier and that byte is shifted out, so that no byte behind the 15 elements is read
650  const __m128i buffer_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((const __m128i*)(patch)) : _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch - 1)), 1);
651 
652  __m128i channel01_128i;
653  __m128i channel2_128i;
654  SSE::deInterleave3Channel8Bit15Elements(buffer_128i, channel01_128i, channel2_128i);
655 
656  const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
657 
658  sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
659  sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
660  sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
661 
662  patch += 15;
663  }
664 
665  if constexpr (blocks1 != 0u)
666  {
667  constexpr unsigned int pixels = blocks1 / 3u;
668 
669  for (unsigned int x = 0u; x < pixels; ++x)
670  {
671  for (unsigned int n = 0u; n < 3u; ++n)
672  {
673  sumIndividual[n] += patch[x * 3u + n];
674  }
675  }
676 
677  patch += blocks1;
678  }
679 
680  patch += patchStrideElements - patchWidthElements; // skipping the elements between the end of this patch row and the start of the next one
681  }
682 
 // rounded integer division by the number of patch pixels; SSE::sum_u32_first_third presumably adds the 1st and 3rd 32 bit lane
683  meanValues[0] = uint8_t((SSE::sum_u32_first_third(sumChannel0_128i) + sumIndividual[0] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
684  meanValues[1] = uint8_t((SSE::sum_u32_first_third(sumChannel1_128i) + sumIndividual[1] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
685  meanValues[2] = uint8_t((SSE::sum_u32_first_third(sumChannel2_128i) + sumIndividual[2] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
686 }
687 
688 template <unsigned int tChannels>
689 template <unsigned int tPatchSize>
690 inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
691 {
692  static_assert(tChannels >= 1u, "Invalid channel number!");
693  static_assert(tPatchSize >= 1u, "Invalid patch size!");
694 
695  ocean_assert(patch != nullptr && meanValues != nullptr);
696 
697  ocean_assert(patchStrideElements >= tChannels * tPatchSize);
698 
699  uint32_t sum[tChannels] = {0u};
700 
701  for (unsigned int y = 0u; y < tPatchSize; ++y)
702  {
703  for (unsigned int x = 0u; x < tPatchSize; ++x)
704  {
705  for (unsigned int n = 0u; n < tChannels; ++n)
706  {
707  sum[n] += patch[x * tChannels + n];
708  }
709  }
710 
711  patch += patchStrideElements;
712  }
713 
714  for (unsigned int n = 0u; n < tChannels; ++n)
715  {
716  meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
717  }
718 }
719 
720 template <>
721 template <unsigned int tPixels>
722 inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
723 {
724  static_assert(tPixels >= 8u, "Invalid pixel number!");
725 
726  constexpr unsigned int tChannels = 1u;
727 
728  ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
729  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
730 
731  constexpr unsigned int bufferElements = tChannels * tPixels;
732 
733  constexpr unsigned int blocks16 = bufferElements / 16u;
734  constexpr unsigned int remainingAfterBlocks16 = bufferElements % 16u;
735 
736  constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
737 
738  constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
739 
740  constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
741 
742  constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
743 
744  static_assert(blocks1 <= 2u, "Invalid block size!");
745 
746  static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
747 
748  const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
749 
750  const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
751  const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
752 
753  __m128i sum0_128i = _mm_setzero_si128();
754  __m128i sum1_128i = _mm_setzero_si128();
755 
756  uint32_t sumIndividual = 0u;
757 
758  for (unsigned int n = 0u; n < blocks16; ++n)
759  {
760  const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)buffer0);
761  const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)buffer1);
762 
763  const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
764  const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
765 
766  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
767  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
768 
769  buffer0 += 16;
770  buffer1 += 16;
771  }
772 
773  if constexpr (partialBlock16)
774  {
775  constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
776 
777  static_assert(overlapElements < 8u, "Invalid value!");
778 
779  const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer0 - overlapElements)), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the right
780  const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer1 - overlapElements)), overlapElements);
781 
782  const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
783  const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
784 
785  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
786  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
787 
788  buffer0 += remainingAfterBlocks16;
789  buffer1 += remainingAfterBlocks16;
790  }
791 
792  if constexpr (fullBlock8)
793  {
794  const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)buffer0); // load for unaligned 64 bit memory
795  const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)buffer1); // load for unaligned 64 bit memory
796 
797  const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
798  const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
799 
800  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
801  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
802 
803  buffer0 += 8;
804  buffer1 += 8;
805  }
806 
807  if constexpr (partialBlock8)
808  {
809  constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
810 
811  static_assert(overlapElements < 8u, "Invalid value!");
812 
813  const __m128i buffer0_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer0 - overlapElements)), overlapElements); // loading 8 elements, but shifting `overlapElements` zeros to the right
814  const __m128i buffer1_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer1 - overlapElements)), overlapElements);
815 
816  const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
817  const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
818 
819  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
820  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
821 
822  buffer0 += remainingAfterBlocks16;
823  buffer1 += remainingAfterBlocks16;
824  }
825 
826  if constexpr (blocks1 != 0u)
827  {
828  for (unsigned int n = 0u; n < blocks1; ++n)
829  {
830  sumIndividual += sqrDistance(buffer0[n] - meanValues0[0], buffer1[n] - meanValues1[0]);
831  }
832 
833  buffer0 += blocks1;
834  buffer1 += blocks1;
835  }
836 
837  return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
838 }
839 
840 template <>
841 template <unsigned int tPixels>
842 inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
843 {
844  static_assert(tPixels >= 5u, "Invalid pixel number!");
845 
846  constexpr unsigned int tChannels = 3u;
847 
848  ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
849  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
850 
851  constexpr unsigned int bufferElements = tChannels * tPixels;
852 
853  constexpr unsigned int blocks48 = bufferElements / 48u;
854  constexpr unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
855 
856  constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
857 
858  constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
859 
860  constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
861 
862  constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
863 
864  constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
865 
866  constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
867 
868  constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
869 
870  constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
871 
872  constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
873 
874  static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
875 
876  static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
877 
878  const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
879 
880  const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
881  const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
882  const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
883 
884  const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
885  const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
886  const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
887 
888  __m128i sum0_128i = _mm_setzero_si128();
889  __m128i sum1_128i = _mm_setzero_si128();
890 
891  uint32_t sumIndividual = 0u;
892 
893  for (unsigned int n = 0u; n < blocks48; ++n)
894  {
895  const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 0));
896  const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 16));
897  const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 32));
898 
899  __m128i channel0_0_128i;
900  __m128i channel0_1_128i;
901  __m128i channel0_2_128i;
902  SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
903 
904  const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 0));
905  const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 16));
906  const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 32));
907 
908  __m128i channel1_0_128i;
909  __m128i channel1_1_128i;
910  __m128i channel1_2_128i;
911  SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
912 
913  __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)); // (channel0_0 - mean0) - (channel1_0 - mean1)
914  __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
915 
916  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
917  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
918 
919  absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
920  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
921 
922  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
923  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
924 
925  absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
926  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
927 
928  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
929  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
930 
931  buffer0 += 48;
932  buffer1 += 48;
933  }
934 
935  if constexpr (partialBlock48)
936  {
937  constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
938  constexpr int overlappingPixels = overlappingElements / int(tChannels);
939 
940  const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(buffer0)), overlappingElements);
941  const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(buffer0 - overlappingElements + 16));
942  const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(buffer0 - overlappingElements + 32));
943 
944  __m128i channel0_0_128i;
945  __m128i channel0_1_128i;
946  __m128i channel0_2_128i;
947  SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
948 
949  const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(buffer1)), overlappingElements);
950  const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(buffer1 - overlappingElements + 16));
951  const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(buffer1 - overlappingElements + 32));
952 
953  __m128i channel1_0_128i;
954  __m128i channel1_1_128i;
955  __m128i channel1_2_128i;
956  SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
957 
958  __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2); // (channel0_0 - mean0) - (channel1_0 - mean1)
959  __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
960 
961  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
962  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
963 
964  absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
965  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
966 
967  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
968  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
969 
970  absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
971  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
972 
973  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
974  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
975 
976  buffer0 += remainingAfterFullBlocks48;
977  buffer1 += remainingAfterFullBlocks48;
978  }
979 
980  for (unsigned int n = 0u; n < blocks24; ++n)
981  {
982  const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 0));
983  const __m128i buffer0B_128i = _mm_loadl_epi64((const __m128i*)(buffer0 + 16)); // load for unaligned 64 bit memory
984 
985  __m128i channel0_01_128i;
986  __m128i channel0_2_128i;
987  SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
988 
989  const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 0));
990  const __m128i buffer1B_128i = _mm_loadl_epi64((const __m128i*)(buffer1 + 16)); // load for unaligned 64 bit memory
991 
992  __m128i channel1_01_128i;
993  __m128i channel1_2_128i;
994  SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
995 
996  __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)); // (channel0_01 - mean0) - (channel1_01 - mean1)
997  __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
998 
999  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1000  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1001 
1002  absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1003 
1004  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1005 
1006  buffer0 += 24;
1007  buffer1 += 24;
1008  }
1009 
1010  for (unsigned int n = 0u; n < blocks21; ++n)
1011  {
1012  const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 0));
1013  const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer0 + 16 - 3)), 3); // load for unaligned 64 bit memory
1014 
1015  __m128i channel0_01_128i;
1016  __m128i channel0_2_128i;
1017  SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
1018 
1019  const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 0));
1020  const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer1 + 16 - 3)), 3); // load for unaligned 64 bit memory
1021 
1022  __m128i channel1_01_128i;
1023  __m128i channel1_2_128i;
1024  SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
1025 
1026  __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2); // (channel0_01 - mean0) - (channel1_01 - mean1)
1027  __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1028 
1029  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1030  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1031 
1032  absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1033 
1034  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1035 
1036  buffer0 += 21;
1037  buffer1 += 21;
1038  }
1039 
1040  for (unsigned int n = 0u; n < blocks15; ++n)
1041  {
1042  const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer0 - 1)), 1);
1043 
1044  __m128i channel0_01_128i;
1045  __m128i channel0_2_128i;
1046  SSE::deInterleave3Channel8Bit15Elements(buffer0_128i, channel0_01_128i, channel0_2_128i);
1047 
1048  const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer1 - 1)), 1);
1049 
1050  __m128i channel1_01_128i;
1051  __m128i channel1_2_128i;
1052  SSE::deInterleave3Channel8Bit15Elements(buffer1_128i, channel1_01_128i, channel1_2_128i);
1053 
1054  __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6); // (channel0_01 - mean0) - (channel1_01 - mean1)
1055  __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1056 
1057  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1058  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1059 
1060  absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1061 
1062  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1063 
1064  buffer0 += 15;
1065  buffer1 += 15;
1066  }
1067 
1068  if constexpr (blocks1 != 0u)
1069  {
1070  constexpr unsigned int pixels = blocks1 / 3u;
1071 
1072  for (unsigned int x = 0u; x < pixels; ++x)
1073  {
1074  for (unsigned int n = 0u; n < 3u; ++n)
1075  {
1076  sumIndividual += sqrDistance(buffer0[x * 3u + n] - meanValues0[n], buffer1[x * 3u + n] - meanValues1[n]);
1077  }
1078  }
1079 
1080  buffer0 += blocks1;
1081  buffer1 += blocks1;
1082  }
1083 
1084  return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
1085 }
1086 
1087 template <unsigned int tChannels>
1088 template <unsigned int tPixels>
1089 inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1090 {
1091  static_assert(tChannels >= 1u, "Invalid channel number!");
1092  static_assert(tPixels >= 1u, "Invalid patch size!");
1093 
1094  ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1095  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1096 
1097  uint32_t ssd = 0u;
1098 
1099  for (unsigned int n = 0u; n < tPixels; ++n)
1100  {
1101  for (unsigned int c = 0u; c < tChannels; ++c)
1102  {
1103  ssd += sqrDistance(buffer0[n * tChannels + c] - meanValues0[c], buffer1[n * tChannels + c] - meanValues1[c]);
1104  }
1105  }
1106 
1107  return ssd;
1108 }
1109 
1110 template <>
1111 template <unsigned int tPatchSize>
1112 inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1113 {
1114  static_assert(tPatchSize >= 1u, "Invalid patch size!");
1115 
1116  constexpr unsigned int tChannels = 1u;
1117 
1118  ocean_assert(patch0 != nullptr && patch1 != nullptr);
1119  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1120 
1121  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1122  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1123 
1124  constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
1125 
1126  constexpr unsigned int blocks16 = patchWidthElements / 16u;
1127  constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
1128 
1129  constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
1130 
1131  constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
1132 
1133  constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
1134 
1135  constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
1136 
1137  static_assert(blocks1 <= 2u, "Invalid block size!");
1138 
1139  static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
1140 
1141  const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
1142 
1143  const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
1144  const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
1145 
1146  __m128i sum0_128i = _mm_setzero_si128();
1147  __m128i sum1_128i = _mm_setzero_si128();
1148 
1149  uint32_t sumIndividual = 0u;
1150 
1151  for (unsigned int y = 0u; y < tPatchSize; ++y)
1152  {
1153  SSE::prefetchT0(patch0 + patch0StrideElements);
1154  SSE::prefetchT0(patch1 + patch1StrideElements);
1155 
1156  for (unsigned int n = 0u; n < blocks16; ++n)
1157  {
1158  const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)patch0);
1159  const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)patch1);
1160 
1161  const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
1162  const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1163 
1164  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1165  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1166 
1167  patch0 += 16;
1168  patch1 += 16;
1169  }
1170 
1171  if constexpr (fullBlock8)
1172  {
1173  const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)patch0); // load for unaligned 64 bit memory
1174  const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)patch1); // load for unaligned 64 bit memory
1175 
1176  const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
1177  const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1178 
1179  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1180  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1181 
1182  patch0 += 8;
1183  patch1 += 8;
1184  }
1185 
1186  if constexpr (partialBlock16)
1187  {
1188  constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
1189 
1190  static_assert(overlapElements < 8u, "Invalid value!");
1191 
1192  if (y < tPatchSize - 1u)
1193  {
1194  const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)patch0); // loading 16 elements
1195  const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)patch1);
1196 
1197  const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
1198  const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1199 
1200  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1201  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1202  }
1203  else
1204  {
1205  const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)(patch0 - overlapElements)); // loading 16 elements
1206  const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)(patch1 - overlapElements));
1207 
1208  const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2); // (buffer0 - mean0) - (buffer1 - mean1)
1209  const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1210 
1211  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1212  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1213  }
1214 
1215  patch0 += remainingAfterBlocks16;
1216  patch1 += remainingAfterBlocks16;
1217  }
1218 
1219  if constexpr (partialBlock8)
1220  {
1221  constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
1222 
1223  static_assert(overlapElements < 8u, "Invalid value!");
1224 
1225  if (y < tPatchSize - 1u)
1226  {
1227  const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)patch0);// loading 8 elements
1228  const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)patch1);
1229 
1230  const __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2); // (buffer0 - mean0) - (buffer1 - mean1)
1231 
1232  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1233  }
1234  else
1235  {
1236  const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)(patch0 - overlapElements)); // loading 8 elements
1237  const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)(patch1 - overlapElements));
1238 
1239  const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2); // (buffer0 - mean0) - (buffer1 - mean1)
1240 
1241  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1242  }
1243 
1244  patch0 += remainingAfterBlocks16;
1245  patch1 += remainingAfterBlocks16;
1246  }
1247 
1248  if constexpr (blocks1 != 0u)
1249  {
1250  for (unsigned int n = 0u; n < blocks1; ++n)
1251  {
1252  sumIndividual += sqrDistance(patch0[n] - meanValues0[0], patch1[n] - meanValues1[0]);
1253  }
1254 
1255  patch0 += blocks1;
1256  patch1 += blocks1;
1257  }
1258 
1259  patch0 += patch0StrideElements - patchWidthElements;
1260  patch1 += patch1StrideElements - patchWidthElements;
1261  }
1262 
1263  return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
1264 }
1265 
/**
 * Zero-mean sum of square differences between two square image patches with 3 interleaved 8 bit channels (SSE implementation).
 * For every element, the square of ((patch0 - mean0) - (patch1 - mean1)) is accumulated, using the mean values of the element's channel.
 * Each patch row holds 3 * tPatchSize elements and is split at compile time into blocks of 48, 24, 21, and 15 elements which are handled with SSE, followed by a scalar remainder.
 * @param patch0 The first patch, pointing to the first element of its first row, must be valid
 * @param patch1 The second patch, pointing to the first element of its first row, must be valid
 * @param patch0StrideElements The number of elements between two consecutive rows of the first patch, must be at least 3 * tPatchSize
 * @param patch1StrideElements The number of elements between two consecutive rows of the second patch, must be at least 3 * tPatchSize
 * @param meanValues0 The three mean values of the first patch, one for each channel, must be valid
 * @param meanValues1 The three mean values of the second patch, one for each channel, must be valid
 * @return The sum of all four 32 bit lanes of both SSE accumulators plus the scalar remainder sum
 * @tparam tPatchSize The edge length of the square patch, in pixels, must be at least 5
 */
1266 template <>
1267 template <unsigned int tPatchSize>
1268 inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1269 {
1270  static_assert(tPatchSize >= 5u, "Invalid patch size!");
1271 
1272  constexpr unsigned int tChannels = 3u;
1273 
1274  ocean_assert(patch0 != nullptr && patch1 != nullptr);
1275  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1276 
1277  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1278  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1279 
 // compile-time decomposition of one patch row (3 * tPatchSize elements) into the block sizes handled below
1280  constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
1281 
1282  constexpr unsigned int blocks48 = patchWidthElements / 48u;
1283  constexpr unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
1284 
 // a remainder of more than 32 elements is handled as one additional (zero-padded) 48-element block, in which case no smaller blocks remain
1285  constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
1286 
1287  constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
1288 
1289  constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
1290 
1291  constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
1292 
1293  constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
1294 
1295  constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
1296 
1297  constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
1298 
1299  constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
1300 
 // everything left over is processed element by element
1301  constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
1302 
 // the scalar remainder loop below iterates over whole pixels (3 elements each)
1303  static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
1304 
 // the sign constant below is created via short(...) and is used as a 16 bit value
1305  static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
1306 
 // each 16 bit lane is 0x01FF, i.e., the signed byte pair (-1, +1);
 // _mm_maddubs_epi16 applied to interleaved (mean, value) bytes with these signs yields (value - mean) as signed 16 bit
1307  const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
1308 
 // the mean value of each channel, broadcast to all 16 bytes, for the first patch ...
1309  const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
1310  const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
1311  const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
1312 
 // ... and for the second patch
1313  const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
1314  const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
1315  const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
1316 
 // two accumulators, each holding four 32 bit partial sums
1317  __m128i sum0_128i = _mm_setzero_si128();
1318  __m128i sum1_128i = _mm_setzero_si128();
1319 
 // accumulator for the elements handled without SSE
1320  uint32_t sumIndividual = 0u;
1321 
1322  for (unsigned int y = 0u; y < tPatchSize; ++y)
1323  {
 // at this point patch0/patch1 point to the start of the current row, so the next row of each patch is prefetched
1324  SSE::prefetchT0(patch0 + patch0StrideElements);
1325  SSE::prefetchT0(patch1 + patch1StrideElements);
1326 
 // full blocks with 48 elements (16 pixels): three 16 byte loads per patch, de-interleaved into one register per channel
1327  for (unsigned int n = 0u; n < blocks48; ++n)
1328  {
1329  const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 0));
1330  const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 16));
1331  const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 32));
1332 
1333  __m128i channel0_0_128i;
1334  __m128i channel0_1_128i;
1335  __m128i channel0_2_128i;
1336  SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
1337 
1338  const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 0));
1339  const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 16));
1340  const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 32));
1341 
1342  __m128i channel1_0_128i;
1343  __m128i channel1_1_128i;
1344  __m128i channel1_2_128i;
1345  SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
1346 
 // first channel: lower and upper 8 pixels as signed 16 bit differences
1347  __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)); // (channel0_0 - mean0) - (channel1_0 - mean1)
1348  __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1349 
1350  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1351  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1352 
 // second channel
1353  absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1354  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1355 
1356  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1357  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1358 
 // third channel
1359  absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1360  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1361 
1362  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1363  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1364 
1365  patch0 += 48;
1366  patch1 += 48;
1367  }
1368 
 // one partial block with 33 to 47 remaining elements, processed like a full 48-element block
1369  if constexpr (partialBlock48)
1370  {
 // number of elements (and pixels) missing to a full 48-element block
1371  constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
1372  constexpr int overlappingPixels = overlappingElements / int(tChannels);
1373 
 // the first register is loaded at the current position and shifted left, so that its lowest 'overlappingElements' bytes are zero;
 // the second and third register are loaded 'overlappingElements' bytes earlier, so that the three registers end with the last remaining element
1374  const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(patch0)), overlappingElements);
1375  const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(patch0 - overlappingElements + 16));
1376  const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(patch0 - overlappingElements + 32));
1377 
1378  __m128i channel0_0_128i;
1379  __m128i channel0_1_128i;
1380  __m128i channel0_2_128i;
1381  SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
1382 
1383  const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(patch1)), overlappingElements);
1384  const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(patch1 - overlappingElements + 16));
1385  const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(patch1 - overlappingElements + 32));
1386 
1387  __m128i channel1_0_128i;
1388  __m128i channel1_1_128i;
1389  __m128i channel1_2_128i;
1390  SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
1391 
 // the lowest 'overlappingPixels' 16 bit lanes stem from the zero padding and are shifted out before squaring (only the lower halves are affected)
1392  __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2); // (channel0_0 - mean0) - (channel1_0 - mean1)
1393  __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1394 
1395  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1396  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1397 
1398  absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
1399  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1400 
1401  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1402  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1403 
1404  absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
1405  absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1406 
1407  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1408  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1409 
 // only the remaining elements have been consumed, not a full block of 48
1410  patch0 += remainingAfterFullBlocks48;
1411  patch1 += remainingAfterFullBlocks48;
1412  }
1413 
 // blocks with 24 elements (8 pixels): one 16 byte load plus one 8 byte load per patch
1414  for (unsigned int n = 0u; n < blocks24; ++n)
1415  {
1416  const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 0));
1417  const __m128i buffer0B_128i = _mm_loadl_epi64((const __m128i*)(patch0 + 16)); // load for unaligned 64 bit memory
1418 
1419  __m128i channel0_01_128i;
1420  __m128i channel0_2_128i;
1421  SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
1422 
1423  const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 0));
1424  const __m128i buffer1B_128i = _mm_loadl_epi64((const __m128i*)(patch1 + 16)); // load for unaligned 64 bit memory
1425 
1426  __m128i channel1_01_128i;
1427  __m128i channel1_2_128i;
1428  SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
1429 
 // the lower half of channel*_01 is combined with the means of the first channel, the upper half with the means of the second channel
1430  __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)); // (channel0_01 - mean0) - (channel1_01 - mean1)
1431  __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
1432 
1433  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1434  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1435 
 // third channel, only the lower half of channel*_2 is used
1436  absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1437 
1438  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1439 
1440  patch0 += 24;
1441  patch1 += 24;
1442  }
1443 
 // blocks with 21 elements (7 pixels), handled with the de-interleaving function for 24 elements
1444  for (unsigned int n = 0u; n < blocks21; ++n)
1445  {
 // the 8 byte load starts 3 bytes earlier (at element 13) and is shifted right by 3 bytes,
 // so that only elements 16..20 remain and no memory beyond the 21 elements is read
1446  const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 0));
1447  const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch0 + 16 - 3)), 3); // load for unaligned 64 bit memory
1448 
1449  __m128i channel0_01_128i;
1450  __m128i channel0_2_128i;
1451  SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
1452 
1453  const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 0));
1454  const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch1 + 16 - 3)), 3); // load for unaligned 64 bit memory
1455 
1456  __m128i channel1_01_128i;
1457  __m128i channel1_2_128i;
1458  SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
1459 
 // shifting left by 2 bytes drops the highest 16 bit lane (the 8th, zero-padded pixel), so that 7 pixels contribute
1460  __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2); // (channel0_01 - mean0) - (channel1_01 - mean1)
1461  __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1462 
1463  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1464  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1465 
1466  absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1467 
1468  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1469 
1470  patch0 += 21;
1471  patch1 += 21;
1472  }
1473 
 // blocks with 15 elements (5 pixels), loaded with one 16 byte load
1474  for (unsigned int n = 0u; n < blocks15; ++n)
1475  {
 // in all rows but the last, the load covers one byte beyond the 15 elements;
 // in the last row the load starts one byte earlier and is shifted right by one byte instead
 // NOTE(review): presumably to avoid reading beyond the end of the last patch row - confirm
1476  const __m128i buffer0_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((const __m128i*)(patch0)) : _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch0 - 1)), 1);
1477 
1478  __m128i channel0_01_128i;
1479  __m128i channel0_2_128i;
1480  SSE::deInterleave3Channel8Bit15Elements(buffer0_128i, channel0_01_128i, channel0_2_128i);
1481 
1482  const __m128i buffer1_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((const __m128i*)(patch1)) : _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch1 - 1)), 1);
1483 
1484  __m128i channel1_01_128i;
1485  __m128i channel1_2_128i;
1486  SSE::deInterleave3Channel8Bit15Elements(buffer1_128i, channel1_01_128i, channel1_2_128i);
1487 
 // shifting left by 6 bytes drops the three highest 16 bit lanes, so that only 5 pixels contribute
1488  __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6); // (channel0_01 - mean0) - (channel1_01 - mean1)
1489  __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1490 
1491  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1492  sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1493 
1494  absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1495 
1496  sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1497 
1498  patch0 += 15;
1499  patch1 += 15;
1500  }
1501 
 // scalar handling of the remaining pixels of this row
1502  if constexpr (blocks1 != 0u)
1503  {
1504  constexpr unsigned int pixels = blocks1 / 3u;
1505 
1506  for (unsigned int x = 0u; x < pixels; ++x)
1507  {
1508  for (unsigned int n = 0u; n < 3u; ++n)
1509  {
1510  sumIndividual += sqrDistance(patch0[x * 3u + n] - meanValues0[n], patch1[x * 3u + n] - meanValues1[n]);
1511  }
1512  }
1513 
1514  patch0 += blocks1;
1515  patch1 += blocks1;
1516  }
1517 
 // both pointers have advanced by patchWidthElements within this row, the remaining stride moves them to the start of the next row
1518  patch0 += patch0StrideElements - patchWidthElements;
1519  patch1 += patch1StrideElements - patchWidthElements;
1520  }
1521 
 // horizontal sum of both SSE accumulators plus the scalar part
1522  return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
1523 }
1524 
1525 template <unsigned int tChannels>
1526 template <unsigned int tPatchSize>
1527 inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1528 {
1529  static_assert(tChannels >= 1u, "Invalid channel number!");
1530  static_assert(tPatchSize >= 1u, "Invalid patch size!");
1531 
1532  ocean_assert(patch0 != nullptr && patch1 != nullptr);
1533  ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1534 
1535  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1536  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1537 
1538  uint32_t ssd = 0u;
1539 
1540  for (unsigned int y = 0u; y < tPatchSize; ++y)
1541  {
1542  for (unsigned int x = 0u; x < tPatchSize; ++x)
1543  {
1544  for (unsigned int n = 0u; n < tChannels; ++n)
1545  {
1546  ssd += sqrDistance(patch0[x * tChannels + n] - meanValues0[n], patch1[x * tChannels + n] - meanValues1[n]);
1547  }
1548  }
1549 
1550  patch0 += patch0StrideElements;
1551  patch1 += patch1StrideElements;
1552  }
1553 
1554  return ssd;
1555 }
1556 
1557 template <unsigned int tChannels, unsigned int tPixels>
1558 inline uint32_t ZeroMeanSumSquareDifferencesSSE::buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1)
1559 {
1560  static_assert(tChannels >= 1u, "Invalid channel number!");
1561  static_assert(tPixels >= 8u, "Invalid patch size!");
1562 
1563  ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1564 
1565  uint8_t meanValues0[tChannels];
1566  mean8BitPerChannel<tChannels, tPixels>(buffer0, meanValues0);
1567 
1568  uint8_t meanValues1[tChannels];
1569  mean8BitPerChannel<tChannels, tPixels>(buffer1, meanValues1);
1570 
1571  return SpecializedForChannels<tChannels>::template buffer8BitPerChannel<tPixels>(buffer0, buffer1, meanValues0, meanValues1);
1572 }
1573 
1574 template <unsigned int tChannels, unsigned int tPatchSize>
1575 inline uint32_t ZeroMeanSumSquareDifferencesSSE::patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
1576 {
1577  static_assert(tChannels >= 1u, "Invalid channel number!");
1578  static_assert(tPatchSize >= 5u, "Invalid patch size!");
1579 
1580  ocean_assert(patch0 != nullptr && patch1 != nullptr);
1581 
1582  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1583  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1584 
1585  uint8_t meanValues0[tChannels];
1586  mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
1587 
1588  uint8_t meanValues1[tChannels];
1589  mean8BitPerChannel<tChannels, tPatchSize>(patch1, patch1StrideElements, meanValues1);
1590 
1591  return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, patch1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
1592 }
1593 
1594 template <unsigned int tChannels, unsigned int tPatchSize>
1595 inline uint32_t ZeroMeanSumSquareDifferencesSSE::patchBuffer8BitPerChannel(const uint8_t* const patch0, const uint8_t* const buffer1, const unsigned int patch0StrideElements)
1596 {
1597  static_assert(tChannels >= 1u, "Invalid channel number!");
1598  static_assert(tPatchSize >= 5u, "Invalid patch size!");
1599 
1600  ocean_assert(patch0 != nullptr && buffer1 != nullptr);
1601 
1602  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1603 
1604  uint8_t meanValues0[tChannels];
1605  mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
1606 
1607  uint8_t meanValues1[tChannels];
1608  mean8BitPerChannel<tChannels, tPatchSize * tPatchSize>(buffer1, meanValues1);
1609 
1610  constexpr unsigned int patch1StrideElements = tChannels * tPatchSize;
1611 
1612  return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, buffer1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
1613 }
1614 
1615 template <unsigned int tChannels, unsigned int tPixels>
1616 OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesSSE::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
1617 {
1618  static_assert(tChannels >= 1u, "Invalid channel number!");
1619  static_assert(tPixels >= 8u, "Invalid patch size!");
1620 
1621  SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPixels>(buffer, meanValues);
1622 }
1623 
1624 template <unsigned int tChannels, unsigned int tPatchSize>
1625 OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesSSE::mean8BitPerChannel(const uint8_t* const patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
1626 {
1627  static_assert(tChannels >= 1u, "Invalid channel number!");
1628  static_assert(tPatchSize >= 5u, "Invalid patch size!");
1629 
1630  SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPatchSize>(patch, patchStrideElements, meanValues);
1631 }
1632 
1633 }
1634 
1635 }
1636 
1637 #endif // OCEAN_HARDWARE_SSE_VERSION >= 41
1638 
1639 #endif // META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of, e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3277
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
Definition: SSE.h:1340
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of, e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3289
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of, e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3304
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition: SSE.h:1322
This class allows to specialize functions for individual channels.
Definition: ZeroMeanSumSquareDifferencesSSE.h:39
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1527
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1089
static void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesSSE.h:389
This class implements functions to calculate zero-mean sum square differences using SSE instructions.
Definition: ZeroMeanSumSquareDifferencesSSE.h:30
static uint32_t patchBuffer8BitPerChannel(const uint8_t *const patch0, const uint8_t *const buffer1, const unsigned int patch0StrideElements)
Returns the zero-mean sum of square differences between an image patch and a buffer.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1595
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1558
static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1616
static uint32_t patch8BitPerChannel(const uint8_t *const patch0, const uint8_t *const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the zero-mean sum of square differences between two patches within an image.
Definition: ZeroMeanSumSquareDifferencesSSE.h:1575
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15