Ocean
Loading...
Searching...
No Matches
ZeroMeanSumSquareDifferencesSSE.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
9#define META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
10
11#include "ocean/cv/CV.h"
12
14
15#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
16
17#include "ocean/cv/SSE.h"
18
19namespace Ocean
20{
21
22namespace CV
23{
24
25/**
26 * This class implements functions to calculate zero-mean sum square differences using SSE instructions.
27 * @ingroup cv
28 */
30{
31 protected:
32
33 /**
34 * This class allows to specialize functions for individual channels.
35 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
36 */
37 template <unsigned int tChannels>
39 {
40 public:
41
42 /**
43 * Determines the mean value for a buffer, one value for each channel.
44 * @param buffer The memory buffer to be handled, must be valid
45 * @param meanValues The resulting mean values, one for each channel
46 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
47 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
48 */
49 template <unsigned int tPixels>
50 static inline void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
51
52 /**
53 * Determines the mean value for an image patch, one value for each channel.
54 * @param patch The top left start position of the image patch, must be valid
55 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
56 * @param meanValues The resulting mean values, one for each channel
57 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
58 */
59 template <unsigned int tPatchSize>
60 static inline void mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
61
62 /**
63 * Returns the zero-mean sum of square differences between two memory buffers.
64 * @param buffer0 The first memory buffer, must be valid
65 * @param buffer1 The second memory buffer, must be valid
66 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
67 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
68 * @return The resulting sum of square differences
69 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
70 */
71 template <unsigned int tPixels>
72 static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
73
74 /**
75 * Returns the zero-mean sum of square differences between two patches within an image.
76 * @param patch0 The top left start position of the first image patch, must be valid
77 * @param patch1 The top left start position of the second image patch, must be valid
78 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
79 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
80 * @param meanValues0 The mean values of the first patch, one for each channel, must be valid
81 * @param meanValues1 The mean values of the second patch, one for each channel, must be valid
82 * @return The resulting sum of square differences
83 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
84 */
85 template <unsigned int tPatchSize>
86 static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1);
87 };
88
89 public:
90
91 /**
92 * Returns the zero-mean sum of square differences between two memory buffers.
93 * @param buffer0 The first memory buffer, must be valid
94 * @param buffer1 The second memory buffer, must be valid
95 * @return The resulting sum of square differences
96 * @tparam tChannels Specifies the number of channels for the given buffers, with range [1, infinity)
97 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
98 */
99 template <unsigned int tChannels, unsigned int tPixels>
100 static inline uint32_t buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1);
101
102 /**
103 * Returns the zero-mean sum of square differences between two patches within an image.
104 * @param patch0 The top left start position of the first image patch, must be valid
105 * @param patch1 The top left start position of the second image patch, must be valid
106 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
107 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
108 * @return The resulting sum of square differences
109 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
110 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
111 */
112 template <unsigned int tChannels, unsigned int tPatchSize>
113 static inline uint32_t patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);
114
115 /**
116 * Returns the zero-mean sum of square differences between an image patch and a buffer.
117 * @param patch0 The top left start position of the image patch, must be valid
118 * @param buffer1 The memory buffer, must be valid
119 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
120 * @return The resulting sum of square differences
121 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
122 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
123 */
124 template <unsigned int tChannels, unsigned int tPatchSize>
125 static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* const patch0, const uint8_t* const buffer1, const unsigned int patch0StrideElements);
126
127 /**
128 * Determines the mean value for a buffer, one value for each channel.
129 * @param buffer The memory buffer to be handled, must be valid
130 * @param meanValues The resulting mean values, one for each channel
131 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
132 * @tparam tPixels The number of pixels in the buffer, in pixels, with range [8, infinity)
133 */
134 template <unsigned int tChannels, unsigned int tPixels>
135 static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues);
136
137 /**
138 * Determines the mean value for an image patch, one value for each channel.
139 * @param patch The top left start position of the image patch, must be valid
140 * @param patchStrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
141 * @param meanValues The resulting mean values, one for each channel
142 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
143 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
144 */
145 template <unsigned int tChannels, unsigned int tPatchSize>
146 static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t* const patch, const unsigned int patchStrideElements, uint8_t* const meanValues);
147};
148
149template <>
150template <unsigned int tPixels>
151inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
152{
153 static_assert(tPixels >= 8u, "Invalid buffer size!");
154
155 constexpr unsigned int tChannels = 1u;
156
157 ocean_assert(buffer != nullptr && meanValues != nullptr);
158
159 constexpr unsigned int bufferElements = tChannels * tPixels;
160
161 constexpr unsigned int blocks16 = bufferElements / 16u;
162 constexpr unsigned int remainingAfterBlocks16 = bufferElements % 16u;
163
164 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
165
166 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
167
168 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
169
170 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
171
172 static_assert(blocks1 <= 2u, "Invalid block size!");
173
174 __m128i sum_128i = _mm_setzero_si128();
175
176 uint32_t sumIndividual = 0u;
177
178 for (unsigned int n = 0u; n < blocks16; ++n)
179 {
180 const __m128i buffer_128i = _mm_lddqu_si128((const __m128i*)buffer);
181
182 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
183
184 buffer += 16;
185 }
186
187 if constexpr (partialBlock16)
188 {
189 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
190
191 static_assert(overlapElements < 8u, "Invalid value!");
192
193 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer - overlapElements)), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the right
194
195 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
196
197 buffer += remainingAfterBlocks16;
198 }
199
200 if constexpr (fullBlock8)
201 {
202 const __m128i buffer_128i = _mm_loadl_epi64((const __m128i*)buffer); // load for unaligned 64 bit memory
203
204 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
205
206 buffer += 8;
207 }
208
209 if constexpr (partialBlock8)
210 {
211 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
212
213 static_assert(overlapElements < 8u, "Invalid value!");
214
215 const __m128i buffer_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer - overlapElements)), overlapElements); // loading 8 elements, but shifting `overlapElements` zeros to the right
216
217 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
218
219 buffer += remainingAfterBlocks16;
220 }
221
222 if constexpr (blocks1 != 0u)
223 {
224 for (unsigned int n = 0u; n < blocks1; ++n)
225 {
226 sumIndividual += buffer[n];
227 }
228 }
229
230 const uint32_t sum = SSE::sum_u32_first_third(sum_128i) + sumIndividual;
231
232 meanValues[0] = uint8_t((sum + tPixels / 2u) / tPixels);
233}
234
235template <>
236template <unsigned int tPixels>
237inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* buffer, uint8_t* const meanValues)
238{
239 static_assert(tPixels >= 8u, "Invalid buffer size!");
240
241 constexpr unsigned int tChannels = 3u;
242
243 ocean_assert(buffer != nullptr && meanValues != nullptr);
244
245 constexpr unsigned int bufferElements = tChannels * tPixels;
246
247 constexpr unsigned int blocks48 = bufferElements / 48u;
248 constexpr unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
249
250 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
251
252 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
253
254 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
255
256 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
257
258 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
259
260 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
261
262 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
263
264 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
265
266 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
267
268 static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
269
270 __m128i sumChannel0_128i = _mm_setzero_si128();
271 __m128i sumChannel1_128i = _mm_setzero_si128();
272 __m128i sumChannel2_128i = _mm_setzero_si128();
273
274 uint32_t sumIndividual[3] = {0u};
275
276 for (unsigned int n = 0u; n < blocks48; ++n)
277 {
278 const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(buffer + 0));
279 const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(buffer + 16));
280 const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(buffer + 32));
281
282 __m128i channel0;
283 __m128i channel1;
284 __m128i channel2;
285 SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2);
286
287 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
288 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
289 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
290
291 buffer += 48;
292 }
293
294 if constexpr (partialBlock48)
295 {
296 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
297
298 const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(buffer)), overlappingElements);
299 const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(buffer - overlappingElements + 16));
300 const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(buffer - overlappingElements + 32));
301
302 __m128i channel0;
303 __m128i channel1;
304 __m128i channel2;
305 SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2);
306
307 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
308 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
309 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
310
311 buffer += remainingAfterFullBlocks48;
312 }
313
314 for (unsigned int n = 0u; n < blocks24; ++n)
315 {
316 const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(buffer + 0));
317 const __m128i bufferB_128i = _mm_loadl_epi64((const __m128i*)(buffer + 16)); // load for unaligned 64 bit memory
318
319 __m128i channel01_128i;
320 __m128i channel2_128i;
321 SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i);
322
323 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
324
325 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
326 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
327 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
328
329 buffer += 24;
330 }
331
332 for (unsigned int n = 0u; n < blocks21; ++n)
333 {
334 const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(buffer + 0));
335 const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer + 16 - 3)), 3); // load for unaligned 64 bit memory
336
337 __m128i channel01_128i;
338 __m128i channel2_128i;
339 SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i);
340
341 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
342
343 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
344 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
345 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
346
347 buffer += 21;
348 }
349
350 for (unsigned int n = 0u; n < blocks15; ++n)
351 {
352 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer - 1)), 1);
353
354 __m128i channel01_128i;
355 __m128i channel2_128i;
356 SSE::deInterleave3Channel8Bit15Elements(buffer_128i, channel01_128i, channel2_128i);
357
358 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
359
360 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
361 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
362 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
363
364 buffer += 15;
365 }
366
367 if constexpr (blocks1 != 0u)
368 {
369 constexpr unsigned int pixels = blocks1 / 3u;
370
371 for (unsigned int x = 0u; x < pixels; ++x)
372 {
373 for (unsigned int n = 0u; n < 3u; ++n)
374 {
375 sumIndividual[n] += buffer[x * 3u + n];
376 }
377 }
378
379 buffer += blocks1;
380 }
381
382 meanValues[0] = uint8_t((SSE::sum_u32_first_third(sumChannel0_128i) + sumIndividual[0] + tPixels / 2u) / tPixels);
383 meanValues[1] = uint8_t((SSE::sum_u32_first_third(sumChannel1_128i) + sumIndividual[1] + tPixels / 2u) / tPixels);
384 meanValues[2] = uint8_t((SSE::sum_u32_first_third(sumChannel2_128i) + sumIndividual[2] + tPixels / 2u) / tPixels);
385}
386
387template <unsigned int tChannels>
388template <unsigned int tPixels>
389inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
390{
391 static_assert(tChannels >= 1u, "Invalid channel number!");
392 static_assert(tPixels >= 1u, "Invalid buffer size!");
393
394 ocean_assert(buffer != nullptr && meanValues != nullptr);
395
396 uint32_t sum[tChannels] = {0u};
397
398 for (unsigned int n = 0u; n < tPixels; ++n)
399 {
400 for (unsigned int c = 0u; c < tChannels; ++c)
401 {
402 sum[c] += buffer[n * tChannels + c];
403 }
404 }
405
406 for (unsigned int c = 0u; c < tChannels; ++c)
407 {
408 meanValues[c] = uint8_t((sum[c] + tPixels / 2u) / tPixels);
409 }
410}
411
412template <>
413template <unsigned int tPatchSize>
414inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
415{
416 static_assert(tPatchSize >= 5u, "Invalid patch size!");
417
418 constexpr unsigned int tChannels = 1u;
419
420 ocean_assert(patch != nullptr && meanValues != nullptr);
421
422 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
423
424 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
425
426 constexpr unsigned int blocks16 = patchWidthElements / 16u;
427 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
428
429 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
430
431 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
432
433 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
434
435 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
436
437 static_assert(blocks1 <= 2u, "Invalid block size!");
438
439 __m128i sum_128i = _mm_setzero_si128();
440
441 uint32_t sumIndividual = 0u;
442
443 for (unsigned int y = 0u; y < tPatchSize; ++y)
444 {
445 SSE::prefetchT0(patch + patchStrideElements);
446
447 for (unsigned int n = 0u; n < blocks16; ++n)
448 {
449 const __m128i buffer_128i = _mm_lddqu_si128((const __m128i*)patch);
450
451 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
452
453 patch += 16;
454 }
455
456 if constexpr (fullBlock8)
457 {
458 const __m128i buffer_128i = _mm_loadl_epi64((const __m128i*)patch); // load for unaligned 64 bit memory
459
460 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
461
462 patch += 8;
463 }
464
465 if constexpr (partialBlock16)
466 {
467 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
468
469 static_assert(overlapElements < 8u, "Invalid value!");
470
471 if (y < tPatchSize - 1u)
472 {
473 const __m128i buffer_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)patch), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the left
474
475 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
476 }
477 else
478 {
479 const __m128i buffer_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch - overlapElements)), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the right
480
481 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
482 }
483
484 patch += remainingAfterBlocks16;
485 }
486
487 if constexpr (partialBlock8)
488 {
489 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
490
491 static_assert(overlapElements < 8u, "Invalid value!");
492
493 if (y < tPatchSize - 1u)
494 {
495 const __m128i buffer_128i = _mm_slli_si128(_mm_loadl_epi64((const __m128i*)patch), overlapElements + 8); // loading 8 elements, but shifting `overlapElements` zeros to the left
496
497 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
498 }
499 else
500 {
501 const __m128i buffer_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch - overlapElements)), overlapElements); // loading 8 elements, but shifting `overlapElements` zeros to the right
502
503 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer_128i, _mm_setzero_si128()));
504 }
505
506 patch += remainingAfterBlocks16;
507 }
508
509 if constexpr (blocks1 != 0u)
510 {
511 for (unsigned int n = 0u; n < blocks1; ++n)
512 {
513 sumIndividual += patch[n];
514 }
515
516 patch += blocks1;
517 }
518
519 patch += patchStrideElements - patchWidthElements;
520 }
521
522 const uint32_t sum = SSE::sum_u32_first_third(sum_128i) + sumIndividual;
523
524 meanValues[0] = uint8_t((sum + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
525}
526
527template <>
528template <unsigned int tPatchSize>
529inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
530{
531 static_assert(tPatchSize >= 5u, "Invalid patch size!");
532
533 constexpr unsigned int tChannels = 3u;
534
535 ocean_assert(patch != nullptr && meanValues != nullptr);
536
537 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
538
539 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
540
541 constexpr unsigned int blocks48 = patchWidthElements / 48u;
542 constexpr unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
543
544 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
545
546 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
547
548 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
549
550 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
551
552 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
553
554 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
555
556 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
557
558 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
559
560 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
561
562 static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
563
564 __m128i sumChannel0_128i = _mm_setzero_si128();
565 __m128i sumChannel1_128i = _mm_setzero_si128();
566 __m128i sumChannel2_128i = _mm_setzero_si128();
567
568 uint32_t sumIndividual[3] = {0u};
569
570 for (unsigned int y = 0u; y < tPatchSize; ++y)
571 {
572 SSE::prefetchT0(patch + patchStrideElements);
573
574 for (unsigned int n = 0u; n < blocks48; ++n)
575 {
576 const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(patch + 0));
577 const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(patch + 16));
578 const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(patch + 32));
579
580 __m128i channel0;
581 __m128i channel1;
582 __m128i channel2;
583 SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2);
584
585 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
586 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
587 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
588
589 patch += 48;
590 }
591
592 if constexpr (partialBlock48)
593 {
594 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
595
596 const __m128i bufferA_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(patch)), overlappingElements);
597 const __m128i bufferB_128i = _mm_lddqu_si128((const __m128i*)(patch - overlappingElements + 16));
598 const __m128i bufferC_128i = _mm_lddqu_si128((const __m128i*)(patch - overlappingElements + 32));
599
600 __m128i channel0;
601 __m128i channel1;
602 __m128i channel2;
603 SSE::deInterleave3Channel8Bit48Elements(bufferA_128i, bufferB_128i, bufferC_128i, channel0, channel1, channel2);
604
605 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_sad_epu8(channel0, _mm_setzero_si128()));
606 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_sad_epu8(channel1, _mm_setzero_si128()));
607 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2, _mm_setzero_si128()));
608
609 patch += remainingAfterFullBlocks48;
610 }
611
612 for (unsigned int n = 0u; n < blocks24; ++n)
613 {
614 const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(patch + 0));
615 const __m128i bufferB_128i = _mm_loadl_epi64((const __m128i*)(patch + 16)); // load for unaligned 64 bit memory
616
617 __m128i channel01_128i;
618 __m128i channel2_128i;
619 SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i);
620
621 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
622
623 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
624 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
625 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
626
627 patch += 24;
628 }
629
630 for (unsigned int n = 0u; n < blocks21; ++n)
631 {
632 const __m128i bufferA_128i = _mm_lddqu_si128((const __m128i*)(patch + 0));
633 const __m128i bufferB_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch + 16 - 3)), 3); // load for unaligned 64 bit memory
634
635 __m128i channel01_128i;
636 __m128i channel2_128i;
637 SSE::deInterleave3Channel8Bit24Elements(bufferA_128i, bufferB_128i, channel01_128i, channel2_128i);
638
639 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
640
641 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
642 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
643 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
644
645 patch += 21;
646 }
647
648 for (unsigned int n = 0u; n < blocks15; ++n)
649 {
650 const __m128i buffer_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((const __m128i*)(patch)) : _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch - 1)), 1);
651
652 __m128i channel01_128i;
653 __m128i channel2_128i;
654 SSE::deInterleave3Channel8Bit15Elements(buffer_128i, channel01_128i, channel2_128i);
655
656 const __m128i sumChannel01_128i = _mm_sad_epu8(channel01_128i, _mm_setzero_si128());
657
658 sumChannel0_128i = _mm_add_epi32(sumChannel0_128i, _mm_slli_si128(sumChannel01_128i, 8));
659 sumChannel1_128i = _mm_add_epi32(sumChannel1_128i, _mm_srli_si128(sumChannel01_128i, 8));
660 sumChannel2_128i = _mm_add_epi32(sumChannel2_128i, _mm_sad_epu8(channel2_128i, _mm_setzero_si128()));
661
662 patch += 15;
663 }
664
665 if constexpr (blocks1 != 0u)
666 {
667 constexpr unsigned int pixels = blocks1 / 3u;
668
669 for (unsigned int x = 0u; x < pixels; ++x)
670 {
671 for (unsigned int n = 0u; n < 3u; ++n)
672 {
673 sumIndividual[n] += patch[x * 3u + n];
674 }
675 }
676
677 patch += blocks1;
678 }
679
680 patch += patchStrideElements - patchWidthElements;
681 }
682
683 meanValues[0] = uint8_t((SSE::sum_u32_first_third(sumChannel0_128i) + sumIndividual[0] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
684 meanValues[1] = uint8_t((SSE::sum_u32_first_third(sumChannel1_128i) + sumIndividual[1] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
685 meanValues[2] = uint8_t((SSE::sum_u32_first_third(sumChannel2_128i) + sumIndividual[2] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
686}
687
688template <unsigned int tChannels>
689template <unsigned int tPatchSize>
690inline void ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::mean8BitPerChannel(const uint8_t* patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
691{
692 static_assert(tChannels >= 1u, "Invalid channel number!");
693 static_assert(tPatchSize >= 1u, "Invalid patch size!");
694
695 ocean_assert(patch != nullptr && meanValues != nullptr);
696
697 ocean_assert(patchStrideElements >= tChannels * tPatchSize);
698
699 uint32_t sum[tChannels] = {0u};
700
701 for (unsigned int y = 0u; y < tPatchSize; ++y)
702 {
703 for (unsigned int x = 0u; x < tPatchSize; ++x)
704 {
705 for (unsigned int n = 0u; n < tChannels; ++n)
706 {
707 sum[n] += patch[x * tChannels + n];
708 }
709 }
710
711 patch += patchStrideElements;
712 }
713
714 for (unsigned int n = 0u; n < tChannels; ++n)
715 {
716 meanValues[n] = uint8_t((sum[n] + tPatchSize * tPatchSize / 2u) / (tPatchSize * tPatchSize));
717 }
718}
719
720template <>
721template <unsigned int tPixels>
722inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
723{
724 static_assert(tPixels >= 8u, "Invalid pixel number!");
725
726 constexpr unsigned int tChannels = 1u;
727
728 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
729 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
730
731 constexpr unsigned int bufferElements = tChannels * tPixels;
732
733 constexpr unsigned int blocks16 = bufferElements / 16u;
734 constexpr unsigned int remainingAfterBlocks16 = bufferElements % 16u;
735
736 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
737
738 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
739
740 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
741
742 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
743
744 static_assert(blocks1 <= 2u, "Invalid block size!");
745
746 static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
747
748 const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
749
750 const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
751 const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
752
753 __m128i sum0_128i = _mm_setzero_si128();
754 __m128i sum1_128i = _mm_setzero_si128();
755
756 uint32_t sumIndividual = 0u;
757
758 for (unsigned int n = 0u; n < blocks16; ++n)
759 {
760 const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)buffer0);
761 const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)buffer1);
762
763 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
764 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
765
766 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
767 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
768
769 buffer0 += 16;
770 buffer1 += 16;
771 }
772
773 if constexpr (partialBlock16)
774 {
775 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
776
777 static_assert(overlapElements < 8u, "Invalid value!");
778
779 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer0 - overlapElements)), overlapElements); // loading 16 elements, but shifting `overlapElements` zeros to the right
780 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer1 - overlapElements)), overlapElements);
781
782 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
783 const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
784
785 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
786 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
787
788 buffer0 += remainingAfterBlocks16;
789 buffer1 += remainingAfterBlocks16;
790 }
791
792 if constexpr (fullBlock8)
793 {
794 const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)buffer0); // load for unaligned 64 bit memory
795 const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)buffer1); // load for unaligned 64 bit memory
796
797 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
798 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
799
800 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
801 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
802
803 buffer0 += 8;
804 buffer1 += 8;
805 }
806
807 if constexpr (partialBlock8)
808 {
809 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
810
811 static_assert(overlapElements < 8u, "Invalid value!");
812
813 const __m128i buffer0_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer0 - overlapElements)), overlapElements); // loading 8 elements, but shifting `overlapElements` zeros to the right
814 const __m128i buffer1_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer1 - overlapElements)), overlapElements);
815
816 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
817 const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
818
819 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
820 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
821
822 buffer0 += remainingAfterBlocks16;
823 buffer1 += remainingAfterBlocks16;
824 }
825
826 if constexpr (blocks1 != 0u)
827 {
828 for (unsigned int n = 0u; n < blocks1; ++n)
829 {
830 sumIndividual += sqrDistance(buffer0[n] - meanValues0[0], buffer1[n] - meanValues1[0]);
831 }
832
833 buffer0 += blocks1;
834 buffer1 += blocks1;
835 }
836
837 return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
838}
839
840template <>
841template <unsigned int tPixels>
842inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
843{
844 static_assert(tPixels >= 5u, "Invalid pixel number!");
845
846 constexpr unsigned int tChannels = 3u;
847
848 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
849 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
850
851 constexpr unsigned int bufferElements = tChannels * tPixels;
852
853 constexpr unsigned int blocks48 = bufferElements / 48u;
854 constexpr unsigned int remainingAfterFullBlocks48 = bufferElements % 48u;
855
856 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
857
858 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
859
860 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
861
862 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
863
864 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
865
866 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
867
868 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
869
870 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
871
872 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
873
874 static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
875
876 static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
877
878 const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
879
880 const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
881 const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
882 const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
883
884 const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
885 const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
886 const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
887
888 __m128i sum0_128i = _mm_setzero_si128();
889 __m128i sum1_128i = _mm_setzero_si128();
890
891 uint32_t sumIndividual = 0u;
892
893 for (unsigned int n = 0u; n < blocks48; ++n)
894 {
895 const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 0));
896 const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 16));
897 const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 32));
898
899 __m128i channel0_0_128i;
900 __m128i channel0_1_128i;
901 __m128i channel0_2_128i;
902 SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
903
904 const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 0));
905 const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 16));
906 const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 32));
907
908 __m128i channel1_0_128i;
909 __m128i channel1_1_128i;
910 __m128i channel1_2_128i;
911 SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
912
913 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)); // (channel0_0 - mean0) - (channel1_0 - mean1)
914 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
915
916 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
917 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
918
919 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
920 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
921
922 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
923 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
924
925 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
926 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
927
928 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
929 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
930
931 buffer0 += 48;
932 buffer1 += 48;
933 }
934
935 if constexpr (partialBlock48)
936 {
937 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
938 constexpr int overlappingPixels = overlappingElements / int(tChannels);
939
940 const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(buffer0)), overlappingElements);
941 const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(buffer0 - overlappingElements + 16));
942 const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(buffer0 - overlappingElements + 32));
943
944 __m128i channel0_0_128i;
945 __m128i channel0_1_128i;
946 __m128i channel0_2_128i;
947 SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
948
949 const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(buffer1)), overlappingElements);
950 const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(buffer1 - overlappingElements + 16));
951 const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(buffer1 - overlappingElements + 32));
952
953 __m128i channel1_0_128i;
954 __m128i channel1_1_128i;
955 __m128i channel1_2_128i;
956 SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
957
958 __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2); // (channel0_0 - mean0) - (channel1_0 - mean1)
959 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
960
961 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
962 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
963
964 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
965 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
966
967 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
968 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
969
970 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
971 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
972
973 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
974 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
975
976 buffer0 += remainingAfterFullBlocks48;
977 buffer1 += remainingAfterFullBlocks48;
978 }
979
980 for (unsigned int n = 0u; n < blocks24; ++n)
981 {
982 const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 0));
983 const __m128i buffer0B_128i = _mm_loadl_epi64((const __m128i*)(buffer0 + 16)); // load for unaligned 64 bit memory
984
985 __m128i channel0_01_128i;
986 __m128i channel0_2_128i;
987 SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
988
989 const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 0));
990 const __m128i buffer1B_128i = _mm_loadl_epi64((const __m128i*)(buffer1 + 16)); // load for unaligned 64 bit memory
991
992 __m128i channel1_01_128i;
993 __m128i channel1_2_128i;
994 SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
995
996 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)); // (channel0_01 - mean0) - (channel1_01 - mean1)
997 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
998
999 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1000 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1001
1002 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1003
1004 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1005
1006 buffer0 += 24;
1007 buffer1 += 24;
1008 }
1009
1010 for (unsigned int n = 0u; n < blocks21; ++n)
1011 {
1012 const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(buffer0 + 0));
1013 const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer0 + 16 - 3)), 3); // load for unaligned 64 bit memory
1014
1015 __m128i channel0_01_128i;
1016 __m128i channel0_2_128i;
1017 SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
1018
1019 const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(buffer1 + 0));
1020 const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(buffer1 + 16 - 3)), 3); // load for unaligned 64 bit memory
1021
1022 __m128i channel1_01_128i;
1023 __m128i channel1_2_128i;
1024 SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
1025
1026 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2); // (channel0_01 - mean0) - (channel1_01 - mean1)
1027 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1028
1029 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1030 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1031
1032 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1033
1034 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1035
1036 buffer0 += 21;
1037 buffer1 += 21;
1038 }
1039
1040 for (unsigned int n = 0u; n < blocks15; ++n)
1041 {
1042 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer0 - 1)), 1);
1043
1044 __m128i channel0_01_128i;
1045 __m128i channel0_2_128i;
1046 SSE::deInterleave3Channel8Bit15Elements(buffer0_128i, channel0_01_128i, channel0_2_128i);
1047
1048 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(buffer1 - 1)), 1);
1049
1050 __m128i channel1_01_128i;
1051 __m128i channel1_2_128i;
1052 SSE::deInterleave3Channel8Bit15Elements(buffer1_128i, channel1_01_128i, channel1_2_128i);
1053
1054 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6); // (channel0_01 - mean0) - (channel1_01 - mean1)
1055 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1056
1057 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1058 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1059
1060 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1061
1062 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1063
1064 buffer0 += 15;
1065 buffer1 += 15;
1066 }
1067
1068 if constexpr (blocks1 != 0u)
1069 {
1070 constexpr unsigned int pixels = blocks1 / 3u;
1071
1072 for (unsigned int x = 0u; x < pixels; ++x)
1073 {
1074 for (unsigned int n = 0u; n < 3u; ++n)
1075 {
1076 sumIndividual += sqrDistance(buffer0[x * 3u + n] - meanValues0[n], buffer1[x * 3u + n] - meanValues1[n]);
1077 }
1078 }
1079
1080 buffer0 += blocks1;
1081 buffer1 += blocks1;
1082 }
1083
1084 return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
1085}
1086
1087template <unsigned int tChannels>
1088template <unsigned int tPixels>
1089inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1090{
1091 static_assert(tChannels >= 1u, "Invalid channel number!");
1092 static_assert(tPixels >= 1u, "Invalid patch size!");
1093
1094 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1095 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1096
1097 uint32_t ssd = 0u;
1098
1099 for (unsigned int n = 0u; n < tPixels; ++n)
1100 {
1101 for (unsigned int c = 0u; c < tChannels; ++c)
1102 {
1103 ssd += sqrDistance(buffer0[n * tChannels + c] - meanValues0[c], buffer1[n * tChannels + c] - meanValues1[c]);
1104 }
1105 }
1106
1107 return ssd;
1108}
1109
1110template <>
1111template <unsigned int tPatchSize>
1112inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<1u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1113{
1114 static_assert(tPatchSize >= 1u, "Invalid patch size!");
1115
1116 constexpr unsigned int tChannels = 1u;
1117
1118 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1119 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1120
1121 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1122 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1123
1124 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
1125
1126 constexpr unsigned int blocks16 = patchWidthElements / 16u;
1127 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
1128
1129 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
1130
1131 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
1132
1133 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
1134
1135 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
1136
1137 static_assert(blocks1 <= 2u, "Invalid block size!");
1138
1139 static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
1140
1141 const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
1142
1143 const __m128i mean0_128i = _mm_set1_epi8(meanValues0[0]);
1144 const __m128i mean1_128i = _mm_set1_epi8(meanValues1[0]);
1145
1146 __m128i sum0_128i = _mm_setzero_si128();
1147 __m128i sum1_128i = _mm_setzero_si128();
1148
1149 uint32_t sumIndividual = 0u;
1150
1151 for (unsigned int y = 0u; y < tPatchSize; ++y)
1152 {
1153 SSE::prefetchT0(patch0 + patch0StrideElements);
1154 SSE::prefetchT0(patch1 + patch1StrideElements);
1155
1156 for (unsigned int n = 0u; n < blocks16; ++n)
1157 {
1158 const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)patch0);
1159 const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)patch1);
1160
1161 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
1162 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1163
1164 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1165 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1166
1167 patch0 += 16;
1168 patch1 += 16;
1169 }
1170
1171 if constexpr (fullBlock8)
1172 {
1173 const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)patch0); // load for unaligned 64 bit memory
1174 const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)patch1); // load for unaligned 64 bit memory
1175
1176 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
1177 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1178
1179 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1180 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1181
1182 patch0 += 8;
1183 patch1 += 8;
1184 }
1185
1186 if constexpr (partialBlock16)
1187 {
1188 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
1189
1190 static_assert(overlapElements < 8u, "Invalid value!");
1191
1192 if (y < tPatchSize - 1u)
1193 {
1194 const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)patch0); // loading 16 elements
1195 const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)patch1);
1196
1197 const __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)); // (buffer0 - mean0) - (buffer1 - mean1)
1198 const __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2);
1199
1200 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1201 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1202 }
1203 else
1204 {
1205 const __m128i buffer0_128i = _mm_lddqu_si128((const __m128i*)(patch0 - overlapElements)); // loading 16 elements
1206 const __m128i buffer1_128i = _mm_lddqu_si128((const __m128i*)(patch1 - overlapElements));
1207
1208 const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2); // (buffer0 - mean0) - (buffer1 - mean1)
1209 const __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_128i, buffer1_128i), constant_signs_m128i));
1210
1211 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1212 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1213 }
1214
1215 patch0 += remainingAfterBlocks16;
1216 patch1 += remainingAfterBlocks16;
1217 }
1218
1219 if constexpr (partialBlock8)
1220 {
1221 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
1222
1223 static_assert(overlapElements < 8u, "Invalid value!");
1224
1225 if (y < tPatchSize - 1u)
1226 {
1227 const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)patch0);// loading 8 elements
1228 const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)patch1);
1229
1230 const __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2); // (buffer0 - mean0) - (buffer1 - mean1)
1231
1232 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1233 }
1234 else
1235 {
1236 const __m128i buffer0_128i = _mm_loadl_epi64((const __m128i*)(patch0 - overlapElements)); // loading 8 elements
1237 const __m128i buffer1_128i = _mm_loadl_epi64((const __m128i*)(patch1 - overlapElements));
1238
1239 const __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_128i, buffer0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_128i, buffer1_128i), constant_signs_m128i)), overlapElements * 2); // (buffer0 - mean0) - (buffer1 - mean1)
1240
1241 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1242 }
1243
1244 patch0 += remainingAfterBlocks16;
1245 patch1 += remainingAfterBlocks16;
1246 }
1247
1248 if constexpr (blocks1 != 0u)
1249 {
1250 for (unsigned int n = 0u; n < blocks1; ++n)
1251 {
1252 sumIndividual += sqrDistance(patch0[n] - meanValues0[0], patch1[n] - meanValues1[0]);
1253 }
1254
1255 patch0 += blocks1;
1256 patch1 += blocks1;
1257 }
1258
1259 patch0 += patch0StrideElements - patchWidthElements;
1260 patch1 += patch1StrideElements - patchWidthElements;
1261 }
1262
1263 return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
1264}
1265
1266template <>
1267template <unsigned int tPatchSize>
1268inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<3u>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1269{
1270 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1271
1272 constexpr unsigned int tChannels = 3u;
1273
1274 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1275 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1276
1277 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1278 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1279
1280 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
1281
1282 constexpr unsigned int blocks48 = patchWidthElements / 48u;
1283 constexpr unsigned int remainingAfterFullBlocks48 = patchWidthElements % 48u;
1284
1285 constexpr bool partialBlock48 = remainingAfterFullBlocks48 > 2u * 16u;
1286
1287 constexpr unsigned int remainingAfterPartialBlock48 = partialBlock48 ? 0u : remainingAfterFullBlocks48;
1288
1289 constexpr unsigned int blocks24 = remainingAfterPartialBlock48 / 24u;
1290
1291 constexpr unsigned int remainingAfterPartialBlock24 = remainingAfterPartialBlock48 % 24u;
1292
1293 constexpr unsigned int blocks21 = remainingAfterPartialBlock24 / 21u;
1294
1295 constexpr unsigned int remainingAfterPartialBlock21 = remainingAfterPartialBlock24 % 21u;
1296
1297 constexpr unsigned int blocks15 = remainingAfterPartialBlock21 / 15u;
1298
1299 constexpr unsigned int remainingAfterPartialBlock15 = remainingAfterPartialBlock21 % 15u;
1300
1301 constexpr unsigned int blocks1 = remainingAfterPartialBlock15;
1302
1303 static_assert(blocks1 % 3u == 0u, "Invalid number of single blocks");
1304
1305 static_assert(std::is_same<short, int16_t>::value, "Invalid data type!");
1306
1307 const __m128i constant_signs_m128i = _mm_set1_epi16(short(0x1FF)); // -1, 1, -1, 1, -1, 1, -1, 1
1308
1309 const __m128i mean0_0_128i = _mm_set1_epi8(meanValues0[0]);
1310 const __m128i mean0_1_128i = _mm_set1_epi8(meanValues0[1]);
1311 const __m128i mean0_2_128i = _mm_set1_epi8(meanValues0[2]);
1312
1313 const __m128i mean1_0_128i = _mm_set1_epi8(meanValues1[0]);
1314 const __m128i mean1_1_128i = _mm_set1_epi8(meanValues1[1]);
1315 const __m128i mean1_2_128i = _mm_set1_epi8(meanValues1[2]);
1316
1317 __m128i sum0_128i = _mm_setzero_si128();
1318 __m128i sum1_128i = _mm_setzero_si128();
1319
1320 uint32_t sumIndividual = 0u;
1321
1322 for (unsigned int y = 0u; y < tPatchSize; ++y)
1323 {
1324 SSE::prefetchT0(patch0 + patch0StrideElements);
1325 SSE::prefetchT0(patch1 + patch1StrideElements);
1326
1327 for (unsigned int n = 0u; n < blocks48; ++n)
1328 {
1329 const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 0));
1330 const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 16));
1331 const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 32));
1332
1333 __m128i channel0_0_128i;
1334 __m128i channel0_1_128i;
1335 __m128i channel0_2_128i;
1336 SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
1337
1338 const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 0));
1339 const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 16));
1340 const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 32));
1341
1342 __m128i channel1_0_128i;
1343 __m128i channel1_1_128i;
1344 __m128i channel1_2_128i;
1345 SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
1346
1347 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)); // (channel0_0 - mean0) - (channel1_0 - mean1)
1348 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1349
1350 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1351 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1352
1353 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1354 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1355
1356 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1357 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1358
1359 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1360 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1361
1362 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1363 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1364
1365 patch0 += 48;
1366 patch1 += 48;
1367 }
1368
1369 if constexpr (partialBlock48)
1370 {
1371 constexpr int overlappingElements = int(48u - remainingAfterFullBlocks48);
1372 constexpr int overlappingPixels = overlappingElements / int(tChannels);
1373
1374 const __m128i buffer0A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(patch0)), overlappingElements);
1375 const __m128i buffer0B_128i = _mm_lddqu_si128((const __m128i*)(patch0 - overlappingElements + 16));
1376 const __m128i buffer0C_128i = _mm_lddqu_si128((const __m128i*)(patch0 - overlappingElements + 32));
1377
1378 __m128i channel0_0_128i;
1379 __m128i channel0_1_128i;
1380 __m128i channel0_2_128i;
1381 SSE::deInterleave3Channel8Bit48Elements(buffer0A_128i, buffer0B_128i, buffer0C_128i, channel0_0_128i, channel0_1_128i, channel0_2_128i);
1382
1383 const __m128i buffer1A_128i = _mm_slli_si128(_mm_lddqu_si128((const __m128i*)(patch1)), overlappingElements);
1384 const __m128i buffer1B_128i = _mm_lddqu_si128((const __m128i*)(patch1 - overlappingElements + 16));
1385 const __m128i buffer1C_128i = _mm_lddqu_si128((const __m128i*)(patch1 - overlappingElements + 32));
1386
1387 __m128i channel1_0_128i;
1388 __m128i channel1_1_128i;
1389 __m128i channel1_2_128i;
1390 SSE::deInterleave3Channel8Bit48Elements(buffer1A_128i, buffer1B_128i, buffer1C_128i, channel1_0_128i, channel1_1_128i, channel1_2_128i);
1391
1392 __m128i absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i)), overlappingPixels * 2); // (channel0_0 - mean0) - (channel1_0 - mean1)
1393 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_0_128i, channel0_0_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_0_128i, channel1_0_128i), constant_signs_m128i));
1394
1395 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1396 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1397
1398 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i)), overlappingPixels * 2);
1399 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_1_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_1_128i), constant_signs_m128i));
1400
1401 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1402 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1403
1404 absDifferencesLow_128i = _mm_srli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), overlappingPixels * 2);
1405 absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1406
1407 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
1408 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1409
1410 patch0 += remainingAfterFullBlocks48;
1411 patch1 += remainingAfterFullBlocks48;
1412 }
1413
1414 for (unsigned int n = 0u; n < blocks24; ++n)
1415 {
1416 const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 0));
1417 const __m128i buffer0B_128i = _mm_loadl_epi64((const __m128i*)(patch0 + 16)); // load for unaligned 64 bit memory
1418
1419 __m128i channel0_01_128i;
1420 __m128i channel0_2_128i;
1421 SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
1422
1423 const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 0));
1424 const __m128i buffer1B_128i = _mm_loadl_epi64((const __m128i*)(patch1 + 16)); // load for unaligned 64 bit memory
1425
1426 __m128i channel1_01_128i;
1427 __m128i channel1_2_128i;
1428 SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
1429
1430 __m128i absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)); // (channel0_01 - mean0) - (channel1_01 - mean1)
1431 __m128i absDifferencesHigh_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i));
1432
1433 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1434 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1435
1436 absDifferencesLow_128i = _mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i));
1437
1438 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1439
1440 patch0 += 24;
1441 patch1 += 24;
1442 }
1443
1444 for (unsigned int n = 0u; n < blocks21; ++n)
1445 {
1446 const __m128i buffer0A_128i = _mm_lddqu_si128((const __m128i*)(patch0 + 0));
1447 const __m128i buffer0B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch0 + 16 - 3)), 3); // load for unaligned 64 bit memory
1448
1449 __m128i channel0_01_128i;
1450 __m128i channel0_2_128i;
1451 SSE::deInterleave3Channel8Bit24Elements(buffer0A_128i, buffer0B_128i, channel0_01_128i, channel0_2_128i);
1452
1453 const __m128i buffer1A_128i = _mm_lddqu_si128((const __m128i*)(patch1 + 0));
1454 const __m128i buffer1B_128i = _mm_srli_si128(_mm_loadl_epi64((const __m128i*)(patch1 + 16 - 3)), 3); // load for unaligned 64 bit memory
1455
1456 __m128i channel1_01_128i;
1457 __m128i channel1_2_128i;
1458 SSE::deInterleave3Channel8Bit24Elements(buffer1A_128i, buffer1B_128i, channel1_01_128i, channel1_2_128i);
1459
1460 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 2); // (channel0_01 - mean0) - (channel1_01 - mean1)
1461 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 2);
1462
1463 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1464 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1465
1466 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 2);
1467
1468 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1469
1470 patch0 += 21;
1471 patch1 += 21;
1472 }
1473
1474 for (unsigned int n = 0u; n < blocks15; ++n)
1475 {
1476 const __m128i buffer0_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((const __m128i*)(patch0)) : _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch0 - 1)), 1);
1477
1478 __m128i channel0_01_128i;
1479 __m128i channel0_2_128i;
1480 SSE::deInterleave3Channel8Bit15Elements(buffer0_128i, channel0_01_128i, channel0_2_128i);
1481
1482 const __m128i buffer1_128i = y < tPatchSize - 1u ? _mm_lddqu_si128((const __m128i*)(patch1)) : _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(patch1 - 1)), 1);
1483
1484 __m128i channel1_01_128i;
1485 __m128i channel1_2_128i;
1486 SSE::deInterleave3Channel8Bit15Elements(buffer1_128i, channel1_01_128i, channel1_2_128i);
1487
1488 __m128i absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_0_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_0_128i, channel1_01_128i), constant_signs_m128i)), 6); // (channel0_01 - mean0) - (channel1_01 - mean1)
1489 __m128i absDifferencesHigh_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(mean0_1_128i, channel0_01_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpackhi_epi8(mean1_1_128i, channel1_01_128i), constant_signs_m128i)), 6);
1490
1491 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1492 sum1_128i = _mm_add_epi32(sum1_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
1493
1494 absDifferencesLow_128i = _mm_slli_si128(_mm_sub_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(mean0_2_128i, channel0_2_128i), constant_signs_m128i), _mm_maddubs_epi16(_mm_unpacklo_epi8(mean1_2_128i, channel1_2_128i), constant_signs_m128i)), 6);
1495
1496 sum0_128i = _mm_add_epi32(sum0_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i)); // sum0_128i += absDifferencesLow_128i * absDifferencesLow_128i
1497
1498 patch0 += 15;
1499 patch1 += 15;
1500 }
1501
1502 if constexpr (blocks1 != 0u)
1503 {
1504 constexpr unsigned int pixels = blocks1 / 3u;
1505
1506 for (unsigned int x = 0u; x < pixels; ++x)
1507 {
1508 for (unsigned int n = 0u; n < 3u; ++n)
1509 {
1510 sumIndividual += sqrDistance(patch0[x * 3u + n] - meanValues0[n], patch1[x * 3u + n] - meanValues1[n]);
1511 }
1512 }
1513
1514 patch0 += blocks1;
1515 patch1 += blocks1;
1516 }
1517
1518 patch0 += patch0StrideElements - patchWidthElements;
1519 patch1 += patch1StrideElements - patchWidthElements;
1520 }
1521
1522 return SSE::sum_u32_4(sum0_128i) + SSE::sum_u32_4(sum1_128i) + sumIndividual;
1523}
1524
1525template <unsigned int tChannels>
1526template <unsigned int tPatchSize>
1527inline uint32_t ZeroMeanSumSquareDifferencesSSE::SpecializedForChannels<tChannels>::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t* const meanValues0, const uint8_t* const meanValues1)
1528{
1529 static_assert(tChannels >= 1u, "Invalid channel number!");
1530 static_assert(tPatchSize >= 1u, "Invalid patch size!");
1531
1532 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1533 ocean_assert(meanValues0 != nullptr && meanValues1 != nullptr);
1534
1535 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1536 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1537
1538 uint32_t ssd = 0u;
1539
1540 for (unsigned int y = 0u; y < tPatchSize; ++y)
1541 {
1542 for (unsigned int x = 0u; x < tPatchSize; ++x)
1543 {
1544 for (unsigned int n = 0u; n < tChannels; ++n)
1545 {
1546 ssd += sqrDistance(patch0[x * tChannels + n] - meanValues0[n], patch1[x * tChannels + n] - meanValues1[n]);
1547 }
1548 }
1549
1550 patch0 += patch0StrideElements;
1551 patch1 += patch1StrideElements;
1552 }
1553
1554 return ssd;
1555}
1556
1557template <unsigned int tChannels, unsigned int tPixels>
1558inline uint32_t ZeroMeanSumSquareDifferencesSSE::buffer8BitPerChannel(const uint8_t* const buffer0, const uint8_t* const buffer1)
1559{
1560 static_assert(tChannels >= 1u, "Invalid channel number!");
1561 static_assert(tPixels >= 8u, "Invalid patch size!");
1562
1563 ocean_assert(buffer0 != nullptr && buffer1 != nullptr);
1564
1565 uint8_t meanValues0[tChannels];
1566 mean8BitPerChannel<tChannels, tPixels>(buffer0, meanValues0);
1567
1568 uint8_t meanValues1[tChannels];
1569 mean8BitPerChannel<tChannels, tPixels>(buffer1, meanValues1);
1570
1571 return SpecializedForChannels<tChannels>::template buffer8BitPerChannel<tPixels>(buffer0, buffer1, meanValues0, meanValues1);
1572}
1573
1574template <unsigned int tChannels, unsigned int tPatchSize>
1575inline uint32_t ZeroMeanSumSquareDifferencesSSE::patch8BitPerChannel(const uint8_t* const patch0, const uint8_t* const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
1576{
1577 static_assert(tChannels >= 1u, "Invalid channel number!");
1578 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1579
1580 ocean_assert(patch0 != nullptr && patch1 != nullptr);
1581
1582 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1583 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
1584
1585 uint8_t meanValues0[tChannels];
1586 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
1587
1588 uint8_t meanValues1[tChannels];
1589 mean8BitPerChannel<tChannels, tPatchSize>(patch1, patch1StrideElements, meanValues1);
1590
1591 return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, patch1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
1592}
1593
1594template <unsigned int tChannels, unsigned int tPatchSize>
1595inline uint32_t ZeroMeanSumSquareDifferencesSSE::patchBuffer8BitPerChannel(const uint8_t* const patch0, const uint8_t* const buffer1, const unsigned int patch0StrideElements)
1596{
1597 static_assert(tChannels >= 1u, "Invalid channel number!");
1598 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1599
1600 ocean_assert(patch0 != nullptr && buffer1 != nullptr);
1601
1602 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
1603
1604 uint8_t meanValues0[tChannels];
1605 mean8BitPerChannel<tChannels, tPatchSize>(patch0, patch0StrideElements, meanValues0);
1606
1607 uint8_t meanValues1[tChannels];
1608 mean8BitPerChannel<tChannels, tPatchSize * tPatchSize>(buffer1, meanValues1);
1609
1610 constexpr unsigned int patch1StrideElements = tChannels * tPatchSize;
1611
1612 return SpecializedForChannels<tChannels>::template patch8BitPerChannel<tPatchSize>(patch0, buffer1, patch0StrideElements, patch1StrideElements, meanValues0, meanValues1);
1613}
1614
1615template <unsigned int tChannels, unsigned int tPixels>
1616OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesSSE::mean8BitPerChannel(const uint8_t* const buffer, uint8_t* const meanValues)
1617{
1618 static_assert(tChannels >= 1u, "Invalid channel number!");
1619 static_assert(tPixels >= 8u, "Invalid patch size!");
1620
1621 SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPixels>(buffer, meanValues);
1622}
1623
1624template <unsigned int tChannels, unsigned int tPatchSize>
1625OCEAN_FORCE_INLINE void ZeroMeanSumSquareDifferencesSSE::mean8BitPerChannel(const uint8_t* const patch, const unsigned int patchStrideElements, uint8_t* const meanValues)
1626{
1627 static_assert(tChannels >= 1u, "Invalid channel number!");
1628 static_assert(tPatchSize >= 5u, "Invalid patch size!");
1629
1630 SpecializedForChannels<tChannels>::template mean8BitPerChannel<tPatchSize>(patch, patchStrideElements, meanValues);
1631}
1632
1633}
1634
1635}
1636
1637#endif // OCEAN_HARDWARE_SSE_VERSION >= 41
1638
1639#endif // META_OCEAN_CV_ZERO_MEAN_SUM_SQUARE_DIFFERENCES_SSE_H
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of, e.g., an image with 3 channels and 8 bits per element.
Definition SSE.h:3277
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1340
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of, e.g., an image with 3 channels and 8 bits per element.
Definition SSE.h:3289
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition SSE.h:1255
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of, e.g., an image with 3 channels and 8 bits per element.
Definition SSE.h:3304
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1322
This class allows specializing functions for individual channels.
Definition ZeroMeanSumSquareDifferencesSSE.h:39
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two patches within an image.
Definition ZeroMeanSumSquareDifferencesSSE.h:1527
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1, const uint8_t *const meanValues0, const uint8_t *const meanValues1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition ZeroMeanSumSquareDifferencesSSE.h:1089
static void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition ZeroMeanSumSquareDifferencesSSE.h:389
This class implements functions to calculate zero-mean sum square differences using SSE instructions.
Definition ZeroMeanSumSquareDifferencesSSE.h:30
static uint32_t patchBuffer8BitPerChannel(const uint8_t *const patch0, const uint8_t *const buffer1, const unsigned int patch0StrideElements)
Returns the zero-mean sum of square differences between an image patch and a buffer.
Definition ZeroMeanSumSquareDifferencesSSE.h:1595
static uint32_t buffer8BitPerChannel(const uint8_t *const buffer0, const uint8_t *const buffer1)
Returns the zero-mean sum of square differences between two memory buffers.
Definition ZeroMeanSumSquareDifferencesSSE.h:1558
static OCEAN_FORCE_INLINE void mean8BitPerChannel(const uint8_t *const buffer, uint8_t *const meanValues)
Determines the mean value for a buffer, one value for each channel.
Definition ZeroMeanSumSquareDifferencesSSE.h:1616
static uint32_t patch8BitPerChannel(const uint8_t *const patch0, const uint8_t *const patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the zero-mean sum of square differences between two patches within an image.
Definition ZeroMeanSumSquareDifferencesSSE.h:1575
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition Accessor.h:15