8 #ifndef META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
9 #define META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
15 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
// Declaration: patch comparison overload in which BOTH patch centers are given
// as Scalar (sub-pixel) coordinates.
// Template parameters:
//   tChannels  - number of elements per pixel (multiplied with the width when
//                the row stride is derived in the definition).
//   tPatchSize - edge length of the square patch; tPatchSize / 2 is used as the
//                border that must remain around each center.
// Parameters:
//   image0, image1               - the two 8-bit frames.
//   width0, width1               - frame widths in pixels.
//   centerX0/Y0, centerX1/Y1     - patch centers in frame 0 and frame 1.
//   image0/1PaddingElements      - extra elements at the end of each row; added
//                                  to width * tChannels to form the stride.
// Returns a uint32_t; presumably the sum of square differences between the two
// patches (inferred from the class name only -- confirm against the class docs).
52 template <
unsigned int tChannels,
unsigned int tPatchSize>
53 static inline uint32_t patch8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int width1,
const Scalar centerX0,
const Scalar centerY0,
const Scalar centerX1,
const Scalar centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements);
// Declaration: patch comparison overload in which the center in frame 0 is an
// integer pixel position (unsigned int) while the center in frame 1 is a Scalar
// (sub-pixel) position.
// Template parameters:
//   tChannels  - number of elements per pixel.
//   tPatchSize - edge length of the square patch.
// Parameters:
//   image0, image1               - the two 8-bit frames.
//   width0, width1               - frame widths in pixels.
//   centerX0, centerY0           - integer patch center in frame 0.
//   centerX1, centerY1           - sub-pixel patch center in frame 1.
//   image0/1PaddingElements      - extra elements at the end of each row; added
//                                  to width * tChannels to form the stride.
// Returns a uint32_t; presumably the sum of square differences between the two
// patches (inferred from the class name only -- confirm against the class docs).
71 template <
unsigned int tChannels,
unsigned int tPatchSize>
72 static inline uint32_t patch8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int width1,
const unsigned int centerX0,
const unsigned int centerY0,
const Scalar centerX1,
const Scalar centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements);
// Declaration: low-level overload working on the top-left corners of both
// patches, with fixed-point interpolation factors for BOTH frames.
// Parameters:
//   imageTopLeft0, imageTopLeft1 - pointers to the top-left element of each patch.
//   image0/1StrideElements       - distance between two consecutive rows, in elements.
//   fx0, fy0, fx1, fy1           - interpolation factors; the specializations in
//                                  this file assert that each value is <= 128
//                                  (i.e. 128 represents a factor of 1.0).
// Returns a uint32_t; presumably the sum of square differences (inferred from
// the class name only -- confirm against the class docs).
90 template <
unsigned int tChannels,
unsigned int tPatchSize>
91 static inline uint32_t
patch8BitPerChannel(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx0,
const unsigned int fy0,
const unsigned int fx1,
const unsigned int fy1);
// Declaration: low-level overload working on the top-left corners of both
// patches, with fixed-point interpolation factors for frame 1 only (frame 0 is
// addressed at an integer pixel position).
// Parameters:
//   imageTopLeft0, imageTopLeft1 - pointers to the top-left element of each patch.
//   image0/1StrideElements       - distance between two consecutive rows, in elements.
//   fx1, fy1                     - interpolation factors for frame 1; the
//                                  specializations that carry an assert require
//                                  each value to be <= 128.
// Returns a uint32_t; presumably the sum of square differences (inferred from
// the class name only -- confirm against the class docs).
105 template <
unsigned int tChannels,
unsigned int tPatchSize>
106 static inline uint32_t
patch8BitPerChannel(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx1,
const unsigned int fy1);
// Definition fragment of the overload whose two patch centers are Scalar
// (sub-pixel) positions. The signature line and the braces are not part of this
// excerpt; the statements below validate the input, convert the centers into an
// integer top-left position plus fixed-point fractions, and forward to the
// eight-parameter low-level overload.
109 template <
unsigned int tChannels,
unsigned int tPatchSize>
// Both frames must be valid.
112 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
// Both frames must be wider than the patch.
114 ocean_assert(width0 > tPatchSize);
115 ocean_assert(width1 > tPatchSize);
// Half patch size (integer division), used as the border around each center.
117 const unsigned int tPatchSize_2 = tPatchSize / 2u;
// The center must keep tPatchSize_2 columns to the left and tPatchSize_2 + 1
// columns to the right; presumably the extra column is the right neighbor
// needed for interpolation. Only the lower bound of the y-coordinate is checked
// because the frame height is not a parameter.
119 ocean_assert(centerX0 >=
Scalar(tPatchSize_2) && centerX0 <
Scalar(width0 - tPatchSize_2 - 1u));
120 ocean_assert(centerY0 >=
Scalar(tPatchSize_2));
122 ocean_assert(centerX1 >=
Scalar(tPatchSize_2) && centerX1 <
Scalar(width1 - tPatchSize_2 - 1u));
123 ocean_assert(centerY1 >=
Scalar(tPatchSize_2));
// Row stride in elements: payload (width * channels) plus per-row padding.
125 const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
126 const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
// Integer part of each center (the cast truncates; the asserts above guarantee
// non-negative values).
128 const unsigned int left0 = (
unsigned int)(centerX0);
129 const unsigned int top0 = (
unsigned int)(centerY0);
131 const unsigned int left1 = (
unsigned int)(centerX1);
132 const unsigned int top1 = (
unsigned int)(centerY1);
// scalarFx0 / scalarFy0 are defined on lines not visible in this excerpt;
// presumably the fractional parts centerX0 - left0 and centerY0 - top0 -- confirm.
// NOTE(review): these two asserts compare against the unsigned literal 1u while
// the matching asserts for frame 1 use the plain literal 1; equivalent, but
// inconsistent.
137 ocean_assert(scalarFx0 >= 0 && scalarFx0 <= 1u);
138 ocean_assert(scalarFy0 >= 0 && scalarFy0 <= 1u);
// Convert the fraction to fixed point with 128 == 1.0; adding 0.5 before the
// truncating cast rounds to the nearest integer.
140 const unsigned int fx0 = (
unsigned int)(
Scalar(128) * scalarFx0 +
Scalar(0.5));
141 const unsigned int fy0 = (
unsigned int)(
Scalar(128) * scalarFy0 +
Scalar(0.5));
// Same validation and conversion for frame 1 (scalarFx1 / scalarFy1 are also
// defined on lines not visible here).
146 ocean_assert(scalarFx1 >= 0 && scalarFx1 <= 1);
147 ocean_assert(scalarFy1 >= 0 && scalarFy1 <= 1);
149 const unsigned int fx1 = (
unsigned int)(
Scalar(128) * scalarFx1 +
Scalar(0.5));
150 const unsigned int fy1 = (
unsigned int)(
Scalar(128) * scalarFy1 +
Scalar(0.5));
// Step from the center to the top-left element of each patch.
152 const uint8_t* imageTopLeft0 = image0 + (top0 - tPatchSize_2) * image0StrideElements + (left0 - tPatchSize_2) * tChannels;
153 const uint8_t* imageTopLeft1 = image1 + (top1 - tPatchSize_2) * image1StrideElements + (left1 - tPatchSize_2) * tChannels;
// Delegate to the low-level overload taking factors for both frames.
155 return patch8BitPerChannel<tChannels, tPatchSize>(imageTopLeft0, imageTopLeft1, image0StrideElements, image1StrideElements, fx0, fy0, fx1, fy1);
// Definition of the overload in which frame 0 is addressed with an integer
// patch center and frame 1 with a Scalar (sub-pixel) patch center. The body
// validates the input, derives the fixed-point fractions for frame 1 only, and
// forwards to the six-parameter low-level overload. (Braces and a few
// statements are not part of this excerpt.)
158 template <
unsigned int tChannels,
unsigned int tPatchSize>
159 inline uint32_t
AdvancedSumSquareDifferencesSSE::patch8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int width1,
const unsigned int centerX0,
const unsigned int centerY0,
const Scalar centerX1,
const Scalar centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements)
// Both frames must be valid.
161 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
// Both frames must be wider than the patch.
163 ocean_assert(width0 > tPatchSize);
164 ocean_assert(width1 > tPatchSize);
// Half patch size (integer division), used as the border around each center.
166 const unsigned int tPatchSize_2 = tPatchSize / 2u;
// Frame 0 is not interpolated, so its center only needs tPatchSize_2 columns on
// either side (no additional "- 1u" as for frame 1 below).
// NOTE(review): centerY0 is an unsigned int here but is compared against
// Scalar(tPatchSize_2), while centerX1 (a Scalar) is compared against unsigned
// expressions; the implicit conversions give the intended result, but the
// typing is inconsistent with the Scalar/Scalar overload.
168 ocean_assert(centerX0 >= tPatchSize_2 && centerX0 < width0 - tPatchSize_2);
169 ocean_assert(centerY0 >=
Scalar(tPatchSize_2));
171 ocean_assert(centerX1 >= tPatchSize_2 && centerX1 < width1 - tPatchSize_2 - 1u);
172 ocean_assert(centerY1 >=
Scalar(tPatchSize_2));
// Row stride in elements: payload (width * channels) plus per-row padding.
174 const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
175 const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
// Integer part of the frame 1 center (the cast truncates).
177 const unsigned int left1 = (
unsigned int)(centerX1);
178 const unsigned int top1 = (
unsigned int)(centerY1);
// scalarFx1 / scalarFy1 are defined on lines not visible in this excerpt;
// presumably the fractional parts centerX1 - left1 and centerY1 - top1 -- confirm.
183 ocean_assert(scalarFx1 >= 0 && scalarFx1 <= 1);
184 ocean_assert(scalarFy1 >= 0 && scalarFy1 <= 1);
// Convert the fraction to fixed point with 128 == 1.0; adding 0.5 before the
// truncating cast rounds to the nearest integer.
186 const unsigned int fx1 = (
unsigned int)(
Scalar(128) * scalarFx1 +
Scalar(0.5));
187 const unsigned int fy1 = (
unsigned int)(
Scalar(128) * scalarFy1 +
Scalar(0.5));
// Step from the center to the top-left element of each patch; frame 0 uses the
// integer center directly.
189 const uint8_t* imageTopLeft0 = image0 + (centerY0 - tPatchSize_2) * image0StrideElements + (centerX0 - tPatchSize_2) * tChannels;
190 const uint8_t* imageTopLeft1 = image1 + (top1 - tPatchSize_2) * image1StrideElements + (left1 - tPatchSize_2) * tChannels;
// Delegate to the low-level overload taking factors for frame 1 only.
192 return patch8BitPerChannel<tChannels, tPatchSize>(imageTopLeft0, imageTopLeft1, image0StrideElements, image1StrideElements, fx1, fy1);
// Specialization for 1 channel and a 5x5 patch, with interpolation factors for
// both frames. Only the factor setup, the row loads and the mask constants are
// visible in this excerpt; the arithmetic consuming them is on elided lines.
196 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<1u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx0,
const unsigned int fy0,
const unsigned int fx1,
const unsigned int fy1)
// Factors are fixed point with 128 == 1.0.
198 ocean_assert(fx0 <= 128u && fy0 <= 128u);
199 ocean_assert(fx1 <= 128u && fy1 <= 128u);
// Complementary factors (1 - f) in the same fixed-point scale.
207 const unsigned int fx0_ = 128u - fx0;
208 const unsigned int fy0_ = 128u - fy0;
210 const unsigned int fx1_ = 128u - fx1;
211 const unsigned int fy1_ = 128u - fy1;
// The four weight products for frame 0 (the form used by bilinear
// interpolation). With the asserts above each product is at most
// 128 * 128 = 16384 and therefore fits into a signed 16-bit value.
213 const unsigned int f0x_y_ = fx0_ * fy0_;
214 const unsigned int f0xy_ = fx0 * fy0_;
215 const unsigned int f0x_y = fx0_ * fy0;
216 const unsigned int f0xy = fx0 * fy0;
// The four weight products for frame 1.
218 const unsigned int f1x_y_ = fx1_ * fy1_;
219 const unsigned int f1xy_ = fx1 * fy1_;
220 const unsigned int f1x_y = fx1_ * fy1;
221 const unsigned int f1xy = fx1 * fy1;
// Broadcast each weight into all eight 16-bit lanes of a register.
223 const __m128i __f0x_y_ = _mm_set1_epi16(
short(f0x_y_));
224 const __m128i __f0xy_ = _mm_set1_epi16(
short(f0xy_));
225 const __m128i __f0x_y = _mm_set1_epi16(
short(f0x_y));
226 const __m128i __f0xy = _mm_set1_epi16(
short(f0xy));
228 const __m128i __f1x_y_ = _mm_set1_epi16(
short(f1x_y_));
229 const __m128i __f1xy_ = _mm_set1_epi16(
short(f1xy_));
230 const __m128i __f1x_y = _mm_set1_epi16(
short(f1x_y));
231 const __m128i __f1xy = _mm_set1_epi16(
short(f1xy));
// Each row is fetched with an unaligned 8-byte load; six rows (0..5) are read
// per frame, i.e. one more than the patch height.
237 const __m128i image0_row0 = _mm_loadu_si64(imageTopLeft0);
238 const __m128i image0_row1 = _mm_loadu_si64(imageTopLeft0 + image0StrideElements);
241 const __m128i image1_row0 = _mm_loadu_si64(imageTopLeft1);
242 const __m128i image1_row1 = _mm_loadu_si64(imageTopLeft1 + image1StrideElements);
250 const __m128i image0_row2 = _mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements);
// Byte mask; the statement that consumes it is not visible in this excerpt.
251 __m128i mask =
SSE::set128i(0x0000000000FFFFFFull, 0xFFFFFFFFFFFFFFFFull);
254 const __m128i image1_row2 = _mm_loadu_si64(imageTopLeft1 + 2u * image1StrideElements);
// Compiler-specific branch: the two variants below load the same rows but use
// masks that differ by one byte. The #else / #endif lines are not visible here.
260 #ifdef OCEAN_COMPILER_CLANG
265 const __m128i image0_row3 = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
266 mask =
SSE::set128i(0x0000000000000000ull, 0x0000FFFFFFFFFFFFull);
269 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
275 const __m128i image0_row3 = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
276 mask =
SSE::set128i(0x0000000000000000ull, 0x000000FFFFFFFFFFull);
279 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
// Rows 4 and 5 are non-const; presumably modified on elided lines.
292 __m128i image0_row4 = _mm_loadu_si64(imageTopLeft0 + 4u * image0StrideElements);
295 __m128i image1_row4 = _mm_loadu_si64(imageTopLeft1 + 4u * image1StrideElements);
299 __m128i image0_row5 = _mm_loadu_si64(imageTopLeft0 + 5u * image0StrideElements);
300 mask =
SSE::set128i(0x000000000000FFFFull, 0xFFFFFF0000000000ull);
303 __m128i image1_row5 = _mm_loadu_si64(imageTopLeft1 + 5u * image1StrideElements);
// Clang-only adjustment: shift both interpolation registers (declared on lines
// not visible here) left by 6 bytes.
306 #ifdef OCEAN_COMPILER_CLANG
310 interpolation0 = _mm_slli_si128(interpolation0, 6);
311 interpolation1 = _mm_slli_si128(interpolation1, 6);
// Specialization for 2 channels and a 5x5 patch, with interpolation factors for
// both frames. Each row is read with one 16-byte load; the elements at offsets
// 8 and 9 of rows 0..4 are handled individually through
// SSE::ssd2Channel16Bit1x1 and accumulated in localResult. The SIMD arithmetic
// on the loaded rows is on lines not visible in this excerpt.
322 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<2u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx0,
const unsigned int fy0,
const unsigned int fx1,
const unsigned int fy1)
// Factors are fixed point with 128 == 1.0.
324 ocean_assert(fx0 <= 128u && fy0 <= 128u);
325 ocean_assert(fx1 <= 128u && fy1 <= 128u);
// Complementary factors (1 - f) in the same fixed-point scale.
333 const unsigned int fx0_ = 128u - fx0;
334 const unsigned int fy0_ = 128u - fy0;
336 const unsigned int fx1_ = 128u - fx1;
337 const unsigned int fy1_ = 128u - fy1;
// The four weight products per frame; each is at most 128 * 128 = 16384 given
// the asserts above, so the casts to short below cannot overflow.
339 const unsigned int f0x_y_ = fx0_ * fy0_;
340 const unsigned int f0xy_ = fx0 * fy0_;
341 const unsigned int f0x_y = fx0_ * fy0;
342 const unsigned int f0xy = fx0 * fy0;
344 const unsigned int f1x_y_ = fx1_ * fy1_;
345 const unsigned int f1xy_ = fx1 * fy1_;
346 const unsigned int f1x_y = fx1_ * fy1;
347 const unsigned int f1xy = fx1 * fy1;
// Broadcast each weight into all eight 16-bit lanes of a register.
349 const __m128i __f0x_y_ = _mm_set1_epi16(
short(f0x_y_));
350 const __m128i __f0xy_ = _mm_set1_epi16(
short(f0xy_));
351 const __m128i __f0x_y = _mm_set1_epi16(
short(f0x_y));
352 const __m128i __f0xy = _mm_set1_epi16(
short(f0xy));
354 const __m128i __f1x_y_ = _mm_set1_epi16(
short(f1x_y_));
355 const __m128i __f1xy_ = _mm_set1_epi16(
short(f1xy_));
356 const __m128i __f1x_y = _mm_set1_epi16(
short(f1x_y));
357 const __m128i __f1xy = _mm_set1_epi16(
short(f1xy));
// Unaligned 16-byte loads of rows 0 and 1 of both frames.
364 __m128i image0_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft0);
365 __m128i image0_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
369 __m128i image1_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft1);
370 __m128i image1_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
// Row 0: elements 8 and 9 are processed one at a time with the scalar weights.
373 unsigned int localResult =
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 8, imageTopLeft1 + 8, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
374 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 9, imageTopLeft1 + 9, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
382 __m128i image0_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
386 __m128i image1_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
// Row 1: elements 8 and 9.
392 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 8u, imageTopLeft1 + 1u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
393 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 9u, imageTopLeft1 + 1u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
401 __m128i image0_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
405 __m128i image1_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
// Row 2: elements 8 and 9.
408 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 8u, imageTopLeft1 + 2u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
409 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 9u, imageTopLeft1 + 2u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
417 __m128i image0_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
421 __m128i image1_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
// Row 3: elements 8 and 9.
427 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 8u, imageTopLeft1 + 3u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
428 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 9u, imageTopLeft1 + 3u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
// Row 5 is loaded starting 2 bytes earlier and then shifted right by 2 bytes:
// lane 0 still holds the first element of the row, the two top bytes become
// zero, and the load ends 2 bytes sooner -- presumably to avoid reading beyond
// the end of the frame memory (confirm).
433 __m128i image0_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements - 2u)), 2);
437 __m128i image1_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements - 2u)), 2);
// Row 4: elements 8 and 9.
443 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 8u, imageTopLeft1 + 4u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
444 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 9u, imageTopLeft1 + 4u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
// Specialization for 3 channels and a 5x5 patch, with interpolation factors for
// both frames. Every row is read with two overlapping 16-byte loads: "Front"
// starts at offset 0 and "Back" at offset 8. Only the weight setup and the
// loads are visible in this excerpt.
450 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<3u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx0,
const unsigned int fy0,
const unsigned int fx1,
const unsigned int fy1)
// Factors are fixed point with 128 == 1.0.
452 ocean_assert(fx0 <= 128u && fy0 <= 128u);
453 ocean_assert(fx1 <= 128u && fy1 <= 128u);
// Complementary factors (1 - f) in the same fixed-point scale.
461 const unsigned int fx0_ = 128u - fx0;
462 const unsigned int fy0_ = 128u - fy0;
464 const unsigned int fx1_ = 128u - fx1;
465 const unsigned int fy1_ = 128u - fy1;
// Weight products for frame 0, broadcast to all eight 16-bit lanes. Each
// product is at most 128 * 128 = 16384, so the cast to short cannot overflow.
467 const __m128i f0x_y_ = _mm_set1_epi16(
short(fx0_ * fy0_));
468 const __m128i f0xy_ = _mm_set1_epi16(
short(fx0 * fy0_));
469 const __m128i f0x_y = _mm_set1_epi16(
short(fx0_ * fy0));
470 const __m128i f0xy = _mm_set1_epi16(
short(fx0 * fy0));
// Weight products for frame 1.
472 const __m128i f1x_y_ = _mm_set1_epi16(
short(fx1_ * fy1_));
473 const __m128i f1xy_ = _mm_set1_epi16(
short(fx1 * fy1_));
474 const __m128i f1x_y = _mm_set1_epi16(
short(fx1_ * fy1));
475 const __m128i f1xy = _mm_set1_epi16(
short(fx1 * fy1));
// Rows 0 and 1 of frame 0: front part (bytes 0..15) and back part (bytes 8..23).
481 __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
482 __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
485 __m128i image0_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 8u));
486 __m128i image0_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements + 8u));
// Rows 0 and 1 of frame 1.
489 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
490 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
493 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
494 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
// Row 2 of both frames.
505 __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
508 __m128i image0_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements + 8u));
511 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
514 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
// Row 3 of both frames.
525 __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
528 __m128i image0_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements + 8u));
531 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
534 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
// Row 4 of both frames.
544 __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
547 __m128i image0_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements + 8u));
550 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
553 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
// Row 5: the back part is loaded 6 bytes earlier (offset 8 - 6 = 2) and then
// shifted right by 6 bytes, so lane 0 again corresponds to offset 8 while the
// six top bytes become zero and the load ends 6 bytes sooner -- presumably to
// avoid reading beyond the end of the frame memory (confirm).
561 __m128i image0_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements));
564 __m128i image0_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements + 8u - 6u)), 6);
567 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
570 __m128i image1_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u - 6u)), 6);
// Specialization for 4 channels and a 5x5 patch, with interpolation factors for
// both frames. Every row is read with two overlapping 16-byte loads ("Front" at
// offset 0, "Back" at offset 8); the trailing 8 bytes of two consecutive rows
// are then packed into one register. Only the weight setup, the loads and the
// repacking are visible in this excerpt.
// NOTE(review): unlike the 1-, 2- and 3-channel variants, no range asserts for
// fx0/fy0/fx1/fy1 are visible here -- they may be on elided lines; confirm.
579 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<4u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx0,
const unsigned int fy0,
const unsigned int fx1,
const unsigned int fy1)
// Complementary factors (1 - f), fixed point with 128 == 1.0.
587 const unsigned int fx0_ = 128u - fx0;
588 const unsigned int fy0_ = 128u - fy0;
590 const unsigned int fx1_ = 128u - fx1;
591 const unsigned int fy1_ = 128u - fy1;
// Weight products for frame 0, broadcast to all eight 16-bit lanes.
593 const __m128i f0x_y_ = _mm_set1_epi16(
short(fx0_ * fy0_));
594 const __m128i f0xy_ = _mm_set1_epi16(
short(fx0 * fy0_));
595 const __m128i f0x_y = _mm_set1_epi16(
short(fx0_ * fy0));
596 const __m128i f0xy = _mm_set1_epi16(
short(fx0 * fy0));
// Weight products for frame 1.
598 const __m128i f1x_y_ = _mm_set1_epi16(
short(fx1_ * fy1_));
599 const __m128i f1xy_ = _mm_set1_epi16(
short(fx1 * fy1_));
600 const __m128i f1x_y = _mm_set1_epi16(
short(fx1_ * fy1));
601 const __m128i f1xy = _mm_set1_epi16(
short(fx1 * fy1));
// Rows 0 and 1 of frame 0: front part (bytes 0..15) and back part (bytes 8..23).
608 __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
609 __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
613 __m128i image0_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 8u));
614 __m128i image0_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements + 8u));
// Rows 0 and 1 of frame 1.
618 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
619 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
623 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
624 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
// Row 2 of both frames.
637 __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
641 __m128i image0_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements + 8u));
645 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
649 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
// Pack two rows into one register: the right shift by 8 moves the upper half of
// row N's back part (row bytes 16..23) into the lower 8 bytes, and the 0xF0
// blend takes the upper four 16-bit lanes from row N + 1's back part (its row
// bytes 16..23). Statement order matters: row0Back must be rebuilt from the
// still-unmodified row1Back before row1Back itself is overwritten.
657 image0_row0Back = _mm_blend_epi16(_mm_srli_si128(image0_row0Back, 8), image0_row1Back, 0xF0);
658 image0_row1Back = _mm_blend_epi16(_mm_srli_si128(image0_row1Back, 8), image0_row2Back, 0xF0);
662 image1_row0Back = _mm_blend_epi16(_mm_srli_si128(image1_row0Back, 8), image1_row1Back, 0xF0);
663 image1_row1Back = _mm_blend_epi16(_mm_srli_si128(image1_row1Back, 8), image1_row2Back, 0xF0);
// Row 3 of both frames.
677 __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
681 __m128i image0_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements + 8u));
685 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
689 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
// Row 4 of both frames.
702 __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
706 __m128i image0_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements + 8u));
710 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
714 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
// Same two-row packing for the row pairs (2, 3) and (3, 4).
721 image0_row2Back = _mm_blend_epi16(_mm_srli_si128(image0_row2Back, 8), image0_row3Back, 0xF0);
722 image0_row3Back = _mm_blend_epi16(_mm_srli_si128(image0_row3Back, 8), image0_row4Back, 0xF0);
726 image1_row2Back = _mm_blend_epi16(_mm_srli_si128(image1_row2Back, 8), image1_row3Back, 0xF0);
727 image1_row3Back = _mm_blend_epi16(_mm_srli_si128(image1_row3Back, 8), image1_row4Back, 0xF0);
// Row 5 of both frames.
737 __m128i image0_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements));
741 __m128i image0_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements + 8u));
745 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
749 __m128i image1_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u));
// Rows 4 and 5 have no partner row to pack with: one 64-bit half of their back
// part is kept and the other half is cleared. Which half survives depends on
// the argument order of SSE::set128i, which is not visible here -- presumably
// (high, low), i.e. the upper half holding row bytes 16..23 is kept; confirm.
756 image0_row4Back = _mm_and_si128(image0_row4Back,
SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
757 image0_row5Back = _mm_and_si128(image0_row5Back,
SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
761 image1_row4Back = _mm_and_si128(image1_row4Back,
SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
762 image1_row5Back = _mm_and_si128(image1_row5Back,
SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
// Specialization for 1 channel and a 5x5 patch in which only frame 1 is
// interpolated (factors fx1, fy1); frame 0 is read at integer positions. Rows
// of frame 0 are shifted and blended into the shared register image0_row, while
// frame 1 is read row by row (six rows, 0..5). The arithmetic consuming these
// registers is on lines not visible in this excerpt.
772 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<1u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx1,
const unsigned int fy1)
// Factors are fixed point with 128 == 1.0.
774 ocean_assert(fx1 <= 128u && fy1 <= 128u);
// Complementary factors (1 - f) in the same fixed-point scale.
781 const unsigned int fx1_ = 128u - fx1;
782 const unsigned int fy1_ = 128u - fy1;
// The four weight products for frame 1; each is at most 128 * 128 = 16384 given
// the assert above, so the casts to short below cannot overflow.
784 const unsigned int f1x_y_ = fx1_ * fy1_;
785 const unsigned int f1xy_ = fx1 * fy1_;
786 const unsigned int f1x_y = fx1_ * fy1;
787 const unsigned int f1xy = fx1 * fy1;
// Broadcast each weight into all eight 16-bit lanes of a register.
789 const __m128i __f1x_y_ = _mm_set1_epi16(
short(f1x_y_));
790 const __m128i __f1xy_ = _mm_set1_epi16(
short(f1xy_));
791 const __m128i __f1x_y = _mm_set1_epi16(
short(f1x_y));
792 const __m128i __f1xy = _mm_set1_epi16(
short(f1xy));
// Row 0 of frame 0: 8-byte load moved up by 11 bytes, so its first five bytes
// end up in register bytes 11..15.
798 __m128i image0_row = _mm_slli_si128(_mm_loadu_si64(imageTopLeft0), 11);
800 const __m128i image1_row0 = _mm_loadu_si64(imageTopLeft1);
801 const __m128i image1_row1 = _mm_loadu_si64(imageTopLeft1 + image1StrideElements);
// Merge row 1 of frame 0 (shifted up by 6 bytes) into image0_row; blendv takes
// a byte from the second operand wherever the mask byte has its top bit set.
809 __m128i mask =
SSE::set128i(0x0000000000FFFFFFull, 0xFFFFFFFFFFFFFFFFull);
810 image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 1u * image0StrideElements), 6), mask);
812 const __m128i image1_row2 = _mm_loadu_si64(imageTopLeft1 + 2u * image1StrideElements);
// Compiler-specific branch: under Clang row 2 of frame 0 is shifted up by one
// byte and a six-byte mask is used; the other variant uses the unshifted row
// with a five-byte mask. The #else / #endif lines are not visible here.
818 #ifdef OCEAN_COMPILER_CLANG
823 mask =
SSE::set128i(0x0000000000000000ull, 0x0000FFFFFFFFFFFFull);
824 image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements), 1u), mask);
826 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
832 mask =
SSE::set128i(0x0000000000000000ull, 0x000000FFFFFFFFFFull);
833 image0_row = _mm_blendv_epi8(image0_row, _mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements), mask);
835 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
// image0_row is restarted with row 3 of frame 0 (the previous content has
// presumably been consumed on elided lines).
848 image0_row = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
850 const __m128i image1_row4 = _mm_loadu_si64(imageTopLeft1 + 4u * image1StrideElements);
// Row 4 of frame 0 is loaded starting 3 bytes earlier and shifted up by 2
// bytes before being merged through the mask.
854 mask =
SSE::set128i(0x000000000000FFFFull, 0xFFFFFF0000000000ull);
855 image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 4u * image0StrideElements - 3), 2), mask);
// Row 5 of frame 1 is loaded starting 2 bytes earlier than the row start.
857 const __m128i image1_row5 = _mm_loadu_si64(imageTopLeft1 + 5u * image1StrideElements - 2);
// Clang-only adjustment: shift both registers left by 6 bytes (image1_row is
// declared on a line not visible here).
860 #ifdef OCEAN_COMPILER_CLANG
864 image0_row = _mm_slli_si128(image0_row, 6);
865 image1_row = _mm_slli_si128(image1_row, 6);
// Specialization for 2 channels and a 5x5 patch in which only frame 1 is
// interpolated (factors fx1, fy1). Frame 0 rows are read with 8-byte loads and
// packed two rows per register; frame 1 rows are read with 16-byte loads. The
// elements at offsets 8 and 9 of rows 0..4 are handled individually through
// SSE::ssd2Channel16Bit1x1 and accumulated in localResult. The SIMD arithmetic
// on the loaded rows is on lines not visible in this excerpt.
876 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<2u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx1,
const unsigned int fy1)
// Factors are fixed point with 128 == 1.0.
878 ocean_assert(fx1 <= 128u && fy1 <= 128u);
// Complementary factors (1 - f) in the same fixed-point scale.
886 const unsigned int fx1_ = 128u - fx1;
887 const unsigned int fy1_ = 128u - fy1;
// The four weight products for frame 1; each is at most 128 * 128 = 16384 given
// the assert above, so the casts to short below cannot overflow.
889 const unsigned int f1x_y_ = fx1_ * fy1_;
890 const unsigned int f1xy_ = fx1 * fy1_;
891 const unsigned int f1x_y = fx1_ * fy1;
892 const unsigned int f1xy = fx1 * fy1;
// Broadcast each weight into all eight 16-bit lanes of a register.
894 const __m128i __f1x_y_ = _mm_set1_epi16(
short(f1x_y_));
895 const __m128i __f1xy_ = _mm_set1_epi16(
short(f1xy_));
896 const __m128i __f1x_y = _mm_set1_epi16(
short(f1x_y));
897 const __m128i __f1xy = _mm_set1_epi16(
short(f1xy));
// Row 0 of frame 0: 8 bytes into the lower half, upper half zeroed.
904 __m128i image0_row0 = _mm_loadl_epi64((__m128i*)imageTopLeft0);
// Rows 0 and 1 of frame 1 (unaligned 16-byte loads).
907 __m128i image1_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft1);
908 __m128i image1_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
// Row 0: elements 8 and 9 are processed one at a time with the scalar weights.
911 unsigned int localResult =
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 8, imageTopLeft1 + 8, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
912 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 9, imageTopLeft1 + 9, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
// Pack rows 0 and 1 of frame 0 into one register: row 0 in the lower 8 bytes,
// row 1 (shifted up by 8 bytes) in the upper 8 bytes.
920 __m128i image0_row1 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 1u * image0StrideElements));
921 image0_row0 = ::_mm_or_si128(image0_row0, _mm_slli_si128(image0_row1, 8));
924 __m128i image1_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
// Row 1: elements 8 and 9.
930 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 8u, imageTopLeft1 + 1u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
931 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 9u, imageTopLeft1 + 1u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
939 __m128i image0_row2 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
942 __m128i image1_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
// Row 2: elements 8 and 9.
945 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 8u, imageTopLeft1 + 2u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
946 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 9u, imageTopLeft1 + 2u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
// Pack rows 2 and 3 of frame 0 into one register (same layout as rows 0 and 1).
954 __m128i image0_row3 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
955 image0_row2 = _mm_or_si128(image0_row2, _mm_slli_si128(image0_row3, 8));
958 __m128i image1_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
// Row 3: elements 8 and 9.
964 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 8u, imageTopLeft1 + 3u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
965 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 9u, imageTopLeft1 + 3u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
// Row 4 of frame 0 stays alone in the lower half of its register.
969 __m128i image0_row4 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
// Row 5 of frame 1 is loaded starting 2 bytes earlier and then shifted right by
// 2 bytes: lane 0 still holds the first element of the row, the two top bytes
// become zero, and the load ends 2 bytes sooner -- presumably to avoid reading
// beyond the end of the frame memory (confirm).
972 __m128i image1_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements - 2u)), 2);
// Row 4: elements 8 and 9.
978 localResult +=
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 8u, imageTopLeft1 + 4u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
979 +
SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 9u, imageTopLeft1 + 4u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
// Specialization for 3 channels and a 5x5 patch in which only frame 1 is
// interpolated (factors fx1, fy1). Frame 0 rows are read with one 16-byte load
// and rearranged with a byte shuffle; frame 1 rows are read with two
// overlapping 16-byte loads ("Front" at offset 0, "Back" at offset 8). Only the
// weight setup and the loads are visible in this excerpt.
// NOTE(review): no range assert for fx1/fy1 is visible here -- it may be on an
// elided line; confirm.
985 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<3u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx1,
const unsigned int fy1)
// Complementary factors (1 - f), fixed point with 128 == 1.0.
993 const unsigned int fx1_ = 128u - fx1;
994 const unsigned int fy1_ = 128u - fy1;
// Weight products for frame 1, broadcast to all eight 16-bit lanes.
996 const __m128i f1x_y_ = _mm_set1_epi16(
short(fx1_ * fy1_));
997 const __m128i f1xy_ = _mm_set1_epi16(
short(fx1 * fy1_));
998 const __m128i f1x_y = _mm_set1_epi16(
short(fx1_ * fy1));
999 const __m128i f1xy = _mm_set1_epi16(
short(fx1 * fy1));
// Row 0 of frame 0, rearranged by a byte shuffle. The control byte 0xA0 has its
// top bit set, which makes _mm_shuffle_epi8 write zero to that lane; the other
// control bytes are plain source indices 0x00..0x0E. Assuming SSE::set128i
// takes (high, low), the result keeps bytes 0..7, inserts a zero at byte 8 and
// moves source bytes 8..14 to bytes 9..15 -- confirm the argument order.
1005 __m128i image0_row0 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)imageTopLeft0),
SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
// Rows 0 and 1 of frame 1: front part (bytes 0..15) and back part (bytes 8..23).
1007 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
1008 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
1011 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
1012 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
// Row 1 of frame 0 (same shuffle) and row 2 of frame 1.
1023 __m128i image0_row1 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 1u * image0StrideElements)),
SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1025 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
1028 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
// Row 2 of frame 0 and row 3 of frame 1.
1039 __m128i image0_row2 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements)),
SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1041 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
1044 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
// Row 3 of frame 0 and row 4 of frame 1.
1054 __m128i image0_row3 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements)),
SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1056 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
1059 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
// Row 4 of frame 0 and row 5 of frame 1. The back part of row 5 is loaded 6
// bytes earlier (offset 8 - 6 = 2) and shifted right by 6 bytes, so lane 0
// again corresponds to offset 8 while the six top bytes become zero and the
// load ends 6 bytes sooner -- presumably to avoid reading beyond the end of the
// frame memory (confirm).
1067 __m128i image0_row4 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements)),
SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1069 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
1072 __m128i image1_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u - 6u)), 6);
// Specialization for 4 channels and a 5x5 patch in which only frame 1 is
// interpolated (factors fx1, fy1). Frame 0 rows are read with one 16-byte load
// for the first 16 bytes plus a 4-byte read at offset 16; frame 1 rows are read
// with two overlapping 16-byte loads ("Front" at offset 0, "Back" at offset 8)
// whose trailing 8 bytes are packed two rows per register. Only the weight
// setup, the loads and the repacking are visible in this excerpt.
// NOTE(review): no range assert for fx1/fy1 is visible here -- it may be on an
// elided line; confirm.
1081 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<4u, 5u>(
const uint8_t*
const imageTopLeft0,
const uint8_t*
const imageTopLeft1,
const unsigned int image0StrideElements,
const unsigned int image1StrideElements,
const unsigned int fx1,
const unsigned int fy1)
// Complementary factors (1 - f), fixed point with 128 == 1.0.
1089 const unsigned int fx1_ = 128u - fx1;
1090 const unsigned int fy1_ = 128u - fy1;
// Weight products for frame 1, broadcast to all eight 16-bit lanes.
1092 const __m128i f1x_y_ = _mm_set1_epi16(
short(fx1_ * fy1_));
1093 const __m128i f1xy_ = _mm_set1_epi16(
short(fx1 * fy1_));
1094 const __m128i f1x_y = _mm_set1_epi16(
short(fx1_ * fy1));
1095 const __m128i f1xy = _mm_set1_epi16(
short(fx1 * fy1));
// Row 0 of frame 0 (first 16 bytes).
1102 __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
// Rows 0 and 1 of frame 1: front part (bytes 0..15) and back part (bytes 8..23).
1105 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
1106 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
1110 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
1111 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
// Row 1 of frame 0 and row 2 of frame 1.
1124 __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 1u * image0StrideElements));
1127 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
1131 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
// Gather the 4 bytes at offset 16 of rows 0 and 1 of frame 0 into the two
// lowest 32-bit lanes (the last _mm_set_epi32 argument is lane 0); the upper
// two lanes are zero.
// NOTE(review): the byte pointers are cast to unsigned int* and dereferenced;
// nothing visible guarantees 4-byte alignment of these addresses, so this
// relies on the target tolerating unaligned/aliased reads -- verify.
1138 __m128i image0row01 = _mm_set_epi32(0, 0, *((
unsigned int*)(imageTopLeft0 + image0StrideElements + 16u)), *((
unsigned int*)(imageTopLeft0 + 16)));
// Pack two frame 1 rows into one register: the right shift by 8 moves the upper
// half of row N's back part (row bytes 16..23) into the lower 8 bytes, and the
// 0xF0 blend takes the upper four 16-bit lanes from row N + 1's back part.
// Statement order matters: row0Back must be rebuilt from the still-unmodified
// row1Back before row1Back itself is overwritten.
1141 image1_row0Back = _mm_blend_epi16(_mm_srli_si128(image1_row0Back, 8), image1_row1Back, 0xF0);
1142 image1_row1Back = _mm_blend_epi16(_mm_srli_si128(image1_row1Back, 8), image1_row2Back, 0xF0);
// Row 2 of frame 0 and row 3 of frame 1.
1156 __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
1159 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
1163 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
// Row 3 of frame 0 and row 4 of frame 1.
1176 __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
1179 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
1183 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
// Gather the 4 bytes at offset 16 of rows 2 and 3 of frame 0 (row 2 in lane 0,
// row 3 in lane 1); same alignment caveat as for image0row01.
1190 __m128i image0row23 = _mm_set_epi32(0, 0, *((
unsigned int*)(imageTopLeft0 + 3u * image0StrideElements + 16u)), *((
unsigned int*)(imageTopLeft0 + 2u * image0StrideElements + 16)));
// Same two-row packing for the frame 1 row pairs (2, 3) and (3, 4).
1193 image1_row2Back = _mm_blend_epi16(_mm_srli_si128(image1_row2Back, 8), image1_row3Back, 0xF0);
1194 image1_row3Back = _mm_blend_epi16(_mm_srli_si128(image1_row3Back, 8), image1_row4Back, 0xF0);
// Row 4 of frame 0 and row 5 of frame 1.
1204 __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
1207 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
1211 __m128i image1_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u));
// The 4 bytes at offset 16 of row 4 of frame 0 go into lane 1; lane 0 and the
// upper two lanes are zero. Same alignment caveat as for image0row01.
1218 __m128i image0row4 = _mm_set_epi32(0, 0, *((
unsigned int*)(imageTopLeft0 + 4u * image0StrideElements + 16u)), 0);
// Rows 4 and 5 of frame 1 have no partner row to pack with: one 64-bit half of
// their back part is kept and the other half is cleared. Which half survives
// depends on the argument order of SSE::set128i, which is not visible here --
// presumably (high, low); confirm.
1221 image1_row4Back = _mm_and_si128(image1_row4Back,
SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
1222 image1_row5Back = _mm_and_si128(image1_row5Back,
SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
This class implements sum of square difference calculation functions allowing to determine the SSE wi...
Definition: AdvancedSumSquareDifferencesSSE.h:33
static uint32_t patch8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int width1, const Scalar centerX0, const Scalar centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences for an image patch determined between two individual images.
Definition: AdvancedSumSquareDifferencesSSE.h:110
static uint32_t patch8BitPerChannel(const uint8_t *const imageTopLeft0, const uint8_t *const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
Returns the sum of square differences for an image patch determined between two individual images.
static uint32_t patch8BitPerChannel(const uint8_t *const imageTopLeft0, const uint8_t *const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
Returns the sum of square differences for an image patch determined between two individual images.
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: SSE.h:1583
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition: SSE.h:2264
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition: SSE.h:2117
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: SSE.h:1733
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition: SSE.h:1533
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition: SSE.h:1322
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition: SSE.h:3927
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition: SSE.h:3770
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition: SSE.h:1879
float Scalar
Definition of a scalar type.
Definition: Math.h:128
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15