141 static_assert(tSize >= 1u,
"Invalid buffer size!");
143 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
145 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
147 __m128i sumLow_128i = _mm_setzero_si128();
148 __m128i sumHigh_128i = _mm_setzero_si128();
152 constexpr unsigned int blocks16 = tSize / 16u;
154 for (
unsigned int n = 0u; n < blocks16; ++n)
156 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)buffer0);
157 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)buffer1);
159 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
160 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
162 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
163 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
169 if constexpr (blocks16 >= 1u && (tSize % 16u) >= 10u)
171 constexpr unsigned int remainingElements = tSize % 16u;
172 constexpr unsigned int overlappingElements = 16u - remainingElements;
174 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0 - overlappingElements)), overlappingElements);
175 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1 - overlappingElements)), overlappingElements);
177 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
178 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
180 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
181 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
183 const __m128i sum_128i = _mm_add_epi32(sumLow_128i, sumHigh_128i);
191 constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
192 static_assert(blocks8 <= 1u,
"Invalid number of blocks!");
194 if constexpr (blocks8 == 1u)
196 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)buffer0);
197 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)buffer1);
199 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
201 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
207 const __m128i sum_128i = _mm_add_epi32(sumLow_128i, sumHigh_128i);
209 constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
210 static_assert(remainingElements < 8u,
"Invalid number of remaining elements!");
216 for (
unsigned int n = 0u; n < remainingElements; ++n)
// NOTE(review): the signature of this routine is not part of this listing, and several
// lines (braces, `else`, pointer increments, the final return) appear to be omitted.
// From the visible code: accumulates the sum of squared byte-wise differences between
// two square patches of tPatchSize rows, each row holding tChannels * tPatchSize
// elements, read through patch0/patch1 with independent row strides.
228 static_assert(tChannels >= 1u,
"Invalid channel number!");
229 static_assert(tPatchSize >= 1u,
"Invalid buffer size!");
// Precondition checks: both patch pointers must be valid and each stride must cover
// at least one full patch row.
231 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
233 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
234 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// Number of elements in one patch row.
236 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// A row is split into full 16-element blocks plus a remainder of 0..15 elements.
238 constexpr unsigned int blocks16 = patchWidthElements / 16u;
239 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
// Exactly one strategy is selected for the remainder:
//   9..15 elements -> one partial 16-element block
//   8 elements     -> one full 8-element block
//   3..7 elements  -> one partial 8-element block
//   0..2 elements  -> handled one element at a time (blocks1)
241 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
243 constexpr bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
245 constexpr bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
247 constexpr unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
249 static_assert(blocks1 <= 2u,
"Invalid block size!");
// The constant below is built through a `short` cast, so `short` must be 16 bits wide.
251 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Every 16-bit lane holds 0x01FF: low byte 0xFF (-1 as signed byte), high byte 0x01 (+1).
// Used as the signed operand of _mm_maddubs_epi16 on interleaved (patch0, patch1) bytes,
// each 16-bit result is patch1 - patch0, i.e. a signed difference. The variables named
// `absDifferences...` therefore hold signed values; the sign vanishes when squared.
253 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// Two independent accumulators, each holding four 32-bit partial sums.
255 __m128i sumLow_128i = _mm_setzero_si128();
256 __m128i sumHigh_128i = _mm_setzero_si128();
// Scalar accumulator for the elements handled one at a time.
258 uint32_t sumIndividual = 0u;
// Iterate over all patch rows.
260 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Full 16-element blocks, loaded without alignment requirements.
// NOTE(review): no pointer increment is visible inside this loop in this listing;
// presumably patch0/patch1 advance by 16 per iteration on omitted lines - confirm.
265 for (
unsigned int n = 0u; n < blocks16; ++n)
267 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)patch0);
268 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)patch1);
// Interleave the bytes of both patches and form the signed differences.
270 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
271 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
// _mm_madd_epi16(d, d) squares each difference and adds adjacent pairs into 32-bit lanes.
273 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
274 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Remainder of exactly 8 elements: a 64-bit load fills the lower register half and
// zeroes the upper half, so only the low unpack is needed.
280 if constexpr (fullBlock8)
282 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)patch0);
283 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)patch1);
285 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
287 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Remainder of 9..15 elements: one 16-byte load that covers more than the remainder.
293 if constexpr (partialBlock16)
// Number of loaded bytes that do not belong to the remainder.
295 constexpr unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
297 static_assert(overlapElements < 8u,
"Invalid value!");
// All rows except the last one.
299 if (y < tPatchSize - 1u)
// Load 16 bytes from the current position; this reads overlapElements bytes beyond the
// remainder. The byte-wise left shift discards exactly those trailing bytes and shifts
// in zeros, which are identical in both registers and contribute nothing to the sum.
301 const __m128i buffer0_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch0), overlapElements);
302 const __m128i buffer1_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch1), overlapElements);
304 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
305 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
307 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
308 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// NOTE(review): presumably the else-branch for the last row (the `else` keyword is not
// visible in this listing). The load starts overlapElements bytes before the current
// position and the right shift discards those leading bytes, so no byte beyond the
// remainder is read - presumably to avoid reading past the end of the patch memory.
312 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch0 - overlapElements)), overlapElements);
313 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch1 - overlapElements)), overlapElements);
315 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
316 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
318 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
319 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Skip the remainder elements that were just processed.
322 patch0 += remainingAfterBlocks16;
323 patch1 += remainingAfterBlocks16;
// Remainder of 3..7 elements: one 8-byte load that covers more than the remainder.
326 if constexpr (partialBlock8)
328 constexpr unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
330 static_assert(overlapElements < 8u,
"Invalid value!");
// All rows except the last one.
332 if (y < tPatchSize - 1u)
// The 8 loaded bytes sit in the lower register half; shifting left by
// overlapElements + 8 bytes moves the remainder elements to the top of the upper half
// and discards the bytes read beyond the remainder.
334 const __m128i buffer0_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch0), overlapElements + 8);
335 const __m128i buffer1_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch1), overlapElements + 8);
// Only the upper half carries data; the result is still accumulated in sumLow_128i.
337 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
339 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// NOTE(review): presumably the else-branch for the last row (`else` not visible here).
// The load starts overlapElements bytes earlier and the right shift drops those
// leading bytes, leaving the remainder elements at the bottom of the register.
343 const __m128i buffer0_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch0 - overlapElements)), overlapElements);
344 const __m128i buffer1_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch1 - overlapElements)), overlapElements);
346 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
348 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Skip the remainder elements that were just processed.
351 patch0 += remainingAfterBlocks16;
352 patch1 += remainingAfterBlocks16;
// Remainder of 1..2 elements: accumulate element by element via sqrDistance
// (defined outside this listing).
355 if constexpr (blocks1 != 0u)
357 for (
unsigned int n = 0u; n < blocks1; ++n)
359 sumIndividual +=
sqrDistance(patch0[n], patch1[n]);
// Move to the start of the next row. This assumes both pointers have advanced by
// patchWidthElements within the row - NOTE(review): some of those increments are not
// visible in this listing; confirm against the full source.
366 patch0 += patch0StrideElements - patchWidthElements;
367 patch1 += patch1StrideElements - patchWidthElements;
// Merge both vector accumulators; the horizontal reduction and the addition of
// sumIndividual are not visible in this listing.
370 const __m128i sum_128i = _mm_add_epi32(sumLow_128i, sumHigh_128i);
// Accumulates the sum of squared differences between two square image patches of
// tPatchSize x tPatchSize pixels with tChannels channels each, centered at
// (centerX0, centerY0) in image0 and (centerX1, centerY1) in image1. Rows and columns
// are addressed through mirroring helpers (CVUtilities::mirrorOffset, loadMirrored_u_*,
// mirrorIndex), so the patch may extend beyond the image border.
// NOTE(review): braces, `else` keywords, several increments and the final return
// statement are not visible in this listing; the comments cover the visible lines only.
//
// image0, image1: image data, must not be null
// width0, height0, width1, height1: image dimensions in pixels
// centerX0, centerY0, centerX1, centerY1: patch centers, must lie inside their image
// image0PaddingElements, image1PaddingElements: extra elements at the end of each row,
//   added to width * tChannels to form the row stride
382uint32_t
SumSquareDifferencesSSE::patchMirroredBorder8BitPerChannel(
const uint8_t* image0,
const uint8_t* image1,
const unsigned int width0,
const unsigned int height0,
const unsigned int width1,
const unsigned int height1,
const unsigned int centerX0,
const unsigned int centerY0,
const unsigned int centerX1,
const unsigned int centerY1,
const unsigned int image0PaddingElements,
const unsigned int image1PaddingElements)
// The patch size must be odd so that the patch has a well-defined center pixel.
384 static_assert(tChannels >= 1u,
"Invalid channel number!");
385 static_assert(tPatchSize % 2u == 1u,
"Invalid patch size!");
// Precondition checks: valid image pointers and patch centers inside the images.
387 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
389 ocean_assert(centerX0 < width0 && centerY0 < height0);
390 ocean_assert(centerX1 < width1 && centerY1 < height1);
// Half patch size (rounded down): the patch spans center +/- tPatchSize_2.
392 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
// Row widths and row strides measured in elements (pixels * channels, plus padding).
394 const unsigned int width0Elements = width0 * tChannels;
395 const unsigned int width1Elements = width1 * tChannels;
397 const unsigned int image0StrideElements = width0Elements + image0PaddingElements;
398 const unsigned int image1StrideElements = width1Elements + image1PaddingElements;
// Number of elements in one patch row.
400 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
// A patch row is split into full 16-element blocks plus a remainder of 0..15 elements.
402 constexpr unsigned int blocks16 = patchWidthElements / 16u;
403 constexpr unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
// A remainder of 9..15 elements is consumed entirely by one partial 16-element block.
405 constexpr bool partialBlock16 = remainingAfterBlocks16 > 8u;
406 constexpr unsigned int remainingAfterPartialBlock16 = partialBlock16 ? 0u : remainingAfterBlocks16;
// Otherwise at most one full 8-element block, leaving 0..7 elements.
408 constexpr unsigned int blocks8 = remainingAfterPartialBlock16 / 8u;
409 constexpr unsigned int remainingAfterBlocks8 = remainingAfterPartialBlock16 % 8u;
// 3..7 leftover elements are consumed by one partial 8-element block.
411 constexpr bool partialBlock8 = remainingAfterBlocks8 >= 3u;
412 constexpr unsigned int remainingAfterPartialBlock8 = partialBlock8 ? 0u : remainingAfterBlocks8;
// Whatever is left is handled one element at a time. By construction this is at most
// 2 elements, so the bound checked below is looser than necessary.
414 constexpr unsigned int blocks1 = remainingAfterPartialBlock8;
416 static_assert(blocks1 <= 7u,
"Invalid block size!");
// The constant below is built through a `short` cast, so `short` must be 16 bits wide.
418 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// Every 16-bit lane holds 0x01FF: low byte 0xFF (-1 as signed byte), high byte 0x01 (+1).
// Used as the signed operand of _mm_maddubs_epi16 on interleaved (row0, row1) bytes,
// each 16-bit result is value1 - value0, i.e. a signed difference. The variables named
// `absDifferences...` therefore hold signed values; the sign vanishes when squared.
420 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// Two independent accumulators, each holding four 32-bit partial sums.
422 __m128i sumLow_128i = _mm_setzero_si128();
423 __m128i sumHigh_128i = _mm_setzero_si128();
// Scalar accumulator for the elements handled one at a time.
425 uint32_t sumIndividual = 0u;
// Scratch buffer handed to the loadMirrored_u_* helpers.
427 uint8_t intermediate[16];
// Row index in image1, starting at the top patch row (may be negative).
// NOTE(review): the increment of y1 is not visible in this listing - presumably it
// advances together with y0 on an omitted line; confirm.
429 int y1 = int(centerY1) - int(tPatchSize_2);
// Iterate over all patch rows of image0, from center - half size to center + half size.
430 for (
int y0 =
int(centerY0) -
int(tPatchSize_2); y0 <= int(centerY0) + int(tPatchSize_2); ++y0)
// Row start pointers; mirrorOffset presumably yields the correction that maps a row
// index outside the image back into it - TODO confirm against CVUtilities.
432 const uint8_t*
const mirroredRow0 = image0 + (
unsigned int)(y0 +
CVUtilities::mirrorOffset(y0, height0)) * image0StrideElements;
433 const uint8_t*
const mirroredRow1 = image1 + (
unsigned int)(y1 +
CVUtilities::mirrorOffset(y1, height1)) * image1StrideElements;
// Element index of the left-most patch column in each row (may be negative).
435 int x0 = (int(centerX0) - int(tPatchSize_2)) *
int(tChannels);
436 int x1 = (int(centerX1) - int(tPatchSize_2)) *
int(tChannels);
// Full 16-element blocks.
// NOTE(review): no advance of x0/x1 is visible inside this loop in this listing;
// presumably both move on by 16 per iteration on omitted lines - confirm.
438 for (
unsigned int n = 0u; n < blocks16; ++n)
440 const __m128i buffer0_128i = loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow0, x0, width0Elements, intermediate);
441 const __m128i buffer1_128i = loadMirrored_u_8x16<tChannels, true, 16u>(mirroredRow1, x1, width1Elements, intermediate);
// Interleave the bytes of both rows and form the signed differences.
443 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
444 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
// _mm_madd_epi16(d, d) squares each difference and adds adjacent pairs into 32-bit lanes.
446 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
447 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Remainder of 9..15 elements, loaded as one partial 16-element block.
453 if constexpr (partialBlock16)
// All rows except the last patch row.
455 if (y0 <
int(centerY0) + int(tPatchSize_2))
// NOTE(review): the second template argument (true here, false in the variant below)
// presumably selects where the valid elements are placed or read - TODO confirm
// against the definition of loadMirrored_u_8x16.
457 const __m128i buffer0_128i = loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate);
458 const __m128i buffer1_128i = loadMirrored_u_8x16<tChannels, true, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate);
460 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
461 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
463 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
464 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// NOTE(review): presumably the else-branch for the last patch row (the `else` keyword
// is not visible in this listing); same computation with the `false` load variant.
468 const __m128i buffer0_128i = loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow0, x0, width0Elements, intermediate);
469 const __m128i buffer1_128i = loadMirrored_u_8x16<tChannels, false, remainingAfterBlocks16>(mirroredRow1, x1, width1Elements, intermediate);
471 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
472 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
474 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
475 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// Skip the remainder elements that were just processed.
478 x0 += remainingAfterBlocks16;
479 x1 += remainingAfterBlocks16;
// At most one full 8-element block; only the low unpack is evaluated for it.
482 for (
unsigned int n = 0u; n < blocks8; ++n)
484 const __m128i buffer0_128i = loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow0, x0, width0Elements, intermediate);
485 const __m128i buffer1_128i = loadMirrored_u_8x8<tChannels, true, 8u>(mirroredRow1, x1, width1Elements, intermediate);
487 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
489 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Leftover of 3..7 elements, loaded as one partial 8-element block.
495 if constexpr (partialBlock8)
// All rows except the last patch row.
499 if (y0 <
int(centerY0) + int(tPatchSize_2))
502 const __m128i loaded0_128i = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
503 const __m128i loaded1_128i = loadMirrored_u_8x8<tChannels, true, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
// Byte-wise left shift by 16 - remainingAfterBlocks8: keeps only the lowest
// remainingAfterBlocks8 bytes of the loaded register and moves them to the top of the
// upper half; everything else becomes zero.
505 constexpr unsigned int shift = 8u + (8u - remainingAfterBlocks8);
506 const __m128i remaining0_128i = _mm_slli_si128(loaded0_128i, shift);
507 const __m128i remaining1_128i = _mm_slli_si128(loaded1_128i, shift);
// Only the upper half carries data; the result is still accumulated in sumLow_128i.
509 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(remaining0_128i, remaining1_128i), constant_signs_m128i);
511 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// NOTE(review): presumably the else-branch for the last patch row (`else` not visible
// here); the `false` load variant is used directly, without an additional shift.
516 const __m128i remaining0_128i = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow0, x0, width0Elements, intermediate);
517 const __m128i remaining1_128i = loadMirrored_u_8x8<tChannels, false, remainingAfterBlocks8>(mirroredRow1, x1, width1Elements, intermediate);
519 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(remaining0_128i, remaining1_128i), constant_signs_m128i);
521 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// Skip the leftover elements that were just processed.
524 x0 += remainingAfterBlocks8;
525 x1 += remainingAfterBlocks8;
// Final 1..2 elements: look up each element through mirrorIndex and accumulate the
// squared distance (sqrDistance and mirrorIndex are defined outside this listing).
528 if constexpr (blocks1 != 0u)
530 for (
unsigned int n = 0u; n < blocks1; ++n)
532 sumIndividual +=
sqrDistance(mirroredRow0[mirrorIndex<tChannels>(x0 +
int(n), width0Elements)], mirroredRow1[mirrorIndex<tChannels>(x1 +
int(n), width1Elements)]);
// Merge both vector accumulators; the horizontal reduction and the addition of
// sumIndividual are not visible in this listing.
539 const __m128i sum_128i = _mm_add_epi32(sumLow_128i, sumHigh_128i);