8 #ifndef META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_SSE_H
9 #define META_OCEAN_CV_SUM_SQUARE_DIFFERENCES_SSE_H
15 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
40 template <
unsigned int tSize>
/**
 * Returns the sum of square differences between two patches within an image.
 * The definition processes tPatchSize rows, each row covering tChannels * tPatchSize 8-bit elements.
 * NOTE(review): for every row but the last one, the definition may load a full 8 or 16 byte block which reaches beyond the last element of the patch row; the memory behind each of these rows therefore needs to be readable -- confirm against the callers.
 * @param patch0 The first patch, must not be nullptr (asserted in the definition); presumably points to the first element of the first patch row -- TODO confirm
 * @param patch1 The second patch, must not be nullptr (asserted in the definition); presumably points to the first element of the first patch row -- TODO confirm
 * @param patch0StrideElements The number of elements between the starts of two successive rows of the first patch, asserted to be >= tChannels * tPatchSize
 * @param patch1StrideElements The number of elements between the starts of two successive rows of the second patch, asserted to be >= tChannels * tPatchSize
 * @return The sum of square differences, as unsigned 32 bit integer
 * @tparam tChannels The number of channels per patch pixel, with range [1, infinity)
 * @tparam tPatchSize The number of rows of the patch, and together with tChannels the number of elements per row, with range [1, infinity)
 */
53 template <
unsigned int tChannels,
unsigned int tPatchSize>
54 static inline uint32_t
patch8BitPerChannel(
const uint8_t* patch0,
const uint8_t* patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements);
/**
 * Returns the sum of square differences between an image patch and a buffer.
 * The definition forwards to patch8BitPerChannel() and uses tChannels * tPatchSize as stride of the buffer, so the buffer is read as tPatchSize tightly packed rows.
 * @param patch0 The image patch, forwarded unchanged as first patch
 * @param buffer1 The memory buffer, forwarded as second patch with a stride of tChannels * tPatchSize elements
 * @param patch0StrideElements The number of elements between the starts of two successive rows of the image patch, forwarded unchanged
 * @return The sum of square differences, as unsigned 32 bit integer
 * @tparam tChannels The number of channels per patch pixel
 * @tparam tPatchSize The number of rows of the patch, and together with tChannels the number of elements per row
 */
65 template <
unsigned int tChannels,
unsigned int tPatchSize>
66 static inline uint32_t
patchBuffer8BitPerChannel(
const uint8_t* patch0,
const uint8_t* buffer1,
const unsigned int patch0StrideElements);
// Definition of buffer8BitPerChannel<tSize>(): accumulates the squared differences between the elements of buffer0 and buffer1.
// NOTE(review): this listing is incomplete -- the signature line, the braces, the pointer increments and the return statements are not visible here; the comments below describe the visible statements only.
69 template <
unsigned int tSize>
72 static_assert(tSize >= 1u,
"Invalid buffer size!");
// _mm_set1_epi16() takes a short, the assert guarantees that short has exactly 16 bits
74 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// each 16 bit lane holds 0x01FF, i.e., the byte pair (0xFF, 0x01) = (-1, +1) when interpreted as signed 8 bit values
76 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// two independent accumulators, each with four 32 bit lanes
78 __m128i sumLow_128i = _mm_setzero_si128();
79 __m128i sumHigh_128i = _mm_setzero_si128();
// number of complete blocks with 16 elements
83 constexpr
unsigned int blocks16 = tSize / 16u;
85 for (
unsigned int n = 0u; n < blocks16; ++n)
// unaligned loads of 16 elements from each buffer
87 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)buffer0);
88 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)buffer1);
// the unpack interleaves the bytes (b0[0], b1[0], b0[1], b1[1], ...); _mm_maddubs_epi16 multiplies them with (-1, +1) and adds each pair,
// which gives the signed 16 bit difference b1[i] - b0[i] (despite the 'abs' in the variable names the values are signed)
90 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
91 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
// _mm_madd_epi16(d, d) squares every 16 bit difference and adds two neighboring squares into one 32 bit lane
93 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
94 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// tail handling with one overlapping 16 byte load: only used if at least one full block exists (so reading backwards is possible) and 10 or more elements remain
100 if constexpr (blocks16 >= 1u && (tSize % 16u) >= 10u)
102 constexpr
unsigned int remainingElements = tSize % 16u;
103 constexpr
unsigned int overlappingElements = 16u - remainingElements;
// the load starts 'overlappingElements' before the current position; the byte-wise right shift removes these leading bytes and shifts in zeros,
// zero bytes in both registers have a difference of zero and thus do not change the sum
// (presumably the leading bytes have already been handled by the previous block -- the pointer increments are not visible here)
105 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0 - overlappingElements)), overlappingElements);
106 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1 - overlappingElements)), overlappingElements);
108 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
109 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
111 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
112 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// merging both accumulators (the horizontal sum and the return statement are not visible in this listing)
114 const __m128i sum_128i = _mm_add_epi32(sumLow_128i, sumHigh_128i);
// alternative tail handling (presumably the else-branch): at most one block with 8 elements, followed by less than 8 individual elements
122 constexpr
unsigned int blocks8 = (tSize % 16u) / 8u;
123 static_assert(blocks8 <= 1u,
"Invalid number of blocks!");
125 if constexpr (blocks8 == 1u)
// _mm_loadl_epi64 loads 8 bytes into the lower half of the register and zeros the upper half, so only the lower half needs to be processed
127 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)buffer0);
128 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)buffer1);
130 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
132 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
138 const __m128i sum_128i = _mm_add_epi32(sumLow_128i, sumHigh_128i);
// number of elements which are covered neither by a 16-block nor by the 8-block
140 constexpr
unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
141 static_assert(remainingElements < 8u,
"Invalid number of remaining elements!");
// loop over the remaining elements; the loop body is not visible in this listing, presumably it adds the individual square differences -- TODO confirm
147 for (
unsigned int n = 0u; n < remainingElements; ++n)
// Definition of patch8BitPerChannel<tChannels, tPatchSize>(): accumulates the squared differences between two patches row by row.
// NOTE(review): this listing is incomplete -- the signature line, the braces, 'else' keywords, some pointer increments and the return statement are not visible here; the comments below describe the visible statements only.
156 template <
unsigned int tChannels,
unsigned int tPatchSize>
159 static_assert(tChannels >= 1u,
"Invalid channel number!");
160 static_assert(tPatchSize >= 1u,
"Invalid buffer size!");
162 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
// a row stride must be at least as large as the number of elements in one patch row
164 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
165 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// number of 8 bit elements in one patch row
167 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// compile-time tiling of one row: as many 16-blocks as possible, followed by exactly one of four mutually exclusive strategies for the remainder
169 constexpr
unsigned int blocks16 = patchWidthElements / 16u;
170 constexpr
unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
// remainder in [9, 15]: handled with one 16 byte load with overlap
172 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u;
// remainder of exactly 8: handled with one 8 byte load
174 constexpr
bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
// remainder in [3, 7]: handled with one 8 byte load with overlap
176 constexpr
bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
// remainder in [0, 2]: handled element by element
178 constexpr
unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
180 static_assert(blocks1 <= 2u,
"Invalid block size!");
// _mm_set1_epi16() takes a short, the assert guarantees that short has exactly 16 bits
182 static_assert(std::is_same<short, int16_t>::value,
"Invalid data type!");
// each 16 bit lane holds 0x01FF, i.e., the byte pair (0xFF, 0x01) = (-1, +1) when interpreted as signed 8 bit values
184 const __m128i constant_signs_m128i = _mm_set1_epi16(
short(0x1FF));
// two independent accumulators, each with four 32 bit lanes
186 __m128i sumLow_128i = _mm_setzero_si128();
187 __m128i sumHigh_128i = _mm_setzero_si128();
// scalar accumulator for the elements which are handled individually
189 uint32_t sumIndividual = 0u;
// one iteration per patch row
191 for (
unsigned int y = 0u; y < tPatchSize; ++y)
196 for (
unsigned int n = 0u; n < blocks16; ++n)
// unaligned loads of 16 elements from each patch
198 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)patch0);
199 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)patch1);
// the unpack interleaves the bytes (p0[0], p1[0], p0[1], p1[1], ...); _mm_maddubs_epi16 multiplies them with (-1, +1) and adds each pair,
// which gives the signed 16 bit difference p1[i] - p0[i] (despite the 'abs' in the variable names the values are signed)
201 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
202 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
// _mm_madd_epi16(d, d) squares every 16 bit difference and adds two neighboring squares into one 32 bit lane
204 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
205 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
211 if constexpr (fullBlock8)
// _mm_loadl_epi64 loads 8 bytes into the lower half of the register and zeros the upper half, so only the lower half needs to be processed
213 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)patch0);
214 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)patch1);
216 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
218 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
224 if constexpr (partialBlock16)
// number of bytes of a 16 byte load which do not belong to the remainder of the row
226 constexpr
unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
228 static_assert(overlapElements < 8u,
"Invalid value!");
// all rows but the last one: the load starts at the current position and reaches 'overlapElements' bytes beyond the end of the patch row
230 if (y < tPatchSize - 1u)
// the byte-wise left shift discards the uppermost 'overlapElements' bytes (the bytes behind the row) and shifts in zeros at the bottom,
// zero bytes in both registers have a difference of zero and thus do not change the sum
232 const __m128i buffer0_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch0), overlapElements);
233 const __m128i buffer1_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch1), overlapElements);
235 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
236 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
238 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
239 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// last row (presumably the else-branch, the keyword is not visible here): the load starts 'overlapElements' before the current position so that it ends with the last element of the row,
// the byte-wise right shift removes the leading bytes and shifts in zeros
243 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch0 - overlapElements)), overlapElements);
244 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch1 - overlapElements)), overlapElements);
246 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
247 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
249 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
250 sumHigh_128i = _mm_add_epi32(sumHigh_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// skipping the elements of the row's remainder
253 patch0 += remainingAfterBlocks16;
254 patch1 += remainingAfterBlocks16;
257 if constexpr (partialBlock8)
// number of bytes of an 8 byte load which do not belong to the remainder of the row
259 constexpr
unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
261 static_assert(overlapElements < 8u,
"Invalid value!");
// all rows but the last one: the load starts at the current position and reaches 'overlapElements' bytes beyond the end of the patch row
263 if (y < tPatchSize - 1u)
// the 8 loaded bytes are located in the lower half of the register; shifting left by 'overlapElements + 8' bytes moves the valid bytes
// to the top of the upper half and discards the bytes behind the row, therefore only the upper half is processed
265 const __m128i buffer0_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch0), overlapElements + 8);
266 const __m128i buffer1_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch1), overlapElements + 8);
268 const __m128i absDifferencesHigh_128i = _mm_maddubs_epi16(_mm_unpackhi_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
// the result of the upper half is added to the 'low' accumulator, both accumulators are merged at the end anyway
270 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesHigh_128i, absDifferencesHigh_128i));
// last row (presumably the else-branch, the keyword is not visible here): the load starts 'overlapElements' before the current position,
// the byte-wise right shift removes the leading bytes and shifts in zeros
274 const __m128i buffer0_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch0 - overlapElements)), overlapElements);
275 const __m128i buffer1_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch1 - overlapElements)), overlapElements);
277 const __m128i absDifferencesLow_128i = _mm_maddubs_epi16(_mm_unpacklo_epi8(buffer0_128i, buffer1_128i), constant_signs_m128i);
279 sumLow_128i = _mm_add_epi32(sumLow_128i, _mm_madd_epi16(absDifferencesLow_128i, absDifferencesLow_128i));
// skipping the elements of the row's remainder
282 patch0 += remainingAfterBlocks16;
283 patch1 += remainingAfterBlocks16;
286 if constexpr (blocks1 != 0u)
// at most two elements are left, they are handled without SIMD instructions
288 for (
unsigned int n = 0u; n < blocks1; ++n)
290 sumIndividual +=
sqrDistance(patch0[n], patch1[n]);
// moving both pointers by the row padding; this reaches the start of the next row provided the pointers have been advanced by 'patchWidthElements' within the row
// (not all of the pointer increments are visible in this listing -- TODO confirm)
297 patch0 += patch0StrideElements - patchWidthElements;
298 patch1 += patch1StrideElements - patchWidthElements;
// merging both accumulators (the horizontal sum, the handling of 'sumIndividual' and the return statement are not visible in this listing)
301 const __m128i sum_128i = _mm_add_epi32(sumLow_128i, sumHigh_128i);
// Definition of patchBuffer8BitPerChannel<tChannels, tPatchSize>() (the signature line is not visible in this listing).
306 template <
unsigned int tChannels,
unsigned int tPatchSize>
// the buffer is handled like a second patch whose rows are tightly packed, so its stride is identical to its row width of tChannels * tPatchSize elements
309 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the result.
Definition: SSE.h:1322
This class implements functions to calculate the sum of square differences using SSE instructions.
Definition: SumSquareDifferencesSSE.h:30
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of square differences between an image patch and a buffer.
Definition: SumSquareDifferencesSSE.h:307
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of square differences between two memory buffers.
Definition: SumSquareDifferencesSSE.h:70
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of square differences between two patches within an image.
Definition: SumSquareDifferencesSSE.h:157
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15