8 #ifndef META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_SSE_H
9 #define META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_SSE_H
13 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
38 template <
unsigned int tSize>
/**
 * Declaration: returns the sum of absolute differences (SAD) between two square image patches
 * whose elements are 8-bit channel values.
 * The definition in this file walks tPatchSize rows of tChannels * tPatchSize elements each.
 * @param patch0 First element of the first row of the first patch, must be valid
 * @param patch1 First element of the first row of the second patch, must be valid
 * @param patch0StrideElements Number of elements between two consecutive rows of the first patch's image, with range [tChannels * tPatchSize, infinity)
 * @param patch1StrideElements Number of elements between two consecutive rows of the second patch's image, with range [tChannels * tPatchSize, infinity)
 * @return The resulting sum of absolute differences
 * @tparam tChannels Number of channels per pixel, with range [1, infinity)
 * @tparam tPatchSize Edge length of the patch in pixels, with range [5, infinity)
 */
51 template <
unsigned int tChannels,
unsigned int tPatchSize>
52 static inline uint32_t
patch8BitPerChannel(
const uint8_t* patch0,
const uint8_t* patch1,
const unsigned int patch0StrideElements,
const unsigned int patch1StrideElements);
/**
 * Declaration: returns the sum of absolute differences (SAD) between an image patch and a buffer,
 * both holding 8-bit channel values.
 * The definition in this file forwards to patch8BitPerChannel() and passes tChannels * tPatchSize
 * as the stride of the buffer, i.e. the buffer's rows are expected to follow each other without padding.
 * @param patch0 First element of the first row of the image patch
 * @param buffer1 Buffer holding the second patch, rows stored back to back
 * @param patch0StrideElements Number of elements between two consecutive rows of the image containing patch0
 * @return The resulting sum of absolute differences
 * @tparam tChannels Number of channels per pixel
 * @tparam tPatchSize Edge length of the patch in pixels
 */
63 template <
unsigned int tChannels,
unsigned int tPatchSize>
64 static inline uint32_t
patchBuffer8BitPerChannel(
const uint8_t* patch0,
const uint8_t* buffer1,
const unsigned int patch0StrideElements);
// Definition fragment: sum of absolute differences between two buffers of tSize 8-bit elements
// (the index at the end of this listing names the function buffer8BitPerChannel, defined at line 68).
// NOTE(review): this listing is lossy - the signature line, braces, pointer increments, the
// declaration of `result` and the return statements are not visible (the embedded line numbers skip).
// Confirm every detail against the original header before relying on the comments below.
67 template <
unsigned int tSize>
70 static_assert(tSize >= 1u,
"Invalid buffer size!");
// Running total; _mm_sad_epu8 delivers one partial sum per 64-bit half of the register.
72 __m128i sum_128i = _mm_setzero_si128();
// Number of complete 16-element blocks in the buffer.
76 constexpr
unsigned int blocks16 = tSize / 16u;
// Full blocks: unaligned 16-byte loads, SAD of both registers, accumulate.
78 for (
unsigned int n = 0u; n < blocks16; ++n)
80 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)buffer0);
81 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)buffer1);
83 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// Tail of 10..15 elements after at least one full block: load 16 bytes starting
// overlappingElements before the current position, then shift right by that many bytes so the
// re-read low bytes are dropped and zeros are shifted in (equal zeros add nothing to the SAD).
// NOTE(review): presumably buffer0/buffer1 point just behind the full blocks here - the advancing
// statements are not visible in this listing.
89 if constexpr (blocks16 >= 1u && (tSize % 16u) >= 10u)
91 constexpr
unsigned int remainingElements = tSize % 16u;
92 constexpr
unsigned int overlappingElements = 16u - remainingElements;
94 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer0 - overlappingElements)), overlappingElements);
95 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(buffer1 - overlappingElements)), overlappingElements);
97 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// NOTE(review): the following statements are likely the alternative branch of the `if constexpr`
// above (the `else` line is not visible) - verify.
// At most one 8-element block can remain after the 16-element blocks.
105 constexpr
unsigned int blocks8 = (tSize % 16u) / 8u;
106 static_assert(blocks8 <= 1u,
"Invalid number of blocks!");
// 8-element block: _mm_loadl_epi64 fills the lower 64 bits and zeroes the upper 64 bits.
108 if constexpr (blocks8 == 1u)
110 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)buffer0);
111 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)buffer1);
113 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// Whatever is left (fewer than 8 elements) is handled one element at a time.
119 constexpr
unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
120 static_assert(remainingElements < 8u,
"Invalid number of remaining elements!");
// Scalar tail: the uint8_t operands are promoted to int, so the subtraction cannot wrap.
// `result` is declared on a line that is not visible in this listing.
126 for (
unsigned int n = 0u; n < remainingElements; ++n)
128 result += uint32_t(abs(buffer0[n] - buffer1[n]));
// Definition fragment: sum of absolute differences between two image patches of
// tPatchSize rows with tChannels * tPatchSize 8-bit elements per row
// (the index at the end of this listing names the function patch8BitPerChannel, defined at line 136).
// NOTE(review): this listing is lossy - the signature line, braces, `else` lines, several pointer
// increments and the return statement are not visible (the embedded line numbers skip).
// Confirm every detail against the original header before relying on the comments below.
135 template <
unsigned int tChannels,
unsigned int tPatchSize>
138 static_assert(tChannels >= 1u,
"Invalid channel number!");
139 static_assert(tPatchSize >= 5u,
"Invalid patch size!");
141 ocean_assert(patch0 !=
nullptr && patch1 !=
nullptr);
// A row stride must cover at least one patch row.
143 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
144 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
// Number of 8-bit elements in one patch row.
146 constexpr
unsigned int patchWidthElements = tChannels * tPatchSize;
// Each row is split into complete 16-element blocks plus a remainder of 0..15 elements.
148 constexpr
unsigned int blocks16 = patchWidthElements / 16u;
149 constexpr
unsigned int remainingAfterBlocks16 = patchWidthElements % 16u;
// Exactly one strategy is selected for the remainder:
//   9..15 elements -> one 16-byte load with the excess bytes shifted out (partialBlock16)
//   8 elements     -> one 8-byte load (fullBlock8)
//   3..7 elements  -> one 8-byte load with the excess bytes shifted out (partialBlock8)
//   1..2 elements  -> handled element by element (blocks1)
151 constexpr
bool partialBlock16 = remainingAfterBlocks16 > 8u;
153 constexpr
bool fullBlock8 = !partialBlock16 && remainingAfterBlocks16 == 8u;
155 constexpr
bool partialBlock8 = !partialBlock16 && !fullBlock8 && remainingAfterBlocks16 >= 3u;
157 constexpr
unsigned int blocks1 = (!partialBlock16 && !fullBlock8 && !partialBlock8) ? remainingAfterBlocks16 : 0u;
159 static_assert(blocks1 <= 2u,
"Invalid block size!");
// SIMD accumulator for the _mm_sad_epu8 results.
161 __m128i sum_128i = _mm_setzero_si128();
// Accumulator for the elements handled without SIMD.
163 uint32_t sumIndividual = 0u;
// One iteration per patch row.
165 for (
unsigned int y = 0u; y < tPatchSize; ++y)
// Complete 16-element blocks: unaligned loads, SAD, accumulate.
// NOTE(review): the pointer advance inside this loop is not visible in this listing.
170 for (
unsigned int n = 0u; n < blocks16; ++n)
172 const __m128i buffer0_128i = _mm_lddqu_si128((
const __m128i*)patch0);
173 const __m128i buffer1_128i = _mm_lddqu_si128((
const __m128i*)patch1);
175 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// Remainder of exactly 8 elements: 8-byte load into the lower half, upper half is zero.
181 if constexpr (fullBlock8)
183 const __m128i buffer0_128i = _mm_loadl_epi64((
const __m128i*)patch0);
184 const __m128i buffer1_128i = _mm_loadl_epi64((
const __m128i*)patch1);
186 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// Remainder of 9..15 elements, covered by a single 16-byte load.
192 if constexpr (partialBlock16)
// Number of loaded bytes that do not belong to the remainder.
194 constexpr
unsigned int overlapElements = partialBlock16 ? 16u - remainingAfterBlocks16 : 0u;
196 static_assert(overlapElements < 8u,
"Invalid value!");
// Not the last row: load forward from the current position (reads overlapElements bytes beyond
// the patch row) and shift left so those trailing bytes fall off the upper end of the register.
198 if (y < tPatchSize - 1u)
200 const __m128i buffer0_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch0), overlapElements);
201 const __m128i buffer1_128i = _mm_slli_si128(_mm_lddqu_si128((
const __m128i*)patch1), overlapElements);
203 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// NOTE(review): presumably the `else` branch for the last row (the `else` line is not visible):
// the load starts overlapElements bytes earlier so it ends with the row, and the already
// processed leading bytes are shifted out to the right - likely to avoid reading behind the
// last patch row; verify against the original.
207 const __m128i buffer0_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch0 - overlapElements)), overlapElements);
208 const __m128i buffer1_128i = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(patch1 - overlapElements)), overlapElements);
210 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// Skip the remainder that was just processed.
213 patch0 += remainingAfterBlocks16;
214 patch1 += remainingAfterBlocks16;
// Remainder of 3..7 elements, covered by a single 8-byte load.
217 if constexpr (partialBlock8)
// Number of loaded bytes that do not belong to the remainder.
219 constexpr
unsigned int overlapElements = partialBlock8 ? 8u - remainingAfterBlocks16 : 0u;
221 static_assert(overlapElements < 8u,
"Invalid value!");
// Not the last row: load 8 bytes forward; the extra 8 in the shift accounts for the load
// filling only the lower half of the register, so the excess trailing bytes are pushed out.
223 if (y < tPatchSize - 1u)
225 const __m128i buffer0_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch0), overlapElements + 8);
226 const __m128i buffer1_128i = _mm_slli_si128(_mm_loadl_epi64((
const __m128i*)patch1), overlapElements + 8);
228 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// NOTE(review): presumably the `else` branch for the last row (the `else` line is not visible):
// load 8 bytes ending with the row and shift the already processed leading bytes out to the right.
232 const __m128i buffer0_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch0 - overlapElements)), overlapElements);
233 const __m128i buffer1_128i = _mm_srli_si128(_mm_loadl_epi64((
const __m128i*)(patch1 - overlapElements)), overlapElements);
235 sum_128i = _mm_add_epi32(sum_128i, _mm_sad_epu8(buffer0_128i, buffer1_128i));
// Skip the remainder that was just processed.
238 patch0 += remainingAfterBlocks16;
239 patch1 += remainingAfterBlocks16;
// Remainder of 1 or 2 elements: plain scalar absolute difference
// (uint8_t operands are promoted to int, so the subtraction cannot wrap).
242 if constexpr (blocks1 != 0u)
244 for (
unsigned int n = 0u; n < blocks1; ++n)
246 sumIndividual += uint32_t(abs(patch0[n] - patch1[n]));
// Move on by the row padding (stride minus patch row width).
// NOTE(review): this assumes the pointers have already been advanced by patchWidthElements
// within the row; not all of those increments are visible in this listing.
253 patch0 += patch0StrideElements - patchWidthElements;
254 patch1 += patch1StrideElements - patchWidthElements;
// Definition fragment of the patch-vs-buffer variant (the index at the end of this listing names
// it patchBuffer8BitPerChannel, defined at line 261); the signature line and braces are not
// visible in this listing.
260 template <
unsigned int tChannels,
unsigned int tPatchSize>
// Forward to the patch-vs-patch implementation, treating the buffer as a patch whose row stride
// equals its row width (tChannels * tPatchSize elements, i.e. no padding between rows).
263 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
Definition: SSE.h:1340
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
This class implements functions calculating the sum of absolute differences.
Definition: SumAbsoluteDifferencesSSE.h:28
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of absolute differences between two patches within an image.
Definition: SumAbsoluteDifferencesSSE.h:136
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of absolute differences between an image patch and a buffer.
Definition: SumAbsoluteDifferencesSSE.h:261
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of absolute differences between two memory buffers.
Definition: SumAbsoluteDifferencesSSE.h:68
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15