Ocean
Loading...
Searching...
No Matches
AdvancedSumSquareDifferencesSSE.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
9#define META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
10
12
13#include "ocean/math/Math.h"
14
15#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
16
17#include "ocean/cv/SSE.h"
18
19namespace Ocean
20{
21
22namespace CV
23{
24
25namespace Advanced
26{
27
28/**
29 * This class implements sum of square difference calculation functions allowing to determine the SSD with sub-pixel accuracy using SSE SIMD instructions.
30 * @ingroup cvadvanced
31 */
32class OCEAN_CV_ADVANCED_EXPORT AdvancedSumSquareDifferencesSSE
33{
34 public:
35
36 /**
37 * Returns the sum of square differences for an image patch determined between two individual images, with sub-pixel accurate patch centers in both images.
38 * @param image0 The image in which the first patch is located, must be valid
39 * @param image1 The image in which the second patch is located, must be valid
40 * @param width0 The width of the first image, in pixels, with range [tPatchSize + 1, infinity)
41 * @param width1 The width of the second image, in pixels, with range [tPatchSize + 1, infinity)
42 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width0 - tPatchSize/2 - 1)
43 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height0 - tPatchSize/2 - 1), with height0 the height of the first image
44 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width1 - tPatchSize/2 - 1)
45 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height1 - tPatchSize/2 - 1), with height1 the height of the second image
46 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
47 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
48 * @return The resulting sum of square differences, with range [0, infinity)
49 * @tparam tChannels The number of frame channels, with range [1, infinity)
50 * @tparam tPatchSize The size of the square patch (the edge length) in pixels, with range [1, infinity), must be odd
51 */
52 template <unsigned int tChannels, unsigned int tPatchSize>
53 static inline uint32_t patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const Scalar centerX0, const Scalar centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
54
55 /**
56 * Returns the sum of square differences for an image patch determined between two individual images, with a pixel-accurate patch center in the first image and a sub-pixel accurate patch center in the second image.
57 * @param image0 The image in which the first patch is located, must be valid
58 * @param image1 The image in which the second patch is located, must be valid
59 * @param width0 The width of the first image, in pixels, with range [tPatchSize + 1, infinity)
60 * @param width1 The width of the second image, in pixels, with range [tPatchSize + 1, infinity)
61 * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width0 - tPatchSize/2)
62 * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height0 - tPatchSize/2), with height0 the height of the first image
63 * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width1 - tPatchSize/2 - 1)
64 * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height1 - tPatchSize/2 - 1), with height1 the height of the second image
65 * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
66 * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
67 * @return The resulting sum of square differences, with range [0, infinity)
68 * @tparam tChannels The number of frame channels, with range [1, infinity)
69 * @tparam tPatchSize The size of the square patch (the edge length) in pixels, with range [1, infinity), must be odd
70 */
71 template <unsigned int tChannels, unsigned int tPatchSize>
72 static inline uint32_t patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const unsigned int centerX0, const unsigned int centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
73
74 private:
75
76 /**
77 * Returns the sum of square differences for an image patch determined between two individual images, with interpolation factors for both images.
78 * @param imageTopLeft0 The top left corner of the image patch in the first image, must be valid
79 * @param imageTopLeft1 The top left corner of the image patch in the second image, must be valid
80 * @param image0StrideElements The number of elements between two rows in the first image, in elements, with range [width0 * tChannels, infinity)
81 * @param image1StrideElements The number of elements between two rows in the second image, in elements, with range [width1 * tChannels, infinity)
82 * @param fx0 Horizontal interpolation factor for the first image, with range [0, 128], where 128 represents 1.0
83 * @param fy0 Vertical interpolation factor for the first image, with range [0, 128], where 128 represents 1.0
84 * @param fx1 Horizontal interpolation factor for the second image, with range [0, 128], where 128 represents 1.0
85 * @param fy1 Vertical interpolation factor for the second image, with range [0, 128], where 128 represents 1.0
86 * @return The resulting sum of square differences, with range [0, infinity)
87 * @tparam tChannels The number of frame channels, with range [1, infinity)
88 * @tparam tPatchSize The size of the square patch (the edge length) in pixels, with range [1, infinity), must be odd
89 */
90 template <unsigned int tChannels, unsigned int tPatchSize>
91 static inline uint32_t patch8BitPerChannel(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1);
92
93 /**
94 * Returns the sum of square differences for an image patch determined between two individual images, with interpolation factors for the second image only.
95 * @param imageTopLeft0 The top left corner of the image patch in the first image, must be valid
96 * @param imageTopLeft1 The top left corner of the image patch in the second image, must be valid
97 * @param image0StrideElements The number of elements between two rows in the first image, in elements, with range [width0 * tChannels, infinity)
98 * @param image1StrideElements The number of elements between two rows in the second image, in elements, with range [width1 * tChannels, infinity)
99 * @param fx1 Horizontal interpolation factor for the second image, with range [0, 128], where 128 represents 1.0
100 * @param fy1 Vertical interpolation factor for the second image, with range [0, 128], where 128 represents 1.0
101 * @return The resulting sum of square differences, with range [0, infinity)
102 * @tparam tChannels The number of frame channels, with range [1, infinity)
103 * @tparam tPatchSize The size of the square patch (the edge length) in pixels, with range [1, infinity), must be odd
104 */
105 template <unsigned int tChannels, unsigned int tPatchSize>
106 static inline uint32_t patch8BitPerChannel(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1);
107};
108
109template <unsigned int tChannels, unsigned int tPatchSize>
110inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const Scalar centerX0, const Scalar centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
111{
112 ocean_assert(image0 != nullptr && image1 != nullptr);
113
114 ocean_assert(width0 > tPatchSize);
115 ocean_assert(width1 > tPatchSize);
116
117 const unsigned int tPatchSize_2 = tPatchSize / 2u;
118
119 ocean_assert(centerX0 >= Scalar(tPatchSize_2) && centerX0 < Scalar(width0 - tPatchSize_2 - 1u));
120 ocean_assert(centerY0 >= Scalar(tPatchSize_2));
121
122 ocean_assert(centerX1 >= Scalar(tPatchSize_2) && centerX1 < Scalar(width1 - tPatchSize_2 - 1u));
123 ocean_assert(centerY1 >= Scalar(tPatchSize_2));
124
125 const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
126 const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
127
128 const unsigned int left0 = (unsigned int)(centerX0);
129 const unsigned int top0 = (unsigned int)(centerY0);
130
131 const unsigned int left1 = (unsigned int)(centerX1);
132 const unsigned int top1 = (unsigned int)(centerY1);
133
134 const Scalar scalarFx0 = centerX0 - Scalar(left0);
135 const Scalar scalarFy0 = centerY0 - Scalar(top0);
136
137 ocean_assert(scalarFx0 >= 0 && scalarFx0 <= 1u);
138 ocean_assert(scalarFy0 >= 0 && scalarFy0 <= 1u);
139
140 const unsigned int fx0 = (unsigned int)(Scalar(128) * scalarFx0 + Scalar(0.5));
141 const unsigned int fy0 = (unsigned int)(Scalar(128) * scalarFy0 + Scalar(0.5));
142
143 const Scalar scalarFx1 = centerX1 - Scalar(left1);
144 const Scalar scalarFy1 = centerY1 - Scalar(top1);
145
146 ocean_assert(scalarFx1 >= 0 && scalarFx1 <= 1);
147 ocean_assert(scalarFy1 >= 0 && scalarFy1 <= 1);
148
149 const unsigned int fx1 = (unsigned int)(Scalar(128) * scalarFx1 + Scalar(0.5));
150 const unsigned int fy1 = (unsigned int)(Scalar(128) * scalarFy1 + Scalar(0.5));
151
152 const uint8_t* imageTopLeft0 = image0 + (top0 - tPatchSize_2) * image0StrideElements + (left0 - tPatchSize_2) * tChannels;
153 const uint8_t* imageTopLeft1 = image1 + (top1 - tPatchSize_2) * image1StrideElements + (left1 - tPatchSize_2) * tChannels;
154
155 return patch8BitPerChannel<tChannels, tPatchSize>(imageTopLeft0, imageTopLeft1, image0StrideElements, image1StrideElements, fx0, fy0, fx1, fy1);
156}
157
158template <unsigned int tChannels, unsigned int tPatchSize>
159inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const unsigned int centerX0, const unsigned int centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
160{
161 ocean_assert(image0 != nullptr && image1 != nullptr);
162
163 ocean_assert(width0 > tPatchSize);
164 ocean_assert(width1 > tPatchSize);
165
166 const unsigned int tPatchSize_2 = tPatchSize / 2u;
167
168 ocean_assert(centerX0 >= tPatchSize_2 && centerX0 < width0 - tPatchSize_2);
169 ocean_assert(centerY0 >= Scalar(tPatchSize_2));
170
171 ocean_assert(centerX1 >= tPatchSize_2 && centerX1 < width1 - tPatchSize_2 - 1u);
172 ocean_assert(centerY1 >= Scalar(tPatchSize_2));
173
174 const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
175 const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
176
177 const unsigned int left1 = (unsigned int)(centerX1);
178 const unsigned int top1 = (unsigned int)(centerY1);
179
180 const Scalar scalarFx1 = centerX1 - Scalar(left1);
181 const Scalar scalarFy1 = centerY1 - Scalar(top1);
182
183 ocean_assert(scalarFx1 >= 0 && scalarFx1 <= 1);
184 ocean_assert(scalarFy1 >= 0 && scalarFy1 <= 1);
185
186 const unsigned int fx1 = (unsigned int)(Scalar(128) * scalarFx1 + Scalar(0.5));
187 const unsigned int fy1 = (unsigned int)(Scalar(128) * scalarFy1 + Scalar(0.5));
188
189 const uint8_t* imageTopLeft0 = image0 + (centerY0 - tPatchSize_2) * image0StrideElements + (centerX0 - tPatchSize_2) * tChannels;
190 const uint8_t* imageTopLeft1 = image1 + (top1 - tPatchSize_2) * image1StrideElements + (left1 - tPatchSize_2) * tChannels;
191
192 return patch8BitPerChannel<tChannels, tPatchSize>(imageTopLeft0, imageTopLeft1, image0StrideElements, image1StrideElements, fx1, fy1);
193}
194
// Specialization for frames with 1 channel and a 5x5 patch, with interpolation factors for both images.
// Six consecutive rows (8 bytes each) are loaded from each image; every pair of neighboring rows is passed to
// SSE::interpolation1Channel8Bit8Elements() to create one interpolated patch row.
// The five interpolated values of the patch rows 0, 1 and 2 are packed into one 128 bit register, the values of
// the patch rows 3 and 4 into a second one, so that SSE::sumSquareDifference8Bit16Elements() is called twice only.
// In the register diagrams below, the left-most entry is byte 0 and '-' marks an unused byte.
195template <>
196inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<1u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
197{
198 ocean_assert(fx0 <= 128u && fy0 <= 128u);
199 ocean_assert(fx1 <= 128u && fy1 <= 128u);
200
201 SSE::prefetchT0(imageTopLeft0);
202 SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
203
204 SSE::prefetchT0(imageTopLeft1);
205 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
206
 // complementary interpolation factors, fx + fx_ == 128 and fy + fy_ == 128
207 const unsigned int fx0_ = 128u - fx0;
208 const unsigned int fy0_ = 128u - fy0;
209
210 const unsigned int fx1_ = 128u - fx1;
211 const unsigned int fy1_ = 128u - fy1;
212
 // the four products of the factors of the first image, each product is at most 128 * 128 = 16384 and thus fits into a signed 16 bit value
 // presumably the weights of the top-left, top-right, bottom-left and bottom-right pixel - confirm against SSE::interpolation1Channel8Bit8Elements()
213 const unsigned int f0x_y_ = fx0_ * fy0_;
214 const unsigned int f0xy_ = fx0 * fy0_;
215 const unsigned int f0x_y = fx0_ * fy0;
216 const unsigned int f0xy = fx0 * fy0;
217
 // the corresponding four products for the second image
218 const unsigned int f1x_y_ = fx1_ * fy1_;
219 const unsigned int f1xy_ = fx1 * fy1_;
220 const unsigned int f1x_y = fx1_ * fy1;
221 const unsigned int f1xy = fx1 * fy1;
222
 // each product is broadcast to all eight 16 bit lanes of a register
223 const __m128i __f0x_y_ = _mm_set1_epi16(short(f0x_y_));
224 const __m128i __f0xy_ = _mm_set1_epi16(short(f0xy_));
225 const __m128i __f0x_y = _mm_set1_epi16(short(f0x_y));
226 const __m128i __f0xy = _mm_set1_epi16(short(f0xy));
227
228 const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
229 const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
230 const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
231 const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
232
233 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
234 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
235
236 // row0 -> [-----------00000], the interpolated values are shifted by 11 bytes so that the first five values end up in the bytes [11, 15]
237 const __m128i image0_row0 = _mm_loadu_si64(imageTopLeft0);
238 const __m128i image0_row1 = _mm_loadu_si64(imageTopLeft0 + image0StrideElements);
239 __m128i interpolation0 = _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row0, image0_row1, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 11);
240
241 const __m128i image1_row0 = _mm_loadu_si64(imageTopLeft1);
242 const __m128i image1_row1 = _mm_loadu_si64(imageTopLeft1 + image1StrideElements);
243 __m128i interpolation1 = _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 11);
244
245 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
246 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
247
248
249 // row1 -> [------1111100000], the values are shifted by 6 bytes and blended into the bytes [0, 10]
 // NOTE(review): the byte ranges given for the masks assume that SSE::set128i() expects (high 64 bits, low 64 bits) - confirm
250 const __m128i image0_row2 = _mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements);
251 __m128i mask = SSE::set128i(0x0000000000FFFFFFull, 0xFFFFFFFFFFFFFFFFull);
252 interpolation0 = _mm_blendv_epi8(interpolation0, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row1, image0_row2, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 6), mask);
253
254 const __m128i image1_row2 = _mm_loadu_si64(imageTopLeft1 + 2u * image1StrideElements);
255 interpolation1 = _mm_blendv_epi8(interpolation1, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 6), mask);
256
257 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
258 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
259
260#ifdef OCEAN_COMPILER_CLANG
261
262 // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
263
264 // row2 -> [-222221111100000], the values are shifted by 1 byte (which zeros byte 0) and blended into the bytes [0, 5]
265 const __m128i image0_row3 = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
266 mask = SSE::set128i(0x0000000000000000ull, 0x0000FFFFFFFFFFFFull);
267 interpolation0 = _mm_blendv_epi8(interpolation0, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row2, image0_row3, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 1), mask);
268
269 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
270 interpolation1 = _mm_blendv_epi8(interpolation1, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 1), mask);
271
272#else
273
274 // row2 -> [22222-1111100000], the unshifted values are blended into the bytes [0, 4]
275 const __m128i image0_row3 = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
276 mask = SSE::set128i(0x0000000000000000ull, 0x000000FFFFFFFFFFull);
277 interpolation0 = _mm_blendv_epi8(interpolation0, SSE::interpolation1Channel8Bit8Elements(image0_row2, image0_row3, __f0x_y_, __f0xy_, __f0x_y, __f0xy), mask);
278
279 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
280 interpolation1 = _mm_blendv_epi8(interpolation1, SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), mask);
281
282#endif
283
284 // intermediate ssd of the patch rows 0, 1 and 2
285 __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
286
287 SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
288 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
289
290
291 // row3 -> [33333-----------], the registers are re-used for the remaining two patch rows
292 __m128i image0_row4 = _mm_loadu_si64(imageTopLeft0 + 4u * image0StrideElements);
293 interpolation0 = SSE::interpolation1Channel8Bit8Elements(image0_row3, image0_row4, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
294
295 __m128i image1_row4 = _mm_loadu_si64(imageTopLeft1 + 4u * image1StrideElements);
296 interpolation1 = SSE::interpolation1Channel8Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
297
298 // row4 -> [3333344444------], the values are shifted by 5 bytes and blended into the bytes [5, 9]
299 __m128i image0_row5 = _mm_loadu_si64(imageTopLeft0 + 5u * image0StrideElements);
300 mask = SSE::set128i(0x000000000000FFFFull, 0xFFFFFF0000000000ull);
301 interpolation0 = _mm_blendv_epi8(interpolation0, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row4, image0_row5, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 5), mask);
302
303 __m128i image1_row5 = _mm_loadu_si64(imageTopLeft1 + 5u * image1StrideElements);
304 interpolation1 = _mm_blendv_epi8(interpolation1, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row4, image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 5), mask);
305
306#ifdef OCEAN_COMPILER_CLANG
307
308 // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
 // shifting by 6 bytes moves the ten used bytes to [6, 15], drops the former bytes [10, 15], and zeros the bytes [0, 5]
309
310 interpolation0 = _mm_slli_si128(interpolation0, 6);
311 interpolation1 = _mm_slli_si128(interpolation1, 6);
312
313#endif // OCEAN_COMPILER_CLANG
314
315 // ssd of the patch rows 3 and 4, added to the intermediate result
316 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
317
 // the four 32 bit partial sums are combined to the final result
318 return SSE::sum_u32_4(result);
319}
320
// Specialization for frames with 2 channels and a 5x5 patch, with interpolation factors for both images.
// One patch row holds 5 * 2 = 10 elements: the elements [0, 7] of each row are handled with
// SSE::interpolation2Channel16Bit8Elements(), while the elements 8 and 9 (the two channels of the fifth pixel)
// are handled individually with SSE::ssd2Channel16Bit1x1() and accumulated in 'localResult'.
// The SIMD results of two patch rows are combined into one register (lower and upper 8 bytes) before
// SSE::sumSquareDifference8Bit16Elements() is called.
321template <>
322inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<2u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
323{
324 ocean_assert(fx0 <= 128u && fy0 <= 128u);
325 ocean_assert(fx1 <= 128u && fy1 <= 128u);
326
327 SSE::prefetchT0(imageTopLeft0);
328 SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
329
330 SSE::prefetchT0(imageTopLeft1);
331 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
332
 // complementary interpolation factors, fx + fx_ == 128 and fy + fy_ == 128
333 const unsigned int fx0_ = 128u - fx0;
334 const unsigned int fy0_ = 128u - fy0;
335
336 const unsigned int fx1_ = 128u - fx1;
337 const unsigned int fy1_ = 128u - fy1;
338
 // the four products of the factors of the first image, each product is at most 128 * 128 = 16384 and thus fits into a signed 16 bit value
339 const unsigned int f0x_y_ = fx0_ * fy0_;
340 const unsigned int f0xy_ = fx0 * fy0_;
341 const unsigned int f0x_y = fx0_ * fy0;
342 const unsigned int f0xy = fx0 * fy0;
343
 // the corresponding four products for the second image
344 const unsigned int f1x_y_ = fx1_ * fy1_;
345 const unsigned int f1xy_ = fx1 * fy1_;
346 const unsigned int f1x_y = fx1_ * fy1;
347 const unsigned int f1xy = fx1 * fy1;
348
 // each product is broadcast to all eight 16 bit lanes of a register, the scalar products are still used for the 1x1 calculations below
349 const __m128i __f0x_y_ = _mm_set1_epi16(short(f0x_y_));
350 const __m128i __f0xy_ = _mm_set1_epi16(short(f0xy_));
351 const __m128i __f0x_y = _mm_set1_epi16(short(f0x_y));
352 const __m128i __f0xy = _mm_set1_epi16(short(f0xy));
353
354 const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
355 const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
356 const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
357 const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
358
359 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
360 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
361
362 // row 0
363 // image0 row0[0:7]
364 __m128i image0_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft0);
365 __m128i image0_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
366 __m128i interpolation0 = SSE::interpolation2Channel16Bit8Elements(image0_row0, image0_row1, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
367
368 // image1 row0[0:7]
369 __m128i image1_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft1);
370 __m128i image1_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
371 __m128i interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
372
 // ssd row0[8:9], the two channels of the fifth pixel are handled individually
373 unsigned int localResult = SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 8, imageTopLeft1 + 8, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
374 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 9, imageTopLeft1 + 9, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
375
376 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
377 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
378
379
380 // row 1
381 // image0 row1[0:7], shifted by 8 bytes and combined with row 0 so that the register holds both rows
382 __m128i image0_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
383 interpolation0 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image0_row1, image0_row2, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 8), interpolation0);
384
385 // image1 row1[0:7], shifted by 8 bytes and combined with row 0 so that the register holds both rows
386 __m128i image1_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
387 interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
388
389 // ssd row01[0:7]
390 __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
391
 // ssd row1[8:9]
392 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 8u, imageTopLeft1 + 1u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
393 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 9u, imageTopLeft1 + 1u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
394
395 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
396 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
397
398
399 // row 2
400 // image0 row2[0:7]
401 __m128i image0_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
402 interpolation0 = SSE::interpolation2Channel16Bit8Elements(image0_row2, image0_row3, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
403
404 // image1 row2[0:7]
405 __m128i image1_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
406 interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
407
 // ssd row2[8:9]
408 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 8u, imageTopLeft1 + 2u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
409 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 9u, imageTopLeft1 + 2u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
410
411 SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
412 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
413
414
415 // row 3
416 // image0 row3[0:7], shifted by 8 bytes and combined with row 2 so that the register holds both rows
417 __m128i image0_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
418 interpolation0 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image0_row3, image0_row4, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 8), interpolation0);
419
420 // image1 row3[0:7], shifted by 8 bytes and combined with row 2 so that the register holds both rows
421 __m128i image1_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
422 interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
423
424 // ssd row23[0:7]
425 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
426
 // ssd row3[8:9]
427 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 8u, imageTopLeft1 + 3u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
428 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 9u, imageTopLeft1 + 3u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
429
430
431 // row 4
432 // image0 row4[0:7]
433 __m128i image0_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements - 2u)), 2); // here we start 2 bytes earlier (and shift the bytes later) to avoid a segmentation fault
434 interpolation0 = SSE::interpolation2Channel16Bit8Elements(image0_row4, image0_row5, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
435
436 // image1 row4[0:7]
437 __m128i image1_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements - 2u)), 2); // here we start 2 bytes earlier (and shift the bytes later) to avoid a segmentation fault
438 interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row4, image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
439
440 // ssd row4[0:7]
441 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
442
 // ssd row4[8:9]
443 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 8u, imageTopLeft1 + 4u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
444 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 9u, imageTopLeft1 + 4u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
445
 // the four 32 bit partial sums of the SIMD part are combined and added to the individually handled elements
446 return SSE::sum_u32_4(result) + localResult;
447}
448
449template <>
450inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<3u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
451{
452 ocean_assert(fx0 <= 128u && fy0 <= 128u);
453 ocean_assert(fx1 <= 128u && fy1 <= 128u);
454
455 SSE::prefetchT0(imageTopLeft0);
456 SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
457
458 SSE::prefetchT0(imageTopLeft1);
459 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
460
461 const unsigned int fx0_ = 128u - fx0;
462 const unsigned int fy0_ = 128u - fy0;
463
464 const unsigned int fx1_ = 128u - fx1;
465 const unsigned int fy1_ = 128u - fy1;
466
467 const __m128i f0x_y_ = _mm_set1_epi16(short(fx0_ * fy0_));
468 const __m128i f0xy_ = _mm_set1_epi16(short(fx0 * fy0_));
469 const __m128i f0x_y = _mm_set1_epi16(short(fx0_ * fy0));
470 const __m128i f0xy = _mm_set1_epi16(short(fx0 * fy0));
471
472 const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
473 const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
474 const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
475 const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
476
477 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
478 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
479
480 // row 0
481 __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
482 __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
483 __m128i interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row0Front, image0_row1Front, f0x_y_, f0xy_, f0x_y, f0xy);
484
485 __m128i image0_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 8u));
486 __m128i image0_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements + 8u));
487 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row0Back, image0_row1Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
488
489 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
490 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
491 __m128i interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
492
493 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
494 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
495 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
496
497 __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
498
499
500 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
501 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
502
503
504 // row 1
505 __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
506 interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row1Front, image0_row2Front, f0x_y_, f0xy_, f0x_y, f0xy);
507
508 __m128i image0_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements + 8u));
509 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row1Back, image0_row2Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
510
511 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
512 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
513
514 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
515 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
516
517 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
518
519
520 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
521 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
522
523
524 // row 2
525 __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
526 interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row2Front, image0_row3Front, f0x_y_, f0xy_, f0x_y, f0xy);
527
528 __m128i image0_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements + 8u));
529 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row2Back, image0_row3Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
530
531 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
532 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
533
534 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
535 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
536
537 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
538
539
540 SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
541 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
542
543 // row 3
544 __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
545 interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row3Front, image0_row4Front, f0x_y_, f0xy_, f0x_y, f0xy);
546
547 __m128i image0_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements + 8u));
548 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row3Back, image0_row4Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
549
550 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
551 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
552
553 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
554 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
555
556 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
557
558
559
560 // row 4
561 __m128i image0_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements));
562 interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row4Front, image0_row5Front, f0x_y_, f0xy_, f0x_y, f0xy);
563
564 __m128i image0_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements + 8u - 6u)), 6); // here we start 6 bytes earlyer (and shift the bytes later) to avoid a segmentation fault
565 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row4Back, image0_row5Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
566
567 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
568 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
569
570 __m128i image1_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u - 6u)), 6); // here we start 6 bytes earlyer (and shift the bytes later) to avoid a segmentation fault
571 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
572
573 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
574
575 return SSE::sum_u32_4(result);
576}
577
/**
 * Specialization of patch8BitPerChannel() for 4 channels and a patch size of 5 which receives interpolation factors for both images.
 * For each image four weights are derived from (fx, fy) and their complements (128 - fx, 128 - fy); the weights are handed to the SSE interpolation helpers together with two consecutive image rows.
 * The function loads 16 bytes at the byte offsets 0 and 8 of the rows 0..5, so that the bytes [0, 24) of six consecutive rows are read in each image.
 * According to the inline range annotations, the bytes [0:15] of each patch row are accumulated row by row, while the bytes [16:19] are accumulated for two rows at once (rows 0+1, rows 2+3) and separately for row 4.
 * @param imageTopLeft0 Pointer to the first byte that is read in the first image, must be valid
 * @param imageTopLeft1 Pointer to the first byte that is read in the second image, must be valid
 * @param image0StrideElements Offset between the starts of two consecutive rows in the first image, in (uint8_t) elements
 * @param image1StrideElements Offset between the starts of two consecutive rows in the second image, in (uint8_t) elements
 * @param fx0 Horizontal interpolation factor for the first image; presumably with range [0, 128] (not asserted here - TODO confirm)
 * @param fy0 Vertical interpolation factor for the first image; presumably with range [0, 128] (not asserted here - TODO confirm)
 * @param fx1 Horizontal interpolation factor for the second image; presumably with range [0, 128] (not asserted here - TODO confirm)
 * @param fy1 Vertical interpolation factor for the second image; presumably with range [0, 128] (not asserted here - TODO confirm)
 * @return The accumulated result register reduced with SSE::sum_u32_4()
 */
578template <>
579inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<4u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
580{
581 SSE::prefetchT0(imageTopLeft0);
582 SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
583
584 SSE::prefetchT0(imageTopLeft1);
585 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
586
    // complementary factors, the full weight is 128
587 const unsigned int fx0_ = 128u - fx0;
588 const unsigned int fy0_ = 128u - fy0;
589
590 const unsigned int fx1_ = 128u - fx1;
591 const unsigned int fy1_ = 128u - fy1;
592
    // the four weight products for image 0, broadcast to all eight 16 bit lanes
    // if the factors are within [0, 128], each product is at most 128 * 128 = 16384 and fits into a signed 16 bit value
593 const __m128i f0x_y_ = _mm_set1_epi16(short(fx0_ * fy0_));
594 const __m128i f0xy_ = _mm_set1_epi16(short(fx0 * fy0_));
595 const __m128i f0x_y = _mm_set1_epi16(short(fx0_ * fy0));
596 const __m128i f0xy = _mm_set1_epi16(short(fx0 * fy0));
597
    // the four weight products for image 1
598 const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
599 const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
600 const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
601 const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
602
603 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
604 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
605
606 // row0
607 // image0 row0 [0:7]
608 __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
609 __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
610 __m128i interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row0Front, image0_row1Front, f0x_y_, f0xy_, f0x_y, f0xy);
611
612 // image0 row0 [8:15]
    // the second interpolation result is shifted by 8 bytes and merged into the upper half of the register
613 __m128i image0_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 8u));
614 __m128i image0_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements + 8u));
615 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row0Back, image0_row1Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
616
617 // image1 row0 [0:7]
618 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
619 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
620 __m128i interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
621
622 // image1 row0 [8:15]
623 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
624 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
625 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
626
627 // ssd row0 [0:15]
628 __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
629
630 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
631 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
632
633
634
635 // row1
636 // image0 row1 [0:7]
637 __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
638 interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row1Front, image0_row2Front, f0x_y_, f0xy_, f0x_y, f0xy);
639
640 // image0 row1 [8:15]
641 __m128i image0_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements + 8u));
642 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row1Back, image0_row2Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
643
644 // image1 row1 [0:7]
645 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
646 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
647
648 // image1 row1 [8:15]
649 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
650 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
651
652 // ssd row01 [0:15]
653 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
654
655
656 // image0 row0 [16:19], row1 [16:19]
    // _mm_srli_si128(x, 8) moves the image bytes [16, 24) of a row into the lower half, the blend (0xF0) takes the upper half from the '...Back' register of the next row
    // the '...Back' registers are overwritten in place: row0Back must be rebuilt before row1Back because it still needs the unmodified row1Back
657 image0_row0Back = _mm_blend_epi16(_mm_srli_si128(image0_row0Back, 8), image0_row1Back, 0xF0); // 0xF0 = 1111 0000
658 image0_row1Back = _mm_blend_epi16(_mm_srli_si128(image0_row1Back, 8), image0_row2Back, 0xF0); // 0xF0 = 1111 0000
659 interpolation0 = SSE::interpolation4Channel32Bit2x4Elements(image0_row0Back, image0_row1Back, f0x_y_, f0xy_, f0x_y, f0xy);
660
661 // image1 row0 [16:19], row1 [16:19]
662 image1_row0Back = _mm_blend_epi16(_mm_srli_si128(image1_row0Back, 8), image1_row1Back, 0xF0); // 0xF0 = 1111 0000
663 image1_row1Back = _mm_blend_epi16(_mm_srli_si128(image1_row1Back, 8), image1_row2Back, 0xF0); // 0xF0 = 1111 0000
664 interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy);
665
666 // ssd row01 [0:19]
667 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
668
669
670 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
671 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
672
673
674
675 // row2
676 // image0 row2 [0:7]
677 __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
678 interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row2Front, image0_row3Front, f0x_y_, f0xy_, f0x_y, f0xy);
679
680 // image0 row2 [8:15]
681 __m128i image0_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements + 8u));
682 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row2Back, image0_row3Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
683
684 // image1 row2 [0:7]
685 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
686 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
687
688 // image1 row2 [8:15]
689 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
690 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
691
692 // ssd row01 [0:19], row2 [0:15]
693 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
694
695 SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
696 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
697
698
699
700 // row3
701 // image0 row3 [0:7]
702 __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
703 interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row3Front, image0_row4Front, f0x_y_, f0xy_, f0x_y, f0xy);
704
705 // image0 row3 [8:15]
706 __m128i image0_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements + 8u));
707 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row3Back, image0_row4Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
708
709 // image1 row3 [0:7]
710 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
711 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
712
713 // image1 row3 [8:15]
714 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
715 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
716
717 // ssd row01 [0:19], row23 [0:15]
718 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
719
720 // image0 row2 [16:19], row3 [16:19]
    // same in-place register reuse as for rows 0 and 1, the statement order matters
721 image0_row2Back = _mm_blend_epi16(_mm_srli_si128(image0_row2Back, 8), image0_row3Back, 0xF0); // 0xF0 = 1111 0000
722 image0_row3Back = _mm_blend_epi16(_mm_srli_si128(image0_row3Back, 8), image0_row4Back, 0xF0); // 0xF0 = 1111 0000
723 interpolation0 = SSE::interpolation4Channel32Bit2x4Elements(image0_row2Back, image0_row3Back, f0x_y_, f0xy_, f0x_y, f0xy);
724
725 // image1 row2 [16:19], row3 [16:19]
726 image1_row2Back = _mm_blend_epi16(_mm_srli_si128(image1_row2Back, 8), image1_row3Back, 0xF0); // 0xF0 = 1111 0000
727 image1_row3Back = _mm_blend_epi16(_mm_srli_si128(image1_row3Back, 8), image1_row4Back, 0xF0); // 0xF0 = 1111 0000
728 interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy);
729
730 // ssd row03 [0:19]
731 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
732
733
734
735 // row4
736 // image0 row4 [0:7]
737 __m128i image0_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements));
738 interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row4Front, image0_row5Front, f0x_y_, f0xy_, f0x_y, f0xy);
739
740 // image0 row4 [8:15]
    // this load covers the bytes [8, 24) of row 5, the last bytes read from image 0
741 __m128i image0_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements + 8u));
742 interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row4Back, image0_row5Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
743
744 // image1 row4 [0:7]
745 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
746 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
747
748 // image1 row4 [8:15]
749 __m128i image1_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u));
750 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
751
752 // ssd row03 [0:19] row4[0:15]
753 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
754
755 // image0 row4 [16:19]
    // there is no further row to pair with: the mask keeps the upper 8 bytes (image bytes [16, 24)) and zeroes the lower 8 bytes
    // (assuming SSE::set128i() takes the high 64 bits first - TODO confirm); the zeroed half is identical for both images
756 image0_row4Back = _mm_and_si128(image0_row4Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
757 image0_row5Back = _mm_and_si128(image0_row5Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
758 interpolation0 = SSE::interpolation4Channel32Bit2x4Elements(image0_row4Back, image0_row5Back, f0x_y_, f0xy_, f0x_y, f0xy);
759
760 // image1 row4 [16:19]
761 image1_row4Back = _mm_and_si128(image1_row4Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
762 image1_row5Back = _mm_and_si128(image1_row5Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
763 interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy);
764
765 // ssd row04 [0:19]
766 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
767
    // reduce the accumulated register to one scalar value
768 return SSE::sum_u32_4(result);
769}
770
/**
 * Specialization of patch8BitPerChannel() for 1 channel and a patch size of 5 which receives interpolation factors for the second image only.
 * The first image is read directly (five bytes per row are used), the second image is passed row-pair-wise through SSE::interpolation1Channel8Bit8Elements().
 * The rows 0, 1 and 2 are packed into one 16 byte register, the rows 3 and 4 into a second register; each register is compared with one 16-lane SSD.
 * The function reads 8 bytes per row: image 0 rows 0..3 and image 1 rows 0..4 at offset 0, image 0 row 4 at offset -3, and image 1 row 5 at offset -2.
 * @param imageTopLeft0 Pointer to the top left patch byte in the first image, must be valid
 * @param imageTopLeft1 Pointer to the top left byte that is read in the second image, must be valid
 * @param image0StrideElements Offset between the starts of two consecutive rows in the first image, in (uint8_t) elements
 * @param image1StrideElements Offset between the starts of two consecutive rows in the second image, in (uint8_t) elements
 * @param fx1 Horizontal interpolation factor for the second image, with range [0, 128]
 * @param fy1 Vertical interpolation factor for the second image, with range [0, 128]
 * @return The accumulated result register reduced with SSE::sum_u32_4()
 */
771template <>
772inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<1u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
773{
774 ocean_assert(fx1 <= 128u && fy1 <= 128u);
775
776 SSE::prefetchT0(imageTopLeft0);
777
778 SSE::prefetchT0(imageTopLeft1);
779 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
780
    // complementary factors, the full weight is 128
781 const unsigned int fx1_ = 128u - fx1;
782 const unsigned int fy1_ = 128u - fy1;
783
    // the four weight products, each at most 128 * 128 = 16384 due to the assert above
784 const unsigned int f1x_y_ = fx1_ * fy1_;
785 const unsigned int f1xy_ = fx1 * fy1_;
786 const unsigned int f1x_y = fx1_ * fy1;
787 const unsigned int f1xy = fx1 * fy1;
788
    // the weights broadcast to all eight 16 bit lanes
789 const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
790 const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
791 const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
792 const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
793
794 SSE::prefetchT0(imageTopLeft0 + 1u * image0StrideElements);
795 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
796
    // in the layout sketches below the leftmost position is byte 0 of the register, '-' marks a byte which is not (yet) used
797 // row0 -> [-----------00000]
    // the shift by 11 bytes places the first five loaded bytes in the register bytes 11..15, the three surplus bytes are shifted out
798 __m128i image0_row = _mm_slli_si128(_mm_loadu_si64(imageTopLeft0), 11);
799
800 const __m128i image1_row0 = _mm_loadu_si64(imageTopLeft1);
801 const __m128i image1_row1 = _mm_loadu_si64(imageTopLeft1 + image1StrideElements);
802 __m128i image1_row = _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 11);
803
804 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
805 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
806
807
808 // row1 -> [------1111100000]
    // the mask selects the register bytes 0..10 from the second operand (the set128i arguments are high 64 bits, low 64 bits - as implied by the layout sketches)
    // the second operand is shifted by 6 bytes, so that the bytes 0..5 become zero and the bytes 6..10 hold the row 1 values
809 __m128i mask = SSE::set128i(0x0000000000FFFFFFull, 0xFFFFFFFFFFFFFFFFull);
810 image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 1u * image0StrideElements), 6), mask);
811
812 const __m128i image1_row2 = _mm_loadu_si64(imageTopLeft1 + 2u * image1StrideElements);
813 image1_row = _mm_blendv_epi8(image1_row, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 6), mask);
814
815 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
816 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
817
818#ifdef OCEAN_COMPILER_CLANG
819
820 // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
821
822 // row2 -> [-222221111100000]
    // the mask selects the register bytes 0..5; due to the shift by one byte, byte 0 is zero in both images and the row 2 values end up in the bytes 1..5
823 mask = SSE::set128i(0x0000000000000000ull, 0x0000FFFFFFFFFFFFull);
824 image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements), 1u), mask);
825
826 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
827 image1_row = _mm_blendv_epi8(image1_row, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 1), mask);
828
829#else // OCEAN_COMPILER_CLANG
830
831 // row2 -> [22222-1111100000]
    // the mask selects the register bytes 0..4 only, byte 5 keeps the zero which resulted from the row 1 step
832 mask = SSE::set128i(0x0000000000000000ull, 0x000000FFFFFFFFFFull);
833 image0_row = _mm_blendv_epi8(image0_row, _mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements), mask);
834
835 const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
836 image1_row = _mm_blendv_epi8(image1_row, SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), mask);
837
838#endif // OCEAN_COMPILER_CLANG
839
840 // intermediate ssd
    // covers the rows 0, 1 and 2
841 __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row, image1_row);
842
843 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
844 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
845
846
847 // row3 -> [33333-----------]
    // both registers are restarted here; the surplus values in the bytes 5..7 are overwritten by the row 4 blend below
848 image0_row = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
849
850 const __m128i image1_row4 = _mm_loadu_si64(imageTopLeft1 + 4u * image1StrideElements);
851 image1_row = SSE::interpolation1Channel8Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
852
853 // row4 -> [3333344444------]
    // the mask selects the register bytes 5..9
    // image 0, row 4: the load starts 3 bytes before the patch (bytes -3..4) so that it ends with the last patch byte; after the shift by 2 bytes the patch bytes 0..4 are located in the register bytes 5..9
    // presumably the earlier start avoids reading memory behind the patch in the last row - TODO confirm
854 mask = SSE::set128i(0x000000000000FFFFull, 0xFFFFFF0000000000ull);
855 image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 4u * image0StrideElements - 3), 2), mask);
856
    // image 1, row 5: the load starts 2 bytes earlier (bytes -2..5); row 4 is shifted by 2 bytes to obtain the same alignment before both rows are interpolated
    // the interpolation result is then shifted by 3 further bytes to match the register bytes 5..9 selected by the mask
857 const __m128i image1_row5 = _mm_loadu_si64(imageTopLeft1 + 5u * image1StrideElements - 2);
858 image1_row = _mm_blendv_epi8(image1_row, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(_mm_slli_si128(image1_row4, 2), image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 3), mask);
859
860#ifdef OCEAN_COMPILER_CLANG
861
862 // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
863
    // the shift by 6 bytes moves the ten used bytes to the register bytes 6..15 and fills the bytes 0..5 with zeros in both registers
864 image0_row = _mm_slli_si128(image0_row, 6);
865 image1_row = _mm_slli_si128(image1_row, 6);
866
867#endif // OCEAN_COMPILER_CLANG
868
869 // remaining ssd
    // covers the rows 3 and 4
870 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row, image1_row));
871
    // reduce the accumulated register to one scalar value
872 return SSE::sum_u32_4(result);
873}
874
/**
 * Specialization of patch8BitPerChannel() for 2 channels and a patch size of 5 which receives interpolation factors for the second image only.
 * The first image is read directly, the second image is passed row-pair-wise through SSE::interpolation2Channel16Bit8Elements().
 * The bytes [0:7] of each patch row are handled with SIMD instructions; two rows (0+1, 2+3) share one 16 byte register, row 4 is handled alone.
 * The bytes 8 and 9 of each patch row are handled by two calls of the helper SSE::ssd2Channel16Bit1x1() per row, the results are accumulated in a scalar value.
 * In the first image 8 bytes are loaded per row; in the second image 16 bytes are loaded from the rows 0..4 at offset 0 and from row 5 at offset -2.
 * @param imageTopLeft0 Pointer to the top left patch byte in the first image, must be valid
 * @param imageTopLeft1 Pointer to the top left byte that is read in the second image, must be valid
 * @param image0StrideElements Offset between the starts of two consecutive rows in the first image, in (uint8_t) elements
 * @param image1StrideElements Offset between the starts of two consecutive rows in the second image, in (uint8_t) elements
 * @param fx1 Horizontal interpolation factor for the second image, with range [0, 128]
 * @param fy1 Vertical interpolation factor for the second image, with range [0, 128]
 * @return The sum of the reduced SIMD result (SSE::sum_u32_4()) and the scalar result
 */
875template <>
876inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<2u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
877{
878 ocean_assert(fx1 <= 128u && fy1 <= 128u);
879
880 SSE::prefetchT0(imageTopLeft0);
881 SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
882
883 SSE::prefetchT0(imageTopLeft1);
884 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
885
    // complementary factors, the full weight is 128
886 const unsigned int fx1_ = 128u - fx1;
887 const unsigned int fy1_ = 128u - fy1;
888
    // the four weight products, each at most 128 * 128 = 16384 due to the assert above
    // the scalar values are passed to ssd2Channel16Bit1x1(), the broadcast registers below are passed to the SIMD interpolation
889 const unsigned int f1x_y_ = fx1_ * fy1_;
890 const unsigned int f1xy_ = fx1 * fy1_;
891 const unsigned int f1x_y = fx1_ * fy1;
892 const unsigned int f1xy = fx1 * fy1;
893
894 const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
895 const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
896 const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
897 const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
898
899 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
900 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
901
902 // row0
903 // image0 row0[0:7]
    // _mm_loadl_epi64 loads 8 bytes into the lower half and zeroes the upper half
904 __m128i image0_row0 = _mm_loadl_epi64((__m128i*)imageTopLeft0);
905
906 // image1 row0[0:7]
907 __m128i image1_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft1);
908 __m128i image1_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
909 __m128i interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
910
    // scalar handling of the bytes 8 and 9 of row 0
911 unsigned int localResult = SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 8, imageTopLeft1 + 8, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
912 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 9, imageTopLeft1 + 9, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
913
914 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
915 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
916
917
918 // row1
919 // image0 row1[0:7]
    // row 1 is merged into the upper half of the register which already holds row 0 in the lower half
920 __m128i image0_row1 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 1u * image0StrideElements));
921 image0_row0 = ::_mm_or_si128(image0_row0, _mm_slli_si128(image0_row1, 8));
922
923 // image1 row1[0:7]
    // the interpolation result for row 1 is shifted by 8 bytes and merged with the result for row 0
924 __m128i image1_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
925 interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
926
927 // ssd row01[0:7]
928 __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row0, interpolation1);
929
    // scalar handling of the bytes 8 and 9 of row 1
930 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 8u, imageTopLeft1 + 1u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
931 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 9u, imageTopLeft1 + 1u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
932
933 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
934 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
935
936
937 // row 2
938 // image0 row2[0:7]
939 __m128i image0_row2 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
940
941 // image1 row2[0:7]
942 __m128i image1_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
943 interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
944
    // scalar handling of the bytes 8 and 9 of row 2
945 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 8u, imageTopLeft1 + 2u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
946 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 9u, imageTopLeft1 + 2u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
947
948 SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
949 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
950
951
952 // row 3
953 // image0 row3[0:7]
    // row 3 is merged into the upper half of the register which already holds row 2 in the lower half
954 __m128i image0_row3 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
955 image0_row2 = _mm_or_si128(image0_row2, _mm_slli_si128(image0_row3, 8));
956
957 // image1 row3[0:7]
958 __m128i image1_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
959 interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
960
961 // ssd row03[0:7]
962 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row2, interpolation1));
963
    // scalar handling of the bytes 8 and 9 of row 3
964 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 8u, imageTopLeft1 + 3u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
965 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 9u, imageTopLeft1 + 3u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
966
967 // row 4
968 // image0 row4[0:7]
    // row 4 has no partner row, the upper half of the register stays zero
969 __m128i image0_row4 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
970
971 // image1 row4[0:7]
972 __m128i image1_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements - 2u)), 2); // here we start 2 bytes earlier (and shift the bytes later) to avoid a segmentation fault
973 interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row4, image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
974
975 // ssd row04[0:7]
976 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row4, interpolation1));
977
    // scalar handling of the bytes 8 and 9 of row 4
978 localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 8u, imageTopLeft1 + 4u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
979 + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 9u, imageTopLeft1 + 4u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
980
    // combine the reduced SIMD accumulator with the scalar accumulator
981 return SSE::sum_u32_4(result) + localResult;
982}
983
/**
 * Specialization of patch8BitPerChannel() for 3 channels and a patch size of 5 which receives interpolation factors for the second image only.
 * The first image is read directly (15 bytes per row are used), the second image is passed row-pair-wise through SSE::interpolation3Channel24Bit8Elements().
 * For each of the five patch rows one 16-lane SSD is accumulated.
 * In the first image 16 bytes are loaded from the rows 0..4 at offset 0.
 * In the second image 16 bytes are loaded at the offsets 0 and 8 of the rows 0..4, and at the offsets 0 and 2 of row 5.
 * NOTE(review): unlike the specializations for 1 and 2 channels, the range of the factors is not asserted here.
 * NOTE(review): the 16 byte load of row 4 in the first image covers one byte more than the 15 patch bytes - confirm that callers guarantee this byte is readable.
 * @param imageTopLeft0 Pointer to the top left patch byte in the first image, must be valid
 * @param imageTopLeft1 Pointer to the top left byte that is read in the second image, must be valid
 * @param image0StrideElements Offset between the starts of two consecutive rows in the first image, in (uint8_t) elements
 * @param image1StrideElements Offset between the starts of two consecutive rows in the second image, in (uint8_t) elements
 * @param fx1 Horizontal interpolation factor for the second image; presumably with range [0, 128] (not asserted here - TODO confirm)
 * @param fy1 Vertical interpolation factor for the second image; presumably with range [0, 128] (not asserted here - TODO confirm)
 * @return The accumulated result register reduced with SSE::sum_u32_4()
 */
984template <>
985inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<3u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
986{
987 SSE::prefetchT0(imageTopLeft0);
988 SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
989
990 SSE::prefetchT0(imageTopLeft1);
991 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
992
    // complementary factors, the full weight is 128
993 const unsigned int fx1_ = 128u - fx1;
994 const unsigned int fy1_ = 128u - fy1;
995
    // the four weight products broadcast to all eight 16 bit lanes
    // if the factors are within [0, 128], each product is at most 128 * 128 = 16384 and fits into a signed 16 bit value
996 const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
997 const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
998 const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
999 const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
1000
1001 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
1002 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
1003
1004 // row 0
    // the shuffle keeps the source bytes 0..7, writes zero to byte 8 (control byte 0xA0 has the high bit set) and moves the source bytes 8..14 to the bytes 9..15
    // (assuming SSE::set128i() takes the high 64 bits first - TODO confirm); this mirrors the layout of 'interpolation1' in which the second interpolation result is shifted by 9 bytes
1005 __m128i image0_row0 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)imageTopLeft0), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1006
1007 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
1008 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
1009 __m128i interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
1010
    // second part of the row, loaded with an offset of 8 bytes; the result is merged at byte 9
1011 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
1012 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
1013 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1014
1015 __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row0, interpolation1);
1016
1017
1018 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
1019 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
1020
1021
1022 // row 1
    // the already loaded row 1 registers of image 1 are reused as upper row, only row 2 is loaded
1023 __m128i image0_row1 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 1u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1024
1025 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
1026 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
1027
1028 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
1029 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1030
1031 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row1, interpolation1));
1032
1033
1034 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
1035 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
1036
1037
1038 // row 2
1039 __m128i image0_row2 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1040
1041 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
1042 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
1043
1044 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
1045 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1046
1047 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row2, interpolation1));
1048
1049
1050 SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
1051 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
1052
1053 // row 3
1054 __m128i image0_row3 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1055
1056 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
1057 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
1058
1059 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
1060 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1061
1062 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row3, interpolation1));
1063
1064
1065
1066 // row 4
1067 __m128i image0_row4 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1068
1069 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
1070 interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
1071
    // the load covers the bytes [2, 18) of row 5; after the shift by 6 bytes the register starts with byte 8 of the row and the upper 6 bytes are zero
1072 __m128i image1_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u - 6u)), 6); // here we start 6 bytes earlier (and shift the bytes later) to avoid a segmentation fault
1073 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1074
1075 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row4, interpolation1));
1076
    // reduce the accumulated register to one scalar value
1077 return SSE::sum_u32_4(result);
1078}
1079
1080template <>
1081inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<4u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
1082{
1083 SSE::prefetchT0(imageTopLeft0);
1084 SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
1085
1086 SSE::prefetchT0(imageTopLeft1);
1087 SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
1088
1089 const unsigned int fx1_ = 128u - fx1;
1090 const unsigned int fy1_ = 128u - fy1;
1091
1092 const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
1093 const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
1094 const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
1095 const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
1096
1097 SSE::prefetchT0(imageTopLeft0 + 1u * image0StrideElements);
1098 SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
1099
1100 // row0
1101 // image0 row0 [0:15]
1102 __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
1103
1104 // image1 row0 [0:7]
1105 __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
1106 __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
1107 __m128i interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
1108
1109 // image1 row0 [8:15]
1110 __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
1111 __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
1112 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1113
1114 // ssd row0 [0:15]
1115 __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row0Front, interpolation1);
1116
1117 SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
1118 SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
1119
1120
1121
1122 // row1
1123 // image0 row1 [0:15]
1124 __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 1u * image0StrideElements));
1125
1126 // image1 row1 [0:7]
1127 __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
1128 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
1129
1130 // image1 row1 [8:15]
1131 __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
1132 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1133
1134 // ssd row01 [0:15]
1135 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row1Front, interpolation1));
1136
1137 // image0 row0 [16:19], row1 [16:19]
1138 __m128i image0row01 = _mm_set_epi32(0, 0, *((unsigned int*)(imageTopLeft0 + image0StrideElements + 16u)), *((unsigned int*)(imageTopLeft0 + 16)));
1139
1140 // image1 row1 [16:19], row1 [16:19]
1141 image1_row0Back = _mm_blend_epi16(_mm_srli_si128(image1_row0Back, 8), image1_row1Back, 0xF0); // 0xF0 = 1111 0000
1142 image1_row1Back = _mm_blend_epi16(_mm_srli_si128(image1_row1Back, 8), image1_row2Back, 0xF0); // 0xF0 = 1111 0000
1143 interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy);
1144
1145 // ssd row01 [0:19]
1146 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0row01, interpolation1));
1147
1148
1149 SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
1150 SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
1151
1152
1153
1154 // row2
1155 // image0 row2 [0:7]
1156 __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
1157
1158 // image1 row2 [0:7]
1159 __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
1160 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
1161
1162 // image1 row2 [8:15]
1163 __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
1164 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1165
1166 // ssd row01 [0:19], row2 [0:15]
1167 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row2Front, interpolation1));
1168
1169 SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
1170 SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
1171
1172
1173
1174 // row3
1175 // image0 row3 [0:7]
1176 __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
1177
1178 // image1 row3 [0:7]
1179 __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
1180 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
1181
1182 // image row3 [8:15]
1183 __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
1184 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1185
1186 // ssd row01 [0:19], row23 [0:15]
1187 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row3Front, interpolation1));
1188
1189 // image0 row2 [16:19], row3 [16:19]
1190 __m128i image0row23 = _mm_set_epi32(0, 0, *((unsigned int*)(imageTopLeft0 + 3u * image0StrideElements + 16u)), *((unsigned int*)(imageTopLeft0 + 2u * image0StrideElements + 16)));
1191
1192 // image1 row2 [16:19], row3 [16:19]
1193 image1_row2Back = _mm_blend_epi16(_mm_srli_si128(image1_row2Back, 8), image1_row3Back, 0xF0); // 0xF0 = 1111 0000
1194 image1_row3Back = _mm_blend_epi16(_mm_srli_si128(image1_row3Back, 8), image1_row4Back, 0xF0); // 0xF0 = 1111 0000
1195 interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy);
1196
1197 // ssd row03 [0:19]
1198 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0row23, interpolation1));
1199
1200
1201
1202 // row4
1203 // image0 row4 [0:7]
1204 __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
1205
1206 // image1 row4 [0:7]
1207 __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
1208 interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
1209
1210 // image1 row4 [8:15]
1211 __m128i image1_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u));
1212 interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1213
1214 // ssd row03 [0:19] row4[0:15]
1215 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row4Front, interpolation1));
1216
1217 // image0 row4 [16:19]
1218 __m128i image0row4 = _mm_set_epi32(0, 0, *((unsigned int*)(imageTopLeft0 + 4u * image0StrideElements + 16u)), 0);
1219
1220 // image1 row4 [16:19]
1221 image1_row4Back = _mm_and_si128(image1_row4Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
1222 image1_row5Back = _mm_and_si128(image1_row5Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
1223 interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy);
1224
1225 // ssd row04 [0:19]
1226 result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0row4, interpolation1));
1227
1228 return SSE::sum_u32_4(result);
1229}
1230
1231}
1232
1233}
1234
1235}
1236
1237#endif // OCEAN_HARDWARE_SSE_VERSION >= 41
1238
1239#endif // META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
This class implements sum of square difference calculation functions allowing to determine the SSE wi...
Definition AdvancedSumSquareDifferencesSSE.h:33
static uint32_t patch8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int width1, const Scalar centerX0, const Scalar centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences for an image patch determined between two individual images.
Definition AdvancedSumSquareDifferencesSSE.h:110
static uint32_t patch8BitPerChannel(const uint8_t *const imageTopLeft0, const uint8_t *const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
Returns the sum of square differences for an image patch determined between two individual images.
static uint32_t patch8BitPerChannel(const uint8_t *const imageTopLeft0, const uint8_t *const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
Returns the sum of square differences for an image patch determined between two individual images.
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:1583
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition SSE.h:2264
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2117
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition SSE.h:1255
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:1733
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1533
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition SSE.h:1322
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition SSE.h:3927
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3770
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:1879
float Scalar
Definition of a scalar type.
Definition Math.h:129
The namespace covering the entire Ocean framework.
Definition Accessor.h:15