Ocean
AdvancedSumSquareDifferencesSSE.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
9 #define META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
10 
12 
13 #include "ocean/math/Math.h"
14 
15 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
16 
17 #include "ocean/cv/SSE.h"
18 
19 namespace Ocean
20 {
21 
22 namespace CV
23 {
24 
25 namespace Advanced
26 {
27 
28 /**
29  * This class implements sum of square difference calculation functions allowing to determine the SSD with sub-pixel accuracy using SSE SIMD instructions.
30  * @ingroup cvadvanced
31  */
32 class OCEAN_CV_ADVANCED_EXPORT AdvancedSumSquareDifferencesSSE
33 {
34  public:
35 
36  /**
37  * Returns the sum of square differences for an image patch determined between two individual images, both patch centers are defined with sub-pixel accuracy.
38  * @param image0 The image in which the first patch is located, must be valid
39  * @param image1 The image in which the second patch is located, must be valid
40  * @param width0 The width of the first image, in pixels, with range [tPatchSize + 1, infinity)
41  * @param width1 The width of the second image, in pixels, with range [tPatchSize + 1, infinity)
42  * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width0 - tPatchSize/2 - 1)
43  * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2 - 1)
44  * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width1 - tPatchSize/2 - 1)
45  * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1)
46  * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
47  * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
48  * @return The resulting sum of square differences, with range [0, infinity)
49  * @tparam tChannels The number of frame channels, with range [1, infinity)
50  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
51  */
52  template <unsigned int tChannels, unsigned int tPatchSize>
53  static inline uint32_t patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const Scalar centerX0, const Scalar centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
54 
55  /**
56  * Returns the sum of square differences for an image patch determined between two individual images, the first patch center is defined with pixel accuracy while the second patch center is defined with sub-pixel accuracy.
57  * @param image0 The image in which the first patch is located, must be valid
58  * @param image1 The image in which the second patch is located, must be valid
59  * @param width0 The width of the first image, in pixels, with range [tPatchSize + 1, infinity)
60  * @param width1 The width of the second image, in pixels, with range [tPatchSize + 1, infinity)
61  * @param centerX0 Horizontal center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, width0 - tPatchSize/2)
62  * @param centerY0 Vertical center position of the (tPatchSize x tPatchSize) block in the first frame, with range [tPatchSize/2, height - tPatchSize/2)
63  * @param centerX1 Horizontal center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, width1 - tPatchSize/2 - 1)
64  * @param centerY1 Vertical center position of the (tPatchSize x tPatchSize) block in the second frame, with range [tPatchSize/2, height - tPatchSize/2 - 1)
65  * @param image0PaddingElements The number of padding elements at the end of each row of the first image, in elements, with range [0, infinity)
66  * @param image1PaddingElements The number of padding elements at the end of each row of the second image, in elements, with range [0, infinity)
67  * @return The resulting sum of square differences, with range [0, infinity)
68  * @tparam tChannels The number of frame channels, with range [1, infinity)
69  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
70  */
71  template <unsigned int tChannels, unsigned int tPatchSize>
72  static inline uint32_t patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const unsigned int centerX0, const unsigned int centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements);
73 
74  private:
75 
76  /**
77  * Returns the sum of square differences for an image patch determined between two individual images, both patches are interpolated with individual fixed-point interpolation factors.
78  * @param imageTopLeft0 The top left corner of the image patch in the first image, must be valid
79  * @param imageTopLeft1 The top left corner of the image patch in the second image, must be valid
80  * @param image0StrideElements The number of elements between two rows in the first image, in elements, with range [width0 * tChannels, infinity)
81  * @param image1StrideElements The number of elements between two rows in the second image, in elements, with range [width1 * tChannels, infinity)
82  * @param fx0 Horizontal interpolation factor for the first image, with range [0, 128]
83  * @param fy0 Vertical interpolation factor for the first image, with range [0, 128]
84  * @param fx1 Horizontal interpolation factor for the second image, with range [0, 128]
85  * @param fy1 Vertical interpolation factor for the second image, with range [0, 128]
86  * @return The resulting sum of square differences, with range [0, infinity)
87  * @tparam tChannels The number of frame channels, with range [1, infinity); NOTE(review): this function seems to be provided via explicit specializations only (e.g., <1, 5>, <2, 5>, <3, 5>) - confirm the supported combinations
88  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
89  */
90  template <unsigned int tChannels, unsigned int tPatchSize>
91  static inline uint32_t patch8BitPerChannel(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1);
92 
93  /**
94  * Returns the sum of square differences for an image patch determined between two individual images, interpolation factors are provided for the second image only (presumably the first patch is used without interpolation - confirm against the implementation).
95  * @param imageTopLeft0 The top left corner of the image patch in the first image, must be valid
96  * @param imageTopLeft1 The top left corner of the image patch in the second image, must be valid
97  * @param image0StrideElements The number of elements between two rows in the first image, in elements, with range [width0 * tChannels, infinity)
98  * @param image1StrideElements The number of elements between two rows in the second image, in elements, with range [width1 * tChannels, infinity)
99  * @param fx1 Horizontal interpolation factor for the second image, with range [0, 128]
100  * @param fy1 Vertical interpolation factor for the second image, with range [0, 128]
101  * @return The resulting sum of square differences, with range [0, infinity)
102  * @tparam tChannels The number of frame channels, with range [1, infinity)
103  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
104  */
105  template <unsigned int tChannels, unsigned int tPatchSize>
106  static inline uint32_t patch8BitPerChannel(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1);
107 };
108 
109 template <unsigned int tChannels, unsigned int tPatchSize>
110 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const Scalar centerX0, const Scalar centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
111 {
112  ocean_assert(image0 != nullptr && image1 != nullptr);
113 
114  ocean_assert(width0 > tPatchSize);
115  ocean_assert(width1 > tPatchSize);
116 
117  const unsigned int tPatchSize_2 = tPatchSize / 2u;
118 
119  ocean_assert(centerX0 >= Scalar(tPatchSize_2) && centerX0 < Scalar(width0 - tPatchSize_2 - 1u));
120  ocean_assert(centerY0 >= Scalar(tPatchSize_2));
121 
122  ocean_assert(centerX1 >= Scalar(tPatchSize_2) && centerX1 < Scalar(width1 - tPatchSize_2 - 1u));
123  ocean_assert(centerY1 >= Scalar(tPatchSize_2));
124 
125  const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
126  const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
127 
128  const unsigned int left0 = (unsigned int)(centerX0);
129  const unsigned int top0 = (unsigned int)(centerY0);
130 
131  const unsigned int left1 = (unsigned int)(centerX1);
132  const unsigned int top1 = (unsigned int)(centerY1);
133 
134  const Scalar scalarFx0 = centerX0 - Scalar(left0);
135  const Scalar scalarFy0 = centerY0 - Scalar(top0);
136 
137  ocean_assert(scalarFx0 >= 0 && scalarFx0 <= 1u);
138  ocean_assert(scalarFy0 >= 0 && scalarFy0 <= 1u);
139 
140  const unsigned int fx0 = (unsigned int)(Scalar(128) * scalarFx0 + Scalar(0.5));
141  const unsigned int fy0 = (unsigned int)(Scalar(128) * scalarFy0 + Scalar(0.5));
142 
143  const Scalar scalarFx1 = centerX1 - Scalar(left1);
144  const Scalar scalarFy1 = centerY1 - Scalar(top1);
145 
146  ocean_assert(scalarFx1 >= 0 && scalarFx1 <= 1);
147  ocean_assert(scalarFy1 >= 0 && scalarFy1 <= 1);
148 
149  const unsigned int fx1 = (unsigned int)(Scalar(128) * scalarFx1 + Scalar(0.5));
150  const unsigned int fy1 = (unsigned int)(Scalar(128) * scalarFy1 + Scalar(0.5));
151 
152  const uint8_t* imageTopLeft0 = image0 + (top0 - tPatchSize_2) * image0StrideElements + (left0 - tPatchSize_2) * tChannels;
153  const uint8_t* imageTopLeft1 = image1 + (top1 - tPatchSize_2) * image1StrideElements + (left1 - tPatchSize_2) * tChannels;
154 
155  return patch8BitPerChannel<tChannels, tPatchSize>(imageTopLeft0, imageTopLeft1, image0StrideElements, image1StrideElements, fx0, fy0, fx1, fy1);
156 }
157 
158 template <unsigned int tChannels, unsigned int tPatchSize>
159 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel(const uint8_t* image0, const uint8_t* image1, const unsigned int width0, const unsigned int width1, const unsigned int centerX0, const unsigned int centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
160 {
161  ocean_assert(image0 != nullptr && image1 != nullptr);
162 
163  ocean_assert(width0 > tPatchSize);
164  ocean_assert(width1 > tPatchSize);
165 
166  const unsigned int tPatchSize_2 = tPatchSize / 2u;
167 
168  ocean_assert(centerX0 >= tPatchSize_2 && centerX0 < width0 - tPatchSize_2);
169  ocean_assert(centerY0 >= Scalar(tPatchSize_2));
170 
171  ocean_assert(centerX1 >= tPatchSize_2 && centerX1 < width1 - tPatchSize_2 - 1u);
172  ocean_assert(centerY1 >= Scalar(tPatchSize_2));
173 
174  const unsigned int image0StrideElements = width0 * tChannels + image0PaddingElements;
175  const unsigned int image1StrideElements = width1 * tChannels + image1PaddingElements;
176 
177  const unsigned int left1 = (unsigned int)(centerX1);
178  const unsigned int top1 = (unsigned int)(centerY1);
179 
180  const Scalar scalarFx1 = centerX1 - Scalar(left1);
181  const Scalar scalarFy1 = centerY1 - Scalar(top1);
182 
183  ocean_assert(scalarFx1 >= 0 && scalarFx1 <= 1);
184  ocean_assert(scalarFy1 >= 0 && scalarFy1 <= 1);
185 
186  const unsigned int fx1 = (unsigned int)(Scalar(128) * scalarFx1 + Scalar(0.5));
187  const unsigned int fy1 = (unsigned int)(Scalar(128) * scalarFy1 + Scalar(0.5));
188 
189  const uint8_t* imageTopLeft0 = image0 + (centerY0 - tPatchSize_2) * image0StrideElements + (centerX0 - tPatchSize_2) * tChannels;
190  const uint8_t* imageTopLeft1 = image1 + (top1 - tPatchSize_2) * image1StrideElements + (left1 - tPatchSize_2) * tChannels;
191 
192  return patch8BitPerChannel<tChannels, tPatchSize>(imageTopLeft0, imageTopLeft1, image0StrideElements, image1StrideElements, fx1, fy1);
193 }
194 
/**
 * Specialization for frames with 1 channel and patches with size 5x5, both patches are interpolated with individual factors.
 * The interpolation factors are fixed-point values with range [0, 128]; for each image the four weights are the products
 * of the horizontal and vertical factors and their complements (128 - f), so that each weight lies in [0, 128 * 128].
 * Eight bytes are loaded from each of six consecutive rows (rows 0 to 5) of both images, this memory must be readable.
 * The five interpolated rows are packed into two 128 bit registers (rows 0-2 and rows 3-4) before the square differences are summed.
 * The layout diagrams below list byte 0 on the left side and byte 15 on the right side.
 * NOTE(review): SSE::interpolation1Channel8Bit8Elements(), SSE::sumSquareDifference8Bit16Elements(), SSE::set128i(), and SSE::sum_u32_4() are defined outside of this file - the comments below assume the behavior suggested by their names and by the layout diagrams.
 */
195 template <>
196 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<1u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
197 {
198  ocean_assert(fx0 <= 128u && fy0 <= 128u);
199  ocean_assert(fx1 <= 128u && fy1 <= 128u);
200 
201  SSE::prefetchT0(imageTopLeft0);
202  SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
203 
204  SSE::prefetchT0(imageTopLeft1);
205  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
206 
 // complementary interpolation factors, fx + fx_ == 128 and fy + fy_ == 128
207  const unsigned int fx0_ = 128u - fx0;
208  const unsigned int fy0_ = 128u - fy0;
209 
210  const unsigned int fx1_ = 128u - fx1;
211  const unsigned int fy1_ = 128u - fy1;
212 
 // the four weights of the first image, the sum of all four weights is 128 * 128
213  const unsigned int f0x_y_ = fx0_ * fy0_;
214  const unsigned int f0xy_ = fx0 * fy0_;
215  const unsigned int f0x_y = fx0_ * fy0;
216  const unsigned int f0xy = fx0 * fy0;
217 
 // the four weights of the second image
218  const unsigned int f1x_y_ = fx1_ * fy1_;
219  const unsigned int f1xy_ = fx1 * fy1_;
220  const unsigned int f1x_y = fx1_ * fy1;
221  const unsigned int f1xy = fx1 * fy1;
222 
 // each weight is at most 128 * 128 = 16384 and thus fits into a signed 16 bit integer
223  const __m128i __f0x_y_ = _mm_set1_epi16(short(f0x_y_));
224  const __m128i __f0xy_ = _mm_set1_epi16(short(f0xy_));
225  const __m128i __f0x_y = _mm_set1_epi16(short(f0x_y));
226  const __m128i __f0xy = _mm_set1_epi16(short(f0xy));
227 
228  const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
229  const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
230  const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
231  const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
232 
233  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
234  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
235 
236  // row0 -> [-----------00000], the interpolation result is shifted by 11 bytes so that the first five results end up in bytes 11 to 15
237  const __m128i image0_row0 = _mm_loadu_si64(imageTopLeft0);
238  const __m128i image0_row1 = _mm_loadu_si64(imageTopLeft0 + image0StrideElements);
239  __m128i interpolation0 = _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row0, image0_row1, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 11);
240 
241  const __m128i image1_row0 = _mm_loadu_si64(imageTopLeft1);
242  const __m128i image1_row1 = _mm_loadu_si64(imageTopLeft1 + image1StrideElements);
243  __m128i interpolation1 = _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 11);
244 
245  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
246  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
247 
248 
249  // row1 -> [------1111100000], the result is shifted by 6 bytes and blended into bytes 0 to 10 (bytes 0 to 5 are zero after the shift)
250  const __m128i image0_row2 = _mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements);
 // presumably set128i(high 64 bits, low 64 bits): the mask selects the lowest 11 bytes - confirm against SSE::set128i()
251  __m128i mask = SSE::set128i(0x0000000000FFFFFFull, 0xFFFFFFFFFFFFFFFFull);
252  interpolation0 = _mm_blendv_epi8(interpolation0, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row1, image0_row2, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 6), mask);
253 
254  const __m128i image1_row2 = _mm_loadu_si64(imageTopLeft1 + 2u * image1StrideElements);
255  interpolation1 = _mm_blendv_epi8(interpolation1, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 6), mask);
256 
257  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
258  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
259 
260 #ifdef OCEAN_COMPILER_CLANG
261 
262  // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
263 
264  // row2 -> [-222221111100000], the result is shifted by 1 byte so that byte 0 is explicitly zero, the mask selects the lowest 6 bytes
265  const __m128i image0_row3 = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
266  mask = SSE::set128i(0x0000000000000000ull, 0x0000FFFFFFFFFFFFull);
267  interpolation0 = _mm_blendv_epi8(interpolation0, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row2, image0_row3, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 1), mask);
268 
269  const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
270  interpolation1 = _mm_blendv_epi8(interpolation1, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 1), mask);
271 
272 #else
273 
274  // row2 -> [22222-1111100000], the result is used without shift, the mask selects the lowest 5 bytes (byte 5 stays zero from the previous step)
275  const __m128i image0_row3 = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
276  mask = SSE::set128i(0x0000000000000000ull, 0x000000FFFFFFFFFFull);
277  interpolation0 = _mm_blendv_epi8(interpolation0, SSE::interpolation1Channel8Bit8Elements(image0_row2, image0_row3, __f0x_y_, __f0xy_, __f0x_y, __f0xy), mask);
278 
279  const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
280  interpolation1 = _mm_blendv_epi8(interpolation1, SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), mask);
281 
282 #endif
283 
284  // intermediate ssd of the patch rows 0, 1, and 2
285  __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
286 
287  SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
288  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
289 
290 
291  // row3 -> [33333-----------], the registers are re-used for the remaining two patch rows
292  __m128i image0_row4 = _mm_loadu_si64(imageTopLeft0 + 4u * image0StrideElements);
293  interpolation0 = SSE::interpolation1Channel8Bit8Elements(image0_row3, image0_row4, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
294 
295  __m128i image1_row4 = _mm_loadu_si64(imageTopLeft1 + 4u * image1StrideElements);
296  interpolation1 = SSE::interpolation1Channel8Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
297 
298  // row4 -> [3333344444------], the result is shifted by 5 bytes and blended into bytes 5 to 9
299  __m128i image0_row5 = _mm_loadu_si64(imageTopLeft0 + 5u * image0StrideElements);
300  mask = SSE::set128i(0x000000000000FFFFull, 0xFFFFFF0000000000ull);
301  interpolation0 = _mm_blendv_epi8(interpolation0, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image0_row4, image0_row5, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 5), mask);
302 
303  __m128i image1_row5 = _mm_loadu_si64(imageTopLeft1 + 5u * image1StrideElements);
304  interpolation1 = _mm_blendv_epi8(interpolation1, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row4, image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 5), mask);
305 
306 #ifdef OCEAN_COMPILER_CLANG
307 
308  // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
309 
 // shifting by 6 bytes moves the ten valid bytes to bytes 6 to 15 and discards the previous content of bytes 10 to 15, bytes 0 to 5 become zero
310  interpolation0 = _mm_slli_si128(interpolation0, 6);
311  interpolation1 = _mm_slli_si128(interpolation1, 6);
312 
313 #endif // OCEAN_COMPILER_CLANG
314 
315  // ssd of the patch rows 3 and 4, added to the intermediate result
316  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
317 
318  return SSE::sum_u32_4(result);
319 }
320 
/**
 * Specialization for frames with 2 channels and patches with size 5x5, both patches are interpolated with individual factors.
 * The interpolation factors are fixed-point values with range [0, 128]; for each image the four weights are the products
 * of the horizontal and vertical factors and their complements (128 - f), so that each weight lies in [0, 128 * 128].
 * Each patch row holds 10 elements (5 pixels with 2 channels each): according to the existing comments the elements [0, 7] are handled
 * with SIMD instructions, while the elements 8 and 9 are handled individually via SSE::ssd2Channel16Bit1x1().
 * The SIMD results of the rows 0 + 1 and of the rows 2 + 3 are combined pairwise in one register (the second row is shifted by 8 bytes), row 4 is handled on its own.
 * Sixteen bytes are loaded from each of the rows 0 to 4 of both images; the load of row 5 starts two bytes earlier (see below).
 * NOTE(review): SSE::interpolation2Channel16Bit8Elements(), SSE::ssd2Channel16Bit1x1(), SSE::sumSquareDifference8Bit16Elements(), and SSE::sum_u32_4() are defined outside of this file - the comments below assume the behavior suggested by their names.
 */
321 template <>
322 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<2u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
323 {
324  ocean_assert(fx0 <= 128u && fy0 <= 128u);
325  ocean_assert(fx1 <= 128u && fy1 <= 128u);
326 
327  SSE::prefetchT0(imageTopLeft0);
328  SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
329 
330  SSE::prefetchT0(imageTopLeft1);
331  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
332 
 // complementary interpolation factors, fx + fx_ == 128 and fy + fy_ == 128
333  const unsigned int fx0_ = 128u - fx0;
334  const unsigned int fy0_ = 128u - fy0;
335 
336  const unsigned int fx1_ = 128u - fx1;
337  const unsigned int fy1_ = 128u - fy1;
338 
 // the four weights of the first image, the sum of all four weights is 128 * 128
339  const unsigned int f0x_y_ = fx0_ * fy0_;
340  const unsigned int f0xy_ = fx0 * fy0_;
341  const unsigned int f0x_y = fx0_ * fy0;
342  const unsigned int f0xy = fx0 * fy0;
343 
 // the four weights of the second image
344  const unsigned int f1x_y_ = fx1_ * fy1_;
345  const unsigned int f1xy_ = fx1 * fy1_;
346  const unsigned int f1x_y = fx1_ * fy1;
347  const unsigned int f1xy = fx1 * fy1;
348 
 // each weight is at most 128 * 128 = 16384 and thus fits into a signed 16 bit integer
349  const __m128i __f0x_y_ = _mm_set1_epi16(short(f0x_y_));
350  const __m128i __f0xy_ = _mm_set1_epi16(short(f0xy_));
351  const __m128i __f0x_y = _mm_set1_epi16(short(f0x_y));
352  const __m128i __f0xy = _mm_set1_epi16(short(f0xy));
353 
354  const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
355  const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
356  const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
357  const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
358 
359  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
360  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
361 
362  // row0
363  // image0 row0[0:7]
364  __m128i image0_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft0);
365  __m128i image0_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
366  __m128i interpolation0 = SSE::interpolation2Channel16Bit8Elements(image0_row0, image0_row1, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
367 
368  // image1 row0[0:7]
369  __m128i image1_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft1);
370  __m128i image1_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
371  __m128i interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
372 
 // row0[8:9], the two remaining elements of the row are handled individually
373  unsigned int localResult = SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 8, imageTopLeft1 + 8, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
374  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 9, imageTopLeft1 + 9, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
375 
376  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
377  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
378 
379 
380  // row1
381  // image0 row1[0:7], shifted by 8 bytes and combined with row0[0:7]
382  __m128i image0_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
383  interpolation0 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image0_row1, image0_row2, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 8), interpolation0);
384 
385  // image1 row1[0:7], shifted by 8 bytes and combined with row0[0:7]
386  __m128i image1_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
387  interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
388 
389  // ssd row01[0:7]
390  __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
391 
 // row1[8:9]
392  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 8u, imageTopLeft1 + 1u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
393  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 9u, imageTopLeft1 + 1u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
394 
395  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
396  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
397 
398 
399  // row 2
400  // image0 row2[0:7]
401  __m128i image0_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
402  interpolation0 = SSE::interpolation2Channel16Bit8Elements(image0_row2, image0_row3, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
403 
404  // image1 row2[0:7]
405  __m128i image1_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
406  interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
407 
 // row2[8:9]
408  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 8u, imageTopLeft1 + 2u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
409  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 9u, imageTopLeft1 + 2u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
410 
411  SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
412  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
413 
414 
415  // row 3
416  // image0 row3[0:7], shifted by 8 bytes and combined with row2[0:7]
417  __m128i image0_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
418  interpolation0 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image0_row3, image0_row4, __f0x_y_, __f0xy_, __f0x_y, __f0xy), 8), interpolation0);
419 
420  // image1 row3[0:7], shifted by 8 bytes and combined with row2[0:7]
421  __m128i image1_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
422  interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
423 
424  // ssd row23[0:7]
425  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
426 
 // row3[8:9]
427  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 8u, imageTopLeft1 + 3u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
428  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 9u, imageTopLeft1 + 3u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
429 
430 
431  // row 4
432  // image0 row4[0:7]
433  __m128i image0_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements - 2u)), 2); // here we start 2 bytes earlier (and shift the bytes later) to avoid a segmentation fault, the two highest bytes are zero after the shift
434  interpolation0 = SSE::interpolation2Channel16Bit8Elements(image0_row4, image0_row5, __f0x_y_, __f0xy_, __f0x_y, __f0xy);
435 
436  // image1 row4[0:7]
437  __m128i image1_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements - 2u)), 2); // here we start 2 bytes earlier (and shift the bytes later) to avoid a segmentation fault, the two highest bytes are zero after the shift
438  interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row4, image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
439 
440  // ssd row4[0:7], this row is not combined with a second row
441  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
442 
 // row4[8:9]
443  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 8u, imageTopLeft1 + 4u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy)
444  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 9u, imageTopLeft1 + 4u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f0x_y_, f0xy_, f0x_y, f0xy, f1x_y_, f1xy_, f1x_y, f1xy);
445 
 // the final result is the sum of the four 32 bit SIMD lanes plus the individually handled elements
446  return SSE::sum_u32_4(result) + localResult;
447 }
448 
449 template <>
450 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<3u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
451 {
452  ocean_assert(fx0 <= 128u && fy0 <= 128u);
453  ocean_assert(fx1 <= 128u && fy1 <= 128u);
454 
455  SSE::prefetchT0(imageTopLeft0);
456  SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
457 
458  SSE::prefetchT0(imageTopLeft1);
459  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
460 
461  const unsigned int fx0_ = 128u - fx0;
462  const unsigned int fy0_ = 128u - fy0;
463 
464  const unsigned int fx1_ = 128u - fx1;
465  const unsigned int fy1_ = 128u - fy1;
466 
467  const __m128i f0x_y_ = _mm_set1_epi16(short(fx0_ * fy0_));
468  const __m128i f0xy_ = _mm_set1_epi16(short(fx0 * fy0_));
469  const __m128i f0x_y = _mm_set1_epi16(short(fx0_ * fy0));
470  const __m128i f0xy = _mm_set1_epi16(short(fx0 * fy0));
471 
472  const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
473  const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
474  const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
475  const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
476 
477  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
478  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
479 
480  // row 0
481  __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
482  __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
483  __m128i interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row0Front, image0_row1Front, f0x_y_, f0xy_, f0x_y, f0xy);
484 
485  __m128i image0_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 8u));
486  __m128i image0_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements + 8u));
487  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row0Back, image0_row1Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
488 
489  __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
490  __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
491  __m128i interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
492 
493  __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
494  __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
495  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
496 
497  __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
498 
499 
500  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
501  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
502 
503 
504  // row 1
505  __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
506  interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row1Front, image0_row2Front, f0x_y_, f0xy_, f0x_y, f0xy);
507 
508  __m128i image0_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements + 8u));
509  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row1Back, image0_row2Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
510 
511  __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
512  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
513 
514  __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
515  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
516 
517  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
518 
519 
520  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
521  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
522 
523 
524  // row 2
525  __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
526  interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row2Front, image0_row3Front, f0x_y_, f0xy_, f0x_y, f0xy);
527 
528  __m128i image0_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements + 8u));
529  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row2Back, image0_row3Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
530 
531  __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
532  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
533 
534  __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
535  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
536 
537  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
538 
539 
540  SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
541  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
542 
543  // row 3
544  __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
545  interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row3Front, image0_row4Front, f0x_y_, f0xy_, f0x_y, f0xy);
546 
547  __m128i image0_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements + 8u));
548  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row3Back, image0_row4Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
549 
550  __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
551  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
552 
553  __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
554  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
555 
556  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
557 
558 
559 
560  // row 4
561  __m128i image0_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements));
562  interpolation0 = SSE::interpolation3Channel24Bit8Elements(image0_row4Front, image0_row5Front, f0x_y_, f0xy_, f0x_y, f0xy);
563 
564  __m128i image0_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements + 8u - 6u)), 6); // here we start 6 bytes earlyer (and shift the bytes later) to avoid a segmentation fault
565  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image0_row4Back, image0_row5Back, f0x_y_, f0xy_, f0x_y, f0xy), 9));
566 
567  __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
568  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
569 
570  __m128i image1_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u - 6u)), 6); // here we start 6 bytes earlyer (and shift the bytes later) to avoid a segmentation fault
571  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
572 
573  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
574 
575  return SSE::sum_u32_4(result);
576 }
577 
578 template <>
579 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<4u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
580 {
581  SSE::prefetchT0(imageTopLeft0);
582  SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
583 
584  SSE::prefetchT0(imageTopLeft1);
585  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
586 
587  const unsigned int fx0_ = 128u - fx0;
588  const unsigned int fy0_ = 128u - fy0;
589 
590  const unsigned int fx1_ = 128u - fx1;
591  const unsigned int fy1_ = 128u - fy1;
592 
593  const __m128i f0x_y_ = _mm_set1_epi16(short(fx0_ * fy0_));
594  const __m128i f0xy_ = _mm_set1_epi16(short(fx0 * fy0_));
595  const __m128i f0x_y = _mm_set1_epi16(short(fx0_ * fy0));
596  const __m128i f0xy = _mm_set1_epi16(short(fx0 * fy0));
597 
598  const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
599  const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
600  const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
601  const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
602 
603  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
604  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
605 
606  // row0
607  // image0 row0 [0:7]
608  __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
609  __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements));
610  __m128i interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row0Front, image0_row1Front, f0x_y_, f0xy_, f0x_y, f0xy);
611 
612  // image0 row0 [8:15]
613  __m128i image0_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 8u));
614  __m128i image0_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + image0StrideElements + 8u));
615  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row0Back, image0_row1Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
616 
617  // image1 row0 [0:7]
618  __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
619  __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
620  __m128i interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
621 
622  // image1 row0 [8:15]
623  __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
624  __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
625  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
626 
627  // ssd row0 [0:15]
628  __m128i result = SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1);
629 
630  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
631  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
632 
633 
634 
635  // row1
636  // image0 row1 [0:7]
637  __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
638  interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row1Front, image0_row2Front, f0x_y_, f0xy_, f0x_y, f0xy);
639 
640  // image0 row1 [8:15]
641  __m128i image0_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements + 8u));
642  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row1Back, image0_row2Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
643 
644  // image1 row1 [0:7]
645  __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
646  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
647 
648  // image1 row1 [8:15]
649  __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
650  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
651 
652  // ssd row01 [0:15]
653  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
654 
655 
656  // image0 row0 [16:19], row1 [16:19]
657  image0_row0Back = _mm_blend_epi16(_mm_srli_si128(image0_row0Back, 8), image0_row1Back, 0xF0); // 0xF0 = 1111 0000
658  image0_row1Back = _mm_blend_epi16(_mm_srli_si128(image0_row1Back, 8), image0_row2Back, 0xF0); // 0xF0 = 1111 0000
659  interpolation0 = SSE::interpolation4Channel32Bit2x4Elements(image0_row0Back, image0_row1Back, f0x_y_, f0xy_, f0x_y, f0xy);
660 
661  // image1 row1 [16:19], row1 [16:19]
662  image1_row0Back = _mm_blend_epi16(_mm_srli_si128(image1_row0Back, 8), image1_row1Back, 0xF0); // 0xF0 = 1111 0000
663  image1_row1Back = _mm_blend_epi16(_mm_srli_si128(image1_row1Back, 8), image1_row2Back, 0xF0); // 0xF0 = 1111 0000
664  interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy);
665 
666  // ssd row01 [0:19]
667  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
668 
669 
670  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
671  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
672 
673 
674 
675  // row2
676  // image0 row2 [0:7]
677  __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
678  interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row2Front, image0_row3Front, f0x_y_, f0xy_, f0x_y, f0xy);
679 
680  // image0 row2 [8:15]
681  __m128i image0_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements + 8u));
682  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row2Back, image0_row3Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
683 
684  // image1 row2 [0:7]
685  __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
686  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
687 
688  // image1 row2 [8:15]
689  __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
690  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
691 
692  // ssd row01 [0:19], row2 [0:15]
693  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
694 
695  SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
696  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
697 
698 
699 
700  // row3
701  // image0 row3 [0:7]
702  __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
703  interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row3Front, image0_row4Front, f0x_y_, f0xy_, f0x_y, f0xy);
704 
705  // image0 row3 [8:15]
706  __m128i image0_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements + 8u));
707  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row3Back, image0_row4Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
708 
709  // image1 row3 [0:7]
710  __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
711  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
712 
713  // image row3 [8:15]
714  __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
715  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
716 
717  // ssd row01 [0:19], row23 [0:15]
718  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
719 
720  // image0 row2 [16:19], row3 [16:19]
721  image0_row2Back = _mm_blend_epi16(_mm_srli_si128(image0_row2Back, 8), image0_row3Back, 0xF0); // 0xF0 = 1111 0000
722  image0_row3Back = _mm_blend_epi16(_mm_srli_si128(image0_row3Back, 8), image0_row4Back, 0xF0); // 0xF0 = 1111 0000
723  interpolation0 = SSE::interpolation4Channel32Bit2x4Elements(image0_row2Back, image0_row3Back, f0x_y_, f0xy_, f0x_y, f0xy);
724 
725  // image1 row2 [16:19], row3 [16:19]
726  image1_row2Back = _mm_blend_epi16(_mm_srli_si128(image1_row2Back, 8), image1_row3Back, 0xF0); // 0xF0 = 1111 0000
727  image1_row3Back = _mm_blend_epi16(_mm_srli_si128(image1_row3Back, 8), image1_row4Back, 0xF0); // 0xF0 = 1111 0000
728  interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy);
729 
730  // ssd row03 [0:19]
731  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
732 
733 
734 
735  // row4
736  // image0 row4 [0:7]
737  __m128i image0_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements));
738  interpolation0 = SSE::interpolation4Channel32Bit8Elements(image0_row4Front, image0_row5Front, f0x_y_, f0xy_, f0x_y, f0xy);
739 
740  // image0 row4 [8:15]
741  __m128i image0_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 5u * image0StrideElements + 8u));
742  interpolation0 = _mm_or_si128(interpolation0, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image0_row4Back, image0_row5Back, f0x_y_, f0xy_, f0x_y, f0xy), 8));
743 
744  // image1 row4 [0:7]
745  __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
746  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
747 
748  // image1 row4 [8:15]
749  __m128i image1_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u));
750  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
751 
752  // ssd row03 [0:19] row4[0:15]
753  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
754 
755  // image0 row4 [16:19]
756  image0_row4Back = _mm_and_si128(image0_row4Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
757  image0_row5Back = _mm_and_si128(image0_row5Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
758  interpolation0 = SSE::interpolation4Channel32Bit2x4Elements(image0_row4Back, image0_row5Back, f0x_y_, f0xy_, f0x_y, f0xy);
759 
760  // image1 row4 [16:19]
761  image1_row4Back = _mm_and_si128(image1_row4Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
762  image1_row5Back = _mm_and_si128(image1_row5Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
763  interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy);
764 
765  // ssd row04 [0:19]
766  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(interpolation0, interpolation1));
767 
768  return SSE::sum_u32_4(result);
769 }
770 
/**
 * Specialization for 1 channel and a 5x5 patch.
 * The rows of the first patch are read as they are, while the rows of the second patch are interpolated with
 * SSE::interpolation1Channel8Bit8Elements() using the four products of fx1, fy1 and their complements to 128.
 * The 25 elements of each patch are packed into two SSE registers (rows 0-2, then rows 3-4); each register pair is
 * passed to SSE::sumSquareDifference8Bit16Elements() and both results are accumulated.
 * The byte diagrams in the comments below list byte 0 of the register on the left and byte 15 on the right.
 * @param imageTopLeft0 Start of the first patch, row r is read at offset r * image0StrideElements (rows 0..4)
 * @param imageTopLeft1 Start of the second patch, row r is read at offset r * image1StrideElements (rows 0..5, one more row than the patch height is needed for the interpolation)
 * @param image0StrideElements Number of elements between two successive rows of the first image
 * @param image1StrideElements Number of elements between two successive rows of the second image
 * @param fx1 Interpolation factor in x-direction (by its name) for the second patch, with range [0, 128] (asserted)
 * @param fy1 Interpolation factor in y-direction (by its name) for the second patch, with range [0, 128] (asserted)
 * @return The result of SSE::sum_u32_4() applied to the accumulated square differences
 */
771 template <>
772 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<1u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
773 {
774  ocean_assert(fx1 <= 128u && fy1 <= 128u);
775 
776  SSE::prefetchT0(imageTopLeft0);
777 
778  SSE::prefetchT0(imageTopLeft1);
779  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
780 
     // complements of the interpolation factors with respect to 128
781  const unsigned int fx1_ = 128u - fx1;
782  const unsigned int fy1_ = 128u - fy1;
783 
     // the four factor products; with factors in [0, 128] each product is at most 128 * 128 = 16384 and fits into a signed 16 bit value
784  const unsigned int f1x_y_ = fx1_ * fy1_;
785  const unsigned int f1xy_ = fx1 * fy1_;
786  const unsigned int f1x_y = fx1_ * fy1;
787  const unsigned int f1xy = fx1 * fy1;
788 
     // each product replicated into all eight 16 bit lanes of a register
789  const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
790  const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
791  const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
792  const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
793 
     // the second image is always prefetched one row further than the first image, as the interpolation reads one additional row
794  SSE::prefetchT0(imageTopLeft0 + 1u * image0StrideElements);
795  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
796 
797  // row0 -> [-----------00000]
     // 8 bytes are loaded and moved up by 11 bytes, so only the first 5 elements of the row stay in the register (bytes 11..15)
798  __m128i image0_row = _mm_slli_si128(_mm_loadu_si64(imageTopLeft0), 11);
799 
     // the interpolated patch row 0 (from the image rows 0 and 1) is placed in the same way
800  const __m128i image1_row0 = _mm_loadu_si64(imageTopLeft1);
801  const __m128i image1_row1 = _mm_loadu_si64(imageTopLeft1 + image1StrideElements);
802  __m128i image1_row = _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 11);
803 
804  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
805  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
806 
807 
808  // row1 -> [------1111100000]
     // _mm_blendv_epi8() takes a byte from its second operand wherever the most significant bit of the mask byte is set;
     // row 1 is moved up by 6 bytes before it is blended into the register already holding row 0
809  __m128i mask = SSE::set128i(0x0000000000FFFFFFull, 0xFFFFFFFFFFFFFFFFull);
810  image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 1u * image0StrideElements), 6), mask);
811 
     // interpolated patch row 1 (from the image rows 1 and 2)
812  const __m128i image1_row2 = _mm_loadu_si64(imageTopLeft1 + 2u * image1StrideElements);
813  image1_row = _mm_blendv_epi8(image1_row, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 6), mask);
814 
815  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
816  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
817 
818 #ifdef OCEAN_COMPILER_CLANG
819 
820  // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
     // in this code path row 2 is moved up by 1 byte (instead of being blended in unshifted), so the unused byte is byte 0
821 
822  // row2 -> [-222221111100000]
823  mask = SSE::set128i(0x0000000000000000ull, 0x0000FFFFFFFFFFFFull);
824  image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements), 1u), mask);
825 
     // interpolated patch row 2 (from the image rows 2 and 3)
826  const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
827  image1_row = _mm_blendv_epi8(image1_row, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 1), mask);
828 
829 #else // OCEAN_COMPILER_CLANG
830 
831  // row2 -> [22222-1111100000]
     // row 2 is blended in without any shift, the unused byte is located between row 2 and row 1
832  mask = SSE::set128i(0x0000000000000000ull, 0x000000FFFFFFFFFFull);
833  image0_row = _mm_blendv_epi8(image0_row, _mm_loadu_si64(imageTopLeft0 + 2u * image0StrideElements), mask);
834 
     // interpolated patch row 2 (from the image rows 2 and 3)
835  const __m128i image1_row3 = _mm_loadu_si64(imageTopLeft1 + 3u * image1StrideElements);
836  image1_row = _mm_blendv_epi8(image1_row, SSE::interpolation1Channel8Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy), mask);
837 
838 #endif // OCEAN_COMPILER_CLANG
839 
840  // intermediate ssd
     // covers the patch rows 0, 1 and 2; the one unused byte is expected to be identical (zero) in both registers
841  __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row, image1_row);
842 
843  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
844  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
845 
846 
847  // row3 -> [33333-----------]
     // both registers are re-used for the patch rows 3 and 4; the bytes 5..7 of this load are replaced by row 4 below
848  image0_row = _mm_loadu_si64(imageTopLeft0 + 3u * image0StrideElements);
849 
     // interpolated patch row 3 (from the image rows 3 and 4)
850  const __m128i image1_row4 = _mm_loadu_si64(imageTopLeft1 + 4u * image1StrideElements);
851  image1_row = SSE::interpolation1Channel8Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
852 
853  // row4 -> [3333344444------]
     // the 8 byte load of the last row of the first patch starts 3 bytes early and therefore ends with the 5th patch element,
     // presumably so that no memory behind the patch is read - TODO confirm; moved up by 2 bytes, element 0 lands at byte 5
854  mask = SSE::set128i(0x000000000000FFFFull, 0xFFFFFF0000000000ull);
855  image0_row = _mm_blendv_epi8(image0_row, _mm_slli_si128(_mm_loadu_si64(imageTopLeft0 + 4u * image0StrideElements - 3), 2), mask);
856 
     // the load of image row 5 starts 2 bytes early (ending with the 6th element of that row), image row 4 is moved up by 2 bytes
     // to match this offset; the interpolation result is then moved up by 3 further bytes so that element 0 lands at byte 5
857  const __m128i image1_row5 = _mm_loadu_si64(imageTopLeft1 + 5u * image1StrideElements - 2);
858  image1_row = _mm_blendv_epi8(image1_row, _mm_slli_si128(SSE::interpolation1Channel8Bit8Elements(_mm_slli_si128(image1_row4, 2), image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 3), mask);
859 
860 #ifdef OCEAN_COMPILER_CLANG
861 
862  // workaround for Clang compiler bug - with optimizations the unused SSE bytes are not set to zero
     // moving both registers up by 6 bytes drops their 6 unused upper bytes and fills the lower 6 bytes with zeros
863 
864  image0_row = _mm_slli_si128(image0_row, 6);
865  image1_row = _mm_slli_si128(image1_row, 6);
866 
867 #endif // OCEAN_COMPILER_CLANG
868 
869  // remaining ssd
     // covers the patch rows 3 and 4
870  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row, image1_row));
871 
872  return SSE::sum_u32_4(result);
873 }
874 
875 template <>
876 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<2u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
877 {
878  ocean_assert(fx1 <= 128u && fy1 <= 128u);
879 
880  SSE::prefetchT0(imageTopLeft0);
881  SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
882 
883  SSE::prefetchT0(imageTopLeft1);
884  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
885 
886  const unsigned int fx1_ = 128u - fx1;
887  const unsigned int fy1_ = 128u - fy1;
888 
889  const unsigned int f1x_y_ = fx1_ * fy1_;
890  const unsigned int f1xy_ = fx1 * fy1_;
891  const unsigned int f1x_y = fx1_ * fy1;
892  const unsigned int f1xy = fx1 * fy1;
893 
894  const __m128i __f1x_y_ = _mm_set1_epi16(short(f1x_y_));
895  const __m128i __f1xy_ = _mm_set1_epi16(short(f1xy_));
896  const __m128i __f1x_y = _mm_set1_epi16(short(f1x_y));
897  const __m128i __f1xy = _mm_set1_epi16(short(f1xy));
898 
899  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
900  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
901 
902  // row0
903  // image0 row0[0:7]
904  __m128i image0_row0 = _mm_loadl_epi64((__m128i*)imageTopLeft0);
905 
906  // image1 row0[0:7]
907  __m128i image1_row0 = _mm_lddqu_si128((__m128i*)imageTopLeft1);
908  __m128i image1_row1 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
909  __m128i interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row0, image1_row1, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
910 
911  unsigned int localResult = SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 8, imageTopLeft1 + 8, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
912  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 9, imageTopLeft1 + 9, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
913 
914  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
915  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
916 
917 
918  // row1
919  // image0 row1[0:7]
920  __m128i image0_row1 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 1u * image0StrideElements));
921  image0_row0 = ::_mm_or_si128(image0_row0, _mm_slli_si128(image0_row1, 8));
922 
923  // image1 row1[0:7]
924  __m128i image1_row2 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
925  interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row1, image1_row2, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
926 
927  // ssd row01[0:7]
928  __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row0, interpolation1);
929 
930  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 8u, imageTopLeft1 + 1u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
931  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 1u * image0StrideElements + 9u, imageTopLeft1 + 1u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
932 
933  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
934  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
935 
936 
937  // row 2
938  // image0 row2[0:7]
939  __m128i image0_row2 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
940 
941  // image1 row2[0:7]
942  __m128i image1_row3 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
943  interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row2, image1_row3, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
944 
945  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 8u, imageTopLeft1 + 2u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
946  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 2u * image0StrideElements + 9u, imageTopLeft1 + 2u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
947 
948  SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
949  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
950 
951 
952  // row 3
953  // image0 row3[0:7]
954  __m128i image0_row3 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
955  image0_row2 = _mm_or_si128(image0_row2, _mm_slli_si128(image0_row3, 8));
956 
957  // image1 row3[0:7]
958  __m128i image1_row4 = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
959  interpolation1 = _mm_or_si128(_mm_slli_si128(SSE::interpolation2Channel16Bit8Elements(image1_row3, image1_row4, __f1x_y_, __f1xy_, __f1x_y, __f1xy), 8), interpolation1);
960 
961  // ssd row03[0:7]
962  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row2, interpolation1));
963 
964  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 8u, imageTopLeft1 + 3u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
965  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 3u * image0StrideElements + 9u, imageTopLeft1 + 3u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
966 
967  // row 4
968  // image0 row4[0:7]
969  __m128i image0_row4 = _mm_loadl_epi64((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
970 
971  // image0 row4[0:7]
972  __m128i image1_row5 = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements - 2u)), 2); // here we start 2 bytes earlyer (and shift the bytes later) to avoid a segmentation fault
973  interpolation1 = SSE::interpolation2Channel16Bit8Elements(image1_row4, image1_row5, __f1x_y_, __f1xy_, __f1x_y, __f1xy);
974 
975  // ssd row04[0:7]
976  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row4, interpolation1));
977 
978  localResult += SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 8u, imageTopLeft1 + 4u * image1StrideElements + 8u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy)
979  + SSE::ssd2Channel16Bit1x1(imageTopLeft0 + 4u * image0StrideElements + 9u, imageTopLeft1 + 4u * image1StrideElements + 9u, image0StrideElements, image1StrideElements, f1x_y_, f1xy_, f1x_y, f1xy);
980 
981  return SSE::sum_u32_4(result) + localResult;
982 }
983 
984 template <>
985 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<3u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
986 {
987  SSE::prefetchT0(imageTopLeft0);
988  SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
989 
990  SSE::prefetchT0(imageTopLeft1);
991  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
992 
993  const unsigned int fx1_ = 128u - fx1;
994  const unsigned int fy1_ = 128u - fy1;
995 
996  const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
997  const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
998  const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
999  const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
1000 
1001  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
1002  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
1003 
1004  // row 0
1005  __m128i image0_row0 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)imageTopLeft0), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1006 
1007  __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
1008  __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
1009  __m128i interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
1010 
1011  __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
1012  __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
1013  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1014 
1015  __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row0, interpolation1);
1016 
1017 
1018  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
1019  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
1020 
1021 
1022  // row 1
1023  __m128i image0_row1 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 1u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1024 
1025  __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
1026  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
1027 
1028  __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
1029  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1030 
1031  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row1, interpolation1));
1032 
1033 
1034  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
1035  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
1036 
1037 
1038  // row 2
1039  __m128i image0_row2 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1040 
1041  __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
1042  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
1043 
1044  __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
1045  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1046 
1047  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row2, interpolation1));
1048 
1049 
1050  SSE::prefetchT0(imageTopLeft0 + 5u * image0StrideElements);
1051  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
1052 
1053  // row 3
1054  __m128i image0_row3 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1055 
1056  __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
1057  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
1058 
1059  __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
1060  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1061 
1062  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row3, interpolation1));
1063 
1064 
1065 
1066  // row 4
1067  __m128i image0_row4 = _mm_shuffle_epi8(_mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements)), SSE::set128i(0x0E0D0C0B0A0908A0ull, 0x0706050403020100ull));
1068 
1069  __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
1070  interpolation1 = SSE::interpolation3Channel24Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
1071 
1072  __m128i image1_row5Back = _mm_srli_si128(_mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u - 6u)), 6); // here we start 6 bytes earlyer (and shift the bytes later) to avoid a segmentation fault
1073  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation3Channel24Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 9));
1074 
1075  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row4, interpolation1));
1076 
1077  return SSE::sum_u32_4(result);
1078 }
1079 
1080 template <>
1081 inline uint32_t AdvancedSumSquareDifferencesSSE::patch8BitPerChannel<4u, 5u>(const uint8_t* const imageTopLeft0, const uint8_t* const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
1082 {
1083  SSE::prefetchT0(imageTopLeft0);
1084  SSE::prefetchT0(imageTopLeft0 + image0StrideElements);
1085 
1086  SSE::prefetchT0(imageTopLeft1);
1087  SSE::prefetchT0(imageTopLeft1 + image1StrideElements);
1088 
1089  const unsigned int fx1_ = 128u - fx1;
1090  const unsigned int fy1_ = 128u - fy1;
1091 
1092  const __m128i f1x_y_ = _mm_set1_epi16(short(fx1_ * fy1_));
1093  const __m128i f1xy_ = _mm_set1_epi16(short(fx1 * fy1_));
1094  const __m128i f1x_y = _mm_set1_epi16(short(fx1_ * fy1));
1095  const __m128i f1xy = _mm_set1_epi16(short(fx1 * fy1));
1096 
1097  SSE::prefetchT0(imageTopLeft0 + 1u * image0StrideElements);
1098  SSE::prefetchT0(imageTopLeft1 + 2u * image1StrideElements);
1099 
1100  // row0
1101  // image0 row0 [0:15]
1102  __m128i image0_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft0);
1103 
1104  // image1 row0 [0:7]
1105  __m128i image1_row0Front = _mm_lddqu_si128((__m128i*)imageTopLeft1);
1106  __m128i image1_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements));
1107  __m128i interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row0Front, image1_row1Front, f1x_y_, f1xy_, f1x_y, f1xy);
1108 
1109  // image1 row0 [8:15]
1110  __m128i image1_row0Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 8u));
1111  __m128i image1_row1Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + image1StrideElements + 8u));
1112  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1113 
1114  // ssd row0 [0:15]
1115  __m128i result = SSE::sumSquareDifference8Bit16Elements(image0_row0Front, interpolation1);
1116 
1117  SSE::prefetchT0(imageTopLeft0 + 2u * image0StrideElements);
1118  SSE::prefetchT0(imageTopLeft1 + 3u * image1StrideElements);
1119 
1120 
1121 
1122  // row1
1123  // image0 row1 [0:15]
1124  __m128i image0_row1Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 1u * image0StrideElements));
1125 
1126  // image1 row1 [0:7]
1127  __m128i image1_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements));
1128  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row1Front, image1_row2Front, f1x_y_, f1xy_, f1x_y, f1xy);
1129 
1130  // image1 row1 [8:15]
1131  __m128i image1_row2Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 2u * image1StrideElements + 8u));
1132  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row1Back, image1_row2Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1133 
1134  // ssd row01 [0:15]
1135  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row1Front, interpolation1));
1136 
1137  // image0 row0 [16:19], row1 [16:19]
1138  __m128i image0row01 = _mm_set_epi32(0, 0, *((unsigned int*)(imageTopLeft0 + image0StrideElements + 16u)), *((unsigned int*)(imageTopLeft0 + 16)));
1139 
1140  // image1 row1 [16:19], row1 [16:19]
1141  image1_row0Back = _mm_blend_epi16(_mm_srli_si128(image1_row0Back, 8), image1_row1Back, 0xF0); // 0xF0 = 1111 0000
1142  image1_row1Back = _mm_blend_epi16(_mm_srli_si128(image1_row1Back, 8), image1_row2Back, 0xF0); // 0xF0 = 1111 0000
1143  interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row0Back, image1_row1Back, f1x_y_, f1xy_, f1x_y, f1xy);
1144 
1145  // ssd row01 [0:19]
1146  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0row01, interpolation1));
1147 
1148 
1149  SSE::prefetchT0(imageTopLeft0 + 3u * image0StrideElements);
1150  SSE::prefetchT0(imageTopLeft1 + 4u * image1StrideElements);
1151 
1152 
1153 
1154  // row2
1155  // image0 row2 [0:7]
1156  __m128i image0_row2Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 2u * image0StrideElements));
1157 
1158  // image1 row2 [0:7]
1159  __m128i image1_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements));
1160  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row2Front, image1_row3Front, f1x_y_, f1xy_, f1x_y, f1xy);
1161 
1162  // image1 row2 [8:15]
1163  __m128i image1_row3Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 3u * image1StrideElements + 8u));
1164  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1165 
1166  // ssd row01 [0:19], row2 [0:15]
1167  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row2Front, interpolation1));
1168 
1169  SSE::prefetchT0(imageTopLeft0 + 4u * image0StrideElements);
1170  SSE::prefetchT0(imageTopLeft1 + 5u * image1StrideElements);
1171 
1172 
1173 
1174  // row3
1175  // image0 row3 [0:7]
1176  __m128i image0_row3Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 3u * image0StrideElements));
1177 
1178  // image1 row3 [0:7]
1179  __m128i image1_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements));
1180  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row3Front, image1_row4Front, f1x_y_, f1xy_, f1x_y, f1xy);
1181 
1182  // image row3 [8:15]
1183  __m128i image1_row4Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 4u * image1StrideElements + 8u));
1184  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row3Back, image1_row4Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1185 
1186  // ssd row01 [0:19], row23 [0:15]
1187  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row3Front, interpolation1));
1188 
1189  // image0 row2 [16:19], row3 [16:19]
1190  __m128i image0row23 = _mm_set_epi32(0, 0, *((unsigned int*)(imageTopLeft0 + 3u * image0StrideElements + 16u)), *((unsigned int*)(imageTopLeft0 + 2u * image0StrideElements + 16)));
1191 
1192  // image1 row2 [16:19], row3 [16:19]
1193  image1_row2Back = _mm_blend_epi16(_mm_srli_si128(image1_row2Back, 8), image1_row3Back, 0xF0); // 0xF0 = 1111 0000
1194  image1_row3Back = _mm_blend_epi16(_mm_srli_si128(image1_row3Back, 8), image1_row4Back, 0xF0); // 0xF0 = 1111 0000
1195  interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row2Back, image1_row3Back, f1x_y_, f1xy_, f1x_y, f1xy);
1196 
1197  // ssd row03 [0:19]
1198  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0row23, interpolation1));
1199 
1200 
1201 
1202  // row4
1203  // image0 row4 [0:7]
1204  __m128i image0_row4Front = _mm_lddqu_si128((__m128i*)(imageTopLeft0 + 4u * image0StrideElements));
1205 
1206  // image1 row4 [0:7]
1207  __m128i image1_row5Front = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements));
1208  interpolation1 = SSE::interpolation4Channel32Bit8Elements(image1_row4Front, image1_row5Front, f1x_y_, f1xy_, f1x_y, f1xy);
1209 
1210  // image1 row4 [8:15]
1211  __m128i image1_row5Back = _mm_lddqu_si128((__m128i*)(imageTopLeft1 + 5u * image1StrideElements + 8u));
1212  interpolation1 = _mm_or_si128(interpolation1, _mm_slli_si128(SSE::interpolation4Channel32Bit8Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy), 8));
1213 
1214  // ssd row03 [0:19] row4[0:15]
1215  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0_row4Front, interpolation1));
1216 
1217  // image0 row4 [16:19]
1218  __m128i image0row4 = _mm_set_epi32(0, 0, *((unsigned int*)(imageTopLeft0 + 4u * image0StrideElements + 16u)), 0);
1219 
1220  // image1 row4 [16:19]
1221  image1_row4Back = _mm_and_si128(image1_row4Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
1222  image1_row5Back = _mm_and_si128(image1_row5Back, SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0x0000000000000000ull));
1223  interpolation1 = SSE::interpolation4Channel32Bit2x4Elements(image1_row4Back, image1_row5Back, f1x_y_, f1xy_, f1x_y, f1xy);
1224 
1225  // ssd row04 [0:19]
1226  result = _mm_add_epi32(result, SSE::sumSquareDifference8Bit16Elements(image0row4, interpolation1));
1227 
1228  return SSE::sum_u32_4(result);
1229 }
1230 
1231 }
1232 
1233 }
1234 
1235 }
1236 
1237 #endif // OCEAN_HARDWARE_SSE_VERSION >= 41
1238 
1239 #endif // META_OCEAN_CV_ADVANCED_ADVANCED_SUM_SQUARE_DIFFERENCES_SSE_H
This class implements sum of square difference calculation functions allowing to determine the SSE wi...
Definition: AdvancedSumSquareDifferencesSSE.h:33
static uint32_t patch8BitPerChannel(const uint8_t *image0, const uint8_t *image1, const unsigned int width0, const unsigned int width1, const Scalar centerX0, const Scalar centerY0, const Scalar centerX1, const Scalar centerY1, const unsigned int image0PaddingElements, const unsigned int image1PaddingElements)
Returns the sum of square differences for an image patch determined between two individual images.
Definition: AdvancedSumSquareDifferencesSSE.h:110
static uint32_t patch8BitPerChannel(const uint8_t *const imageTopLeft0, const uint8_t *const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx0, const unsigned int fy0, const unsigned int fx1, const unsigned int fy1)
Returns the sum of square differences for an image patch determined between two individual images.
static uint32_t patch8BitPerChannel(const uint8_t *const imageTopLeft0, const uint8_t *const imageTopLeft1, const unsigned int image0StrideElements, const unsigned int image1StrideElements, const unsigned int fx1, const unsigned int fy1)
Returns the sum of square differences for an image patch determined between two individual images.
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: SSE.h:1583
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition: SSE.h:2264
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition: SSE.h:2117
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: SSE.h:1733
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition: SSE.h:1533
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition: SSE.h:1322
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition: SSE.h:3927
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition: SSE.h:3770
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition: SSE.h:1879
float Scalar
Definition of a scalar type.
Definition: Math.h:128
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15