Ocean
SSE.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_SSE_H
9 #define META_OCEAN_CV_SSE_H
10 
11 #include "ocean/cv/CV.h"
12 
13 #include "ocean/base/Utilities.h"
14 
15 #include "ocean/math/Math.h"
16 
17 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
18 
19 // SSE2 include files
20 #include <emmintrin.h>
21 #include <immintrin.h>
22 #include <mmintrin.h>
23 
24 // SSE3 include files
25 #include <pmmintrin.h>
26 #include <mmintrin.h>
27 
28 // SSE4 include files
29 #include <smmintrin.h>
30 
31 namespace Ocean
32 {
33 
34 namespace CV
35 {
36 
37 /**
38  * This class implements computer vision functions using SSE extensions.
39  * @ingroup cv
40  */
41 class SSE
42 {
43  public:
44 
45 #if !defined(OCEAN_COMPILER_MSC)
46 
47  /**
48  * This union defines a wrapper for the __m128i SSE intrinsic data type.
49  */
50  union M128i
51  {
52  /// The two 64 bit elements.
53  uint64_t m128i_u64[2];
54 
55  /// The four 32 bit elements.
56  uint32_t m128i_u32[4];
57 
58  /// The eight 16 bit elements.
59  uint16_t m128i_u16[8];
60 
61  /// The sixteen 8 bit elements.
62  uint8_t m128i_u8[16];
63  };
64 
65  static_assert(sizeof(M128i) == 16, "Invalid data type!");
66 
67  /**
68  * This union defines a wrapper for the __m128 SSE intrinsic data type.
69  */
70  union M128
71  {
72  /// The four 32 bit elements.
73  float m128_f32[4];
74  };
75 
76  static_assert(sizeof(M128) == 16, "Invalid data type!");
77 
78  /**
79  * This union defines a wrapper for the __m128d SSE intrinsic data type.
80  */
81  union M128d
82  {
83  /// The two 64 bit elements.
84  double m128d_f64[2];
85  };
86 
87  static_assert(sizeof(M128d) == 16, "Invalid data type!");
88 
89 #endif
90 
91  public:
92 
93  /**
94  * Prefetches a block of temporal memory into all cache levels.
95  * @param data Data to be prefetched
96  */
97  static inline void prefetchT0(const void* const data);
98 
99  /**
100  * Prefetches a block of temporal memory in all cache levels except 0th cache level.
101  * @param data Data to be prefetched
102  */
103  static inline void prefetchT1(const void* const data);
104 
105  /**
106  * Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
107  * @param data Data to be prefetched
108  */
109  static inline void prefetchT2(const void* const data);
110 
111  /**
112  * Prefetches a block of non-temporal memory into non-temporal cache structure.
113  * @param data Data to be prefetched
114  */
115  static inline void prefetchNTA(const void* const data);
116 
117  /**
118  * Returns one specific 8 bit unsigned integer value of a m128i value object.
119  * @param value The value from which the 8 bit value will be returned
120  * @return The requested 8 bit value
121  * @tparam tIndex The index of the requested 8 bit integer value, with range [0, 15]
122  */
123  template <unsigned int tIndex>
124  static inline uint8_t value_u8(const __m128i& value);
125 
126  /**
127  * Returns one specific 8 bit unsigned integer value of a m128i value object.
128  * @param value The value from which the 8 bit value will be returned
129  * @param index The index of the requested 8 bit integer value, with range [0, 15]
130  * @return The requested 8 bit value
131  */
132  static inline uint8_t value_u8(const __m128i& value, const unsigned int index);
133 
134  /**
135  * Returns one specific 16 bit unsigned integer value of a m128i value object.
136  * @param value The value from which the 16 bit value will be returned
137  * @return The requested 16 bit value
138  * @tparam tIndex The index of the requested 16 bit integer value, with range [0, 7]
139  */
140  template <unsigned int tIndex>
141  static inline uint16_t value_u16(const __m128i& value);
142 
143  /**
144  * Returns one specific 32 bit unsigned integer value of a m128i value object.
145  * @param value The value from which the 32 bit value will be returned
146  * @return The requested 32 bit value
147  * @tparam tIndex The index of the requested 32 bit integer value, with range [0, 3]
148  */
149  template <unsigned int tIndex>
150  static inline unsigned int value_u32(const __m128i& value);
151 
152  /**
153  * Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the result.
154  * @param value The value which elements will be added
155  * @return The resulting sum value
156  */
157  static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i& value);
158 
159  /**
160  * Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
161  * @param value The value which elements will be added
162  * @return The resulting sum value
163  */
164  static inline unsigned int sum_u32_first_2(const __m128i& value);
165 
166  /**
167  * Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
168  * @param value The value which elements will be added
169  * @return The resulting sum value
170  */
171  static inline unsigned int sum_u32_first_third(const __m128i& value);
172 
173  /**
174  * Adds the four (all four) individual 32 bit float of a m128 value and returns the result.
175  * @param value The value which elements will be added
176  * @return The resulting sum value
177  */
178  static OCEAN_FORCE_INLINE float sum_f32_4(const __m128& value);
179 
180  /**
181  * Adds the two (all two) individual 64 bit floats of a m128d value and returns the result.
182  * @param value The value which elements will be added
183  * @return The resulting sum value
184  */
185  static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d& value);
186 
187  /**
188  * Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
189  * @param image0 First 11 elements to determine the ssd for, may be non aligned
190  * @param image1 Second 11 elements to determine the ssd for, may be non aligned
191  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
192  */
193  static inline __m128i sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
194 
195  /**
196  * Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit precision, the remaining 4 elements are set to zero.
197  * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
198  * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [00 01 02 03 04 05 06 07 08 09 10 11 NA NA NA NA].
199  * @param image0 First 12 (+4) elements to determine the ssd for, with any alignment
200  * @param image1 Second 12 (+4) elements to determine the ssd for, with any alignment
201  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
202  */
203  static inline __m128i sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
204 
205  /**
206  * Sum square difference determination for the last 12 elements of a 16 elements buffer with 8 bit precision, the beginning 4 elements are interpreted as zero.
207  * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
208  * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA NA 04 05 06 07 08 09 10 11 12 13 14 15].
209  * @param image0 First (4+) 12 elements to determine the ssd for, with any alignment
210  * @param image1 Second (4+) 12 elements to determine the ssd for, with any alignment
211  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
212  */
213  static inline __m128i sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
214 
215  /**
216  * Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
217  * This function supports to load the 13 elements from a buffer with only 13 bytes or with a buffer with at least 16 bytes.
218  * @param image0 First 13 elements to determine the ssd for, may be non aligned
219  * @param image1 Second 13 elements to determine the ssd for, may be non aligned
220  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
221  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 13 bytes only
222  */
223  template <bool tBufferHas16Bytes>
224  static inline __m128i sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
225 
226  /**
227  * Sum square difference determination for the last 13 elements of a 16 elements buffer with 8 bit precision, the beginning 3 elements are interpreted as zero.
228  * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
229  * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA 03 04 05 06 07 08 09 10 11 12 13 14 15].
230  * @param image0 First (3+) 13 elements to determine the ssd for, may be non aligned
231  * @param image1 Second (3+) 13 elements to determine the ssd for, may be non aligned
232  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
233  */
234  static inline __m128i sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
235 
236  /**
237  * Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
238  * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
239  * @param image0 First 15 elements to determine the ssd for, may be non aligned
240  * @param image1 Second 15 elements to determine the ssd for, may be non aligned
241  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
242  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
243  */
244  template <bool tBufferHas16Bytes>
245  static inline __m128i sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
246 
247  /**
248  * Sum square difference determination for 16 elements with 8 bit precision.
249  * @param image0 First 16 elements to determine the ssd for, may be non aligned
250  * @param image1 Second 16 elements to determine the ssd for, may be non aligned
251  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
252  */
253  static inline __m128i sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
254 
255  /**
256  * Sum square difference determination for 16 elements with 8 bit precision.
257  * @param image0 First 16 elements to determine the ssd for, may be non aligned
258  * @param image1 Second 16 elements to determine the ssd for, may be non aligned
259  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
260  */
261  static inline __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1);
262 
263  /**
264  * Sum square difference determination for 16 elements with 8 bit precision.
265  * @param row0 First 16 elements to determine the ssd for
266  * @param row1 Second 16 elements to determine the ssd for
267  * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
268  */
269  static inline __m128i sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1);
270 
271  /**
272  * Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
273  * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
274  * @param image0 First row of 8 elements
275  * @param image1 Second row of 8 elements
276  * @param result Resulting 4 average elements
277  */
278  static inline void average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result);
279 
280  /**
281  * Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
282  * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
283  * @param image0 First row of 8 elements
284  * @param image1 Second row of 8 elements
285  * @param result Resulting 4 average elements
286  */
287  static inline void average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
288 
289  /**
290  * Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
291  * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
292  * @param image0 First row of 8 elements, must be valid
293  * @param image1 Second row of 8 elements, must be valid
294  * @param result Resulting 4 average elements, must be valid
295  * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
296  */
297  static inline void average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
298 
299  /**
300  * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
301  * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
302  * @param image0 First row of 16 elements, must be valid
303  * @param image1 Second row of 16 elements, must be valid
304  * @param result Resulting 8 average elements, must be valid
305  */
306  static inline void average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
307 
308  /**
309  * Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
310  * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
311  * @param image0 First row of 16 elements, must be valid
312  * @param image1 Second row of 16 elements, must be valid
313  * @param result Resulting 8 average elements, must be valid
314  * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
315  */
316  static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
317 
318  /**
319  * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
320  * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
321  * @param image0 First row of 32 elements
322  * @param image1 Second row of 32 elements
323  * @param result Resulting 16 average elements
324  */
325  static inline void average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
326 
327  /**
328  * Averages 32 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
329  * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
330  * @param image0 First row of 32 elements, must be valid
331  * @param image1 Second row of 32 elements, must be valid
332  * @param result Resulting 16 average elements, must be valid
333  * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
334  */
335  static inline void average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
336 
337  /**
338  * Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
339  * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels, each with 2 channels).<br>
340  * @param image0 First row of 8 elements
341  * @param image1 Second row of 8 elements
342  * @param result Resulting 4 average elements
343  */
344  static inline void average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
345 
346  /**
347  * Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
348  * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels).<br>
349  * @param image0 First row of 8 elements
350  * @param image1 Second row of 8 elements
351  * @param result Resulting 4 average elements
352  */
353  static inline void average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result);
354 
355  /**
356  * Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
357  * The function takes two rows of 16 elements and returns 8 average elements (4 averaged pixels, each with 2 channels).<br>
358  * @param image0 First row of 16 elements
359  * @param image1 Second row of 16 elements
360  * @param result Resulting 8 average elements
361  */
362  static inline void average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
363 
364  /**
365  * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
366  * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).<br>
367  * @param image0 First row of 32 elements
368  * @param image1 Second row of 32 elements
369  * @param result Resulting 16 average elements
370  */
371  static inline void average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
372 
373  /**
374  * Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
375  * The function takes two rows of 6 elements and returns 3 average elements (1 averaged pixels, each with 3 channels).<br>
376  * @param image0 First row of 6 elements
377  * @param image1 Second row of 6 elements
378  * @param result Resulting 3 average elements
379  */
380  static inline void average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result);
381 
382  /**
383  * Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
384  * The function takes two rows of 24 elements and returns 12 average elements (4 averaged pixels, each with 3 channels).<br>
385  * @param image0 First row of 24 elements
386  * @param image1 Second row of 24 elements
387  * @param result Resulting 12 average elements
388  */
389  static inline void average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
390 
391  /**
392  * Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
393  * The function takes two rows of 8 elements and returns 4 average elements (1 averaged pixel).<br>
394  * @param image0 First row of 8 elements
395  * @param image1 Second row of 8 elements
396  * @param result Resulting 4 average elements
397  */
398  static inline void average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result);
399 
400  /**
401  * Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
402  * The function takes two rows of 16 elements and returns 8 average elements (2 averaged pixels, each with 4 channels).<br>
403  * @param image0 First row of 16 elements
404  * @param image1 Second row of 16 elements
405  * @param result Resulting 8 average elements
406  */
407  static inline void average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
408 
409  /**
410  * Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
411  * The function takes two rows of 32 elements and returns 16 average elements (4 averaged pixels, each with 4 channels).<br>
412  * @param image0 First row of 32 elements
413  * @param image1 Second row of 32 elements
414  * @param result Resulting 16 average elements
415  */
416  static inline void average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
417 
418  /**
419  * Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
420  * The function takes three rows of 30 elements and returns 10 average elements (10 averaged pixels).<br>
421  * @param image0 First row of 30 elements
422  * @param image1 Second row of 30 elements
423  * @param image2 Third row of 30 elements
424  * @param result Resulting 10 average elements
425  */
426  static inline void average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
427 
428  /**
429  * Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
430  * This function must be invoked before the right shift is applied.
431  * @param value The eight signed 16 bit values to be handled
432  * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
433  */
434  static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i& value);
435 
436  /**
437  * Adds 2^shifts - 1 to each negative signed 16 bit value, so that each value can be right shifted to allow a correct division by 2^shifts.
438  * This function must be invoked before the right shift is applied.
439  * @param value The eight signed 16 bit values to be handled
440  * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
441  * @return The modified value for which division and shift yield equal (and correct!) results
442  */
443  static inline __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts);
444 
445  /**
446  * Divides eight signed 16 bit values by applying a right shift.
447  * This is able to determine the correct division result for positive and negative 16 bit values.
448  * @param value The eight signed 16 bit values to be handled
449  * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
450  * @return The divided values
451  */
452  static inline __m128i divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts);
453 
454  /**
455  * Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
456  * This function must be invoked before the right shift is applied.
457  * @param value The four signed 32 bit values to be handled
458  * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
459  */
460  static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i& value);
461 
462  /**
463  * Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to allow a correct division by 2^shifts.
464  * This function must be invoked before the right shift is applied.
465  * @param value The four signed 32 bit values to be handled
466  * @param rightShifts The number of right shifts which needs to be applied, with range [0, 31]
467  * @return The modified value for which division and shift yield equal (and correct!) results
468  */
469  static inline __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts);
470 
471  /**
472  * Divides four signed 32 bit values by applying a right shift.
473  * This is able to determine the correct division result for positive and negative 32 bit values.
474  * @param value The four signed 32 bit values to be handled
475  * @param rightShifts The number of right shifts which needs to be applied, with range [0, 31]
476  * @return The divided values
477  */
478  static inline __m128i divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts);
479 
480  /**
481  * Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 bit frame.
482  * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
483  * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
484  * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
485  * @param width The width of the original frame in pixel, with range [10, infinity)
486  */
487  static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
488 
489  /**
490  * Determines the squared horizontal and vertical gradients and the product of both gradients for 8 following pixels for a given 1 channel 8 bit frame.
491  * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
492  * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
493  * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
494  * @param width The width of the original frame in pixel, with range [10, infinity)
495  */
496  static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
497 
498  /**
499  * Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
500  * @param image0 First 11 elements to determine the sad for, may be non aligned
501  * @param image1 Second 11 elements to determine the sad for, may be non aligned
502  * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
503  */
504  static inline __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
505 
506  /**
507  * Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
508  * This function supports to load the 10 elements from a buffer with only 10 bytes or with a buffer with at least 16 bytes.
509  * @param image0 First 10 elements to determine the sad for, may be non aligned
510  * @param image1 Second 10 elements to determine the sad for, may be non aligned
511  * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
512  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 10 bytes only
513  */
514  template <bool tBufferHas16Bytes>
515  static inline __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
516 
517  /**
518  * Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
519  * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
520  * @param image0 First 15 elements to determine the sad for, may be non aligned
521  * @param image1 Second 15 elements to determine the sad for, may be non aligned
522  * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
523  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
524  */
525  template <bool tBufferHas16Bytes>
526  static inline __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
527 
528  /**
529  * Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
530  * The first interpolation element results from the first and second element of both rows.<br>
531  * The second interpolation element results from the second and third element of both rows.<br>
532  * ...<br>
533  * The eighth interpolation element results from the eighth and ninth.<br>
534  * The interpolation is specified by tx and ty with range [0, 128u].<br>
535  * @param values0 First row of 9 elements to be interpolated
536  * @param values1 Second row of 9 elements to be interpolated
537  * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
538  * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
539  * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
540  * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
541  * @return Interpolation result for 8 elements, which are 8 pixels
542  */
543  static inline __m128i interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
544 
545  /**
546  * Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
547  * The first interpolation element results from the first and second element of both rows.<br>
548  * The second interpolation element results from the second and third element of both rows.<br>
549  * ...<br>
550  * The eighth interpolation element results from the eighth and ninth.<br>
551  * The interpolation is specified by tx and ty with range [0, 128u].<br>
552  * @param values0 First row of 10 elements to be interpolated
553  * @param values1 Second row of 10 elements to be interpolated
554  * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
555  * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
556  * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
557  * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
558  * @return Interpolation result for 8 elements, which are 4 pixels
559  */
560  static inline __m128i interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
561 
562  /**
563  * Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
564  * The first interpolation element results from the first and second element of both rows.<br>
565  * The second interpolation element results from the second and third element of both rows.<br>
566  * ...<br>
567  * The eighth interpolation element results from the eighth and ninth.<br>
568  * The interpolation is specified by tx and ty with range [0, 128u].<br>
569  * @param values0 First row of 11 elements to be interpolated
570  * @param values1 Second row of 11 elements to be interpolated
571  * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
572  * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
573  * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
574  * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
575  * @return Interpolation result for 8 elements, which are (2 2/3 pixels)
576  */
577  static inline __m128i interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
578 
579  /**
580  * Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
581  * The interpolation is specified by tx and ty with range [0, 128u].<br>
582  * @param values0 First row of 16 elements to be interpolated
583  * @param values1 Second row of 16 elements to be interpolated
584  * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
585  * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
586  * @return Interpolation result for 15 elements, which are (15 pixels)
587  */
588  static inline __m128i interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
589 
590  /**
591  * Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
592  * The interpolation is specified by tx and ty with range [0, 128u].<br>
593  * @param values0 First row of 15 elements to be interpolated
594  * @param values1 Second row of 15 elements to be interpolated
595  * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
596  * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
597  * @return Interpolation result for 12 elements, which are (4 pixels)
598  */
599  static inline __m128i interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
600 
601  /**
602  * Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
603  * The first interpolation element results from the first and second element of both rows.<br>
604  * The second interpolation element results from the second and third element of both rows.<br>
605  * ...<br>
606  * The eighth interpolation element results from the eighth and ninth.<br>
607  * The interpolation is specified by tx and ty with range [0, 128u].<br>
608  * @param values0 First row of 12 elements to be interpolated
609  * @param values1 Second row of 12 elements to be interpolated
610  * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
611  * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
612  * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
613  * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
614  * @return Interpolation result for 8 elements, which are (2 pixels)
615  */
616  static inline __m128i interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
617 
618  /**
619  * Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit frames.
620  * The first interpolation element results from the first and second element of both rows.<br>
621  * The second interpolation element results from the second and third element of both rows.<br>
622  * ...<br>
623  * The eighth interpolation element results from the eighth and ninth.<br>
624  * The interpolation is specified by tx and ty with range [0, 128u].<br>
625  * @param values0 First row of 16 elements to be interpolated
626  * @param values1 Second row of 16 elements to be interpolated
627  * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
628  * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
629  * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
630  * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
631  * @return Interpolation result for 8 elements, which are (2 pixels)
632  */
633  static inline __m128i interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
634 
635  /**
636  * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
637  * @param pixel0 Upper left pixel in the first frame
638  * @param pixel1 Upper left pixel in the second frame
639  * @param size0 Size of one frame row in bytes
640  * @param size1 Size of one frame row in bytes
641  * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
642  * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
643  * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
644  * @param f1xy Product of the fx and the fy interpolation factor for the second image
645  * @return Interpolated sum of square difference
646  */
647  static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
648 
649  /**
650  * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
651  * @param pixel0 Upper left pixel in the first frame
652  * @param pixel1 Upper left pixel in the second frame
653  * @param size0 Size of one frame row in bytes
654  * @param size1 Size of one frame row in bytes
655  * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
656  * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
657  * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
658  * @param f0xy Product of the fx and the fy interpolation factor for the first image
659  * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
660  * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
661  * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
662  * @param f1xy Product of the fx and the fy interpolation factor for the second image
663  * @return Interpolated sum of square difference
664  */
665  static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
666 
667  /**
668  * Sum absolute differences determination for 16 elements of a 16 elements buffer with 8 bit precision.
669  * @param image0 First 16 elements to determine the sad for, may be non aligned
670  * @param image1 Second 16 elements to determine the sad for, may be non aligned
671  * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
672  */
673  static inline __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
674 
675  /**
676  * Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
677  * This function converts X CBA CBA CBA CBA CBA to 00000000000CCCCC 000BBBBB000AAAAA.
678  * @param interleaved The 15 elements holding the interleaved image data
679  * @param channel01 Resulting first and second channel elements, first 8 elements of the first channel, followed by 8 elements of the second channel
680  * @param channel2 Resulting third channel elements, first 8 elements of the third channel, followed by zeros
681  */
682  static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2);
683 
684  /**
685  * Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
686  * This function converts XX XXX XXX CBA CBA CB A CBA CBA CBA CBA CBA to 00000000CCCCCCCC BBBBBBBBAAAAAAAA.
687  * @param interleavedA First 16 elements holding the interleaved image data
688  * @param interleavedB Second 16 elements holding the interleaved image data, the first 8 elements will be used only
689  * @param channel01 Resulting first and second channel elements, first 8 elements of the first channel, followed by 8 elements of the second channel
690  * @param channel2 Resulting third channel elements, first 8 elements of the third channel, followed by zeros
691  */
692  static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2);
693 
694  /**
695  * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
696  * This function converts CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA to CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA.
697  * @param interleavedA First 16 elements holding the interleaved image data
698  * @param interleavedB Second 16 elements holding the interleaved image data
699  * @param interleavedC Third 16 elements holding the interleaved image data
700  * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
701  * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
702  * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
703  */
704  static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2);
705 
706  /**
707  * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
708  * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
709  * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
710  * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
711  * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
712  */
713  static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
714 
715  /**
716  * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
717  * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes), must be valid
718  * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively, must be valid
719  * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively, must be valid
720  * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively, must be valid
721  */
722  static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2);
723 
724  /**
725  * Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
726  * @param interleaved 45 elements of an image with 3 channels and 8 bit per element (45 bytes), must be valid
727  * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
728  * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
729  * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
730  */
731  static inline void deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
732 
733  /**
734  * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
735  * This function converts CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA to CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA.
736  * @param channel0 The 16 elements of the first channel to be interleaved
737  * @param channel1 The 16 elements of the second channel to be interleaved
738  * @param channel2 The 16 elements of the third channel to be interleaved
739  * @param interleavedA Resulting first 16 of the interleaved data
740  * @param interleavedB Resulting second 16 of the interleaved data
741  * @param interleavedC Resulting third 16 of the interleaved data
742  */
743  OCEAN_FORCE_INLINE static void interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC);
744 
745  /**
746  * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
747  * @param channel0 The 16 elements of the first channel to be interleaved, must be valid
748  * @param channel1 The 16 elements of the second channel to be interleaved, must be valid
749  * @param channel2 The 16 elements of the third channel to be interleaved, must be valid
750  * @param interleaved The resulting 48 interleaved elements, must be valid
751  */
752  static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved);
753 
754  /**
755  * Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels and 8 bit per element (e.g., YA16 to AY16).
756  * @param interleaved 16 elements of an image with 2 channels and 8 bit per element (32 bytes)
757  * @param reversedInterleaved Resulting 32 elements with reversed channel order
758  */
759  static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
760 
761  /**
762  * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element.
763  * @param interleaved0 First 16 elements holding the interleaved image data
764  * @param interleaved1 Second 16 elements holding the interleaved image data
765  * @param interleaved2 Third 16 elements holding the interleaved image data
766  * @param reversedInterleaved0 Resulting first 16 elements holding the interleaved image data with reversed channel order
767  * @param reversedInterleaved1 Resulting second 16 elements holding the interleaved image data with reversed channel order
768  * @param reversedInterleaved2 Resulting third 16 elements holding the interleaved image data with reversed channel order
769  */
770  static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2);
771 
772  /**
773  * Reverses the order of the first and last channel of 48 elements (16 pixels) of an image with 3 interleaved channels and 8 bit per element (e.g., RGB24 to BGR24).
774  * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
775  * @param reversedInterleaved Resulting 48 elements with reversed channel order
776  */
777  static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
778 
779  /**
780  * Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels and 8 bit per element (e.g., RGBA32 to ABGR32).
781  * @param interleaved 64 elements of an image with 4 channels and 8 bit per element (64 bytes)
782  * @param reversedInterleaved Resulting 64 elements with reversed channel order
783  */
784  static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
785 
786  /**
787  * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element (in place).
788  * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
789  */
790  static void reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved);
791 
792  /**
793  * Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interleaved channels and 8 bit per element and further swaps both sets.
794  * @param first First 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
795  * @param second Second 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
796  */
797  static inline void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second);
798 
799  /**
800  * Reverses the order of 48 elements with 8 bit per element.
801  * @param elements0 First 16 elements
802  * @param elements1 Second 16 elements
803  * @param elements2 Third 16 elements
804  * @param reversedElements0 Resulting reversed first 16 elements
805  * @param reversedElements1 Resulting reversed second 16 elements
806  * @param reversedElements2 Resulting reversed third 16 elements
807  */
808  static inline void reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2);
809 
810  /**
811  * Reverses the order of 48 elements with 8 bit per element.
812  * @param elements 48 elements that will be reversed
813  * @param reversedElements Resulting reversed 48 elements
814  */
815  static inline void reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements);
816 
817  /**
818  * Reverses the order of 48 elements with 8 bit per element (in place).
819  * @param elements 48 elements that will be reversed
820  */
821  static inline void reverseElements8Bit48Elements(uint8_t* elements);
822 
823  /**
824  * Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
825  * @param first First 48 elements that will be reversed and swapped with the second 48 elements
826  * @param second Second 48 elements that will be reversed and swapped with the first 48 elements
827  */
828  static inline void swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second);
829 
830  /**
831  * Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back channel.
832  * The function takes four pixels DCBA DCBA DCBA DCBA and provides ADCB ADCB ADCB ADCB.<br>
833  * @param elements 16 elements of 4 pixels to be shifted
834  * @param shiftedElements Resulting shifted elements
835  */
836  static inline void shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
837 
838  /**
839  * Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back channel and mirrors the four individual pixels.
840  * @param elements 16 elements of 4 pixels to be shifted and mirrored
841  * @param shiftedElements Resulting shifted and mirrored elements
842  */
843  static inline void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
844 
845  /**
846  * Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front channel.
847  * The function takes four pixels DCBA DCBA DCBA DCBA and provides CBAD CBAD CBAD CBAD.<br>
848  * @param elements 16 elements of 4 pixels to be shifted
849  * @param shiftedElements Resulting shifted elements
850  */
851  static inline void shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
852 
853  /**
854  * Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front channel and mirrors the four individual pixels.
855  * @param elements 16 elements of 4 pixels to be shifted and mirrored
856  * @param shiftedElements Resulting shifted and mirrored elements
857  */
858  static inline void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
859 
860  /**
861  * Sums 16 elements with 8 bit per element.
862  * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
863  * @param elements 16 elements holding the image data
864  * @return Resulting sums
865  */
866  static inline __m128i sum1Channel8Bit16Elements(const __m128i& elements);
867 
868  /**
869  * Sums 16 elements with 8 bit per element.
870  * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
871  * @param elements 16 elements holding the image data
872  * @return Resulting sums
873  */
874  static inline __m128i sum1Channel8Bit16Elements(const uint8_t* elements);
875 
876  /**
877  * Sums the first 15 elements of a buffer with 8 bit per element.
878  * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.<br>
879  * If the provided buffer holds at least 16 bytes the load function is much faster compared to the case if the buffer is not larger than 15 bytes.<br>
880  * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
881  * @param elements 15 elements holding the image data
882  * @return Resulting sums
883  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
884  */
885  template <bool tBufferHas16Bytes>
886  static inline __m128i sum1Channel8BitFront15Elements(const uint8_t* elements);
887 
888  /**
889  * Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is interpreted as zero.
890  * However, the provided buffer must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE register.<br>
891  * Thus, this functions handles one buffer with this pattern (while the memory starts left and ends right): [NA 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15].
892  * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
893  * @param elements (1+) 15 elements holding the image data
894  * @return Resulting sum
895  */
896  static inline __m128i sum1Channel8BitBack15Elements(const uint8_t* elements);
897 
898  /**
899  * Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
900  * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
901  * @param interleaved0 First 16 elements holding the interleaved image data
902  * @param interleaved1 Second 16 elements holding the interleaved image data
903  * @param interleaved2 Third 16 elements holding the interleaved image data
904  * @return Resulting sums
905  */
906  static inline __m128i sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2);
907 
908  /**
909  * Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
910  * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
911  * @param interleaved 48 elements holding the interleaved image data
912  * @return Resulting sums
913  */
914  static inline __m128i sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved);
915 
916  /**
917  * Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
918  * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
919  * @param interleaved 45 elements holding the interleaved image data
920  * @return Resulting sums
921  */
922  static inline __m128i sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved);
923 
924  /**
925  * Loads the lower 64 bit of a 128i value from the memory.
926  * The upper 64 bit are zeroed.
927  * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 8 bytes
928  * @return Resulting value
929  */
930  static inline __m128i load128iLower64(const void* const buffer);
931 
932  /**
933  * Loads a 128i value from the memory.
934  * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 16 bytes
935  * @return Resulting value
936  */
937  static inline __m128i load128i(const void* const buffer);
938 
939  /**
940  * Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes, to a 128i value and sets the remaining bytes of the resulting 128i value to zero.
941  * The loaded memory will be stored in the upper 10 bytes of the 128i value while the lowest remaining 6 bytes will be set to zero.
942  * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [09 08 07 06 05 04 03 02 01 00 ZZ ZZ ZZ ZZ ZZ ZZ], with ZZ meaning zero.<br>
943  * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
944  * @return Resulting 128 bit value
945  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 10 bytes
946  */
947  template <bool tBufferHas16Bytes>
948  static inline __m128i load_u8_10_upper_zero(const uint8_t* const buffer);
949 
950  /**
951  * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
952  * The loaded memory will be stored in the upper 15 bytes of the 128i value while the lowest remaining 1 byte will be set to zero.
953  * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 ZZ], with ZZ meaning zero.<br>
954  * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
955  * @return Resulting 128 bit value
956  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
957  */
958  template <bool tBufferHas16Bytes>
959  static inline __m128i load_u8_15_upper_zero(const uint8_t* const buffer);
960 
961  /**
962  * Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes, to a 128i value while the remaining byte of the resulting 128i value will be random.
963  * The loaded memory will be stored in the lower 13 bytes of the 128i value while the highest remaining 3 byte will be random.<br>
964  * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? ?? ?? 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
965  * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
966  * @return Resulting 128 bit value
967  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 13 bytes
968  */
969  template <bool tBufferHas16Bytes>
970  static inline __m128i load_u8_13_lower_random(const uint8_t* const buffer);
971 
972  /**
973  * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
974  * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be set to zero.<br>
975  * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [ZZ 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ZZ meaning zero.<br>
976  * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
977  * @return Resulting 128 bit value
978  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
979  */
980  template <bool tBufferHas16Bytes>
981  static inline __m128i load_u8_15_lower_zero(const uint8_t* const buffer);
982 
983  /**
984  * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value while the remaining byte of the resulting 128i value will be random.
985  * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be random.<br>
986  * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
987  * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
988  * @return Resulting 128 bit value
989  * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
990  */
991  template <bool tBufferHas16Bytes>
992  static inline __m128i load_u8_15_lower_random(const uint8_t* const buffer);
993 
994  /**
995  * Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified number of bytes to the right (by inserting zeros).
996  * This function can be used if the remaining buffer is smaller than 16 bytes while the buffer exceeds/continues in the lower address space (from the original point of interest).<br>
997  * Thus, this function can handle a buffer with the following pattern (with lower address left and high address right):<br>
998  * | ?? ?? ?? ?? ?? ?? ?? ?? ?? V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 |, where ?? represent random values in our buffer (in the lower address space), and VX represent the values of interest and V0 the location to which 'buffer' is pointing to.<br>
999  * by load_u8_16_and_shift_right<6>(buffer - 6);<br>
1000  * The resulting 128i register will then be composed of (high bits left, low bits right): [00 00 00 00 00 00 V9 V8 V7 V6 V5 V4 V3 V2 V1 V0].
1001  * @param buffer The actual address from which the 16 bytes will be loaded, must be valid and must be at least 16 bytes large
1002  * @return The resulting 128 bit value
1003  * @tparam tShiftBytes The number of bytes which will be shifted (to the right) after the memory has loaded, with range [0, 16]
1004  */
1005  template <unsigned int tShiftBytes>
1006  static inline __m128i load_u8_16_and_shift_right(const uint8_t* const buffer);
1007 
1008  /**
1009  * Stores a 128i value to the memory.
1010  * @param value Value to be stored
1011  * @param buffer Buffer receiving the value (does not need to be aligned on any particular boundary)
1012  */
1013  static inline void store128i(const __m128i& value, uint8_t* const buffer);
1014 
1015  /**
1016  * Sets a 128i value by two 64 bit values.
1017  * @param high64 High 64 bits to be set
1018  * @param low64 Low 64 bits to be set
1019  * @return Resulting 128i value
1020  */
1021  static inline __m128i set128i(const unsigned long long high64, const unsigned long long low64);
1022 
1023  /**
1024  * Removes the higher 16 bits of four 32 bit elements.
1025  * Given: PONM-LKJI-HGFE-DCBA<br>
1026  * Result: 00NM-00JI-00FE-00BA<br>
1027  * @param value Value to remove the high bits for
1028  * @return Result
1029  */
1030  static inline __m128i removeHighBits32_16(const __m128i& value);
1031 
1032  /**
1033  * Removes the lower 16 bits of four 32 bit elements.
1034  * Given: PONM-LKJI-HGFE-DCBA<br>
1035  * Result: PO00-LK00-HG00-DC00<br>
1036  * @param value Value to remove the lower bits for
1037  * @return Result
1038  */
1039  static inline __m128i removeLowBits32_16(const __m128i& value);
1040 
1041  /**
1042  * Removes the higher 8 bits of eight 16 bit elements.
1043  * Given: PONM-LKJI-HGFE-DCBA<br>
1044  * Result: 0O0M-0K0I-0G0E-0C0A<br>
1045  * @param value Value to remove the high bits for
1046  * @return Result
1047  */
1048  static inline __m128i removeHighBits16_8(const __m128i& value);
1049 
1050  /**
1051  * Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
1052  * Given: PONM-LKJI-HGFE-DCBA<br>
1053  * Result: 000M-0K0I-0G0E-0C0A<br>
1054  * @param value Value to remove the high bits for
1055  * @return Result
1056  */
1057  static inline __m128i removeHighBits16_8_7_lower(const __m128i& value);
1058 
1059  /**
1060  * Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
1061  * Given: PONM-LKJI-HGFE-DCBA<br>
1062  * Result: 0O0M-0K0I-0G0E-0C00<br>
1063  * @param value Value to remove the high bits for
1064  * @return Result
1065  */
1066  static inline __m128i removeHighBits16_8_7_upper(const __m128i& value);
1067 
1068  /**
1069  * Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1070  * Given: PONM-LKJI-HGFE-DCBA<br>
1071  * Result: 0000-0000-OMKI-GECA<br>
1072  * @param value Value to remove the high bits for
1073  * @return Result
1074  */
1075  static inline __m128i moveLowBits16_8ToLow64(const __m128i& value);
1076 
1077  /**
1078  * Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0.
1079  * Given: PONM-LKJI-HGFE-DCBA<br>
1080  * Result: 0000-0000-0000-MIEA<br>
1081  * @param value Value to remove the high bits for
1082  * @return Result
1083  */
1084  static inline __m128i moveLowBits32_8ToLow32(const __m128i& value);
1085 
1086  /**
1087  * Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1088  * Given: PONM-LKJI-HGFE-DCBA<br>
1089  * Result: 0000-0000-NMJI-FEBA<br>
1090  * @param value Value to remove the high bits for
1091  * @return Result
1092  */
1093  static inline __m128i moveLowBits32_16ToLow64(const __m128i& value);
1094 
1095  /**
1096  * Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with 0.
1097  * Given: PONM-LKJI-HGFE-DCBA<br>
1098  * Result: OMKI-GECA-0000-0000<br>
1099  * @param value Value to remove the high bits for
1100  * @return Result
1101  */
1102  static inline __m128i moveLowBits16_8ToHigh64(const __m128i& value);
1103 
1104  /**
1105  * Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
1106  * Given: PONM-LKJI-HGFE-DCBA<br>
1107  * Result: 00PO-00LK-00HG-00DC<br>
1108  * @param value Value to remove the high bits for
1109  * @return Result
1110  */
1111  static inline __m128i moveHighBits32_16(const __m128i& value);
1112 
1113  /**
1114  * Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
1115  * Given: PONM-LKJI-HGFE-DCBA<br>
1116  * Result: 0P0N-0L0J-0H0F-0D0B<br>
1117  * @param value Value to remove the high bits for
1118  * @return Result
1119  */
1120  static inline __m128i moveHighBits16_8(const __m128i& value);
1121 
1122  /**
1123  * Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
1124  * Given: PONM-LKJI-HGFE-DCBA<br>
1125  * Result: 0000-000J-0H0F-0D0B<br>
1126  * @param value Value to remove the high bits for
1127  * @return Result
1128  */
1129  static inline __m128i moveHighBits16_8_5(const __m128i& value);
1130 
1131  /**
1132  * Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
1133  * Given: PONM-LKJI-HGFE-DCBA<br>
1134  * Result: 0000-0L0J-0H0F-0D0B<br>
1135  * @param value Value to remove the high bits for
1136  * @return Result
1137  */
1138  static inline __m128i moveHighBits16_8_6(const __m128i& value);
1139 
1140  /**
1141  * Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
1142  * Given: PONM-LKJI-HGFE-DCBA<br>
1143  * Result: 000N-0L0J-0H0F-0D0B<br>
1144  * @param value Value to remove the high bits for
1145  * @return Result
1146  */
1147  static inline __m128i moveHighBits16_8_7(const __m128i& value);
1148 
1149  /**
1150  * Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
1151  * Given: PONM-LKJI-HGFE-DCBA<br>
1152  * Result: 000D-000C-000B-000A<br>
1153  * @param value Value to be shuffled
1154  * @return Result
1155  */
1156  static inline __m128i shuffleLow32ToLow32_8(const __m128i& value);
1157 
1158  /**
1159  * Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1160  * Given: PONM-LKJI-HGFE-DCBA<br>
1161  * Result: 0H0D-0G0C-0F0B-0E0A<br>
1162  * @param value Value to be shuffled
1163  * @return Result
1164  */
1165  static inline __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value);
1166 
1167  /**
1168  * Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1169  * Given: PONM-LKJI-HGFE-DCBA<br>
1170  * Result: 0P0L-0O0K-0N0J-0M0I<br>
1171  * @param value Value to be shuffled
1172  * @return Result
1173  */
1174  static inline __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i& value);
1175 
1176  /**
1177  * Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1178  * @param value Value to be shuffled
1179  * @return Result
1180  */
1181  static inline __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value);
1182 
1183  /**
1184  * Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1185  * @param value Value to be shuffled
1186  * @return Result
1187  */
1188  static inline __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i& value);
1189 
1190  /**
1191  * Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
1192  * @return Bitmask
1193  */
1194  static inline __m128i bitMaskRemoveHigh16_8();
1195 
1196  /**
1197  * Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
1198  * @return Bitmask
1199  */
1200  static inline __m128i bitMaskRemoveHigh32_16();
1201 
1202  /**
1203  * Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
1204  * The pseudo code of the function is as follows:
1205  * <pre>
1206  * products0[0] = values0[0] * values1[0]
1207  * ...
1208  * products0[3] = values0[3] * values1[3]
1209  *
1210  * products1[0] = values0[4] * values1[4]
1211  * ...
1212  * products1[3] = values0[7] * values1[7]
1213  * </pre>
1214  * @param values0 The first 8 int16_t values to be multiplied
1215  * @param values1 The second 8 int16_t values to be multiplied
1216  * @param products0 The resulting first 4 int32_t products
1217  * @param products1 The resulting second 4 int32_t products
1218  */
1219  static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1);
1220 
1221  /**
1222  * Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
1223  * The pseudo code of the function is as follows:
1224  * <pre>
1225  * results0[0] += values0[0] * values1[0]
1226  * ...
1227  * results0[3] += values0[3] * values1[3]
1228  *
1229  * results1[0] += values0[4] * values1[4]
1230  * ...
1231  * results1[3] += values0[7] * values1[7]
1232  * </pre>
1233  * @param values0 The first 8 int16_t values to be multiplied
1234  * @param values1 The second 8 int16_t values to be multiplied
1235  * @param results0 The results to which the first 4 int32_t products will be added
1236  * @param results1 The results to which the second 4 int32_t products will be added
1237  */
1238  static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1);
1239 
1240  private:
1241 
1242  /**
1243  * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
1244  * @param pixel Upper left pixel in the frame
1245  * @param size Size of one frame row in bytes
1246  * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
1247  * @param fxy_ Product of the fx and the inverse fy interpolation factor
1248  * @param fx_y Product of the inverse fx and the fy interpolation factor
1249  * @param fxy Product of the fx and the fy interpolation factor
1250  * @return Interpolated pixel values
1251  */
1252  static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
1253 };
1254 
1255 inline void SSE::prefetchT0(const void* const data)
1256 {
1257  _mm_prefetch((char*)data, _MM_HINT_T0);
1258 }
1259 
1260 inline void SSE::prefetchT1(const void* const data)
1261 {
1262  _mm_prefetch((char*)data, _MM_HINT_T1);
1263 }
1264 
1265 inline void SSE::prefetchT2(const void* const data)
1266 {
1267  _mm_prefetch((char*)data, _MM_HINT_T2);
1268 }
1269 
1270 inline void SSE::prefetchNTA(const void* const data)
1271 {
1272  _mm_prefetch((char*)data, _MM_HINT_NTA);
1273 }
1274 
1275 template <unsigned int tIndex>
1276 inline uint8_t SSE::value_u8(const __m128i& value)
1277 {
1278  static_assert(tIndex <= 15u, "Invalid index!");
1279 
1280 #ifdef OCEAN_COMPILER_MSC
1281  return value.m128i_u8[tIndex];
1282 #else
1283  return ((const M128i*)(&value))->m128i_u8[tIndex];
1284 #endif
1285 }
1286 
1287 inline uint8_t SSE::value_u8(const __m128i& value, const unsigned int index)
1288 {
1289  ocean_assert(index <= 15u);
1290 
1291 #ifdef OCEAN_COMPILER_MSC
1292  return value.m128i_u8[index];
1293 #else
1294  return ((const M128i*)(&value))->m128i_u8[index];
1295 #endif
1296 }
1297 
1298 template <unsigned int tIndex>
1299 inline uint16_t SSE::value_u16(const __m128i& value)
1300 {
1301  static_assert(tIndex <= 7u, "Invalid index!");
1302 
1303 #ifdef OCEAN_COMPILER_MSC
1304  return value.m128i_u16[tIndex];
1305 #else
1306  return ((const M128i*)(&value))->m128i_u16[tIndex];
1307 #endif
1308 }
1309 
1310 template <unsigned int tIndex>
1311 inline unsigned int SSE::value_u32(const __m128i& value)
1312 {
1313  static_assert(tIndex <= 3u, "Invalid index!");
1314 
1315 #ifdef OCEAN_COMPILER_MSC
1316  return value.m128i_u32[tIndex];
1317 #else
1318  return ((const M128i*)(&value))->m128i_u32[tIndex];
1319 #endif
1320 }
1321 
1322 OCEAN_FORCE_INLINE unsigned int SSE::sum_u32_4(const __m128i& value)
1323 {
1324 #ifdef OCEAN_COMPILER_MSC
1325  return value.m128i_u32[0] + value.m128i_u32[1] + value.m128i_u32[2] + value.m128i_u32[3];
1326 #else
1327  return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1] + ((const M128i*)(&value))->m128i_u32[2] + ((const M128i*)(&value))->m128i_u32[3];
1328 #endif
1329 }
1330 
1331 inline unsigned int SSE::sum_u32_first_2(const __m128i& value)
1332 {
1333 #ifdef OCEAN_COMPILER_MSC
1334  return value.m128i_u32[0] + value.m128i_u32[1];
1335 #else
1336  return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1];
1337 #endif
1338 }
1339 
1340 inline unsigned int SSE::sum_u32_first_third(const __m128i& value)
1341 {
1342 #ifdef OCEAN_COMPILER_MSC
1343  return value.m128i_u32[0] + value.m128i_u32[2];
1344 #else
1345  return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[2];
1346 #endif
1347 }
1348 
1349 OCEAN_FORCE_INLINE float SSE::sum_f32_4(const __m128& value)
1350 {
1351 #ifdef OCEAN_COMPILER_MSC
1352  return value.m128_f32[0] + value.m128_f32[1] + value.m128_f32[2] + value.m128_f32[3];
1353 #else
1354  return ((const M128*)(&value))->m128_f32[0] + ((const M128*)(&value))->m128_f32[1] + ((const M128*)(&value))->m128_f32[2] + ((const M128*)(&value))->m128_f32[3];
1355 #endif
1356 }
1357 
1358 OCEAN_FORCE_INLINE double SSE::sum_f64_2(const __m128d& value)
1359 {
1360 #ifdef OCEAN_COMPILER_MSC
1361  return value.m128d_f64[0] + value.m128d_f64[1];
1362 #else
1363  return ((const M128d*)(&value))->m128d_f64[0] + ((const M128d*)(&value))->m128d_f64[1];
1364 #endif
1365 }
1366 
1367 inline __m128i SSE::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1368 {
1369  ocean_assert(image0 && image1);
1370 
1371  return SSE::sumSquareDifference8Bit16Elements(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1372 }
1373 
1374 inline __m128i SSE::sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1375 {
1376  ocean_assert(image0 && image1);
1377 
1378  return _mm_sad_epu8(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1379 }
1380 
1381 inline __m128i SSE::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
1382 {
1383  ocean_assert(image0 && image1);
1384 
1385  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1386  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1387 
1388  // subtract the 16 elements (usage of saturation and bitwise or operator)
1389  const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1390 
1391  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1392 
1393  const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00AA008ull, 0xA006A004A002A000ull));
1394  const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1395 
1396  // square the 16 elements
1397  const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1398  const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1399 
1400  // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1401  const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1402  const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1403 
1404  // 4 32 bit square difference values
1405  return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1406 }
1407 
1408 inline __m128i SSE::sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
1409 {
1410  ocean_assert(image0 && image1);
1411 
1412  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1413  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1414 
1415  // subtract the 16 elements (usage of saturation and bitwise or operator)
1416  const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1417 
1418  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1419 
1420  const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1421  const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00FA00Dull, 0xA00BA009A007A005ull));
1422 
1423  // square the 16 elements
1424  const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1425  const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1426 
1427  // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1428  const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1429  const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1430 
1431  // 4 32 bit square difference values
1432  return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1433 }
1434 
1435 template <bool tBufferHas16Bytes>
1436 inline __m128i SSE::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
1437 {
1438  ocean_assert(image0 && image1);
1439 
1440  const __m128i row0 = load_u8_13_lower_random<tBufferHas16Bytes>(image0);
1441  const __m128i row1 = load_u8_13_lower_random<tBufferHas16Bytes>(image1);
1442 
1443  // subtract the 16 elements (usage of saturation and bitwise or operator)
1444  const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1445 
1446  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1447 
1448  const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00CA00AA008ull, 0xA006A004A002A000ull));
1449  const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1450 
1451  // square the 16 elements
1452  const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1453  const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1454 
1455  // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1456  const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1457  const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1458 
1459  // 4 32 bit square difference values
1460  return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1461 }
1462 
1463 inline __m128i SSE::sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
1464 {
1465  ocean_assert(image0 && image1);
1466 
1467  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1468  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1469 
1470  // subtract the 16 elements (usage of saturation and bitwise or operator)
1471  const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1472 
1473  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1474 
1475  const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00FA00DA00Bull, 0xA009A007A005A003ull));
1476  const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1477 
1478  // square the 16 elements
1479  const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1480  const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1481 
1482  // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1483  const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1484  const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1485 
1486  // 4 32 bit square difference values
1487  return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1488 }
1489 
1490 template <bool tBufferHas16Bytes>
1491 inline __m128i SSE::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
1492 {
1493  ocean_assert(image0 && image1);
1494 
1495  const __m128i row0 = load_u8_15_lower_random<tBufferHas16Bytes>(image0);
1496  const __m128i row1 = load_u8_15_lower_random<tBufferHas16Bytes>(image1);
1497 
1498  // subtract the 16 elements (usage of saturation and bitwise or operator)
1499  const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1500 
1501  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1502  const __m128i subtractLow = removeHighBits16_8(subtract);
1503  const __m128i subtractHigh = moveHighBits16_8_7(subtract); // the highest high 8 bit are not used due to the only 15 elements
1504 
1505  // square the 16 elements
1506  const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1507  const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1508 
1509  // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1510  const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1511  const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1512 
1513  // 4 32 bit square difference values
1514  return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1515 }
1516 
1517 template <bool tBufferHas16Bytes>
1518 inline __m128i SSE::sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
1519 {
1520  ocean_assert(image0 && image1);
1521 
1522  return _mm_sad_epu8(load_u8_10_upper_zero<tBufferHas16Bytes>(image0), load_u8_10_upper_zero<tBufferHas16Bytes>(image1));
1523 }
1524 
1525 template <bool tBufferHas16Bytes>
1526 inline __m128i SSE::sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
1527 {
1528  ocean_assert(image0 && image1);
1529 
1530  return _mm_sad_epu8(load_u8_15_upper_zero<tBufferHas16Bytes>(image0), load_u8_15_upper_zero<tBufferHas16Bytes>(image1));
1531 }
1532 
1533 inline __m128i SSE::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1534 {
1535  ocean_assert(image0 && image1);
1536 
1537  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1538  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1539 
1540  return sumSquareDifference8Bit16Elements(row0, row1);
1541 }
1542 
1543 inline __m128i SSE::sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1544 {
1545  ocean_assert(image0 && image1);
1546 
1547  return _mm_sad_epu8(SSE::load128i(image0), SSE::load128i(image1));
1548 }
1549 
1550 inline __m128i SSE::sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1)
1551 {
1552  ocean_assert(image0 && image1);
1553  ocean_assert((unsigned long long)image0 % 16ll == 0ll);
1554  ocean_assert((unsigned long long)image1 % 16ll == 0ll);
1555 
1556  const __m128i row0 = _mm_load_si128((__m128i*)image0);
1557  const __m128i row1 = _mm_load_si128((__m128i*)image1);
1558 
1559  return sumSquareDifference8Bit16Elements(row0, row1);
1560 }
1561 
1562 inline __m128i SSE::sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1)
1563 {
1564  // subtract the 16 elements (usage of saturation and bitwise or operator)
1565  const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1566 
1567  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1568  const __m128i subtractLow = removeHighBits16_8(subtract);
1569  const __m128i subtractHigh = moveHighBits16_8(subtract);
1570 
1571  // square the 16 elements
1572  const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1573  const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1574 
1575  // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1576  const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1577  const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1578 
1579  // 4 32 bit square difference values
1580  return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1581 }
1582 
1583 inline __m128i SSE::interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
1584 {
1585  // F E D C B A 9 8 7 6 5 4 3 2 1 0
1586  // values0: aF yE | yD yC | yB yA | y9 y8 | y7 y6 | y5 y4 | y3 y2 | y1 y0
1587  // values1: aF' yE' | yD' yC' | yB' yA' | y9' y8' | y7' y6' | y5' y4' | y3' y2' | y1' y0'
1588 
1589  // shuffled elements
1590  // row0: y7 y6 y5 y4 y3 y2 y1 y0 | * fx_ * fy_
1591  // row1: y8 y7 y6 y5 y4 y3 y2 y1 | * fx * fy_
1592  // row2: y7' y6' y5' y4' y3' y2' y1' y0' | * fx_ * fy
1593  // row3: y8' y7' y6' y5' y4' y3' y2' y1' | * fx * fy
1594 
1595 #ifdef OCEAN_COMPILER_MSC
1596 
1597  ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1598  ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1599  ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1600  ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1601  ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1602  ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1603  ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1604 
1605  ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1606  ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1607  ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1608  ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1609  ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1610  ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1611  ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1612 
1613  ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1614  ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1615  ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1616  ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1617  ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1618  ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1619  ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1620 
1621  ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1622  ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1623  ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1624  ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1625  ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1626  ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1627  ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1628 
1629  ocean_assert(fx_fy_.m128i_u16[0] + fxfy_.m128i_u16[0] + fx_fy.m128i_u16[0] + fxfy.m128i_u16[0] == 128u * 128u);
1630 
1631 #else
1632 
1633 #ifdef OCEAN_DEBUG
1634 
1635  const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
1636  const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
1637  const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
1638  const M128i& debug_fxfy = *(const M128i*)(&fxfy);
1639 
1640 #endif // OCEAN_DEBUG
1641 
1642  ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
1643  ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
1644  ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
1645  ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
1646  ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
1647  ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
1648  ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
1649 
1650  ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
1651  ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
1652  ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
1653  ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
1654  ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
1655  ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
1656  ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
1657 
1658  ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
1659  ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
1660  ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
1661  ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
1662  ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
1663  ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
1664  ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
1665 
1666  ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
1667  ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
1668  ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
1669  ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
1670  ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
1671  ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
1672  ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
1673 
1674  ocean_assert(debug_fx_fy_.m128i_u16[0] + debug_fxfy_.m128i_u16[0] + debug_fx_fy.m128i_u16[0] + debug_fxfy.m128i_u16[0] == 128u * 128u);
1675 
1676 #endif
1677 
1678  __m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
1679 
1680  // row0
1681  __m128i row = _mm_shuffle_epi8(values0, shuffle);
1682 
1683  __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
1684  __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
1685 
1686  __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
1687  __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
1688 
1689  // row2
1690  row = _mm_shuffle_epi8(values1, shuffle);
1691 
1692  multiLow = _mm_mullo_epi16(row, fx_fy);
1693  multiHigh = _mm_mulhi_epu16(row, fx_fy);
1694 
1695  resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1696  resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1697 
1698 
1699 
1700  shuffle = set128i(0xA008A007A006A005ull, 0xA004A003A002A001ull);
1701 
1702  // row1
1703  row = _mm_shuffle_epi8(values0, shuffle);
1704 
1705  multiLow = _mm_mullo_epi16(row, fxfy_);
1706  multiHigh = _mm_mulhi_epu16(row, fxfy_);
1707 
1708  resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1709  resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1710 
1711 
1712  // row4
1713  row = _mm_shuffle_epi8(values1, shuffle);
1714 
1715  multiLow = _mm_mullo_epi16(row, fxfy);
1716  multiHigh = _mm_mulhi_epu16(row, fxfy);
1717 
1718  resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1719  resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1720 
1721 
1722  // normalization ( + 128 * 128 / 2) / (128 * 128)
1723  resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
1724  resultEven = _mm_srli_epi32(resultEven, 14);
1725 
1726  resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
1727  resultOdd = _mm_srli_epi32(resultOdd, 14);
1728 
1729  // stack the 2 four 32 bit values together to eight 8 bit values
1730  return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
1731 }
1732 
1733 inline __m128i SSE::interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
1734 {
1735  // F E D C B A 9 8 7 6 5 4 3 2 1 0
1736  // values0: a7 y7 | a6 y6 | a5 y5 | a4 y4 | a3 y3 | a2 y2 | a1 y1 | a0 y0
1737  // values1: a7' y7' | a6' y6' | a5' y5' | a4' y4' | a3' y3' | a2' y2' | a1' y1' | a0' y0'
1738 
1739  // shuffled elements
1740  // row0: a3 y3 a2 y2 a1 y1 a0 y0 | * fx_ * fy_
1741  // row1: a4 y4 a3 y3 a2 y2 a1 y1 | * fx * fy_
1742  // row2: a3' y3' a2' y2' a1' y1' a0' y0' | * fx_ * fy
1743  // row3: a4' y4' a3' y3' a2' y2' a1' y1' | * fx * fy
1744 
1745 #ifdef OCEAN_COMPILER_MSC
1746 
1747  ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1748  ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1749  ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1750  ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1751  ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1752  ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1753  ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1754 
1755  ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1756  ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1757  ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1758  ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1759  ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1760  ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1761  ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1762 
1763  ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1764  ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1765  ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1766  ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1767  ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1768  ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1769  ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1770 
1771  ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1772  ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1773  ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1774  ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1775  ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1776  ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1777  ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1778 
1779 #else
1780 
1781 #ifdef OCEAN_DEBUG
1782 
1783  const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
1784  const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
1785  const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
1786  const M128i& debug_fxfy = *(const M128i*)(&fxfy);
1787 
1788 #endif // OCEAN_DEBUG
1789 
1790  ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
1791  ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
1792  ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
1793  ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
1794  ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
1795  ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
1796  ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
1797 
1798  ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
1799  ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
1800  ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
1801  ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
1802  ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
1803  ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
1804  ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
1805 
1806  ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
1807  ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
1808  ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
1809  ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
1810  ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
1811  ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
1812  ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
1813 
1814  ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
1815  ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
1816  ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
1817  ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
1818  ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
1819  ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
1820  ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
1821 
1822 #endif
1823 
1824  __m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
1825 
1826  // row0
1827  __m128i row = _mm_shuffle_epi8(values0, shuffle);
1828 
1829  __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
1830  __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
1831 
1832  __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
1833  __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
1834 
1835  // row2
1836  row = _mm_shuffle_epi8(values1, shuffle);
1837 
1838  multiLow = _mm_mullo_epi16(row, fx_fy);
1839  multiHigh = _mm_mulhi_epu16(row, fx_fy);
1840 
1841  resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1842  resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1843 
1844 
1845 
1846  shuffle = set128i(0xA009A008A007A006ull, 0xA005A004A003A002ull);
1847 
1848  // row1
1849  row = _mm_shuffle_epi8(values0, shuffle);
1850 
1851  multiLow = _mm_mullo_epi16(row, fxfy_);
1852  multiHigh = _mm_mulhi_epu16(row, fxfy_);
1853 
1854  resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1855  resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1856 
1857 
1858  // row4
1859  row = _mm_shuffle_epi8(values1, shuffle);
1860 
1861  multiLow = _mm_mullo_epi16(row, fxfy);
1862  multiHigh = _mm_mulhi_epu16(row, fxfy);
1863 
1864  resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1865  resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1866 
1867 
1868  // normalization ( + 128 * 128 / 2) / (128 * 128)
1869  resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
1870  resultEven = _mm_srli_epi32(resultEven, 14);
1871 
1872  resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
1873  resultOdd = _mm_srli_epi32(resultOdd, 14);
1874 
1875  // stack the 2 four 32 bit values together to eight 8 bit values
1876  return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
1877 }
1878 
inline __m128i SSE::interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bi-linear interpolation of 8 successive 8-bit channel values of a frame with 3 channels (e.g., RGB24).
	// values0 provides 16 bytes of the top row, values1 the 16 bytes directly below.
	// Each weight register must hold one 16-bit weight replicated in all eight lanes
	// (verified by the asserts below); the final normalization divides by 128 * 128,
	// so the four weights are presumably 7-bit fixed-point fractions -- TODO confirm with callers.
	//
	//             F  E  D  C  B  A  9  8  7  6  5  4  3  2  1  0
	// values0:   r5 | b4 g4 r4 | b3 g3 r3 | b2 g2 r2 | b1 g1 r1 | b0 g0 r0
	// values1:   r5'| b4' g4' r4'| b3' g3' r3'| b2' g2' r2'| b1' g1' r1'| b0' g0' r0'

	// shuffled elements
	// row0: g2  r2  b1  g1  r1  b0  g0  r0  | * fx_ * fy_   (bytes 0..7, the 8 "left" channel values)
	// row1: g3  r3  b2  g2  r2  b1  g1  r1  | * fx  * fy_   (bytes 3..10, same channels one pixel = 3 bytes further right)
	// row2: g2' r2' b1' g1' r1' b0' g0' r0' | * fx_ * fy
	// row3: g3' r3' b2' g2' r2' b1' g1' r1' | * fx  * fy

#ifdef OCEAN_COMPILER_MSC

	// each weight register must contain the identical value in all eight 16-bit lanes
	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers do not expose named members on __m128i; reinterpret via the M128i wrapper for the debug checks only
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// gather bytes 0..7 as zero-extended 16-bit lanes (mask byte 0xA0 has the high bit set and clears the lane's upper byte)
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// 16 x 16 -> 32 bit products, split into low and high 16-bit halves
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// re-assemble the full 32-bit products of the even lanes (0, 2, 4, 6) and the odd lanes (1, 3, 5, 7)
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2: the same 8 channel values of the bottom row
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// gather bytes 3..10, i.e. the same channels of the right neighboring pixels (one pixel = 3 bytes further right)
	shuffle = set128i(0xA00AA009A008A007ull, 0xA006A005A004A003ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3: the right neighbors of the bottom row
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2024 
2025 inline __m128i SSE::interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2026 {
2027  __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2028  __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2029 
2030  __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2031  __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2032 
2033  __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2034  __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2035 
2036  __m128i row0_d = _mm_shuffle_epi8(values0, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2037  __m128i row1_d = _mm_shuffle_epi8(values1, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2038 
2039  row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2040  row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2041  row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2042  row0_d = _mm_madd_epi16(row0_d, fx_fy_fxfy_);
2043 
2044  row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2045  row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2046  row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2047  row1_d = _mm_madd_epi16(row1_d, fx_fyfxfy);
2048 
2049  const __m128i rounding = _mm_set1_epi32(8192);
2050 
2051  __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2052  __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2053  __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2054  __m128i row_d = _mm_add_epi32(row0_d, row1_d);
2055 
2056  row_a = _mm_add_epi32(row_a, rounding);
2057  row_b = _mm_add_epi32(row_b, rounding);
2058  row_c = _mm_add_epi32(row_c, rounding);
2059  row_d = _mm_add_epi32(row_d, rounding);
2060 
2061  row_a = _mm_srli_epi32(row_a, 14);
2062  row_b = _mm_srli_epi32(row_b, 14);
2063  row_c = _mm_srli_epi32(row_c, 14);
2064  row_d = _mm_srli_epi32(row_d, 14);
2065 
2066  row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF0c080400ull));
2067  row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFFFFFFFFull, 0x0c080400FFFFFFFFull));
2068  row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0c080400ull, 0xFFFFFFFFFFFFFFFFull));
2069  row_d = _mm_shuffle_epi8(row_d, set128i(0xFF080400FFFFFFFFull, 0xFFFFFFFFFFFFFFFFull));
2070 
2071  row_a = _mm_or_si128(row_a, row_b);
2072  row_c = _mm_or_si128(row_c, row_d);
2073 
2074  return _mm_or_si128(row_a, row_c);
2075 }
2076 
2077 inline __m128i SSE::interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2078 {
2079  __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2080  __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2081 
2082  __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2083  __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2084 
2085  __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2086  __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2087 
2088  row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2089  row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2090  row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2091 
2092  row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2093  row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2094  row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2095 
2096  const __m128i rounding = _mm_set1_epi32(8192);
2097 
2098  __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2099  __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2100  __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2101 
2102  row_a = _mm_add_epi32(row_a, rounding);
2103  row_b = _mm_add_epi32(row_b, rounding);
2104  row_c = _mm_add_epi32(row_c, rounding);
2105 
2106  row_a = _mm_srli_epi32(row_a, 14);
2107  row_b = _mm_srli_epi32(row_b, 14);
2108  row_c = _mm_srli_epi32(row_c, 14);
2109 
2110  row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
2111  row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
2112  row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
2113 
2114  return _mm_or_si128(row_a, _mm_or_si128(row_b, row_c));
2115 }
2116 
inline __m128i SSE::interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bi-linear interpolation of 8 successive 8-bit channel values of a frame with 4 channels (e.g., RGBA32).
	// values0 provides 16 bytes of the top row, values1 the 16 bytes directly below.
	// Each weight register must hold one 16-bit weight replicated in all eight lanes
	// (verified by the asserts below); the final normalization divides by 128 * 128,
	// so the four weights are presumably 7-bit fixed-point fractions -- TODO confirm with callers.
	//
	//             F  E  D  C     B  A  9  8     7  6  5  4     3  2  1  0
	// values0:   a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
	// values1:   a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'

	// shuffled elements
	// row0: a1  b1  g1  r1  a0  b0  g0  r0  | * fx_ * fy_   (bytes 0..7, the 8 "left" channel values)
	// row1: a2  b2  g2  r2  a1  b1  g1  r1  | * fx  * fy_   (bytes 4..11, same channels one pixel = 4 bytes further right)
	// row2: a1' b1' g1' r1' a0' b0' g0' r0' | * fx_ * fy
	// row3: a2' b2' g2' r2' a1' b1' g1' r1' | * fx  * fy

#ifdef OCEAN_COMPILER_MSC

	// each weight register must contain the identical value in all eight 16-bit lanes
	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers do not expose named members on __m128i; reinterpret via the M128i wrapper for the debug checks only
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// gather bytes 0..7 as zero-extended 16-bit lanes (mask byte 0xA0 has the high bit set and clears the lane's upper byte)
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// 16 x 16 -> 32 bit products, split into low and high 16-bit halves
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// re-assemble the full 32-bit products of the even lanes (0, 2, 4, 6) and the odd lanes (1, 3, 5, 7)
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2: the same 8 channel values of the bottom row
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// gather bytes 4..11, i.e. the same channels of the right neighboring pixels (one pixel = 4 bytes further right)
	shuffle = set128i(0xA00BA00AA009A008ull, 0xA007A006A005A004ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3: the right neighbors of the bottom row
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2262 
2263 
inline __m128i SSE::interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bi-linear interpolation of 2 x 4 8-bit channel values (two independent pixels
	// of a 4-channel frame, e.g. RGBA32): pixel 0 is interpolated from pixels 0/1,
	// pixel 2 from pixels 2/3. values0 provides 16 bytes of the top row, values1 the
	// 16 bytes directly below. Each weight register must hold one 16-bit weight
	// replicated in all eight lanes (verified by the asserts below); the final
	// normalization divides by 128 * 128, so the four weights are presumably
	// 7-bit fixed-point fractions -- TODO confirm with callers.
	//
	//             F  E  D  C     B  A  9  8     7  6  5  4     3  2  1  0
	// values0:   a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
	// values1:   a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'

	// shuffled elements
	// row0: a2  b2  g2  r2  a0  b0  g0  r0  | * fx_ * fy_   (bytes 0..3 and 8..11, the "left" pixels 0 and 2)
	// row1: a3  b3  g3  r3  a1  b1  g1  r1  | * fx  * fy_   (bytes 4..7 and 12..15, their right neighbors 1 and 3)
	// row2: a2' b2' g2' r2' a0' b0' g0' r0' | * fx_ * fy
	// row3: a3' b3' g3' r3' a1' b1' g1' r1' | * fx  * fy

#ifdef OCEAN_COMPILER_MSC

	// each weight register must contain the identical value in all eight 16-bit lanes
	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers do not expose named members on __m128i; reinterpret via the M128i wrapper for the debug checks only
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// gather bytes 0..3 and 8..11 (pixels 0 and 2) as zero-extended 16-bit lanes
	// (mask byte 0xA0 has the high bit set and clears the lane's upper byte)
	__m128i shuffle = set128i(0xA00BA00AA009A008ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// 16 x 16 -> 32 bit products, split into low and high 16-bit halves
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// re-assemble the full 32-bit products of the even lanes (0, 2, 4, 6) and the odd lanes (1, 3, 5, 7)
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2: the same 8 channel values of the bottom row
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// gather bytes 4..7 and 12..15, i.e. the right neighboring pixels 1 and 3
	shuffle = set128i(0xA00FA00EA00DA00Cull, 0xA007A006A005A004ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3: the right neighbors of the bottom row
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2409 
2410 inline void SSE::average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result)
2411 {
2412  ocean_assert(image0 && image1);
2413 
2414  // 4 * float = m128, input does not need to be aligned on any particular boundary.
2415  const __m128 row0 = _mm_loadu_ps(image0);
2416  const __m128 row1 = _mm_loadu_ps(image1);
2417 
2418  // get sum of first 4 elements
2419  const __m128 sumFirst = _mm_add_ps(row0, row1);
2420 
2421  // load next 4 elements
2422  const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2423  const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2424 
2425  // get sum of second 4 elements
2426  const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2427 
2428  // get sum of adjacent summed pixels
2429  const __m128 sumAdjacent = _mm_hadd_ps(sumFirst, sumSecond);
2430 
2431  /* following variant is exactly as fast as _mm_hadd_ps(,) ~ 0.30ms / 100,000 iteration
2432  const unsigned int mask10001000 = 136u;
2433  const unsigned int mask11011101 = 221u;
2434  const __m128 sumAdjacent = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, mask10001000), _mm_shuffle_ps(sumFirst, sumSecond, mask11011101));
2435  */
2436 
2437  // divide by 4 --> multiply by 0.25
2438  const __m128 division = _mm_mul_ps(sumAdjacent, _mm_set_ps1(0.25f));
2439 
2440  // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2441  _mm_storeu_ps(result, division);
2442 }
2443 
2444 inline void SSE::average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2445 {
2446  ocean_assert(image0 && image1);
2447 
2448  // 16 * uchar = m128i, but only the first 8 elements are set
2449  const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2450  const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2451 
2452  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2453  const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2454  const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2455 
2456  // build overall sum and add 2 for rounding
2457  const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2458 
2459  // divide by 4 by right shifting of two bits
2460  const __m128i division16 = _mm_srli_epi16(sum, 2);
2461 
2462  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2463  const __m128i division8 = moveLowBits16_8ToLow64(division16);
2464 
2465  memcpy(result, &division8, sizeof(uint8_t) * 4);
2466 }
2467 
2468 inline void SSE::average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2469 {
2470  ocean_assert(image0 != nullptr && image1 != nullptr);
2471  ocean_assert(threshold >= 1u);
2472 
2473  // we load the first 8 elements, the uppper 8 bytes will be set to zero
2474  const __m128i row0_u_8x8 = _mm_loadl_epi64((__m128i*)image0);
2475  const __m128i row1_u_8x8 = _mm_loadl_epi64((__m128i*)image1);
2476 
2477  const __m128i row0_u_16x8 = _mm_cvtepu8_epi16(row0_u_8x8); // converting the lower 8 bytes to 16 byte values
2478  const __m128i row1_u_16x8 = _mm_cvtepu8_epi16(row1_u_8x8);
2479 
2480  const __m128i verticalSum_u_16x8 = _mm_adds_epu16(row0_u_16x8, row1_u_16x8);
2481  const __m128i sum_u_16x8 = _mm_hadd_epi16(verticalSum_u_16x8, verticalSum_u_16x8);
2482 
2483  const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2484 
2485  const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);
2486 
2487  memcpy(result, &mask_u_8x8, sizeof(uint8_t) * 4);
2488 }
2489 
2490 inline void SSE::average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2491 {
2492  ocean_assert(image0 && image1);
2493 
2494  // 16 * uchar = m128i
2495  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2496  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2497 
2498  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2499  const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2500  const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2501 
2502  // build overall sum and add 2 for rounding
2503  const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2504 
2505  // divide by 4 by right shifting of two bits
2506  const __m128i division16 = _mm_srli_epi16(sum, 2);
2507 
2508  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2509  const __m128i division8 = moveLowBits16_8ToLow64(division16);
2510 
2511  // copy the lower 64 bit to the memory
2512  _mm_storel_epi64((__m128i*)result, division8);
2513 
2514  /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2515  const __m128i avgRows = _mm_avg_epu8(row0, row1);
2516  const __m128i avgRowsSwap = _mm_or_si128(_mm_slli_epi16(avgRows, 8), _mm_srli_epi16(avgRows, 8));
2517 
2518  const __m128i avg = _mm_avg_epu8(avgRows, avgRowsSwap); // 1 result in 2 uchar
2519  const __m128i avgOrdered = _mm_shuffle_epi8(avg, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2520 
2521  _mm_storel_epi64((__m128i*)result, avgOrdered);
2522  */
2523 }
2524 
2525 inline void SSE::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2526 {
2527  ocean_assert(image0 != nullptr && image1 != nullptr);
2528  ocean_assert(threshold >= 1u);
2529 
2530  // 16 * uchar = m128i
2531  const __m128i row0_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2532  const __m128i row1_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2533 
2534  const __m128i horizontalSum0_u_16x8 = _mm_maddubs_epi16(row0_u_8x16, _mm_set1_epi8(1));
2535  const __m128i horizontalSum1_u_16x8 = _mm_maddubs_epi16(row1_u_8x16, _mm_set1_epi8(1));
2536 
2537  const __m128i sum_u_16x8 = _mm_add_epi16(horizontalSum0_u_16x8, horizontalSum1_u_16x8);
2538 
2539  const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2540 
2541  const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);
2542 
2543  // copy the lower 64 bit to the memory
2544  _mm_storel_epi64((__m128i*)result, mask_u_8x8);
2545 }
2546 
2547 inline void SSE::average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2548 {
2549  ocean_assert(image0 && image1);
2550 
2551  // first 16 elements
2552  const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2553  const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2554 
2555  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2556  const __m128i firstSumLow = _mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1));
2557  const __m128i firstSumHigh = _mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1));
2558 
2559  // build overall sum and add 2 for rounding
2560  const __m128i firstSum = _mm_add_epi16(firstSumLow, _mm_add_epi16(firstSumHigh, _mm_set1_epi32(int(0x00020002))));
2561 
2562  // divide by 4 by right shifting of two bits
2563  const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2564 
2565  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2566  const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2567 
2568  // second 16 elements
2569  const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2570  const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2571 
2572  // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2573  const __m128i secondSumLow = _mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1));
2574  const __m128i secondSumHigh = _mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1));
2575 
2576  // build overall sum and add 2 for rounding
2577  const __m128i secondSum = _mm_add_epi16(secondSumLow, _mm_add_epi16(secondSumHigh, _mm_set1_epi32(int(0x00020002))));
2578 
2579  // divide by 4 by right shifting of two bits
2580  const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2581 
2582  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2583  const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2584 
2585 
2586  // combine both divion results
2587  const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2588 
2589  // copy the 128 bit to the memory
2590  _mm_storeu_si128((__m128i*)result, division8);
2591 
2592  /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2593  const __m128i avgFirstRows = _mm_avg_epu8(firstRow0, firstRow1);
2594  const __m128i avgFirstRowsSwap = _mm_or_si128(_mm_slli_epi16(avgFirstRows, 8), _mm_srli_epi16(avgFirstRows, 8));
2595 
2596  const __m128i avgFirst = _mm_avg_epu8(avgFirstRows, avgFirstRowsSwap); // 1 result in 2 uchar
2597  const __m128i avgFristOrdered = _mm_shuffle_epi8(avgFirst, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2598 
2599  const __m128i avgSecondRows = _mm_avg_epu8(secondRow0, secondRow1);
2600  const __m128i avgSecondRowsSwap = _mm_or_si128(_mm_slli_epi16(avgSecondRows, 8), _mm_srli_epi16(avgSecondRows, 8));
2601 
2602  const __m128i avgSecond = _mm_avg_epu8(avgSecondRows, avgSecondRowsSwap); // 1 result in 2 uchar
2603  const __m128i avgSecondOrdered = _mm_shuffle_epi8(avgSecond, _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0));
2604 
2605  // combine both divion results
2606  const __m128i combinedAvg = _mm_or_si128(avgFristOrdered, avgSecondOrdered);
2607 
2608  // copy the 128 bit to the memory
2609  _mm_storeu_si128((__m128i*)result, combinedAvg);
2610  */
2611 }
2612 
2613 inline void SSE::average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2614 {
2615  ocean_assert(image0 != nullptr && image1 != nullptr);
2616  ocean_assert(threshold >= 1u);
2617 
2618  // load first 16 uchars
2619  const __m128i row0A_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2620  const __m128i row1A_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2621 
2622  const __m128i horizontalSum0A_u_16x8 = _mm_maddubs_epi16(row0A_u_8x16, _mm_set1_epi8(1));
2623  const __m128i horizontalSum1A_u_16x8 = _mm_maddubs_epi16(row1A_u_8x16, _mm_set1_epi8(1));
2624 
2625  const __m128i sumA_u_16x8 = _mm_add_epi16(horizontalSum0A_u_16x8, horizontalSum1A_u_16x8);
2626 
2627  const __m128i maskA_u_16x8 = _mm_cmpgt_epi16(sumA_u_16x8, _mm_set1_epi16(short(threshold - 1)));
2628 
2629  const __m128i row0B_u_8x16 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2630  const __m128i row1B_u_8x16 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2631 
2632  const __m128i horizontalSum0B_u_16x8 = _mm_maddubs_epi16(row0B_u_8x16, _mm_set1_epi8(1));
2633  const __m128i horizontalSum1B_u_16x8 = _mm_maddubs_epi16(row1B_u_8x16, _mm_set1_epi8(1));
2634 
2635  const __m128i sumB_u_16x8 = _mm_add_epi16(horizontalSum0B_u_16x8, horizontalSum1B_u_16x8);
2636 
2637  const __m128i maskB_u_16x8 = _mm_cmpgt_epi16(sumB_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2638 
2639  const __m128i mask_u_8x16 = _mm_or_si128(moveLowBits16_8ToLow64(maskA_u_16x8), moveLowBits16_8ToHigh64(maskB_u_16x8));
2640 
2641  // copy the 128 bit to the memory
2642  _mm_storeu_si128((__m128i*)result, mask_u_8x16);
2643 }
2644 
2645 inline void SSE::average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2646 {
2647  ocean_assert(image0 && image1);
2648 
2649  // 16 * uchar = m128i, but only the first 8 elements are set
2650  const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2651  const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2652 
2653  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2654  const __m128i shuffledRow0 = shuffleNeighbor2Low64BitsToLow16_8(row0);
2655  const __m128i shuffledRow1 = shuffleNeighbor2Low64BitsToLow16_8(row1);
2656 
2657  // build sum and add 2 for rounding
2658  const __m128i sumLow = _mm_add_epi16(shuffledRow0, shuffledRow1);
2659  const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumLow), _mm_set1_epi32(int(0x00020002)));
2660 
2661  // divide by 4 by right shifting of two bits
2662  const __m128i division16 = _mm_srli_epi16(sum, 2);
2663 
2664  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2665  const __m128i division8 = moveLowBits16_8ToLow64(division16);
2666 
2667  memcpy(result, &division8, sizeof(uint8_t) * 4);
2668 }
2669 
2670 inline void SSE::average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result)
2671 {
2672  ocean_assert(image0 && image1);
2673 
2674  // 4 * float = m128, input does not need to be aligned on any particular boundary.
2675  const __m128 row0 = _mm_loadu_ps(image0);
2676  const __m128 row1 = _mm_loadu_ps(image1);
2677 
2678  // get sum of first 4 elements
2679  const __m128 sumFirst = _mm_add_ps(row0, row1);
2680 
2681  // load next 4 elements
2682  const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2683  const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2684 
2685  // get sum of second 4 elements
2686  const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2687 
2688  // get sum of summed pixels
2689  // mask01000100 = 68u
2690  // mask11101110 = 238u
2691  const __m128 sumComponents = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, 68u), _mm_shuffle_ps(sumFirst, sumSecond, 238u));
2692 
2693  // divide by 4 --> multiply by 0.25
2694  const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2695 
2696  // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2697  _mm_storeu_ps(result, division);
2698 }
2699 
2700 inline void SSE::average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2701 {
2702  ocean_assert(image0 && image1);
2703 
2704  // 16 * uchar = m128i
2705  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2706  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2707 
2708  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2709  const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2710  const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2711 
2712  // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2713  const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2714 
2715  // divide by 4 by right shifting of two bits
2716  const __m128i division16 = _mm_srli_epi16(sum, 2);
2717 
2718  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2719  const __m128i division8 = moveLowBits16_8ToLow64(division16);
2720 
2721  // copy the lower 64 bit to the memory
2722  _mm_storel_epi64((__m128i*)result, division8);
2723 }
2724 
2725 inline void SSE::average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2726 {
2727  ocean_assert(image0 && image1);
2728 
2729  // first 16 elements: 16 * uchar = m128i
2730  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2731  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2732 
2733  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2734  const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2735  const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2736 
2737  // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2738  const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2739 
2740  // divide by 4 by right shifting of two bits
2741  const __m128i division16 = _mm_srli_epi16(sum, 2);
2742 
2743  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2744  const __m128i firstDivision8 = moveLowBits16_8ToLow64(division16);
2745 
2746  // second 16 elements
2747  const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2748  const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2749 
2750  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2751  const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(secondRow0), shuffleNeighbor2Low64BitsToLow16_8(secondRow1));
2752  const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(secondRow0), shuffleNeighbor2High64BitsToLow16_8(secondRow1));
2753 
2754  // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2755  const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2756 
2757  // divide by 4 by right shifting of two bits
2758  const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2759 
2760  // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2761  const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2762 
2763 
2764  // combine both divion results
2765  const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2766 
2767  // copy the 128 bit to the memory
2768  _mm_storeu_si128((__m128i*)result, division8);
2769 }
2770 
2771 inline void SSE::average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result)
2772 {
2773  ocean_assert(image0 && image1 && result);
2774 
2775  // 6 * float = 2 pixel: 00 01 02 03 04 05
2776 
2777  // load element 0 up to 3, input does not need to be aligned on any particular boundary.
2778  const __m128 row0 = _mm_loadu_ps(image0);
2779  const __m128 row1 = _mm_loadu_ps(image1);
2780 
2781  // get sum of first 4 elements
2782  const __m128 sumFirst = _mm_add_ps(row0, row1);
2783 
2784  // load element 2 up to 5 to prevent that we access memory out of our range
2785  const __m128 rowSecond0 = _mm_loadu_ps(image0 + 2);
2786  const __m128 rowSecond1 = _mm_loadu_ps(image1 + 2);
2787 
2788  // get sum of second 4 elements
2789  const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2790 
2791  // get sum of summed pixels
2792  // NOTE: _mm_shuffle_ps resulting first 64bit are always from first __m128, second 64bit from second __m128
2793  // mask111001 = 57u; // 'i+1'th float became 'i'
2794  const __m128 sumComponents = _mm_add_ps(sumFirst, _mm_shuffle_ps(sumSecond, sumSecond, 57u));
2795 
2796  // divide by 4 --> multiply by 0.25
2797  const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2798 
2799  // store 3 elements (96 bit) to the memory
2800 
2801 #ifdef OCEAN_COMPILER_MSC
2802  memcpy(result, &division.m128_f32[0], sizeof(float) * 3);
2803 #else
2804  memcpy(result, &division, sizeof(float) * 3);
2805 #endif
2806 }
2807 
inline void SSE::average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
{
	// 2x2 average of 24 interleaved 3-channel uchar elements (8 pixels) per row,
	// producing 12 result elements (4 averaged pixels), rounded to nearest
	ocean_assert(image0 && image1 && result);

	__m128i row0 = _mm_lddqu_si128((__m128i*)image0);
	__m128i row1 = _mm_lddqu_si128((__m128i*)image1);

	// distribute the first 12 elements (element 00 up to 11):
	// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
	//
	// -- -- -- -- -- 08 -- 07 -- 06 -- 02 -- 01 -- 00   (left pixel of each pair, widened to 16 bit)
	// -- -- -- -- -- 11 -- 10 -- 09 -- 05 -- 04 -- 03   (right pixel of each pair, widened to 16 bit)
	//
	// selector bytes with the high bit set (0xA0) zero the destination byte in _mm_shuffle_epi8

	__m128i shuffleMaskLow = set128i(0xA0A0A0A0A008A007ull, 0xA0A0A0A0A008A007ull == 0 ? 0ull : 0xA006A002A001A000ull);
	__m128i shuffleMaskHigh = set128i(0xA0A0A0A0A00BA00Aull, 0xA009A005A004A003ull);

	// vertical sums per channel, left and right pixel of each pair kept in separate registers
	__m128i sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
	__m128i sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));

	// complete 2x2 sums per channel, with +2 in each 16 bit lane for round-to-nearest
	__m128i sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));

	// divide by 4 by right shifting of two bits
	__m128i division16 = _mm_srli_epi16(sum, 2);

	// compact the low byte of the six valid 16 bit lanes into the lowest six bytes
	__m128i division8 = _mm_shuffle_epi8(division16, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A00A0806040200ull));


	// now we load the remaining 12 elements (however, this time we take element 04 up to 15 to prevent that we access memory out of our range)
	// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
	//
	// -- -- -- -- -- 12 -- 11 -- 10 -- 06 -- 05 -- 04
	// -- -- -- -- -- 15 -- 14 -- 13 -- 09 -- 08 -- 07

	row0 = _mm_lddqu_si128((__m128i*)(image0 + 8));
	row1 = _mm_lddqu_si128((__m128i*)(image1 + 8));

	shuffleMaskLow = set128i(0xA0A0A0A0A00CA00Bull, 0xA00AA006A005A004ull);
	shuffleMaskHigh = set128i(0xA0A0A0A0A00FA00Eull, 0xA00DA009A008A007ull);

	sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
	sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));

	// complete 2x2 sums per channel, with +2 in each 16 bit lane for round-to-nearest
	sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));

	// divide by 4 by right shifting of two bits
	division16 = _mm_srli_epi16(sum, 2);

	// place the second six result bytes directly behind the first six (bytes 6..11) and merge
	division8 = _mm_or_si128(division8, _mm_shuffle_epi8(division16, set128i(0xA0A0A0A00A080604ull, 0x0200A0A0A0A0A0A0ull)));

	// copy the lowest 12 bytes (four averaged 3-channel pixels) to the memory
#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division8.m128i_u8[0], 12);
#else
	memcpy(result, &division8, 12);
#endif
}
2867 
2868 inline void SSE::average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result)
2869 {
2870  ocean_assert(image0 && image1);
2871 
2872  // 4 * float = m128, input does not need to be aligned on any particular boundary.
2873  const __m128 row0 = _mm_loadu_ps(image0);
2874  const __m128 row1 = _mm_loadu_ps(image1);
2875 
2876  // get sum of first 4 elements
2877  const __m128 sumFirstPixel = _mm_add_ps(row0, row1);
2878 
2879  // load next 4 elements
2880  const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2881  const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2882 
2883  // get sum of second 4 elements
2884  const __m128 sumSecondPixel = _mm_add_ps(rowSecond0, rowSecond1);
2885 
2886  // get sum of summed pixels
2887  const __m128 sumComponents = _mm_add_ps(sumFirstPixel, sumSecondPixel);
2888 
2889  // divide by 4 --> multiply by 0.25
2890  const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2891 
2892  // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2893  _mm_storeu_ps(result, division);
2894 }
2895 
2896 inline void SSE::average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2897 {
2898  ocean_assert(image0 && image1);
2899 
2900  const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2901  const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2902 
2903  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2904  const __m128i sumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(row0), shuffleNeighbor4Low64BitsToLow16_8(row1));
2905  const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(row0), shuffleNeighbor4High64BitsToLow16_8(row1));
2906 
2907  // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2908  const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2909 
2910  // divide by 4 by right shifting of two bits
2911  const __m128i division16 = _mm_srli_epi16(sum, 2);
2912 
2913  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2914  const __m128i division8 = moveLowBits16_8ToLow64(division16);
2915 
2916  // copy the lower 64 bit to the memory
2917  _mm_storel_epi64((__m128i*)result, division8);
2918 }
2919 
2920 inline void SSE::average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2921 {
2922  ocean_assert(image0 && image1);
2923 
2924  // first 16 elements
2925  const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2926  const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2927 
2928  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2929  const __m128i firstSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(firstRow0), shuffleNeighbor4Low64BitsToLow16_8(firstRow1));
2930  const __m128i firstSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(firstRow0), shuffleNeighbor4High64BitsToLow16_8(firstRow1));
2931 
2932  // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2933  const __m128i firstSum = _mm_add_epi16(_mm_hadd_epi16(firstSumLow, firstSumHigh), _mm_set1_epi32(int(0x00020002)));
2934 
2935  // divide by 4 by right shifting of two bits
2936  const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2937 
2938  // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2939  const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2940 
2941 
2942  // second 16 elements
2943  const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2944  const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2945 
2946  // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2947  const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(secondRow0), shuffleNeighbor4Low64BitsToLow16_8(secondRow1));
2948  const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(secondRow0), shuffleNeighbor4High64BitsToLow16_8(secondRow1));
2949 
2950  // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2951  const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2952 
2953  // divide by 4 by right shifting of two bits
2954  const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2955 
2956  // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2957  const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2958 
2959 
2960  // combine both divion results
2961  const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2962 
2963  // copy the 128 bit to the memory
2964  _mm_storeu_si128((__m128i*)result, division8);
2965 }
2966 
inline void SSE::average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
{
	ocean_assert(image0 && image1 && image2);

	/**
	 * Applies the separable 3x3 filter
	 *        | 1 2 1 |
	 * 1/16 * | 2 4 2 |
	 *        | 1 2 1 |
	 * at every third column (centers 1, 4, ..., 28), producing 10 uchar results (rounded to nearest).
	 */

	// first 16 elements (actually only 14 are used)
	const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
	const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
	const __m128i firstRow2 = _mm_lddqu_si128((__m128i*)image2);

	// distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the vertical (1 2 1)-weighted sum; the middle row is summed twice
	const __m128i firstSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1)), _mm_add_epi16(removeHighBits16_8(firstRow1), removeHighBits16_8(firstRow2)));
	const __m128i firstSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1)), _mm_add_epi16(moveHighBits16_8(firstRow1), moveHighBits16_8(firstRow2)));

	// second 16 elements, starting from the 15th element
	const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 14));
	const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 14));
	const __m128i secondRow2 = _mm_lddqu_si128((__m128i*)(image2 + 14));

	// same vertical (1 2 1) weighting for the second block
	const __m128i secondSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1)), _mm_add_epi16(removeHighBits16_8(secondRow1), removeHighBits16_8(secondRow2)));
	const __m128i secondSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1)), _mm_add_epi16(moveHighBits16_8(secondRow1), moveHighBits16_8(secondRow2)));

	// build overall sum and add 8 for rounding
	// positions 0, 2, 3, 5, 6 are valid, e.g. pos. 0 contains element00 + element01
	const __m128i firstSum = _mm_add_epi16(firstSumEven, _mm_add_epi16(firstSumOdd, _mm_set1_epi32(int(0x00080008))));
	// e.g. pos. 0 contains now element00 + element01 + element02 (selector bytes 0xFFFF zero the lane in _mm_shuffle_epi8)
	const __m128i firstSumWithEven = _mm_add_epi16(firstSum, _mm_shuffle_epi8(firstSumEven, set128i(0xFFFF0F0E0B0AFFFFull, 0x09080504FFFF0302ull)));
	// e.g. pos. 0 contains now element00 + element01 + element02 + element01, i.e. the complete horizontal (1 2 1) weighting
	const __m128i firstSumWithBoth = _mm_add_epi16(firstSumWithEven, _mm_shuffle_epi8(firstSumOdd, set128i(0xFFFF0D0C0908FFFFull, 0x07060302FFFF0100ull)));

	// build overall sum and add 8 for rounding
	// positions 1, 2, 4, 5, 7 are valid
	const __m128i secondSum = _mm_add_epi16(secondSumEven, _mm_add_epi16(secondSumOdd, _mm_set1_epi32(int(0x00080008))));
	const __m128i secondSumWithEven = _mm_add_epi16(secondSum, _mm_shuffle_epi8(secondSumEven, set128i(0x0F0EFFFF0D0C0908ull, 0xFFFF07060302FFFFull)));
	const __m128i secondSumWithBoth = _mm_add_epi16(secondSumWithEven, _mm_shuffle_epi8(secondSumOdd, set128i(0x0D0CFFFF0B0A0706ull, 0xFFFF05040100FFFFull)));

	// divide by 16 by right shifting of four bits
	const __m128i firstDivision16 = _mm_srli_epi16(firstSumWithBoth, 4);
	const __m128i secondDivision16 = _mm_srli_epi16(secondSumWithBoth, 4);

	// reorder the valid elements to the lowest bits (first block: result bytes 0..4, second block: result bytes 5..9)
	const __m128i firstDivision8 = _mm_shuffle_epi8(firstDivision16, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0C0A060400ull));
	const __m128i secondDivision8 = _mm_shuffle_epi8(secondDivision16, set128i(0xFFFFFFFFFFFF0E0Aull, 0x080402FFFFFFFFFFull));

	// combine both division results
	const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);

	// copy the lowest 10*8 bit to the memory
#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division8.m128i_u8[0], 10);
#else
	memcpy(result, &division8, 10);
#endif
}
3027 
// NOTE(review): the signature line of this definition (addOffsetBeforeRightShiftDivisionByTwoSigned16Bit,
// taking a single `const __m128i& value`) was lost by the documentation extraction — confirm against the header.
3029 {
3030  /**
3031  * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3032  * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3033  * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3034  */
3035 
3036  // We create a bit mask for all 16 bit odd values, an odd value will create an active lower bit in each 16 bit value
3037  const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0001000100010001ull, 0x0001000100010001ull));
3038 
3039  // We create a bit mask for all 16 bit negative values, a negative value will create an active lower bit in each 16 bit value
3040  const __m128i maskNegatives = _mm_srli_epi16(_mm_and_si128(value, CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull)), 15);
3041 
3042  // We add 1 to each 16 bit value having an active 'odd-bit' and an active 'negative-bit'
3044  return _mm_add_epi16(value, _mm_and_si128(maskNegatives, maskOdds));
3045 }
3046 
3047 inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3048 {
3049  ocean_assert(rightShifts < 16u);
3050 
3051  // the offset for negative values: 2^shifts - 1
3052  const __m128i offsetForNegatives_s_16x8 = _mm_set1_epi16(short((1u << rightShifts) - 1u));
3053 
3054  // bit mask for all 16 bit negative values
3055  const __m128i maskHigh_s_16x8 = CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull);
3056 
3057  // 0x0000 for positive values, 0xFFFF for negative values
3058  const __m128i maskNegativeValues_s_16x8 = _mm_cmpeq_epi16(_mm_and_si128(value, maskHigh_s_16x8), maskHigh_s_16x8);
3059 
3060  // 0 for positive values, 2^shifts - 1 for negative values
3061  const __m128i offset_s_16x8 = _mm_and_si128(offsetForNegatives_s_16x8, maskNegativeValues_s_16x8);
3062 
3063  return _mm_add_epi16(value, offset_s_16x8);
3064 }
3065 
3066 inline __m128i SSE::divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3067 {
3068  return _mm_srai_epi16(addOffsetBeforeRightShiftDivisionSigned16Bit(value, rightShifts), int(rightShifts));
3069 }
3070 
// NOTE(review): the signature line of this definition (addOffsetBeforeRightShiftDivisionByTwoSigned32Bit,
// taking a single `const __m128i& value`) was lost by the documentation extraction — confirm against the header.
3072 {
3073  /**
3074  * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3075  * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3076  * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3077  */
3078 
3079  // We create a bit mask for all 32 bit odd values, an odd value will create an active lower bit in each 32 bit value
3080  const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0000000100000001ull, 0x0000000100000001ull));
3081 
3082  // We create a bit mask for all 32 bit negative values, a negative value will create an active lower bit in each 32 bit value
3083  const __m128i maskNegatives = _mm_srli_epi32(_mm_and_si128(value, CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull)), 31);
3084 
3085  // We add 1 to each 32 bit value having an active 'odd-bit' and active 'negative-bit'
3086  return _mm_add_epi32(value, _mm_and_si128(maskNegatives, maskOdds));
3087 }
3088 
3089 inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3090 {
3091  ocean_assert(rightShifts < 32u);
3092 
3093  // the offset for negative values: 2^shifts - 1
3094  const __m128i offsetForNegatives_s_32x4 = _mm_set1_epi32(int((1u << rightShifts) - 1u));
3095 
3096  // bit mask for all 32 bit negative values
3097  const __m128i maskHigh_s_32x4 = CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull);
3098 
3099  // 0x00000000 for positive values, 0xFFFFFFFF for negative values
3100  const __m128i maskNegativeValues_s_32x4 = _mm_cmpeq_epi32(_mm_and_si128(value, maskHigh_s_32x4), maskHigh_s_32x4);
3101 
3102  // 0 for positive values, 2^shifts - 1 for negative values
3103  const __m128i offset_s_32x4 = _mm_and_si128(offsetForNegatives_s_32x4, maskNegativeValues_s_32x4);
3104 
3105  return _mm_add_epi32(value, offset_s_32x4);
3106 }
3107 
3108 inline __m128i SSE::divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3109 {
3110  return _mm_srai_epi32(addOffsetBeforeRightShiftDivisionSigned32Bit(value, rightShifts), int(rightShifts));
3111 }
3112 
// Computes halved centered-difference gradients, dx = (right - left) / 2 and dy = (bottom - top) / 2,
// for consecutive pixels of a 1-channel 8 bit frame and stores them interleaved as signed 8 bit values.
// NOTE(review): despite the '8Elements' name, the code below loads 16 pixels and writes 32 response
// bytes (16 dx/dy pairs) — confirm the intended element count against the declaration's documentation.
// NOTE(review): reads 16 bytes starting at each of source-1, source+1, source-width and source+width,
// and writes 32 bytes to response — the caller must guarantee all these ranges are valid.
3113 inline void SSE::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
3114 {
3115  ocean_assert(source && response && width >= 10u);
3116 
3117  // Load 16 unsigned 8-bit values; left/right/top/bottom pixels
3118  const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
3119  const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));
3120 
3121  const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
3122  const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));
3123 
3124  // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3125  const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3126  //const __m128i horizontalMinusLo = _mm_shuffle_epi8(horizontalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
3127  const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3128 
3129  const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3130  //const __m128i horizontalPlusLo = _mm_shuffle_epi8(horizontalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
3131  const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3132 
3133  // Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3134  const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
3135  const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);
3136 
3137  // Convert the low and high signed 16-bit differences to signed 8-bit and merge them into a single
3138  const __m128i horizontalGradient = _mm_or_si128(
3139  _mm_shuffle_epi8(horizontalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
3140  _mm_shuffle_epi8(horizontalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));
3141 
3142  // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3143  const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3144  //const __m128i verticalMinusLo = _mm_shuffle_epi8(verticalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == a[7:0]
3145  const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3146 
3147  const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3148  //const __m128i verticalPlusLo = _mm_shuffle_epi8(verticalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == b[7:0]
3149  const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3150 
3151  // Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3152  const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
3153  const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);
3154 
3155  // Convert the differences to signed char and merge the high and low halves
3156  const __m128i verticalGradient = _mm_or_si128(
3157  _mm_shuffle_epi8(verticalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
3158  _mm_shuffle_epi8(verticalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));
3159 
3160  // Take the horizontal gradients, [dx0, dx1, dx2, ...], and the vertical gradient, [dy0, dy1, dy2, ...] and interleave them, [dx0, dy0, dx1, dy1, dx2, dy2, ...]
3161  const __m128i interleavedResponseLo = _mm_unpacklo_epi8(horizontalGradient, verticalGradient);
3162  const __m128i interleavedResponseHi = _mm_unpackhi_epi8(horizontalGradient, verticalGradient);
3163 
3164  ocean_assert(sizeof(char) == 1ull);
3165  _mm_storeu_si128((__m128i*)response, interleavedResponseLo);
3166  _mm_storeu_si128((__m128i*)(response + 16ull), interleavedResponseHi);
3167 }
3168 
// Computes the three gradient products dx*dx, dy*dy and dx*dy (with dx, dy the halved centered
// differences) for consecutive pixels of a 1-channel 8 bit frame, storing them interleaved as
// signed 16 bit values [dxdx0, dydy0, dxdy0, dxdx1, ...].
// NOTE(review): despite the '8Elements' name, the code below loads 16 pixels and writes 48
// int16_t response values — confirm the intended element count against the declaration.
// NOTE(review): reads 16 bytes starting at each of source-1, source+1, source-width and
// source+width, and writes 96 bytes to response — the caller must guarantee these ranges.
3169 inline void SSE::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
3170 {
3171  ocean_assert(source && response && width >= 10u);
3172 
3173  // Load 4x(16x8u) values: left/right/top/bottom pixels
3174  const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
3175  const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));
3176 
3177  const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
3178  const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));
3179 
3180  // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3181  const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3182  const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3183 
3184  const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3185  const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3186 
3187  // Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3188  const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
3189  const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);
3190 
3191  // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3192  const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3193  const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3194 
3195  const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3196  const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3197 
3198  // Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3199  const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
3200  const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);
3201 
3202  // Squared gradients: h*h, v*v, h*v
3203  const __m128i horizontalHorizontalLo = _mm_mullo_epi16(horizontalGradientLo, horizontalGradientLo);
3204  const __m128i horizontalHorizontalHi = _mm_mullo_epi16(horizontalGradientHi, horizontalGradientHi);
3205 
3206  const __m128i verticalVerticalLo = _mm_mullo_epi16(verticalGradientLo, verticalGradientLo);
3207  const __m128i verticalVerticalHi = _mm_mullo_epi16(verticalGradientHi, verticalGradientHi);
3208 
3209  const __m128i horzontalVerticalLo = _mm_mullo_epi16(horizontalGradientLo, verticalGradientLo);
3210  const __m128i horzontalVerticalHi = _mm_mullo_epi16(horizontalGradientHi, verticalGradientHi);
3211 
3212  // Interleave/pack the above squared gradient, 16S values
3213  //
3214  // a, b, c - Above variables ending in *Lo
3215  // d, e, f - Above variables ending in *Hi
3216  //
3217  // a = [a7, a6, a5, a4, a3, a2, a1, a0]
3218  // b = [b7, b6, b5, b4, b3, b2, b1, b0]
3219  // c = [c7, c6, c5, c4, c3, c2, c1, c0]
3220  //
3221  // d = [d7, d6, d5, d4, d3, d2, d1, d0]
3222  // e = [e7, e6, e5, e4, e3, e2, e1, e0]
3223  // f = [f7, f6, f5, f4, f3, f2, f1, f0]
3224  //
3225  // A = [b2, a2, c1, b1, a1, c0, b0, a0]
3226  // B = [a5, c4, b4, a4, c3, b3, a3, c2]
3227  // C = [c7, b7, a7, c6, b6, a6, c5, b5]
3228  //
3229  // D = [e2, d2, f1, e1, d1, f0, e0, d0]
3230  // E = [d5, f4, e4, d4, f3, e3, d3, f2]
3231  // F = [f7, e7, d7, f6, e6, d6, f5, e5]
3232 
3233  const __m128i block0Lo = _mm_or_si128( // == [b2, a2, c1, b1, a1, c0, b0, a0]
3234  _mm_or_si128( // == [b2, a2, 00, b1, a1, 00, b0, a0]
3235  _mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, a2, 00, 00, a1, 00, 00, a0]
3236  _mm_shuffle_epi8(verticalVerticalLo, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [b2, 00, 00, b1, 00, 00, b0, 00]
3237  _mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, c1, 00, 00, c0, 00, 00]
3238 
3239  const __m128i block1Lo = _mm_or_si128( // == [a5, c4, b4, a4, c3, b3, a3, c2]
3240  _mm_or_si128( // == [a5, 00, b4, a4, 00, b3, a3, 00]
3241  _mm_shuffle_epi8(horizontalHorizontalLo, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [a5, 00, 00, a4, 00, 00, a3, 00]
3242  _mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, b4, 00, 00, b3, 00, 00]
3243  _mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, c4, 00, 00, c3, 00, 00, c2]
3244 
3245  const __m128i block2Lo = _mm_or_si128( // == [c7, b7, a7, c6, b6, a6, c5, b5]
3246  _mm_or_si128( // == [00, b7, a7, 00, b6, a6, 00, b5]
3247  _mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, a7, 00, 00, a6, 00, 00]
3248  _mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, b7, 00, 00, b6, 00, 00, b5]
3249  _mm_shuffle_epi8(horzontalVerticalLo, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [c7, 00, 00, c6, 00, 00, c5, 00]
3250 
3251  const __m128i block0Hi = _mm_or_si128( // == [e2, d2, f1, e1, d1, f0, e0, d0]
3252  _mm_or_si128( // == [e2, d2, 00, e1, d1, 00, e0, d0]
3253  _mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, d2, 00, 00, d1, 00, 00, d0]
3254  _mm_shuffle_epi8(verticalVerticalHi, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [e2, 00, 00, e1, 00, 00, e0, 00]
3255  _mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, f1, 00, 00, f0, 00, 00]
3256 
3257  const __m128i block1Hi = _mm_or_si128( // == [d5, f4, e4, d4, f3, e3, d3, f2]
3258  _mm_or_si128( // == [d5, 00, e4, d4, 00, e3, d3, 00]
3259  _mm_shuffle_epi8(horizontalHorizontalHi, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [d5, 00, 00, d4, 00, 00, d3, 00]
3260  _mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, e4, 00, 00, e3, 00, 00]
3261  _mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, f4, 00, 00, f3, 00, 00, f2]
3262 
3263  const __m128i block2Hi = _mm_or_si128( // == [f7, e7, d7, f6, e6, d6, f5, e5]
3264  _mm_or_si128( // == [00, e7, d7, 00, e6, d6, 00, e5]
3265  _mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, d7, 00, 00, d6, 00, 00]
3266  _mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, e7, 00, 00, e6, 00, 00, e5]
3267  _mm_shuffle_epi8(horzontalVerticalHi, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [f7, 00, 00, f6, 00, 00, f5, 00]
3268 
3269  _mm_storeu_si128((__m128i*)response, block0Lo);
3270  _mm_storeu_si128((__m128i*)(response + 8ull), block1Lo);
3271  _mm_storeu_si128((__m128i*)(response + 16ull), block2Lo);
3272  _mm_storeu_si128((__m128i*)(response + 24ull), block0Hi);
3273  _mm_storeu_si128((__m128i*)(response + 32ull), block1Hi);
3274  _mm_storeu_si128((__m128i*)(response + 40ull), block2Hi);
3275 }
3276 
// De-interleaves the first 15 elements (5 pixels, 3 channels) of an interleaved register:
// the first and second channel land in the lower/upper half of channel01, the third in channel2.
// Shuffle indices with the high bit set (0xFF) write 0 into the target byte, so the 'X' lanes are zero.
3277 OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2)
3278 {
3279  // interleaved R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 X
3280 
3281  // channel01 R0 R1 R2 R3 R4 X X X G0 G1 G2 G3 G4 X X X
3282  // channel2 B0 B1 B2 B3 B4 X X X 0 0 0 0 0 0 0 0
3283 
3284  channel01 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFF0d0a070401ull, 0xFFFFFF0c09060300ull));
3285 
3286  channel2 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull));
3287 }
3288 
// De-interleaves 24 elements (8 pixels, 3 channels) spread over two registers: the first and
// second channel land in the lower/upper half of channel01, the third channel in channel2.
// Shuffle indices with the high bit set (0xFF) write 0, so the unused upper lanes are zero.
3289 OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2)
3290 {
3291  // interleavedA R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
3292  // interleavedB G5 B5 R6 G6 B6 R7 G7 B7 X X X X X X X X
3293 
3294  // channel01 R0 R1 R2 R3 R4 R5 R6 R7 G0 G1 G2 G3 G4 G5 G6 G7
3295  // channel2 B0 B1 B2 B3 B4 B5 B6 B7 0 0 0 0 0 0 0 0
3296 
3297  channel01 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFF0d0a070401ull, 0xFFFF0f0c09060300ull)),
3298  _mm_shuffle_epi8(interleavedB, set128i(0x060300FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));
3299 
3300  channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
3301  _mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFFFFFFull, 0x070401FFFFFFFFFFull)));
3302 }
3303 
// De-interleaves 48 elements (16 pixels, 3 channels) spread over three registers into one full
// register per channel, e.g. RGBRGB... -> RRR..., GGG..., BBB...
// Each channel is gathered from all three input registers; 0xFF shuffle indices write 0 so the
// three shuffles can be combined with bitwise OR.
3304 OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3305 {
3306  channel0 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFF0f0c09060300ull)),
3307  _mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0e0b08ull, 0x0502FFFFFFFFFFFFull)),
3308  _mm_shuffle_epi8(interleavedC, set128i(0x0d0a070401FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3309 
3310  channel1 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
3311  _mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
3312  _mm_shuffle_epi8(interleavedC, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3313 
3314  channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
3315  _mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFF0d0aull, 0x070401FFFFFFFFFFull)),
3316  _mm_shuffle_epi8(interleavedC, set128i(0x0f0c09060300FFFFull, 0xFFFFFFFFFFFFFFFFull))));
3317 }
3318 
3319 inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3320 {
3321  ocean_assert(interleaved != nullptr);
3322 
3323  deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0, channel1, channel2);
3324 }
3325 
3326 inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2)
3327 {
3328  ocean_assert(interleaved && channel0 && channel1 && channel2);
3329 
3330  __m128i channel0_128, channel1_128, channel2_128;
3331  deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0_128, channel1_128, channel2_128);
3332 
3333  store128i(channel0_128, channel0);
3334  store128i(channel1_128, channel1);
3335  store128i(channel2_128, channel2);
3336 }
3337 
3338 inline void SSE::deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3339 {
3340  ocean_assert(interleaved != nullptr);
3341 
3342  deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3), channel0, channel1, channel2);
3343 }
3344 
// Interleaves 48 elements (16 pixels, 3 channels) given as one register per channel into three
// interleaved registers, e.g. RRR..., GGG..., BBB... -> RGBRGB...
// Each output register mixes bytes from all three channels; 0xFF shuffle indices write 0 so the
// three shuffles can be combined with bitwise OR.
3345 OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC)
3346 {
3347  interleavedA = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0x05FFFF04FFFF03FFull, 0xFF02FFFF01FFFF00ull)),
3348  _mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFFFF04FFFF03FFFFull, 0x02FFFF01FFFF00FFull)),
3349  _mm_shuffle_epi8(channel2, set128i(0xFF04FFFF03FFFF02ull, 0xFFFF01FFFF00FFFFull))));
3350 
3351  interleavedB = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFF0AFFFF09FFFF08ull, 0xFFFF07FFFF06FFFFull)),
3352  _mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0x0AFFFF09FFFF08FFull, 0xFF07FFFF06FFFF05ull)),
3353  _mm_shuffle_epi8(channel2, set128i(0xFFFF09FFFF08FFFFull, 0x07FFFF06FFFF05FFull))));
3354 
3355  interleavedC = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFFFF0FFFFF0EFFFFull, 0x0DFFFF0CFFFF0BFFull)),
3356  _mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFF0FFFFF0EFFFF0Dull, 0xFFFF0CFFFF0BFFFFull)),
3357  _mm_shuffle_epi8(channel2, set128i(0x0FFFFF0EFFFF0DFFull, 0xFF0CFFFF0BFFFF0Aull))));
3358 }
3359 
3360 OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved)
3361 {
3362  ocean_assert(channel0 && channel1 && channel2 && interleaved);
3363 
3364  __m128i interleavedA_128, interleavedB_128, interleavedC_128;
3365  interleave3Channel8Bit48Elements(load128i(channel0), load128i(channel1), load128i(channel2), interleavedA_128, interleavedB_128, interleavedC_128);
3366 
3367  store128i(interleavedA_128, interleaved + 0);
3368  store128i(interleavedB_128, interleaved + 16);
3369  store128i(interleavedC_128, interleaved + 32);
3370 }
3371 
3372 OCEAN_FORCE_INLINE void SSE::reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3373 {
3374  ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3375 
3376  // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3377  // Y A Y A Y A Y A Y A Y A Y A Y A
3378  // output: A Y A Y A Y A Y A Y A Y A Y A Y
3379  // 1 0 3 2 5 4 7 6 9 8 B A D C F E
3380 
3381  const __m128i shuffleMask_u_16x8 = set128i(0x0E0F0C0D0A0B0809ull, 0x0607040502030001ull);
3382 
3383  store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3384  store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3385 }
3386 
// Reverses the channel order of 16 interleaved 3-channel pixels, e.g. RGBRGB... -> BGRBGR...
// The pixels themselves keep their order; because each pixel spans 3 bytes, some output
// registers need bytes from a neighboring input register (0xFF shuffle indices write 0, so the
// contributions can be combined with bitwise OR).
3387 OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2)
3388 {
3389  reversedInterleaved0 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFF0c0d0e090a0b06ull, 0x0708030405000102ull)),
3390  _mm_shuffle_epi8(interleaved1, set128i(0x01FFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull)));
3391 
3392  reversedInterleaved1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFF0fFFull)),
3393  _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0x0fFF0b0c0d08090aull, 0x050607020304FF00ull)),
3394  _mm_shuffle_epi8(interleaved2, set128i(0xFF00FFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3395 
3396  reversedInterleaved2 = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFF0eull)),
3397  _mm_shuffle_epi8(interleaved2, set128i(0x0d0e0f0a0b0c0708ull, 0x09040506010203FFull)));
3398 }
3399 
3400 OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* const reversedInterleaved)
3401 {
3402  ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3403 
3404  __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3405  reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3406 
3407  store128i(reversedInterleaved0, reversedInterleaved);
3408  store128i(reversedInterleaved1, reversedInterleaved + 16);
3409  store128i(reversedInterleaved2, reversedInterleaved + 32);
3410 }
3411 
3412 OCEAN_FORCE_INLINE void SSE::reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3413 {
3414  ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3415 
3416  // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3417  // R G B A R G B A R G B A R G B A
3418  // output: A B G R A B G R A B G R A B G R
3419  // 3 2 1 0 7 6 5 4 B A 9 8 F E D C
3420 
3421  const __m128i shuffleMask_u_16x8 = set128i(0x0C0D0E0F08090A0Bull, 0x0405060700010203ull);
3422 
3423  store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3424  store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3425  store128i(_mm_shuffle_epi8(load128i(interleaved + 32), shuffleMask_u_16x8), reversedInterleaved + 32);
3426  store128i(_mm_shuffle_epi8(load128i(interleaved + 48), shuffleMask_u_16x8), reversedInterleaved + 48);
3427 }
3428 
3429 inline void SSE::reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved)
3430 {
3431  ocean_assert(interleaved);
3432 
3433  __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3434  reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3435 
3436  store128i(reversedInterleaved0, interleaved);
3437  store128i(reversedInterleaved1, interleaved + 16);
3438  store128i(reversedInterleaved2, interleaved + 32);
3439 }
3440 
3441 inline void SSE::swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second)
3442 {
3443  ocean_assert(first && second && first != second);
3444 
3445  __m128i first0, first1, first2;
3446  reverseChannelOrder3Channel8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3447 
3448  __m128i second0, second1, second2;
3449  reverseChannelOrder3Channel8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3450 
3451  store128i(first0, second);
3452  store128i(first1, second + 16);
3453  store128i(first2, second + 32);
3454 
3455  store128i(second0, first);
3456  store128i(second1, first + 16);
3457  store128i(second2, first + 32);
3458 }
3459 
3460 inline void SSE::reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2)
3461 {
3462  const __m128i mask = set128i(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
3463 
3464  reversedElements0 = _mm_shuffle_epi8(elements2, mask);
3465  reversedElements1 = _mm_shuffle_epi8(elements1, mask);
3466  reversedElements2 = _mm_shuffle_epi8(elements0, mask);
3467 }
3468 
3469 inline void SSE::reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements)
3470 {
3471  ocean_assert(elements && reversedElements);
3472 
3473  __m128i reversedElements0, reversedElements1, reversedElements2;
3474  reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3475 
3476  store128i(reversedElements0, reversedElements);
3477  store128i(reversedElements1, reversedElements + 16);
3478  store128i(reversedElements2, reversedElements + 32);
3479 }
3480 
3481 inline void SSE::reverseElements8Bit48Elements(uint8_t* elements)
3482 {
3483  ocean_assert(elements);
3484 
3485  __m128i reversedElements0, reversedElements1, reversedElements2;
3486  reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3487 
3488  store128i(reversedElements0, elements);
3489  store128i(reversedElements1, elements + 16);
3490  store128i(reversedElements2, elements + 32);
3491 }
3492 
3493 inline void SSE::swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second)
3494 {
3495  ocean_assert(first && second && first != second);
3496 
3497  __m128i first0, first1, first2;
3498  reverseElements8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3499 
3500  __m128i second0, second1, second2;
3501  reverseElements8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3502 
3503  store128i(first0, second);
3504  store128i(first1, second + 16);
3505  store128i(first2, second + 32);
3506 
3507  store128i(second0, first);
3508  store128i(second1, first + 16);
3509  store128i(second2, first + 32);
3510 }
3511 
3512 inline void SSE::shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3513 {
3514  ocean_assert(elements && shiftedElements);
3515 
3516  store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0c0f0e0d080b0a09ull, 0x0407060500030201ull)), shiftedElements);
3517 }
3518 
3519 inline void SSE::shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3520 {
3521  ocean_assert(elements && shiftedElements);
3522 
3523  store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0003020104070605ull, 0x080b0a090c0f0e0dull)), shiftedElements);
3524 }
3525 
3526 inline void SSE::shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3527 {
3528  ocean_assert(elements && shiftedElements);
3529 
3530  store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0e0d0c0f0a09080bull, 0x0605040702010003ull)), shiftedElements);
3531 }
3532 
inline void SSE::shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
{
	ocean_assert(elements && shiftedElements);

	// same channel rotation as shiftChannelToBack4Channel32Bit ([c0 c1 c2 c3] -> [c3 c0 c1 c2]),
	// but the order of the four pixels is additionally mirrored (pixel 3 first, pixel 0 last)
	store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0201000306050407ull, 0x0a09080b0e0d0c0full)), shiftedElements);
}
3539 
3540 inline __m128i SSE::sum1Channel8Bit16Elements(const __m128i& elements)
3541 {
3542  const __m128i zero = _mm_setzero_si128();
3543  const __m128i sum = _mm_sad_epu8(elements, zero);
3544 
3545  return _mm_add_epi32(_mm_srli_si128(sum, 8), sum);
3546 }
3547 
3548 inline __m128i SSE::sum1Channel8Bit16Elements(const uint8_t* elements)
3549 {
3550  ocean_assert(elements != nullptr);
3551 
3552  return sum1Channel8Bit16Elements(load128i(elements));
3553 }
3554 
template <bool tBufferHas16Bytes>
inline __m128i SSE::sum1Channel8BitFront15Elements(const uint8_t* elements)
{
	ocean_assert(elements != nullptr);

	// the loader zeroes the byte not covered by the 15 elements, so it does not contribute to the sum
	return sum1Channel8Bit16Elements(load_u8_15_upper_zero<tBufferHas16Bytes>(elements));
}
3561 
inline __m128i SSE::sum1Channel8BitBack15Elements(const uint8_t* elements)
{
	ocean_assert(elements != nullptr);

	// shifting the 16 loaded bytes right by one discards element 0 and zero-fills the top byte, so only elements [1, 15] are summed
	return sum1Channel8Bit16Elements(load_u8_16_and_shift_right<1u>(elements));
}
3567 
inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2)
{
	// the three registers hold 16 interleaved 3-channel pixels; channel phase per register:
	// Interleaved0: R BGR BGR BGR BGR BGR
	// Interleaved1: GR BGR BGR BGR BGR BG
	// Interleaved2: BGR BGR BGR BGR BGR B

	// gather 8 channel-0 bytes into the low half and 8 channel-2 bytes into the high half
	// (0xFF selectors have the high bit set and therefore write zero)
	// BBBBBBBB RRRRRRRR
	const __m128i channel0_2First = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFF0e0b080502ull, 0xFFFF0f0c09060300ull)),
			_mm_shuffle_epi8(interleaved1, set128i(0x070401FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));

	// the remaining 8 channel-0 and 8 channel-2 bytes
	// BBBBBBBB RRRRRRRR
	const __m128i channel0_2Second = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFF0d0aull, 0xFFFFFFFFFF0e0b08ull)),
			_mm_shuffle_epi8(interleaved2, set128i(0x0f0c09060300FFFFull, 0x0d0a070401FFFFFFull)));

	// all 16 channel-1 bytes, gathered from all three registers
	// GGGGGGGG GGGGGGGG
	const __m128i channel1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
				_mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
						_mm_shuffle_epi8(interleaved2, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	const __m128i zero = _mm_setzero_si128();

	// sum-of-absolute-differences against zero adds the bytes of each 64 bit half
	// 0000 BBBB 0000 RRRR
	const __m128i sum0_2 = _mm_add_epi32(_mm_sad_epu8(channel0_2First, zero), _mm_sad_epu8(channel0_2Second, zero));

	// 0000 GGGG 0000 GGGG
	const __m128i sum1 = _mm_sad_epu8(channel1, zero);

	// combine both green partial sums into one 32 bit lane and blend it between the blue and red sums
	// 0000 BBBB GGGG RRRR
	return _mm_blend_epi16(sum0_2, _mm_add_epi32(_mm_slli_si128(sum1, 4), _mm_srli_si128(sum1, 4)), int(0xC));
}
3598 
3599 inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved)
3600 {
3601  ocean_assert(interleaved != nullptr);
3602 
3603  return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32));
3604 }
3605 
inline __m128i SSE::sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved)
{
	ocean_assert(interleaved != nullptr);

	// the third register is loaded from offset 29 (so no memory beyond element 44 is touched) and shifted right by 3 bytes:
	// bytes [29, 31] - already covered by the second load - are dropped, leaving elements [32, 44] in the low 13 bytes and zeros above
	return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3));
}
3612 
3613 inline __m128i SSE::load128iLower64(const void* const buffer)
3614 {
3615  ocean_assert(buffer != nullptr);
3616  return _mm_loadl_epi64((const __m128i*)(buffer));
3617 }
3618 
3619 inline __m128i SSE::load128i(const void* const buffer)
3620 {
3621  ocean_assert(buffer != nullptr);
3622  return _mm_lddqu_si128((const __m128i*)(buffer));
3623 }
3624 
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_10_upper_zero(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// the result holds the 10 buffer bytes in its upper 10 bytes (bytes [6, 15]), the lower 6 bytes are zero

	__m128i result;

#ifdef OCEAN_COMPILER_MSC

	// MSVC exposes the element arrays of __m128i directly
	result.m128i_u64[0] = uint64_t(0);
	memcpy(result.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
	memcpy(result.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));

#else

	// other compilers: access the element arrays through the M128i wrapper union
	M128i& ourResult = *((M128i*)(&result));

	ourResult.m128i_u64[0] = uint64_t(0);
	memcpy(ourResult.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
	memcpy(ourResult.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));

#endif

	return result;
}
3650 
template <>
inline __m128i SSE::load_u8_10_upper_zero<true>(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// the buffer is guaranteed to hold at least 16 bytes:
	// we load 16 bytes and shift the SSE register by 6 byte afterwards, zero-filling the lower 6 bytes
	return _mm_slli_si128(SSE::load128i(buffer), 6);
}
3659 
3660 template <bool tBufferHas16Bytes>
3661 inline __m128i SSE::load_u8_15_upper_zero(const uint8_t* const buffer)
3662 {
3663  ocean_assert(buffer != nullptr);
3664 
3665  __m128i intermediate;
3666  memcpy(&intermediate, buffer, 15);
3667 
3668  // we shift the SSE register by 1 byte afterwards
3669  return _mm_slli_si128(intermediate, 1);
3670 }
3671 
template <>
inline __m128i SSE::load_u8_15_upper_zero<true>(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// the buffer is guaranteed to hold at least 16 bytes:
	// we load 16 bytes and shift the SSE register by 1 byte afterwards, dropping the 16th byte and zero-filling byte 0
	return _mm_slli_si128(_mm_lddqu_si128((__m128i*)(buffer)), 1);
}
3680 
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_13_lower_random(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// only the lower 13 bytes are defined; the upper 3 bytes stay intentionally indeterminate ("random"), callers must ignore them
	__m128i result;
	memcpy(&result, buffer, 13);

	return result;
}
3691 
template <>
inline __m128i SSE::load_u8_13_lower_random<true>(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// the buffer is guaranteed to hold at least 16 bytes:
	// we load the entire 16 bytes to the 128i value as this is the fastest way; the upper 3 bytes are arbitrary buffer content
	return _mm_lddqu_si128((__m128i*)(buffer));
}
3700 
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_15_lower_zero(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// copy the 15 valid bytes into the lower part of the register ...
	__m128i result;
	memcpy(&result, buffer, 15);

	// ... and explicitly zero the top byte (MSVC exposes the element array directly, other compilers go through the M128i wrapper)
#ifdef OCEAN_COMPILER_MSC
	result.m128i_u8[15] = 0u;
#else
	((M128i&)result).m128i_u8[15] = 0u;
#endif

	return result;
}
3717 
template <>
inline __m128i SSE::load_u8_15_lower_zero<true>(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// the buffer is guaranteed to hold at least 16 bytes:
	// we load the entire 16 bytes to the 128i value as this is the fastest way
	__m128i result = _mm_lddqu_si128((__m128i*)(buffer));

	// then overwrite the 16th byte with zero
#ifdef OCEAN_COMPILER_MSC
	result.m128i_u8[15] = 0u;
#else
	((M128i&)result).m128i_u8[15] = 0u;
#endif

	return result;
}
3734 
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_15_lower_random(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// only the lower 15 bytes are defined; the top byte stays intentionally indeterminate ("random"), callers must ignore it
	__m128i result;
	memcpy(&result, buffer, 15);

	return result;
}
3745 
template <>
inline __m128i SSE::load_u8_15_lower_random<true>(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// the buffer is guaranteed to hold at least 16 bytes:
	// we load the entire 16 bytes to the 128i value as this is the fastest way; the top byte is arbitrary buffer content
	return _mm_lddqu_si128((__m128i*)(buffer));
}
3754 
template <unsigned int tShiftBytes>
inline __m128i SSE::load_u8_16_and_shift_right(const uint8_t* const buffer)
{
	static_assert(tShiftBytes <= 16u, "Invalid shift!");

	ocean_assert(buffer != nullptr);

	// unaligned 16 byte load, then a byte-wise right shift zero-filling the upper tShiftBytes bytes
	return _mm_srli_si128(_mm_lddqu_si128((__m128i*)(buffer)), tShiftBytes);
}
3763 
3764 inline void SSE::store128i(const __m128i& value, uint8_t* const buffer)
3765 {
3766  ocean_assert(buffer != nullptr);
3767  _mm_storeu_si128((__m128i*)(buffer), value);
3768 }
3769 
inline __m128i SSE::set128i(const unsigned long long high64, const unsigned long long low64)
{

#ifdef _WINDOWS

	#ifdef _WIN64
	return _mm_set_epi64x(high64, low64);
	#else
	// 32 bit Windows: compose the register from 32 bit halves instead of _mm_set_epi64x
	// NOTE(review): the int* casts type-pun the 64 bit arguments; memcpy into two ints would be the strictly portable form - confirm MSVC-only usage
	return _mm_set_epi32(*(((int*)&high64) + 1), *((int*)&high64), *(((int*)&low64) + 1), *((int*)&low64));
	#endif

#else

	return _mm_set_epi64x(high64, low64);

#endif

}
3788 
3789 inline __m128i SSE::removeHighBits32_16(const __m128i& value)
3790 {
3791  return _mm_and_si128(value, _mm_set1_epi32(int(0x0000FFFFu)));
3792 }
3793 
3794 inline __m128i SSE::removeLowBits32_16(const __m128i& value)
3795 {
3796  return _mm_and_si128(value, _mm_set1_epi32(int(0xFFFF0000u)));
3797 }
3798 
3799 inline __m128i SSE::removeHighBits16_8(const __m128i& value)
3800 {
3801  return _mm_and_si128(value, _mm_set1_epi32(int(0x00FF00FFu)));
3802 }
3803 
inline __m128i SSE::removeHighBits16_8_7_lower(const __m128i& value)
{
	// like removeHighBits16_8, but the topmost (8th) element is zeroed entirely - only the lower seven 16 bit elements keep their low bytes
	return _mm_and_si128(value, set128i(0x000000FF00FF00FFull, 0x00FF00FF00FF00FFull));
}
3808 
inline __m128i SSE::removeHighBits16_8_7_upper(const __m128i& value)
{
	// like removeHighBits16_8, but the lowest element is zeroed entirely - only the upper seven 16 bit elements keep their low bytes
	return _mm_and_si128(value, set128i(0x00FF00FF00FF00FFull, 0x00FF00FF00FF0000ull));
}
3813 
inline __m128i SSE::moveLowBits16_8ToLow64(const __m128i& value)
{
	// gather the low byte of each of the eight 16 bit elements into the low 64 bits; 0xA0 selectors (high bit set) write zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0E0C0A0806040200ull));
}
3818 
inline __m128i SSE::moveLowBits32_8ToLow32(const __m128i& value)
{
	// gather the low byte of each of the four 32 bit elements into the low 32 bits; 0xA0 selectors (high bit set) write zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A0A0A00C080400ull));
}
3823 
inline __m128i SSE::moveLowBits32_16ToLow64(const __m128i& value)
{
	// gather the low 16 bits of each of the four 32 bit elements into the low 64 bits; 0xA0 selectors (high bit set) write zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0D0C090805040100ull));
}
3828 
inline __m128i SSE::moveLowBits16_8ToHigh64(const __m128i& value)
{
	// gather the low byte of each of the eight 16 bit elements into the high 64 bits, the low 64 bits become zero
	return _mm_shuffle_epi8(value, set128i(0x0E0C0A0806040200ull, 0xA0A0A0A0A0A0A0A0ull));
}
3833 
inline __m128i SSE::moveHighBits32_16(const __m128i& value)
{
	// shift each of the four 32 bit integers by 16 to the right and fill with zeros,
	// moving the high 16 bits of each element into its low 16 bits
	return _mm_srli_epi32(value, 16);
}
3839 
inline __m128i SSE::moveHighBits16_8(const __m128i& value)
{
	// per 16 bit element: select the high byte (odd byte indices 1..15) into the low byte and zero the high byte (0xA0 selectors)
	return _mm_shuffle_epi8(value, set128i(0xA00FA00DA00BA009ull, 0xA007A005A003A001ull));
}
3844 
inline __m128i SSE::moveHighBits16_8_5(const __m128i& value)
{
	// like moveHighBits16_8, but only for the lower five 16 bit elements (bytes 1, 3, 5, 7, 9); the remaining elements become zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A009ull, 0xA007A005A003A001ull));
}
3849 
inline __m128i SSE::moveHighBits16_8_6(const __m128i& value)
{
	// like moveHighBits16_8, but only for the lower six 16 bit elements; 0xFF selectors (high bit set) write zero just like 0xA0
	return _mm_shuffle_epi8(value, set128i(0xFFFFFFFFFF0bFF09ull, 0xFF07FF05FF03FF01ull));
}
3854 
inline __m128i SSE::moveHighBits16_8_7(const __m128i& value)
{
	// like moveHighBits16_8, but only for the lower seven 16 bit elements; the topmost element becomes zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A00DA00BA009ull, 0xA007A005A003A001ull));
}
3859 
inline __m128i SSE::shuffleLow32ToLow32_8(const __m128i& value)
{
	// zero-extend the four lowest bytes into the low byte of four 32 bit elements (0xA0 selectors write zero)
	return _mm_shuffle_epi8(value, set128i(0xA0A0A003A0A0A002ull, 0xA0A0A001A0A0A000ull));
}
3864 
inline __m128i SSE::shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value)
{
	// zero-extends the low 8 bytes into eight 16 bit elements in the order {0, 4, 1, 5, 2, 6, 3, 7},
	// pairing each byte with its neighbor four positions away

	// we could also use one of the following mask-defining possibilities, all provide the same result
	// _mm_set_epi8(0x80, 7, 0x80, 3, 0x80, 6, 0x80, 2, 0x80, 5, 0x80, 1, 0x80, 4, 0x80, 0))
	// _mm_set_epi8(0xA0, 7, 0xA0, 3, 0xA0, 6, 0xA0, 2, 0xA0, 5, 0xA0, 1, 0xA0, 4, 0xA0, 0))
	// _mm_set_epi8(0xFF, 7, 0xFF, 3, 0xFF, 6, 0xFF, 2, 0xFF, 5, 0xFF, 1, 0xFF, 4, 0xFF, 0))

	return _mm_shuffle_epi8(value, set128i(0xA007A003A006A002ull, 0xA005A001A004A000ull));
}
3874 
inline __m128i SSE::shuffleNeighbor4High64BitsToLow16_8(const __m128i& value)
{
	// zero-extends the high 8 bytes into eight 16 bit elements in the order {8, 12, 9, 13, 10, 14, 11, 15} (0xA0 selectors write zero)
	return _mm_shuffle_epi8(value, set128i(0xA00FA00BA00EA00Aull, 0xA00DA009A00CA008ull));
}
3879 
inline __m128i SSE::shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value)
{
	// zero-extends the low 8 bytes into eight 16 bit elements in the order {0, 2, 1, 3, 4, 6, 5, 7} (0xFF selectors write zero)
	return _mm_shuffle_epi8(value, set128i(0xFF07FF05FF06FF04ull, 0xFF03FF01FF02FF00ull));
}
3884 
inline __m128i SSE::shuffleNeighbor2High64BitsToLow16_8(const __m128i& value)
{
	// zero-extends the high 8 bytes into eight 16 bit elements in the order {8, 10, 9, 11, 12, 14, 13, 15} (0xFF selectors write zero)
	return _mm_shuffle_epi8(value, set128i(0xFF0FFF0DFF0EFF0Cull, 0xFF0BFF09FF0AFF08ull));
}
3889 
3891 {
3892  return _mm_set1_epi32(int(0x00FF00FFu));
3893 }
3894 
3896 {
3897  return _mm_set1_epi32(int(0x0000FFFFu));
3898 }
3899 
3900 OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1)
3901 {
3902  const __m128i lowProducts = _mm_mullo_epi16(values0, values1);
3903  const __m128i highProducts = _mm_mulhi_epi16(values0, values1);
3904 
3905  products0 = _mm_unpacklo_epi16(lowProducts, highProducts);
3906  products1 = _mm_unpackhi_epi16(lowProducts, highProducts);
3907 }
3908 
3909 OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1)
3910 {
3911  __m128i products0;
3912  __m128i products1;
3913  multiplyInt8x16ToInt32x8(values0, values1, products0, products1);
3914 
3915  results0 = _mm_add_epi32(results0, products0);
3916  results1 = _mm_add_epi32(results1, products1);
3917 }
3918 
3919 inline unsigned int SSE::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
3920 {
3921  ocean_assert(pixel);
3922  ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
3923 
3924  return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
3925 }
3926 
3927 inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
3928 {
3929  ocean_assert(pixel0 && pixel1);
3930 
3931  ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
3932 
3933  return sqrDistance(*pixel0, (uint8_t)interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
3934 }
3935 
3936 inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
3937 {
3938  ocean_assert(pixel0 && pixel1);
3939 
3940  ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
3941  ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
3942 
3943  return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
3944 }
3945 
3946 }
3947 
3948 }
3949 
3950 #endif // OCEAN_HARDWARE_SSE_VERSION >= 41
3951 
3952 #endif // META_OCEAN_CV_SSE_H
This class implements computer vision functions using SSE extensions.
Definition: SSE.h:42
static __m128i divideByRightShiftSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight signed 32 bit values by applying a right shift.
Definition: SSE.h:3108
static void average32Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: SSE.h:2725
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 16 following pixels for a given 1 channel 8 ...
Definition: SSE.h:3113
static unsigned int sum_u32_first_2(const __m128i &value)
Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
Definition: SSE.h:1331
static void average24Elements3Channel24Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition: SSE.h:2808
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition: SSE.h:1265
static void reverseElements8Bit48Elements(const __m128i &elements0, const __m128i &elements1, const __m128i &elements2, __m128i &reversedElements0, __m128i &reversedElements1, __m128i &reversedElements2)
Reverses the order of 48 elements with 8 bit per element.
Definition: SSE.h:3460
static __m128i load128i(const void *const buffer)
Loads a 128i value from the memory.
Definition: SSE.h:3619
static void average16Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: SSE.h:2700
static __m128i load_u8_16_and_shift_right(const uint8_t *const buffer)
Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified ...
Definition: SSE.h:3756
static __m128i moveLowBits32_16ToLow64(const __m128i &value)
Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition: SSE.h:3824
static __m128i moveLowBits32_8ToLow32(const __m128i &value)
Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0...
Definition: SSE.h:3819
static __m128i moveHighBits16_8_6(const __m128i &value)
Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition: SSE.h:3850
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i &value)
Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right s...
Definition: SSE.h:3071
static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d &value)
Adds the two (all two) individual 64 bit float of a m128 value and returns the result.
Definition: SSE.h:1358
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3277
static void store128i(const __m128i &value, uint8_t *const buffer)
Stores a 128i value to the memory.
Definition: SSE.h:3764
static __m128i sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
Definition: SSE.h:1436
static __m128i sumInterleave3Channel8Bit45Elements(const uint8_t *interleaved)
Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition: SSE.h:3606
static __m128i moveLowBits16_8ToHigh64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with ...
Definition: SSE.h:3829
static __m128i divideByRightShiftSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight signed 16 bit values by applying a right shift.
Definition: SSE.h:3066
static __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition: SSE.h:3875
static void swapReversedElements8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
Definition: SSE.h:3493
static __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit pr...
Definition: SSE.h:1374
static void average8ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition: SSE.h:2468
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: SSE.h:1583
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i &values0, const __m128i &values1, __m128i &results0, __m128i &results1)
Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
Definition: SSE.h:3909
static __m128i sumSquareDifference8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 13 elements of an 16 elements buffer with 8 bit prec...
Definition: SSE.h:1463
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
Definition: SSE.h:1340
static __m128i sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition: SSE.h:1381
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition: SSE.h:2525
static __m128i moveHighBits16_8_5(const __m128i &value)
Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition: SSE.h:3845
static __m128i shuffleLow32ToLow32_8(const __m128i &value)
Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
Definition: SSE.h:3860
static void shiftChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition: SSE.h:3512
static __m128i moveHighBits16_8(const __m128i &value)
Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition: SSE.h:3840
static __m128i removeHighBits16_8_7_upper(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
Definition: SSE.h:3809
static void deInterleave3Channel8Bit45Elements(const uint8_t *interleaved, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3338
static unsigned int value_u32(const __m128i &value)
Returns one specific 32 bit unsigned integer value of a m128i value object.
Definition: SSE.h:1311
static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const __m128i &channel0, const __m128i &channel1, const __m128i &channel2, __m128i &interleavedA, __m128i &interleavedB, __m128i &interleavedC)
Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3345
static __m128i load_u8_15_upper_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition: SSE.h:3661
static __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition: SSE.h:3880
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition: SSE.h:1260
static __m128i sum1Channel8Bit16Elements(const __m128i &elements)
Sums 16 elements with 8 bit per element.
Definition: SSE.h:3540
static __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition: SSE.h:3865
static void average8Elements2Channel64Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
Definition: SSE.h:2670
static __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative signed 16 bit value, so that each value can be right shifted to al...
Definition: SSE.h:3047
static __m128i load_u8_15_lower_random(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition: SSE.h:3736
static __m128i removeHighBits16_8_7_lower(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
Definition: SSE.h:3804
static void average8Elements4Channel128Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
Definition: SSE.h:2868
static __m128i load_u8_10_upper_zero(const uint8_t *const buffer)
Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes,...
Definition: SSE.h:3626
static __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for 16 elements of an 16 elements buffer with 8 bit precision.
Definition: SSE.h:1543
static __m128i moveHighBits32_16(const __m128i &value)
Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
Definition: SSE.h:3834
static void average16Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition: SSE.h:2896
static __m128i moveHighBits16_8_7(const __m128i &value)
Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition: SSE.h:3855
static __m128i bitMaskRemoveHigh32_16()
Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
Definition: SSE.h:3895
static __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition: SSE.h:1550
static __m128i removeHighBits32_16(const __m128i &value)
Removes the higher 16 bits of four 32 bit elements.
Definition: SSE.h:3789
static __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition: SSE.h:3885
static void average6Elements3Channel96Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
Definition: SSE.h:2771
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition: SSE.h:2264
static __m128i interpolation3Channel24Bit12Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition: SSE.h:2077
static __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to al...
Definition: SSE.h:3089
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition: SSE.h:2117
static void average8Elements1Channel32Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
Definition: SSE.h:2410
static void shiftChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition: SSE.h:3526
static void average8Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: SSE.h:2444
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3289
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static __m128i interpolation1Channel8Bit15Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: SSE.h:2025
static uint16_t value_u16(const __m128i &value)
Returns one specific 16 bit unsigned integer value of a m128i value object.
Definition: SSE.h:1299
static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2, __m128i &reversedInterleaved0, __m128i &reversedInterleaved1, __m128i &reversedInterleaved2)
Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channe...
Definition: SSE.h:3387
static __m128i sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition: SSE.h:1367
static __m128i removeLowBits32_16(const __m128i &value)
Removes the lower 16 bits of four 32 bit elements.
Definition: SSE.h:3794
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: SSE.h:1733
static uint8_t value_u8(const __m128i &value)
Returns one specific 8 bit unsigned integer value of a m128i value object.
Definition: SSE.h:1276
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 16 fol...
Definition: SSE.h:3169
static __m128i bitMaskRemoveHigh16_8()
Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
Definition: SSE.h:3890
static __m128i removeHighBits16_8(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements.
Definition: SSE.h:3799
static __m128i sum1Channel8BitBack15Elements(const uint8_t *elements)
Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is ...
Definition: SSE.h:3562
static __m128i load_u8_15_lower_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition: SSE.h:3702
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition: SSE.h:3304
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition: SSE.h:1533
static __m128i sumInterleave3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2)
Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition: SSE.h:3568
static void average32Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition: SSE.h:2920
static void average30Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition: SSE.h:2967
static __m128i sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
Definition: SSE.h:1491
static __m128i sum1Channel8BitFront15Elements(const uint8_t *elements)
Sums the first 15 elements of a buffer with 8 bit per element.
Definition: SSE.h:3556
static void average32ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 32 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition: SSE.h:2613
static void average32Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: SSE.h:2547
static __m128i sumSquareDifference8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 12 elements of an 16 elements buffer with 8 bit prec...
Definition: SSE.h:1408
static OCEAN_FORCE_INLINE float sum_f32_4(const __m128 &value)
Adds the four (all four) individual 32 bit float of a m128 value and returns the result.
Definition: SSE.h:1349
static __m128i load_u8_13_lower_random(const uint8_t *const buffer)
Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes,...
Definition: SSE.h:3682
static void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interl...
Definition: SSE.h:3441
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition: SSE.h:1322
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition: SSE.h:1270
static __m128i moveLowBits16_8ToLow64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition: SSE.h:3814
static __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
Definition: SSE.h:1518
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition: SSE.h:3919
static void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition: SSE.h:3533
static __m128i load128iLower64(const void *const buffer)
Loads the lower 64 bit of a 128i value from the memory.
Definition: SSE.h:3613
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition: SSE.h:3927
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition: SSE.h:3770
static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels...
Definition: SSE.h:3412
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i &value)
Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right s...
Definition: SSE.h:3028
static void average8Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: SSE.h:2645
static void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition: SSE.h:3519
static __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
Definition: SSE.h:1526
static void average16Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: SSE.h:2490
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i &values0, const __m128i &values1, __m128i &products0, __m128i &products1)
Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
Definition: SSE.h:3900
static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels...
Definition: SSE.h:3372
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition: SSE.h:1879
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15
This union defines a wrapper for the __m128 SSE intrinsic data type.
Definition: SSE.h:71
float m128_f32[4]
The four 32 bit elements.
Definition: SSE.h:73
This union defines a wrapper for the __m128d SSE intrinsic data type.
Definition: SSE.h:82
double m128d_f64[2]
The two 64 bit elements.
Definition: SSE.h:84
This union defines a wrapper for the __m128i SSE intrinsic data type.
Definition: SSE.h:51
uint64_t m128i_u64[2]
The two 64 bit elements.
Definition: SSE.h:53
uint16_t m128i_u16[8]
The eight 16 bit elements.
Definition: SSE.h:59
uint32_t m128i_u32[4]
The four 32 bit elements.
Definition: SSE.h:56
uint8_t m128i_u8[16]
The sixteen 8 bit elements.
Definition: SSE.h:62