Ocean
Loading...
Searching...
No Matches
SSE.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_SSE_H
9#define META_OCEAN_CV_SSE_H
10
11#include "ocean/cv/CV.h"
12
14
15#include "ocean/math/Numeric.h"
16
17#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
18
19// SSE2 include files
20#include <emmintrin.h>
21#include <immintrin.h>
22#include <mmintrin.h>
23
24// SSE3 include files
25#include <pmmintrin.h>
26#include <mmintrin.h>
27
28// SSE4 include files
29#include <smmintrin.h>
30
31namespace Ocean
32{
33
34namespace CV
35{
36
37/**
38 * This class implements computer vision functions using SSE extensions.
39 * @ingroup cv
40 */
41class SSE
42{
43 public:
44
45#if !defined(OCEAN_COMPILER_MSC)
46
47 /**
48 * This union defines a wrapper for the __m128i SSE intrinsic data type.
49 */
50 union M128i
51 {
52 /// The two 64 bit elements.
53 uint64_t m128i_u64[2];
54
55 /// The four 32 bit elements.
56 uint32_t m128i_u32[4];
57
58 /// The eight 16 bit elements.
59 uint16_t m128i_u16[8];
60
61 /// The sixteen 8 bit elements.
62 uint8_t m128i_u8[16];
63 };
64
65 static_assert(sizeof(M128i) == 16, "Invalid data type!");
66
67 /**
68 * This union defines a wrapper for the __m128 SSE intrinsic data type.
69 */
70 union M128
71 {
72 /// The four 32 bit elements.
73 float m128_f32[4];
74 };
75
76 static_assert(sizeof(M128) == 16, "Invalid data type!");
77
78 /**
79 * This union defines a wrapper for the __m128d SSE intrinsic data type.
80 */
81 union M128d
82 {
83 /// The two 64 bit elements.
84 double m128d_f64[2];
85 };
86
87 static_assert(sizeof(M128d) == 16, "Invalid data type!");
88
89#endif
90
91 public:
92
93 /**
94 * Prefetches a block of temporal memory into all cache levels.
95 * @param data Data to be prefetched
96 */
97 static inline void prefetchT0(const void* const data);
98
99 /**
100 * Prefetches a block of temporal memory in all cache levels except 0th cache level.
101 * @param data Data to be prefetched
102 */
103 static inline void prefetchT1(const void* const data);
104
105 /**
106 * Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
107 * @param data Data to be prefetched
108 */
109 static inline void prefetchT2(const void* const data);
110
111 /**
112 * Prefetches a block of non-temporal memory into non-temporal cache structure.
113 * @param data Data to be prefetched
114 */
115 static inline void prefetchNTA(const void* const data);
116
117 /**
118 * Returns one specific 8 bit unsigned integer value of a m128i value object.
119 * @param value The value from which the 8 bit value will be returned
120 * @return The requested 8 bit value
121 * @tparam tIndex The index of the requested 8 bit integer value, with range [0, 15]
122 */
123 template <unsigned int tIndex>
124 static inline uint8_t value_u8(const __m128i& value);
125
126 /**
127 * Returns one specific 8 bit unsigned integer value of a m128i value object.
128 * @param value The value from which the 8 bit value will be returned
129 * @param index The index of the requested 8 bit integer value, with range [0, 15]
130 * @return The requested 8 bit value
131 */
132 static inline uint8_t value_u8(const __m128i& value, const unsigned int index);
133
134 /**
135 * Returns one specific 16 bit unsigned integer value of a m128i value object.
136 * @param value The value from which the 16 bit value will be returned
137 * @return The requested 16 bit value
138 * @tparam tIndex The index of the requested 16 bit integer value, with range [0, 7]
139 */
140 template <unsigned int tIndex>
141 static inline uint16_t value_u16(const __m128i& value);
142
143 /**
144 * Returns one specific 32 bit unsigned integer value of a m128i value object.
145 * @param value The value from which the 32 bit value will be returned
146 * @return The requested 32 bit value
147 * @tparam tIndex The index of the requested 32 bit integer value, with range [0, 3]
148 */
149 template <unsigned int tIndex>
150 static inline unsigned int value_u32(const __m128i& value);
151
152 /**
153 * Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the result.
154 * @param value The value which elements will be added
155 * @return The resulting sum value
156 */
157 static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i& value);
158
159 /**
160 * Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
161 * @param value The value which elements will be added
162 * @return The resulting sum value
163 */
164 static inline unsigned int sum_u32_first_2(const __m128i& value);
165
166 /**
167 * Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
168 * @param value The value which elements will be added
169 * @return The resulting sum value
170 */
171 static inline unsigned int sum_u32_first_third(const __m128i& value);
172
173 /**
174 * Adds the four (all four) individual 32 bit float of a m128 value and returns the result.
175 * @param value The value which elements will be added
176 * @return The resulting sum value
177 */
178 static OCEAN_FORCE_INLINE float sum_f32_4(const __m128& value);
179
180 /**
181 * Adds the two (all two) individual 64 bit floats of a m128d value and returns the result.
182 * @param value The value which elements will be added
183 * @return The resulting sum value
184 */
185 static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d& value);
186
187 /**
188 * Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit precision.
189 * @param image0 First 11 elements to determine the ssd for, may be non aligned
190 * @param image1 Second 11 elements to determine the ssd for, may be non aligned
191 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
192 */
193 static inline __m128i sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
194
195 /**
196 * Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit precision, the remaining 4 elements are set to zero.
197 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
198 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [00 01 02 03 04 05 06 07 08 09 10 11 NA NA NA NA].
199 * @param image0 First 12 (+4) elements to determine the ssd for, with any alignment
200 * @param image1 Second 12 (+4) elements to determine the ssd for, with any alignment
201 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
202 */
203 static inline __m128i sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
204
205 /**
206 * Sum square difference determination for the last 12 elements of an 16 elements buffer with 8 bit precision, the beginning 4 elements are interpreted as zero.
207 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
208 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA NA 04 05 06 07 08 09 10 11 12 13 14 15].
209 * @param image0 First (4+) 12 elements to determine the ssd for, with any alignment
210 * @param image1 Second (4+) 12 elements to determine the ssd for, with any alignment
211 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
212 */
213 static inline __m128i sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
214
215 /**
216 * Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
217 * This function supports to load the 13 elements from a buffer with only 13 bytes or with a buffer with at least 16 bytes.
218 * @param image0 First 13 elements to determine the ssd for, may be non aligned
219 * @param image1 Second 13 elements to determine the ssd for, may be non aligned
220 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
221 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 13 bytes only
222 */
223 template <bool tBufferHas16Bytes>
224 static inline __m128i sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
225
226 /**
227 * Sum square difference determination for the last 13 elements of an 16 elements buffer with 8 bit precision, the beginning 3 elements are interpreted as zero.
228 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
229 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA 03 04 05 06 07 08 09 10 11 12 13 14 15].
230 * @param image0 First (3+) 13 elements to determine the ssd for, may be non aligned
231 * @param image1 Second (3+) 13 elements to determine the ssd for, may be non aligned
232 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
233 */
234 static inline __m128i sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
235
236 /**
237 * Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
238 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
239 * @param image0 First 15 elements to determine the ssd for, may be non aligned
240 * @param image1 Second 15 elements to determine the ssd for, may be non aligned
241 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
242 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
243 */
244 template <bool tBufferHas16Bytes>
245 static inline __m128i sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
246
247 /**
248 * Sum square difference determination for 16 elements with 8 bit precision.
249 * @param image0 First 16 elements to determine the ssd for, may be non aligned
250 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
251 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
252 */
253 static inline __m128i sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
254
255 /**
256 * Sum square difference determination for 16 elements with 8 bit precision.
257 * @param image0 First 16 elements to determine the ssd for, may be non aligned
258 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
259 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
260 */
261 static inline __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1);
262
263 /**
264 * Sum square difference determination for 16 elements with 8 bit precision.
265 * @param row0 First 16 elements to determine the ssd for
266 * @param row1 Second 16 elements to determine the ssd for
267 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
268 */
269 static inline __m128i sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1);
270
271 /**
272 * Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
273 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
274 * @param image0 First row of 8 elements
275 * @param image1 Second row of 8 elements
276 * @param result Resulting 4 average elements
277 */
278 static inline void average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result);
279
280 /**
281 * Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
282 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
283 * @param image0 First row of 8 elements
284 * @param image1 Second row of 8 elements
285 * @param result Resulting 4 average elements
286 */
287 static inline void average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
288
289 /**
290 * Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
291 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
292 * @param image0 First row of 8 elements, must be valid
293 * @param image1 Second row of 8 elements, must be valid
294 * @param result Resulting 4 average elements, must be valid
295 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
296 */
297 static inline void average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
298
299 /**
300 * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
301 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
302 * @param image0 First row of 16 elements, must be valid
303 * @param image1 Second row of 16 elements, must be valid
304 * @param result Resulting 8 average elements, must be valid
305 */
306 static inline void average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
307
308 /**
309 * Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
310 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
311 * @param image0 First row of 16 elements, must be valid
312 * @param image1 Second row of 16 elements, must be valid
313 * @param result Resulting 8 average elements, must be valid
314 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
315 */
316 static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
317
318 /**
319 * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
320 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
321 * @param image0 First row of 32 elements
322 * @param image1 Second row of 32 elements
323 * @param result Resulting 16 average elements
324 */
325 static inline void average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
326
327 /**
328 * Averages 32 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
329 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
330 * @param image0 First row of 32 elements, must be valid
331 * @param image1 Second row of 32 elements, must be valid
332 * @param result Resulting 16 average elements, must be valid
333 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
334 */
335 static inline void average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
336
337 /**
338 * Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
339 * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels, each with 2 channels).<br>
340 * @param image0 First row of 8 elements
341 * @param image1 Second row of 8 elements
342 * @param result Resulting 4 average elements
343 */
344 static inline void average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
345
346 /**
347 * Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
348 * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels).<br>
349 * @param image0 First row of 8 elements
350 * @param image1 Second row of 8 elements
351 * @param result Resulting 4 average elements
352 */
353 static inline void average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result);
354
355 /**
356 * Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
357 * The function takes two rows of 16 elements and returns 8 average elements (4 averaged pixels, each with 2 channels).<br>
358 * @param image0 First row of 16 elements
359 * @param image1 Second row of 16 elements
360 * @param result Resulting 8 average elements
361 */
362 static inline void average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
363
364 /**
365 * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
366 * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).<br>
367 * @param image0 First row of 32 elements
368 * @param image1 Second row of 32 elements
369 * @param result Resulting 16 average elements
370 */
371 static inline void average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
372
373 /**
374 * Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
375 * The function takes two rows of 6 elements and returns 3 average elements (1 averaged pixel, with 3 channels).<br>
376 * @param image0 First row of 6 elements
377 * @param image1 Second row of 6 elements
378 * @param result Resulting 3 average elements
379 */
380 static inline void average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result);
381
382 /**
383 * Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
384 * The function takes two rows of 24 elements and returns 12 average elements (4 averaged pixels, each with 3 channels).<br>
385 * @param image0 First row of 24 elements
386 * @param image1 Second row of 24 elements
387 * @param result Resulting 12 average elements
388 */
389 static inline void average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
390
391 /**
392 * Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
393 * The function takes two rows of 8 elements and returns 4 average elements (1 averaged pixel).<br>
394 * @param image0 First row of 8 elements
395 * @param image1 Second row of 8 elements
396 * @param result Resulting 4 average elements
397 */
398 static inline void average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result);
399
400 /**
401 * Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
402 * The function takes two rows of 16 elements and returns 8 average elements (2 averaged pixels, each with 4 channels).<br>
403 * @param image0 First row of 16 elements
404 * @param image1 Second row of 16 elements
405 * @param result Resulting 8 average elements
406 */
407 static inline void average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
408
409 /**
410 * Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
411 * The function takes two rows of 32 elements and returns 16 average elements (4 averaged pixels, each with 4 channels).<br>
412 * @param image0 First row of 32 elements
413 * @param image1 Second row of 32 elements
414 * @param result Resulting 16 average elements
415 */
416 static inline void average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
417
418 /**
419 * Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
420 * The function takes three rows of 30 elements and returns 10 average elements (10 averaged pixels).<br>
421 * @param image0 First row of 30 elements
422 * @param image1 Second row of 30 elements
423 * @param image2 Third row of 30 elements
424 * @param result Resulting 10 average elements
425 */
426 static inline void average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
427
428 /**
429 * Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
430 * This function must be invoked before the right shift is applied.
431 * @param value The eight signed 16 bit values to be handled
432 * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
433 */
434 static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i& value);
435
436 /**
437 * Adds 2^shifts - 1 to each negative int16_t value, so that each value can be right shifted to allow a correct division by 2^shifts.
438 * This function must be invoked before the right shift is applied.
439 * @param value The eight int16_t values to be handled, with range (-infinity, infinity)
440 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
441 * @return The modified value for which division and shift yield equal (and correct!) results
442 */
443 static inline __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts);
444
445 /**
446 * Divides eight int16_t values by applying a right shift.
447 * The function can divide positive and negative values correctly (but without rounding).
448 * @param value The eight int16_t values to be divided, with range (-infinity, infinity)
449 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
450 * @return The divided values
451 */
452 static inline __m128i divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts);
453
454 /**
455 * Applies a rounded division by a right shift for eight int16_t values.
456 * The function can divide positive and negative values correctly (and handles rounding).<br>
457 * However, this function has a specific value range for the input values:
458 * <pre>
459 * maxValue = (2^15 - 1) - 2^(rightShifts - 1) = 32767 - 2^(rightShifts - 1)
460 * </pre>
461 * @param value The eight int16_t values to be divided, with range [-maxValue, maxValue]
462 * @param rightShifts The number of right shifts which needs to be applied, with range [1, 15]
463 * @return The divided values
464 * @see maximalValueForRoundedDivisionByRightShiftSigned16Bit().
465 */
466 static inline __m128i roundedDivideByRightShiftSigned16Bit(const __m128i& value_s16x8, const unsigned int rightShifts);
467
468 /**
469 * Returns the maximal value for which the function roundedDivideByRightShiftSigned16Bit() can be applied.
470 * @param rightShifts The number of right shifts which needs to be applied, with range [1, 15]
471 * @return The maximal value, which is 32767 - 2^(rightShifts - 1).
472 */
473 static inline int16_t maximalValueForRoundedDivisionByRightShiftSigned16Bit(const unsigned int rightShifts);
474
475 /**
476 * Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
477 * This function must be invoked before the right shift is applied.
478 * @param value The eight signed 32 bit values to be handled
479 * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
480 */
481 static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i& value);
482
483 /**
484 * Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to allow a correct division by 2^shifts.
485 * This function must be invoked before the right shift is applied.
486 * @param value The eight signed 32 bit values to be handled
487 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 31]
488 * @return The modified value for which division and shift yield equal (and correct!) results
489 */
490 static inline __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts);
491
492 /**
493 * Divides eight signed 32 bit values by applying a right shift.
494 * This is able to determine the correct division result for positive and negative 32 bit values.
495 * @param value The eight signed 32 bit values to be handled
496 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 32]
497 * @return The divided values
498 */
499 static inline __m128i divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts);
500
501 /**
502 * Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 bit frame.
503 * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
504 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
505 * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
506 * @param width The width of the original frame in pixel, with range [10, infinity)
507 */
508 static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
509
510 /**
511 * Determines the squared horizontal and vertical gradients and the product of both gradients for 8 following pixels for a given 1 channel 8 bit frame.
512 * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
513 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
514 * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
515 * @param width The width of the original frame in pixel, with range [10, infinity)
516 */
517 static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
518
519 /**
520 * Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
521 * @param image0 First 11 elements to determine the sad for, may be non aligned
522 * @param image1 Second 11 elements to determine the sad for, may be non aligned
523 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
524 */
525 static inline __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
526
527 /**
528 * Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
529 * This function supports to load the 10 elements from a buffer with only 10 bytes or with a buffer with at least 16 bytes.
530 * @param image0 First 10 elements to determine the sad for, may be non aligned
531 * @param image1 Second 10 elements to determine the sad for, may be non aligned
532 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
533 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 10 bytes only
534 */
535 template <bool tBufferHas16Bytes>
536 static inline __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
537
538 /**
539 * Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
540 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
541 * @param image0 First 15 elements to determine the sad for, may be non aligned
542 * @param image1 Second 15 elements to determine the sad for, may be non aligned
543 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
544 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
545 */
546 template <bool tBufferHas16Bytes>
547 static inline __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
548
549 /**
550 * Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
551 * The first interpolation element results from the first and second element of both rows.<br>
552 * The second interpolation element results from the second and third element of both rows.<br>
553 * ...<br>
554 * The eighth interpolation element results from the eighth and ninth.<br>
555 * The interpolation is specified by tx and ty with range [0, 128u].<br>
556 * @param values0 First row of 9 elements to be interpolated
557 * @param values1 Second row of 9 elements to be interpolated
558 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
559 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
560 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
561 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
562 * @return Interpolation result for 8 elements, which are 8 pixels
563 */
564 static inline __m128i interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
565
566 /**
567 * Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
568 * The first interpolation element results from the first and second element of both rows.<br>
569 * The second interpolation element results from the second and third element of both rows.<br>
570 * ...<br>
571 * The eighth interpolation element results from the eighth and ninth.<br>
572 * The interpolation is specified by tx and ty with range [0, 128u].<br>
573 * @param values0 First row of 10 elements to be interpolated
574 * @param values1 Second row of 10 elements to be interpolated
575 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
576 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
577 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
578 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
579 * @return Interpolation result for 8 elements, which are 4 pixels
580 */
581 static inline __m128i interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
582
583 /**
584 * Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
585 * The first interpolation element results from the first and second element of both rows.<br>
586 * The second interpolation element results from the second and third element of both rows.<br>
587 * ...<br>
588 * The eighth interpolation element results from the eighth and ninth.<br>
589 * The interpolation is specified by tx and ty with range [0, 128u].<br>
590 * @param values0 First row of 11 elements to be interpolated
591 * @param values1 Second row of 11 elements to be interpolated
592 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
593 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
594 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
595 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
596 * @return Interpolation result for 8 elements, which are (2 2/3 pixels)
597 */
598 static inline __m128i interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
599
600 /**
601 * Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
602 * The interpolation is specified by tx and ty with range [0, 128u].<br>
603 * @param values0 First row of 16 elements to be interpolated
604 * @param values1 Second row of 16 elements to be interpolated
605 * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
606 * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
607 * @return Interpolation result for 15 elements, which are (15 pixels)
608 */
609 static inline __m128i interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
610
611 /**
612 * Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
613 * The interpolation is specified by tx and ty with range [0, 128u].<br>
614 * @param values0 First row of 15 elements to be interpolated
615 * @param values1 Second row of 15 elements to be interpolated
616 * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
617 * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
618 * @return Interpolation result for 12 elements, which are (4 pixels)
619 */
620 static inline __m128i interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
621
622 /**
623 * Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
624 * The first interpolation element results from the first and second element of both rows.<br>
625 * The second interpolation element results from the second and third element of both rows.<br>
626 * ...<br>
627 * The eighth interpolation element results from the eighth and ninth.<br>
628 * The interpolation is specified by tx and ty with range [0, 128u].<br>
629 * @param values0 First row of 12 elements to be interpolated
630 * @param values1 Second row of 12 elements to be interpolated
631 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
632 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
633 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
634 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
635 * @return Interpolation result for 8 elements, which are (2 pixels)
636 */
637 static inline __m128i interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
638
639 /**
640 * Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit frames.
641 * The first interpolation element results from the first and second element of both rows.<br>
642 * The second interpolation element results from the second and third element of both rows.<br>
643 * ...<br>
644 * The eighth interpolation element results from the eighth and ninth.<br>
645 * The interpolation is specified by tx and ty with range [0, 128u].<br>
646 * @param values0 First row of 16 elements to be interpolated
647 * @param values1 Second row of 16 elements to be interpolated
648 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
649 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
650 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
651 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
652 * @return Interpolation result for 8 elements, which are (2 pixels)
653 */
654 static inline __m128i interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
655
656 /**
657 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
658 * @param pixel0 Upper left pixel in the first frame
659 * @param pixel1 Upper left pixel in the second frame
660 * @param size0 Size of one row of the first frame in bytes
661 * @param size1 Size of one row of the second frame in bytes
662 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
663 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
664 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
665 * @param f1xy Product of the fx and the fy interpolation factor for the second image
666 * @return Interpolated sum of square difference
667 */
668 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
669
670 /**
671 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
672 * @param pixel0 Upper left pixel in the first frame
673 * @param pixel1 Upper left pixel in the second frame
674 * @param size0 Size of one row of the first frame in bytes
675 * @param size1 Size of one row of the second frame in bytes
676 * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
677 * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
678 * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
679 * @param f0xy Product of the fx and the fy interpolation factor for the first image
680 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
681 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
682 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
683 * @param f1xy Product of the fx and the fy interpolation factor for the second image
684 * @return Interpolated sum of square difference
685 */
686 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
687
688 /**
689 * Sum of absolute differences determination for 16 elements of a 16 elements buffer with 8 bit precision.
690 * @param image0 First 16 elements to determine the sad for, may be non aligned
691 * @param image1 Second 16 elements to determine the sad for, may be non aligned
692 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
693 */
694 static inline __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
695
696 /**
697 * Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
698 * This function converts X CBA CBA CBA CBA CBA to 00000000000CCCCC 000BBBBB000AAAAA.
699 * @param interleaved The 15 elements holding the interleaved image data
700 * @param channel01 Resulting first and second channel elements, first 8 elements of the first channel, followed by 8 elements of the second channel
701 * @param channel2 Resulting third channel elements, first 8 elements of the third channel, followed by zeros
702 */
703 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2);
704
705 /**
706 * Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
707 * This function converts XX XXX XXX CBA CBA CB A CBA CBA CBA CBA CBA to 00000000CCCCCCCC BBBBBBBBAAAAAAAA.
708 * @param interleavedA First 16 elements holding the interleaved image data
709 * @param interleavedB Second 16 elements holding the interleaved image data, the first 8 elements will be used only
710 * @param channel01 Resulting first and second channel elements, first 8 elements of the first channel, followed by 8 elements of the second channel
711 * @param channel2 Resulting third channel elements, first 8 elements of the third channel, followed by zeros
712 */
713 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2);
714
715 /**
716 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
717 * This function converts CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA to CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA.
718 * @param interleavedA First 16 elements holding the interleaved image data
719 * @param interleavedB Second 16 elements holding the interleaved image data
720 * @param interleavedC Third 16 elements holding the interleaved image data
721 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
722 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
723 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
724 */
725 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2);
726
727 /**
728 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
729 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
730 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
731 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
732 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
733 */
734 static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
735
736 /**
737 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
738 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes), must be valid
739 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively, must be valid
740 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively, must be valid
741 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively, must be valid
742 */
743 static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2);
744
745 /**
746 * Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
747 * @param interleaved 45 elements of an image with 3 channels and 8 bit per element (45 bytes), must be valid
748 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
749 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
750 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
751 */
752 static inline void deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
753
754 /**
755 * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
756 * This function converts CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA to CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA.
757 * @param channel0 The 16 elements of the first channel to be interleaved
758 * @param channel1 The 16 elements of the second channel to be interleaved
759 * @param channel2 The 16 elements of the third channel to be interleaved
760 * @param interleavedA Resulting first 16 elements of the interleaved data
761 * @param interleavedB Resulting second 16 elements of the interleaved data
762 * @param interleavedC Resulting third 16 elements of the interleaved data
763 */
764 OCEAN_FORCE_INLINE static void interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC);
765
766 /**
767 * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
768 * @param channel0 The 16 elements of the first channel to be interleaved, must be valid
769 * @param channel1 The 16 elements of the second channel to be interleaved, must be valid
770 * @param channel2 The 16 elements of the third channel to be interleaved, must be valid
771 * @param interleaved The resulting 48 interleaved elements, must be valid
772 */
773 static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved);
774
775 /**
776 * Stores 8 single-channel 8-bit elements as 24 interleaved 3-channel elements (8 elements -> 8×3 = 24 bytes).
777 * Each input element is replicated to all 3 channels.
778 * @param singleChannel_u_8x8 The input with 8 single-channel elements in lower 8 bytes
779 * @param interleaved Pointer to 24 bytes where interleaved data will be stored, must be valid
780 */
781 static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo3Channels24Elements(const __m128i& singleChannel_u_8x8, uint8_t* interleaved);
782
783 /**
784 * Stores 8 single-channel 8-bit elements as 32 interleaved 4-channel elements (8 elements -> 8×4 = 32 bytes) with constant 4th channel value.
785 * Each input element is replicated to the first 3 channels, with a constant value for the 4th channel.
786 * @param singleChannel_u_8x8 The input with 8 single-channel elements in lower 8 bytes
787 * @param lastChannelValue The constant value for the last channel
788 * @param interleaved Pointer to 32 bytes where interleaved data will be stored, must be valid
789 */
790 static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo4Channels32ElementsWithConstantLastChannel(const __m128i& singleChannel_u_8x8, const uint8_t lastChannelValue, uint8_t* interleaved);
791
792 /**
793 * Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels and 8 bit per element (e.g., YA16 to AY16).
794 * @param interleaved 16 elements of an image with 2 channels and 8 bit per element (32 bytes)
795 * @param reversedInterleaved Resulting 32 elements with reversed channel order
796 */
797 static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
798
799 /**
800 * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element.
801 * @param interleaved0 First 16 elements holding the interleaved image data
802 * @param interleaved1 Second 16 elements holding the interleaved image data
803 * @param interleaved2 Third 16 elements holding the interleaved image data
804 * @param reversedInterleaved0 Resulting first 16 elements holding the interleaved image data with reversed channel order
805 * @param reversedInterleaved1 Resulting second 16 elements holding the interleaved image data with reversed channel order
806 * @param reversedInterleaved2 Resulting third 16 elements holding the interleaved image data with reversed channel order
807 */
808 static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2);
809
810 /**
811 * Reverses the order of the first and last channel of 48 elements (16 pixels) of an image with 3 interleaved channels and 8 bit per element (e.g., RGB24 to BGR24).
812 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
813 * @param reversedInterleaved Resulting 48 elements with reversed channel order
814 */
815 static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
816
817 /**
818 * Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels and 8 bit per element (e.g., RGBA32 to ABGR32).
819 * @param interleaved 64 elements of an image with 4 channels and 8 bit per element (64 bytes)
820 * @param reversedInterleaved Resulting 64 elements with reversed channel order
821 */
822 static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
823
824 /**
825 * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element (in place).
826 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
827 */
828 static void reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved);
829
830 /**
831 * Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interleaved channels and 8 bit per element and further swaps both sets.
832 * @param first First 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
833 * @param second Second 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
834 */
835 static inline void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second);
836
837 /**
838 * Reverses the order of 48 elements with 8 bit per element.
839 * @param elements0 First 16 elements
840 * @param elements1 Second 16 elements
841 * @param elements2 Third 16 elements
842 * @param reversedElements0 Resulting reversed first 16 elements
843 * @param reversedElements1 Resulting reversed second 16 elements
844 * @param reversedElements2 Resulting reversed third 16 elements
845 */
846 static inline void reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2);
847
848 /**
849 * Reverses the order of 48 elements with 8 bit per element.
850 * @param elements 48 elements that will be reversed
851 * @param reversedElements Resulting reversed 48 elements
852 */
853 static inline void reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements);
854
855 /**
856 * Reverses the order of 48 elements with 8 bit per element (in place).
857 * @param elements 48 elements that will be reversed
858 */
859 static inline void reverseElements8Bit48Elements(uint8_t* elements);
860
861 /**
862 * Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
863 * @param first First 48 elements that will be reversed and swapped with the second 48 elements
864 * @param second Second 48 elements that will be reversed and swapped with the first 48 elements
865 */
866 static inline void swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second);
867
868 /**
869 * Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back channel.
870 * The function takes four pixels DCBA DCBA DCBA DCBA and provides ADCB ADCB ADCB ADCB.<br>
871 * @param elements 16 elements of 4 pixels to be shifted
872 * @param shiftedElements Resulting shifted elements
873 */
874 static inline void shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
875
876 /**
877 * Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back channel and mirrors the four individual pixels.
878 * @param elements 16 elements of 4 pixels to be shifted and mirrored
879 * @param shiftedElements Resulting shifted and mirrored elements
880 */
881 static inline void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
882
883 /**
884 * Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front channel.
885 * The function takes four pixels DCBA DCBA DCBA DCBA and provides CBAD CBAD CBAD CBAD.<br>
886 * @param elements 16 elements of 4 pixels to be shifted
887 * @param shiftedElements Resulting shifted elements
888 */
889 static inline void shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
890
891 /**
892 * Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front channel and mirrors the four individual pixels.
893 * @param elements 16 elements of 4 pixels to be shifted and mirrored
894 * @param shiftedElements Resulting shifted and mirrored elements
895 */
896 static inline void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
897
898 /**
899 * Sums 16 elements with 8 bit per element.
900 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
901 * @param elements 16 elements holding the image data
902 * @return Resulting sums
903 */
904 static inline __m128i sum1Channel8Bit16Elements(const __m128i& elements);
905
906 /**
907 * Sums 16 elements with 8 bit per element.
908 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
909 * @param elements 16 elements holding the image data
910 * @return Resulting sums
911 */
912 static inline __m128i sum1Channel8Bit16Elements(const uint8_t* elements);
913
914 /**
915 * Sums the first 15 elements of a buffer with 8 bit per element.
916 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.<br>
917 * If the provided buffer holds at least 16 bytes the load function is much faster compared to the case if the buffer is not larger than 15 bytes.<br>
918 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
919 * @param elements 16 elements holding the image data
920 * @return Resulting sums
921 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
922 */
923 template <bool tBufferHas16Bytes>
924 static inline __m128i sum1Channel8BitFront15Elements(const uint8_t* elements);
925
926 /**
927 * Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is interpreted as zero.
928 * However, the provided buffer must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE register.<br>
929 * Thus, this function handles one buffer with this pattern (while the memory starts left and ends right): [NA 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15].
930 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
931 * @param elements (1+) 15 elements holding the image data
932 * @return Resulting sum
933 */
934 static inline __m128i sum1Channel8BitBack15Elements(const uint8_t* elements);
935
936 /**
937 * Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
938 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
939 * @param interleaved0 First 16 elements holding the interleaved image data
940 * @param interleaved1 Second 16 elements holding the interleaved image data
941 * @param interleaved2 Third 16 elements holding the interleaved image data
942 * @return Resulting sums
943 */
944 static inline __m128i sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2);
945
946 /**
947 * Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
948 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
949 * @param interleaved 48 elements holding the interleaved image data
950 * @return Resulting sums
951 */
952 static inline __m128i sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved);
953
954 /**
955 * Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
956 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
957 * @param interleaved 45 elements holding the interleaved image data
958 * @return Resulting sums
959 */
960 static inline __m128i sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved);
961
962 /**
963 * Loads the lower 64 bit of a 128i value from the memory.
964 * The upper 64 bit are zeroed.
965 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 8 bytes
966 * @return Resulting value
967 */
968 static inline __m128i load128iLower64(const void* const buffer);
969
970 /**
971 * Loads a 128i value from the memory.
972 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 16 bytes
973 * @return Resulting value
974 */
975 static inline __m128i load128i(const void* const buffer);
976
977 /**
978 * Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes, to a 128i value and sets the remaining bytes of the resulting 128i value to zero.
979 * The loaded memory will be stored in the upper 10 bytes of the 128i value while the lowest remaining 6 bytes will be set to zero.
980 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [09 08 07 06 05 04 03 02 01 00 ZZ ZZ ZZ ZZ ZZ ZZ], with ZZ meaning zero.<br>
981 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
982 * @return Resulting 128 bit value
983 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 10 bytes
984 */
985 template <bool tBufferHas16Bytes>
986 static inline __m128i load_u8_10_upper_zero(const uint8_t* const buffer);
987
988 /**
989 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
990 * The loaded memory will be stored in the upper 15 bytes of the 128i value while the lowest remaining 1 byte will be set to zero.
991 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 ZZ], with ZZ meaning zero.<br>
992 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
993 * @return Resulting 128 bit value
994 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
995 */
996 template <bool tBufferHas16Bytes>
997 static inline __m128i load_u8_15_upper_zero(const uint8_t* const buffer);
998
999 /**
1000 * Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes, to a 128i value while the remaining byte of the resulting 128i value will be random.
1001 * The loaded memory will be stored in the lower 13 bytes of the 128i value while the highest remaining 3 byte will be random.<br>
1002 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? ?? ?? 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
1003 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
1004 * @return Resulting 128 bit value
1005 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 13 bytes
1006 */
1007 template <bool tBufferHas16Bytes>
1008 static inline __m128i load_u8_13_lower_random(const uint8_t* const buffer);
1009
1010 /**
1011 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
1012 * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be set to zero.<br>
1013 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [ZZ 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ZZ meaning zero.<br>
1014 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
1015 * @return Resulting 128 bit value
1016 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
1017 */
1018 template <bool tBufferHas16Bytes>
1019 static inline __m128i load_u8_15_lower_zero(const uint8_t* const buffer);
1020
1021 /**
1022 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value while the remaining byte of the resulting 128i value will be random.
1023 * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be random.<br>
1024 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
1025 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
1026 * @return Resulting 128 bit value
1027 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
1028 */
1029 template <bool tBufferHas16Bytes>
1030 static inline __m128i load_u8_15_lower_random(const uint8_t* const buffer);
1031
1032 /**
1033 * Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified number of bytes to the right (by inserting zeros).
1034 * This function can be used if the remaining buffer is smaller than 16 bytes while the buffer exceeds/continues in the lower address space (from the original point of interest).<br>
1035 * Thus, this function can handle a buffer with the following pattern (with lower address left and high address right):<br>
1036 * | ?? ?? ?? ?? ?? ?? ?? ?? ?? V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 |, where ?? represent random values in our buffer (in the lower address space), and VX represent the values of interest and V0 the location to which 'buffer' is pointing to.<br>
1037 * Such a buffer can be loaded by calling load_u8_16_and_shift_right<6>(buffer - 6);<br>
1038 * The resulting 128i register will then be composed of (high bits left, low bits right): [00 00 00 00 00 00 V9 V8 V7 V6 V5 V4 V3 V2 V1 V0].
1039 * @param buffer The actual address from which the 16 bytes will be loaded, must be valid and must be at least 16 bytes large
1040 * @return The resulting 128 bit value
1041 * @tparam tShiftBytes The number of bytes which will be shifted (to the right) after the memory has loaded, with range [0, 16]
1042 */
1043 template <unsigned int tShiftBytes>
1044 static inline __m128i load_u8_16_and_shift_right(const uint8_t* const buffer);
1045
1046 /**
1047 * Stores a 128i value to the memory.
1048 * @param value Value to be stored
1049 * @param buffer Buffer receiving the value (does not need to be aligned on any particular boundary)
1050 */
1051 static inline void store128i(const __m128i& value, uint8_t* const buffer);
1052
1053 /**
1054 * Sets a 128i value by two 64 bit values.
1055 * @param high64 High 64 bits to be set
1056 * @param low64 Low 64 bits to be set
1057 * @return Resulting 128i value
1058 */
1059 static inline __m128i set128i(const unsigned long long high64, const unsigned long long low64);
1060
1061 /**
1062 * Removes the higher 16 bits of four 32 bit elements.
1063 * Given: PONM-LKJI-HGFE-DCBA<br>
1064 * Result: 00NM-00JI-00FE-00BA<br>
1065 * @param value Value to remove the high bits for
1066 * @return Result
1067 */
1068 static inline __m128i removeHighBits32_16(const __m128i& value);
1069
1070 /**
1071 * Removes the lower 16 bits of four 32 bit elements.
1072 * Given: PONM-LKJI-HGFE-DCBA<br>
1073 * Result: PO00-LK00-HG00-DC00<br>
1074 * @param value Value to remove the lower bits for
1075 * @return Result
1076 */
1077 static inline __m128i removeLowBits32_16(const __m128i& value);
1078
1079 /**
1080 * Removes the higher 8 bits of eight 16 bit elements.
1081 * Given: PONM-LKJI-HGFE-DCBA<br>
1082 * Result: 0O0M-0K0I-0G0E-0C0A<br>
1083 * @param value Value to remove the high bits for
1084 * @return Result
1085 */
1086 static inline __m128i removeHighBits16_8(const __m128i& value);
1087
1088 /**
1089 * Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
1090 * Given: PONM-LKJI-HGFE-DCBA<br>
1091 * Result: 000M-0K0I-0G0E-0C0A<br>
1092 * @param value Value to remove the high bits for
1093 * @return Result
1094 */
1095 static inline __m128i removeHighBits16_8_7_lower(const __m128i& value);
1096
1097 /**
1098 * Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
1099 * Given: PONM-LKJI-HGFE-DCBA<br>
1100 * Result: 0O0M-0K0I-0G0E-0C00<br>
1101 * @param value Value to remove the high bits for
1102 * @return Result
1103 */
1104 static inline __m128i removeHighBits16_8_7_upper(const __m128i& value);
1105
1106 /**
1107 * Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1108 * Given: PONM-LKJI-HGFE-DCBA<br>
1109 * Result: 0000-0000-OMKI-GECA<br>
1110 * @param value Value to remove the high bits for
1111 * @return Result
1112 */
1113 static inline __m128i moveLowBits16_8ToLow64(const __m128i& value);
1114
1115 /**
1116 * Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0.
1117 * Given: PONM-LKJI-HGFE-DCBA<br>
1118 * Result: 0000-0000-0000-MIEA<br>
1119 * @param value Value to remove the high bits for
1120 * @return Result
1121 */
1122 static inline __m128i moveLowBits32_8ToLow32(const __m128i& value);
1123
1124 /**
1125 * Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1126 * Given: PONM-LKJI-HGFE-DCBA<br>
1127 * Result: 0000-0000-NMJI-FEBA<br>
1128 * @param value Value to remove the high bits for
1129 * @return Result
1130 */
1131 static inline __m128i moveLowBits32_16ToLow64(const __m128i& value);
1132
1133 /**
1134 * Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with 0.
1135 * Given: PONM-LKJI-HGFE-DCBA<br>
1136 * Result: OMKI-GECA-0000-0000<br>
1137 * @param value Value to remove the high bits for
1138 * @return Result
1139 */
1140 static inline __m128i moveLowBits16_8ToHigh64(const __m128i& value);
1141
1142 /**
1143 * Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
1144 * Given: PONM-LKJI-HGFE-DCBA<br>
1145 * Result: 00PO-00LK-00HG-00DC<br>
1146 * @param value Value to remove the high bits for
1147 * @return Result
1148 */
1149 static inline __m128i moveHighBits32_16(const __m128i& value);
1150
1151 /**
1152 * Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
1153 * Given: PONM-LKJI-HGFE-DCBA<br>
1154 * Result: 0P0N-0L0J-0H0F-0D0B<br>
1155 * @param value Value to remove the high bits for
1156 * @return Result
1157 */
1158 static inline __m128i moveHighBits16_8(const __m128i& value);
1159
1160 /**
1161 * Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
1162 * Given: PONM-LKJI-HGFE-DCBA<br>
1163 * Result: 0000-000J-0H0F-0D0B<br>
1164 * @param value Value to remove the high bits for
1165 * @return Result
1166 */
1167 static inline __m128i moveHighBits16_8_5(const __m128i& value);
1168
1169 /**
1170 * Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
1171 * Given: PONM-LKJI-HGFE-DCBA<br>
1172 * Result: 0000-0L0J-0H0F-0D0B<br>
1173 * @param value Value to remove the high bits for
1174 * @return Result
1175 */
1176 static inline __m128i moveHighBits16_8_6(const __m128i& value);
1177
1178 /**
1179 * Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
1180 * Given: PONM-LKJI-HGFE-DCBA<br>
1181 * Result: 000N-0L0J-0H0F-0D0B<br>
1182 * @param value Value to remove the high bits for
1183 * @return Result
1184 */
1185 static inline __m128i moveHighBits16_8_7(const __m128i& value);
1186
1187 /**
1188 * Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
1189 * Given: PONM-LKJI-HGFE-DCBA<br>
1190 * Result: 000D-000C-000B-000A<br>
1191 * @param value Value to be shuffled
1192 * @return Result
1193 */
1194 static inline __m128i shuffleLow32ToLow32_8(const __m128i& value);
1195
1196 /**
1197 * Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1198 * Given: PONM-LKJI-HGFE-DCBA<br>
1199 * Result: 0H0D-0G0C-0F0B-0E0A<br>
1200 * @param value Value to be shuffled
1201 * @return Result
1202 */
1203 static inline __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value);
1204
1205 /**
1206 * Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1207 * Given: PONM-LKJI-HGFE-DCBA<br>
1208 * Result: 0P0L-0O0K-0N0J-0M0I<br>
1209 * @param value Value to be shuffled
1210 * @return Result
1211 */
1212 static inline __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i& value);
1213
1214 /**
1215 * Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1216 * @param value Value to be shuffled
1217 * @return Result
1218 */
1219 static inline __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value);
1220
1221 /**
1222 * Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1223 * @param value Value to be shuffled
1224 * @return Result
1225 */
1226 static inline __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i& value);
1227
1228 /**
1229 * Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
1230 * @return Bitmask
1231 */
1232 static inline __m128i bitMaskRemoveHigh16_8();
1233
1234 /**
1235 * Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
1236 * @return Bitmask
1237 */
1238 static inline __m128i bitMaskRemoveHigh32_16();
1239
1240 /**
1241 * Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
1242 * The pseudo code of the function is as follows:
1243 * <pre>
1244 * products0[0] = values0[0] * values1[0]
1245 * ...
1246 * products0[3] = values0[3] * values1[3]
1247 *
1248 * products1[0] = values0[4] * values1[4]
1249 * ...
1250 * products1[3] = values0[7] * values1[7]
1251 * </pre>
1252 * @param values0 The first 8 int16_t values to be multiplied
1253 * @param values1 The second 8 int16_t values to be multiplied
1254 * @param products0 The resulting first 4 int32_t products
1255 * @param products1 The resulting second 4 int32_t products
1256 */
1257 static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1);
1258
1259 /**
1260 * Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
1261 * The pseudo code of the function is as follows:
1262 * <pre>
1263 * results0[0] += values0[0] * values1[0]
1264 * ...
1265 * results0[3] += values0[3] * values1[3]
1266 *
1267 * results1[0] += values0[4] * values1[4]
1268 * ...
1269 * results1[3] += values0[7] * values1[7]
1270 * </pre>
1271 * @param values0 The first 8 int16_t values to be multiplied
1272 * @param values1 The second 8 int16_t values to be multiplied
1273 * @param results0 The results to which the first 4 int32_t products will be added
1274 * @param results1 The results to which the second 4 int32_t products will be added
1275 */
1276 static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1);
1277
1278 private:
1279
1280 /**
1281 * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
1282 * @param pixel Upper left pixel in the frame
1283 * @param size Size of one frame row in bytes
1284 * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
1285 * @param fxy_ Product of the fx and the inverse fy interpolation factor
1286 * @param fx_y Product of the inverse fx and the fy interpolation factor
1287 * @param fxy Product of the fx and the fy interpolation factor
1288 * @return Interpolated pixel values
1289 */
1290 static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
1291};
1292
inline void SSE::prefetchT0(const void* const data)
{
	// Prefetch into all cache levels (T0 hint); the C-style cast drops const
	// because some toolchains declare _mm_prefetch() with a non-const parameter.
	_mm_prefetch((char*)data, _MM_HINT_T0);
}
1297
inline void SSE::prefetchT1(const void* const data)
{
	// Prefetch into L2 and higher cache levels (T1 hint); C-style cast drops const
	// because some toolchains declare _mm_prefetch() with a non-const parameter.
	_mm_prefetch((char*)data, _MM_HINT_T1);
}
1302
inline void SSE::prefetchT2(const void* const data)
{
	// Prefetch into L3/outer cache levels only (T2 hint); C-style cast drops const
	// because some toolchains declare _mm_prefetch() with a non-const parameter.
	_mm_prefetch((char*)data, _MM_HINT_T2);
}
1307
inline void SSE::prefetchNTA(const void* const data)
{
	// Non-temporal prefetch (NTA hint) to minimize cache pollution; C-style cast drops const
	// because some toolchains declare _mm_prefetch() with a non-const parameter.
	_mm_prefetch((char*)data, _MM_HINT_NTA);
}
1312
template <unsigned int tIndex>
inline uint8_t SSE::value_u8(const __m128i& value)
{
	static_assert(tIndex <= 15u, "Invalid index!");

#ifdef OCEAN_COMPILER_MSC
	// MSVC exposes the individual lanes directly as members of __m128i
	return value.m128i_u8[tIndex];
#else
	// other compilers: reinterpret through the M128i wrapper union to access the byte lane
	return ((const M128i*)(&value))->m128i_u8[tIndex];
#endif
}
1324
inline uint8_t SSE::value_u8(const __m128i& value, const unsigned int index)
{
	// runtime-indexed variant; index must address one of the 16 byte lanes
	ocean_assert(index <= 15u);

#ifdef OCEAN_COMPILER_MSC
	// MSVC exposes the individual lanes directly as members of __m128i
	return value.m128i_u8[index];
#else
	// other compilers: reinterpret through the M128i wrapper union to access the byte lane
	return ((const M128i*)(&value))->m128i_u8[index];
#endif
}
1335
template <unsigned int tIndex>
inline uint16_t SSE::value_u16(const __m128i& value)
{
	static_assert(tIndex <= 7u, "Invalid index!");

#ifdef OCEAN_COMPILER_MSC
	// MSVC exposes the individual lanes directly as members of __m128i
	return value.m128i_u16[tIndex];
#else
	// other compilers: reinterpret through the M128i wrapper union to access the 16-bit lane
	return ((const M128i*)(&value))->m128i_u16[tIndex];
#endif
}
1347
template <unsigned int tIndex>
inline unsigned int SSE::value_u32(const __m128i& value)
{
	static_assert(tIndex <= 3u, "Invalid index!");

#ifdef OCEAN_COMPILER_MSC
	// MSVC exposes the individual lanes directly as members of __m128i
	return value.m128i_u32[tIndex];
#else
	// other compilers: reinterpret through the M128i wrapper union to access the 32-bit lane
	return ((const M128i*)(&value))->m128i_u32[tIndex];
#endif
}
1359
1360OCEAN_FORCE_INLINE unsigned int SSE::sum_u32_4(const __m128i& value)
1361{
1362#ifdef OCEAN_COMPILER_MSC
1363 return value.m128i_u32[0] + value.m128i_u32[1] + value.m128i_u32[2] + value.m128i_u32[3];
1364#else
1365 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1] + ((const M128i*)(&value))->m128i_u32[2] + ((const M128i*)(&value))->m128i_u32[3];
1366#endif
1367}
1368
1369inline unsigned int SSE::sum_u32_first_2(const __m128i& value)
1370{
1371#ifdef OCEAN_COMPILER_MSC
1372 return value.m128i_u32[0] + value.m128i_u32[1];
1373#else
1374 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1];
1375#endif
1376}
1377
1378inline unsigned int SSE::sum_u32_first_third(const __m128i& value)
1379{
1380#ifdef OCEAN_COMPILER_MSC
1381 return value.m128i_u32[0] + value.m128i_u32[2];
1382#else
1383 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[2];
1384#endif
1385}
1386
1387OCEAN_FORCE_INLINE float SSE::sum_f32_4(const __m128& value)
1388{
1389#ifdef OCEAN_COMPILER_MSC
1390 return value.m128_f32[0] + value.m128_f32[1] + value.m128_f32[2] + value.m128_f32[3];
1391#else
1392 return ((const M128*)(&value))->m128_f32[0] + ((const M128*)(&value))->m128_f32[1] + ((const M128*)(&value))->m128_f32[2] + ((const M128*)(&value))->m128_f32[3];
1393#endif
1394}
1395
1396OCEAN_FORCE_INLINE double SSE::sum_f64_2(const __m128d& value)
1397{
1398#ifdef OCEAN_COMPILER_MSC
1399 return value.m128d_f64[0] + value.m128d_f64[1];
1400#else
1401 return ((const M128d*)(&value))->m128d_f64[0] + ((const M128d*)(&value))->m128d_f64[1];
1402#endif
1403}
1404
1405inline __m128i SSE::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1406{
1407 ocean_assert(image0 && image1);
1408
1409 return SSE::sumSquareDifference8Bit16Elements(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1410}
1411
1412inline __m128i SSE::sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1413{
1414 ocean_assert(image0 && image1);
1415
1416 return _mm_sad_epu8(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1417}
1418
inline __m128i SSE::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// unaligned 16-byte loads; only the first 12 bytes are used below
	const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
	const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);

	// absolute difference via saturated subtraction in both directions and bitwise or
	const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));

	// distribute the first 12 bytes into 16-bit lanes (necessary for multiplication);
	// shuffle-control bytes with the high bit set (0xA0) produce zero lanes
	const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00AA008ull, 0xA006A004A002A000ull));
	const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));

	// square the 16-bit lanes
	const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
	const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);

	// widen the 16-bit squares into 32-bit lanes (an intermediate add operation is used)
	const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
	const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));

	// 4 32 bit square difference values
	return _mm_add_epi32(sumSquareLow, sumSquareHigh);
}
1445
inline __m128i SSE::sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// unaligned 16-byte loads; only the last 12 bytes (indices 4..15) are used below
	const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
	const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);

	// absolute difference via saturated subtraction in both directions and bitwise or
	const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));

	// distribute bytes 4..15 into 16-bit lanes (necessary for multiplication);
	// shuffle-control bytes with the high bit set (0xA0) produce zero lanes
	const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
	const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00FA00Dull, 0xA00BA009A007A005ull));

	// square the 16-bit lanes
	const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
	const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);

	// widen the 16-bit squares into 32-bit lanes (an intermediate add operation is used)
	const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
	const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));

	// 4 32 bit square difference values
	return _mm_add_epi32(sumSquareLow, sumSquareHigh);
}
1472
template <bool tBufferHas16Bytes>
inline __m128i SSE::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// load the lower 13 bytes; the remaining upper bytes are random but masked out by the shuffles below
	const __m128i row0 = load_u8_13_lower_random<tBufferHas16Bytes>(image0);
	const __m128i row1 = load_u8_13_lower_random<tBufferHas16Bytes>(image1);

	// absolute difference via saturated subtraction in both directions and bitwise or
	const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));

	// distribute the first 13 bytes into 16-bit lanes (necessary for multiplication);
	// shuffle-control bytes with the high bit set (0xA0) produce zero lanes
	const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00CA00AA008ull, 0xA006A004A002A000ull));
	const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));

	// square the 16-bit lanes
	const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
	const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);

	// widen the 16-bit squares into 32-bit lanes (an intermediate add operation is used)
	const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
	const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));

	// 4 32 bit square difference values
	return _mm_add_epi32(sumSquareLow, sumSquareHigh);
}
1500
inline __m128i SSE::sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// unaligned 16-byte loads; only the last 13 bytes (indices 3..15) are used below
	const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
	const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);

	// absolute difference via saturated subtraction in both directions and bitwise or
	const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));

	// distribute bytes 3..15 into 16-bit lanes (necessary for multiplication);
	// shuffle-control bytes with the high bit set (0xA0) produce zero lanes
	const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00FA00DA00Bull, 0xA009A007A005A003ull));
	const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));

	// square the 16-bit lanes
	const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
	const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);

	// widen the 16-bit squares into 32-bit lanes (an intermediate add operation is used)
	const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
	const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));

	// 4 32 bit square difference values
	return _mm_add_epi32(sumSquareLow, sumSquareHigh);
}
1527
template <bool tBufferHas16Bytes>
inline __m128i SSE::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// load the lower 15 bytes; the top byte is random but discarded by moveHighBits16_8_7() below
	const __m128i row0 = load_u8_15_lower_random<tBufferHas16Bytes>(image0);
	const __m128i row1 = load_u8_15_lower_random<tBufferHas16Bytes>(image1);

	// absolute difference via saturated subtraction in both directions and bitwise or
	const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));

	// distribute the 15 bytes into 16-bit lanes (necessary for multiplication)
	const __m128i subtractLow = removeHighBits16_8(subtract);
	const __m128i subtractHigh = moveHighBits16_8_7(subtract); // the highest high 8 bit are not used due to the only 15 elements

	// square the 16-bit lanes
	const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
	const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);

	// widen the 16-bit squares into 32-bit lanes (an intermediate add operation is used)
	const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
	const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));

	// 4 32 bit square difference values
	return _mm_add_epi32(sumSquareLow, sumSquareHigh);
}
1554
1555template <bool tBufferHas16Bytes>
1556inline __m128i SSE::sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
1557{
1558 ocean_assert(image0 && image1);
1559
1560 return _mm_sad_epu8(load_u8_10_upper_zero<tBufferHas16Bytes>(image0), load_u8_10_upper_zero<tBufferHas16Bytes>(image1));
1561}
1562
1563template <bool tBufferHas16Bytes>
1564inline __m128i SSE::sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
1565{
1566 ocean_assert(image0 && image1);
1567
1568 return _mm_sad_epu8(load_u8_15_upper_zero<tBufferHas16Bytes>(image0), load_u8_15_upper_zero<tBufferHas16Bytes>(image1));
1569}
1570
1571inline __m128i SSE::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1572{
1573 ocean_assert(image0 && image1);
1574
1575 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1576 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1577
1578 return sumSquareDifference8Bit16Elements(row0, row1);
1579}
1580
1581inline __m128i SSE::sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1582{
1583 ocean_assert(image0 && image1);
1584
1585 return _mm_sad_epu8(SSE::load128i(image0), SSE::load128i(image1));
1586}
1587
1588inline __m128i SSE::sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1)
1589{
1590 ocean_assert(image0 && image1);
1591 ocean_assert((unsigned long long)image0 % 16ll == 0ll);
1592 ocean_assert((unsigned long long)image1 % 16ll == 0ll);
1593
1594 const __m128i row0 = _mm_load_si128((__m128i*)image0);
1595 const __m128i row1 = _mm_load_si128((__m128i*)image1);
1596
1597 return sumSquareDifference8Bit16Elements(row0, row1);
1598}
1599
1600inline __m128i SSE::sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1)
1601{
1602 // subtract the 16 elements (usage of saturation and bitwise or operator)
1603 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1604
1605 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1606 const __m128i subtractLow = removeHighBits16_8(subtract);
1607 const __m128i subtractHigh = moveHighBits16_8(subtract);
1608
1609 // square the 16 elements
1610 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1611 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1612
1613 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1614 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1615 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1616
1617 // 4 32 bit square difference values
1618 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1619}
1620
inline __m128i SSE::interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bilinear interpolation of 8 consecutive 1-channel 8-bit pixels.
	// Each factor register is expected to hold the same 16-bit weight in all 8 lanes,
	// and the four weights must sum to 128 * 128 (verified by the asserts below).

	//          F  E  D  C  B  A  9  8  7  6  5  4  3  2  1  0
	// values0: aF yE | yD yC | yB yA | y9 y8 | y7 y6 | y5 y4 | y3 y2 | y1 y0
	// values1: aF' yE' | yD' yC' | yB' yA' | y9' y8' | y7' y6' | y5' y4' | y3' y2' | y1' y0'

	// shuffled elements
	// row0:  y7  y6  y5  y4  y3  y2  y1  y0  |  * fx_ * fy_
	// row1:  y8  y7  y6  y5  y4  y3  y2  y1  |  * fx  * fy_
	// row2:  y7' y6' y5' y4' y3' y2' y1' y0' |  * fx_ * fy
	// row3:  y8' y7' y6' y5' y4' y3' y2' y1' |  * fx  * fy

#ifdef OCEAN_COMPILER_MSC

	// all 8 lanes of each factor must carry the identical weight
	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

	// the four weights must form a full 7.7 fixed-point partition
	ocean_assert(fx_fy_.m128i_u16[0] + fxfy_.m128i_u16[0] + fx_fy.m128i_u16[0] + fxfy.m128i_u16[0] == 128u * 128u);

#else

#ifdef OCEAN_DEBUG

	// non-MSVC compilers need the M128i wrapper to inspect individual lanes
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	// all 8 lanes of each factor must carry the identical weight
	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

	// the four weights must form a full 7.7 fixed-point partition
	ocean_assert(debug_fx_fy_.m128i_u16[0] + debug_fxfy_.m128i_u16[0] + debug_fx_fy.m128i_u16[0] + debug_fxfy.m128i_u16[0] == 128u * 128u);

#endif

	// shuffle bytes 0..7 into zero-extended 16-bit lanes (0xA0 control bytes zero the lane)
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// full 32-bit products are kept by combining the low and the high 16-bit halves of each multiplication
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// shuffle bytes 1..8 (the right-hand interpolation neighbors) into zero-extended 16-bit lanes
	shuffle = set128i(0xA008A007A006A005ull, 0xA004A003A002A001ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
1770
inline __m128i SSE::interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bilinear interpolation of four 2-channel 8-bit pixels (e.g., YA) in parallel.
	// values0 holds the upper source row, values1 the lower source row; the four factor
	// vectors hold the interpolation weights, each replicated across all eight 16-bit
	// lanes, with the naming convention fx_ = 128 - fx and fy_ = 128 - fy, so that the
	// four weights sum to 128 * 128 (see normalization at the end).

	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: a7 y7 | a6 y6 | a5 y5 | a4 y4 | a3 y3 | a2 y2 | a1 y1 | a0 y0
	// values1: a7' y7' | a6' y6' | a5' y5' | a4' y4' | a3' y3' | a2' y2' | a1' y1' | a0' y0'

	// shuffled elements
	// row0: a3 y3 a2 y2 a1 y1 a0 y0 | * fx_ * fy_
	// row1: a4 y4 a3 y3 a2 y2 a1 y1 | * fx * fy_
	// row2: a3' y3' a2' y2' a1' y1' a0' y0' | * fx_ * fy
	// row3: a4' y4' a3' y3' a2' y2' a1' y1' | * fx * fy

	// debug-only sanity checks: each weight vector must be constant across all eight lanes

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers expose no named fields on __m128i, so reinterpret through the M128i wrapper
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// 0xA0 has the most significant bit set, so _mm_shuffle_epi8 writes a zero byte there:
	// the mask widens the eight low 8-bit elements to eight 16-bit lanes
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// low and high 16-bit halves of the unsigned 16 x 16 bit products
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// interleave the halves into full 32-bit products, separately for even and odd lanes
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// same widening mask shifted by one pixel (= 2 bytes for 2 channels) to address the right neighbors
	shuffle = set128i(0xA009A008A007A006ull, 0xA005A004A003A002ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
1916
inline __m128i SSE::interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bilinear interpolation of the first eight 8-bit channel elements of 3-channel
	// (24-bit, e.g., RGB) pixels. values0 holds the upper source row, values1 the lower
	// source row; the four factor vectors hold the interpolation weights, each replicated
	// across all eight 16-bit lanes, with fx_ = 128 - fx and fy_ = 128 - fy, so the four
	// weights sum to 128 * 128 (see normalization at the end).

	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: r5 | b4 g4 r4 | b3 g3 r3 | b2 g2 r2 | b1 g1 r1 | b0 g0 r0
	// values1: r5'| b4' g4' r4'| b3' g3' r3'| b2' g2' r2'| b1' g1' r1'| b0' g0' r0'

	// shuffled elements
	// row0: g2 r2 b1 g1 r1 b0 g0 r0 | * fx_ * fy_
	// row1: g3 r3 b2 g2 r2 b1 g1 r1 | * fx * fy_
	// row2: g2' r2' b1' g1' r1' b0' g0' r0' | * fx_ * fy
	// row3: g3' r3' b2' g2' r2' b1' g1' r1' | * fx * fy

	// debug-only sanity checks: each weight vector must be constant across all eight lanes

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers expose no named fields on __m128i, so reinterpret through the M128i wrapper
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// 0xA0 has the most significant bit set, so _mm_shuffle_epi8 writes a zero byte there:
	// the mask widens the eight low 8-bit elements to eight 16-bit lanes
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// low and high 16-bit halves of the unsigned 16 x 16 bit products
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// interleave the halves into full 32-bit products, separately for even and odd lanes
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// same widening mask shifted by one pixel (= 3 bytes for 3 channels) to address the right neighbors
	shuffle = set128i(0xA00AA009A008A007ull, 0xA006A005A004A003ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2062
2063inline __m128i SSE::interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2064{
2065 __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2066 __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2067
2068 __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2069 __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2070
2071 __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2072 __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2073
2074 __m128i row0_d = _mm_shuffle_epi8(values0, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2075 __m128i row1_d = _mm_shuffle_epi8(values1, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2076
2077 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2078 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2079 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2080 row0_d = _mm_madd_epi16(row0_d, fx_fy_fxfy_);
2081
2082 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2083 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2084 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2085 row1_d = _mm_madd_epi16(row1_d, fx_fyfxfy);
2086
2087 const __m128i rounding = _mm_set1_epi32(8192);
2088
2089 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2090 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2091 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2092 __m128i row_d = _mm_add_epi32(row0_d, row1_d);
2093
2094 row_a = _mm_add_epi32(row_a, rounding);
2095 row_b = _mm_add_epi32(row_b, rounding);
2096 row_c = _mm_add_epi32(row_c, rounding);
2097 row_d = _mm_add_epi32(row_d, rounding);
2098
2099 row_a = _mm_srli_epi32(row_a, 14);
2100 row_b = _mm_srli_epi32(row_b, 14);
2101 row_c = _mm_srli_epi32(row_c, 14);
2102 row_d = _mm_srli_epi32(row_d, 14);
2103
2104 row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF0c080400ull));
2105 row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFFFFFFFFull, 0x0c080400FFFFFFFFull));
2106 row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0c080400ull, 0xFFFFFFFFFFFFFFFFull));
2107 row_d = _mm_shuffle_epi8(row_d, set128i(0xFF080400FFFFFFFFull, 0xFFFFFFFFFFFFFFFFull));
2108
2109 row_a = _mm_or_si128(row_a, row_b);
2110 row_c = _mm_or_si128(row_c, row_d);
2111
2112 return _mm_or_si128(row_a, row_c);
2113}
2114
2115inline __m128i SSE::interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2116{
2117 __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2118 __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2119
2120 __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2121 __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2122
2123 __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2124 __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2125
2126 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2127 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2128 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2129
2130 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2131 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2132 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2133
2134 const __m128i rounding = _mm_set1_epi32(8192);
2135
2136 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2137 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2138 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2139
2140 row_a = _mm_add_epi32(row_a, rounding);
2141 row_b = _mm_add_epi32(row_b, rounding);
2142 row_c = _mm_add_epi32(row_c, rounding);
2143
2144 row_a = _mm_srli_epi32(row_a, 14);
2145 row_b = _mm_srli_epi32(row_b, 14);
2146 row_c = _mm_srli_epi32(row_c, 14);
2147
2148 row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
2149 row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
2150 row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
2151
2152 return _mm_or_si128(row_a, _mm_or_si128(row_b, row_c));
2153}
2154
inline __m128i SSE::interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bilinear interpolation of two 4-channel 32-bit pixels (eight 8-bit channel elements,
	// e.g., RGBA). values0 holds the upper source row, values1 the lower source row; the
	// four factor vectors hold the interpolation weights, each replicated across all eight
	// 16-bit lanes, with fx_ = 128 - fx and fy_ = 128 - fy, so the four weights sum to
	// 128 * 128 (see normalization at the end).

	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
	// values1: a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'

	// shuffled elements
	// row0: a1 b1 g1 r1 a0 b0 g0 r0 | * fx_ * fy_
	// row1: a2 b2 g2 r2 a1 b1 g1 r1 | * fx * fy_
	// row2: a1' b1' g1' r1' a0' b0' g0' r0' | * fx_ * fy
	// row3: a2' b2' g2' r2' a1' b1' g1' r1' | * fx * fy

	// debug-only sanity checks: each weight vector must be constant across all eight lanes

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers expose no named fields on __m128i, so reinterpret through the M128i wrapper
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// 0xA0 has the most significant bit set, so _mm_shuffle_epi8 writes a zero byte there:
	// the mask widens the eight low 8-bit elements to eight 16-bit lanes
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// low and high 16-bit halves of the unsigned 16 x 16 bit products
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// interleave the halves into full 32-bit products, separately for even and odd lanes
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// same widening mask shifted by one pixel (= 4 bytes for 4 channels) to address the right neighbors
	shuffle = set128i(0xA00BA00AA009A008ull, 0xA007A006A005A004ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2300
2301
inline __m128i SSE::interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bilinear interpolation of two 4-channel 32-bit pixels (e.g., RGBA), here taken from
	// the non-adjacent pixel pairs (0, 1) and (2, 3) — pixels 0 and 2 are interpolated with
	// their respective right neighbors 1 and 3. values0 holds the upper source row, values1
	// the lower source row; the four factor vectors hold the interpolation weights, each
	// replicated across all eight 16-bit lanes, with fx_ = 128 - fx and fy_ = 128 - fy,
	// so the four weights sum to 128 * 128 (see normalization at the end).

	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
	// values1: a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'

	// shuffled elements
	// row0: a2 b2 g2 r2 a0 b0 g0 r0 | * fx_ * fy_
	// row1: a3 b3 g3 r3 a1 b1 g1 r1 | * fx * fy_
	// row2: a2' b2' g2' r2' a0' b0' g0' r0' | * fx_ * fy
	// row3: a3' b3' g3' r3' a1' b1' g1' r1' | * fx * fy

	// debug-only sanity checks: each weight vector must be constant across all eight lanes

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers expose no named fields on __m128i, so reinterpret through the M128i wrapper
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// 0xA0 has the most significant bit set, so _mm_shuffle_epi8 writes a zero byte there:
	// the mask widens the 8-bit elements of pixels 0 and 2 to eight 16-bit lanes
	__m128i shuffle = set128i(0xA00BA00AA009A008ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// low and high 16-bit halves of the unsigned 16 x 16 bit products
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// interleave the halves into full 32-bit products, separately for even and odd lanes
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// same widening mask shifted by one pixel (= 4 bytes) to address the right neighbors (pixels 1 and 3)
	shuffle = set128i(0xA00FA00EA00DA00Cull, 0xA007A006A005A004ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2447
2448inline void SSE::average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result)
2449{
2450 ocean_assert(image0 && image1);
2451
2452 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2453 const __m128 row0 = _mm_loadu_ps(image0);
2454 const __m128 row1 = _mm_loadu_ps(image1);
2455
2456 // get sum of first 4 elements
2457 const __m128 sumFirst = _mm_add_ps(row0, row1);
2458
2459 // load next 4 elements
2460 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2461 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2462
2463 // get sum of second 4 elements
2464 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2465
2466 // get sum of adjacent summed pixels
2467 const __m128 sumAdjacent = _mm_hadd_ps(sumFirst, sumSecond);
2468
2469 /* following variant is exactly as fast as _mm_hadd_ps(,) ~ 0.30ms / 100,000 iteration
2470 const unsigned int mask10001000 = 136u;
2471 const unsigned int mask11011101 = 221u;
2472 const __m128 sumAdjacent = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, mask10001000), _mm_shuffle_ps(sumFirst, sumSecond, mask11011101));
2473 */
2474
2475 // divide by 4 --> multiply by 0.25
2476 const __m128 division = _mm_mul_ps(sumAdjacent, _mm_set_ps1(0.25f));
2477
2478 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2479 _mm_storeu_ps(result, division);
2480}
2481
2482inline void SSE::average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2483{
2484 ocean_assert(image0 && image1);
2485
2486 // 16 * uchar = m128i, but only the first 8 elements are set
2487 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2488 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2489
2490 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2491 const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2492 const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2493
2494 // build overall sum and add 2 for rounding
2495 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2496
2497 // divide by 4 by right shifting of two bits
2498 const __m128i division16 = _mm_srli_epi16(sum, 2);
2499
2500 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2501 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2502
2503 memcpy(result, &division8, sizeof(uint8_t) * 4);
2504}
2505
inline void SSE::average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
{
	ocean_assert(image0 != nullptr && image1 != nullptr);
	ocean_assert(threshold >= 1u);

	// we load the first 8 elements, the upper 8 bytes will be set to zero
	const __m128i row0_u_8x8 = _mm_loadl_epi64((__m128i*)image0);
	const __m128i row1_u_8x8 = _mm_loadl_epi64((__m128i*)image1);

	const __m128i row0_u_16x8 = _mm_cvtepu8_epi16(row0_u_8x8); // converting the lower 8 bytes to 16 bit values
	const __m128i row1_u_16x8 = _mm_cvtepu8_epi16(row1_u_8x8);

	// vertical sum of both rows (saturating add, although the maximal sum 2 * 255 cannot saturate 16 bit anyway)
	const __m128i verticalSum_u_16x8 = _mm_adds_epu16(row0_u_16x8, row1_u_16x8);

	// horizontal add of neighboring pixels; the four 2x2 block sums end up duplicated in both 64-bit halves
	const __m128i sum_u_16x8 = _mm_hadd_epi16(verticalSum_u_16x8, verticalSum_u_16x8);

	// 0xFFFF where blockSum >= threshold, 0x0000 otherwise; the signed compare is safe as the sum is at most 1020
	const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));

	// keep the lower 8 bit of each 16 bit mask in the lower 64 bits
	const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);

	// only the four valid mask bytes (0x00 or 0xFF) are written
	memcpy(result, &mask_u_8x8, sizeof(uint8_t) * 4);
}
2527
2528inline void SSE::average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2529{
2530 ocean_assert(image0 && image1);
2531
2532 // 16 * uchar = m128i
2533 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2534 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2535
2536 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2537 const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2538 const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2539
2540 // build overall sum and add 2 for rounding
2541 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2542
2543 // divide by 4 by right shifting of two bits
2544 const __m128i division16 = _mm_srli_epi16(sum, 2);
2545
2546 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2547 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2548
2549 // copy the lower 64 bit to the memory
2550 _mm_storel_epi64((__m128i*)result, division8);
2551
2552 /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2553 const __m128i avgRows = _mm_avg_epu8(row0, row1);
2554 const __m128i avgRowsSwap = _mm_or_si128(_mm_slli_epi16(avgRows, 8), _mm_srli_epi16(avgRows, 8));
2555
2556 const __m128i avg = _mm_avg_epu8(avgRows, avgRowsSwap); // 1 result in 2 uchar
2557 const __m128i avgOrdered = _mm_shuffle_epi8(avg, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2558
2559 _mm_storel_epi64((__m128i*)result, avgOrdered);
2560 */
2561}
2562
2563inline void SSE::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2564{
2565 ocean_assert(image0 != nullptr && image1 != nullptr);
2566 ocean_assert(threshold >= 1u);
2567
2568 // 16 * uchar = m128i
2569 const __m128i row0_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2570 const __m128i row1_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2571
2572 const __m128i horizontalSum0_u_16x8 = _mm_maddubs_epi16(row0_u_8x16, _mm_set1_epi8(1));
2573 const __m128i horizontalSum1_u_16x8 = _mm_maddubs_epi16(row1_u_8x16, _mm_set1_epi8(1));
2574
2575 const __m128i sum_u_16x8 = _mm_add_epi16(horizontalSum0_u_16x8, horizontalSum1_u_16x8);
2576
2577 const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2578
2579 const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);
2580
2581 // copy the lower 64 bit to the memory
2582 _mm_storel_epi64((__m128i*)result, mask_u_8x8);
2583}
2584
2585inline void SSE::average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2586{
2587 ocean_assert(image0 && image1);
2588
2589 // first 16 elements
2590 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2591 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2592
2593 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2594 const __m128i firstSumLow = _mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1));
2595 const __m128i firstSumHigh = _mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1));
2596
2597 // build overall sum and add 2 for rounding
2598 const __m128i firstSum = _mm_add_epi16(firstSumLow, _mm_add_epi16(firstSumHigh, _mm_set1_epi32(int(0x00020002))));
2599
2600 // divide by 4 by right shifting of two bits
2601 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2602
2603 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2604 const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2605
2606 // second 16 elements
2607 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2608 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2609
2610 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2611 const __m128i secondSumLow = _mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1));
2612 const __m128i secondSumHigh = _mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1));
2613
2614 // build overall sum and add 2 for rounding
2615 const __m128i secondSum = _mm_add_epi16(secondSumLow, _mm_add_epi16(secondSumHigh, _mm_set1_epi32(int(0x00020002))));
2616
2617 // divide by 4 by right shifting of two bits
2618 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2619
2620 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2621 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2622
2623
2624 // combine both divion results
2625 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2626
2627 // copy the 128 bit to the memory
2628 _mm_storeu_si128((__m128i*)result, division8);
2629
2630 /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2631 const __m128i avgFirstRows = _mm_avg_epu8(firstRow0, firstRow1);
2632 const __m128i avgFirstRowsSwap = _mm_or_si128(_mm_slli_epi16(avgFirstRows, 8), _mm_srli_epi16(avgFirstRows, 8));
2633
2634 const __m128i avgFirst = _mm_avg_epu8(avgFirstRows, avgFirstRowsSwap); // 1 result in 2 uchar
2635 const __m128i avgFristOrdered = _mm_shuffle_epi8(avgFirst, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2636
2637 const __m128i avgSecondRows = _mm_avg_epu8(secondRow0, secondRow1);
2638 const __m128i avgSecondRowsSwap = _mm_or_si128(_mm_slli_epi16(avgSecondRows, 8), _mm_srli_epi16(avgSecondRows, 8));
2639
2640 const __m128i avgSecond = _mm_avg_epu8(avgSecondRows, avgSecondRowsSwap); // 1 result in 2 uchar
2641 const __m128i avgSecondOrdered = _mm_shuffle_epi8(avgSecond, _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0));
2642
2643 // combine both divion results
2644 const __m128i combinedAvg = _mm_or_si128(avgFristOrdered, avgSecondOrdered);
2645
2646 // copy the 128 bit to the memory
2647 _mm_storeu_si128((__m128i*)result, combinedAvg);
2648 */
2649}
2650
2651inline void SSE::average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2652{
2653 ocean_assert(image0 != nullptr && image1 != nullptr);
2654 ocean_assert(threshold >= 1u);
2655
2656 // load first 16 uchars
2657 const __m128i row0A_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2658 const __m128i row1A_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2659
2660 const __m128i horizontalSum0A_u_16x8 = _mm_maddubs_epi16(row0A_u_8x16, _mm_set1_epi8(1));
2661 const __m128i horizontalSum1A_u_16x8 = _mm_maddubs_epi16(row1A_u_8x16, _mm_set1_epi8(1));
2662
2663 const __m128i sumA_u_16x8 = _mm_add_epi16(horizontalSum0A_u_16x8, horizontalSum1A_u_16x8);
2664
2665 const __m128i maskA_u_16x8 = _mm_cmpgt_epi16(sumA_u_16x8, _mm_set1_epi16(short(threshold - 1)));
2666
2667 const __m128i row0B_u_8x16 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2668 const __m128i row1B_u_8x16 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2669
2670 const __m128i horizontalSum0B_u_16x8 = _mm_maddubs_epi16(row0B_u_8x16, _mm_set1_epi8(1));
2671 const __m128i horizontalSum1B_u_16x8 = _mm_maddubs_epi16(row1B_u_8x16, _mm_set1_epi8(1));
2672
2673 const __m128i sumB_u_16x8 = _mm_add_epi16(horizontalSum0B_u_16x8, horizontalSum1B_u_16x8);
2674
2675 const __m128i maskB_u_16x8 = _mm_cmpgt_epi16(sumB_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2676
2677 const __m128i mask_u_8x16 = _mm_or_si128(moveLowBits16_8ToLow64(maskA_u_16x8), moveLowBits16_8ToHigh64(maskB_u_16x8));
2678
2679 // copy the 128 bit to the memory
2680 _mm_storeu_si128((__m128i*)result, mask_u_8x16);
2681}
2682
2683inline void SSE::average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2684{
2685 ocean_assert(image0 && image1);
2686
2687 // 16 * uchar = m128i, but only the first 8 elements are set
2688 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2689 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2690
2691 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2692 const __m128i shuffledRow0 = shuffleNeighbor2Low64BitsToLow16_8(row0);
2693 const __m128i shuffledRow1 = shuffleNeighbor2Low64BitsToLow16_8(row1);
2694
2695 // build sum and add 2 for rounding
2696 const __m128i sumLow = _mm_add_epi16(shuffledRow0, shuffledRow1);
2697 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumLow), _mm_set1_epi32(int(0x00020002)));
2698
2699 // divide by 4 by right shifting of two bits
2700 const __m128i division16 = _mm_srli_epi16(sum, 2);
2701
2702 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2703 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2704
2705 memcpy(result, &division8, sizeof(uint8_t) * 4);
2706}
2707
2708inline void SSE::average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result)
2709{
2710 ocean_assert(image0 && image1);
2711
2712 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2713 const __m128 row0 = _mm_loadu_ps(image0);
2714 const __m128 row1 = _mm_loadu_ps(image1);
2715
2716 // get sum of first 4 elements
2717 const __m128 sumFirst = _mm_add_ps(row0, row1);
2718
2719 // load next 4 elements
2720 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2721 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2722
2723 // get sum of second 4 elements
2724 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2725
2726 // get sum of summed pixels
2727 // mask01000100 = 68u
2728 // mask11101110 = 238u
2729 const __m128 sumComponents = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, 68u), _mm_shuffle_ps(sumFirst, sumSecond, 238u));
2730
2731 // divide by 4 --> multiply by 0.25
2732 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2733
2734 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2735 _mm_storeu_ps(result, division);
2736}
2737
2738inline void SSE::average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2739{
2740 ocean_assert(image0 && image1);
2741
2742 // 16 * uchar = m128i
2743 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2744 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2745
2746 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2747 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2748 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2749
2750 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2751 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2752
2753 // divide by 4 by right shifting of two bits
2754 const __m128i division16 = _mm_srli_epi16(sum, 2);
2755
2756 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2757 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2758
2759 // copy the lower 64 bit to the memory
2760 _mm_storel_epi64((__m128i*)result, division8);
2761}
2762
2763inline void SSE::average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2764{
2765 ocean_assert(image0 && image1);
2766
2767 // first 16 elements: 16 * uchar = m128i
2768 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2769 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2770
2771 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2772 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2773 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2774
2775 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2776 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2777
2778 // divide by 4 by right shifting of two bits
2779 const __m128i division16 = _mm_srli_epi16(sum, 2);
2780
2781 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2782 const __m128i firstDivision8 = moveLowBits16_8ToLow64(division16);
2783
2784 // second 16 elements
2785 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2786 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2787
2788 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2789 const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(secondRow0), shuffleNeighbor2Low64BitsToLow16_8(secondRow1));
2790 const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(secondRow0), shuffleNeighbor2High64BitsToLow16_8(secondRow1));
2791
2792 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2793 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2794
2795 // divide by 4 by right shifting of two bits
2796 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2797
2798 // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2799 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2800
2801
2802 // combine both divion results
2803 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2804
2805 // copy the 128 bit to the memory
2806 _mm_storeu_si128((__m128i*)result, division8);
2807}
2808
inline void SSE::average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result)
{
	ocean_assert(image0 && image1 && result);

	// 6 * float = 2 pixel: 00 01 02 03 04 05
	// two 3-channel pixels per row, two rows -> one 3-channel output pixel

	// load element 0 up to 3, input does not need to be aligned on any particular boundary.
	const __m128 row0 = _mm_loadu_ps(image0);
	const __m128 row1 = _mm_loadu_ps(image1);

	// get sum of first 4 elements
	const __m128 sumFirst = _mm_add_ps(row0, row1);

	// load element 2 up to 5 to prevent that we access memory out of our range
	const __m128 rowSecond0 = _mm_loadu_ps(image0 + 2);
	const __m128 rowSecond1 = _mm_loadu_ps(image1 + 2);

	// get sum of second 4 elements
	const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);

	// get sum of summed pixels
	// NOTE: _mm_shuffle_ps resulting first 64bit are always from first __m128, second 64bit from second __m128
	// mask111001 = 57u; // 'i+1'th float became 'i'
	// sumSecond holds the vertical sums of elements 2..5; rotating by one lane aligns elements 3, 4, 5 with
	// elements 0, 1, 2 of sumFirst, so the first three lanes become the per-channel sums of all four pixels
	const __m128 sumComponents = _mm_add_ps(sumFirst, _mm_shuffle_ps(sumSecond, sumSecond, 57u));

	// divide by 4 --> multiply by 0.25
	const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));

	// store 3 elements (96 bit) to the memory; the fourth lane holds garbage and is not written

#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division.m128_f32[0], sizeof(float) * 3);
#else
	memcpy(result, &division, sizeof(float) * 3);
#endif
}
2845
inline void SSE::average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
{
	ocean_assert(image0 && image1 && result);

	// 2x2 average of 24 bytes (8 three-channel pixels) per row into 12 result bytes (4 pixels);
	// shuffle-mask bytes with the high bit set (0xA0) zero the destination lane (see _mm_shuffle_epi8)

	__m128i row0 = _mm_lddqu_si128((__m128i*)image0);
	__m128i row1 = _mm_lddqu_si128((__m128i*)image1);

	// distribute the first 12 elements (element 00 up to 11):
	// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
	//
	// -- -- -- -- -- 08 -- 07 -- 06 -- 02 -- 01 -- 00
	// -- -- -- -- -- 11 -- 10 -- 09 -- 05 -- 04 -- 03

	__m128i shuffleMaskLow = set128i(0xA0A0A0A0A008A007ull, 0xA006A002A001A000ull);
	__m128i shuffleMaskHigh = set128i(0xA0A0A0A0A00BA00Aull, 0xA009A005A004A003ull);

	// widen the channels of the left and right pixel of each horizontal pair to 16 bit and sum both rows vertically
	__m128i sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
	__m128i sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));

	// add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
	__m128i sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));

	// divide by 4 by right shifting of two bits
	__m128i division16 = _mm_srli_epi16(sum, 2);

	// shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
	__m128i division8 = _mm_shuffle_epi8(division16, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A00A0806040200ull));


	// now we load the remaining 12 elements (however, this time we take element 04 up to 15 to prevent that we access memory out of our range)
	// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
	//
	// -- -- -- -- -- 12 -- 11 -- 10 -- 06 -- 05 -- 04
	// -- -- -- -- -- 15 -- 14 -- 13 -- 09 -- 08 -- 07

	row0 = _mm_lddqu_si128((__m128i*)(image0 + 8));
	row1 = _mm_lddqu_si128((__m128i*)(image1 + 8));

	shuffleMaskLow = set128i(0xA0A0A0A0A00CA00Bull, 0xA00AA006A005A004ull);
	shuffleMaskHigh = set128i(0xA0A0A0A0A00FA00Eull, 0xA00DA009A008A007ull);

	sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
	sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));

	// add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
	sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));

	// divide by 4 by right shifting of two bits
	division16 = _mm_srli_epi16(sum, 2);

	// narrow the second six averages and merge them behind the first six (bytes 6..11 of the result)
	division8 = _mm_or_si128(division8, _mm_shuffle_epi8(division16, set128i(0xA0A0A0A00A080604ull, 0x0200A0A0A0A0A0A0ull)));

	// only the 12 valid result bytes are written
#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division8.m128i_u8[0], 12);
#else
	memcpy(result, &division8, 12);
#endif
}
2905
2906inline void SSE::average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result)
2907{
2908 ocean_assert(image0 && image1);
2909
2910 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2911 const __m128 row0 = _mm_loadu_ps(image0);
2912 const __m128 row1 = _mm_loadu_ps(image1);
2913
2914 // get sum of first 4 elements
2915 const __m128 sumFirstPixel = _mm_add_ps(row0, row1);
2916
2917 // load next 4 elements
2918 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2919 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2920
2921 // get sum of second 4 elements
2922 const __m128 sumSecondPixel = _mm_add_ps(rowSecond0, rowSecond1);
2923
2924 // get sum of summed pixels
2925 const __m128 sumComponents = _mm_add_ps(sumFirstPixel, sumSecondPixel);
2926
2927 // divide by 4 --> multiply by 0.25
2928 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2929
2930 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2931 _mm_storeu_ps(result, division);
2932}
2933
2934inline void SSE::average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2935{
2936 ocean_assert(image0 && image1);
2937
2938 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2939 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2940
2941 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2942 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(row0), shuffleNeighbor4Low64BitsToLow16_8(row1));
2943 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(row0), shuffleNeighbor4High64BitsToLow16_8(row1));
2944
2945 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2946 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2947
2948 // divide by 4 by right shifting of two bits
2949 const __m128i division16 = _mm_srli_epi16(sum, 2);
2950
2951 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2952 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2953
2954 // copy the lower 64 bit to the memory
2955 _mm_storel_epi64((__m128i*)result, division8);
2956}
2957
2958inline void SSE::average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2959{
2960 ocean_assert(image0 && image1);
2961
2962 // first 16 elements
2963 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2964 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2965
2966 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2967 const __m128i firstSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(firstRow0), shuffleNeighbor4Low64BitsToLow16_8(firstRow1));
2968 const __m128i firstSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(firstRow0), shuffleNeighbor4High64BitsToLow16_8(firstRow1));
2969
2970 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2971 const __m128i firstSum = _mm_add_epi16(_mm_hadd_epi16(firstSumLow, firstSumHigh), _mm_set1_epi32(int(0x00020002)));
2972
2973 // divide by 4 by right shifting of two bits
2974 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2975
2976 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2977 const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2978
2979
2980 // second 16 elements
2981 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2982 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2983
2984 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2985 const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(secondRow0), shuffleNeighbor4Low64BitsToLow16_8(secondRow1));
2986 const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(secondRow0), shuffleNeighbor4High64BitsToLow16_8(secondRow1));
2987
2988 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2989 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2990
2991 // divide by 4 by right shifting of two bits
2992 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2993
2994 // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2995 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2996
2997
2998 // combine both divion results
2999 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
3000
3001 // copy the 128 bit to the memory
3002 _mm_storeu_si128((__m128i*)result, division8);
3003}
3004
inline void SSE::average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
{
	ocean_assert(image0 && image1 && image2);

	/**
	 * Applies the normalized 3x3 filter kernel to three rows of 30 elements each,
	 * producing 10 filtered output bytes:
	 *
	 *        | 1 2 1 |
	 * 1/16   | 2 4 2 |
	 *        | 1 2 1 |
	 */

	// first 16 elements (actual 14 are used)
	const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
	const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
	const __m128i firstRow2 = _mm_lddqu_si128((__m128i*)image2);

	// distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum, middle row is summed twice
	// (this realizes the vertical 1-2-1 weighting of the kernel)
	const __m128i firstSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1)), _mm_add_epi16(removeHighBits16_8(firstRow1), removeHighBits16_8(firstRow2)));
	const __m128i firstSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1)), _mm_add_epi16(moveHighBits16_8(firstRow1), moveHighBits16_8(firstRow2)));

	// second 16 elements, starting from 15th element
	const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 14));
	const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 14));
	const __m128i secondRow2 = _mm_lddqu_si128((__m128i*)(image2 + 14));

	// distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum, middle row is summed twice
	const __m128i secondSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1)), _mm_add_epi16(removeHighBits16_8(secondRow1), removeHighBits16_8(secondRow2)));
	const __m128i secondSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1)), _mm_add_epi16(moveHighBits16_8(secondRow1), moveHighBits16_8(secondRow2)));

	// the horizontal 1-2-1 weighting is realized via the shuffled adds below; shuffle-mask bytes 0xFF zero the lane

	// build overall sum and add 8 for rounding
	// positions 0, 2, 3, 5, 6 are valid, e.g. pos. 0 contains element00 + element01
	const __m128i firstSum = _mm_add_epi16(firstSumEven, _mm_add_epi16(firstSumOdd, _mm_set1_epi32(int(0x00080008))));
	// e.g. pos. 0 contains now element00 + element01 + element02
	const __m128i firstSumWithEven = _mm_add_epi16(firstSum, _mm_shuffle_epi8(firstSumEven, set128i(0xFFFF0F0E0B0AFFFFull, 0x09080504FFFF0302ull)));
	// e.g. pos. 0 contains now element00 + element01 + element02 + element01
	const __m128i firstSumWithBoth = _mm_add_epi16(firstSumWithEven, _mm_shuffle_epi8(firstSumOdd, set128i(0xFFFF0D0C0908FFFFull, 0x07060302FFFF0100ull)));

	// build overall sum and add 8 for rounding
	// positions 1, 2, 4, 5, 7 are valid
	const __m128i secondSum = _mm_add_epi16(secondSumEven, _mm_add_epi16(secondSumOdd, _mm_set1_epi32(int(0x00080008))));
	const __m128i secondSumWithEven = _mm_add_epi16(secondSum, _mm_shuffle_epi8(secondSumEven, set128i(0x0F0EFFFF0D0C0908ull, 0xFFFF07060302FFFFull)));
	const __m128i secondSumWithBoth = _mm_add_epi16(secondSumWithEven, _mm_shuffle_epi8(secondSumOdd, set128i(0x0D0CFFFF0B0A0706ull, 0xFFFF05040100FFFFull)));

	// divide by 16 by right shifting of four bits
	const __m128i firstDivision16 = _mm_srli_epi16(firstSumWithBoth, 4);
	const __m128i secondDivision16 = _mm_srli_epi16(secondSumWithBoth, 4);

	// reorder valid elements to lowest bits
	const __m128i firstDivision8 = _mm_shuffle_epi8(firstDivision16, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0C0A060400ull));
	const __m128i secondDivision8 = _mm_shuffle_epi8(secondDivision16, set128i(0xFFFFFFFFFFFF0E0Aull, 0x080402FFFFFFFFFFull));

	// combine both division results
	const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);

	// copy the lowest 10*8 bit to the memory
#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division8.m128i_u8[0], 10);
#else
	memcpy(result, &division8, 10);
#endif
}
3065
3067{
3068 /**
3069 * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3070 * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3071 * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3072 */
3073
3074 // We create a bit mask for all 16 bit odd values, an odd value will create an active lower bit in each 16 bit value
3075 const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0001000100010001ull, 0x0001000100010001ull));
3076
3077 // We create a bit mask for all 16 bit negative values, a negative value will create an active lower bit in each 16 bit value
3078 const __m128i maskNegatives = _mm_srli_epi16(_mm_and_si128(value, CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull)), 15);
3079
3080 // We add 1 to each 16 bit value having an active 'odd-bit' and active
3081 // 'negative-bit'
3082 return _mm_add_epi16(value, _mm_and_si128(maskNegatives, maskOdds));
3083}
3084
3085inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3086{
3087 ocean_assert(rightShifts < 16u);
3088
3089 // the offset for negative values: 2^shifts - 1
3090 const __m128i offsetForNegatives_s_16x8 = _mm_set1_epi16(short((1u << rightShifts) - 1u));
3091
3092 // bit mask for all 16 bit negative values
3093 const __m128i maskHigh_s_16x8 = CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull);
3094
3095 // 0x0000 for positive values, 0xFFFF for negative values
3096 const __m128i maskNegativeValues_s_16x8 = _mm_cmpeq_epi16(_mm_and_si128(value, maskHigh_s_16x8), maskHigh_s_16x8);
3097
3098 // 0 for positive values, 2^shifts - 1 for negative values
3099 const __m128i offset_s_16x8 = _mm_and_si128(offsetForNegatives_s_16x8, maskNegativeValues_s_16x8);
3100
3101 return _mm_add_epi16(value, offset_s_16x8);
3102}
3103
3104inline __m128i SSE::divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3105{
3106 return _mm_srai_epi16(addOffsetBeforeRightShiftDivisionSigned16Bit(value, rightShifts), int(rightShifts));
3107}
3108
3109inline __m128i SSE::roundedDivideByRightShiftSigned16Bit(const __m128i& value_s16x8, const unsigned int rightShifts)
3110{
3111 ocean_assert(rightShifts >= 1 && rightShifts <= 15);
3112
3113 const __m128i signMask_s16x8 = _mm_srai_epi16(value_s16x8, 15); // 0x0000 for +, 0xFFFF for -
3114
3115 const __m128i absValue_s16x8 = _mm_abs_epi16(value_s16x8);
3116 const __m128i offset_s16x8 = _mm_set1_epi16(1 << (rightShifts - 1));
3117
3118 const __m128i absValueWithOffset_s16x8 = _mm_add_epi16(absValue_s16x8, offset_s16x8);
3119
3120 const __m128i shifted_s16x8 = _mm_srai_epi16(absValueWithOffset_s16x8, rightShifts);
3121
3122 return _mm_sub_epi16(_mm_xor_si128(shifted_s16x8, signMask_s16x8), signMask_s16x8); // restore sign: (shifted ^ sign_mask) - sign_mask
3123}
3124
3125inline int16_t SSE::maximalValueForRoundedDivisionByRightShiftSigned16Bit(const unsigned int rightShifts)
3126{
3127 ocean_assert(rightShifts >= 1 && rightShifts <= 15);
3128
3129 const int32_t maxValue = 32767 - (1 << (rightShifts - 1));
3130
3132
3133 return int16_t(maxValue);
3134}
3135
3137{
3138 /**
3139 * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3140 * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3141 * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3142 */
3143
3144 // We create a bit mask for all 32 bit odd values, an odd value will create an active lower bit in each 32 bit value
3145 const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0000000100000001ull, 0x0000000100000001ull));
3146
3147 // We create a bit mask for all 32 bit negative values, a negative value will create an active lower bit in each 32 bit value
3148 const __m128i maskNegatives = _mm_srli_epi32(_mm_and_si128(value, CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull)), 31);
3149
3150 // We add 1 to each 32 bit value having an active 'odd-bit' and active 'negative-bit'
3151 return _mm_add_epi32(value, _mm_and_si128(maskNegatives, maskOdds));
3152}
3153
3154inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3155{
3156 ocean_assert(rightShifts < 32u);
3157
3158 // the offset for negative values: 2^shifts - 1
3159 const __m128i offsetForNegatives_s_32x4 = _mm_set1_epi32(int((1u << rightShifts) - 1u));
3160
3161 // bit mask for all 32 bit negative values
3162 const __m128i maskHigh_s_32x4 = CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull);
3163
3164 // 0x00000000 for positive values, 0xFFFFFFFF for negative values
3165 const __m128i maskNegativeValues_s_32x4 = _mm_cmpeq_epi32(_mm_and_si128(value, maskHigh_s_32x4), maskHigh_s_32x4);
3166
3167 // 0 for positive values, 2^shifts - 1 for negative values
3168 const __m128i offset_s_32x4 = _mm_and_si128(offsetForNegatives_s_32x4, maskNegativeValues_s_32x4);
3169
3170 return _mm_add_epi32(value, offset_s_32x4);
3171}
3172
3173inline __m128i SSE::divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3174{
3175 return _mm_srai_epi32(addOffsetBeforeRightShiftDivisionSigned32Bit(value, rightShifts), int(rightShifts));
3176}
3177
inline void SSE::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
{
	ocean_assert(source && response && width >= 10u);

	// Computes half-difference gradients dx = (right - left) / 2 and dy = (bottom - top) / 2 and stores them interleaved [dx0, dy0, dx1, dy1, ...].
	// NOTE(review): the loads read 16 pixels and 32 response bytes are written, despite the '8Elements' in the name -- confirm the intended element count at the declaration.
	// NOTE(review): 'source' must have valid neighbors at -1/+1 and -width/+width, i.e. it must not point into the first/last row or column -- verify at the callers.

	// Load 16 unsigned 8-bit values; left/right/top/bottom pixels
	const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
	const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));

	const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
	const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i horizontalMinusLo = _mm_shuffle_epi8(horizontalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
	const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i horizontalPlusLo = _mm_shuffle_epi8(horizontalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
	const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
	const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);

	// Convert the low and high signed 16-bit differences to signed 8-bit and merge them into a single register
	const __m128i horizontalGradient = _mm_or_si128(
		_mm_shuffle_epi8(horizontalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
		_mm_shuffle_epi8(horizontalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i verticalMinusLo = _mm_shuffle_epi8(verticalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == a[7:0]
	const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i verticalPlusLo = _mm_shuffle_epi8(verticalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == b[7:0]
	const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
	const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);

	// Convert the differences to signed char and merge the high and low halves
	const __m128i verticalGradient = _mm_or_si128(
		_mm_shuffle_epi8(verticalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
		_mm_shuffle_epi8(verticalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));

	// Take the horizontal gradients, [dx0, dx1, dx2, ...], and the vertical gradient, [dy0, dy1, dy2, ...] and interleave them, [dx0, dy0, dx1, dy1, dx2, dy2, ...]
	const __m128i interleavedResponseLo = _mm_unpacklo_epi8(horizontalGradient, verticalGradient);
	const __m128i interleavedResponseHi = _mm_unpackhi_epi8(horizontalGradient, verticalGradient);

	// two unaligned 16 byte stores write the 32 interleaved response bytes
	ocean_assert(sizeof(char) == 1ull);
	_mm_storeu_si128((__m128i*)response, interleavedResponseLo);
	_mm_storeu_si128((__m128i*)(response + 16ull), interleavedResponseHi);
}
3233
inline void SSE::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
{
	ocean_assert(source && response && width >= 10u);

	// Computes the three gradient products dx*dx, dy*dy and dx*dy (with dx = (right - left) / 2, dy = (bottom - top) / 2)
	// and stores them interleaved as signed 16-bit values [dxdx0, dydy0, dxdy0, dxdx1, ...].
	// NOTE(review): the loads read 16 pixels and 96 response bytes (48 int16_t) are written, despite the '8Elements' in the name -- confirm the intended element count at the declaration.

	// Load 4x(16x8u) values: left/right/top/bottom pixels
	const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
	const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));

	const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
	const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
	const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
	const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);

	// Squared gradients: h*h, v*v, h*v (each product of two values in [-128, 127] fits into a signed 16-bit value)
	const __m128i horizontalHorizontalLo = _mm_mullo_epi16(horizontalGradientLo, horizontalGradientLo);
	const __m128i horizontalHorizontalHi = _mm_mullo_epi16(horizontalGradientHi, horizontalGradientHi);

	const __m128i verticalVerticalLo = _mm_mullo_epi16(verticalGradientLo, verticalGradientLo);
	const __m128i verticalVerticalHi = _mm_mullo_epi16(verticalGradientHi, verticalGradientHi);

	const __m128i horzontalVerticalLo = _mm_mullo_epi16(horizontalGradientLo, verticalGradientLo);
	const __m128i horzontalVerticalHi = _mm_mullo_epi16(horizontalGradientHi, verticalGradientHi);

	// Interleave/pack the above squared gradient, 16S values
	//
	// a, b, c - Above variables ending in *Lo
	// d, e, f - Above variables ending in *Hi
	//
	// a = [a7, a6, a5, a4, a3, a2, a1, a0]
	// b = [b7, b6, b5, b4, b3, b2, b1, b0]
	// c = [c7, c6, c5, c4, c3, c2, c1, c0]
	//
	// d = [d7, d6, d5, d4, d3, d2, d1, d0]
	// e = [e7, e6, e5, e4, e3, e2, e1, e0]
	// f = [f7, f6, f5, f4, f3, f2, f1, f0]
	//
	// A = [b2, a2, c1, b1, a1, c0, b0, a0]
	// B = [a5, c4, b4, a4, c3, b3, a3, c2]
	// C = [c7, b7, a7, c6, b6, a6, c5, b5]
	//
	// D = [e2, d2, f1, e1, d1, f0, e0, d0]
	// E = [d5, f4, e4, d4, f3, e3, d3, f2]
	// F = [f7, e7, d7, f6, e6, d6, f5, e5]

	const __m128i block0Lo = _mm_or_si128( // == [b2, a2, c1, b1, a1, c0, b0, a0]
		_mm_or_si128( // == [b2, a2, 00, b1, a1, 00, b0, a0]
			_mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, a2, 00, 00, a1, 00, 00, a0]
			_mm_shuffle_epi8(verticalVerticalLo, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [b2, 00, 00, b1, 00, 00, b0, 00]
		_mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, c1, 00, 00, c0, 00, 00]

	const __m128i block1Lo = _mm_or_si128( // == [a5, c4, b4, a4, c3, b3, a3, c2]
		_mm_or_si128( // == [a5, 00, b4, a4, 00, b3, a3, 00]
			_mm_shuffle_epi8(horizontalHorizontalLo, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [a5, 00, 00, a4, 00, 00, a3, 00]
			_mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, b4, 00, 00, b3, 00, 00]
		_mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, c4, 00, 00, c3, 00, 00, c2]

	const __m128i block2Lo = _mm_or_si128( // == [c7, b7, a7, c6, b6, a6, c5, b5]
		_mm_or_si128( // == [00, b7, a7, 00, b6, a6, 00, b5]
			_mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, a7, 00, 00, a6, 00, 00]
			_mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, b7, 00, 00, b6, 00, 00, b5]
		_mm_shuffle_epi8(horzontalVerticalLo, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [c7, 00, 00, c6, 00, 00, c5, 00]

	const __m128i block0Hi = _mm_or_si128( // == [e2, d2, f1, e1, d1, f0, e0, d0]
		_mm_or_si128( // == [e2, d2, 00, e1, d1, 00, e0, d0]
			_mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, d2, 00, 00, d1, 00, 00, d0]
			_mm_shuffle_epi8(verticalVerticalHi, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [e2, 00, 00, e1, 00, 00, e0, 00]
		_mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, f1, 00, 00, f0, 00, 00]

	const __m128i block1Hi = _mm_or_si128( // == [d5, f4, e4, d4, f3, e3, d3, f2]
		_mm_or_si128( // == [d5, 00, e4, d4, 00, e3, d3, 00]
			_mm_shuffle_epi8(horizontalHorizontalHi, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [d5, 00, 00, d4, 00, 00, d3, 00]
			_mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, e4, 00, 00, e3, 00, 00]
		_mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, f4, 00, 00, f3, 00, 00, f2]

	const __m128i block2Hi = _mm_or_si128( // == [f7, e7, d7, f6, e6, d6, f5, e5]
		_mm_or_si128( // == [00, e7, d7, 00, e6, d6, 00, e5]
			_mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, d7, 00, 00, d6, 00, 00]
			_mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, e7, 00, 00, e6, 00, 00, e5]
		_mm_shuffle_epi8(horzontalVerticalHi, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [f7, 00, 00, f6, 00, 00, f5, 00]

	// six unaligned 16 byte stores write the 48 interleaved int16_t response values
	_mm_storeu_si128((__m128i*)response, block0Lo);
	_mm_storeu_si128((__m128i*)(response + 8ull), block1Lo);
	_mm_storeu_si128((__m128i*)(response + 16ull), block2Lo);
	_mm_storeu_si128((__m128i*)(response + 24ull), block0Hi);
	_mm_storeu_si128((__m128i*)(response + 32ull), block1Hi);
	_mm_storeu_si128((__m128i*)(response + 40ull), block2Hi);
}
3341
OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2)
{
	// De-interleaves 15 bytes (5 pixels of a 3-channel image); 'X' marks don't-care bytes
	// (shuffle indices with the high bit set, 0xFF, produce zero in those lanes).

	// interleaved   R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 X

	// channel01     R0 R1 R2 R3 R4 X X X G0 G1 G2 G3 G4 X X X
	// channel2      B0 B1 B2 B3 B4 X X X 0  0  0  0  0  0  0  0

	channel01 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFF0d0a070401ull, 0xFFFFFF0c09060300ull));

	channel2 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull));
}
3353
OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2)
{
	// De-interleaves 24 bytes (8 pixels of a 3-channel image) spread over two registers;
	// each output is assembled by OR-ing two shuffles whose 0xFF indices zero the lanes filled by the other source.

	// interleavedA  R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
	// interleavedB  G5 B5 R6 G6 B6 R7 G7 B7 X X X X X X X X

	// channel01     R0 R1 R2 R3 R4 R5 R6 R7 G0 G1 G2 G3 G4 G5 G6 G7
	// channel2      B0 B1 B2 B3 B4 B5 B6 B7 0 0 0 0 0 0 0 0

	channel01 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFF0d0a070401ull, 0xFFFF0f0c09060300ull)),
					_mm_shuffle_epi8(interleavedB, set128i(0x060300FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));

	channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
					_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFFFFFFull, 0x070401FFFFFFFFFFull)));
}
3368
OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2)
{
	// De-interleaves 48 bytes (16 pixels of a 3-channel image, e.g. RGB) into three per-channel registers.
	// Each channel is assembled by OR-ing three shuffles, one per input register; 0xFF indices zero the lanes
	// that the other two shuffles contribute.

	channel0 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFF0f0c09060300ull)),
					_mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0e0b08ull, 0x0502FFFFFFFFFFFFull)),
							_mm_shuffle_epi8(interleavedC, set128i(0x0d0a070401FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	channel1 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
					_mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
							_mm_shuffle_epi8(interleavedC, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
					_mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFF0d0aull, 0x070401FFFFFFFFFFull)),
							_mm_shuffle_epi8(interleavedC, set128i(0x0f0c09060300FFFFull, 0xFFFFFFFFFFFFFFFFull))));
}
3383
3384inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3385{
3386 ocean_assert(interleaved != nullptr);
3387
3388 deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0, channel1, channel2);
3389}
3390
3391inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2)
3392{
3393 ocean_assert(interleaved && channel0 && channel1 && channel2);
3394
3395 __m128i channel0_128, channel1_128, channel2_128;
3396 deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0_128, channel1_128, channel2_128);
3397
3398 store128i(channel0_128, channel0);
3399 store128i(channel1_128, channel1);
3400 store128i(channel2_128, channel2);
3401}
3402
inline void SSE::deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
{
	ocean_assert(interleaved != nullptr);

	// Only 45 bytes may be read: the third load starts at offset 29 (reading bytes 29..44) and the result is
	// shifted right by 3 bytes, placing bytes 32..44 into lanes 0..12 (lanes 13..15 become zero) -- equivalent
	// to a 16 byte load at offset 32 without touching bytes 45..47.
	deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3), channel0, channel1, channel2);
}
3409
OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC)
{
	// Interleaves three 16 byte channel registers into 48 bytes C0[0] C1[0] C2[0] C0[1] C1[1] C2[1] ...
	// (the inverse of deInterleave3Channel8Bit48Elements); each output register is assembled by OR-ing
	// three shuffles whose 0xFF indices zero the lanes filled by the other two channels.

	interleavedA = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0x05FFFF04FFFF03FFull, 0xFF02FFFF01FFFF00ull)),
						_mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFFFF04FFFF03FFFFull, 0x02FFFF01FFFF00FFull)),
								_mm_shuffle_epi8(channel2, set128i(0xFF04FFFF03FFFF02ull, 0xFFFF01FFFF00FFFFull))));

	interleavedB = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFF0AFFFF09FFFF08ull, 0xFFFF07FFFF06FFFFull)),
						_mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0x0AFFFF09FFFF08FFull, 0xFF07FFFF06FFFF05ull)),
								_mm_shuffle_epi8(channel2, set128i(0xFFFF09FFFF08FFFFull, 0x07FFFF06FFFF05FFull))));

	interleavedC = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFFFF0FFFFF0EFFFFull, 0x0DFFFF0CFFFF0BFFull)),
						_mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFF0FFFFF0EFFFF0Dull, 0xFFFF0CFFFF0BFFFFull)),
								_mm_shuffle_epi8(channel2, set128i(0x0FFFFF0EFFFF0DFFull, 0xFF0CFFFF0BFFFF0Aull))));
}
3424
3425OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved)
3426{
3427 ocean_assert(channel0 && channel1 && channel2 && interleaved);
3428
3429 __m128i interleavedA_128, interleavedB_128, interleavedC_128;
3430 interleave3Channel8Bit48Elements(load128i(channel0), load128i(channel1), load128i(channel2), interleavedA_128, interleavedB_128, interleavedC_128);
3431
3432 store128i(interleavedA_128, interleaved + 0);
3433 store128i(interleavedB_128, interleaved + 16);
3434 store128i(interleavedC_128, interleaved + 32);
3435}
3436
OCEAN_FORCE_INLINE void SSE::store1Channel8Bit8ElementsTo3Channels24Elements(const __m128i& singleChannel_u_8x8, uint8_t* interleaved)
{
	ocean_assert(interleaved != nullptr);

	// Replicates each of 8 source bytes into three consecutive output bytes: [s0 s0 s0 s1 s1 s1 ... s7 s7 s7],
	// writing 24 bytes (a full 16 byte store followed by an 8 byte store).

	// singleChannel_u_8x8 contains 8 elements in lower 8 bytes: [s0, s1, s2, s3, s4, s5, s6, s7]

	// first 16 output bytes: s0 s0 s0 s1 s1 s1 s2 s2 s2 s3 s3 s3 s4 s4 s4 s5
	const __m128i shuffleMask0 = _mm_set_epi8(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0);
	const __m128i interleaved0 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask0);

	// remaining 8 output bytes: s5 s5 s6 s6 s6 s7 s7 s7 (upper lanes zeroed by the -1 indices and not stored)
	const __m128i shuffleMask1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 7, 7, 7, 6, 6, 6, 5, 5);
	const __m128i interleaved1 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask1);

	_mm_storeu_si128((__m128i*)(interleaved + 0), interleaved0);
	_mm_storel_epi64((__m128i*)(interleaved + 16), interleaved1);
}
3452
OCEAN_FORCE_INLINE void SSE::store1Channel8Bit8ElementsTo4Channels32ElementsWithConstantLastChannel(const __m128i& singleChannel_u_8x8, const uint8_t lastChannelValue, uint8_t* interleaved)
{
	ocean_assert(interleaved != nullptr);

	// Replicates each of 8 source bytes into the first three channels of a 4-channel pixel and fills the
	// fourth channel with 'lastChannelValue': [s0 s0 s0 v s1 s1 s1 v ... s7 s7 s7 v], writing 32 bytes.

	// singleChannel_u_8x8 contains 8 elements in lower 8 bytes: [s0, s1, s2, s3, s4, s5, s6, s7]

	const __m128i shuffleMask0 = _mm_set_epi8(-128, 3, 3, 3, -128, 2, 2, 2, -128, 1, 1, 1, -128, 0, 0, 0); // -128 means set to zero, for 4th channel positions
	const __m128i shuffleMask1 = _mm_set_epi8(-128, 7, 7, 7, -128, 6, 6, 6, -128, 5, 5, 5, -128, 4, 4, 4);

	// expand to first 3 channels with zero in 4th channel positions
	__m128i result0 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask0);
	__m128i result1 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask1);

	// blend mask selecting every 4th byte (the last channel of each pixel)
	const __m128i channel4Mask = _mm_set_epi8(-1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0);

	const __m128i lastChannelValue_u_8x16 = _mm_set1_epi8(char(lastChannelValue));

	// insert the constant last-channel value wherever the blend mask's high bit is set
	result0 = _mm_blendv_epi8(result0, lastChannelValue_u_8x16, channel4Mask);
	result1 = _mm_blendv_epi8(result1, lastChannelValue_u_8x16, channel4Mask);

	_mm_storeu_si128((__m128i*)(interleaved + 0), result0);
	_mm_storeu_si128((__m128i*)(interleaved + 16), result1);
}
3476
3477OCEAN_FORCE_INLINE void SSE::reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3478{
3479 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3480
3481 // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3482 // Y A Y A Y A Y A Y A Y A Y A Y A
3483 // output: A Y A Y A Y A Y A Y A Y A Y A Y
3484 // 1 0 3 2 5 4 7 6 9 8 B A D C F E
3485
3486 const __m128i shuffleMask_u_16x8 = set128i(0x0E0F0C0D0A0B0809ull, 0x0607040502030001ull);
3487
3488 store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3489 store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3490}
3491
OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2)
{
	// Reverses the channel order of 16 3-channel pixels (e.g. RGB -> BGR) held in three consecutive registers.
	// Pixels straddle register boundaries, so each output register ORs shuffles of up to three input registers;
	// 0xFF indices zero the lanes contributed by the neighboring register.

	reversedInterleaved0 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFF0c0d0e090a0b06ull, 0x0708030405000102ull)),
								_mm_shuffle_epi8(interleaved1, set128i(0x01FFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull)));

	reversedInterleaved1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFF0fFFull)),
								_mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0x0fFF0b0c0d08090aull, 0x050607020304FF00ull)),
										_mm_shuffle_epi8(interleaved2, set128i(0xFF00FFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	reversedInterleaved2 = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFF0eull)),
								_mm_shuffle_epi8(interleaved2, set128i(0x0d0e0f0a0b0c0708ull, 0x09040506010203FFull)));
}
3504
3505OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* const reversedInterleaved)
3506{
3507 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3508
3509 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3510 reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3511
3512 store128i(reversedInterleaved0, reversedInterleaved);
3513 store128i(reversedInterleaved1, reversedInterleaved + 16);
3514 store128i(reversedInterleaved2, reversedInterleaved + 32);
3515}
3516
3517OCEAN_FORCE_INLINE void SSE::reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3518{
3519 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3520
3521 // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3522 // R G B A R G B A R G B A R G B A
3523 // output: A B G R A B G R A B G R A B G R
3524 // 3 2 1 0 7 6 5 4 B A 9 8 F E D C
3525
3526 const __m128i shuffleMask_u_16x8 = set128i(0x0C0D0E0F08090A0Bull, 0x0405060700010203ull);
3527
3528 store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3529 store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3530 store128i(_mm_shuffle_epi8(load128i(interleaved + 32), shuffleMask_u_16x8), reversedInterleaved + 32);
3531 store128i(_mm_shuffle_epi8(load128i(interleaved + 48), shuffleMask_u_16x8), reversedInterleaved + 48);
3532}
3533
3534inline void SSE::reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved)
3535{
3536 ocean_assert(interleaved);
3537
3538 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3539 reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3540
3541 store128i(reversedInterleaved0, interleaved);
3542 store128i(reversedInterleaved1, interleaved + 16);
3543 store128i(reversedInterleaved2, interleaved + 32);
3544}
3545
3546inline void SSE::swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second)
3547{
3548 ocean_assert(first && second && first != second);
3549
3550 __m128i first0, first1, first2;
3551 reverseChannelOrder3Channel8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3552
3553 __m128i second0, second1, second2;
3554 reverseChannelOrder3Channel8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3555
3556 store128i(first0, second);
3557 store128i(first1, second + 16);
3558 store128i(first2, second + 32);
3559
3560 store128i(second0, first);
3561 store128i(second1, first + 16);
3562 store128i(second2, first + 32);
3563}
3564
3565inline void SSE::reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2)
3566{
3567 const __m128i mask = set128i(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
3568
3569 reversedElements0 = _mm_shuffle_epi8(elements2, mask);
3570 reversedElements1 = _mm_shuffle_epi8(elements1, mask);
3571 reversedElements2 = _mm_shuffle_epi8(elements0, mask);
3572}
3573
3574inline void SSE::reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements)
3575{
3576 ocean_assert(elements && reversedElements);
3577
3578 __m128i reversedElements0, reversedElements1, reversedElements2;
3579 reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3580
3581 store128i(reversedElements0, reversedElements);
3582 store128i(reversedElements1, reversedElements + 16);
3583 store128i(reversedElements2, reversedElements + 32);
3584}
3585
3586inline void SSE::reverseElements8Bit48Elements(uint8_t* elements)
3587{
3588 ocean_assert(elements);
3589
3590 __m128i reversedElements0, reversedElements1, reversedElements2;
3591 reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3592
3593 store128i(reversedElements0, elements);
3594 store128i(reversedElements1, elements + 16);
3595 store128i(reversedElements2, elements + 32);
3596}
3597
3598inline void SSE::swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second)
3599{
3600 ocean_assert(first && second && first != second);
3601
3602 __m128i first0, first1, first2;
3603 reverseElements8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3604
3605 __m128i second0, second1, second2;
3606 reverseElements8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3607
3608 store128i(first0, second);
3609 store128i(first1, second + 16);
3610 store128i(first2, second + 32);
3611
3612 store128i(second0, first);
3613 store128i(second1, first + 16);
3614 store128i(second2, first + 32);
3615}
3616
inline void SSE::shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
{
	ocean_assert(elements && shiftedElements);

	// per 4 byte pixel the mask selects bytes (1, 2, 3, 0): channels move one position to the front, the former front channel becomes the back channel
	store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0c0f0e0d080b0a09ull, 0x0407060500030201ull)), shiftedElements);
}
3623
inline void SSE::shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
{
	ocean_assert(elements && shiftedElements);

	// same channel rotation as shiftChannelToFront4Channel32Bit, but the order of the four pixels is mirrored as well: output pixel 0 is input pixel 3 with bytes (1, 2, 3, 0)
	store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0003020104070605ull, 0x080b0a090c0f0e0dull)), shiftedElements);
}
3630
inline void SSE::shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
{
	ocean_assert(elements && shiftedElements);

	// per 4 byte pixel the mask selects bytes (3, 0, 1, 2): channels move one position to the back, the former back channel becomes the front channel
	store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0e0d0c0f0a09080bull, 0x0605040702010003ull)), shiftedElements);
}
3637
inline void SSE::shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
{
	ocean_assert(elements && shiftedElements);

	// same channel rotation as shiftChannelToBack4Channel32Bit, but the order of the four pixels is mirrored as well: output pixel 0 is input pixel 3 with bytes (3, 0, 1, 2)
	store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0201000306050407ull, 0x0a09080b0e0d0c0full)), shiftedElements);
}
3644
3645inline __m128i SSE::sum1Channel8Bit16Elements(const __m128i& elements)
3646{
3647 const __m128i zero = _mm_setzero_si128();
3648 const __m128i sum = _mm_sad_epu8(elements, zero);
3649
3650 return _mm_add_epi32(_mm_srli_si128(sum, 8), sum);
3651}
3652
3653inline __m128i SSE::sum1Channel8Bit16Elements(const uint8_t* elements)
3654{
3655 ocean_assert(elements != nullptr);
3656
3657 return sum1Channel8Bit16Elements(load128i(elements));
3658}
3659
template <bool tBufferHas16Bytes>
inline __m128i SSE::sum1Channel8BitFront15Elements(const uint8_t* elements)
{
	ocean_assert(elements != nullptr);
	// the 15 bytes are loaded with one byte of the register zeroed (see load_u8_15_upper_zero), so the padding byte does not contribute to the sum
	return sum1Channel8Bit16Elements(load_u8_15_upper_zero<tBufferHas16Bytes>(elements));
}
3666
inline __m128i SSE::sum1Channel8BitBack15Elements(const uint8_t* elements)
{
	ocean_assert(elements != nullptr);
	// load all 16 bytes and shift the first byte out (filling with zero), so only the last 15 bytes are summed
	return sum1Channel8Bit16Elements(load_u8_16_and_shift_right<1u>(elements));
}
3672
inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2)
{
	// the 48 interleaved bytes (16 pixels with 3 channels) are spread over three registers:
	// Interleaved0: R BGR BGR BGR BGR BGR
	// Interleaved1: GR BGR BGR BGR BGR BG
	// Interleaved2: BGR BGR BGR BGR BGR B

	// gather the first 8 bytes of channel 2 and channel 0 each: BBBBBBBB RRRRRRRR
	const __m128i channel0_2First = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFF0e0b080502ull, 0xFFFF0f0c09060300ull)),
													_mm_shuffle_epi8(interleaved1, set128i(0x070401FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));

	// gather the remaining 8 bytes of channel 2 and channel 0 each: BBBBBBBB RRRRRRRR
	const __m128i channel0_2Second = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFF0d0aull, 0xFFFFFFFFFF0e0b08ull)),
													_mm_shuffle_epi8(interleaved2, set128i(0x0f0c09060300FFFFull, 0x0d0a070401FFFFFFull)));

	// gather all 16 bytes of channel 1: GGGGGGGG GGGGGGGG
	const __m128i channel1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
													_mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
													_mm_shuffle_epi8(interleaved2, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	const __m128i zero = _mm_setzero_si128();

	// _mm_sad_epu8 sums each 8 byte half separately: 0000 BBBB 0000 RRRR
	const __m128i sum0_2 = _mm_add_epi32(_mm_sad_epu8(channel0_2First, zero), _mm_sad_epu8(channel0_2Second, zero));

	// 0000 GGGG 0000 GGGG
	const __m128i sum1 = _mm_sad_epu8(channel1, zero);

	// combine both partial channel-1 sums into the second 32 bit lane and blend: 0000 BBBB GGGG RRRR
	return _mm_blend_epi16(sum0_2, _mm_add_epi32(_mm_slli_si128(sum1, 4), _mm_srli_si128(sum1, 4)), int(0xC));
}
3703
3704inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved)
3705{
3706 ocean_assert(interleaved != nullptr);
3707
3708 return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32));
3709}
3710
inline __m128i SSE::sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved)
{
	ocean_assert(interleaved != nullptr);

	// the third register is loaded from offset 29 and shifted right by 3 bytes: bytes [32, 44] end up in the lower 13 bytes, the upper 3 bytes are zero, so exactly 45 elements contribute (no read beyond byte 44)
	return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3));
}
3717
3718inline __m128i SSE::load128iLower64(const void* const buffer)
3719{
3720 ocean_assert(buffer != nullptr);
3721 return _mm_loadl_epi64((const __m128i*)(buffer));
3722}
3723
3724inline __m128i SSE::load128i(const void* const buffer)
3725{
3726 ocean_assert(buffer != nullptr);
3727 return _mm_lddqu_si128((const __m128i*)(buffer));
3728}
3729
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_10_upper_zero(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	__m128i result;

#ifdef OCEAN_COMPILER_MSC

	// zero bytes [0, 7], then copy buffer[0, 1] into bytes [6, 7] and buffer[2, 9] into bytes [8, 15],
	// so the 10 source bytes occupy the upper bytes [6, 15] and the lower 6 bytes are zero
	result.m128i_u64[0] = uint64_t(0);
	memcpy(result.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
	memcpy(result.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));

#else

	// non-MSC compilers do not expose element arrays on __m128i, so we alias via our own M128i union wrapper
	M128i& ourResult = *((M128i*)(&result));

	// same layout as above: 10 source bytes in bytes [6, 15], lower 6 bytes zero
	ourResult.m128i_u64[0] = uint64_t(0);
	memcpy(ourResult.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
	memcpy(ourResult.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));

#endif

	return result;
}
3755
3756template <>
3757inline __m128i SSE::load_u8_10_upper_zero<true>(const uint8_t* const buffer)
3758{
3759 ocean_assert(buffer != nullptr);
3760
3761 // we load 16 bytes and shift the SSE register by 6 byte afterwards
3762 return _mm_slli_si128(SSE::load128i(buffer), 6);
3763}
3764
3765template <bool tBufferHas16Bytes>
3766inline __m128i SSE::load_u8_15_upper_zero(const uint8_t* const buffer)
3767{
3768 ocean_assert(buffer != nullptr);
3769
3770 __m128i intermediate;
3771 memcpy(&intermediate, buffer, 15);
3772
3773 // we shift the SSE register by 1 byte afterwards
3774 return _mm_slli_si128(intermediate, 1);
3775}
3776
3777template <>
3778inline __m128i SSE::load_u8_15_upper_zero<true>(const uint8_t* const buffer)
3779{
3780 ocean_assert(buffer != nullptr);
3781
3782 // we load 16 bytes and shift the SSE register by 1 byte afterwards
3783 return _mm_slli_si128(_mm_lddqu_si128((__m128i*)(buffer)), 1);
3784}
3785
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_13_lower_random(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// only the lower 13 bytes are defined; the upper 3 bytes stay intentionally undefined ("random"), callers must not rely on them
	__m128i result;
	memcpy(&result, buffer, 13);

	return result;
}
3796
3797template <>
3798inline __m128i SSE::load_u8_13_lower_random<true>(const uint8_t* const buffer)
3799{
3800 ocean_assert(buffer != nullptr);
3801
3802 // we load the entire 16 bytes to the 128i value as this is the fastest way
3803 return _mm_lddqu_si128((__m128i*)(buffer));
3804}
3805
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_15_lower_zero(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// copy the 15 source bytes into the lower part of the register ...
	__m128i result;
	memcpy(&result, buffer, 15);

	// ... and explicitly zero the topmost byte (MSC exposes the element array directly, other compilers need the M128i union wrapper)
#ifdef OCEAN_COMPILER_MSC
	result.m128i_u8[15] = 0u;
#else
	((M128i&)result).m128i_u8[15] = 0u;
#endif

	return result;
}
3822
template <>
inline __m128i SSE::load_u8_15_lower_zero<true>(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// we load the entire 16 bytes to the 128i value as this is the fastest way
	__m128i result = _mm_lddqu_si128((__m128i*)(buffer));

	// the 16th byte was read from the buffer but must not contribute, so it is explicitly zeroed
#ifdef OCEAN_COMPILER_MSC
	result.m128i_u8[15] = 0u;
#else
	((M128i&)result).m128i_u8[15] = 0u;
#endif

	return result;
}
3839
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_15_lower_random(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// only the lower 15 bytes are defined; the topmost byte stays intentionally undefined ("random"), callers must not rely on it
	__m128i result;
	memcpy(&result, buffer, 15);

	return result;
}
3850
3851template <>
3852inline __m128i SSE::load_u8_15_lower_random<true>(const uint8_t* const buffer)
3853{
3854 ocean_assert(buffer != nullptr);
3855
3856 // we load the entire 16 bytes to the 128i value as this is the fastest way
3857 return _mm_lddqu_si128((__m128i*)(buffer));
3858}
3859
3860template <unsigned int tShiftBytes>
3861inline __m128i SSE::load_u8_16_and_shift_right(const uint8_t* const buffer)
3862{
3863 static_assert(tShiftBytes <= 16u, "Invalid shift!");
3864
3865 ocean_assert(buffer != nullptr);
3866 return _mm_srli_si128(_mm_lddqu_si128((__m128i*)(buffer)), tShiftBytes);
3867}
3868
3869inline void SSE::store128i(const __m128i& value, uint8_t* const buffer)
3870{
3871 ocean_assert(buffer != nullptr);
3872 _mm_storeu_si128((__m128i*)(buffer), value);
3873}
3874
inline __m128i SSE::set128i(const unsigned long long high64, const unsigned long long low64)
{
	// composes a 128 bit register from two 64 bit halves

#ifdef _WINDOWS

	#ifdef _WIN64
		return _mm_set_epi64x(high64, low64);
	#else
		// NOTE(review): 32 bit Windows path avoids _mm_set_epi64x (presumably unavailable on 32 bit MSVC) and composes the register from four 32 bit parts instead
		return _mm_set_epi32(*(((int*)&high64) + 1), *((int*)&high64), *(((int*)&low64) + 1), *((int*)&low64));
	#endif

#else

	return _mm_set_epi64x(high64, low64);

#endif

}
3893
inline __m128i SSE::removeHighBits32_16(const __m128i& value)
{
	// keeps the lower 16 bits of each of the four 32 bit elements, the upper 16 bits are set to zero
	return _mm_and_si128(value, _mm_set1_epi32(int(0x0000FFFFu)));
}
3898
inline __m128i SSE::removeLowBits32_16(const __m128i& value)
{
	// keeps the upper 16 bits of each of the four 32 bit elements, the lower 16 bits are set to zero
	return _mm_and_si128(value, _mm_set1_epi32(int(0xFFFF0000u)));
}
3903
inline __m128i SSE::removeHighBits16_8(const __m128i& value)
{
	// keeps the lower 8 bits of each of the eight 16 bit elements, the upper 8 bits are set to zero
	return _mm_and_si128(value, _mm_set1_epi32(int(0x00FF00FFu)));
}
3908
inline __m128i SSE::removeHighBits16_8_7_lower(const __m128i& value)
{
	// keeps the lower 8 bits of the lower seven 16 bit elements; the entire topmost element (the upper two bytes) is set to zero
	return _mm_and_si128(value, set128i(0x000000FF00FF00FFull, 0x00FF00FF00FF00FFull));
}
3913
inline __m128i SSE::removeHighBits16_8_7_upper(const __m128i& value)
{
	// keeps the lower 8 bits of the upper seven 16 bit elements; the entire lowest element (the lower two bytes) is set to zero
	return _mm_and_si128(value, set128i(0x00FF00FF00FF00FFull, 0x00FF00FF00FF0000ull));
}
3918
inline __m128i SSE::moveLowBits16_8ToLow64(const __m128i& value)
{
	// packs the low bytes of the eight 16 bit elements into the lower 64 bits; mask bytes with the high bit set (0xA0) zero the upper 64 bits
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0E0C0A0806040200ull));
}
3923
inline __m128i SSE::moveLowBits32_8ToLow32(const __m128i& value)
{
	// packs the low bytes of the four 32 bit elements (bytes 0, 4, 8, 12) into the lower 32 bits; the upper 96 bits are zeroed
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A0A0A00C080400ull));
}
3928
inline __m128i SSE::moveLowBits32_16ToLow64(const __m128i& value)
{
	// packs the low 16 bits of the four 32 bit elements into the lower 64 bits; the upper 64 bits are zeroed
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0D0C090805040100ull));
}
3933
inline __m128i SSE::moveLowBits16_8ToHigh64(const __m128i& value)
{
	// packs the low bytes of the eight 16 bit elements into the upper 64 bits; the lower 64 bits are zeroed
	return _mm_shuffle_epi8(value, set128i(0x0E0C0A0806040200ull, 0xA0A0A0A0A0A0A0A0ull));
}
3938
inline __m128i SSE::moveHighBits32_16(const __m128i& value)
{
	// shift each of the four 32 bit integers by 16 to the right, filling with zeros:
	// the high 16 bits of each element end up in its low 16 bits
	return _mm_srli_epi32(value, 16);
}
3944
inline __m128i SSE::moveHighBits16_8(const __m128i& value)
{
	// for each of the eight 16 bit elements: low byte = former high byte, high byte = 0
	return _mm_shuffle_epi8(value, set128i(0xA00FA00DA00BA009ull, 0xA007A005A003A001ull));
}
3949
inline __m128i SSE::moveHighBits16_8_5(const __m128i& value)
{
	// like moveHighBits16_8 but only for the lower five 16 bit elements, the remaining three elements are zeroed
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A009ull, 0xA007A005A003A001ull));
}
3954
inline __m128i SSE::moveHighBits16_8_6(const __m128i& value)
{
	// like moveHighBits16_8 but only for the lower six 16 bit elements, the remaining two elements are zeroed (0xFF mask bytes zero like 0xA0, both have the high bit set)
	return _mm_shuffle_epi8(value, set128i(0xFFFFFFFFFF0bFF09ull, 0xFF07FF05FF03FF01ull));
}
3959
inline __m128i SSE::moveHighBits16_8_7(const __m128i& value)
{
	// like moveHighBits16_8 but only for the lower seven 16 bit elements, the topmost element is zeroed
	return _mm_shuffle_epi8(value, set128i(0xA0A0A00DA00BA009ull, 0xA007A005A003A001ull));
}
3964
inline __m128i SSE::shuffleLow32ToLow32_8(const __m128i& value)
{
	// distributes the lowest four bytes into the low byte of each 32 bit element (byte i -> element i), the other bytes are zeroed
	return _mm_shuffle_epi8(value, set128i(0xA0A0A003A0A0A002ull, 0xA0A0A001A0A0A000ull));
}
3969
inline __m128i SSE::shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value)
{
	// places source bytes 0, 4, 1, 5, 2, 6, 3, 7 into the low bytes of the eight 16 bit elements, zeroing each high byte

	// we could also use one of the following mask-defining possibility, all provide the same result
	// _mm_set_epi8(0x80, 7, 0x80, 3, 0x80, 6, 0x80, 2, 0x80, 5, 0x80, 1, 0x80, 4, 0x80, 0))
	// _mm_set_epi8(0xA0, 7, 0xA0, 3, 0xA0, 6, 0xA0, 2, 0xA0, 5, 0xA0, 1, 0xA0, 4, 0xA0, 0))
	// _mm_set_epi8(0xFF, 7, 0xFF, 3, 0xFF, 6, 0xFF, 2, 0xFF, 5, 0xFF, 1, 0xFF, 4, 0xFF, 0))

	return _mm_shuffle_epi8(value, set128i(0xA007A003A006A002ull, 0xA005A001A004A000ull));
}
3979
inline __m128i SSE::shuffleNeighbor4High64BitsToLow16_8(const __m128i& value)
{
	// same pattern as shuffleNeighbor4Low64BitsToLow16_8 but for source bytes 8-15: bytes 8, 12, 9, 13, 10, 14, 11, 15 go to the low bytes of the eight 16 bit elements
	return _mm_shuffle_epi8(value, set128i(0xA00FA00BA00EA00Aull, 0xA00DA009A00CA008ull));
}
3984
inline __m128i SSE::shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value)
{
	// places source bytes 0, 2, 1, 3, 4, 6, 5, 7 into the low bytes of the eight 16 bit elements, zeroing each high byte
	return _mm_shuffle_epi8(value, set128i(0xFF07FF05FF06FF04ull, 0xFF03FF01FF02FF00ull));
}
3989
inline __m128i SSE::shuffleNeighbor2High64BitsToLow16_8(const __m128i& value)
{
	// same pattern as shuffleNeighbor2Low64BitsToLow16_8 but for source bytes 8-15: bytes 8, 10, 9, 11, 12, 14, 13, 15 go to the low bytes of the eight 16 bit elements
	return _mm_shuffle_epi8(value, set128i(0xFF0FFF0DFF0EFF0Cull, 0xFF0BFF09FF0AFF08ull));
}
3994
3996{
3997 return _mm_set1_epi32(int(0x00FF00FFu));
3998}
3999
4001{
4002 return _mm_set1_epi32(int(0x0000FFFFu));
4003}
4004
OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1)
{
	// multiply eight 16 bit values element-wise, keeping the lower and the upper 16 bits of each 32 bit product in separate registers
	const __m128i lowProducts = _mm_mullo_epi16(values0, values1);
	const __m128i highProducts = _mm_mulhi_epi16(values0, values1);

	// interleave low/high halves so products0 holds the four lower and products1 the four upper full 32 bit products
	products0 = _mm_unpacklo_epi16(lowProducts, highProducts);
	products1 = _mm_unpackhi_epi16(lowProducts, highProducts);
}
4013
4014OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1)
4015{
4016 __m128i products0;
4017 __m128i products1;
4018 multiplyInt8x16ToInt32x8(values0, values1, products0, products1);
4019
4020 results0 = _mm_add_epi32(results0, products0);
4021 results1 = _mm_add_epi32(results1, products1);
4022}
4023
inline unsigned int SSE::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
{
	ocean_assert(pixel);
	// the four bilinear factors must sum to 128 * 128 (7 bit fixed point per axis)
	ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);

	// weighted sum of the 2x2 neighborhood: +2 selects the horizontal neighbor of a 2 channel pixel, +size the pixel one row below; +8192 rounds before the division by 128 * 128
	return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
}
4031
inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
{
	ocean_assert(pixel0 && pixel1);

	// the interpolation factors for the second patch must sum to 128 * 128 (7 bit fixed point per axis)
	ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);

	// squared difference between the plain first pixel and the bilinearly interpolated second pixel (size0 is unused as no interpolation is applied to pixel0)
	return sqrDistance(*pixel0, (uint8_t)interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
}
4040
inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
{
	ocean_assert(pixel0 && pixel1);

	// both sets of interpolation factors must sum to 128 * 128 (7 bit fixed point per axis)
	ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
	ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);

	// squared difference between the two bilinearly interpolated pixels
	return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
}
4050
4051}
4052
4053}
4054
4055#endif // OCEAN_HARDWARE_SSE_VERSION >= 41
4056
4057#endif // META_OCEAN_CV_SSE_H
This class implements computer vision functions using SSE extensions.
Definition SSE.h:42
static __m128i divideByRightShiftSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight signed 32 bit values by applying a right shift.
Definition SSE.h:3173
static void average32Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2763
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 16 following pixels for a given 1 channel 8 ...
Definition SSE.h:3178
static unsigned int sum_u32_first_2(const __m128i &value)
Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1369
static void average24Elements3Channel24Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2846
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition SSE.h:1303
static void reverseElements8Bit48Elements(const __m128i &elements0, const __m128i &elements1, const __m128i &elements2, __m128i &reversedElements0, __m128i &reversedElements1, __m128i &reversedElements2)
Reverses the order of 48 elements with 8 bit per element.
Definition SSE.h:3565
static __m128i load128i(const void *const buffer)
Loads a 128i value from the memory.
Definition SSE.h:3724
static void average16Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2738
static __m128i load_u8_16_and_shift_right(const uint8_t *const buffer)
Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified ...
Definition SSE.h:3861
static __m128i moveLowBits32_16ToLow64(const __m128i &value)
Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3929
static __m128i moveLowBits32_8ToLow32(const __m128i &value)
Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0...
Definition SSE.h:3924
static __m128i moveHighBits16_8_6(const __m128i &value)
Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3955
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i &value)
Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right s...
Definition SSE.h:3136
static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d &value)
Adds the two (all two) individual 64 bit float of a m128 value and returns the result.
Definition SSE.h:1396
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3342
static void store128i(const __m128i &value, uint8_t *const buffer)
Stores a 128i value to the memory.
Definition SSE.h:3869
static __m128i sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
Definition SSE.h:1474
static __m128i sumInterleave3Channel8Bit45Elements(const uint8_t *interleaved)
Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3711
static __m128i moveLowBits16_8ToHigh64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with ...
Definition SSE.h:3934
static __m128i divideByRightShiftSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight int16_t values by applying a right shift.
Definition SSE.h:3104
static __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3980
static void swapReversedElements8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
Definition SSE.h:3598
static __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit pr...
Definition SSE.h:1412
static void average8ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2506
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:1621
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i &values0, const __m128i &values1, __m128i &results0, __m128i &results1)
Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
Definition SSE.h:4014
static __m128i sumSquareDifference8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 13 elements of an 16 elements buffer with 8 bit prec...
Definition SSE.h:1501
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the second 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1378
static __m128i sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition SSE.h:1419
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2563
static __m128i moveHighBits16_8_5(const __m128i &value)
Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3950
static int16_t maximalValueForRoundedDivisionByRightShiftSigned16Bit(const unsigned int rightShifts)
Returns the maximal value for which the function roundedDivideByRightShiftSigned16Bit() can be applie...
Definition SSE.h:3125
static __m128i shuffleLow32ToLow32_8(const __m128i &value)
Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
Definition SSE.h:3965
static void shiftChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition SSE.h:3617
static __m128i moveHighBits16_8(const __m128i &value)
Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3945
static __m128i removeHighBits16_8_7_upper(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
Definition SSE.h:3914
static void deInterleave3Channel8Bit45Elements(const uint8_t *interleaved, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3403
static unsigned int value_u32(const __m128i &value)
Returns one specific 32 bit unsigned integer value of a m128i value object.
Definition SSE.h:1349
static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const __m128i &channel0, const __m128i &channel1, const __m128i &channel2, __m128i &interleavedA, __m128i &interleavedB, __m128i &interleavedC)
Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3410
static __m128i load_u8_15_upper_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3766
static __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3985
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition SSE.h:1298
static __m128i sum1Channel8Bit16Elements(const __m128i &elements)
Sums 16 elements with 8 bit per element.
Definition SSE.h:3645
static __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3970
static void average8Elements2Channel64Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
Definition SSE.h:2708
static __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative int16_t value, so that each value can be right shifted to allow a ...
Definition SSE.h:3085
static __m128i load_u8_15_lower_random(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3841
static __m128i removeHighBits16_8_7_lower(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
Definition SSE.h:3909
static void average8Elements4Channel128Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
Definition SSE.h:2906
static __m128i load_u8_10_upper_zero(const uint8_t *const buffer)
Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes,...
Definition SSE.h:3731
static __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for 16 elements of an 16 elements buffer with 8 bit precision.
Definition SSE.h:1581
static __m128i moveHighBits32_16(const __m128i &value)
Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
Definition SSE.h:3939
static void average16Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2934
static __m128i moveHighBits16_8_7(const __m128i &value)
Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3960
static __m128i roundedDivideByRightShiftSigned16Bit(const __m128i &value_s16x8, const unsigned int rightShifts)
Applies a rounded division by a right shift for eight int16_t values.
Definition SSE.h:3109
static __m128i bitMaskRemoveHigh32_16()
Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
Definition SSE.h:4000
static __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1588
static __m128i removeHighBits32_16(const __m128i &value)
Removes the higher 16 bits of four 32 bit elements.
Definition SSE.h:3894
static __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3990
static void average6Elements3Channel96Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
Definition SSE.h:2809
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition SSE.h:2302
static __m128i interpolation3Channel24Bit12Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2115
static __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to al...
Definition SSE.h:3154
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2155
static void average8Elements1Channel32Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
Definition SSE.h:2448
static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo4Channels32ElementsWithConstantLastChannel(const __m128i &singleChannel_u_8x8, const uint8_t lastChannelValue, uint8_t *interleaved)
Stores 8 single-channel 8-bit elements as 32 interleaved 4-channel elements (8 elements -> 8×4 = 32 b...
Definition SSE.h:3453
static void shiftChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3631
static void average8Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2482
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3354
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition SSE.h:1293
static __m128i interpolation1Channel8Bit15Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2063
static uint16_t value_u16(const __m128i &value)
Returns one specific 16 bit unsigned integer value of a m128i value object.
Definition SSE.h:1337
static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2, __m128i &reversedInterleaved0, __m128i &reversedInterleaved1, __m128i &reversedInterleaved2)
Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channe...
Definition SSE.h:3492
static __m128i sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit pre...
Definition SSE.h:1405
static __m128i removeLowBits32_16(const __m128i &value)
Removes the lower 16 bits of four 32 bit elements.
Definition SSE.h:3899
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:1771
static uint8_t value_u8(const __m128i &value)
Returns one specific 8 bit unsigned integer value of a m128i value object.
Definition SSE.h:1314
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 16 fol...
Definition SSE.h:3234
static __m128i bitMaskRemoveHigh16_8()
Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
Definition SSE.h:3995
static __m128i removeHighBits16_8(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements.
Definition SSE.h:3904
static __m128i sum1Channel8BitBack15Elements(const uint8_t *elements)
Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is ...
Definition SSE.h:3667
static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo3Channels24Elements(const __m128i &singleChannel_u_8x8, uint8_t *interleaved)
Stores 8 single-channel 8-bit elements as 24 interleaved 3-channel elements (8 elements -> 8×3 = 24 b...
Definition SSE.h:3437
static __m128i load_u8_15_lower_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3807
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3369
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1571
static __m128i sumInterleave3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2)
Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3673
static void average32Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2958
static void average30Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition SSE.h:3005
static __m128i sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1529
static __m128i sum1Channel8BitFront15Elements(const uint8_t *elements)
Sums the first 15 elements of a buffer with 8 bit per element.
Definition SSE.h:3661
static void average32ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 32 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
Definition SSE.h:2651
static void average32Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2585
static __m128i sumSquareDifference8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 12 elements of a 16 elements buffer with 8 bit prec...
Definition SSE.h:1446
static OCEAN_FORCE_INLINE float sum_f32_4(const __m128 &value)
Adds the four (all four) individual 32 bit float of a m128 value and returns the result.
Definition SSE.h:1387
static __m128i load_u8_13_lower_random(const uint8_t *const buffer)
Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes,...
Definition SSE.h:3787
static void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interl...
Definition SSE.h:3546
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition SSE.h:1360
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition SSE.h:1308
static __m128i moveLowBits16_8ToLow64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3919
static __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
Definition SSE.h:1556
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition SSE.h:4024
static void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3638
static __m128i load128iLower64(const void *const buffer)
Loads the lower 64 bit of a 128i value from the memory.
Definition SSE.h:3718
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition SSE.h:4032
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3875
static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels...
Definition SSE.h:3517
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i &value)
Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right s...
Definition SSE.h:3066
static void average8Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2683
static void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition SSE.h:3624
static __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1564
static void average16Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2528
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i &values0, const __m128i &values1, __m128i &products0, __m128i &products1)
Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
Definition SSE.h:4005
static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels...
Definition SSE.h:3477
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:1917
This class provides basic numeric functionalities.
Definition Numeric.h:57
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
This union defines a wrapper for the __m128 SSE intrinsic data type.
Definition SSE.h:71
float m128_f32[4]
The four 32 bit elements.
Definition SSE.h:73
This union defines a wrapper for the __m128 SSE intrinsic data type.
Definition SSE.h:82
double m128d_f64[2]
The two 64 bit elements.
Definition SSE.h:84
This union defines a wrapper for the __m128i SSE intrinsic data type.
Definition SSE.h:51
uint64_t m128i_u64[2]
The two 64 bit elements.
Definition SSE.h:53
uint16_t m128i_u16[8]
The eight 16 bit elements.
Definition SSE.h:59
uint32_t m128i_u32[4]
The four 32 bit elements.
Definition SSE.h:56
uint8_t m128i_u8[16]
The sixteen 8 bit elements.
Definition SSE.h:62