Ocean
Loading...
Searching...
No Matches
SSE.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_SSE_H
9#define META_OCEAN_CV_SSE_H
10
11#include "ocean/cv/CV.h"
12
14
15#include "ocean/math/Numeric.h"
16
17#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
18
19// SSE2 include files
20#include <emmintrin.h>
21#include <immintrin.h>
22#include <mmintrin.h>
23
24// SSE3 include files
25#include <pmmintrin.h>
26
27// SSE4 include files
28#include <smmintrin.h>
29
30namespace Ocean
31{
32
33namespace CV
34{
35
36/**
37 * This class implements computer vision functions using SSE extensions.
38 * @ingroup cv
39 */
40class SSE
41{
42 public:
43
44#if !defined(OCEAN_COMPILER_MSC)
45
46 /**
47 * This union defines a wrapper for the __m128i SSE intrinsic data type, overlaying the same 16 bytes as unsigned 64, 32, 16, or 8 bit elements; it is only declared when OCEAN_COMPILER_MSC is not defined.
48 */
49 union M128i
50 {
51 /// The two unsigned 64 bit elements.
52 uint64_t m128i_u64[2];
53
54 /// The four unsigned 32 bit elements.
55 uint32_t m128i_u32[4];
56
57 /// The eight unsigned 16 bit elements.
58 uint16_t m128i_u16[8];
59
60 /// The sixteen unsigned 8 bit elements.
61 uint8_t m128i_u8[16];
62 };
63
64 static_assert(sizeof(M128i) == 16, "Invalid data type!");
65
66 /**
67 * This union defines a wrapper for the __m128 SSE intrinsic data type, exposing its 16 bytes as four single precision floating point elements; it is only declared when OCEAN_COMPILER_MSC is not defined.
68 */
69 union M128
70 {
71 /// The four 32 bit floating point elements.
72 float m128_f32[4];
73 };
74
75 static_assert(sizeof(M128) == 16, "Invalid data type!");
76
77 /**
78 * This union defines a wrapper for the __m128d SSE intrinsic data type, exposing its 16 bytes as two double precision floating point elements; it is only declared when OCEAN_COMPILER_MSC is not defined.
79 */
80 union M128d
81 {
82 /// The two 64 bit floating point elements.
83 double m128d_f64[2];
84 };
85
86 static_assert(sizeof(M128d) == 16, "Invalid data type!");
87
88#endif
89
90 public:
91
92 /**
93 * Prefetches a block of temporal memory into all cache levels.
94 * @param data Data to be prefetched
95 */
96 static inline void prefetchT0(const void* const data);
97
98 /**
99 * Prefetches a block of temporal memory in all cache levels except 0th cache level.
100 * @param data Data to be prefetched
101 */
102 static inline void prefetchT1(const void* const data);
103
104 /**
105 * Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
106 * @param data Data to be prefetched
107 */
108 static inline void prefetchT2(const void* const data);
109
110 /**
111 * Prefetches a block of non-temporal memory into non-temporal cache structure.
112 * @param data Data to be prefetched
113 */
114 static inline void prefetchNTA(const void* const data);
115
116 /**
117 * Returns one specific 8 bit unsigned integer value of a m128i value object.
118 * @param value The value from which the 8 bit value will be returned
119 * @return The requested 8 bit value
120 * @tparam tIndex The index of the requested 8 bit integer value, with range [0, 15]
121 */
122 template <unsigned int tIndex>
123 static inline uint8_t value_u8(const __m128i& value);
124
125 /**
126 * Returns one specific 8 bit unsigned integer value of a m128i value object.
127 * @param value The value from which the 8 bit value will be returned
128 * @param index The index of the requested 8 bit integer value, with range [0, 15]
129 * @return The requested 8 bit value
130 */
131 static inline uint8_t value_u8(const __m128i& value, const unsigned int index);
132
133 /**
134 * Returns one specific 16 bit unsigned integer value of a m128i value object.
135 * @param value The value from which the 16 bit value will be returned
136 * @return The requested 16 bit value
137 * @tparam tIndex The index of the requested 16 bit integer value, with range [0, 7]
138 */
139 template <unsigned int tIndex>
140 static inline uint16_t value_u16(const __m128i& value);
141
142 /**
143 * Returns one specific 32 bit unsigned integer value of a m128i value object.
144 * @param value The value from which the 32 bit value will be returned
145 * @return The requested 32 bit value
146 * @tparam tIndex The index of the requested 32 bit integer value, with range [0, 3]
147 */
148 template <unsigned int tIndex>
149 static inline unsigned int value_u32(const __m128i& value);
150
151 /**
152 * Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the result.
153 * @param value The value which elements will be added
154 * @return The resulting sum value
155 */
156 static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i& value);
157
158 /**
159 * Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
160 * @param value The value which elements will be added
161 * @return The resulting sum value
162 */
163 static inline unsigned int sum_u32_first_2(const __m128i& value);
164
165 /**
166 * Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
167 * @param value The value which elements will be added
168 * @return The resulting sum value
169 */
170 static inline unsigned int sum_u32_first_third(const __m128i& value);
171
172 /**
173 * Adds the four (all four) individual 32 bit float values of a m128 value and returns the result.
174 * @param value The value which elements will be added
175 * @return The resulting sum value
176 */
177 static OCEAN_FORCE_INLINE float sum_f32_4(const __m128& value);
178
179 /**
180 * Adds the two (all two) individual 64 bit float values of a m128d value and returns the result.
181 * @param value The value which elements will be added
182 * @return The resulting sum value
183 */
184 static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d& value);
185
186 /**
187 * Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
188 * @param image0 First 11 elements to determine the ssd for, may be non aligned
189 * @param image1 Second 11 elements to determine the ssd for, may be non aligned
190 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
191 */
192 static inline __m128i sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
193
194 /**
195 * Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit precision, the remaining 4 elements are set to zero.
196 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
197 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [00 01 02 03 04 05 06 07 08 09 10 11 NA NA NA NA].
198 * @param image0 First 12 (+4) elements to determine the ssd for, with any alignment
199 * @param image1 Second 12 (+4) elements to determine the ssd for, with any alignment
200 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
201 */
202 static inline __m128i sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
203
204 /**
205 * Sum square difference determination for the last 12 elements of a 16 elements buffer with 8 bit precision, the beginning 4 elements are interpreted as zero.
206 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
207 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA NA 04 05 06 07 08 09 10 11 12 13 14 15].
208 * @param image0 First (4+) 12 elements to determine the ssd for, with any alignment
209 * @param image1 Second (4+) 12 elements to determine the ssd for, with any alignment
210 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
211 */
212 static inline __m128i sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
213
214 /**
215 * Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
216 * This function supports to load the 13 elements from a buffer with only 13 bytes or with a buffer with at least 16 bytes.
217 * @param image0 First 13 elements to determine the ssd for, may be non aligned
218 * @param image1 Second 13 elements to determine the ssd for, may be non aligned
219 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
220 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 13 bytes only
221 */
222 template <bool tBufferHas16Bytes>
223 static inline __m128i sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
224
225 /**
226 * Sum square difference determination for the last 13 elements of a 16 elements buffer with 8 bit precision, the beginning 3 elements are interpreted as zero.
227 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
228 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA 03 04 05 06 07 08 09 10 11 12 13 14 15].
229 * @param image0 First (3+) 13 elements to determine the ssd for, may be non aligned
230 * @param image1 Second (3+) 13 elements to determine the ssd for, may be non aligned
231 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
232 */
233 static inline __m128i sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
234
235 /**
236 * Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
237 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
238 * @param image0 First 15 elements to determine the ssd for, may be non aligned
239 * @param image1 Second 15 elements to determine the ssd for, may be non aligned
240 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
241 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
242 */
243 template <bool tBufferHas16Bytes>
244 static inline __m128i sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
245
246 /**
247 * Sum square difference determination for 16 elements with 8 bit precision.
248 * @param image0 First 16 elements to determine the ssd for, may be non aligned
249 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
250 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
251 */
252 static inline __m128i sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
253
254 /**
255 * Sum square difference determination for 16 elements with 8 bit precision.
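256 * @param image0 First 16 elements to determine the ssd for, expected to be aligned to 16 bytes (as indicated by the function name)
257 * @param image1 Second 16 elements to determine the ssd for, expected to be aligned to 16 bytes (as indicated by the function name)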
258 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
259 */
260 static inline __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1);
261
262 /**
263 * Sum square difference determination for 16 elements with 8 bit precision.
264 * @param row0 First 16 elements to determine the ssd for
265 * @param row1 Second 16 elements to determine the ssd for
266 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
267 */
268 static inline __m128i sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1);
269
270 /**
271 * Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
272 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
273 * @param image0 First row of 8 elements
274 * @param image1 Second row of 8 elements
275 * @param result Resulting 4 average elements
276 */
277 static inline void average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result);
278
279 /**
280 * Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
281 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
282 * @param image0 First row of 8 elements
283 * @param image1 Second row of 8 elements
284 * @param result Resulting 4 average elements
285 */
286 static inline void average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
287
288 /**
289 * Averages 8 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
290 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
291 * @param image0 First row of 8 elements, must be valid
292 * @param image1 Second row of 8 elements, must be valid
293 * @param result Resulting 4 average elements, must be valid
294 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
295 */
296 static inline void average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
297
298 /**
299 * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
300 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
301 * @param image0 First row of 16 elements, must be valid
302 * @param image1 Second row of 16 elements, must be valid
303 * @param result Resulting 8 average elements, must be valid
304 */
305 static inline void average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
306
307 /**
308 * Averages 16 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
309 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
310 * @param image0 First row of 16 elements, must be valid
311 * @param image1 Second row of 16 elements, must be valid
312 * @param result Resulting 8 average elements, must be valid
313 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
314 */
315 static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
316
317 /**
318 * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
319 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
320 * @param image0 First row of 32 elements
321 * @param image1 Second row of 32 elements
322 * @param result Resulting 16 average elements
323 */
324 static inline void average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
325
326 /**
327 * Averages 32 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
328 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
329 * @param image0 First row of 32 elements, must be valid
330 * @param image1 Second row of 32 elements, must be valid
331 * @param result Resulting 16 average elements, must be valid
332 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
333 */
334 static inline void average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
335
336 /**
337 * Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
338 * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels, each with 2 channels).<br>
339 * @param image0 First row of 8 elements
340 * @param image1 Second row of 8 elements
341 * @param result Resulting 4 average elements
342 */
343 static inline void average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
344
345 /**
346 * Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
347 * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels).<br>
348 * @param image0 First row of 8 elements
349 * @param image1 Second row of 8 elements
350 * @param result Resulting 4 average elements
351 */
352 static inline void average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result);
353
354 /**
355 * Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
356 * The function takes two rows of 16 elements and returns 8 average elements (4 averaged pixels, each with 2 channels).<br>
357 * @param image0 First row of 16 elements
358 * @param image1 Second row of 16 elements
359 * @param result Resulting 8 average elements
360 */
361 static inline void average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
362
363 /**
364 * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
365 * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).<br>
366 * @param image0 First row of 32 elements
367 * @param image1 Second row of 32 elements
368 * @param result Resulting 16 average elements
369 */
370 static inline void average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
371
372 /**
373 * Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
374 * The function takes two rows of 6 elements and returns 3 average elements (1 averaged pixel with 3 channels).<br>
375 * @param image0 First row of 6 elements
376 * @param image1 Second row of 6 elements
377 * @param result Resulting 3 average elements
378 */
379 static inline void average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result);
380
381 /**
382 * Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
383 * The function takes two rows of 24 elements and returns 12 average elements (4 averaged pixels, each with 3 channels).<br>
384 * @param image0 First row of 24 elements
385 * @param image1 Second row of 24 elements
386 * @param result Resulting 12 average elements
387 */
388 static inline void average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
389
390 /**
391 * Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
392 * The function takes two rows of 8 elements and returns 4 average elements (1 averaged pixel).<br>
393 * @param image0 First row of 8 elements
394 * @param image1 Second row of 8 elements
395 * @param result Resulting 4 average elements
396 */
397 static inline void average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result);
398
399 /**
400 * Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
401 * The function takes two rows of 16 elements and returns 8 average elements (2 averaged pixels, each with 4 channels).<br>
402 * @param image0 First row of 16 elements
403 * @param image1 Second row of 16 elements
404 * @param result Resulting 8 average elements
405 */
406 static inline void average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
407
408 /**
409 * Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
410 * The function takes two rows of 32 elements and returns 16 average elements (4 averaged pixels, each with 4 channels).<br>
411 * @param image0 First row of 32 elements
412 * @param image1 Second row of 32 elements
413 * @param result Resulting 16 average elements
414 */
415 static inline void average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
416
417 /**
418 * Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
419 * The function takes three rows of 30 elements and returns 10 average elements (10 averaged pixels).<br>
420 * @param image0 First row of 30 elements
421 * @param image1 Second row of 30 elements
422 * @param image2 Third row of 30 elements
423 * @param result Resulting 10 average elements
424 */
425 static inline void average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
426
427 /**
428 * Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
429 * This function must be invoked before the right shift is applied.
430 * @param value The eight signed 16 bit values to be handled
431 * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
432 */
433 static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i& value);
434
435 /**
436 * Adds 2^shifts - 1 to each negative int16_t value, so that each value can be right shifted to allow a correct division by 2^shifts.
437 * This function must be invoked before the right shift is applied.
438 * @param value The eight int16_t values to be handled, with range (-infinity, infinity)
439 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
440 * @return The modified value for which division and shift yield equal (and correct!) results
441 */
442 static inline __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts);
443
444 /**
445 * Divides eight int16_t values by applying a right shift.
446 * The function can divide positive and negative values correctly (but without rounding).
447 * @param value The eight int16_t values to be divided, with range (-infinity, infinity)
448 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
449 * @return The divided values
450 */
451 static inline __m128i divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts);
452
453 /**
454 * Applies a rounded division by a right shift for eight int16_t values.
455 * The function can divide positive and negative values correctly (and handles rounding).<br>
456 * However, this function has a specific value range for the input values:
457 * <pre>
458 * maxValue = (2^15 - 1) - 2^(rightShifts - 1) = 32767 - 2^(rightShifts - 1)
459 * </pre>
460 * @param value_s16x8 The eight int16_t values to be divided, with range [-maxValue, maxValue]
461 * @param rightShifts The number of right shifts which needs to be applied, with range [1, 15]
462 * @return The divided values
463 * @see maximalValueForRoundedDivisionByRightShiftSigned16Bit().
464 */
465 static inline __m128i roundedDivideByRightShiftSigned16Bit(const __m128i& value_s16x8, const unsigned int rightShifts);
466
467 /**
468 * Returns the maximal value for which the function roundedDivideByRightShiftSigned16Bit() can be applied.
469 * @param rightShifts The number of right shifts which needs to be applied, with range [1, 15]
470 * @return The maximal value, which is 32767 - 2^(rightShifts - 1).
471 */
472 static inline int16_t maximalValueForRoundedDivisionByRightShiftSigned16Bit(const unsigned int rightShifts);
473
474 /**
475 * Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
476 * This function must be invoked before the right shift is applied.
477 * @param value The four signed 32 bit values to be handled
478 * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
479 */
480 static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i& value);
481
482 /**
483 * Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to allow a correct division by 2^shifts.
484 * This function must be invoked before the right shift is applied.
485 * @param value The four signed 32 bit values to be handled
486 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 31]
487 * @return The modified value for which division and shift yield equal (and correct!) results
488 */
489 static inline __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts);
490
491 /**
492 * Divides four signed 32 bit values by applying a right shift.
493 * This is able to determine the correct division result for positive and negative 32 bit values.
494 * @param value The four signed 32 bit values to be handled
495 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 32]
496 * @return The divided values
497 */
498 static inline __m128i divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts);
499
500 /**
501 * Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 bit frame.
502 * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
503 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
504 * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
505 * @param width The width of the original frame in pixel, with range [10, infinity)
506 */
507 static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
508
509 /**
510 * Determines the squared horizontal and vertical gradients and the product of both gradients for 8 following pixels for a given 1 channel 8 bit frame.
511 * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
512 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
513 * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
514 * @param width The width of the original frame in pixel, with range [10, infinity)
515 */
516 static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
517
518 /**
519 * Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
520 * @param image0 First 11 elements to determine the sad for, may be non aligned
521 * @param image1 Second 11 elements to determine the sad for, may be non aligned
522 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
523 */
524 static inline __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
525
526 /**
527 * Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
528 * This function supports to load the 10 elements from a buffer with only 10 bytes or with a buffer with at least 16 bytes.
529 * @param image0 First 10 elements to determine the sad for, may be non aligned
530 * @param image1 Second 10 elements to determine the sad for, may be non aligned
531 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
532 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 10 bytes only
533 */
534 template <bool tBufferHas16Bytes>
535 static inline __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
536
537 /**
538 * Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
539 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
540 * @param image0 First 15 elements to determine the sad for, may be non aligned
541 * @param image1 Second 15 elements to determine the sad for, may be non aligned
542 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
543 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
544 */
545 template <bool tBufferHas16Bytes>
546 static inline __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
547
548 /**
549 * Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
550 * The first interpolation element results from the first and second element of both rows.<br>
551 * The second interpolation element results from the second and third element of both rows.<br>
552 * ...<br>
553 * The eighth interpolation element results from the eighth and ninth element of both rows.<br>
554 * The interpolation is specified by tx and ty with range [0, 128u].<br>
555 * @param values0 First row of 9 elements to be interpolated
556 * @param values1 Second row of 9 elements to be interpolated
557 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
558 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
559 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
560 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
561 * @return Interpolation result for 8 elements, which are 8 pixels
562 */
563 static inline __m128i interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
564
565 /**
566 * Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
567 * The first interpolation element results from the first and third element of both rows.<br>
568 * The second interpolation element results from the second and fourth element of both rows.<br>
569 * ...<br>
570 * The eighth interpolation element results from the eighth and tenth element of both rows.<br>
571 * The interpolation is specified by tx and ty with range [0, 128u].<br>
572 * @param values0 First row of 10 elements to be interpolated
573 * @param values1 Second row of 10 elements to be interpolated
574 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
575 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
576 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
577 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
578 * @return Interpolation result for 8 elements, which are 4 pixels
579 */
580 static inline __m128i interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
581
582 /**
583 * Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
584 * The first interpolation element results from the first and fourth element of both rows.<br>
585 * The second interpolation element results from the second and fifth element of both rows.<br>
586 * ...<br>
587 * The eighth interpolation element results from the eighth and eleventh element of both rows.<br>
588 * The interpolation is specified by tx and ty with range [0, 128u].<br>
589 * @param values0 First row of 11 elements to be interpolated
590 * @param values1 Second row of 11 elements to be interpolated
591 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
592 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
593 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
594 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
595 * @return Interpolation result for 8 elements, which are (2 2/3 pixels)
596 */
597 static inline __m128i interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
598
599 /**
600 * Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
601 * The interpolation is specified by tx and ty with range [0, 128u].<br>
602 * @param values0 First row of 16 elements to be interpolated
603 * @param values1 Second row of 16 elements to be interpolated
604 * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
605 * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
606 * @return Interpolation result for 15 elements, which are (15 pixels)
607 */
608 static inline __m128i interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
609
610 /**
611 * Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
612 * The interpolation is specified by tx and ty with range [0, 128u].<br>
613 * @param values0 First row of 15 elements to be interpolated
614 * @param values1 Second row of 15 elements to be interpolated
615 * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
616 * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
617 * @return Interpolation result for 12 elements, which are (4 pixels)
618 */
619 static inline __m128i interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
620
621 /**
622 * Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
623 * The first interpolation element results from the first and second element of both rows.<br>
624 * The second interpolation element results from the second and third element of both rows.<br>
625 * ...<br>
626 * The eighth interpolation element results from the eighth and ninth.<br>
627 * The interpolation is specified by tx and ty with range [0, 128u].<br>
628 * @param values0 First row of 12 elements to be interpolated
629 * @param values1 Second row of 12 elements to be interpolated
630 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
631 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
632 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
633 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
634 * @return Interpolation result for 8 elements, which are (2 pixels)
635 */
636 static inline __m128i interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
637
638 /**
639 * Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit frames.
640 * The first interpolation element results from the first and second element of both rows.<br>
641 * The second interpolation element results from the second and third element of both rows.<br>
642 * ...<br>
643 * The eighth interpolation element results from the eighth and ninth.<br>
644 * The interpolation is specified by tx and ty with range [0, 128u].<br>
645 * @param values0 First row of 16 elements to be interpolated
646 * @param values1 Second row of 16 elements to be interpolated
647 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
648 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
649 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
650 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
651 * @return Interpolation result for 8 elements, which are (2 pixels)
652 */
653 static inline __m128i interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
654
655 /**
656 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
657 * @param pixel0 Upper left pixel in the first frame
658 * @param pixel1 Upper left pixel in the second frame
659 * @param size0 Size of one frame row in bytes
660 * @param size1 Size of one frame row in bytes
661 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
662 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
663 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
664 * @param f1xy Product of the fx and the fy interpolation factor for the second image
665 * @return Interpolated sum of square difference
666 */
667 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
668
669 /**
670 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
671 * @param pixel0 Upper left pixel in the first frame
672 * @param pixel1 Upper left pixel in the second frame
673 * @param size0 Size of one frame row in bytes
674 * @param size1 Size of one frame row in bytes
675 * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
676 * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
677 * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
678 * @param f0xy Product of the fx and the fy interpolation factor for the first image
679 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
680 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
681 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
682 * @param f1xy Product of the fx and the fy interpolation factor for the second image
683 * @return Interpolated sum of square difference
684 */
685 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
686
687 /**
688 * Determines the sum of absolute differences (SAD) for 16 elements of a 16 elements buffer with 8 bit precision.
689 * @param image0 First 16 elements to determine the SAD for, may be non aligned
690 * @param image1 Second 16 elements to determine the SAD for, may be non aligned
691 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
692 */
693 static inline __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
694
695 /**
696 * Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
697 * This function converts X CBA CBA CBA CBA CBA to 00000000000CCCCC 000BBBBB000AAAAA.
698 * @param interleaved The 15 elements holding the interleaved image data
699 * @param channel01 Resulting first and second channel elements, 5 elements of the first channel (zero-padded to 8 elements), followed by 5 elements of the second channel (zero-padded to 8 elements)
700 * @param channel2 Resulting third channel elements, 5 elements of the third channel, followed by zeros
701 */
702 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2);
703
704 /**
705 * Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
706 * This function converts XX XXX XXX CBA CBA CB A CBA CBA CBA CBA CBA to 00000000CCCCCCCC BBBBBBBBAAAAAAAA.
707 * @param interleavedA First 16 elements holding the interleaved image data
708 * @param interleavedB Second 16 elements holding the interleaved image data, the first 8 elements will be used only
709 * @param channel01 Resulting first and second channel elements, first 8 elements of the first channel, followed by 8 elements of the second channel
710 * @param channel2 Resulting third channel elements, first 8 elements of the third channel, followed by zeros
711 */
712 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2);
713
714 /**
715 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
716 * This function converts CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA to CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA.
717 * @param interleavedA First 16 elements holding the interleaved image data
718 * @param interleavedB Second 16 elements holding the interleaved image data
719 * @param interleavedC Third 16 elements holding the interleaved image data
720 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
721 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
722 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
723 */
724 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2);
725
726 /**
727 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
728 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
729 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
730 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
731 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
732 */
733 static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
734
735 /**
736 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
737 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes), must be valid
738 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively, must be valid
739 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively, must be valid
740 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively, must be valid
741 */
742 static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2);
743
744 /**
745 * Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
746 * @param interleaved 45 elements of an image with 3 channels and 8 bit per element (45 bytes), must be valid
747 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
748 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
749 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
750 */
751 static inline void deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
752
753 /**
754 * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
755 * This function converts CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA to CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA.
756 * @param channel0 The 16 elements of the first channel to be interleaved
757 * @param channel1 The 16 elements of the second channel to be interleaved
758 * @param channel2 The 16 elements of the third channel to be interleaved
759 * @param interleavedA Resulting first 16 of the interleaved data
760 * @param interleavedB Resulting second 16 of the interleaved data
761 * @param interleavedC Resulting third 16 of the interleaved data
762 */
763 OCEAN_FORCE_INLINE static void interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC);
764
765 /**
766 * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
767 * @param channel0 The 16 elements of the first channel to be interleaved, must be valid
768 * @param channel1 The 16 elements of the second channel to be interleaved, must be valid
769 * @param channel2 The 16 elements of the third channel to be interleaved, must be valid
770 * @param interleaved The resulting 48 interleaved elements, must be valid
771 */
772 static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved);
773
774 /**
775 * Stores 8 single-channel 8-bit elements as 24 interleaved 3-channel elements (8 elements -> 8×3 = 24 bytes).
776 * Each input element is replicated to all 3 channels.
777 * @param singleChannel_u_8x8 The input with 8 single-channel elements in lower 8 bytes
778 * @param interleaved Pointer to 24 bytes where interleaved data will be stored, must be valid
779 */
780 static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo3Channels24Elements(const __m128i& singleChannel_u_8x8, uint8_t* interleaved);
781
782 /**
783 * Stores 8 single-channel 8-bit elements as 32 interleaved 4-channel elements (8 elements -> 8×4 = 32 bytes) with constant 4th channel value.
784 * Each input element is replicated to the first 3 channels, with a constant value for the 4th channel.
785 * @param singleChannel_u_8x8 The input with 8 single-channel elements in lower 8 bytes
786 * @param lastChannelValue The constant value for the last channel
787 * @param interleaved Pointer to 32 bytes where interleaved data will be stored, must be valid
788 */
789 static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo4Channels32ElementsWithConstantLastChannel(const __m128i& singleChannel_u_8x8, const uint8_t lastChannelValue, uint8_t* interleaved);
790
791 /**
792 * Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels and 8 bit per element (e.g., YA16 to AY16).
793 * @param interleaved 32 elements of an image with 2 channels and 8 bit per element (32 bytes)
794 * @param reversedInterleaved Resulting 32 elements with reversed channel order
795 */
796 static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
797
798 /**
799 * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element.
800 * @param interleaved0 First 16 elements holding the interleaved image data
801 * @param interleaved1 Second 16 elements holding the interleaved image data
802 * @param interleaved2 Third 16 elements holding the interleaved image data
803 * @param reversedInterleaved0 Resulting first 16 elements holding the interleaved image data with reversed channel order
804 * @param reversedInterleaved1 Resulting second 16 elements holding the interleaved image data with reversed channel order
805 * @param reversedInterleaved2 Resulting third 16 elements holding the interleaved image data with reversed channel order
806 */
807 static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2);
808
809 /**
810 * Reverses the order of the first and last channel of 48 elements (16 pixels) of an image with 3 interleaved channels and 8 bit per element (e.g., RGB24 to BGR24).
811 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
812 * @param reversedInterleaved Resulting 48 elements with reversed channel order
813 */
814 static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
815
816 /**
817 * Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels and 8 bit per element (e.g., RGBA32 to ABGR32).
818 * @param interleaved 64 elements of an image with 4 channels and 8 bit per element (64 bytes)
819 * @param reversedInterleaved Resulting 64 elements with reversed channel order
820 */
821 static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
822
823 /**
824 * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element (in place).
825 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
826 */
827 static void reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved);
828
829 /**
830 * Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interleaved channels and 8 bit per element and further swaps both sets.
831 * @param first First 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
832 * @param second Second 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
833 */
834 static inline void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second);
835
836 /**
837 * Reverses the order of 48 elements with 8 bit per element.
838 * @param elements0 First 16 elements
839 * @param elements1 Second 16 elements
840 * @param elements2 Third 16 elements
841 * @param reversedElements0 Resulting reversed first 16 elements
842 * @param reversedElements1 Resulting reversed second 16 elements
843 * @param reversedElements2 Resulting reversed third 16 elements
844 */
845 static inline void reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2);
846
847 /**
848 * Reverses the order of 48 elements with 8 bit per element.
849 * @param elements 48 elements that will be reversed
850 * @param reversedElements Resulting reversed 48 elements
851 */
852 static inline void reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements);
853
854 /**
855 * Reverses the order of 48 elements with 8 bit per element (in place).
856 * @param elements 48 elements that will be reversed
857 */
858 static inline void reverseElements8Bit48Elements(uint8_t* elements);
859
860 /**
861 * Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
862 * @param first First 48 elements that will be reversed and swapped with the second 48 elements
863 * @param second Second 48 elements that will be reversed and swapped with the first 48 elements
864 */
865 static inline void swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second);
866
867 /**
868 * Shifts the channels of 4 channel 32 bit pixels to the front and moves the front channel to the back channel.
869 * The function takes four pixels DCBA DCBA DCBA DCBA and provides ADCB ADCB ADCB ADCB.<br>
870 * @param elements 16 elements of 4 pixels to be shifted
871 * @param shiftedElements Resulting shifted elements
872 */
873 static inline void shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
874
875 /**
876 * Shifts the channels of 4 channel 32 bit pixels to the front and moves the front channel to the back channel and mirrors the four individual pixels.
877 * @param elements 16 elements of 4 pixels to be shifted and mirrored
878 * @param shiftedElements Resulting shifted and mirrored elements
879 */
880 static inline void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
881
882 /**
883 * Shifts the channels of 4 channel 32 bit pixels to the back and moves the back channel to the front channel.
884 * The function takes four pixels DCBA DCBA DCBA DCBA and provides CBAD CBAD CBAD CBAD.<br>
885 * @param elements 16 elements of 4 pixels to be shifted
886 * @param shiftedElements Resulting shifted elements
887 */
888 static inline void shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
889
890 /**
891 * Shifts the channels of 4 channel 32 bit pixels to the back and moves the back channel to the front channel and mirrors the four individual pixels.
892 * @param elements 16 elements of 4 pixels to be shifted and mirrored
893 * @param shiftedElements Resulting shifted and mirrored elements
894 */
895 static inline void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
896
897 /**
898 * Sums 16 elements with 8 bit per element.
899 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
900 * @param elements 16 elements holding the image data
901 * @return Resulting sums
902 */
903 static inline __m128i sum1Channel8Bit16Elements(const __m128i& elements);
904
905 /**
906 * Sums 16 elements with 8 bit per element.
907 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
908 * @param elements 16 elements holding the image data
909 * @return Resulting sums
910 */
911 static inline __m128i sum1Channel8Bit16Elements(const uint8_t* elements);
912
913 /**
914 * Sums the first 15 elements of a buffer with 8 bit per element.
915 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.<br>
916 * If the provided buffer holds at least 16 bytes the load function is much faster compared to the case if the buffer is not larger than 15 bytes.<br>
917 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
918 * @param elements 15 elements holding the image data, the buffer must hold at least 16 bytes if tBufferHas16Bytes is true
919 * @return Resulting sums
920 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
921 */
922 template <bool tBufferHas16Bytes>
923 static inline __m128i sum1Channel8BitFront15Elements(const uint8_t* elements);
924
925 /**
926 * Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is interpreted as zero.
927 * However, the provided buffer must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE register.<br>
928 * Thus, this function handles one buffer with this pattern (while the memory starts left and ends right): [NA 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15].
929 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
930 * @param elements (1+) 15 elements holding the image data
931 * @return Resulting sum
932 */
933 static inline __m128i sum1Channel8BitBack15Elements(const uint8_t* elements);
934
935 /**
936 * Sums the 16 elements of each channel individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
937 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
938 * @param interleaved0 First 16 elements holding the interleaved image data
939 * @param interleaved1 Second 16 elements holding the interleaved image data
940 * @param interleaved2 Third 16 elements holding the interleaved image data
941 * @return Resulting sums
942 */
943 static inline __m128i sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2);
944
945 /**
946 * Sums the 16 elements of each channel individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
947 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
948 * @param interleaved 48 elements holding the interleaved image data
949 * @return Resulting sums
950 */
951 static inline __m128i sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved);
952
953 /**
954 * Sums the 15 elements of each channel individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
955 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
956 * @param interleaved 45 elements holding the interleaved image data
957 * @return Resulting sums
958 */
959 static inline __m128i sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved);
960
961 /**
962 * Loads the lower 64 bit of a 128i value from the memory.
963 * The upper 64 bit are zeroed.
964 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 8 bytes
965 * @return Resulting value
966 */
967 static inline __m128i load128iLower64(const void* const buffer);
968
969 /**
970 * Loads a 128i value from the memory.
971 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 16 bytes
972 * @return Resulting value
973 */
974 static inline __m128i load128i(const void* const buffer);
975
976 /**
977 * Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes, to a 128i value and sets the remaining bytes of the resulting 128i value to zero.
978 * The loaded memory will be stored in the upper 10 bytes of the 128i value while the lowest remaining 6 bytes will be set to zero.
979 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [09 08 07 06 05 04 03 02 01 00 ZZ ZZ ZZ ZZ ZZ ZZ], with ZZ meaning zero.<br>
980 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
981 * @return Resulting 128 bit value
982 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 10 bytes
983 */
984 template <bool tBufferHas16Bytes>
985 static inline __m128i load_u8_10_upper_zero(const uint8_t* const buffer);
986
987 /**
988 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
989 * The loaded memory will be stored in the upper 15 bytes of the 128i value while the lowest remaining 1 byte will be set to zero.
990 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 ZZ], with ZZ meaning zero.<br>
991 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
992 * @return Resulting 128 bit value
993 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
994 */
995 template <bool tBufferHas16Bytes>
996 static inline __m128i load_u8_15_upper_zero(const uint8_t* const buffer);
997
998 /**
999 * Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes, to a 128i value while the remaining bytes of the resulting 128i value will be random.
1000 * The loaded memory will be stored in the lower 13 bytes of the 128i value while the highest remaining 3 bytes will be random.<br>
1001 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? ?? ?? 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
1002 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
1003 * @return Resulting 128 bit value
1004 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 13 bytes
1005 */
1006 template <bool tBufferHas16Bytes>
1007 static inline __m128i load_u8_13_lower_random(const uint8_t* const buffer);
1008
1009 /**
1010 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
1011 * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be set to zero.<br>
1012 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [ZZ 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ZZ meaning zero.<br>
1013 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
1014 * @return Resulting 128 bit value
1015 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
1016 */
1017 template <bool tBufferHas16Bytes>
1018 static inline __m128i load_u8_15_lower_zero(const uint8_t* const buffer);
1019
1020 /**
1021 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value while the remaining byte of the resulting 128i value will be random.
1022 * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be random.<br>
1023 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
1024 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
1025 * @return Resulting 128 bit value
1026 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
1027 */
1028 template <bool tBufferHas16Bytes>
1029 static inline __m128i load_u8_15_lower_random(const uint8_t* const buffer);
1030
1031 /**
1032 * Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified number of bytes to the right (by inserting zeros).
1033 * This function can be used if the remaining buffer is smaller than 16 bytes while the buffer exceeds/continues in the lower address space (from the original point of interest).<br>
1034 * Thus, this function can handle a buffer with the following pattern (with lower address left and high address right):<br>
1035 * | ?? ?? ?? ?? ?? ?? ?? ?? ?? V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 |, where ?? represent random values in our buffer (in the lower address space), and VX represent the values of interest and V0 the location to which 'buffer' is pointing to.<br>
1036 * by load_u8_16_and_shift_right<6>(buffer - 6);<br>
1037 * The resulting 128i register will then be composed of (high bits left, low bits right): [00 00 00 00 00 00 V9 V8 V7 V6 V5 V4 V3 V2 V1 V0].
1038 * @param buffer The actual address from which the 16 bytes will be loaded, must be valid and must be at least 16 bytes large
1039 * @return The resulting 128 bit value
1040 * @tparam tShiftBytes The number of bytes which will be shifted (to the right) after the memory has been loaded, with range [0, 16]
1041 */
1042 template <unsigned int tShiftBytes>
1043 static inline __m128i load_u8_16_and_shift_right(const uint8_t* const buffer);
1044
1045 /**
1046 * Stores a 128i value to the memory.
1047 * @param value Value to be stored
1048 * @param buffer Buffer receiving the value (does not need to be aligned on any particular boundary)
1049 */
1050 static inline void store128i(const __m128i& value, uint8_t* const buffer);
1051
1052 /**
1053 * Sets a 128i value by two 64 bit values.
1054 * @param high64 High 64 bits to be set
1055 * @param low64 Low 64 bits to be set
1056 * @return Resulting 128i value
1057 */
1058 static inline __m128i set128i(const unsigned long long high64, const unsigned long long low64);
1059
1060 /**
1061 * Removes the higher 16 bits of four 32 bit elements.
1062 * Given: PONM-LKJI-HGFE-DCBA<br>
1063 * Result: 00NM-00JI-00FE-00BA<br>
1064 * @param value Value to remove the high bits for
1065 * @return Result
1066 */
1067 static inline __m128i removeHighBits32_16(const __m128i& value);
1068
1069 /**
1070 * Removes the lower 16 bits of four 32 bit elements.
1071 * Given: PONM-LKJI-HGFE-DCBA<br>
1072 * Result: PO00-LK00-HG00-DC00<br>
1073 * @param value Value to remove the lower bits for
1074 * @return Result
1075 */
1076 static inline __m128i removeLowBits32_16(const __m128i& value);
1077
1078 /**
1079 * Removes the higher 8 bits of eight 16 bit elements.
1080 * Given: PONM-LKJI-HGFE-DCBA<br>
1081 * Result: 0O0M-0K0I-0G0E-0C0A<br>
1082 * @param value Value to remove the high bits for
1083 * @return Result
1084 */
1085 static inline __m128i removeHighBits16_8(const __m128i& value);
1086
1087 /**
1088 * Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
1089 * Given: PONM-LKJI-HGFE-DCBA<br>
1090 * Result: 000M-0K0I-0G0E-0C0A<br>
1091 * @param value Value to remove the high bits for
1092 * @return Result
1093 */
1094 static inline __m128i removeHighBits16_8_7_lower(const __m128i& value);
1095
1096 /**
1097 * Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
1098 * Given: PONM-LKJI-HGFE-DCBA<br>
1099 * Result: 0O0M-0K0I-0G0E-0C00<br>
1100 * @param value Value to remove the high bits for
1101 * @return Result
1102 */
1103 static inline __m128i removeHighBits16_8_7_upper(const __m128i& value);
1104
1105 /**
1106 * Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1107 * Given: PONM-LKJI-HGFE-DCBA<br>
1108 * Result: 0000-0000-OMKI-GECA<br>
1109 * @param value Value to remove the high bits for
1110 * @return Result
1111 */
1112 static inline __m128i moveLowBits16_8ToLow64(const __m128i& value);
1113
1114 /**
1115 * Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0.
1116 * Given: PONM-LKJI-HGFE-DCBA<br>
1117 * Result: 0000-0000-0000-MIEA<br>
1118 * @param value Value to remove the high bits for
1119 * @return Result
1120 */
1121 static inline __m128i moveLowBits32_8ToLow32(const __m128i& value);
1122
1123 /**
1124 * Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1125 * Given: PONM-LKJI-HGFE-DCBA<br>
1126 * Result: 0000-0000-NMJI-FEBA<br>
1127 * @param value Value to remove the high bits for
1128 * @return Result
1129 */
1130 static inline __m128i moveLowBits32_16ToLow64(const __m128i& value);
1131
1132 /**
1133 * Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with 0.
1134 * Given: PONM-LKJI-HGFE-DCBA<br>
1135 * Result: OMKI-GECA-0000-0000<br>
1136 * @param value Value to remove the high bits for
1137 * @return Result
1138 */
1139 static inline __m128i moveLowBits16_8ToHigh64(const __m128i& value);
1140
1141 /**
1142 * Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
1143 * Given: PONM-LKJI-HGFE-DCBA<br>
1144 * Result: 00PO-00LK-00HG-00DC<br>
1145 * @param value Value to remove the high bits for
1146 * @return Result
1147 */
1148 static inline __m128i moveHighBits32_16(const __m128i& value);
1149
1150 /**
1151 * Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
1152 * Given: PONM-LKJI-HGFE-DCBA<br>
1153 * Result: 0P0N-0L0J-0H0F-0D0B<br>
1154 * @param value Value to remove the high bits for
1155 * @return Result
1156 */
1157 static inline __m128i moveHighBits16_8(const __m128i& value);
1158
1159 /**
1160 * Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
1161 * Given: PONM-LKJI-HGFE-DCBA<br>
1162 * Result: 0000-000J-0H0F-0D0B<br>
1163 * @param value Value to remove the high bits for
1164 * @return Result
1165 */
1166 static inline __m128i moveHighBits16_8_5(const __m128i& value);
1167
1168 /**
1169 * Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
1170 * Given: PONM-LKJI-HGFE-DCBA<br>
1171 * Result: 0000-0L0J-0H0F-0D0B<br>
1172 * @param value Value to remove the high bits for
1173 * @return Result
1174 */
1175 static inline __m128i moveHighBits16_8_6(const __m128i& value);
1176
1177 /**
1178 * Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
1179 * Given: PONM-LKJI-HGFE-DCBA<br>
1180 * Result: 000N-0L0J-0H0F-0D0B<br>
1181 * @param value Value to remove the high bits for
1182 * @return Result
1183 */
1184 static inline __m128i moveHighBits16_8_7(const __m128i& value);
1185
1186 /**
1187 * Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
1188 * Given: PONM-LKJI-HGFE-DCBA<br>
1189 * Result: 000D-000C-000B-000A<br>
1190 * @param value Value to be shuffled
1191 * @return Result
1192 */
1193 static inline __m128i shuffleLow32ToLow32_8(const __m128i& value);
1194
1195 /**
1196 * Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1197 * Given: PONM-LKJI-HGFE-DCBA<br>
1198 * Result: 0H0D-0G0C-0F0B-0E0A<br>
1199 * @param value Value to be shuffled
1200 * @return Result
1201 */
1202 static inline __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value);
1203
1204 /**
1205 * Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1206 * Given: PONM-LKJI-HGFE-DCBA<br>
1207 * Result: 0P0L-0O0K-0N0J-0M0I<br>
1208 * @param value Value to be shuffled
1209 * @return Result
1210 */
1211 static inline __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i& value);
1212
1213 /**
1214 * Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1215 * @param value Value to be shuffled
1216 * @return Result
1217 */
1218 static inline __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value);
1219
1220 /**
1221 * Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1222 * @param value Value to be shuffled
1223 * @return Result
1224 */
1225 static inline __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i& value);
1226
1227 /**
1228 * Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
1229 * @return Bitmask
1230 */
1231 static inline __m128i bitMaskRemoveHigh16_8();
1232
1233 /**
1234 * Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
1235 * @return Bitmask
1236 */
1237 static inline __m128i bitMaskRemoveHigh32_16();
1238
1239 /**
1240 * Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
1241 * The pseudo code of the function is as follows:
1242 * <pre>
1243 * products0[0] = values0[0] * values1[0]
1244 * ...
1245 * products0[3] = values0[3] * values1[3]
1246 *
1247 * products1[0] = values0[4] * values1[4]
1248 * ...
1249 * products1[3] = values0[7] * values1[7]
1250 * </pre>
1251 * @param values0 The first 8 int16_t values to be multiplied
1252 * @param values1 The second 8 int16_t values to be multiplied
1253 * @param products0 The resulting first 4 int32_t products
1254 * @param products1 The resulting second 4 int32_t products
1255 */
1256 static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1);
1257
1258 /**
1259 * Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
1260 * The pseudo code of the function is as follows:
1261 * <pre>
1262 * results0[0] += values0[0] * values1[0]
1263 * ...
1264 * results0[3] += values0[3] * values1[3]
1265 *
1266 * results1[0] += values0[4] * values1[4]
1267 * ...
1268 * results1[3] += values0[7] * values1[7]
1269 * </pre>
1270 * @param values0 The first 8 int16_t values to be multiplied
1271 * @param values1 The second 8 int16_t values to be multiplied
1272 * @param results0 The results to which the first 4 int32_t products will be added
1273 * @param results1 The results to which the second 4 int32_t products will be added
1274 */
1275 static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1);
1276
1277 private:
1278
1279 /**
1280 * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
1281 * @param pixel Upper left pixel in the frame
1282 * @param size Size of one frame row in bytes
1283 * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
1284 * @param fxy_ Product of the fx and the inverse fy interpolation factor
1285 * @param fx_y Product of the inverse fx and the fy interpolation factor
1286 * @param fxy Product of the fx and the fy interpolation factor
1287 * @return Interpolated pixel values
1288 */
1289 static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
1290};
1291
1292inline void SSE::prefetchT0(const void* const data)
1293{
1294 _mm_prefetch((char*)data, _MM_HINT_T0);
1295}
1296
1297inline void SSE::prefetchT1(const void* const data)
1298{
1299 _mm_prefetch((char*)data, _MM_HINT_T1);
1300}
1301
1302inline void SSE::prefetchT2(const void* const data)
1303{
1304 _mm_prefetch((char*)data, _MM_HINT_T2);
1305}
1306
1307inline void SSE::prefetchNTA(const void* const data)
1308{
1309 _mm_prefetch((char*)data, _MM_HINT_NTA);
1310}
1311
1312template <unsigned int tIndex>
1313inline uint8_t SSE::value_u8(const __m128i& value)
1314{
1315 static_assert(tIndex <= 15u, "Invalid index!");
1316
1317#ifdef OCEAN_COMPILER_MSC
1318 return value.m128i_u8[tIndex];
1319#else
1320 return ((const M128i*)(&value))->m128i_u8[tIndex];
1321#endif
1322}
1323
1324inline uint8_t SSE::value_u8(const __m128i& value, const unsigned int index)
1325{
1326 ocean_assert(index <= 15u);
1327
1328#ifdef OCEAN_COMPILER_MSC
1329 return value.m128i_u8[index];
1330#else
1331 return ((const M128i*)(&value))->m128i_u8[index];
1332#endif
1333}
1334
1335template <unsigned int tIndex>
1336inline uint16_t SSE::value_u16(const __m128i& value)
1337{
1338 static_assert(tIndex <= 7u, "Invalid index!");
1339
1340#ifdef OCEAN_COMPILER_MSC
1341 return value.m128i_u16[tIndex];
1342#else
1343 return ((const M128i*)(&value))->m128i_u16[tIndex];
1344#endif
1345}
1346
1347template <unsigned int tIndex>
1348inline unsigned int SSE::value_u32(const __m128i& value)
1349{
1350 static_assert(tIndex <= 3u, "Invalid index!");
1351
1352#ifdef OCEAN_COMPILER_MSC
1353 return value.m128i_u32[tIndex];
1354#else
1355 return ((const M128i*)(&value))->m128i_u32[tIndex];
1356#endif
1357}
1358
1359OCEAN_FORCE_INLINE unsigned int SSE::sum_u32_4(const __m128i& value)
1360{
1361#ifdef OCEAN_COMPILER_MSC
1362 return value.m128i_u32[0] + value.m128i_u32[1] + value.m128i_u32[2] + value.m128i_u32[3];
1363#else
1364 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1] + ((const M128i*)(&value))->m128i_u32[2] + ((const M128i*)(&value))->m128i_u32[3];
1365#endif
1366}
1367
1368inline unsigned int SSE::sum_u32_first_2(const __m128i& value)
1369{
1370#ifdef OCEAN_COMPILER_MSC
1371 return value.m128i_u32[0] + value.m128i_u32[1];
1372#else
1373 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1];
1374#endif
1375}
1376
1377inline unsigned int SSE::sum_u32_first_third(const __m128i& value)
1378{
1379#ifdef OCEAN_COMPILER_MSC
1380 return value.m128i_u32[0] + value.m128i_u32[2];
1381#else
1382 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[2];
1383#endif
1384}
1385
1386OCEAN_FORCE_INLINE float SSE::sum_f32_4(const __m128& value)
1387{
1388#ifdef OCEAN_COMPILER_MSC
1389 return value.m128_f32[0] + value.m128_f32[1] + value.m128_f32[2] + value.m128_f32[3];
1390#else
1391 return ((const M128*)(&value))->m128_f32[0] + ((const M128*)(&value))->m128_f32[1] + ((const M128*)(&value))->m128_f32[2] + ((const M128*)(&value))->m128_f32[3];
1392#endif
1393}
1394
1395OCEAN_FORCE_INLINE double SSE::sum_f64_2(const __m128d& value)
1396{
1397#ifdef OCEAN_COMPILER_MSC
1398 return value.m128d_f64[0] + value.m128d_f64[1];
1399#else
1400 return ((const M128d*)(&value))->m128d_f64[0] + ((const M128d*)(&value))->m128d_f64[1];
1401#endif
1402}
1403
1404inline __m128i SSE::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1405{
1406 ocean_assert(image0 && image1);
1407
1408 return SSE::sumSquareDifference8Bit16Elements(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1409}
1410
1411inline __m128i SSE::sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1412{
1413 ocean_assert(image0 && image1);
1414
1415 return _mm_sad_epu8(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1416}
1417
1418inline __m128i SSE::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
1419{
1420 ocean_assert(image0 && image1);
1421
1422 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1423 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1424
1425 // subtract the 16 elements (usage of saturation and bitwise or operator)
1426 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1427
1428 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1429
1430 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00AA008ull, 0xA006A004A002A000ull));
1431 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1432
1433 // square the 16 elements
1434 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1435 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1436
1437 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1438 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1439 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1440
1441 // 4 32 bit square difference values
1442 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1443}
1444
1445inline __m128i SSE::sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
1446{
1447 ocean_assert(image0 && image1);
1448
1449 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1450 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1451
1452 // subtract the 16 elements (usage of saturation and bitwise or operator)
1453 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1454
1455 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1456
1457 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1458 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00FA00Dull, 0xA00BA009A007A005ull));
1459
1460 // square the 16 elements
1461 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1462 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1463
1464 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1465 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1466 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1467
1468 // 4 32 bit square difference values
1469 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1470}
1471
1472template <bool tBufferHas16Bytes>
1473inline __m128i SSE::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
1474{
1475 ocean_assert(image0 && image1);
1476
1477 const __m128i row0 = load_u8_13_lower_random<tBufferHas16Bytes>(image0);
1478 const __m128i row1 = load_u8_13_lower_random<tBufferHas16Bytes>(image1);
1479
1480 // subtract the 16 elements (usage of saturation and bitwise or operator)
1481 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1482
1483 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1484
1485 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00CA00AA008ull, 0xA006A004A002A000ull));
1486 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1487
1488 // square the 16 elements
1489 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1490 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1491
1492 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1493 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1494 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1495
1496 // 4 32 bit square difference values
1497 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1498}
1499
1500inline __m128i SSE::sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
1501{
1502 ocean_assert(image0 && image1);
1503
1504 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1505 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1506
1507 // subtract the 16 elements (usage of saturation and bitwise or operator)
1508 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1509
1510 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1511
1512 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00FA00DA00Bull, 0xA009A007A005A003ull));
1513 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1514
1515 // square the 16 elements
1516 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1517 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1518
1519 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1520 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1521 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1522
1523 // 4 32 bit square difference values
1524 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1525}
1526
1527template <bool tBufferHas16Bytes>
1528inline __m128i SSE::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
1529{
1530 ocean_assert(image0 && image1);
1531
1532 const __m128i row0 = load_u8_15_lower_random<tBufferHas16Bytes>(image0);
1533 const __m128i row1 = load_u8_15_lower_random<tBufferHas16Bytes>(image1);
1534
1535 // subtract the 16 elements (usage of saturation and bitwise or operator)
1536 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1537
1538 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1539 const __m128i subtractLow = removeHighBits16_8(subtract);
1540 const __m128i subtractHigh = moveHighBits16_8_7(subtract); // the highest high 8 bit are not used due to the only 15 elements
1541
1542 // square the 16 elements
1543 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1544 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1545
1546 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1547 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1548 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1549
1550 // 4 32 bit square difference values
1551 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1552}
1553
1554template <bool tBufferHas16Bytes>
1555inline __m128i SSE::sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
1556{
1557 ocean_assert(image0 && image1);
1558
1559 return _mm_sad_epu8(load_u8_10_upper_zero<tBufferHas16Bytes>(image0), load_u8_10_upper_zero<tBufferHas16Bytes>(image1));
1560}
1561
1562template <bool tBufferHas16Bytes>
1563inline __m128i SSE::sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
1564{
1565 ocean_assert(image0 && image1);
1566
1567 return _mm_sad_epu8(load_u8_15_upper_zero<tBufferHas16Bytes>(image0), load_u8_15_upper_zero<tBufferHas16Bytes>(image1));
1568}
1569
1570inline __m128i SSE::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1571{
1572 ocean_assert(image0 && image1);
1573
1574 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1575 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1576
1577 return sumSquareDifference8Bit16Elements(row0, row1);
1578}
1579
1580inline __m128i SSE::sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1581{
1582 ocean_assert(image0 && image1);
1583
1584 return _mm_sad_epu8(SSE::load128i(image0), SSE::load128i(image1));
1585}
1586
1587inline __m128i SSE::sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1)
1588{
1589 ocean_assert(image0 && image1);
1590 ocean_assert((unsigned long long)image0 % 16ll == 0ll);
1591 ocean_assert((unsigned long long)image1 % 16ll == 0ll);
1592
1593 const __m128i row0 = _mm_load_si128((__m128i*)image0);
1594 const __m128i row1 = _mm_load_si128((__m128i*)image1);
1595
1596 return sumSquareDifference8Bit16Elements(row0, row1);
1597}
1598
1599inline __m128i SSE::sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1)
1600{
1601 // subtract the 16 elements (usage of saturation and bitwise or operator)
1602 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1603
1604 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1605 const __m128i subtractLow = removeHighBits16_8(subtract);
1606 const __m128i subtractHigh = moveHighBits16_8(subtract);
1607
1608 // square the 16 elements
1609 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1610 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1611
1612 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1613 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1614 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1615
1616 // 4 32 bit square difference values
1617 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1618}
1619
1620inline __m128i SSE::interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
1621{
1622 // F E D C B A 9 8 7 6 5 4 3 2 1 0
1623 // values0: aF yE | yD yC | yB yA | y9 y8 | y7 y6 | y5 y4 | y3 y2 | y1 y0
1624 // values1: aF' yE' | yD' yC' | yB' yA' | y9' y8' | y7' y6' | y5' y4' | y3' y2' | y1' y0'
1625
1626 // shuffled elements
1627 // row0: y7 y6 y5 y4 y3 y2 y1 y0 | * fx_ * fy_
1628 // row1: y8 y7 y6 y5 y4 y3 y2 y1 | * fx * fy_
1629 // row2: y7' y6' y5' y4' y3' y2' y1' y0' | * fx_ * fy
1630 // row3: y8' y7' y6' y5' y4' y3' y2' y1' | * fx * fy
1631
1632#ifdef OCEAN_COMPILER_MSC
1633
1634 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1635 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1636 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1637 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1638 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1639 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1640 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1641
1642 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1643 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1644 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1645 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1646 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1647 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1648 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1649
1650 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1651 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1652 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1653 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1654 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1655 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1656 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1657
1658 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1659 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1660 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1661 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1662 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1663 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1664 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1665
1666 ocean_assert(fx_fy_.m128i_u16[0] + fxfy_.m128i_u16[0] + fx_fy.m128i_u16[0] + fxfy.m128i_u16[0] == 128u * 128u);
1667
1668#else
1669
1670#ifdef OCEAN_DEBUG
1671
1672 const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
1673 const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
1674 const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
1675 const M128i& debug_fxfy = *(const M128i*)(&fxfy);
1676
1677#endif // OCEAN_DEBUG
1678
1679 ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
1680 ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
1681 ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
1682 ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
1683 ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
1684 ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
1685 ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
1686
1687 ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
1688 ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
1689 ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
1690 ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
1691 ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
1692 ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
1693 ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
1694
1695 ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
1696 ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
1697 ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
1698 ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
1699 ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
1700 ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
1701 ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
1702
1703 ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
1704 ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
1705 ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
1706 ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
1707 ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
1708 ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
1709 ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
1710
1711 ocean_assert(debug_fx_fy_.m128i_u16[0] + debug_fxfy_.m128i_u16[0] + debug_fx_fy.m128i_u16[0] + debug_fxfy.m128i_u16[0] == 128u * 128u);
1712
1713#endif
1714
1715 __m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
1716
1717 // row0
1718 __m128i row = _mm_shuffle_epi8(values0, shuffle);
1719
1720 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
1721 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
1722
1723 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
1724 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
1725
1726 // row2
1727 row = _mm_shuffle_epi8(values1, shuffle);
1728
1729 multiLow = _mm_mullo_epi16(row, fx_fy);
1730 multiHigh = _mm_mulhi_epu16(row, fx_fy);
1731
1732 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1733 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1734
1735
1736
1737 shuffle = set128i(0xA008A007A006A005ull, 0xA004A003A002A001ull);
1738
1739 // row1
1740 row = _mm_shuffle_epi8(values0, shuffle);
1741
1742 multiLow = _mm_mullo_epi16(row, fxfy_);
1743 multiHigh = _mm_mulhi_epu16(row, fxfy_);
1744
1745 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1746 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1747
1748
1749 // row4
1750 row = _mm_shuffle_epi8(values1, shuffle);
1751
1752 multiLow = _mm_mullo_epi16(row, fxfy);
1753 multiHigh = _mm_mulhi_epu16(row, fxfy);
1754
1755 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1756 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1757
1758
1759 // normalization ( + 128 * 128 / 2) / (128 * 128)
1760 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
1761 resultEven = _mm_srli_epi32(resultEven, 14);
1762
1763 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
1764 resultOdd = _mm_srli_epi32(resultOdd, 14);
1765
1766 // stack the 2 four 32 bit values together to eight 8 bit values
1767 return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
1768}
1769
1770inline __m128i SSE::interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
1771{
1772 // F E D C B A 9 8 7 6 5 4 3 2 1 0
1773 // values0: a7 y7 | a6 y6 | a5 y5 | a4 y4 | a3 y3 | a2 y2 | a1 y1 | a0 y0
1774 // values1: a7' y7' | a6' y6' | a5' y5' | a4' y4' | a3' y3' | a2' y2' | a1' y1' | a0' y0'
1775
1776 // shuffled elements
1777 // row0: a3 y3 a2 y2 a1 y1 a0 y0 | * fx_ * fy_
1778 // row1: a4 y4 a3 y3 a2 y2 a1 y1 | * fx * fy_
1779 // row2: a3' y3' a2' y2' a1' y1' a0' y0' | * fx_ * fy
1780 // row3: a4' y4' a3' y3' a2' y2' a1' y1' | * fx * fy
1781
1782#ifdef OCEAN_COMPILER_MSC
1783
1784 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1785 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1786 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1787 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1788 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1789 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1790 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1791
1792 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1793 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1794 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1795 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1796 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1797 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1798 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1799
1800 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1801 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1802 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1803 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1804 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1805 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1806 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1807
1808 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1809 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1810 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1811 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1812 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1813 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1814 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1815
1816#else
1817
1818#ifdef OCEAN_DEBUG
1819
1820 const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
1821 const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
1822 const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
1823 const M128i& debug_fxfy = *(const M128i*)(&fxfy);
1824
1825#endif // OCEAN_DEBUG
1826
1827 ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
1828 ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
1829 ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
1830 ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
1831 ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
1832 ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
1833 ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
1834
1835 ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
1836 ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
1837 ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
1838 ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
1839 ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
1840 ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
1841 ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
1842
1843 ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
1844 ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
1845 ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
1846 ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
1847 ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
1848 ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
1849 ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
1850
1851 ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
1852 ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
1853 ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
1854 ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
1855 ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
1856 ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
1857 ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
1858
1859#endif
1860
1861 __m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
1862
1863 // row0
1864 __m128i row = _mm_shuffle_epi8(values0, shuffle);
1865
1866 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
1867 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
1868
1869 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
1870 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
1871
1872 // row2
1873 row = _mm_shuffle_epi8(values1, shuffle);
1874
1875 multiLow = _mm_mullo_epi16(row, fx_fy);
1876 multiHigh = _mm_mulhi_epu16(row, fx_fy);
1877
1878 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1879 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1880
1881
1882
1883 shuffle = set128i(0xA009A008A007A006ull, 0xA005A004A003A002ull);
1884
1885 // row1
1886 row = _mm_shuffle_epi8(values0, shuffle);
1887
1888 multiLow = _mm_mullo_epi16(row, fxfy_);
1889 multiHigh = _mm_mulhi_epu16(row, fxfy_);
1890
1891 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1892 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1893
1894
1895 // row4
1896 row = _mm_shuffle_epi8(values1, shuffle);
1897
1898 multiLow = _mm_mullo_epi16(row, fxfy);
1899 multiHigh = _mm_mulhi_epu16(row, fxfy);
1900
1901 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1902 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1903
1904
1905 // normalization ( + 128 * 128 / 2) / (128 * 128)
1906 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
1907 resultEven = _mm_srli_epi32(resultEven, 14);
1908
1909 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
1910 resultOdd = _mm_srli_epi32(resultOdd, 14);
1911
1912 // stack the 2 four 32 bit values together to eight 8 bit values
1913 return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
1914}
1915
1916inline __m128i SSE::interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
1917{
1918 // F E D C B A 9 8 7 6 5 4 3 2 1 0
1919 // values0: r5 | b4 g4 r4 | b3 g3 r3 | b2 g2 r2 | b1 g1 r1 | b0 g0 r0
1920 // values1: r5'| b4' g4' r4'| b3' g3' r3'| b2' g2' r2'| b1' g1' r1'| b0' g0' r0'
1921
1922 // shuffled elements
1923 // row0: g2 r2 b1 g1 r1 b0 g0 r0 | * fx_ * fy_
1924 // row1: g3 r3 b2 g2 r2 b1 g1 r1 | * fx * fy_
1925 // row2: g2' r2' b1' g1' r1' b0' g0' r0' | * fx_ * fy
1926 // row3: g3' r3' b2' g2' r2' b1' g1' r1' | * fx * fy
1927
1928#ifdef OCEAN_COMPILER_MSC
1929
1930 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1931 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1932 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1933 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1934 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1935 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1936 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1937
1938 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1939 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1940 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1941 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1942 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1943 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1944 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1945
1946 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1947 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1948 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1949 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1950 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1951 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1952 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1953
1954 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1955 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1956 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1957 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1958 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1959 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1960 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1961
1962#else
1963
1964#ifdef OCEAN_DEBUG
1965
1966 const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
1967 const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
1968 const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
1969 const M128i& debug_fxfy = *(const M128i*)(&fxfy);
1970
1971#endif // OCEAN_DEBUG
1972
1973 ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
1974 ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
1975 ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
1976 ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
1977 ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
1978 ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
1979 ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
1980
1981 ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
1982 ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
1983 ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
1984 ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
1985 ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
1986 ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
1987 ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
1988
1989 ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
1990 ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
1991 ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
1992 ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
1993 ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
1994 ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
1995 ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
1996
1997 ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
1998 ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
1999 ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
2000 ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
2001 ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
2002 ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
2003 ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
2004
2005#endif
2006
2007 __m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
2008
2009 // row0
2010 __m128i row = _mm_shuffle_epi8(values0, shuffle);
2011
2012 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
2013 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
2014
2015 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
2016 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
2017
2018 // row2
2019 row = _mm_shuffle_epi8(values1, shuffle);
2020
2021 multiLow = _mm_mullo_epi16(row, fx_fy);
2022 multiHigh = _mm_mulhi_epu16(row, fx_fy);
2023
2024 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2025 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2026
2027
2028
2029 shuffle = set128i(0xA00AA009A008A007ull, 0xA006A005A004A003ull);
2030
2031 // row1
2032 row = _mm_shuffle_epi8(values0, shuffle);
2033
2034 multiLow = _mm_mullo_epi16(row, fxfy_);
2035 multiHigh = _mm_mulhi_epu16(row, fxfy_);
2036
2037 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2038 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2039
2040
2041 // row4
2042 row = _mm_shuffle_epi8(values1, shuffle);
2043
2044 multiLow = _mm_mullo_epi16(row, fxfy);
2045 multiHigh = _mm_mulhi_epu16(row, fxfy);
2046
2047 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2048 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2049
2050
2051 // normalization ( + 128 * 128 / 2) / (128 * 128)
2052 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
2053 resultEven = _mm_srli_epi32(resultEven, 14);
2054
2055 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
2056 resultOdd = _mm_srli_epi32(resultOdd, 14);
2057
2058 // stack the 2 four 32 bit values together to eight 8 bit values
2059 return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
2060}
2061
2062inline __m128i SSE::interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2063{
2064 __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2065 __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2066
2067 __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2068 __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2069
2070 __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2071 __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2072
2073 __m128i row0_d = _mm_shuffle_epi8(values0, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2074 __m128i row1_d = _mm_shuffle_epi8(values1, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2075
2076 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2077 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2078 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2079 row0_d = _mm_madd_epi16(row0_d, fx_fy_fxfy_);
2080
2081 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2082 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2083 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2084 row1_d = _mm_madd_epi16(row1_d, fx_fyfxfy);
2085
2086 const __m128i rounding = _mm_set1_epi32(8192);
2087
2088 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2089 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2090 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2091 __m128i row_d = _mm_add_epi32(row0_d, row1_d);
2092
2093 row_a = _mm_add_epi32(row_a, rounding);
2094 row_b = _mm_add_epi32(row_b, rounding);
2095 row_c = _mm_add_epi32(row_c, rounding);
2096 row_d = _mm_add_epi32(row_d, rounding);
2097
2098 row_a = _mm_srli_epi32(row_a, 14);
2099 row_b = _mm_srli_epi32(row_b, 14);
2100 row_c = _mm_srli_epi32(row_c, 14);
2101 row_d = _mm_srli_epi32(row_d, 14);
2102
2103 row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF0c080400ull));
2104 row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFFFFFFFFull, 0x0c080400FFFFFFFFull));
2105 row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0c080400ull, 0xFFFFFFFFFFFFFFFFull));
2106 row_d = _mm_shuffle_epi8(row_d, set128i(0xFF080400FFFFFFFFull, 0xFFFFFFFFFFFFFFFFull));
2107
2108 row_a = _mm_or_si128(row_a, row_b);
2109 row_c = _mm_or_si128(row_c, row_d);
2110
2111 return _mm_or_si128(row_a, row_c);
2112}
2113
2114inline __m128i SSE::interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2115{
2116 __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2117 __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2118
2119 __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2120 __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2121
2122 __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2123 __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2124
2125 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2126 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2127 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2128
2129 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2130 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2131 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2132
2133 const __m128i rounding = _mm_set1_epi32(8192);
2134
2135 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2136 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2137 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2138
2139 row_a = _mm_add_epi32(row_a, rounding);
2140 row_b = _mm_add_epi32(row_b, rounding);
2141 row_c = _mm_add_epi32(row_c, rounding);
2142
2143 row_a = _mm_srli_epi32(row_a, 14);
2144 row_b = _mm_srli_epi32(row_b, 14);
2145 row_c = _mm_srli_epi32(row_c, 14);
2146
2147 row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
2148 row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
2149 row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
2150
2151 return _mm_or_si128(row_a, _mm_or_si128(row_b, row_c));
2152}
2153
2154inline __m128i SSE::interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
2155{
2156 // F E D C B A 9 8 7 6 5 4 3 2 1 0
2157 // values0: a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
2158 // values1: a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'
2159
2160 // shuffled elements
2161 // row0: a1 b1 g1 r1 a0 b0 g0 r0 | * fx_ * fy_
2162 // row1: a2 b2 g2 r2 a1 b1 g1 r1 | * fx * fy_
2163 // row2: a1' b1' g1' r1' a0' b0' g0' r0' | * fx_ * fy
2164 // row3: a2' b2' g2' r2' a1' b1' g1' r1' | * fx * fy
2165
2166#ifdef OCEAN_COMPILER_MSC
2167
2168 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
2169 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
2170 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
2171 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
2172 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
2173 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
2174 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
2175
2176 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
2177 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
2178 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
2179 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
2180 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
2181 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
2182 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
2183
2184 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
2185 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
2186 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
2187 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
2188 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
2189 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
2190 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
2191
2192 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
2193 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
2194 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
2195 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
2196 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
2197 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
2198 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
2199
2200#else
2201
2202#ifdef OCEAN_DEBUG
2203
2204 const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
2205 const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
2206 const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
2207 const M128i& debug_fxfy = *(const M128i*)(&fxfy);
2208
2209#endif // OCEAN_DEBUG
2210
2211 ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
2212 ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
2213 ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
2214 ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
2215 ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
2216 ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
2217 ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
2218
2219 ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
2220 ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
2221 ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
2222 ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
2223 ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
2224 ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
2225 ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
2226
2227 ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
2228 ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
2229 ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
2230 ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
2231 ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
2232 ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
2233 ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
2234
2235 ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
2236 ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
2237 ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
2238 ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
2239 ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
2240 ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
2241 ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
2242
2243#endif
2244
2245 __m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
2246
2247 // row0
2248 __m128i row = _mm_shuffle_epi8(values0, shuffle);
2249
2250 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
2251 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
2252
2253 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
2254 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
2255
2256 // row2
2257 row = _mm_shuffle_epi8(values1, shuffle);
2258
2259 multiLow = _mm_mullo_epi16(row, fx_fy);
2260 multiHigh = _mm_mulhi_epu16(row, fx_fy);
2261
2262 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2263 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2264
2265
2266
2267 shuffle = set128i(0xA00BA00AA009A008ull, 0xA007A006A005A004ull);
2268
2269 // row1
2270 row = _mm_shuffle_epi8(values0, shuffle);
2271
2272 multiLow = _mm_mullo_epi16(row, fxfy_);
2273 multiHigh = _mm_mulhi_epu16(row, fxfy_);
2274
2275 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2276 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2277
2278
2279 // row4
2280 row = _mm_shuffle_epi8(values1, shuffle);
2281
2282 multiLow = _mm_mullo_epi16(row, fxfy);
2283 multiHigh = _mm_mulhi_epu16(row, fxfy);
2284
2285 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2286 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2287
2288
2289 // normalization ( + 128 * 128 / 2) / (128 * 128)
2290 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
2291 resultEven = _mm_srli_epi32(resultEven, 14);
2292
2293 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
2294 resultOdd = _mm_srli_epi32(resultOdd, 14);
2295
2296 // stack the 2 four 32 bit values together to eight 8 bit values
2297 return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
2298}
2299
2300
2301inline __m128i SSE::interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
2302{
2303 // F E D C B A 9 8 7 6 5 4 3 2 1 0
2304 // values0: a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
2305 // values1: a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'
2306
2307 // shuffled elements
2308 // row0: a2 b2 g2 r2 a0 b0 g0 r0 | * fx_ * fy_
2309 // row1: a3 b3 g3 r3 a1 b1 g1 r1 | * fx * fy_
2310 // row2: a2' b2' g2' r2' a0' b0' g0' r0' | * fx_ * fy
2311 // row3: a3' b3' g3' r3' a1' b1' g1' r1' | * fx * fy
2312
2313#ifdef OCEAN_COMPILER_MSC
2314
2315 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
2316 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
2317 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
2318 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
2319 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
2320 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
2321 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
2322
2323 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
2324 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
2325 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
2326 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
2327 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
2328 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
2329 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
2330
2331 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
2332 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
2333 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
2334 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
2335 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
2336 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
2337 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
2338
2339 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
2340 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
2341 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
2342 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
2343 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
2344 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
2345 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
2346
2347#else
2348
2349#ifdef OCEAN_DEBUG
2350
2351 const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
2352 const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
2353 const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
2354 const M128i& debug_fxfy = *(const M128i*)(&fxfy);
2355
2356#endif // OCEAN_DEBUG
2357
2358 ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
2359 ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
2360 ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
2361 ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
2362 ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
2363 ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
2364 ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
2365
2366 ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
2367 ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
2368 ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
2369 ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
2370 ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
2371 ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
2372 ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
2373
2374 ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
2375 ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
2376 ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
2377 ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
2378 ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
2379 ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
2380 ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
2381
2382 ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
2383 ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
2384 ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
2385 ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
2386 ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
2387 ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
2388 ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
2389
2390#endif
2391
2392 __m128i shuffle = set128i(0xA00BA00AA009A008ull, 0xA003A002A001A000ull);
2393
2394 // row0
2395 __m128i row = _mm_shuffle_epi8(values0, shuffle);
2396
2397 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
2398 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
2399
2400 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
2401 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
2402
2403 // row2
2404 row = _mm_shuffle_epi8(values1, shuffle);
2405
2406 multiLow = _mm_mullo_epi16(row, fx_fy);
2407 multiHigh = _mm_mulhi_epu16(row, fx_fy);
2408
2409 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2410 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2411
2412
2413
2414 shuffle = set128i(0xA00FA00EA00DA00Cull, 0xA007A006A005A004ull);
2415
2416 // row1
2417 row = _mm_shuffle_epi8(values0, shuffle);
2418
2419 multiLow = _mm_mullo_epi16(row, fxfy_);
2420 multiHigh = _mm_mulhi_epu16(row, fxfy_);
2421
2422 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2423 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2424
2425
2426 // row4
2427 row = _mm_shuffle_epi8(values1, shuffle);
2428
2429 multiLow = _mm_mullo_epi16(row, fxfy);
2430 multiHigh = _mm_mulhi_epu16(row, fxfy);
2431
2432 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2433 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2434
2435
2436 // normalization ( + 128 * 128 / 2) / (128 * 128)
2437 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
2438 resultEven = _mm_srli_epi32(resultEven, 14);
2439
2440 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
2441 resultOdd = _mm_srli_epi32(resultOdd, 14);
2442
2443 // stack the 2 four 32 bit values together to eight 8 bit values
2444 return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
2445}
2446
2447inline void SSE::average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result)
2448{
2449 ocean_assert(image0 && image1);
2450
2451 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2452 const __m128 row0 = _mm_loadu_ps(image0);
2453 const __m128 row1 = _mm_loadu_ps(image1);
2454
2455 // get sum of first 4 elements
2456 const __m128 sumFirst = _mm_add_ps(row0, row1);
2457
2458 // load next 4 elements
2459 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2460 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2461
2462 // get sum of second 4 elements
2463 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2464
2465 // get sum of adjacent summed pixels
2466 const __m128 sumAdjacent = _mm_hadd_ps(sumFirst, sumSecond);
2467
2468 /* following variant is exactly as fast as _mm_hadd_ps(,) ~ 0.30ms / 100,000 iteration
2469 const unsigned int mask10001000 = 136u;
2470 const unsigned int mask11011101 = 221u;
2471 const __m128 sumAdjacent = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, mask10001000), _mm_shuffle_ps(sumFirst, sumSecond, mask11011101));
2472 */
2473
2474 // divide by 4 --> multiply by 0.25
2475 const __m128 division = _mm_mul_ps(sumAdjacent, _mm_set_ps1(0.25f));
2476
2477 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2478 _mm_storeu_ps(result, division);
2479}
2480
2481inline void SSE::average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2482{
2483 ocean_assert(image0 && image1);
2484
2485 // 16 * uchar = m128i, but only the first 8 elements are set
2486 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2487 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2488
2489 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2490 const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2491 const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2492
2493 // build overall sum and add 2 for rounding
2494 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2495
2496 // divide by 4 by right shifting of two bits
2497 const __m128i division16 = _mm_srli_epi16(sum, 2);
2498
2499 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2500 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2501
2502 memcpy(result, &division8, sizeof(uint8_t) * 4);
2503}
2504
2505inline void SSE::average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2506{
2507 ocean_assert(image0 != nullptr && image1 != nullptr);
2508 ocean_assert(threshold >= 1u);
2509
2510 // we load the first 8 elements, the uppper 8 bytes will be set to zero
2511 const __m128i row0_u_8x8 = _mm_loadl_epi64((__m128i*)image0);
2512 const __m128i row1_u_8x8 = _mm_loadl_epi64((__m128i*)image1);
2513
2514 const __m128i row0_u_16x8 = _mm_cvtepu8_epi16(row0_u_8x8); // converting the lower 8 bytes to 16 byte values
2515 const __m128i row1_u_16x8 = _mm_cvtepu8_epi16(row1_u_8x8);
2516
2517 const __m128i verticalSum_u_16x8 = _mm_adds_epu16(row0_u_16x8, row1_u_16x8);
2518 const __m128i sum_u_16x8 = _mm_hadd_epi16(verticalSum_u_16x8, verticalSum_u_16x8);
2519
2520 const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2521
2522 const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);
2523
2524 memcpy(result, &mask_u_8x8, sizeof(uint8_t) * 4);
2525}
2526
2527inline void SSE::average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2528{
2529 ocean_assert(image0 && image1);
2530
2531 // 16 * uchar = m128i
2532 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2533 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2534
2535 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2536 const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2537 const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2538
2539 // build overall sum and add 2 for rounding
2540 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2541
2542 // divide by 4 by right shifting of two bits
2543 const __m128i division16 = _mm_srli_epi16(sum, 2);
2544
2545 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2546 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2547
2548 // copy the lower 64 bit to the memory
2549 _mm_storel_epi64((__m128i*)result, division8);
2550
2551 /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2552 const __m128i avgRows = _mm_avg_epu8(row0, row1);
2553 const __m128i avgRowsSwap = _mm_or_si128(_mm_slli_epi16(avgRows, 8), _mm_srli_epi16(avgRows, 8));
2554
2555 const __m128i avg = _mm_avg_epu8(avgRows, avgRowsSwap); // 1 result in 2 uchar
2556 const __m128i avgOrdered = _mm_shuffle_epi8(avg, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2557
2558 _mm_storel_epi64((__m128i*)result, avgOrdered);
2559 */
2560}
2561
2562inline void SSE::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2563{
2564 ocean_assert(image0 != nullptr && image1 != nullptr);
2565 ocean_assert(threshold >= 1u);
2566
2567 // 16 * uchar = m128i
2568 const __m128i row0_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2569 const __m128i row1_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2570
2571 const __m128i horizontalSum0_u_16x8 = _mm_maddubs_epi16(row0_u_8x16, _mm_set1_epi8(1));
2572 const __m128i horizontalSum1_u_16x8 = _mm_maddubs_epi16(row1_u_8x16, _mm_set1_epi8(1));
2573
2574 const __m128i sum_u_16x8 = _mm_add_epi16(horizontalSum0_u_16x8, horizontalSum1_u_16x8);
2575
2576 const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2577
2578 const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);
2579
2580 // copy the lower 64 bit to the memory
2581 _mm_storel_epi64((__m128i*)result, mask_u_8x8);
2582}
2583
2584inline void SSE::average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2585{
2586 ocean_assert(image0 && image1);
2587
2588 // first 16 elements
2589 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2590 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2591
2592 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2593 const __m128i firstSumLow = _mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1));
2594 const __m128i firstSumHigh = _mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1));
2595
2596 // build overall sum and add 2 for rounding
2597 const __m128i firstSum = _mm_add_epi16(firstSumLow, _mm_add_epi16(firstSumHigh, _mm_set1_epi32(int(0x00020002))));
2598
2599 // divide by 4 by right shifting of two bits
2600 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2601
2602 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2603 const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2604
2605 // second 16 elements
2606 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2607 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2608
2609 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2610 const __m128i secondSumLow = _mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1));
2611 const __m128i secondSumHigh = _mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1));
2612
2613 // build overall sum and add 2 for rounding
2614 const __m128i secondSum = _mm_add_epi16(secondSumLow, _mm_add_epi16(secondSumHigh, _mm_set1_epi32(int(0x00020002))));
2615
2616 // divide by 4 by right shifting of two bits
2617 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2618
2619 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2620 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2621
2622
2623 // combine both divion results
2624 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2625
2626 // copy the 128 bit to the memory
2627 _mm_storeu_si128((__m128i*)result, division8);
2628
2629 /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2630 const __m128i avgFirstRows = _mm_avg_epu8(firstRow0, firstRow1);
2631 const __m128i avgFirstRowsSwap = _mm_or_si128(_mm_slli_epi16(avgFirstRows, 8), _mm_srli_epi16(avgFirstRows, 8));
2632
2633 const __m128i avgFirst = _mm_avg_epu8(avgFirstRows, avgFirstRowsSwap); // 1 result in 2 uchar
2634 const __m128i avgFristOrdered = _mm_shuffle_epi8(avgFirst, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2635
2636 const __m128i avgSecondRows = _mm_avg_epu8(secondRow0, secondRow1);
2637 const __m128i avgSecondRowsSwap = _mm_or_si128(_mm_slli_epi16(avgSecondRows, 8), _mm_srli_epi16(avgSecondRows, 8));
2638
2639 const __m128i avgSecond = _mm_avg_epu8(avgSecondRows, avgSecondRowsSwap); // 1 result in 2 uchar
2640 const __m128i avgSecondOrdered = _mm_shuffle_epi8(avgSecond, _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0));
2641
2642 // combine both divion results
2643 const __m128i combinedAvg = _mm_or_si128(avgFristOrdered, avgSecondOrdered);
2644
2645 // copy the 128 bit to the memory
2646 _mm_storeu_si128((__m128i*)result, combinedAvg);
2647 */
2648}
2649
2650inline void SSE::average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2651{
2652 ocean_assert(image0 != nullptr && image1 != nullptr);
2653 ocean_assert(threshold >= 1u);
2654
2655 // load first 16 uchars
2656 const __m128i row0A_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2657 const __m128i row1A_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2658
2659 const __m128i horizontalSum0A_u_16x8 = _mm_maddubs_epi16(row0A_u_8x16, _mm_set1_epi8(1));
2660 const __m128i horizontalSum1A_u_16x8 = _mm_maddubs_epi16(row1A_u_8x16, _mm_set1_epi8(1));
2661
2662 const __m128i sumA_u_16x8 = _mm_add_epi16(horizontalSum0A_u_16x8, horizontalSum1A_u_16x8);
2663
2664 const __m128i maskA_u_16x8 = _mm_cmpgt_epi16(sumA_u_16x8, _mm_set1_epi16(short(threshold - 1)));
2665
2666 const __m128i row0B_u_8x16 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2667 const __m128i row1B_u_8x16 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2668
2669 const __m128i horizontalSum0B_u_16x8 = _mm_maddubs_epi16(row0B_u_8x16, _mm_set1_epi8(1));
2670 const __m128i horizontalSum1B_u_16x8 = _mm_maddubs_epi16(row1B_u_8x16, _mm_set1_epi8(1));
2671
2672 const __m128i sumB_u_16x8 = _mm_add_epi16(horizontalSum0B_u_16x8, horizontalSum1B_u_16x8);
2673
2674 const __m128i maskB_u_16x8 = _mm_cmpgt_epi16(sumB_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2675
2676 const __m128i mask_u_8x16 = _mm_or_si128(moveLowBits16_8ToLow64(maskA_u_16x8), moveLowBits16_8ToHigh64(maskB_u_16x8));
2677
2678 // copy the 128 bit to the memory
2679 _mm_storeu_si128((__m128i*)result, mask_u_8x16);
2680}
2681
2682inline void SSE::average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2683{
2684 ocean_assert(image0 && image1);
2685
2686 // 16 * uchar = m128i, but only the first 8 elements are set
2687 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2688 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2689
2690 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2691 const __m128i shuffledRow0 = shuffleNeighbor2Low64BitsToLow16_8(row0);
2692 const __m128i shuffledRow1 = shuffleNeighbor2Low64BitsToLow16_8(row1);
2693
2694 // build sum and add 2 for rounding
2695 const __m128i sumLow = _mm_add_epi16(shuffledRow0, shuffledRow1);
2696 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumLow), _mm_set1_epi32(int(0x00020002)));
2697
2698 // divide by 4 by right shifting of two bits
2699 const __m128i division16 = _mm_srli_epi16(sum, 2);
2700
2701 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2702 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2703
2704 memcpy(result, &division8, sizeof(uint8_t) * 4);
2705}
2706
2707inline void SSE::average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result)
2708{
2709 ocean_assert(image0 && image1);
2710
2711 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2712 const __m128 row0 = _mm_loadu_ps(image0);
2713 const __m128 row1 = _mm_loadu_ps(image1);
2714
2715 // get sum of first 4 elements
2716 const __m128 sumFirst = _mm_add_ps(row0, row1);
2717
2718 // load next 4 elements
2719 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2720 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2721
2722 // get sum of second 4 elements
2723 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2724
2725 // get sum of summed pixels
2726 // mask01000100 = 68u
2727 // mask11101110 = 238u
2728 const __m128 sumComponents = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, 68u), _mm_shuffle_ps(sumFirst, sumSecond, 238u));
2729
2730 // divide by 4 --> multiply by 0.25
2731 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2732
2733 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2734 _mm_storeu_ps(result, division);
2735}
2736
2737inline void SSE::average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2738{
2739 ocean_assert(image0 && image1);
2740
2741 // 16 * uchar = m128i
2742 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2743 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2744
2745 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2746 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2747 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2748
2749 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2750 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2751
2752 // divide by 4 by right shifting of two bits
2753 const __m128i division16 = _mm_srli_epi16(sum, 2);
2754
2755 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2756 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2757
2758 // copy the lower 64 bit to the memory
2759 _mm_storel_epi64((__m128i*)result, division8);
2760}
2761
2762inline void SSE::average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2763{
2764 ocean_assert(image0 && image1);
2765
2766 // first 16 elements: 16 * uchar = m128i
2767 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2768 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2769
2770 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2771 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2772 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2773
2774 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2775 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2776
2777 // divide by 4 by right shifting of two bits
2778 const __m128i division16 = _mm_srli_epi16(sum, 2);
2779
2780 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2781 const __m128i firstDivision8 = moveLowBits16_8ToLow64(division16);
2782
2783 // second 16 elements
2784 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2785 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2786
2787 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2788 const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(secondRow0), shuffleNeighbor2Low64BitsToLow16_8(secondRow1));
2789 const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(secondRow0), shuffleNeighbor2High64BitsToLow16_8(secondRow1));
2790
2791 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2792 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2793
2794 // divide by 4 by right shifting of two bits
2795 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2796
2797 // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2798 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2799
2800
2801 // combine both divion results
2802 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2803
2804 // copy the 128 bit to the memory
2805 _mm_storeu_si128((__m128i*)result, division8);
2806}
2807
2808inline void SSE::average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result)
2809{
2810 ocean_assert(image0 && image1 && result);
2811
2812 // 6 * float = 2 pixel: 00 01 02 03 04 05
2813
2814 // load element 0 up to 3, input does not need to be aligned on any particular boundary.
2815 const __m128 row0 = _mm_loadu_ps(image0);
2816 const __m128 row1 = _mm_loadu_ps(image1);
2817
2818 // get sum of first 4 elements
2819 const __m128 sumFirst = _mm_add_ps(row0, row1);
2820
2821 // load element 2 up to 5 to prevent that we access memory out of our range
2822 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 2);
2823 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 2);
2824
2825 // get sum of second 4 elements
2826 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2827
2828 // get sum of summed pixels
2829 // NOTE: _mm_shuffle_ps resulting first 64bit are always from first __m128, second 64bit from second __m128
2830 // mask111001 = 57u; // 'i+1'th float became 'i'
2831 const __m128 sumComponents = _mm_add_ps(sumFirst, _mm_shuffle_ps(sumSecond, sumSecond, 57u));
2832
2833 // divide by 4 --> multiply by 0.25
2834 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2835
2836 // store 3 elements (96 bit) to the memory
2837
2838#ifdef OCEAN_COMPILER_MSC
2839 memcpy(result, &division.m128_f32[0], sizeof(float) * 3);
2840#else
2841 memcpy(result, &division, sizeof(float) * 3);
2842#endif
2843}
2844
2845inline void SSE::average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2846{
2847 ocean_assert(image0 && image1 && result);
2848
2849 __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2850 __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2851
2852 // distribute the first 12 elements (element 00 up to 11):
2853 // 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
2854 //
2855 // -- -- -- -- -- 08 -- 07 -- 06 -- 02 -- 01 -- 00
2856 // -- -- -- -- -- 11 -- 10 -- 09 -- 05 -- 04 -- 03
2857
2858 __m128i shuffleMaskLow = set128i(0xA0A0A0A0A008A007ull, 0xA006A002A001A000ull);
2859 __m128i shuffleMaskHigh = set128i(0xA0A0A0A0A00BA00Aull, 0xA009A005A004A003ull);
2860
2861 __m128i sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
2862 __m128i sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));
2863
2864 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2865 __m128i sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2866
2867 // divide by 4 by right shifting of two bits
2868 __m128i division16 = _mm_srli_epi16(sum, 2);
2869
2870 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2871 __m128i division8 = _mm_shuffle_epi8(division16, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A00A0806040200ull));
2872
2873
2874 // now we load the remaining 12 elements (however, this time we take element 04 up to 15 to prevent that we access memory out of our range)
2875 // 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
2876 //
2877 // -- -- -- -- -- 12 -- 11 -- 10 -- 06 -- 05 -- 04
2878 // -- -- -- -- -- 15 -- 14 -- 13 -- 09 -- 08 -- 07
2879
2880 row0 = _mm_lddqu_si128((__m128i*)(image0 + 8));
2881 row1 = _mm_lddqu_si128((__m128i*)(image1 + 8));
2882
2883 shuffleMaskLow = set128i(0xA0A0A0A0A00CA00Bull, 0xA00AA006A005A004ull);
2884 shuffleMaskHigh = set128i(0xA0A0A0A0A00FA00Eull, 0xA00DA009A008A007ull);
2885
2886 sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
2887 sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));
2888
2889 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2890 sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2891
2892 // divide by 4 by right shifting of two bits
2893 division16 = _mm_srli_epi16(sum, 2);
2894
2895 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2896 division8 = _mm_or_si128(division8, _mm_shuffle_epi8(division16, set128i(0xA0A0A0A00A080604ull, 0x0200A0A0A0A0A0A0ull)));
2897
2898#ifdef OCEAN_COMPILER_MSC
2899 memcpy(result, &division8.m128i_u8[0], 12);
2900#else
2901 memcpy(result, &division8, 12);
2902#endif
2903}
2904
2905inline void SSE::average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result)
2906{
2907 ocean_assert(image0 && image1);
2908
2909 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2910 const __m128 row0 = _mm_loadu_ps(image0);
2911 const __m128 row1 = _mm_loadu_ps(image1);
2912
2913 // get sum of first 4 elements
2914 const __m128 sumFirstPixel = _mm_add_ps(row0, row1);
2915
2916 // load next 4 elements
2917 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2918 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2919
2920 // get sum of second 4 elements
2921 const __m128 sumSecondPixel = _mm_add_ps(rowSecond0, rowSecond1);
2922
2923 // get sum of summed pixels
2924 const __m128 sumComponents = _mm_add_ps(sumFirstPixel, sumSecondPixel);
2925
2926 // divide by 4 --> multiply by 0.25
2927 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2928
2929 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2930 _mm_storeu_ps(result, division);
2931}
2932
2933inline void SSE::average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2934{
2935 ocean_assert(image0 && image1);
2936
2937 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2938 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2939
2940 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2941 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(row0), shuffleNeighbor4Low64BitsToLow16_8(row1));
2942 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(row0), shuffleNeighbor4High64BitsToLow16_8(row1));
2943
2944 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2945 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2946
2947 // divide by 4 by right shifting of two bits
2948 const __m128i division16 = _mm_srli_epi16(sum, 2);
2949
2950 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2951 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2952
2953 // copy the lower 64 bit to the memory
2954 _mm_storel_epi64((__m128i*)result, division8);
2955}
2956
2957inline void SSE::average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2958{
2959 ocean_assert(image0 && image1);
2960
2961 // first 16 elements
2962 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2963 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2964
2965 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2966 const __m128i firstSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(firstRow0), shuffleNeighbor4Low64BitsToLow16_8(firstRow1));
2967 const __m128i firstSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(firstRow0), shuffleNeighbor4High64BitsToLow16_8(firstRow1));
2968
2969 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2970 const __m128i firstSum = _mm_add_epi16(_mm_hadd_epi16(firstSumLow, firstSumHigh), _mm_set1_epi32(int(0x00020002)));
2971
2972 // divide by 4 by right shifting of two bits
2973 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2974
2975 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2976 const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2977
2978
2979 // second 16 elements
2980 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2981 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2982
2983 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2984 const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(secondRow0), shuffleNeighbor4Low64BitsToLow16_8(secondRow1));
2985 const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(secondRow0), shuffleNeighbor4High64BitsToLow16_8(secondRow1));
2986
2987 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2988 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2989
2990 // divide by 4 by right shifting of two bits
2991 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2992
2993 // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2994 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2995
2996
2997 // combine both divion results
2998 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2999
3000 // copy the 128 bit to the memory
3001 _mm_storeu_si128((__m128i*)result, division8);
3002}
3003
3004inline void SSE::average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
3005{
3006 ocean_assert(image0 && image1 && image2);
3007
3008 /**
3009 * | 1 2 1 |
3010 * 1/16 | 2 4 2 |
3011 * | 1 2 1 |
3012 */
3013
3014 // first 16 elements (actual 14 are used)
3015 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
3016 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
3017 const __m128i firstRow2 = _mm_lddqu_si128((__m128i*)image2);
3018
3019 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum, middle row is summed twice
3020 const __m128i firstSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1)), _mm_add_epi16(removeHighBits16_8(firstRow1), removeHighBits16_8(firstRow2)));
3021 const __m128i firstSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1)), _mm_add_epi16(moveHighBits16_8(firstRow1), moveHighBits16_8(firstRow2)));
3022
3023 // second 16 elements, starting from 15th element
3024 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 14));
3025 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 14));
3026 const __m128i secondRow2 = _mm_lddqu_si128((__m128i*)(image2 + 14));
3027
3028 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum, middle row is summed twice
3029 const __m128i secondSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1)), _mm_add_epi16(removeHighBits16_8(secondRow1), removeHighBits16_8(secondRow2)));
3030 const __m128i secondSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1)), _mm_add_epi16(moveHighBits16_8(secondRow1), moveHighBits16_8(secondRow2)));
3031
3032 // build overall sum and add 8 for rounding
3033 // positions 0, 2, 3, 5, 6 are valid, e.g. pos. 0 contains element00 + element01
3034 const __m128i firstSum = _mm_add_epi16(firstSumEven, _mm_add_epi16(firstSumOdd, _mm_set1_epi32(int(0x00080008))));
3035 // e.g. pos. 0 contains now element00 + element01 + element02
3036 const __m128i firstSumWithEven = _mm_add_epi16(firstSum, _mm_shuffle_epi8(firstSumEven, set128i(0xFFFF0F0E0B0AFFFFull, 0x09080504FFFF0302ull)));
3037 // e.g. pos. 0 contains now element00 + element01 + element02 + element01
3038 const __m128i firstSumWithBoth = _mm_add_epi16(firstSumWithEven, _mm_shuffle_epi8(firstSumOdd, set128i(0xFFFF0D0C0908FFFFull, 0x07060302FFFF0100ull)));
3039
3040 // build overall sum and add 8 for rounding
3041 // positions 1, 2, 4, 5, 7 are valid
3042 const __m128i secondSum = _mm_add_epi16(secondSumEven, _mm_add_epi16(secondSumOdd, _mm_set1_epi32(int(0x00080008))));
3043 const __m128i secondSumWithEven = _mm_add_epi16(secondSum, _mm_shuffle_epi8(secondSumEven, set128i(0x0F0EFFFF0D0C0908ull, 0xFFFF07060302FFFFull)));
3044 const __m128i secondSumWithBoth = _mm_add_epi16(secondSumWithEven, _mm_shuffle_epi8(secondSumOdd, set128i(0x0D0CFFFF0B0A0706ull, 0xFFFF05040100FFFFull)));
3045
3046 // divide by 16 by right shifting of four bits
3047 const __m128i firstDivision16 = _mm_srli_epi16(firstSumWithBoth, 4);
3048 const __m128i secondDivision16 = _mm_srli_epi16(secondSumWithBoth, 4);
3049
3050 // reorder valid elements to lowest bits
3051 const __m128i firstDivision8 = _mm_shuffle_epi8(firstDivision16, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0C0A060400ull));
3052 const __m128i secondDivision8 = _mm_shuffle_epi8(secondDivision16, set128i(0xFFFFFFFFFFFF0E0Aull, 0x080402FFFFFFFFFFull));
3053
3054 // combine both divion results
3055 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
3056
3057 // copy the lowest 10*8 bit to the memory
3058#ifdef OCEAN_COMPILER_MSC
3059 memcpy(result, &division8.m128i_u8[0], 10);
3060#else
3061 memcpy(result, &division8, 10);
3062#endif
3063}
3064
3066{
3067 /**
3068 * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3069 * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3070 * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3071 */
3072
3073 // We create a bit mask for all 16 bit odd values, an odd value will create an active lower bit in each 16 bit value
3074 const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0001000100010001ull, 0x0001000100010001ull));
3075
3076 // We create a bit mask for all 16 bit negative values, a negative value will create an active lower bit in each 16 bit value
3077 const __m128i maskNegatives = _mm_srli_epi16(_mm_and_si128(value, CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull)), 15);
3078
3079 // We add 1 to each 16 bit value having an active 'odd-bit' and active
3080 // 'negative-bit'
3081 return _mm_add_epi16(value, _mm_and_si128(maskNegatives, maskOdds));
3082}
3083
3084inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3085{
3086 ocean_assert(rightShifts < 16u);
3087
3088 // the offset for negative values: 2^shifts - 1
3089 const __m128i offsetForNegatives_s_16x8 = _mm_set1_epi16(short((1u << rightShifts) - 1u));
3090
3091 // bit mask for all 16 bit negative values
3092 const __m128i maskHigh_s_16x8 = CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull);
3093
3094 // 0x0000 for positive values, 0xFFFF for negative values
3095 const __m128i maskNegativeValues_s_16x8 = _mm_cmpeq_epi16(_mm_and_si128(value, maskHigh_s_16x8), maskHigh_s_16x8);
3096
3097 // 0 for positive values, 2^shifts - 1 for negative values
3098 const __m128i offset_s_16x8 = _mm_and_si128(offsetForNegatives_s_16x8, maskNegativeValues_s_16x8);
3099
3100 return _mm_add_epi16(value, offset_s_16x8);
3101}
3102
3103inline __m128i SSE::divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3104{
3105 return _mm_srai_epi16(addOffsetBeforeRightShiftDivisionSigned16Bit(value, rightShifts), int(rightShifts));
3106}
3107
3108inline __m128i SSE::roundedDivideByRightShiftSigned16Bit(const __m128i& value_s16x8, const unsigned int rightShifts)
3109{
3110 ocean_assert(rightShifts >= 1 && rightShifts <= 15);
3111
3112 const __m128i signMask_s16x8 = _mm_srai_epi16(value_s16x8, 15); // 0x0000 for +, 0xFFFF for -
3113
3114 const __m128i absValue_s16x8 = _mm_abs_epi16(value_s16x8);
3115 const __m128i offset_s16x8 = _mm_set1_epi16(1 << (rightShifts - 1));
3116
3117 const __m128i absValueWithOffset_s16x8 = _mm_add_epi16(absValue_s16x8, offset_s16x8);
3118
3119 const __m128i shifted_s16x8 = _mm_srai_epi16(absValueWithOffset_s16x8, rightShifts);
3120
3121 return _mm_sub_epi16(_mm_xor_si128(shifted_s16x8, signMask_s16x8), signMask_s16x8); // restore sign: (shifted ^ sign_mask) - sign_mask
3122}
3123
3124inline int16_t SSE::maximalValueForRoundedDivisionByRightShiftSigned16Bit(const unsigned int rightShifts)
3125{
3126 ocean_assert(rightShifts >= 1 && rightShifts <= 15);
3127
3128 const int32_t maxValue = 32767 - (1 << (rightShifts - 1));
3129
3131
3132 return int16_t(maxValue);
3133}
3134
3136{
3137 /**
3138 * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3139 * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3140 * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3141 */
3142
3143 // We create a bit mask for all 32 bit odd values, an odd value will create an active lower bit in each 32 bit value
3144 const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0000000100000001ull, 0x0000000100000001ull));
3145
3146 // We create a bit mask for all 32 bit negative values, a negative value will create an active lower bit in each 32 bit value
3147 const __m128i maskNegatives = _mm_srli_epi32(_mm_and_si128(value, CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull)), 31);
3148
3149 // We add 1 to each 32 bit value having an active 'odd-bit' and active 'negative-bit'
3150 return _mm_add_epi32(value, _mm_and_si128(maskNegatives, maskOdds));
3151}
3152
3153inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3154{
3155 ocean_assert(rightShifts < 32u);
3156
3157 // the offset for negative values: 2^shifts - 1
3158 const __m128i offsetForNegatives_s_32x4 = _mm_set1_epi32(int((1u << rightShifts) - 1u));
3159
3160 // bit mask for all 32 bit negative values
3161 const __m128i maskHigh_s_32x4 = CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull);
3162
3163 // 0x00000000 for positive values, 0xFFFFFFFF for negative values
3164 const __m128i maskNegativeValues_s_32x4 = _mm_cmpeq_epi32(_mm_and_si128(value, maskHigh_s_32x4), maskHigh_s_32x4);
3165
3166 // 0 for positive values, 2^shifts - 1 for negative values
3167 const __m128i offset_s_32x4 = _mm_and_si128(offsetForNegatives_s_32x4, maskNegativeValues_s_32x4);
3168
3169 return _mm_add_epi32(value, offset_s_32x4);
3170}
3171
3172inline __m128i SSE::divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3173{
3174 return _mm_srai_epi32(addOffsetBeforeRightShiftDivisionSigned32Bit(value, rightShifts), int(rightShifts));
3175}
3176
3177inline void SSE::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
3178{
3179 ocean_assert(source && response && width >= 10u);
3180
3181 // Load 16 unsigned 8-bit values; left/right/top/bottom pixels
3182 const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
3183 const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));
3184
3185 const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
3186 const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));
3187
3188 // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3189 const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3190 //const __m128i horizontalMinusLo = _mm_shuffle_epi8(horizontalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
3191 const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3192
3193 const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3194 //const __m128i horizontalPlusLo = _mm_shuffle_epi8(horizontalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
3195 const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3196
3197 // Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3198 const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
3199 const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);
3200
3201 // Convert the low and high signed 16-bit differences to signed 8-bit and merge them into a single
3202 const __m128i horizontalGradient = _mm_or_si128(
3203 _mm_shuffle_epi8(horizontalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
3204 _mm_shuffle_epi8(horizontalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));
3205
3206 // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3207 const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3208 //const __m128i verticalMinusLo = _mm_shuffle_epi8(verticalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == a[7:0]
3209 const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3210
3211 const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3212 //const __m128i verticalPlusLo = _mm_shuffle_epi8(verticalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == b[7:0]
3213 const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3214
3215 // Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3216 const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
3217 const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);
3218
3219 // Convert the differences to signed char and merge the high and low halves
3220 const __m128i verticalGradient = _mm_or_si128(
3221 _mm_shuffle_epi8(verticalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
3222 _mm_shuffle_epi8(verticalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));
3223
3224 // Take the horizontal gradients, [dx0, dx1, dx2, ...], and the vertical gradient, [dy0, dy1, dy2, ...] and interleave them, [dx0, dy0, dx1, dy1, dx2, dy2, ...]
3225 const __m128i interleavedResponseLo = _mm_unpacklo_epi8(horizontalGradient, verticalGradient);
3226 const __m128i interleavedResponseHi = _mm_unpackhi_epi8(horizontalGradient, verticalGradient);
3227
3228 ocean_assert(sizeof(char) == 1ull);
3229 _mm_storeu_si128((__m128i*)response, interleavedResponseLo);
3230 _mm_storeu_si128((__m128i*)(response + 16ull), interleavedResponseHi);
3231}
3232
3233inline void SSE::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
3234{
3235 ocean_assert(source && response && width >= 10u);
3236
3237 // Load 4x(16x8u) values: left/right/top/bottom pixels
3238 const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
3239 const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));
3240
3241 const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
3242 const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));
3243
3244 // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3245 const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3246 const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3247
3248 const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3249 const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3250
3251 // Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3252 const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
3253 const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);
3254
3255 // Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
3256 const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
3257 const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3258
3259 const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
3260 const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3261
3262 // Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
3263 const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
3264 const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);
3265
3266 // Squared gradients: h*h, v*v, h*v
3267 const __m128i horizontalHorizontalLo = _mm_mullo_epi16(horizontalGradientLo, horizontalGradientLo);
3268 const __m128i horizontalHorizontalHi = _mm_mullo_epi16(horizontalGradientHi, horizontalGradientHi);
3269
3270 const __m128i verticalVerticalLo = _mm_mullo_epi16(verticalGradientLo, verticalGradientLo);
3271 const __m128i verticalVerticalHi = _mm_mullo_epi16(verticalGradientHi, verticalGradientHi);
3272
3273 const __m128i horzontalVerticalLo = _mm_mullo_epi16(horizontalGradientLo, verticalGradientLo);
3274 const __m128i horzontalVerticalHi = _mm_mullo_epi16(horizontalGradientHi, verticalGradientHi);
3275
3276 // Interleave/pack the above squared gradient, 16S values
3277 //
3278 // a, b, c - Above variables ending in *Lo
3279 // d, e, f - Above variables ending in *Hi
3280 //
3281 // a = [a7, a6, a5, a4, a3, a2, a1, a0]
3282 // b = [b7, b6, b5, b4, b3, b2, b1, b0]
3283 // c = [c7, c6, c5, c4, c3, c2, c1, c0]
3284 //
3285 // d = [d7, d6, d5, d4, d3, d2, d1, d0]
3286 // e = [e7, e6, e5, e4, e3, e2, e1, e0]
3287 // f = [f7, f6, f5, f4, f3, f2, f1, f0]
3288 //
3289 // A = [b2, a2, c1, b1, a1, c0, b0, a0]
3290 // B = [a5, c4, b4, a4, c3, b3, a3, c2]
3291 // C = [c7, b7, a7, c6, b6, a6, c5, b5]
3292 //
3293 // D = [e2, d2, f1, e1, d1, f0, e0, d0]
3294 // E = [d5, f4, e4, d4, f3, e3, d3, f2]
3295 // F = [f7, e7, d7, f6, e6, d6, f5, e5]
3296
3297 const __m128i block0Lo = _mm_or_si128( // == [b2, a2, c1, b1, a1, c0, b0, a0]
3298 _mm_or_si128( // == [b2, a2, 00, b1, a1, 00, b0, a0]
3299 _mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, a2, 00, 00, a1, 00, 00, a0]
3300 _mm_shuffle_epi8(verticalVerticalLo, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [b2, 00, 00, b1, 00, 00, b0, 00]
3301 _mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, c1, 00, 00, c0, 00, 00]
3302
3303 const __m128i block1Lo = _mm_or_si128( // == [a5, c4, b4, a4, c3, b3, a3, c2]
3304 _mm_or_si128( // == [a5, 00, b4, a4, 00, b3, a3, 00]
3305 _mm_shuffle_epi8(horizontalHorizontalLo, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [a5, 00, 00, a4, 00, 00, a4, 00]
3306 _mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, b4, 00, 00, b3, 00, 00]
3307 _mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, c4, 00, 00, c3, 00, 00, c2]
3308
3309 const __m128i block2Lo = _mm_or_si128( // == [c7, b7, a7, c6, b6, a6, c5, b5]
3310 _mm_or_si128( // == [00, b7, a7, 00, b6, a6, 00, b5]
3311 _mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, a7, 00, 00, a6, 00, 00]
3312 _mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, b7, 00, 00, b6, 00, 00, b5]
3313 _mm_shuffle_epi8(horzontalVerticalLo, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [c7, 00, 00, c6, 00, 00, c5, 00]
3314
3315 const __m128i block0Hi = _mm_or_si128( // == [e2, d2, f1, e1, d1, f0, e0, d0]
3316 _mm_or_si128( // == [e2, d2, 00, e1, d1, 00, e0, d0]
3317 _mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, d2, 00, 00, d1, 00, 00, d0]
3318 _mm_shuffle_epi8(verticalVerticalHi, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [e2, 00, 00, e1, 00, 00, e0, 00]
3319 _mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, f1, 00, 00, f0, 00, 00]
3320
3321 const __m128i block1Hi = _mm_or_si128( // == [d5, f4, e4, d4, f3, e3, d3, f2]
3322 _mm_or_si128( // == [d5, 00, e4, d4, 00, e3, d3, 00]
3323 _mm_shuffle_epi8(horizontalHorizontalHi, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [d5, 00, 00, d4, 00, 00, d4, 00]
3324 _mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, e4, 00, 00, e3, 00, 00]
3325 _mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, f4, 00, 00, f3, 00, 00, f2]
3326
3327 const __m128i block2Hi = _mm_or_si128( // == [f7, e7, d7, f6, e6, d6, f5, e5]
3328 _mm_or_si128( // == [00, e7, d7, 00, e6, d6, 00, e5]
3329 _mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, d7, 00, 00, d6, 00, 00]
3330 _mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, e7, 00, 00, e6, 00, 00, e5]
3331 _mm_shuffle_epi8(horzontalVerticalHi, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [f7, 00, 00, f6, 00, 00, f5, 00]
3332
3333 _mm_storeu_si128((__m128i*)response, block0Lo);
3334 _mm_storeu_si128((__m128i*)(response + 8ull), block1Lo);
3335 _mm_storeu_si128((__m128i*)(response + 16ull), block2Lo);
3336 _mm_storeu_si128((__m128i*)(response + 24ull), block0Hi);
3337 _mm_storeu_si128((__m128i*)(response + 32ull), block1Hi);
3338 _mm_storeu_si128((__m128i*)(response + 40ull), block2Hi);
3339}
3340
3341OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2)
3342{
3343 // interleaved R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 X
3344
3345 // channel01 R0 R1 R2 R3 R4 X X X G0 G1 G2 G3 G4 X X X
3346 // channel2 B0 B1 B2 B3 B4 X X X 0 0 0 0 0 0 0 0
3347
3348 channel01 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFF0d0a070401ull, 0xFFFFFF0c09060300ull));
3349
3350 channel2 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull));
3351}
3352
3353OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2)
3354{
3355 // interleavedA R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
3356 // interleavedB G5 B5 R6 G6 B6 R7 G7 B7 X X X X X X X X
3357
3358 // channel01 R0 R1 R2 R3 R4 R5 R6 R7 G0 G1 G2 G3 G4 G5 G6 G7
3359 // channel2 B0 B1 B2 B3 B4 B5 B6 B7 0 0 0 0 0 0 0 0
3360
3361 channel01 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFF0d0a070401ull, 0xFFFF0f0c09060300ull)),
3362 _mm_shuffle_epi8(interleavedB, set128i(0x060300FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));
3363
3364 channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
3365 _mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFFFFFFull, 0x070401FFFFFFFFFFull)));
3366}
3367
3368OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3369{
3370 channel0 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFF0f0c09060300ull)),
3371 _mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0e0b08ull, 0x0502FFFFFFFFFFFFull)),
3372 _mm_shuffle_epi8(interleavedC, set128i(0x0d0a070401FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3373
3374 channel1 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
3375 _mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
3376 _mm_shuffle_epi8(interleavedC, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3377
3378 channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
3379 _mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFF0d0aull, 0x070401FFFFFFFFFFull)),
3380 _mm_shuffle_epi8(interleavedC, set128i(0x0f0c09060300FFFFull, 0xFFFFFFFFFFFFFFFFull))));
3381}
3382
3383inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3384{
3385 ocean_assert(interleaved != nullptr);
3386
3387 deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0, channel1, channel2);
3388}
3389
3390inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2)
3391{
3392 ocean_assert(interleaved && channel0 && channel1 && channel2);
3393
3394 __m128i channel0_128, channel1_128, channel2_128;
3395 deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0_128, channel1_128, channel2_128);
3396
3397 store128i(channel0_128, channel0);
3398 store128i(channel1_128, channel1);
3399 store128i(channel2_128, channel2);
3400}
3401
3402inline void SSE::deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3403{
3404 ocean_assert(interleaved != nullptr);
3405
3406 deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3), channel0, channel1, channel2);
3407}
3408
3409OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC)
3410{
3411 interleavedA = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0x05FFFF04FFFF03FFull, 0xFF02FFFF01FFFF00ull)),
3412 _mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFFFF04FFFF03FFFFull, 0x02FFFF01FFFF00FFull)),
3413 _mm_shuffle_epi8(channel2, set128i(0xFF04FFFF03FFFF02ull, 0xFFFF01FFFF00FFFFull))));
3414
3415 interleavedB = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFF0AFFFF09FFFF08ull, 0xFFFF07FFFF06FFFFull)),
3416 _mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0x0AFFFF09FFFF08FFull, 0xFF07FFFF06FFFF05ull)),
3417 _mm_shuffle_epi8(channel2, set128i(0xFFFF09FFFF08FFFFull, 0x07FFFF06FFFF05FFull))));
3418
3419 interleavedC = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFFFF0FFFFF0EFFFFull, 0x0DFFFF0CFFFF0BFFull)),
3420 _mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFF0FFFFF0EFFFF0Dull, 0xFFFF0CFFFF0BFFFFull)),
3421 _mm_shuffle_epi8(channel2, set128i(0x0FFFFF0EFFFF0DFFull, 0xFF0CFFFF0BFFFF0Aull))));
3422}
3423
3424OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved)
3425{
3426 ocean_assert(channel0 && channel1 && channel2 && interleaved);
3427
3428 __m128i interleavedA_128, interleavedB_128, interleavedC_128;
3429 interleave3Channel8Bit48Elements(load128i(channel0), load128i(channel1), load128i(channel2), interleavedA_128, interleavedB_128, interleavedC_128);
3430
3431 store128i(interleavedA_128, interleaved + 0);
3432 store128i(interleavedB_128, interleaved + 16);
3433 store128i(interleavedC_128, interleaved + 32);
3434}
3435
3436OCEAN_FORCE_INLINE void SSE::store1Channel8Bit8ElementsTo3Channels24Elements(const __m128i& singleChannel_u_8x8, uint8_t* interleaved)
3437{
3438 ocean_assert(interleaved != nullptr);
3439
3440 // singleChannel_u_8x8 contains 8 elements in lower 8 bytes: [s0, s1, s2, s3, s4, s5, s6, s7]
3441
3442 const __m128i shuffleMask0 = _mm_set_epi8(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0);
3443 const __m128i interleaved0 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask0);
3444
3445 const __m128i shuffleMask1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 7, 7, 7, 6, 6, 6, 5, 5);
3446 const __m128i interleaved1 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask1);
3447
3448 _mm_storeu_si128((__m128i*)(interleaved + 0), interleaved0);
3449 _mm_storel_epi64((__m128i*)(interleaved + 16), interleaved1);
3450}
3451
3452OCEAN_FORCE_INLINE void SSE::store1Channel8Bit8ElementsTo4Channels32ElementsWithConstantLastChannel(const __m128i& singleChannel_u_8x8, const uint8_t lastChannelValue, uint8_t* interleaved)
3453{
3454 ocean_assert(interleaved != nullptr);
3455
3456 // singleChannel_u_8x8 contains 8 elements in lower 8 bytes: [s0, s1, s2, s3, s4, s5, s6, s7]
3457
3458 const __m128i shuffleMask0 = _mm_set_epi8(-128, 3, 3, 3, -128, 2, 2, 2, -128, 1, 1, 1, -128, 0, 0, 0); // -128 means set to zero, for 4th channel positions
3459 const __m128i shuffleMask1 = _mm_set_epi8(-128, 7, 7, 7, -128, 6, 6, 6, -128, 5, 5, 5, -128, 4, 4, 4);
3460
3461 // expand to first 3 channels with zero in 4th channel positions
3462 __m128i result0 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask0);
3463 __m128i result1 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask1);
3464
3465 const __m128i channel4Mask = _mm_set_epi8(-1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0);
3466
3467 const __m128i lastChannelValue_u_8x16 = _mm_set1_epi8(char(lastChannelValue));
3468
3469 result0 = _mm_blendv_epi8(result0, lastChannelValue_u_8x16, channel4Mask);
3470 result1 = _mm_blendv_epi8(result1, lastChannelValue_u_8x16, channel4Mask);
3471
3472 _mm_storeu_si128((__m128i*)(interleaved + 0), result0);
3473 _mm_storeu_si128((__m128i*)(interleaved + 16), result1);
3474}
3475
3476OCEAN_FORCE_INLINE void SSE::reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3477{
3478 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3479
3480 // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3481 // Y A Y A Y A Y A Y A Y A Y A Y A
3482 // output: A Y A Y A Y A Y A Y A Y A Y A Y
3483 // 1 0 3 2 5 4 7 6 9 8 B A D C F E
3484
3485 const __m128i shuffleMask_u_16x8 = set128i(0x0E0F0C0D0A0B0809ull, 0x0607040502030001ull);
3486
3487 store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3488 store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3489}
3490
3491OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2)
3492{
3493 reversedInterleaved0 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFF0c0d0e090a0b06ull, 0x0708030405000102ull)),
3494 _mm_shuffle_epi8(interleaved1, set128i(0x01FFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull)));
3495
3496 reversedInterleaved1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFF0fFFull)),
3497 _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0x0fFF0b0c0d08090aull, 0x050607020304FF00ull)),
3498 _mm_shuffle_epi8(interleaved2, set128i(0xFF00FFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3499
3500 reversedInterleaved2 = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFF0eull)),
3501 _mm_shuffle_epi8(interleaved2, set128i(0x0d0e0f0a0b0c0708ull, 0x09040506010203FFull)));
3502}
3503
3504OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* const reversedInterleaved)
3505{
3506 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3507
3508 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3509 reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3510
3511 store128i(reversedInterleaved0, reversedInterleaved);
3512 store128i(reversedInterleaved1, reversedInterleaved + 16);
3513 store128i(reversedInterleaved2, reversedInterleaved + 32);
3514}
3515
3516OCEAN_FORCE_INLINE void SSE::reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3517{
3518 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3519
3520 // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3521 // R G B A R G B A R G B A R G B A
3522 // output: A B G R A B G R A B G R A B G R
3523 // 3 2 1 0 7 6 5 4 B A 9 8 F E D C
3524
3525 const __m128i shuffleMask_u_16x8 = set128i(0x0C0D0E0F08090A0Bull, 0x0405060700010203ull);
3526
3527 store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3528 store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3529 store128i(_mm_shuffle_epi8(load128i(interleaved + 32), shuffleMask_u_16x8), reversedInterleaved + 32);
3530 store128i(_mm_shuffle_epi8(load128i(interleaved + 48), shuffleMask_u_16x8), reversedInterleaved + 48);
3531}
3532
3533inline void SSE::reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved)
3534{
3535 ocean_assert(interleaved);
3536
3537 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3538 reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3539
3540 store128i(reversedInterleaved0, interleaved);
3541 store128i(reversedInterleaved1, interleaved + 16);
3542 store128i(reversedInterleaved2, interleaved + 32);
3543}
3544
3545inline void SSE::swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second)
3546{
3547 ocean_assert(first && second && first != second);
3548
3549 __m128i first0, first1, first2;
3550 reverseChannelOrder3Channel8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3551
3552 __m128i second0, second1, second2;
3553 reverseChannelOrder3Channel8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3554
3555 store128i(first0, second);
3556 store128i(first1, second + 16);
3557 store128i(first2, second + 32);
3558
3559 store128i(second0, first);
3560 store128i(second1, first + 16);
3561 store128i(second2, first + 32);
3562}
3563
3564inline void SSE::reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2)
3565{
3566 const __m128i mask = set128i(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
3567
3568 reversedElements0 = _mm_shuffle_epi8(elements2, mask);
3569 reversedElements1 = _mm_shuffle_epi8(elements1, mask);
3570 reversedElements2 = _mm_shuffle_epi8(elements0, mask);
3571}
3572
3573inline void SSE::reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements)
3574{
3575 ocean_assert(elements && reversedElements);
3576
3577 __m128i reversedElements0, reversedElements1, reversedElements2;
3578 reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3579
3580 store128i(reversedElements0, reversedElements);
3581 store128i(reversedElements1, reversedElements + 16);
3582 store128i(reversedElements2, reversedElements + 32);
3583}
3584
3585inline void SSE::reverseElements8Bit48Elements(uint8_t* elements)
3586{
3587 ocean_assert(elements);
3588
3589 __m128i reversedElements0, reversedElements1, reversedElements2;
3590 reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3591
3592 store128i(reversedElements0, elements);
3593 store128i(reversedElements1, elements + 16);
3594 store128i(reversedElements2, elements + 32);
3595}
3596
3597inline void SSE::swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second)
3598{
3599 ocean_assert(first && second && first != second);
3600
3601 __m128i first0, first1, first2;
3602 reverseElements8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3603
3604 __m128i second0, second1, second2;
3605 reverseElements8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3606
3607 store128i(first0, second);
3608 store128i(first1, second + 16);
3609 store128i(first2, second + 32);
3610
3611 store128i(second0, first);
3612 store128i(second1, first + 16);
3613 store128i(second2, first + 32);
3614}
3615
3616inline void SSE::shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3617{
3618 ocean_assert(elements && shiftedElements);
3619
3620 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0c0f0e0d080b0a09ull, 0x0407060500030201ull)), shiftedElements);
3621}
3622
3623inline void SSE::shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3624{
3625 ocean_assert(elements && shiftedElements);
3626
3627 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0003020104070605ull, 0x080b0a090c0f0e0dull)), shiftedElements);
3628}
3629
3630inline void SSE::shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3631{
3632 ocean_assert(elements && shiftedElements);
3633
3634 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0e0d0c0f0a09080bull, 0x0605040702010003ull)), shiftedElements);
3635}
3636
3637inline void SSE::shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3638{
3639 ocean_assert(elements && shiftedElements);
3640
3641 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0201000306050407ull, 0x0a09080b0e0d0c0full)), shiftedElements);
3642}
3643
3644inline __m128i SSE::sum1Channel8Bit16Elements(const __m128i& elements)
3645{
3646 const __m128i zero = _mm_setzero_si128();
3647 const __m128i sum = _mm_sad_epu8(elements, zero);
3648
3649 return _mm_add_epi32(_mm_srli_si128(sum, 8), sum);
3650}
3651
3652inline __m128i SSE::sum1Channel8Bit16Elements(const uint8_t* elements)
3653{
3654 ocean_assert(elements != nullptr);
3655
3656 return sum1Channel8Bit16Elements(load128i(elements));
3657}
3658
3659template <bool tBufferHas16Bytes>
3660inline __m128i SSE::sum1Channel8BitFront15Elements(const uint8_t* elements)
3661{
3662 ocean_assert(elements != nullptr);
3663 return sum1Channel8Bit16Elements(load_u8_15_upper_zero<tBufferHas16Bytes>(elements));
3664}
3665
3666inline __m128i SSE::sum1Channel8BitBack15Elements(const uint8_t* elements)
3667{
3668 ocean_assert(elements != nullptr);
3669 return sum1Channel8Bit16Elements(load_u8_16_and_shift_right<1u>(elements));
3670}
3671
3672inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2)
3673{
3674 // Interleaved0: R BGR BGR BGR BGR BGR
3675 // Interleaved1: GR BGR BGR BGR BGR BG
3676 // Interleaved2: BGR BGR BGR BGR BGR B
3677
3678 // BBBBBBBB RRRRRRRR
3679 const __m128i channel0_2First = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFF0e0b080502ull, 0xFFFF0f0c09060300ull)),
3680 _mm_shuffle_epi8(interleaved1, set128i(0x070401FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));
3681
3682 // BBBBBBBB RRRRRRRR
3683 const __m128i channel0_2Second = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFF0d0aull, 0xFFFFFFFFFF0e0b08ull)),
3684 _mm_shuffle_epi8(interleaved2, set128i(0x0f0c09060300FFFFull, 0x0d0a070401FFFFFFull)));
3685
3686 // GGGGGGGG GGGGGGGG
3687 const __m128i channel1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
3688 _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
3689 _mm_shuffle_epi8(interleaved2, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3690
3691 const __m128i zero = _mm_setzero_si128();
3692
3693 // 0000 BBBB 0000 RRRR
3694 const __m128i sum0_2 = _mm_add_epi32(_mm_sad_epu8(channel0_2First, zero), _mm_sad_epu8(channel0_2Second, zero));
3695
3696 // 0000 GGGG 0000 GGGG
3697 const __m128i sum1 = _mm_sad_epu8(channel1, zero);
3698
3699 // 0000 BBBB GGGG RRRR
3700 return _mm_blend_epi16(sum0_2, _mm_add_epi32(_mm_slli_si128(sum1, 4), _mm_srli_si128(sum1, 4)), int(0xC));
3701}
3702
3703inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved)
3704{
3705 ocean_assert(interleaved != nullptr);
3706
3707 return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32));
3708}
3709
3710inline __m128i SSE::sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved)
3711{
3712 ocean_assert(interleaved != nullptr);
3713
3714 return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3));
3715}
3716
3717inline __m128i SSE::load128iLower64(const void* const buffer)
3718{
3719 ocean_assert(buffer != nullptr);
3720 return _mm_loadl_epi64((const __m128i*)(buffer));
3721}
3722
3723inline __m128i SSE::load128i(const void* const buffer)
3724{
3725 ocean_assert(buffer != nullptr);
3726 return _mm_lddqu_si128((const __m128i*)(buffer));
3727}
3728
3729template <bool tBufferHas16Bytes>
3730inline __m128i SSE::load_u8_10_upper_zero(const uint8_t* const buffer)
3731{
3732 ocean_assert(buffer != nullptr);
3733
3734 __m128i result;
3735
3736#ifdef OCEAN_COMPILER_MSC
3737
3738 result.m128i_u64[0] = uint64_t(0);
3739 memcpy(result.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
3740 memcpy(result.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));
3741
3742#else
3743
3744 M128i& ourResult = *((M128i*)(&result));
3745
3746 ourResult.m128i_u64[0] = uint64_t(0);
3747 memcpy(ourResult.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
3748 memcpy(ourResult.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));
3749
3750#endif
3751
3752 return result;
3753}
3754
3755template <>
3756inline __m128i SSE::load_u8_10_upper_zero<true>(const uint8_t* const buffer)
3757{
3758 ocean_assert(buffer != nullptr);
3759
3760 // we load 16 bytes and shift the SSE register by 6 byte afterwards
3761 return _mm_slli_si128(SSE::load128i(buffer), 6);
3762}
3763
3764template <bool tBufferHas16Bytes>
3765inline __m128i SSE::load_u8_15_upper_zero(const uint8_t* const buffer)
3766{
3767 ocean_assert(buffer != nullptr);
3768
3769 __m128i intermediate;
3770 memcpy(&intermediate, buffer, 15);
3771
3772 // we shift the SSE register by 1 byte afterwards
3773 return _mm_slli_si128(intermediate, 1);
3774}
3775
3776template <>
3777inline __m128i SSE::load_u8_15_upper_zero<true>(const uint8_t* const buffer)
3778{
3779 ocean_assert(buffer != nullptr);
3780
3781 // we load 16 bytes and shift the SSE register by 1 byte afterwards
3782 return _mm_slli_si128(_mm_lddqu_si128((__m128i*)(buffer)), 1);
3783}
3784
3785template <bool tBufferHas16Bytes>
3786inline __m128i SSE::load_u8_13_lower_random(const uint8_t* const buffer)
3787{
3788 ocean_assert(buffer != nullptr);
3789
3790 __m128i result;
3791 memcpy(&result, buffer, 13);
3792
3793 return result;
3794}
3795
3796template <>
3797inline __m128i SSE::load_u8_13_lower_random<true>(const uint8_t* const buffer)
3798{
3799 ocean_assert(buffer != nullptr);
3800
3801 // we load the entire 16 bytes to the 128i value as this is the fastest way
3802 return _mm_lddqu_si128((__m128i*)(buffer));
3803}
3804
3805template <bool tBufferHas16Bytes>
3806inline __m128i SSE::load_u8_15_lower_zero(const uint8_t* const buffer)
3807{
3808 ocean_assert(buffer != nullptr);
3809
3810 __m128i result;
3811 memcpy(&result, buffer, 15);
3812
3813#ifdef OCEAN_COMPILER_MSC
3814 result.m128i_u8[15] = 0u;
3815#else
3816 ((M128i&)result).m128i_u8[15] = 0u;
3817#endif
3818
3819 return result;
3820}
3821
3822template <>
3823inline __m128i SSE::load_u8_15_lower_zero<true>(const uint8_t* const buffer)
3824{
3825 ocean_assert(buffer != nullptr);
3826
3827 // we load the entire 16 bytes to the 128i value as this is the fastest way
3828 __m128i result = _mm_lddqu_si128((__m128i*)(buffer));
3829
3830#ifdef OCEAN_COMPILER_MSC
3831 result.m128i_u8[15] = 0u;
3832#else
3833 ((M128i&)result).m128i_u8[15] = 0u;
3834#endif
3835
3836 return result;
3837}
3838
3839template <bool tBufferHas16Bytes>
3840inline __m128i SSE::load_u8_15_lower_random(const uint8_t* const buffer)
3841{
3842 ocean_assert(buffer != nullptr);
3843
3844 __m128i result;
3845 memcpy(&result, buffer, 15);
3846
3847 return result;
3848}
3849
3850template <>
3851inline __m128i SSE::load_u8_15_lower_random<true>(const uint8_t* const buffer)
3852{
3853 ocean_assert(buffer != nullptr);
3854
3855 // we load the entire 16 bytes to the 128i value as this is the fastest way
3856 return _mm_lddqu_si128((__m128i*)(buffer));
3857}
3858
3859template <unsigned int tShiftBytes>
3860inline __m128i SSE::load_u8_16_and_shift_right(const uint8_t* const buffer)
3861{
3862 static_assert(tShiftBytes <= 16u, "Invalid shift!");
3863
3864 ocean_assert(buffer != nullptr);
3865 return _mm_srli_si128(_mm_lddqu_si128((__m128i*)(buffer)), tShiftBytes);
3866}
3867
3868inline void SSE::store128i(const __m128i& value, uint8_t* const buffer)
3869{
3870 ocean_assert(buffer != nullptr);
3871 _mm_storeu_si128((__m128i*)(buffer), value);
3872}
3873
3874inline __m128i SSE::set128i(const unsigned long long high64, const unsigned long long low64)
3875{
3876
3877#ifdef _WINDOWS
3878
3879 #ifdef _WIN64
3880 return _mm_set_epi64x(high64, low64);
3881 #else
3882 return _mm_set_epi32(*(((int*)&high64) + 1), *((int*)&high64), *(((int*)&low64) + 1), *((int*)&low64));
3883 #endif
3884
3885#else
3886
3887 return _mm_set_epi64x(high64, low64);
3888
3889#endif
3890
3891}
3892
3893inline __m128i SSE::removeHighBits32_16(const __m128i& value)
3894{
3895 return _mm_and_si128(value, _mm_set1_epi32(int(0x0000FFFFu)));
3896}
3897
3898inline __m128i SSE::removeLowBits32_16(const __m128i& value)
3899{
3900 return _mm_and_si128(value, _mm_set1_epi32(int(0xFFFF0000u)));
3901}
3902
3903inline __m128i SSE::removeHighBits16_8(const __m128i& value)
3904{
3905 return _mm_and_si128(value, _mm_set1_epi32(int(0x00FF00FFu)));
3906}
3907
3908inline __m128i SSE::removeHighBits16_8_7_lower(const __m128i& value)
3909{
3910 return _mm_and_si128(value, set128i(0x000000FF00FF00FFull, 0x00FF00FF00FF00FFull));
3911}
3912
3913inline __m128i SSE::removeHighBits16_8_7_upper(const __m128i& value)
3914{
3915 return _mm_and_si128(value, set128i(0x00FF00FF00FF00FFull, 0x00FF00FF00FF0000ull));
3916}
3917
3918inline __m128i SSE::moveLowBits16_8ToLow64(const __m128i& value)
3919{
3920 return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0E0C0A0806040200ull));
3921}
3922
3923inline __m128i SSE::moveLowBits32_8ToLow32(const __m128i& value)
3924{
3925 return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A0A0A00C080400ull));
3926}
3927
3928inline __m128i SSE::moveLowBits32_16ToLow64(const __m128i& value)
3929{
3930 return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0D0C090805040100ull));
3931}
3932
3933inline __m128i SSE::moveLowBits16_8ToHigh64(const __m128i& value)
3934{
3935 return _mm_shuffle_epi8(value, set128i(0x0E0C0A0806040200ull, 0xA0A0A0A0A0A0A0A0ull));
3936}
3937
3938inline __m128i SSE::moveHighBits32_16(const __m128i& value)
3939{
3940 // shift the four 32 bit integers by 16 to the right and fill by zeros
3941 return _mm_srli_epi32(value, 16);
3942}
3943
3944inline __m128i SSE::moveHighBits16_8(const __m128i& value)
3945{
3946 return _mm_shuffle_epi8(value, set128i(0xA00FA00DA00BA009ull, 0xA007A005A003A001ull));
3947}
3948
3949inline __m128i SSE::moveHighBits16_8_5(const __m128i& value)
3950{
3951 return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A009ull, 0xA007A005A003A001ull));
3952}
3953
3954inline __m128i SSE::moveHighBits16_8_6(const __m128i& value)
3955{
3956 return _mm_shuffle_epi8(value, set128i(0xFFFFFFFFFF0bFF09ull, 0xFF07FF05FF03FF01ull));
3957}
3958
3959inline __m128i SSE::moveHighBits16_8_7(const __m128i& value)
3960{
3961 return _mm_shuffle_epi8(value, set128i(0xA0A0A00DA00BA009ull, 0xA007A005A003A001ull));
3962}
3963
3964inline __m128i SSE::shuffleLow32ToLow32_8(const __m128i& value)
3965{
3966 return _mm_shuffle_epi8(value, set128i(0xA0A0A003A0A0A002ull, 0xA0A0A001A0A0A000ull));
3967}
3968
3969inline __m128i SSE::shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value)
3970{
3971 // we could also use one of the following mask-defining possibility, all provide the same result
3972 // _mm_set_epi8(0x80, 7, 0x80, 3, 0x80, 6, 0x80, 2, 0x80, 5, 0x80, 1, 0x80, 4, 0x80, 0))
3973 // _mm_set_epi8(0xA0, 7, 0xA0, 3, 0xA0, 6, 0xA0, 2, 0xA0, 5, 0xA0, 1, 0xA0, 4, 0xA0, 0))
3974 // _mm_set_epi8(0xFF, 7, 0xFF, 3, 0xFF, 6, 0xFF, 2, 0xFF, 5, 0xFF, 1, 0xFF, 4, 0xFF, 0))
3975
3976 return _mm_shuffle_epi8(value, set128i(0xA007A003A006A002ull, 0xA005A001A004A000ull));
3977}
3978
3979inline __m128i SSE::shuffleNeighbor4High64BitsToLow16_8(const __m128i& value)
3980{
3981 return _mm_shuffle_epi8(value, set128i(0xA00FA00BA00EA00Aull, 0xA00DA009A00CA008ull));
3982}
3983
3984inline __m128i SSE::shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value)
3985{
3986 return _mm_shuffle_epi8(value, set128i(0xFF07FF05FF06FF04ull, 0xFF03FF01FF02FF00ull));
3987}
3988
3989inline __m128i SSE::shuffleNeighbor2High64BitsToLow16_8(const __m128i& value)
3990{
3991 return _mm_shuffle_epi8(value, set128i(0xFF0FFF0DFF0EFF0Cull, 0xFF0BFF09FF0AFF08ull));
3992}
3993
3995{
3996 return _mm_set1_epi32(int(0x00FF00FFu));
3997}
3998
4000{
4001 return _mm_set1_epi32(int(0x0000FFFFu));
4002}
4003
4004OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1)
4005{
4006 const __m128i lowProducts = _mm_mullo_epi16(values0, values1);
4007 const __m128i highProducts = _mm_mulhi_epi16(values0, values1);
4008
4009 products0 = _mm_unpacklo_epi16(lowProducts, highProducts);
4010 products1 = _mm_unpackhi_epi16(lowProducts, highProducts);
4011}
4012
4013OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1)
4014{
4015 __m128i products0;
4016 __m128i products1;
4017 multiplyInt8x16ToInt32x8(values0, values1, products0, products1);
4018
4019 results0 = _mm_add_epi32(results0, products0);
4020 results1 = _mm_add_epi32(results1, products1);
4021}
4022
4023inline unsigned int SSE::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
4024{
4025 ocean_assert(pixel);
4026 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
4027
4028 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
4029}
4030
4031inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
4032{
4033 ocean_assert(pixel0 && pixel1);
4034
4035 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
4036
4037 return sqrDistance(*pixel0, (uint8_t)interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
4038}
4039
4040inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
4041{
4042 ocean_assert(pixel0 && pixel1);
4043
4044 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
4045 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
4046
4047 return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
4048}
4049
4050}
4051
4052}
4053
4054#endif // OCEAN_HARDWARE_SSE_VERSION >= 41
4055
4056#endif // META_OCEAN_CV_SSE_H
This class implements computer vision functions using SSE extensions.
Definition SSE.h:41
static __m128i divideByRightShiftSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Divides four signed 32 bit values by applying a right shift.
Definition SSE.h:3172
static void average32Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2762
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 16 following pixels for a given 1 channel 8 ...
Definition SSE.h:3177
static unsigned int sum_u32_first_2(const __m128i &value)
Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1368
static void average24Elements3Channel24Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2845
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition SSE.h:1302
static void reverseElements8Bit48Elements(const __m128i &elements0, const __m128i &elements1, const __m128i &elements2, __m128i &reversedElements0, __m128i &reversedElements1, __m128i &reversedElements2)
Reverses the order of 48 elements with 8 bit per element.
Definition SSE.h:3564
static __m128i load128i(const void *const buffer)
Loads a 128i value from the memory.
Definition SSE.h:3723
static void average16Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2737
static __m128i load_u8_16_and_shift_right(const uint8_t *const buffer)
Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified ...
Definition SSE.h:3860
static __m128i moveLowBits32_16ToLow64(const __m128i &value)
Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3928
static __m128i moveLowBits32_8ToLow32(const __m128i &value)
Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0...
Definition SSE.h:3923
static __m128i moveHighBits16_8_6(const __m128i &value)
Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3954
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i &value)
Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right s...
Definition SSE.h:3135
static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d &value)
Adds both individual 64 bit float values of a m128d value and returns the result.
Definition SSE.h:1395
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3341
static void store128i(const __m128i &value, uint8_t *const buffer)
Stores a 128i value to the memory.
Definition SSE.h:3868
static __m128i sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
Definition SSE.h:1473
static __m128i sumInterleave3Channel8Bit45Elements(const uint8_t *interleaved)
Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3710
static __m128i moveLowBits16_8ToHigh64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with ...
Definition SSE.h:3933
static __m128i divideByRightShiftSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight int16_t values by applying a right shift.
Definition SSE.h:3103
static __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3979
static void swapReversedElements8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
Definition SSE.h:3597
static __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit pr...
Definition SSE.h:1411
static void average8ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2505
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:1620
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i &values0, const __m128i &values1, __m128i &results0, __m128i &results1)
Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
Definition SSE.h:4013
static __m128i sumSquareDifference8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 13 elements of a 16 elements buffer with 8 bit prec...
Definition SSE.h:1500
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1377
static __m128i sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit pre...
Definition SSE.h:1418
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2562
static __m128i moveHighBits16_8_5(const __m128i &value)
Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3949
static int16_t maximalValueForRoundedDivisionByRightShiftSigned16Bit(const unsigned int rightShifts)
Returns the maximal value for which the function roundedDivideByRightShiftSigned16Bit() can be applie...
Definition SSE.h:3124
static __m128i shuffleLow32ToLow32_8(const __m128i &value)
Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
Definition SSE.h:3964
static void shiftChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition SSE.h:3616
static __m128i moveHighBits16_8(const __m128i &value)
Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3944
static __m128i removeHighBits16_8_7_upper(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
Definition SSE.h:3913
static void deInterleave3Channel8Bit45Elements(const uint8_t *interleaved, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3402
static unsigned int value_u32(const __m128i &value)
Returns one specific 32 bit unsigned integer value of a m128i value object.
Definition SSE.h:1348
static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const __m128i &channel0, const __m128i &channel1, const __m128i &channel2, __m128i &interleavedA, __m128i &interleavedB, __m128i &interleavedC)
Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3409
static __m128i load_u8_15_upper_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3765
static __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3984
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition SSE.h:1297
static __m128i sum1Channel8Bit16Elements(const __m128i &elements)
Sums 16 elements with 8 bit per element.
Definition SSE.h:3644
static __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3969
static void average8Elements2Channel64Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
Definition SSE.h:2707
static __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative int16_t value, so that each value can be right shifted to allow a ...
Definition SSE.h:3084
static __m128i load_u8_15_lower_random(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3840
static __m128i removeHighBits16_8_7_lower(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
Definition SSE.h:3908
static void average8Elements4Channel128Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
Definition SSE.h:2905
static __m128i load_u8_10_upper_zero(const uint8_t *const buffer)
Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes,...
Definition SSE.h:3730
static __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for 16 elements of a 16 elements buffer with 8 bit precision.
Definition SSE.h:1580
static __m128i moveHighBits32_16(const __m128i &value)
Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
Definition SSE.h:3938
static void average16Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2933
static __m128i moveHighBits16_8_7(const __m128i &value)
Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3959
static __m128i roundedDivideByRightShiftSigned16Bit(const __m128i &value_s16x8, const unsigned int rightShifts)
Applies a rounded division by a right shift for eight int16_t values.
Definition SSE.h:3108
static __m128i bitMaskRemoveHigh32_16()
Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
Definition SSE.h:3999
static __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1587
static __m128i removeHighBits32_16(const __m128i &value)
Removes the higher 16 bits of four 32 bit elements.
Definition SSE.h:3893
static __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3989
static void average6Elements3Channel96Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
Definition SSE.h:2808
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition SSE.h:2301
static __m128i interpolation3Channel24Bit12Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2114
static __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to al...
Definition SSE.h:3153
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2154
static void average8Elements1Channel32Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
Definition SSE.h:2447
static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo4Channels32ElementsWithConstantLastChannel(const __m128i &singleChannel_u_8x8, const uint8_t lastChannelValue, uint8_t *interleaved)
Stores 8 single-channel 8-bit elements as 32 interleaved 4-channel elements (8 elements -> 8×4 = 32 b...
Definition SSE.h:3452
static void shiftChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3630
static void average8Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2481
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3353
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition SSE.h:1292
static __m128i interpolation1Channel8Bit15Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2062
static uint16_t value_u16(const __m128i &value)
Returns one specific 16 bit unsigned integer value of a m128i value object.
Definition SSE.h:1336
static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2, __m128i &reversedInterleaved0, __m128i &reversedInterleaved1, __m128i &reversedInterleaved2)
Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channe...
Definition SSE.h:3491
static __m128i sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit pre...
Definition SSE.h:1404
static __m128i removeLowBits32_16(const __m128i &value)
Removes the lower 16 bits of four 32 bit elements.
Definition SSE.h:3898
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:1770
static uint8_t value_u8(const __m128i &value)
Returns one specific 8 bit unsigned integer value of a m128i value object.
Definition SSE.h:1313
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 16 fol...
Definition SSE.h:3233
static __m128i bitMaskRemoveHigh16_8()
Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
Definition SSE.h:3994
static __m128i removeHighBits16_8(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements.
Definition SSE.h:3903
static __m128i sum1Channel8BitBack15Elements(const uint8_t *elements)
Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is ...
Definition SSE.h:3666
static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo3Channels24Elements(const __m128i &singleChannel_u_8x8, uint8_t *interleaved)
Stores 8 single-channel 8-bit elements as 24 interleaved 3-channel elements (8 elements -> 8×3 = 24 b...
Definition SSE.h:3436
static __m128i load_u8_15_lower_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3806
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3368
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1570
static __m128i sumInterleave3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2)
Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3672
static void average32Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2957
static void average30Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition SSE.h:3004
static __m128i sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1528
static __m128i sum1Channel8BitFront15Elements(const uint8_t *elements)
Sums the first 15 elements of a buffer with 8 bit per element.
Definition SSE.h:3660
static void average32ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 32 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
Definition SSE.h:2650
static void average32Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2584
static __m128i sumSquareDifference8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 12 elements of a 16 elements buffer with 8 bit prec...
Definition SSE.h:1445
static OCEAN_FORCE_INLINE float sum_f32_4(const __m128 &value)
Adds the four (all four) individual 32 bit float values of a m128 value and returns the result.
Definition SSE.h:1386
static __m128i load_u8_13_lower_random(const uint8_t *const buffer)
Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes,...
Definition SSE.h:3786
static void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interl...
Definition SSE.h:3545
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition SSE.h:1359
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition SSE.h:1307
static __m128i moveLowBits16_8ToLow64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3918
static __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
Definition SSE.h:1555
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition SSE.h:4023
static void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3637
static __m128i load128iLower64(const void *const buffer)
Loads the lower 64 bit of a 128i value from the memory.
Definition SSE.h:3717
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition SSE.h:4031
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3874
static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels...
Definition SSE.h:3516
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i &value)
Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right s...
Definition SSE.h:3065
static void average8Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2682
static void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition SSE.h:3623
static __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1563
static void average16Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2527
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i &values0, const __m128i &values1, __m128i &products0, __m128i &products1)
Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
Definition SSE.h:4004
static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels...
Definition SSE.h:3476
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:1916
This class provides basic numeric functionalities.
Definition Numeric.h:57
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
This union defines a wrapper for the __m128 SSE intrinsic data type.
Definition SSE.h:70
float m128_f32[4]
The four 32 bit elements.
Definition SSE.h:72
This union defines a wrapper for the __m128d SSE intrinsic data type.
Definition SSE.h:81
double m128d_f64[2]
The two 64 bit elements.
Definition SSE.h:83
This union defines a wrapper for the __m128i SSE intrinsic data type.
Definition SSE.h:50
uint64_t m128i_u64[2]
The two 64 bit elements.
Definition SSE.h:52
uint16_t m128i_u16[8]
The eight 16 bit elements.
Definition SSE.h:58
uint32_t m128i_u32[4]
The four 32 bit elements.
Definition SSE.h:55
uint8_t m128i_u8[16]
The sixteen 8 bit elements.
Definition SSE.h:61