Ocean
Loading...
Searching...
No Matches
SSE.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_SSE_H
9#define META_OCEAN_CV_SSE_H
10
11#include "ocean/cv/CV.h"
12
14
15#include "ocean/math/Math.h"
16
17#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
18
19// SSE2 include files
20#include <emmintrin.h>
21#include <immintrin.h>
22#include <mmintrin.h>
23
24// SSE3 include files
25#include <pmmintrin.h>
26#include <mmintrin.h>
27
28// SSE4 include files
29#include <smmintrin.h>
30
31namespace Ocean
32{
33
34namespace CV
35{
36
37/**
38 * This class implements computer vision functions using SSE extensions.
39 * @ingroup cv
40 */
41class SSE
42{
43 public:
44
45#if !defined(OCEAN_COMPILER_MSC)
46
47 /**
48 * This union defines a wrapper for the __m128i SSE intrinsic data type.
49 */
50 union M128i
51 {
52 /// The 16 bytes viewed as two 64 bit unsigned integers.
53 uint64_t m128i_u64[2];
54
55 /// The 16 bytes viewed as four 32 bit unsigned integers.
56 uint32_t m128i_u32[4];
57
58 /// The 16 bytes viewed as eight 16 bit unsigned integers.
59 uint16_t m128i_u16[8];
60
61 /// The 16 bytes viewed as sixteen 8 bit unsigned integers.
62 uint8_t m128i_u8[16];
63 };
64
65 static_assert(sizeof(M128i) == 16, "Invalid data type!");
66
67 /**
68 * This union defines a wrapper for the __m128 SSE intrinsic data type.
69 */
70 union M128
71 {
72 /// The 16 bytes viewed as four 32 bit single precision floats.
73 float m128_f32[4];
74 };
75
76 static_assert(sizeof(M128) == 16, "Invalid data type!");
77
78 /**
79 * This union defines a wrapper for the __m128d SSE intrinsic data type.
80 */
81 union M128d
82 {
83 /// The 16 bytes viewed as two 64 bit double precision floats.
84 double m128d_f64[2];
85 };
86
87 static_assert(sizeof(M128d) == 16, "Invalid data type!");
88
89#endif
90
91 public:
92
93 /**
94 * Prefetches a block of temporal memory into all cache levels.
95 * @param data Data to be prefetched
96 */
97 static inline void prefetchT0(const void* const data);
98
99 /**
100 * Prefetches a block of temporal memory in all cache levels except 0th cache level.
101 * @param data Data to be prefetched
102 */
103 static inline void prefetchT1(const void* const data);
104
105 /**
106 * Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
107 * @param data Data to be prefetched
108 */
109 static inline void prefetchT2(const void* const data);
110
111 /**
112 * Prefetches a block of non-temporal memory into non-temporal cache structure.
113 * @param data Data to be prefetched
114 */
115 static inline void prefetchNTA(const void* const data);
116
117 /**
118 * Returns one specific 8 bit unsigned integer value of a m128i value object.
119 * @param value The value from which the 8 bit value will be returned
120 * @return The requested 8 bit value
121 * @tparam tIndex The index of the requested 8 bit integer value, with range [0, 15]
122 */
123 template <unsigned int tIndex>
124 static inline uint8_t value_u8(const __m128i& value);
125
126 /**
127 * Returns one specific 8 bit unsigned integer value of a m128i value object.
128 * @param value The value from which the 8 bit value will be returned
129 * @param index The index of the requested 8 bit integer value, with range [0, 15]
130 * @return The requested 8 bit value
131 */
132 static inline uint8_t value_u8(const __m128i& value, const unsigned int index);
133
134 /**
135 * Returns one specific 16 bit unsigned integer value of a m128i value object.
136 * @param value The value from which the 16 bit value will be returned
137 * @return The requested 16 bit value
138 * @tparam tIndex The index of the requested 16 bit integer value, with range [0, 7]
139 */
140 template <unsigned int tIndex>
141 static inline uint16_t value_u16(const __m128i& value);
142
143 /**
144 * Returns one specific 32 bit unsigned integer value of a m128i value object.
145 * @param value The value from which the 32 bit value will be returned
146 * @return The requested 32 bit value
147 * @tparam tIndex The index of the requested 32 bit integer value, with range [0, 3]
148 */
149 template <unsigned int tIndex>
150 static inline unsigned int value_u32(const __m128i& value);
151
152 /**
153 * Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the result.
154 * @param value The value which elements will be added
155 * @return The resulting sum value
156 */
157 static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i& value);
158
159 /**
160 * Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
161 * @param value The value which elements will be added
162 * @return The resulting sum value
163 */
164 static inline unsigned int sum_u32_first_2(const __m128i& value);
165
166 /**
167 * Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
168 * @param value The value which elements will be added
169 * @return The resulting sum value
170 */
171 static inline unsigned int sum_u32_first_third(const __m128i& value);
172
173 /**
174 * Adds the four (all four) individual 32 bit float of a m128 value and returns the result.
175 * @param value The value which elements will be added
176 * @return The resulting sum value
177 */
178 static OCEAN_FORCE_INLINE float sum_f32_4(const __m128& value);
179
180 /**
181 * Adds the two (all two) individual 64 bit float of a m128d value and returns the result.
182 * @param value The value which elements will be added
183 * @return The resulting sum value
184 */
185 static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d& value);
186
187 /**
188 * Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit precision.
189 * @param image0 First 11 elements to determine the ssd for, may be non aligned
190 * @param image1 Second 11 elements to determine the ssd for, may be non aligned
191 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
192 */
193 static inline __m128i sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
194
195 /**
196 * Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit precision, the remaining 4 elements are set to zero.
197 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
198 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [00 01 02 03 04 05 06 07 08 09 10 11 NA NA NA NA].
199 * @param image0 First 12 (+4) elements to determine the ssd for, with any alignment
200 * @param image1 Second 12 (+4) elements to determine the ssd for, with any alignment
201 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
202 */
203 static inline __m128i sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
204
205 /**
206 * Sum square difference determination for the last 12 elements of an 16 elements buffer with 8 bit precision, the beginning 4 elements are interpreted as zero.
207 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
208 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA NA 04 05 06 07 08 09 10 11 12 13 14 15].
209 * @param image0 First (4+) 12 elements to determine the ssd for, with any alignment
210 * @param image1 Second (4+) 12 elements to determine the ssd for, with any alignment
211 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
212 */
213 static inline __m128i sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
214
215 /**
216 * Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
217 * This function supports to load the 13 elements from a buffer with only 13 bytes or with a buffer with at least 16 bytes.
218 * @param image0 First 13 elements to determine the ssd for, may be non aligned
219 * @param image1 Second 13 elements to determine the ssd for, may be non aligned
220 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
221 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 13 bytes only
222 */
223 template <bool tBufferHas16Bytes>
224 static inline __m128i sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
225
226 /**
227 * Sum square difference determination for the last 13 elements of an 16 elements buffer with 8 bit precision, the beginning 3 elements are interpreted as zero.
228 * However, the provided buffers must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE registers.<br>
229 * Thus, this function handles two buffers with this pattern (while the memory starts left and ends right): [NA NA NA 03 04 05 06 07 08 09 10 11 12 13 14 15].
230 * @param image0 First (3+) 13 elements to determine the ssd for, may be non aligned
231 * @param image1 Second (3+) 13 elements to determine the ssd for, may be non aligned
232 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
233 */
234 static inline __m128i sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
235
236 /**
237 * Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
238 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
239 * @param image0 First 15 elements to determine the ssd for, may be non aligned
240 * @param image1 Second 15 elements to determine the ssd for, may be non aligned
241 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
242 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
243 */
244 template <bool tBufferHas16Bytes>
245 static inline __m128i sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
246
247 /**
248 * Sum square difference determination for 16 elements with 8 bit precision.
249 * @param image0 First 16 elements to determine the ssd for, may be non aligned
250 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
251 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
252 */
253 static inline __m128i sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
254
255 /**
256 * Sum square difference determination for 16 elements with 8 bit precision.
257 * @param image0 First 16 elements to determine the ssd for, may be non aligned
258 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
259 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
260 */
261 static inline __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1);
262
263 /**
264 * Sum square difference determination for 16 elements with 8 bit precision.
265 * @param row0 First 16 elements to determine the ssd for
266 * @param row1 Second 16 elements to determine the ssd for
267 * @return SSD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
268 */
269 static inline __m128i sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1);
270
271 /**
272 * Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
273 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
274 * @param image0 First row of 8 elements
275 * @param image1 Second row of 8 elements
276 * @param result Resulting 4 average elements
277 */
278 static inline void average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result);
279
280 /**
281 * Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
282 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
283 * @param image0 First row of 8 elements
284 * @param image1 Second row of 8 elements
285 * @param result Resulting 4 average elements
286 */
287 static inline void average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
288
289 /**
290 * Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
291 * The function takes two rows of 8 elements and returns 4 average elements (4 averaged pixels).<br>
292 * @param image0 First row of 8 elements, must be valid
293 * @param image1 Second row of 8 elements, must be valid
294 * @param result Resulting 4 average elements, must be valid
295 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
296 */
297 static inline void average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
298
299 /**
300 * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
301 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
302 * @param image0 First row of 16 elements, must be valid
303 * @param image1 Second row of 16 elements, must be valid
304 * @param result Resulting 8 average elements, must be valid
305 */
306 static inline void average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
307
308 /**
309 * Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
310 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels).<br>
311 * @param image0 First row of 16 elements, must be valid
312 * @param image1 Second row of 16 elements, must be valid
313 * @param result Resulting 8 average elements, must be valid
314 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
315 */
316 static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
317
318 /**
319 * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
320 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
321 * @param image0 First row of 32 elements
322 * @param image1 Second row of 32 elements
323 * @param result Resulting 16 average elements
324 */
325 static inline void average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
326
327 /**
328 * Averages 32 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
329 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels).<br>
330 * @param image0 First row of 32 elements, must be valid
331 * @param image1 Second row of 32 elements, must be valid
332 * @param result Resulting 16 average elements, must be valid
333 * @param threshold The minimal sum value of four pixels to result in a mask with value 255, with range [1, 255 * 4]
334 */
335 static inline void average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold = 776u);
336
337 /**
338 * Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
339 * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels, each with 2 channels).<br>
340 * @param image0 First row of 8 elements
341 * @param image1 Second row of 8 elements
342 * @param result Resulting 4 average elements
343 */
344 static inline void average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
345
346 /**
347 * Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
348 * The function takes two rows of 8 elements and returns 4 average elements (2 averaged pixels).<br>
349 * @param image0 First row of 8 elements
350 * @param image1 Second row of 8 elements
351 * @param result Resulting 4 average elements
352 */
353 static inline void average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result);
354
355 /**
356 * Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
357 * The function takes two rows of 16 elements and returns 8 average elements (4 averaged pixels, each with 2 channels).<br>
358 * @param image0 First row of 16 elements
359 * @param image1 Second row of 16 elements
360 * @param result Resulting 8 average elements
361 */
362 static inline void average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
363
364 /**
365 * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
366 * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).<br>
367 * @param image0 First row of 32 elements
368 * @param image1 Second row of 32 elements
369 * @param result Resulting 16 average elements
370 */
371 static inline void average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
372
373 /**
374 * Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
375 * The function takes two rows of 6 elements and returns 3 average elements (1 averaged pixels, each with 3 channels).<br>
376 * @param image0 First row of 6 elements
377 * @param image1 Second row of 6 elements
378 * @param result Resulting 3 average elements
379 */
380 static inline void average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result);
381
382 /**
383 * Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
384 * The function takes two rows of 24 elements and returns 12 average elements (4 averaged pixels, each with 3 channels).<br>
385 * @param image0 First row of 24 elements
386 * @param image1 Second row of 24 elements
387 * @param result Resulting 12 average elements
388 */
389 static inline void average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
390
391 /**
392 * Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
393 * The function takes two rows of 8 elements and returns 4 average elements (1 averaged pixel).<br>
394 * @param image0 First row of 8 elements
395 * @param image1 Second row of 8 elements
396 * @param result Resulting 4 average elements
397 */
398 static inline void average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result);
399
400 /**
401 * Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
402 * The function takes two rows of 16 elements and returns 8 average elements (2 averaged pixels, each with 4 channels).<br>
403 * @param image0 First row of 16 elements
404 * @param image1 Second row of 16 elements
405 * @param result Resulting 8 average elements
406 */
407 static inline void average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
408
409 /**
410 * Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
411 * The function takes two rows of 32 elements and returns 16 average elements (4 averaged pixels, each with 4 channels).<br>
412 * @param image0 First row of 32 elements
413 * @param image1 Second row of 32 elements
414 * @param result Resulting 16 average elements
415 */
416 static inline void average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result);
417
418 /**
419 * Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
420 * The function takes two rows of 30 elements and returns 10 average elements (10 averaged pixels).<br>
421 * @param image0 First row of 30 elements
422 * @param image1 Second row of 30 elements
423 * @param image2 Third row of 30 elements
424 * @param result Resulting 10 average elements
425 */
426 static inline void average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
427
428 /**
429 * Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
430 * This function must be invoked before the right shift is applied.
431 * @param value The eight signed 16 bit values to be handled
432 * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
433 */
434 static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i& value);
435
436 /**
437 * Adds 2^shifts - 1 to each negative signed 16 bit value, so that each value can be right shifted to allow a correct division by 2^shifts.
438 * This function must be invoked before the right shift is applied.
439 * @param value The eight signed 16 bit values to be handled
440 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
441 * @return The modified value for which division and shift yield equal (and correct!) results
442 */
443 static inline __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts);
444
445 /**
446 * Divides eight signed 16 bit values by applying a right shift.
447 * This is able to determine the correct division result for positive and negative 16 bit values.
448 * @param value The eight signed 16 bit values to be handled
449 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 15]
450 * @return The divided values
451 */
452 static inline __m128i divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts);
453
454 /**
455 * Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right shifted by one bit to allow a correct division by two.
456 * This function must be invoked before the right shift is applied.
457 * @param value The four signed 32 bit values to be handled
458 * @return The modified value for which divide (/ 2) and bit shift (>> 1) yield equal (and correct!) results
459 */
460 static inline __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i& value);
461
462 /**
463 * Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to allow a correct division by 2^shifts.
464 * This function must be invoked before the right shift is applied.
465 * @param value The four signed 32 bit values to be handled
466 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 31]
467 * @return The modified value for which division and shift yield equal (and correct!) results
468 */
469 static inline __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts);
470
471 /**
472 * Divides four signed 32 bit values by applying a right shift.
473 * This is able to determine the correct division result for positive and negative 32 bit values.
474 * @param value The four signed 32 bit values to be handled
475 * @param rightShifts The number of right shifts which needs to be applied, with range [0, 32]
476 * @return The divided values
477 */
478 static inline __m128i divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts);
479
480 /**
481 * Determines the horizontal and the vertical gradients for 16 following pixels for a given 1 channel 8 bit frame.
482 * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
483 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
484 * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
485 * @param width The width of the original frame in pixel, with range [10, infinity)
486 */
487 static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
488
489 /**
490 * Determines the squared horizontal and vertical gradients and the product of both gradients for 16 following pixels for a given 1 channel 8 bit frame.
491 * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
492 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
493 * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
494 * @param width The width of the original frame in pixel, with range [10, infinity)
495 */
496 static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
497
498 /**
499 * Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
500 * @param image0 First 11 elements to determine the sad for, may be non aligned
501 * @param image1 Second 11 elements to determine the sad for, may be non aligned
502 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
503 */
504 static inline __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
505
506 /**
507 * Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
508 * This function supports to load the 10 elements from a buffer with only 10 bytes or with a buffer with at least 16 bytes.
509 * @param image0 First 10 elements to determine the sad for, may be non aligned
510 * @param image1 Second 10 elements to determine the sad for, may be non aligned
511 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
512 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 10 bytes only
513 */
514 template <bool tBufferHas16Bytes>
515 static inline __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
516
517 /**
518 * Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
519 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.
520 * @param image0 First 15 elements to determine the sad for, may be non aligned
521 * @param image1 Second 15 elements to determine the sad for, may be non aligned
522 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
523 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds 15 bytes only
524 */
525 template <bool tBufferHas16Bytes>
526 static inline __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
527
528 /**
529 * Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
530 * The first interpolation element results from the first and second element of both rows.<br>
531 * The second interpolation element results from the second and third element of both rows.<br>
532 * ...<br>
533 * The eighth interpolation element results from the eighth and ninth.<br>
534 * The interpolation is specified by tx and ty with range [0, 128u].<br>
535 * @param values0 First row of 9 elements to be interpolated
536 * @param values1 Second row of 9 elements to be interpolated
537 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
538 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
539 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
540 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
541 * @return Interpolation result for 8 elements, which are 8 pixels
542 */
543 static inline __m128i interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
544
545 /**
546 * Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
547 * The first interpolation element results from the first and second element of both rows.<br>
548 * The second interpolation element results from the second and third element of both rows.<br>
549 * ...<br>
550 * The eighth interpolation element results from the eighth and ninth.<br>
551 * The interpolation is specified by tx and ty with range [0, 128u].<br>
552 * @param values0 First row of 10 elements to be interpolated
553 * @param values1 Second row of 10 elements to be interpolated
554 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
555 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
556 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
557 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
558 * @return Interpolation result for 8 elements, which are 4 pixels
559 */
560 static inline __m128i interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
561
562 /**
563 * Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
564 * The first interpolation element results from the first and second element of both rows.<br>
565 * The second interpolation element results from the second and third element of both rows.<br>
566 * ...<br>
567 * The eighth interpolation element results from the eighth and ninth.<br>
568 * The interpolation is specified by tx and ty with range [0, 128u].<br>
569 * @param values0 First row of 11 elements to be interpolated
570 * @param values1 Second row of 11 elements to be interpolated
571 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
572 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
573 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
574 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
575 * @return Interpolation result for 8 elements, which are (2 2/3 pixels)
576 */
577 static inline __m128i interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
578
579 /**
580 * Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
581 * The interpolation is specified by tx and ty with range [0, 128u].<br>
582 * @param values0 First row of 16 elements to be interpolated
583 * @param values1 Second row of 16 elements to be interpolated
584 * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
585 * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
586 * @return Interpolation result for 15 elements, which are (15 pixels)
587 */
588 static inline __m128i interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
589
590 /**
591 * Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
592 * The interpolation is specified by tx and ty with range [0, 128u].<br>
593 * @param values0 First row of 15 elements to be interpolated
594 * @param values1 Second row of 15 elements to be interpolated
595 * @param fx_fy_fxfy_ In each unsigned 16 bit element: ((128u - tx) * (128u - ty)) | (tx * (128u - ty)) << 16
596 * @param fx_fyfxfy In each unsigned 16 bit element: (128u - tx) * ty | (tx * ty) << 16
597 * @return Interpolation result for 12 elements, which are (4 pixels)
598 */
599 static inline __m128i interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy);
600
601 /**
602 * Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
603 * The first interpolation element results from the first and second element of both rows.<br>
604 * The second interpolation element results from the second and third element of both rows.<br>
605 * ...<br>
606 * The eighth interpolation element results from the eighth and ninth element of both rows.<br>
607 * The interpolation is specified by tx and ty with range [0, 128u].<br>
608 * @param values0 First row of 12 elements to be interpolated
609 * @param values1 Second row of 12 elements to be interpolated
610 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
611 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
612 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
613 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
614 * @return Interpolation result for 8 elements, which are (2 pixels)
615 */
616 static inline __m128i interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
617
618 /**
619 * Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit frames.
620 * The first interpolation element results from the first and second element of both rows.<br>
621 * The second interpolation element results from the second and third element of both rows.<br>
622 * ...<br>
623 * The eighth interpolation element results from the eighth and ninth element of both rows.<br>
624 * The interpolation is specified by tx and ty with range [0, 128u].<br>
625 * @param values0 First row of 16 elements to be interpolated
626 * @param values1 Second row of 16 elements to be interpolated
627 * @param fx_fy_ In each unsigned 16 bit element: Product of (128u - tx) and (128u - ty)
628 * @param fxfy_ In each unsigned 16 bit element: Product of (tx) and (128u - ty)
629 * @param fx_fy In each unsigned 16 bit element: Product of (128u - tx) and (ty)
630 * @param fxfy In each unsigned 16 bit element: Product of (tx) and (ty)
631 * @return Interpolation result for 8 elements, which are (2 pixels)
632 */
633 static inline __m128i interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy);
634
635 /**
636 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
637 * @param pixel0 Upper left pixel in the first frame
638 * @param pixel1 Upper left pixel in the second frame
639 * @param size0 Size of one frame row in bytes
640 * @param size1 Size of one frame row in bytes
641 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
642 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
643 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
644 * @param f1xy Product of the fx and the fy interpolation factor for the second image
645 * @return Interpolated sum of square difference
646 */
647 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
648
649 /**
650 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
651 * @param pixel0 Upper left pixel in the first frame
652 * @param pixel1 Upper left pixel in the second frame
653 * @param size0 Size of one frame row in bytes
654 * @param size1 Size of one frame row in bytes
655 * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
656 * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
657 * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
658 * @param f0xy Product of the fx and the fy interpolation factor for the first image
659 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
660 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
661 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
662 * @param f1xy Product of the fx and the fy interpolation factor for the second image
663 * @return Interpolated sum of square difference
664 */
665 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
666
667 /**
668 * Sum absolute differences determination for 16 elements of an 16 elements buffer with 8 bit precision.
669 * @param image0 First 16 elements to determine the sad for, may be non aligned
670 * @param image1 Second 16 elements to determine the sad for, may be non aligned
671 * @return SAD result distributed over four terms of the sum, thus result is (m128i_u32[0] + m128i_u32[1] + m128i_u32[2] + m128i_u32[3])
672 */
673 static inline __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
674
675 /**
676 * Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
677 * This function converts X CBA CBA CBA CBA CBA to 00000000000CCCCC 000BBBBB000AAAAA.
678 * @param interleaved The 15 elements holding the interleaved image data
679 * @param channel01 Resulting first and second channel elements, first 8 elements of the first channel, followed by 8 elements of the second channel
680 * @param channel2 Resulting third channel elements, first 8 elements of the third channel, followed by zeros
681 */
682 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2);
683
684 /**
685 * Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
686 * This function converts XX XXX XXX CBA CBA CB A CBA CBA CBA CBA CBA to 00000000CCCCCCCC BBBBBBBBAAAAAAAA.
687 * @param interleavedA First 16 elements holding the interleaved image data
688 * @param interleavedB Second 16 elements holding the interleaved image data, the first 8 elements will be used only
689 * @param channel01 Resulting first and second channel elements, first 8 elements of the first channel, followed by 8 elements of the second channel
690 * @param channel2 Resulting third channel elements, first 8 elements of the third channel, followed by zeros
691 */
692 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2);
693
694 /**
695 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
696 * This function converts CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA to CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA.
697 * @param interleavedA First 16 elements holding the interleaved image data
698 * @param interleavedB Second 16 elements holding the interleaved image data
699 * @param interleavedC Third 16 elements holding the interleaved image data
700 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
701 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
702 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
703 */
704 static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2);
705
706 /**
707 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
708 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
709 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
710 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
711 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
712 */
713 static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
714
715 /**
716 * Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
717 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes), must be valid
718 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively, must be valid
719 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively, must be valid
720 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively, must be valid
721 */
722 static inline void deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2);
723
724 /**
725 * Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
726 * @param interleaved 45 elements of an image with 3 channels and 8 bit per element (45 bytes), must be valid
727 * @param channel0 Resulting first channel holding all elements corresponding to the first channel consecutively
728 * @param channel1 Resulting second channel holding all elements corresponding to the second channel consecutively
729 * @param channel2 Resulting third channel holding all elements corresponding to the third channel consecutively
730 */
731 static inline void deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2);
732
733 /**
734 * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
735 * This function converts CCCCCCCCCCCCCCCC BBBBBBBBBBBBBBBB AAAAAAAAAAAAAAAA to CBA CBA CBA CBA CBA C BA CBA CBA CBA CBA CB A CBA CBA CBA CBA CBA.
736 * @param channel0 The 16 elements of the first channel to be interleaved
737 * @param channel1 The 16 elements of the second channel to be interleaved
738 * @param channel2 The 16 elements of the third channel to be interleaved
739 * @param interleavedA Resulting first 16 of the interleaved data
740 * @param interleavedB Resulting second 16 of the interleaved data
741 * @param interleavedC Resulting third 16 of the interleaved data
742 */
743 OCEAN_FORCE_INLINE static void interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC);
744
745 /**
746 * Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
747 * @param channel0 The 16 elements of the first channel to be interleaved, must be valid
748 * @param channel1 The 16 elements of the second channel to be interleaved, must be valid
749 * @param channel2 The 16 elements of the third channel to be interleaved, must be valid
750 * @param interleaved The resulting 48 interleaved elements, must be valid
751 */
752 static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved);
753
754 /**
755 * Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels and 8 bit per element (e.g., YA16 to AY16).
756 * @param interleaved 32 elements of an image with 2 channels and 8 bit per element (32 bytes)
757 * @param reversedInterleaved Resulting 32 elements with reversed channel order
758 */
759 static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
760
761 /**
762 * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element.
763 * @param interleaved0 First 16 elements holding the interleaved image data
764 * @param interleaved1 Second 16 elements holding the interleaved image data
765 * @param interleaved2 Third 16 elements holding the interleaved image data
766 * @param reversedInterleaved0 Resulting first 16 elements holding the interleaved image data with reversed channel order
767 * @param reversedInterleaved1 Resulting second 16 elements holding the interleaved image data with reversed channel order
768 * @param reversedInterleaved2 Resulting third 16 elements holding the interleaved image data with reversed channel order
769 */
770 static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2);
771
772 /**
773 * Reverses the order of the first and last channel of 48 elements (16 pixels) of an image with 3 interleaved channels and 8 bit per element (e.g., RGB24 to BGR24).
774 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
775 * @param reversedInterleaved Resulting 48 elements with reversed channel order
776 */
777 static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
778
779 /**
780 * Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels and 8 bit per element (e.g., RGBA32 to ABGR32).
781 * @param interleaved 64 elements of an image with 4 channels and 8 bit per element (64 bytes)
782 * @param reversedInterleaved Resulting 64 elements with reversed channel order
783 */
784 static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved);
785
786 /**
787 * Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channels and 8 bit per element (in place).
788 * @param interleaved 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
789 */
790 static void reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved);
791
792 /**
793 * Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interleaved channels and 8 bit per element and further swaps both sets.
794 * @param first First 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
795 * @param second Second 48 elements of an image with 3 channels and 8 bit per element (48 bytes)
796 */
797 static inline void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second);
798
799 /**
800 * Reverses the order of 48 elements with 8 bit per element.
801 * @param elements0 First 16 elements
802 * @param elements1 Second 16 elements
803 * @param elements2 Third 16 elements
804 * @param reversedElements0 Resulting reversed first 16 elements
805 * @param reversedElements1 Resulting reversed second 16 elements
806 * @param reversedElements2 Resulting reversed third 16 elements
807 */
808 static inline void reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2);
809
810 /**
811 * Reverses the order of 48 elements with 8 bit per element.
812 * @param elements 48 elements that will be reversed
813 * @param reversedElements Resulting reversed 48 elements
814 */
815 static inline void reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements);
816
817 /**
818 * Reverses the order of 48 elements with 8 bit per element (in place).
819 * @param elements 48 elements that will be reversed
820 */
821 static inline void reverseElements8Bit48Elements(uint8_t* elements);
822
823 /**
824 * Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
825 * @param first First 48 elements that will be reversed and swapped with the second 48 elements
826 * @param second Second 48 elements that will be reversed and swapped with the first 48 elements
827 */
828 static inline void swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second);
829
830 /**
831 * Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back channel.
832 * The function takes four pixels DCBA DCBA DCBA DCBA and provides ADCB ADCB ADCB ADCB.<br>
833 * @param elements 16 elements of 4 pixels to be shifted
834 * @param shiftedElements Resulting shifted elements
835 */
836 static inline void shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
837
838 /**
839 * Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back channel and mirrors the four individual pixels.
840 * @param elements 16 elements of 4 pixels to be shifted and mirrored
841 * @param shiftedElements Resulting shifted and mirrored elements
842 */
843 static inline void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
844
845 /**
846 * Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front channel.
847 * The function takes four pixels DCBA DCBA DCBA DCBA and provides CBAD CBAD CBAD CBAD.<br>
848 * @param elements 16 elements of 4 pixels to be shifted
849 * @param shiftedElements Resulting shifted elements
850 */
851 static inline void shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
852
853 /**
854 * Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front channel and mirrors the four individual pixels.
855 * @param elements 16 elements of 4 pixels to be shifted and mirrored
856 * @param shiftedElements Resulting shifted and mirrored elements
857 */
858 static inline void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements);
859
860 /**
861 * Sums 16 elements with 8 bit per element.
862 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
863 * @param elements 16 elements holding the image data
864 * @return Resulting sums
865 */
866 static inline __m128i sum1Channel8Bit16Elements(const __m128i& elements);
867
868 /**
869 * Sums 16 elements with 8 bit per element.
870 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.<br>
871 * @param elements 16 elements holding the image data
872 * @return Resulting sums
873 */
874 static inline __m128i sum1Channel8Bit16Elements(const uint8_t* elements);
875
876 /**
877 * Sums the first 15 elements of a buffer with 8 bit per element.
878 * This function supports to load the 15 elements from a buffer with only 15 bytes or with a buffer with at least 16 bytes.<br>
879 * If the provided buffer holds at least 16 bytes the load function is much faster compared to the case if the buffer is not larger than 15 bytes.<br>
880 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
881 * @param elements The first 15 elements holding the image data (in a buffer of 15 or 16 bytes, see tBufferHas16Bytes)
882 * @return Resulting sums
883 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
884 */
885 template <bool tBufferHas16Bytes>
886 static inline __m128i sum1Channel8BitFront15Elements(const uint8_t* elements);
887
888 /**
889 * Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is interpreted as zero.
890 * However, the provided buffer must be at least 16 bytes large as the entire 16 bytes will be loaded to the SSE register.<br>
891 * Thus, this functions handles one buffer with this pattern (while the memory starts left and ends right): [NA 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15].
892 * The results are stored as first 32 bit integer value (high bits left, low bits right): ???? ???? ???? 0000.
893 * @param elements (1+) 15 elements holding the image data
894 * @return Resulting sum
895 */
896 static inline __m128i sum1Channel8BitBack15Elements(const uint8_t* elements);
897
898 /**
899 * Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
900 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
901 * @param interleaved0 First 16 elements holding the interleaved image data
902 * @param interleaved1 Second 16 elements holding the interleaved image data
903 * @param interleaved2 Third 16 elements holding the interleaved image data
904 * @return Resulting sums
905 */
906 static inline __m128i sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2);
907
908 /**
909 * Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
910 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
911 * @param interleaved 48 elements holding the interleaved image data
912 * @return Resulting sums
913 */
914 static inline __m128i sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved);
915
916 /**
917 * Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel and element.
918 * The results are stored in three 32 bit integer values (high bits left, low bits right): ???? 2222 1111 0000.<br>
919 * @param interleaved 45 elements holding the interleaved image data
920 * @return Resulting sums
921 */
922 static inline __m128i sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved);
923
924 /**
925 * Loads the lower 64 bit of a 128i value from the memory.
926 * The upper 64 bit are zeroed.
927 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 8 bytes
928 * @return Resulting value
929 */
930 static inline __m128i load128iLower64(const void* const buffer);
931
932 /**
933 * Loads a 128i value from the memory.
934 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary), ensure that the buffer has a size of at least 16 bytes
935 * @return Resulting value
936 */
937 static inline __m128i load128i(const void* const buffer);
938
939 /**
940 * Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes, to a 128i value and sets the remaining bytes of the resulting 128i value to zero.
941 * The loaded memory will be stored in the upper 10 bytes of the 128i value while the lowest remaining 6 bytes will be set to zero.
942 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [09 08 07 06 05 04 03 02 01 00 ZZ ZZ ZZ ZZ ZZ ZZ], with ZZ meaning zero.<br>
943 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
944 * @return Resulting 128 bit value
945 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 10 bytes
946 */
947 template <bool tBufferHas16Bytes>
948 static inline __m128i load_u8_10_upper_zero(const uint8_t* const buffer);
949
950 /**
951 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
952 * The loaded memory will be stored in the upper 15 bytes of the 128i value while the lowest remaining 1 byte will be set to zero.
953 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 ZZ], with ZZ meaning zero.<br>
954 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
955 * @return Resulting 128 bit value
956 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
957 */
958 template <bool tBufferHas16Bytes>
959 static inline __m128i load_u8_15_upper_zero(const uint8_t* const buffer);
960
961 /**
962 * Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes, to a 128i value while the remaining byte of the resulting 128i value will be random.
963 * The loaded memory will be stored in the lower 13 bytes of the 128i value while the highest remaining 3 byte will be random.<br>
964 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? ?? ?? 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
965 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
966 * @return Resulting 128 bit value
967 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 13 bytes
968 */
969 template <bool tBufferHas16Bytes>
970 static inline __m128i load_u8_13_lower_random(const uint8_t* const buffer);
971
972 /**
973 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value and sets the remaining byte of the resulting 128i value to zero.
974 * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be set to zero.<br>
975 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [ZZ 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ZZ meaning zero.<br>
976 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
977 * @return Resulting 128 bit value
978 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
979 */
980 template <bool tBufferHas16Bytes>
981 static inline __m128i load_u8_15_lower_zero(const uint8_t* const buffer);
982
983 /**
984 * Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes, to a 128i value while the remaining byte of the resulting 128i value will be random.
985 * The loaded memory will be stored in the lower 15 bytes of the 128i value while the highest remaining 1 byte will be random.<br>
986 * Thus, the resulting 128 bit value has the following byte pattern (high bits left, low bits right): [?? 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00], with ?? meaning a random value.<br>
987 * @param buffer Buffer to be loaded (does not need to be aligned on any particular boundary)
988 * @return Resulting 128 bit value
989 * @tparam tBufferHas16Bytes True, if the buffer holds at least 16 bytes; False, if the buffer holds only 15 bytes
990 */
991 template <bool tBufferHas16Bytes>
992 static inline __m128i load_u8_15_lower_random(const uint8_t* const buffer);
993
994 /**
995 * Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified number of bytes to the right (by inserting zeros).
996 * This function can be used if the remaining buffer is smaller than 16 bytes while the buffer exceeds/continues in the lower address space (from the original point of interest).<br>
997 * Thus, this function can handle a buffer with the following pattern (with lower address left and high address right):<br>
998 * | ?? ?? ?? ?? ?? ?? ?? ?? ?? V0 V1 V2 V3 V4 V5 V6 V7 V8 V9 |, where ?? represent random values in our buffer (in the lower address space), and VX represent the values of interest and V0 the location to which 'buffer' is pointing to.<br>
999 * by load_u8_16_and_shift_right<6>(buffer - 6);<br>
1000 * The resulting 128i register will then be composed of (high bits left, low bits right): [00 00 00 00 00 00 V9 V8 V7 V6 V5 V4 V3 V2 V1 V0].
1001 * @param buffer The actual address from which the 16 bytes will be loaded, must be valid and must be at least 16 bytes large
1002 * @return The resulting 128 bit value
1003 * @tparam tShiftBytes The number of bytes which will be shifted (to the right) after the memory has loaded, with range [0, 16]
1004 */
1005 template <unsigned int tShiftBytes>
1006 static inline __m128i load_u8_16_and_shift_right(const uint8_t* const buffer);
1007
1008 /**
1009 * Stores a 128i value to the memory.
1010 * @param value Value to be stored
1011 * @param buffer Buffer receiving the value (does not need to be aligned on any particular boundary)
1012 */
1013 static inline void store128i(const __m128i& value, uint8_t* const buffer);
1014
1015 /**
1016 * Sets a 128i value by two 64 bit values.
1017 * @param high64 High 64 bits to be set
1018 * @param low64 Low 64 bits to be set
1019 * @return Resulting 128i value
1020 */
1021 static inline __m128i set128i(const unsigned long long high64, const unsigned long long low64);
1022
1023 /**
1024 * Removes the higher 16 bits of four 32 bit elements.
1025 * Given: PONM-LKJI-HGFE-DCBA<br>
1026 * Result: 00NM-00JI-00FE-00BA<br>
1027 * @param value Value to remove the high bits for
1028 * @return Result
1029 */
1030 static inline __m128i removeHighBits32_16(const __m128i& value);
1031
1032 /**
1033 * Removes the lower 16 bits of four 32 bit elements.
1034 * Given: PONM-LKJI-HGFE-DCBA<br>
1035 * Result: PO00-LK00-HG00-DC00<br>
1036 * @param value Value to remove the lower bits for
1037 * @return Result
1038 */
1039 static inline __m128i removeLowBits32_16(const __m128i& value);
1040
1041 /**
1042 * Removes the higher 8 bits of eight 16 bit elements.
1043 * Given: PONM-LKJI-HGFE-DCBA<br>
1044 * Result: 0O0M-0K0I-0G0E-0C0A<br>
1045 * @param value Value to remove the high bits for
1046 * @return Result
1047 */
1048 static inline __m128i removeHighBits16_8(const __m128i& value);
1049
1050 /**
1051 * Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
1052 * Given: PONM-LKJI-HGFE-DCBA<br>
1053 * Result: 000M-0K0I-0G0E-0C0A<br>
1054 * @param value Value to remove the high bits for
1055 * @return Result
1056 */
1057 static inline __m128i removeHighBits16_8_7_lower(const __m128i& value);
1058
1059 /**
1060 * Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
1061 * Given: PONM-LKJI-HGFE-DCBA<br>
1062 * Result: 0O0M-0K0I-0G0E-0C00<br>
1063 * @param value Value to remove the high bits for
1064 * @return Result
1065 */
1066 static inline __m128i removeHighBits16_8_7_upper(const __m128i& value);
1067
1068 /**
1069 * Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1070 * Given: PONM-LKJI-HGFE-DCBA<br>
1071 * Result: 0000-0000-OMKI-GECA<br>
1072 * @param value Value to remove the high bits for
1073 * @return Result
1074 */
1075 static inline __m128i moveLowBits16_8ToLow64(const __m128i& value);
1076
1077 /**
1078 * Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0.
1079 * Given: PONM-LKJI-HGFE-DCBA<br>
1080 * Result: 0000-0000-0000-MIEA<br>
1081 * @param value Value to remove the high bits for
1082 * @return Result
1083 */
1084 static inline __m128i moveLowBits32_8ToLow32(const __m128i& value);
1085
1086 /**
1087 * Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with 0.
1088 * Given: PONM-LKJI-HGFE-DCBA<br>
1089 * Result: 0000-0000-NMJI-FEBA<br>
1090 * @param value Value to remove the high bits for
1091 * @return Result
1092 */
1093 static inline __m128i moveLowBits32_16ToLow64(const __m128i& value);
1094
1095 /**
1096 * Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with 0.
1097 * Given: PONM-LKJI-HGFE-DCBA<br>
1098 * Result: OMKI-GECA-0000-0000<br>
1099 * @param value Value to remove the high bits for
1100 * @return Result
1101 */
1102 static inline __m128i moveLowBits16_8ToHigh64(const __m128i& value);
1103
1104 /**
1105 * Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
1106 * Given: PONM-LKJI-HGFE-DCBA<br>
1107 * Result: 00PO-00LK-00HG-00DC<br>
1108 * @param value Value to remove the high bits for
1109 * @return Result
1110 */
1111 static inline __m128i moveHighBits32_16(const __m128i& value);
1112
1113 /**
1114 * Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
1115 * Given: PONM-LKJI-HGFE-DCBA<br>
1116 * Result: 0P0N-0L0J-0H0F-0D0B<br>
1117 * @param value Value to remove the high bits for
1118 * @return Result
1119 */
1120 static inline __m128i moveHighBits16_8(const __m128i& value);
1121
1122 /**
1123 * Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
1124 * Given: PONM-LKJI-HGFE-DCBA<br>
1125 * Result: 0000-000J-0H0F-0D0B<br>
1126 * @param value Value to remove the high bits for
1127 * @return Result
1128 */
1129 static inline __m128i moveHighBits16_8_5(const __m128i& value);
1130
1131 /**
1132 * Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
1133 * Given: PONM-LKJI-HGFE-DCBA<br>
1134 * Result: 0000-0L0J-0H0F-0D0B<br>
1135 * @param value Value to remove the high bits for
1136 * @return Result
1137 */
1138 static inline __m128i moveHighBits16_8_6(const __m128i& value);
1139
1140 /**
1141 * Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
1142 * Given: PONM-LKJI-HGFE-DCBA<br>
1143 * Result: 000N-0L0J-0H0F-0D0B<br>
1144 * @param value Value to remove the high bits for
1145 * @return Result
1146 */
1147 static inline __m128i moveHighBits16_8_7(const __m128i& value);
1148
1149 /**
1150 * Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
1151 * Given: PONM-LKJI-HGFE-DCBA<br>
1152 * Result: 000D-000C-000B-000A<br>
1153 * @param value Value to be shuffled
1154 * @return Result
1155 */
1156 static inline __m128i shuffleLow32ToLow32_8(const __m128i& value);
1157
1158 /**
1159 * Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1160 * Given: PONM-LKJI-HGFE-DCBA<br>
1161 * Result: 0H0D-0G0C-0F0B-0E0A<br>
1162 * @param value Value to be shuffled
1163 * @return Result
1164 */
1165 static inline __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value);
1166
1167 /**
1168 * Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1169 * Given: PONM-LKJI-HGFE-DCBA<br>
1170 * Result: 0P0L-0O0K-0N0J-0M0I<br>
1171 * @param value Value to be shuffled
1172 * @return Result
1173 */
1174 static inline __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i& value);
1175
1176 /**
1177 * Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
1178 * @param value Value to be shuffled
1179 * @return Result
1180 */
1181 static inline __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value);
1182
1183 /**
1184 * Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
1185 * @param value Value to be shuffled
1186 * @return Result
1187 */
1188 static inline __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i& value);
1189
1190 /**
1191 * Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
1192 * @return Bitmask
1193 */
1194 static inline __m128i bitMaskRemoveHigh16_8();
1195
1196 /**
1197 * Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
1198 * @return Bitmask
1199 */
1200 static inline __m128i bitMaskRemoveHigh32_16();
1201
1202 /**
1203 * Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
1204 * The pseudo code of the function is as follows:
1205 * <pre>
1206 * products0[0] = values0[0] * values1[0]
1207 * ...
1208 * products0[3] = values0[3] * values1[3]
1209 *
1210 * products1[0] = values0[4] * values1[4]
1211 * ...
1212 * products1[3] = values0[7] * values1[7]
1213 * </pre>
1214 * @param values0 The first 8 int16_t values to be multiplied
1215 * @param values1 The second 8 int16_t values to be multiplied
1216 * @param products0 The resulting first 4 int32_t products
1217 * @param products1 The resulting second 4 int32_t products
1218 */
1219 static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1);
1220
1221 /**
1222 * Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
1223 * The pseudo code of the function is as follows:
1224 * <pre>
1225 * results0[0] += values0[0] * values1[0]
1226 * ...
1227 * results0[3] += values0[3] * values1[3]
1228 *
1229 * results1[0] += values0[4] * values1[4]
1230 * ...
1231 * results1[3] += values0[7] * values1[7]
1232 * </pre>
1233 * @param values0 The first 8 int16_t values to be multiplied
1234 * @param values1 The second 8 int16_t values to be multiplied
1235 * @param results0 The results to which the first 4 int32_t products will be added
1236 * @param results1 The results to which the second 4 int32_t products will be added
1237 */
1238 static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1);
1239
1240 private:
1241
1242 /**
1243 * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
1244 * @param pixel Upper left pixel in the frame
1245 * @param size Size of one frame row in bytes
1246 * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
1247 * @param fxy_ Product of the fx and the inverse fy interpolation factor
1248 * @param fx_y Product of the inverse fx and the fy interpolation factor
1249 * @param fxy Product of the fx and the fy interpolation factor
1250 * @return Interpolated pixel values
1251 */
1252 static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
1253};
1254
1255inline void SSE::prefetchT0(const void* const data)
1256{
1257 _mm_prefetch((char*)data, _MM_HINT_T0);
1258}
1259
1260inline void SSE::prefetchT1(const void* const data)
1261{
1262 _mm_prefetch((char*)data, _MM_HINT_T1);
1263}
1264
1265inline void SSE::prefetchT2(const void* const data)
1266{
1267 _mm_prefetch((char*)data, _MM_HINT_T2);
1268}
1269
1270inline void SSE::prefetchNTA(const void* const data)
1271{
1272 _mm_prefetch((char*)data, _MM_HINT_NTA);
1273}
1274
1275template <unsigned int tIndex>
1276inline uint8_t SSE::value_u8(const __m128i& value)
1277{
1278 static_assert(tIndex <= 15u, "Invalid index!");
1279
1280#ifdef OCEAN_COMPILER_MSC
1281 return value.m128i_u8[tIndex];
1282#else
1283 return ((const M128i*)(&value))->m128i_u8[tIndex];
1284#endif
1285}
1286
1287inline uint8_t SSE::value_u8(const __m128i& value, const unsigned int index)
1288{
1289 ocean_assert(index <= 15u);
1290
1291#ifdef OCEAN_COMPILER_MSC
1292 return value.m128i_u8[index];
1293#else
1294 return ((const M128i*)(&value))->m128i_u8[index];
1295#endif
1296}
1297
1298template <unsigned int tIndex>
1299inline uint16_t SSE::value_u16(const __m128i& value)
1300{
1301 static_assert(tIndex <= 7u, "Invalid index!");
1302
1303#ifdef OCEAN_COMPILER_MSC
1304 return value.m128i_u16[tIndex];
1305#else
1306 return ((const M128i*)(&value))->m128i_u16[tIndex];
1307#endif
1308}
1309
1310template <unsigned int tIndex>
1311inline unsigned int SSE::value_u32(const __m128i& value)
1312{
1313 static_assert(tIndex <= 3u, "Invalid index!");
1314
1315#ifdef OCEAN_COMPILER_MSC
1316 return value.m128i_u32[tIndex];
1317#else
1318 return ((const M128i*)(&value))->m128i_u32[tIndex];
1319#endif
1320}
1321
1322OCEAN_FORCE_INLINE unsigned int SSE::sum_u32_4(const __m128i& value)
1323{
1324#ifdef OCEAN_COMPILER_MSC
1325 return value.m128i_u32[0] + value.m128i_u32[1] + value.m128i_u32[2] + value.m128i_u32[3];
1326#else
1327 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1] + ((const M128i*)(&value))->m128i_u32[2] + ((const M128i*)(&value))->m128i_u32[3];
1328#endif
1329}
1330
1331inline unsigned int SSE::sum_u32_first_2(const __m128i& value)
1332{
1333#ifdef OCEAN_COMPILER_MSC
1334 return value.m128i_u32[0] + value.m128i_u32[1];
1335#else
1336 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[1];
1337#endif
1338}
1339
1340inline unsigned int SSE::sum_u32_first_third(const __m128i& value)
1341{
1342#ifdef OCEAN_COMPILER_MSC
1343 return value.m128i_u32[0] + value.m128i_u32[2];
1344#else
1345 return ((const M128i*)(&value))->m128i_u32[0] + ((const M128i*)(&value))->m128i_u32[2];
1346#endif
1347}
1348
1349OCEAN_FORCE_INLINE float SSE::sum_f32_4(const __m128& value)
1350{
1351#ifdef OCEAN_COMPILER_MSC
1352 return value.m128_f32[0] + value.m128_f32[1] + value.m128_f32[2] + value.m128_f32[3];
1353#else
1354 return ((const M128*)(&value))->m128_f32[0] + ((const M128*)(&value))->m128_f32[1] + ((const M128*)(&value))->m128_f32[2] + ((const M128*)(&value))->m128_f32[3];
1355#endif
1356}
1357
1358OCEAN_FORCE_INLINE double SSE::sum_f64_2(const __m128d& value)
1359{
1360#ifdef OCEAN_COMPILER_MSC
1361 return value.m128d_f64[0] + value.m128d_f64[1];
1362#else
1363 return ((const M128d*)(&value))->m128d_f64[0] + ((const M128d*)(&value))->m128d_f64[1];
1364#endif
1365}
1366
1367inline __m128i SSE::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1368{
1369 ocean_assert(image0 && image1);
1370
1371 return SSE::sumSquareDifference8Bit16Elements(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1372}
1373
1374inline __m128i SSE::sumAbsoluteDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
1375{
1376 ocean_assert(image0 && image1);
1377
1378 return _mm_sad_epu8(_mm_srli_si128(SSE::load128i(image0), 5), _mm_srli_si128(SSE::load128i(image1), 5));
1379}
1380
1381inline __m128i SSE::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
1382{
1383 ocean_assert(image0 && image1);
1384
1385 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1386 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1387
1388 // subtract the 16 elements (usage of saturation and bitwise or operator)
1389 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1390
1391 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1392
1393 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00AA008ull, 0xA006A004A002A000ull));
1394 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1395
1396 // square the 16 elements
1397 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1398 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1399
1400 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1401 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1402 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1403
1404 // 4 32 bit square difference values
1405 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1406}
1407
1408inline __m128i SSE::sumSquareDifference8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
1409{
1410 ocean_assert(image0 && image1);
1411
1412 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1413 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1414
1415 // subtract the 16 elements (usage of saturation and bitwise or operator)
1416 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1417
1418 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1419
1420 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1421 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00FA00Dull, 0xA00BA009A007A005ull));
1422
1423 // square the 16 elements
1424 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1425 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1426
1427 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1428 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1429 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1430
1431 // 4 32 bit square difference values
1432 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1433}
1434
1435template <bool tBufferHas16Bytes>
1436inline __m128i SSE::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
1437{
1438 ocean_assert(image0 && image1);
1439
1440 const __m128i row0 = load_u8_13_lower_random<tBufferHas16Bytes>(image0);
1441 const __m128i row1 = load_u8_13_lower_random<tBufferHas16Bytes>(image1);
1442
1443 // subtract the 16 elements (usage of saturation and bitwise or operator)
1444 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1445
1446 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1447
1448 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00CA00AA008ull, 0xA006A004A002A000ull));
1449 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1450
1451 // square the 16 elements
1452 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1453 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1454
1455 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1456 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1457 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1458
1459 // 4 32 bit square difference values
1460 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1461}
1462
1463inline __m128i SSE::sumSquareDifference8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
1464{
1465 ocean_assert(image0 && image1);
1466
1467 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1468 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1469
1470 // subtract the 16 elements (usage of saturation and bitwise or operator)
1471 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1472
1473 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1474
1475 const __m128i subtractLow = _mm_shuffle_epi8(subtract, set128i(0xA0A0A00FA00DA00Bull, 0xA009A007A005A003ull));
1476 const __m128i subtractHigh = _mm_shuffle_epi8(subtract, set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1477
1478 // square the 16 elements
1479 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1480 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1481
1482 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1483 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1484 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1485
1486 // 4 32 bit square difference values
1487 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1488}
1489
1490template <bool tBufferHas16Bytes>
1491inline __m128i SSE::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
1492{
1493 ocean_assert(image0 && image1);
1494
1495 const __m128i row0 = load_u8_15_lower_random<tBufferHas16Bytes>(image0);
1496 const __m128i row1 = load_u8_15_lower_random<tBufferHas16Bytes>(image1);
1497
1498 // subtract the 16 elements (usage of saturation and bitwise or operator)
1499 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1500
1501 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1502 const __m128i subtractLow = removeHighBits16_8(subtract);
1503 const __m128i subtractHigh = moveHighBits16_8_7(subtract); // the highest high 8 bit are not used due to the only 15 elements
1504
1505 // square the 16 elements
1506 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1507 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1508
1509 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1510 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1511 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1512
1513 // 4 32 bit square difference values
1514 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1515}
1516
1517template <bool tBufferHas16Bytes>
1518inline __m128i SSE::sumAbsoluteDifferences8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
1519{
1520 ocean_assert(image0 && image1);
1521
1522 return _mm_sad_epu8(load_u8_10_upper_zero<tBufferHas16Bytes>(image0), load_u8_10_upper_zero<tBufferHas16Bytes>(image1));
1523}
1524
1525template <bool tBufferHas16Bytes>
1526inline __m128i SSE::sumAbsoluteDifferences8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
1527{
1528 ocean_assert(image0 && image1);
1529
1530 return _mm_sad_epu8(load_u8_15_upper_zero<tBufferHas16Bytes>(image0), load_u8_15_upper_zero<tBufferHas16Bytes>(image1));
1531}
1532
1533inline __m128i SSE::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1534{
1535 ocean_assert(image0 && image1);
1536
1537 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1538 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1539
1540 return sumSquareDifference8Bit16Elements(row0, row1);
1541}
1542
1543inline __m128i SSE::sumAbsoluteDifferences8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
1544{
1545 ocean_assert(image0 && image1);
1546
1547 return _mm_sad_epu8(SSE::load128i(image0), SSE::load128i(image1));
1548}
1549
1550inline __m128i SSE::sumSquareDifference8Bit16ElementsAligned16(const uint8_t* const image0, const uint8_t* const image1)
1551{
1552 ocean_assert(image0 && image1);
1553 ocean_assert((unsigned long long)image0 % 16ll == 0ll);
1554 ocean_assert((unsigned long long)image1 % 16ll == 0ll);
1555
1556 const __m128i row0 = _mm_load_si128((__m128i*)image0);
1557 const __m128i row1 = _mm_load_si128((__m128i*)image1);
1558
1559 return sumSquareDifference8Bit16Elements(row0, row1);
1560}
1561
1562inline __m128i SSE::sumSquareDifference8Bit16Elements(const __m128i& row0, const __m128i& row1)
1563{
1564 // subtract the 16 elements (usage of saturation and bitwise or operator)
1565 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1566
1567 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
1568 const __m128i subtractLow = removeHighBits16_8(subtract);
1569 const __m128i subtractHigh = moveHighBits16_8(subtract);
1570
1571 // square the 16 elements
1572 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1573 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1574
1575 // distribute the 16 elements of 16 bit values into 8 elements of 32 bit values (an itermediate add operation is used)
1576 const __m128i sumSquareLow = _mm_add_epi32(removeHighBits32_16(squareLow), removeHighBits32_16(squareHigh));
1577 const __m128i sumSquareHigh = _mm_add_epi32(moveHighBits32_16(squareLow), moveHighBits32_16(squareHigh));
1578
1579 // 4 32 bit square difference values
1580 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1581}
1582
inline __m128i SSE::interpolation1Channel8Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// Bilinear interpolation of 8 consecutive 1-channel 8 bit pixels.
	// Each factor register holds the same 16 bit factor in all eight lanes,
	// and the four factors sum to 128 * 128 (verified by the asserts below),
	// i.e. the factors use a 1/16384 fixed point notation.

	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: aF yE | yD yC | yB yA | y9 y8 | y7 y6 | y5 y4 | y3 y2 | y1 y0
	// values1: aF' yE' | yD' yC' | yB' yA' | y9' y8' | y7' y6' | y5' y4' | y3' y2' | y1' y0'

	// shuffled elements
	// row0: y7 y6 y5 y4 y3 y2 y1 y0 | * fx_ * fy_
	// row1: y8 y7 y6 y5 y4 y3 y2 y1 | * fx * fy_
	// row2: y7' y6' y5' y4' y3' y2' y1' y0' | * fx_ * fy
	// row3: y8' y7' y6' y5' y4' y3' y2' y1' | * fx * fy

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

	ocean_assert(fx_fy_.m128i_u16[0] + fxfy_.m128i_u16[0] + fx_fy.m128i_u16[0] + fxfy.m128i_u16[0] == 128u * 128u);

#else

#ifdef OCEAN_DEBUG

	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

	ocean_assert(debug_fx_fy_.m128i_u16[0] + debug_fxfy_.m128i_u16[0] + debug_fx_fy.m128i_u16[0] + debug_fxfy.m128i_u16[0] == 128u * 128u);

#endif

	// selector bytes with the high bit set (0xA0) write zero, thus each 8 bit pixel is widened to a 16 bit value
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// 16 bit low and high parts of the 32 bit products row * factor
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// merge low and high parts into 32 bit products for the even and odd lanes
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// now the right neighbor pixels (bytes 1..8) are widened to 16 bit values
	shuffle = set128i(0xA008A007A006A005ull, 0xA004A003A002A001ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128)
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// interleave the even and odd 32 bit results and pack them to eight 8 bit values in the low 64 bits
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
1732
1733inline __m128i SSE::interpolation2Channel16Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
1734{
1735 // F E D C B A 9 8 7 6 5 4 3 2 1 0
1736 // values0: a7 y7 | a6 y6 | a5 y5 | a4 y4 | a3 y3 | a2 y2 | a1 y1 | a0 y0
1737 // values1: a7' y7' | a6' y6' | a5' y5' | a4' y4' | a3' y3' | a2' y2' | a1' y1' | a0' y0'
1738
1739 // shuffled elements
1740 // row0: a3 y3 a2 y2 a1 y1 a0 y0 | * fx_ * fy_
1741 // row1: a4 y4 a3 y3 a2 y2 a1 y1 | * fx * fy_
1742 // row2: a3' y3' a2' y2' a1' y1' a0' y0' | * fx_ * fy
1743 // row3: a4' y4' a3' y3' a2' y2' a1' y1' | * fx * fy
1744
1745#ifdef OCEAN_COMPILER_MSC
1746
1747 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1748 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1749 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1750 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1751 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1752 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1753 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1754
1755 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1756 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1757 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1758 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1759 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1760 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1761 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1762
1763 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1764 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1765 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1766 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1767 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1768 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1769 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1770
1771 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1772 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1773 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1774 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1775 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1776 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1777 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1778
1779#else
1780
1781#ifdef OCEAN_DEBUG
1782
1783 const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
1784 const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
1785 const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
1786 const M128i& debug_fxfy = *(const M128i*)(&fxfy);
1787
1788#endif // OCEAN_DEBUG
1789
1790 ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
1791 ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
1792 ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
1793 ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
1794 ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
1795 ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
1796 ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);
1797
1798 ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
1799 ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
1800 ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
1801 ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
1802 ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
1803 ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
1804 ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);
1805
1806 ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
1807 ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
1808 ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
1809 ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
1810 ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
1811 ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
1812 ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);
1813
1814 ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
1815 ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
1816 ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
1817 ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
1818 ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
1819 ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
1820 ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);
1821
1822#endif
1823
1824 __m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
1825
1826 // row0
1827 __m128i row = _mm_shuffle_epi8(values0, shuffle);
1828
1829 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
1830 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
1831
1832 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
1833 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
1834
1835 // row2
1836 row = _mm_shuffle_epi8(values1, shuffle);
1837
1838 multiLow = _mm_mullo_epi16(row, fx_fy);
1839 multiHigh = _mm_mulhi_epu16(row, fx_fy);
1840
1841 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1842 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1843
1844
1845
1846 shuffle = set128i(0xA009A008A007A006ull, 0xA005A004A003A002ull);
1847
1848 // row1
1849 row = _mm_shuffle_epi8(values0, shuffle);
1850
1851 multiLow = _mm_mullo_epi16(row, fxfy_);
1852 multiHigh = _mm_mulhi_epu16(row, fxfy_);
1853
1854 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1855 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1856
1857
1858 // row4
1859 row = _mm_shuffle_epi8(values1, shuffle);
1860
1861 multiLow = _mm_mullo_epi16(row, fxfy);
1862 multiHigh = _mm_mulhi_epu16(row, fxfy);
1863
1864 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1865 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1866
1867
1868 // normalization ( + 128 * 128 / 2) / (128 * 128)
1869 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
1870 resultEven = _mm_srli_epi32(resultEven, 14);
1871
1872 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
1873 resultOdd = _mm_srli_epi32(resultOdd, 14);
1874
1875 // stack the 2 four 32 bit values together to eight 8 bit values
1876 return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
1877}
1878
// Inline implementation: bilinear interpolation of eight successive 8 bit channel values of a
// 3-channel (24 bit per pixel) row pair; values0 holds the upper row, values1 the lower row.
// The four factor registers hold fixed-point weights (all eight 16 bit lanes identical each);
// the normalization below implies the weights are scaled by 128 so that their products sum to 128 * 128.
inline __m128i SSE::interpolation3Channel24Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: r5 | b4 g4 r4 | b3 g3 r3 | b2 g2 r2 | b1 g1 r1 | b0 g0 r0
	// values1: r5'| b4' g4' r4'| b3' g3' r3'| b2' g2' r2'| b1' g1' r1'| b0' g0' r0'

	// shuffled elements
	// row0: g2 r2 b1 g1 r1 b0 g0 r0 | * fx_ * fy_
	// row1: g3 r3 b2 g2 r2 b1 g1 r1 | * fx * fy_
	// row2: g2' r2' b1' g1' r1' b0' g0' r0' | * fx_ * fy
	// row3: g3' r3' b2' g2' r2' b1' g1' r1' | * fx * fy

	// sanity check: each weight register must hold one identical 16 bit value in all eight lanes

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers do not expose the m128i_u16 member, so access the lanes through the M128i wrapper union
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// shuffle indices with the high bit set (0xA0) make _mm_shuffle_epi8 write a zero byte,
	// so this mask zero-extends the lowest eight bytes into eight 16 bit lanes
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// low and high 16 bit halves of the unsigned 32 bit products row * weight
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// re-assemble full 32 bit products: even-indexed lanes end up in resultEven, odd-indexed lanes in resultOdd
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// the second shuffle starts one 3-channel pixel (3 bytes) later
	shuffle = set128i(0xA00AA009A008A007ull, 0xA006A005A004A003ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128), i.e. round to nearest with 8192 before shifting by 14
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	// (odd results are shifted into byte 1 of each 32 bit lane before the project helper compacts them)
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2024
2025inline __m128i SSE::interpolation1Channel8Bit15Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2026{
2027 __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2028 __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2029
2030 __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2031 __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2032
2033 __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2034 __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2035
2036 __m128i row0_d = _mm_shuffle_epi8(values0, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2037 __m128i row1_d = _mm_shuffle_epi8(values1, set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2038
2039 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2040 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2041 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2042 row0_d = _mm_madd_epi16(row0_d, fx_fy_fxfy_);
2043
2044 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2045 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2046 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2047 row1_d = _mm_madd_epi16(row1_d, fx_fyfxfy);
2048
2049 const __m128i rounding = _mm_set1_epi32(8192);
2050
2051 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2052 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2053 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2054 __m128i row_d = _mm_add_epi32(row0_d, row1_d);
2055
2056 row_a = _mm_add_epi32(row_a, rounding);
2057 row_b = _mm_add_epi32(row_b, rounding);
2058 row_c = _mm_add_epi32(row_c, rounding);
2059 row_d = _mm_add_epi32(row_d, rounding);
2060
2061 row_a = _mm_srli_epi32(row_a, 14);
2062 row_b = _mm_srli_epi32(row_b, 14);
2063 row_c = _mm_srli_epi32(row_c, 14);
2064 row_d = _mm_srli_epi32(row_d, 14);
2065
2066 row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF0c080400ull));
2067 row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFFFFFFFFull, 0x0c080400FFFFFFFFull));
2068 row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0c080400ull, 0xFFFFFFFFFFFFFFFFull));
2069 row_d = _mm_shuffle_epi8(row_d, set128i(0xFF080400FFFFFFFFull, 0xFFFFFFFFFFFFFFFFull));
2070
2071 row_a = _mm_or_si128(row_a, row_b);
2072 row_c = _mm_or_si128(row_c, row_d);
2073
2074 return _mm_or_si128(row_a, row_c);
2075}
2076
2077inline __m128i SSE::interpolation3Channel24Bit12Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_fxfy_, const __m128i& fx_fyfxfy)
2078{
2079 __m128i row0_a = _mm_shuffle_epi8(values0, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2080 __m128i row1_a = _mm_shuffle_epi8(values1, set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2081
2082 __m128i row0_b = _mm_shuffle_epi8(values0, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2083 __m128i row1_b = _mm_shuffle_epi8(values1, set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2084
2085 __m128i row0_c = _mm_shuffle_epi8(values0, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2086 __m128i row1_c = _mm_shuffle_epi8(values1, set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2087
2088 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2089 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2090 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2091
2092 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2093 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2094 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2095
2096 const __m128i rounding = _mm_set1_epi32(8192);
2097
2098 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2099 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2100 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2101
2102 row_a = _mm_add_epi32(row_a, rounding);
2103 row_b = _mm_add_epi32(row_b, rounding);
2104 row_c = _mm_add_epi32(row_c, rounding);
2105
2106 row_a = _mm_srli_epi32(row_a, 14);
2107 row_b = _mm_srli_epi32(row_b, 14);
2108 row_c = _mm_srli_epi32(row_c, 14);
2109
2110 row_a = _mm_shuffle_epi8(row_a, set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
2111 row_b = _mm_shuffle_epi8(row_b, set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
2112 row_c = _mm_shuffle_epi8(row_c, set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
2113
2114 return _mm_or_si128(row_a, _mm_or_si128(row_b, row_c));
2115}
2116
// Inline implementation: bilinear interpolation of eight successive 8 bit channel values of a
// 4-channel (32 bit per pixel) row pair; values0 holds the upper row, values1 the lower row.
// The four factor registers hold fixed-point weights (all eight 16 bit lanes identical each);
// the normalization below implies the weights are scaled by 128 so that their products sum to 128 * 128.
inline __m128i SSE::interpolation4Channel32Bit8Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
	// values1: a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'

	// shuffled elements
	// row0: a1 b1 g1 r1 a0 b0 g0 r0 | * fx_ * fy_
	// row1: a2 b2 g2 r2 a1 b1 g1 r1 | * fx * fy_
	// row2: a1' b1' g1' r1' a0' b0' g0' r0' | * fx_ * fy
	// row3: a2' b2' g2' r2' a1' b1' g1' r1' | * fx * fy

	// sanity check: each weight register must hold one identical 16 bit value in all eight lanes

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers do not expose the m128i_u16 member, so access the lanes through the M128i wrapper union
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// shuffle indices with the high bit set (0xA0) make _mm_shuffle_epi8 write a zero byte,
	// so this mask zero-extends the lowest eight bytes into eight 16 bit lanes
	__m128i shuffle = set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// low and high 16 bit halves of the unsigned 32 bit products row * weight
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// re-assemble full 32 bit products: even-indexed lanes end up in resultEven, odd-indexed lanes in resultOdd
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// the second shuffle starts one 4-channel pixel (4 bytes) later
	shuffle = set128i(0xA00BA00AA009A008ull, 0xA007A006A005A004ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128), i.e. round to nearest with 8192 before shifting by 14
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	// (odd results are shifted into byte 1 of each 32 bit lane before the project helper compacts them)
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2262
2263
// Inline implementation: bilinear interpolation of two separate 4-channel (32 bit) pixels at once;
// the shuffles below pair pixel 0 with pixel 1 and pixel 2 with pixel 3 of each row.
// The four factor registers hold fixed-point weights (all eight 16 bit lanes identical each);
// the normalization below implies the weights are scaled by 128 so that their products sum to 128 * 128.
inline __m128i SSE::interpolation4Channel32Bit2x4Elements(const __m128i& values0, const __m128i& values1, const __m128i& fx_fy_, const __m128i& fxfy_, const __m128i& fx_fy, const __m128i& fxfy)
{
	// F E D C B A 9 8 7 6 5 4 3 2 1 0
	// values0: a3 b3 g3 r3 | a2 b2 g2 r2 | a1 b1 g1 r1 | a0 b0 g0 r0
	// values1: a3' b3' g3' r3'| a2' b2' g2' r2'| a1' b1' g1' r1'| a0' b0' g0' r0'

	// shuffled elements
	// row0: a2 b2 g2 r2 a0 b0 g0 r0 | * fx_ * fy_
	// row1: a3 b3 g3 r3 a1 b1 g1 r1 | * fx * fy_
	// row2: a2' b2' g2' r2' a0' b0' g0' r0' | * fx_ * fy
	// row3: a3' b3' g3' r3' a1' b1' g1' r1' | * fx * fy

	// sanity check: each weight register must hold one identical 16 bit value in all eight lanes

#ifdef OCEAN_COMPILER_MSC

	ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
	ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
	ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
	ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
	ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
	ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
	ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);

	ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
	ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
	ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
	ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
	ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
	ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
	ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);

	ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
	ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
	ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
	ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
	ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
	ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
	ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);

	ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
	ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
	ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
	ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
	ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
	ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
	ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);

#else

#ifdef OCEAN_DEBUG

	// non-MSC compilers do not expose the m128i_u16 member, so access the lanes through the M128i wrapper union
	const M128i& debug_fx_fy_ = *(const M128i*)(&fx_fy_);
	const M128i& debug_fx_fy = *(const M128i*)(&fx_fy);
	const M128i& debug_fxfy_ = *(const M128i*)(&fxfy_);
	const M128i& debug_fxfy = *(const M128i*)(&fxfy);

#endif // OCEAN_DEBUG

	ocean_assert(debug_fx_fy_.m128i_u16[0] == debug_fx_fy_.m128i_u16[1]);
	ocean_assert(debug_fx_fy_.m128i_u16[1] == debug_fx_fy_.m128i_u16[2]);
	ocean_assert(debug_fx_fy_.m128i_u16[2] == debug_fx_fy_.m128i_u16[3]);
	ocean_assert(debug_fx_fy_.m128i_u16[3] == debug_fx_fy_.m128i_u16[4]);
	ocean_assert(debug_fx_fy_.m128i_u16[4] == debug_fx_fy_.m128i_u16[5]);
	ocean_assert(debug_fx_fy_.m128i_u16[5] == debug_fx_fy_.m128i_u16[6]);
	ocean_assert(debug_fx_fy_.m128i_u16[6] == debug_fx_fy_.m128i_u16[7]);

	ocean_assert(debug_fxfy_.m128i_u16[0] == debug_fxfy_.m128i_u16[1]);
	ocean_assert(debug_fxfy_.m128i_u16[1] == debug_fxfy_.m128i_u16[2]);
	ocean_assert(debug_fxfy_.m128i_u16[2] == debug_fxfy_.m128i_u16[3]);
	ocean_assert(debug_fxfy_.m128i_u16[3] == debug_fxfy_.m128i_u16[4]);
	ocean_assert(debug_fxfy_.m128i_u16[4] == debug_fxfy_.m128i_u16[5]);
	ocean_assert(debug_fxfy_.m128i_u16[5] == debug_fxfy_.m128i_u16[6]);
	ocean_assert(debug_fxfy_.m128i_u16[6] == debug_fxfy_.m128i_u16[7]);

	ocean_assert(debug_fx_fy.m128i_u16[0] == debug_fx_fy.m128i_u16[1]);
	ocean_assert(debug_fx_fy.m128i_u16[1] == debug_fx_fy.m128i_u16[2]);
	ocean_assert(debug_fx_fy.m128i_u16[2] == debug_fx_fy.m128i_u16[3]);
	ocean_assert(debug_fx_fy.m128i_u16[3] == debug_fx_fy.m128i_u16[4]);
	ocean_assert(debug_fx_fy.m128i_u16[4] == debug_fx_fy.m128i_u16[5]);
	ocean_assert(debug_fx_fy.m128i_u16[5] == debug_fx_fy.m128i_u16[6]);
	ocean_assert(debug_fx_fy.m128i_u16[6] == debug_fx_fy.m128i_u16[7]);

	ocean_assert(debug_fxfy.m128i_u16[0] == debug_fxfy.m128i_u16[1]);
	ocean_assert(debug_fxfy.m128i_u16[1] == debug_fxfy.m128i_u16[2]);
	ocean_assert(debug_fxfy.m128i_u16[2] == debug_fxfy.m128i_u16[3]);
	ocean_assert(debug_fxfy.m128i_u16[3] == debug_fxfy.m128i_u16[4]);
	ocean_assert(debug_fxfy.m128i_u16[4] == debug_fxfy.m128i_u16[5]);
	ocean_assert(debug_fxfy.m128i_u16[5] == debug_fxfy.m128i_u16[6]);
	ocean_assert(debug_fxfy.m128i_u16[6] == debug_fxfy.m128i_u16[7]);

#endif

	// shuffle indices with the high bit set (0xA0) make _mm_shuffle_epi8 write a zero byte; this mask
	// zero-extends the bytes of pixel 0 (bytes 0-3) and pixel 2 (bytes 8-11) into eight 16 bit lanes
	__m128i shuffle = set128i(0xA00BA00AA009A008ull, 0xA003A002A001A000ull);

	// row0
	__m128i row = _mm_shuffle_epi8(values0, shuffle);

	// low and high 16 bit halves of the unsigned 32 bit products row * weight
	__m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
	__m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);

	// re-assemble full 32 bit products: even-indexed lanes end up in resultEven, odd-indexed lanes in resultOdd
	__m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA); // 0xAA = 1010 1010
	__m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);

	// row2
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fx_fy);
	multiHigh = _mm_mulhi_epu16(row, fx_fy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));



	// the second shuffle selects the right neighbors: pixel 1 (bytes 4-7) and pixel 3 (bytes 12-15)
	shuffle = set128i(0xA00FA00EA00DA00Cull, 0xA007A006A005A004ull);

	// row1
	row = _mm_shuffle_epi8(values0, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy_);
	multiHigh = _mm_mulhi_epu16(row, fxfy_);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// row3
	row = _mm_shuffle_epi8(values1, shuffle);

	multiLow = _mm_mullo_epi16(row, fxfy);
	multiHigh = _mm_mulhi_epu16(row, fxfy);

	resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
	resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));


	// normalization ( + 128 * 128 / 2) / (128 * 128), i.e. round to nearest with 8192 before shifting by 14
	resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
	resultEven = _mm_srli_epi32(resultEven, 14);

	resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
	resultOdd = _mm_srli_epi32(resultOdd, 14);

	// stack the 2 four 32 bit values together to eight 8 bit values
	// (odd results are shifted into byte 1 of each 32 bit lane before the project helper compacts them)
	return moveLowBits32_16ToLow64(_mm_or_si128(resultEven, _mm_slli_si128(resultOdd, 1)));
}
2409
2410inline void SSE::average8Elements1Channel32Bit2x2(const float* const image0, const float* const image1, float* const result)
2411{
2412 ocean_assert(image0 && image1);
2413
2414 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2415 const __m128 row0 = _mm_loadu_ps(image0);
2416 const __m128 row1 = _mm_loadu_ps(image1);
2417
2418 // get sum of first 4 elements
2419 const __m128 sumFirst = _mm_add_ps(row0, row1);
2420
2421 // load next 4 elements
2422 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2423 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2424
2425 // get sum of second 4 elements
2426 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2427
2428 // get sum of adjacent summed pixels
2429 const __m128 sumAdjacent = _mm_hadd_ps(sumFirst, sumSecond);
2430
2431 /* following variant is exactly as fast as _mm_hadd_ps(,) ~ 0.30ms / 100,000 iteration
2432 const unsigned int mask10001000 = 136u;
2433 const unsigned int mask11011101 = 221u;
2434 const __m128 sumAdjacent = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, mask10001000), _mm_shuffle_ps(sumFirst, sumSecond, mask11011101));
2435 */
2436
2437 // divide by 4 --> multiply by 0.25
2438 const __m128 division = _mm_mul_ps(sumAdjacent, _mm_set_ps1(0.25f));
2439
2440 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2441 _mm_storeu_ps(result, division);
2442}
2443
2444inline void SSE::average8Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2445{
2446 ocean_assert(image0 && image1);
2447
2448 // 16 * uchar = m128i, but only the first 8 elements are set
2449 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2450 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2451
2452 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2453 const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2454 const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2455
2456 // build overall sum and add 2 for rounding
2457 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2458
2459 // divide by 4 by right shifting of two bits
2460 const __m128i division16 = _mm_srli_epi16(sum, 2);
2461
2462 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2463 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2464
2465 memcpy(result, &division8, sizeof(uint8_t) * 4);
2466}
2467
2468inline void SSE::average8ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2469{
2470 ocean_assert(image0 != nullptr && image1 != nullptr);
2471 ocean_assert(threshold >= 1u);
2472
2473 // we load the first 8 elements, the uppper 8 bytes will be set to zero
2474 const __m128i row0_u_8x8 = _mm_loadl_epi64((__m128i*)image0);
2475 const __m128i row1_u_8x8 = _mm_loadl_epi64((__m128i*)image1);
2476
2477 const __m128i row0_u_16x8 = _mm_cvtepu8_epi16(row0_u_8x8); // converting the lower 8 bytes to 16 byte values
2478 const __m128i row1_u_16x8 = _mm_cvtepu8_epi16(row1_u_8x8);
2479
2480 const __m128i verticalSum_u_16x8 = _mm_adds_epu16(row0_u_16x8, row1_u_16x8);
2481 const __m128i sum_u_16x8 = _mm_hadd_epi16(verticalSum_u_16x8, verticalSum_u_16x8);
2482
2483 const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2484
2485 const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);
2486
2487 memcpy(result, &mask_u_8x8, sizeof(uint8_t) * 4);
2488}
2489
2490inline void SSE::average16Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2491{
2492 ocean_assert(image0 && image1);
2493
2494 // 16 * uchar = m128i
2495 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2496 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2497
2498 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2499 const __m128i sumLow = _mm_add_epi16(removeHighBits16_8(row0), removeHighBits16_8(row1));
2500 const __m128i sumHigh = _mm_add_epi16(moveHighBits16_8(row0), moveHighBits16_8(row1));
2501
2502 // build overall sum and add 2 for rounding
2503 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(int(0x00020002))));
2504
2505 // divide by 4 by right shifting of two bits
2506 const __m128i division16 = _mm_srli_epi16(sum, 2);
2507
2508 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2509 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2510
2511 // copy the lower 64 bit to the memory
2512 _mm_storel_epi64((__m128i*)result, division8);
2513
2514 /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2515 const __m128i avgRows = _mm_avg_epu8(row0, row1);
2516 const __m128i avgRowsSwap = _mm_or_si128(_mm_slli_epi16(avgRows, 8), _mm_srli_epi16(avgRows, 8));
2517
2518 const __m128i avg = _mm_avg_epu8(avgRows, avgRowsSwap); // 1 result in 2 uchar
2519 const __m128i avgOrdered = _mm_shuffle_epi8(avg, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2520
2521 _mm_storel_epi64((__m128i*)result, avgOrdered);
2522 */
2523}
2524
2525inline void SSE::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2526{
2527 ocean_assert(image0 != nullptr && image1 != nullptr);
2528 ocean_assert(threshold >= 1u);
2529
2530 // 16 * uchar = m128i
2531 const __m128i row0_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2532 const __m128i row1_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2533
2534 const __m128i horizontalSum0_u_16x8 = _mm_maddubs_epi16(row0_u_8x16, _mm_set1_epi8(1));
2535 const __m128i horizontalSum1_u_16x8 = _mm_maddubs_epi16(row1_u_8x16, _mm_set1_epi8(1));
2536
2537 const __m128i sum_u_16x8 = _mm_add_epi16(horizontalSum0_u_16x8, horizontalSum1_u_16x8);
2538
2539 const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2540
2541 const __m128i mask_u_8x8 = moveLowBits16_8ToLow64(mask_u_16x8);
2542
2543 // copy the lower 64 bit to the memory
2544 _mm_storel_epi64((__m128i*)result, mask_u_8x8);
2545}
2546
2547inline void SSE::average32Elements1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2548{
2549 ocean_assert(image0 && image1);
2550
2551 // first 16 elements
2552 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2553 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2554
2555 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2556 const __m128i firstSumLow = _mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1));
2557 const __m128i firstSumHigh = _mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1));
2558
2559 // build overall sum and add 2 for rounding
2560 const __m128i firstSum = _mm_add_epi16(firstSumLow, _mm_add_epi16(firstSumHigh, _mm_set1_epi32(int(0x00020002))));
2561
2562 // divide by 4 by right shifting of two bits
2563 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2564
2565 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2566 const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2567
2568 // second 16 elements
2569 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2570 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2571
2572 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum
2573 const __m128i secondSumLow = _mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1));
2574 const __m128i secondSumHigh = _mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1));
2575
2576 // build overall sum and add 2 for rounding
2577 const __m128i secondSum = _mm_add_epi16(secondSumLow, _mm_add_epi16(secondSumHigh, _mm_set1_epi32(int(0x00020002))));
2578
2579 // divide by 4 by right shifting of two bits
2580 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2581
2582 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2583 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2584
2585
2586 // combine both divion results
2587 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2588
2589 // copy the 128 bit to the memory
2590 _mm_storeu_si128((__m128i*)result, division8);
2591
2592 /* using _mm_avg_epu8 is a bit faster (~3%) but result is always rounded up
2593 const __m128i avgFirstRows = _mm_avg_epu8(firstRow0, firstRow1);
2594 const __m128i avgFirstRowsSwap = _mm_or_si128(_mm_slli_epi16(avgFirstRows, 8), _mm_srli_epi16(avgFirstRows, 8));
2595
2596 const __m128i avgFirst = _mm_avg_epu8(avgFirstRows, avgFirstRowsSwap); // 1 result in 2 uchar
2597 const __m128i avgFristOrdered = _mm_shuffle_epi8(avgFirst, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 6, 4, 2, 0));
2598
2599 const __m128i avgSecondRows = _mm_avg_epu8(secondRow0, secondRow1);
2600 const __m128i avgSecondRowsSwap = _mm_or_si128(_mm_slli_epi16(avgSecondRows, 8), _mm_srli_epi16(avgSecondRows, 8));
2601
2602 const __m128i avgSecond = _mm_avg_epu8(avgSecondRows, avgSecondRowsSwap); // 1 result in 2 uchar
2603 const __m128i avgSecondOrdered = _mm_shuffle_epi8(avgSecond, _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0));
2604
2605 // combine both divion results
2606 const __m128i combinedAvg = _mm_or_si128(avgFristOrdered, avgSecondOrdered);
2607
2608 // copy the 128 bit to the memory
2609 _mm_storeu_si128((__m128i*)result, combinedAvg);
2610 */
2611}
2612
2613inline void SSE::average32ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint16_t threshold)
2614{
2615 ocean_assert(image0 != nullptr && image1 != nullptr);
2616 ocean_assert(threshold >= 1u);
2617
2618 // load first 16 uchars
2619 const __m128i row0A_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2620 const __m128i row1A_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2621
2622 const __m128i horizontalSum0A_u_16x8 = _mm_maddubs_epi16(row0A_u_8x16, _mm_set1_epi8(1));
2623 const __m128i horizontalSum1A_u_16x8 = _mm_maddubs_epi16(row1A_u_8x16, _mm_set1_epi8(1));
2624
2625 const __m128i sumA_u_16x8 = _mm_add_epi16(horizontalSum0A_u_16x8, horizontalSum1A_u_16x8);
2626
2627 const __m128i maskA_u_16x8 = _mm_cmpgt_epi16(sumA_u_16x8, _mm_set1_epi16(short(threshold - 1)));
2628
2629 const __m128i row0B_u_8x16 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2630 const __m128i row1B_u_8x16 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2631
2632 const __m128i horizontalSum0B_u_16x8 = _mm_maddubs_epi16(row0B_u_8x16, _mm_set1_epi8(1));
2633 const __m128i horizontalSum1B_u_16x8 = _mm_maddubs_epi16(row1B_u_8x16, _mm_set1_epi8(1));
2634
2635 const __m128i sumB_u_16x8 = _mm_add_epi16(horizontalSum0B_u_16x8, horizontalSum1B_u_16x8);
2636
2637 const __m128i maskB_u_16x8 = _mm_cmpgt_epi16(sumB_u_16x8, _mm_set1_epi16(short(threshold - 1u)));
2638
2639 const __m128i mask_u_8x16 = _mm_or_si128(moveLowBits16_8ToLow64(maskA_u_16x8), moveLowBits16_8ToHigh64(maskB_u_16x8));
2640
2641 // copy the 128 bit to the memory
2642 _mm_storeu_si128((__m128i*)result, mask_u_8x16);
2643}
2644
2645inline void SSE::average8Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2646{
2647 ocean_assert(image0 && image1);
2648
2649 // 16 * uchar = m128i, but only the first 8 elements are set
2650 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2651 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2652
2653 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2654 const __m128i shuffledRow0 = shuffleNeighbor2Low64BitsToLow16_8(row0);
2655 const __m128i shuffledRow1 = shuffleNeighbor2Low64BitsToLow16_8(row1);
2656
2657 // build sum and add 2 for rounding
2658 const __m128i sumLow = _mm_add_epi16(shuffledRow0, shuffledRow1);
2659 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumLow), _mm_set1_epi32(int(0x00020002)));
2660
2661 // divide by 4 by right shifting of two bits
2662 const __m128i division16 = _mm_srli_epi16(sum, 2);
2663
2664 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2665 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2666
2667 memcpy(result, &division8, sizeof(uint8_t) * 4);
2668}
2669
2670inline void SSE::average8Elements2Channel64Bit2x2(const float* const image0, const float* const image1, float* const result)
2671{
2672 ocean_assert(image0 && image1);
2673
2674 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2675 const __m128 row0 = _mm_loadu_ps(image0);
2676 const __m128 row1 = _mm_loadu_ps(image1);
2677
2678 // get sum of first 4 elements
2679 const __m128 sumFirst = _mm_add_ps(row0, row1);
2680
2681 // load next 4 elements
2682 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2683 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2684
2685 // get sum of second 4 elements
2686 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2687
2688 // get sum of summed pixels
2689 // mask01000100 = 68u
2690 // mask11101110 = 238u
2691 const __m128 sumComponents = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, 68u), _mm_shuffle_ps(sumFirst, sumSecond, 238u));
2692
2693 // divide by 4 --> multiply by 0.25
2694 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2695
2696 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2697 _mm_storeu_ps(result, division);
2698}
2699
2700inline void SSE::average16Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2701{
2702 ocean_assert(image0 && image1);
2703
2704 // 16 * uchar = m128i
2705 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2706 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2707
2708 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2709 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2710 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2711
2712 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2713 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2714
2715 // divide by 4 by right shifting of two bits
2716 const __m128i division16 = _mm_srli_epi16(sum, 2);
2717
2718 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2719 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2720
2721 // copy the lower 64 bit to the memory
2722 _mm_storel_epi64((__m128i*)result, division8);
2723}
2724
2725inline void SSE::average32Elements2Channel16Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2726{
2727 ocean_assert(image0 && image1);
2728
2729 // first 16 elements: 16 * uchar = m128i
2730 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2731 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2732
2733 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2734 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(row0), shuffleNeighbor2Low64BitsToLow16_8(row1));
2735 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(row0), shuffleNeighbor2High64BitsToLow16_8(row1));
2736
2737 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2738 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2739
2740 // divide by 4 by right shifting of two bits
2741 const __m128i division16 = _mm_srli_epi16(sum, 2);
2742
2743 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2744 const __m128i firstDivision8 = moveLowBits16_8ToLow64(division16);
2745
2746 // second 16 elements
2747 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2748 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2749
2750 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2751 const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor2Low64BitsToLow16_8(secondRow0), shuffleNeighbor2Low64BitsToLow16_8(secondRow1));
2752 const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor2High64BitsToLow16_8(secondRow0), shuffleNeighbor2High64BitsToLow16_8(secondRow1));
2753
2754 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2755 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2756
2757 // divide by 4 by right shifting of two bits
2758 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2759
2760 // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2761 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2762
2763
2764 // combine both divion results
2765 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2766
2767 // copy the 128 bit to the memory
2768 _mm_storeu_si128((__m128i*)result, division8);
2769}
2770
inline void SSE::average6Elements3Channel96Bit2x2(const float* const image0, const float* const image1, float* const result)
{
	// Averages one 2x2 block of 3-channel float pixels (two pixels per row) into one 3-channel pixel.
	ocean_assert(image0 && image1 && result);

	// 6 * float = 2 pixel: 00 01 02 03 04 05

	// load element 0 up to 3, input does not need to be aligned on any particular boundary.
	const __m128 row0 = _mm_loadu_ps(image0);
	const __m128 row1 = _mm_loadu_ps(image1);

	// get sum of first 4 elements
	const __m128 sumFirst = _mm_add_ps(row0, row1);

	// load element 2 up to 5 to prevent that we access memory out of our range
	// (the loads overlap, elements 2 and 3 are read twice)
	const __m128 rowSecond0 = _mm_loadu_ps(image0 + 2);
	const __m128 rowSecond1 = _mm_loadu_ps(image1 + 2);

	// get sum of second 4 elements
	const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);

	// get sum of summed pixels
	// NOTE: _mm_shuffle_ps resulting first 64bit are always from first __m128, second 64bit from second __m128
	// shuffle mask 0b00111001 = 57u rotates the four floats down by one lane, so lanes 0..2 of the
	// shuffled register hold the channel sums of the second pixel (elements 3, 4, 5)
	const __m128 sumComponents = _mm_add_ps(sumFirst, _mm_shuffle_ps(sumSecond, sumSecond, 57u));

	// divide by 4 --> multiply by 0.25
	const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));

	// store 3 elements (96 bit) to the memory; lane 3 holds an unused value and is not written

#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division.m128_f32[0], sizeof(float) * 3);
#else
	memcpy(result, &division, sizeof(float) * 3);
#endif
}
2807
inline void SSE::average24Elements3Channel24Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
{
	// Averages 2x2 blocks of 3-channel 8 bit pixels: 24 elements (8 pixels) per row become 12 elements (4 pixels).
	ocean_assert(image0 && image1 && result);

	__m128i row0 = _mm_lddqu_si128((__m128i*)image0);
	__m128i row1 = _mm_lddqu_si128((__m128i*)image1);

	// distribute the first 12 elements (element 00 up to 11):
	// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
	//
	// -- -- -- -- -- 08 -- 07 -- 06 -- 02 -- 01 -- 00   (channels of the left pixels)
	// -- -- -- -- -- 11 -- 10 -- 09 -- 05 -- 04 -- 03   (channels of the right pixels)
	// 0xA0 mask bytes have the high bit set, so _mm_shuffle_epi8 zeroes those destination bytes

	__m128i shuffleMaskLow = set128i(0xA0A0A0A0A008A007ull, 0xA006A002A001A000ull);
	__m128i shuffleMaskHigh = set128i(0xA0A0A0A0A00BA00Aull, 0xA009A005A004A003ull);

	// widen to 16 bit lanes and sum both rows, separately for the left and right pixel of each 2x2 block
	__m128i sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
	__m128i sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));

	// add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
	__m128i sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));

	// divide by 4 by right shifting of two bits
	__m128i division16 = _mm_srli_epi16(sum, 2);

	// compact the low byte of the six valid 16 bit lanes into the lowest 6 bytes
	__m128i division8 = _mm_shuffle_epi8(division16, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A00A0806040200ull));


	// now we load the remaining 12 elements (however, this time we take element 04 up to 15 to prevent that we access memory out of our range)
	// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
	//
	// -- -- -- -- -- 12 -- 11 -- 10 -- 06 -- 05 -- 04
	// -- -- -- -- -- 15 -- 14 -- 13 -- 09 -- 08 -- 07

	row0 = _mm_lddqu_si128((__m128i*)(image0 + 8));
	row1 = _mm_lddqu_si128((__m128i*)(image1 + 8));

	shuffleMaskLow = set128i(0xA0A0A0A0A00CA00Bull, 0xA00AA006A005A004ull);
	shuffleMaskHigh = set128i(0xA0A0A0A0A00FA00Eull, 0xA00DA009A008A007ull);

	sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
	sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));

	// add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
	sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));

	// divide by 4 by right shifting of two bits
	division16 = _mm_srli_epi16(sum, 2);

	// place the second six result bytes at byte positions 6..11, behind the six bytes already in 'division8'
	division8 = _mm_or_si128(division8, _mm_shuffle_epi8(division16, set128i(0xA0A0A0A00A080604ull, 0x0200A0A0A0A0A0A0ull)));

#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division8.m128i_u8[0], 12);
#else
	memcpy(result, &division8, 12);
#endif
}
2867
2868inline void SSE::average8Elements4Channel128Bit2x2(const float* const image0, const float* const image1, float* const result)
2869{
2870 ocean_assert(image0 && image1);
2871
2872 // 4 * float = m128, input does not need to be aligned on any particular boundary.
2873 const __m128 row0 = _mm_loadu_ps(image0);
2874 const __m128 row1 = _mm_loadu_ps(image1);
2875
2876 // get sum of first 4 elements
2877 const __m128 sumFirstPixel = _mm_add_ps(row0, row1);
2878
2879 // load next 4 elements
2880 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2881 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2882
2883 // get sum of second 4 elements
2884 const __m128 sumSecondPixel = _mm_add_ps(rowSecond0, rowSecond1);
2885
2886 // get sum of summed pixels
2887 const __m128 sumComponents = _mm_add_ps(sumFirstPixel, sumSecondPixel);
2888
2889 // divide by 4 --> multiply by 0.25
2890 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2891
2892 // store 4 elements (128 bit) to the memory, output does not need to be aligned on any particular boundary.
2893 _mm_storeu_ps(result, division);
2894}
2895
2896inline void SSE::average16Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2897{
2898 ocean_assert(image0 && image1);
2899
2900 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2901 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2902
2903 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2904 const __m128i sumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(row0), shuffleNeighbor4Low64BitsToLow16_8(row1));
2905 const __m128i sumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(row0), shuffleNeighbor4High64BitsToLow16_8(row1));
2906
2907 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2908 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(int(0x00020002)));
2909
2910 // divide by 4 by right shifting of two bits
2911 const __m128i division16 = _mm_srli_epi16(sum, 2);
2912
2913 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2914 const __m128i division8 = moveLowBits16_8ToLow64(division16);
2915
2916 // copy the lower 64 bit to the memory
2917 _mm_storel_epi64((__m128i*)result, division8);
2918}
2919
2920inline void SSE::average32Elements4Channel32Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result)
2921{
2922 ocean_assert(image0 && image1);
2923
2924 // first 16 elements
2925 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2926 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2927
2928 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2929 const __m128i firstSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(firstRow0), shuffleNeighbor4Low64BitsToLow16_8(firstRow1));
2930 const __m128i firstSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(firstRow0), shuffleNeighbor4High64BitsToLow16_8(firstRow1));
2931
2932 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2933 const __m128i firstSum = _mm_add_epi16(_mm_hadd_epi16(firstSumLow, firstSumHigh), _mm_set1_epi32(int(0x00020002)));
2934
2935 // divide by 4 by right shifting of two bits
2936 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2937
2938 // shift the lower 8 bit of the eight 16 bit values to the lower 64 bit
2939 const __m128i firstDivision8 = moveLowBits16_8ToLow64(firstDivision16);
2940
2941
2942 // second 16 elements
2943 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2944 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2945
2946 // distribute the 8 elements of 8 bit values into 8 elements of 16 bit values
2947 const __m128i secondSumLow = _mm_add_epi16(shuffleNeighbor4Low64BitsToLow16_8(secondRow0), shuffleNeighbor4Low64BitsToLow16_8(secondRow1));
2948 const __m128i secondSumHigh = _mm_add_epi16(shuffleNeighbor4High64BitsToLow16_8(secondRow0), shuffleNeighbor4High64BitsToLow16_8(secondRow1));
2949
2950 // add neighboring 16 bit elements together to new 16 bit elements and add 2 for rounding to each new element
2951 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(int(0x00020002)));
2952
2953 // divide by 4 by right shifting of two bits
2954 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2955
2956 // shift the lower 8 bit of the eight 16 bit values to the higher 64 bit
2957 const __m128i secondDivision8 = moveLowBits16_8ToHigh64(secondDivision16);
2958
2959
2960 // combine both divion results
2961 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2962
2963 // copy the 128 bit to the memory
2964 _mm_storeu_si128((__m128i*)result, division8);
2965}
2966
inline void SSE::average30Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
{
	ocean_assert(image0 && image1 && image2);

	/**
	 * Applies this normalized 3x3 filter kernel to three rows of elements, writing 10 filter responses:
	 * | 1 2 1 |
	 * 1/16 | 2 4 2 |
	 * | 1 2 1 |
	 */

	// first 16 elements (actual 14 are used)
	const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
	const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
	const __m128i firstRow2 = _mm_lddqu_si128((__m128i*)image2);

	// distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum,
	// middle row is summed twice (realizing the vertical filter weights 1 2 1)
	const __m128i firstSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(firstRow0), removeHighBits16_8(firstRow1)), _mm_add_epi16(removeHighBits16_8(firstRow1), removeHighBits16_8(firstRow2)));
	const __m128i firstSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(firstRow0), moveHighBits16_8(firstRow1)), _mm_add_epi16(moveHighBits16_8(firstRow1), moveHighBits16_8(firstRow2)));

	// second 16 elements, starting from 15th element (the loads overlap by two bytes)
	const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 14));
	const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 14));
	const __m128i secondRow2 = _mm_lddqu_si128((__m128i*)(image2 + 14));

	// distribute the 16 elements of 8 bit values into 16 elements of 16 bit values and create the sum, middle row is summed twice
	const __m128i secondSumEven = _mm_add_epi16(_mm_add_epi16(removeHighBits16_8(secondRow0), removeHighBits16_8(secondRow1)), _mm_add_epi16(removeHighBits16_8(secondRow1), removeHighBits16_8(secondRow2)));
	const __m128i secondSumOdd = _mm_add_epi16(_mm_add_epi16(moveHighBits16_8(secondRow0), moveHighBits16_8(secondRow1)), _mm_add_epi16(moveHighBits16_8(secondRow1), moveHighBits16_8(secondRow2)));

	// build overall sum and add 8 for rounding
	// positions 0, 2, 3, 5, 6 are valid, e.g. pos. 0 contains element00 + element01
	const __m128i firstSum = _mm_add_epi16(firstSumEven, _mm_add_epi16(firstSumOdd, _mm_set1_epi32(int(0x00080008))));
	// e.g. pos. 0 contains now element00 + element01 + element02
	const __m128i firstSumWithEven = _mm_add_epi16(firstSum, _mm_shuffle_epi8(firstSumEven, set128i(0xFFFF0F0E0B0AFFFFull, 0x09080504FFFF0302ull)));
	// e.g. pos. 0 contains now element00 + element01 + element02 + element01 (completing the horizontal weights 1 2 1)
	const __m128i firstSumWithBoth = _mm_add_epi16(firstSumWithEven, _mm_shuffle_epi8(firstSumOdd, set128i(0xFFFF0D0C0908FFFFull, 0x07060302FFFF0100ull)));

	// build overall sum and add 8 for rounding
	// positions 1, 2, 4, 5, 7 are valid
	const __m128i secondSum = _mm_add_epi16(secondSumEven, _mm_add_epi16(secondSumOdd, _mm_set1_epi32(int(0x00080008))));
	const __m128i secondSumWithEven = _mm_add_epi16(secondSum, _mm_shuffle_epi8(secondSumEven, set128i(0x0F0EFFFF0D0C0908ull, 0xFFFF07060302FFFFull)));
	const __m128i secondSumWithBoth = _mm_add_epi16(secondSumWithEven, _mm_shuffle_epi8(secondSumOdd, set128i(0x0D0CFFFF0B0A0706ull, 0xFFFF05040100FFFFull)));

	// divide by 16 by right shifting of four bits
	const __m128i firstDivision16 = _mm_srli_epi16(firstSumWithBoth, 4);
	const __m128i secondDivision16 = _mm_srli_epi16(secondSumWithBoth, 4);

	// reorder valid elements to lowest bits
	const __m128i firstDivision8 = _mm_shuffle_epi8(firstDivision16, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0C0A060400ull));
	const __m128i secondDivision8 = _mm_shuffle_epi8(secondDivision16, set128i(0xFFFFFFFFFFFF0E0Aull, 0x080402FFFFFFFFFFull));

	// combine both division results
	const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);

	// copy the lowest 10*8 bit to the memory
#ifdef OCEAN_COMPILER_MSC
	memcpy(result, &division8.m128i_u8[0], 10);
#else
	memcpy(result, &division8, 10);
#endif
}
3027
3029{
3030 /**
3031 * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3032 * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3033 * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3034 */
3035
3036 // We create a bit mask for all 16 bit odd values, an odd value will create an active lower bit in each 16 bit value
3037 const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0001000100010001ull, 0x0001000100010001ull));
3038
3039 // We create a bit mask for all 16 bit negative values, a negative value will create an active lower bit in each 16 bit value
3040 const __m128i maskNegatives = _mm_srli_epi16(_mm_and_si128(value, CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull)), 15);
3041
3042 // We add 1 to each 16 bit value having an active 'odd-bit' and active
3043 // 'negative-bit'
3044 return _mm_add_epi16(value, _mm_and_si128(maskNegatives, maskOdds));
3045}
3046
3047inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3048{
3049 ocean_assert(rightShifts < 16u);
3050
3051 // the offset for negative values: 2^shifts - 1
3052 const __m128i offsetForNegatives_s_16x8 = _mm_set1_epi16(short((1u << rightShifts) - 1u));
3053
3054 // bit mask for all 16 bit negative values
3055 const __m128i maskHigh_s_16x8 = CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull);
3056
3057 // 0x0000 for positive values, 0xFFFF for negative values
3058 const __m128i maskNegativeValues_s_16x8 = _mm_cmpeq_epi16(_mm_and_si128(value, maskHigh_s_16x8), maskHigh_s_16x8);
3059
3060 // 0 for positive values, 2^shifts - 1 for negative values
3061 const __m128i offset_s_16x8 = _mm_and_si128(offsetForNegatives_s_16x8, maskNegativeValues_s_16x8);
3062
3063 return _mm_add_epi16(value, offset_s_16x8);
3064}
3065
3066inline __m128i SSE::divideByRightShiftSigned16Bit(const __m128i& value, const unsigned int rightShifts)
3067{
3068 return _mm_srai_epi16(addOffsetBeforeRightShiftDivisionSigned16Bit(value, rightShifts), int(rightShifts));
3069}
3070
3072{
3073 /**
3074 * SSE does not have an intrinsic for integer division, so right bit shift is used instead.
3075 * Unfortunately, for negative odd integer values v: (v / 2) != (v >> 1) because a right shift rounds towards negative infinity, e.g. -5 / 2 = -2 and -5 >> 1 = -3.
3076 * As a work-around, an offset of 1 is added to all values that are both, negative and odd.
3077 */
3078
3079 // We create a bit mask for all 32 bit odd values, an odd value will create an active lower bit in each 32 bit value
3080 const __m128i maskOdds = _mm_and_si128(value, CV::SSE::set128i(0x0000000100000001ull, 0x0000000100000001ull));
3081
3082 // We create a bit mask for all 32 bit negative values, a negative value will create an active lower bit in each 32 bit value
3083 const __m128i maskNegatives = _mm_srli_epi32(_mm_and_si128(value, CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull)), 31);
3084
3085 // We add 1 to each 32 bit value having an active 'odd-bit' and active 'negative-bit'
3086 return _mm_add_epi32(value, _mm_and_si128(maskNegatives, maskOdds));
3087}
3088
3089inline __m128i SSE::addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3090{
3091 ocean_assert(rightShifts < 32u);
3092
3093 // the offset for negative values: 2^shifts - 1
3094 const __m128i offsetForNegatives_s_32x4 = _mm_set1_epi32(int((1u << rightShifts) - 1u));
3095
3096 // bit mask for all 32 bit negative values
3097 const __m128i maskHigh_s_32x4 = CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull);
3098
3099 // 0x00000000 for positive values, 0xFFFFFFFF for negative values
3100 const __m128i maskNegativeValues_s_32x4 = _mm_cmpeq_epi32(_mm_and_si128(value, maskHigh_s_32x4), maskHigh_s_32x4);
3101
3102 // 0 for positive values, 2^shifts - 1 for negative values
3103 const __m128i offset_s_32x4 = _mm_and_si128(offsetForNegatives_s_32x4, maskNegativeValues_s_32x4);
3104
3105 return _mm_add_epi32(value, offset_s_32x4);
3106}
3107
3108inline __m128i SSE::divideByRightShiftSigned32Bit(const __m128i& value, const unsigned int rightShifts)
3109{
3110 return _mm_srai_epi32(addOffsetBeforeRightShiftDivisionSigned32Bit(value, rightShifts), int(rightShifts));
3111}
3112
// Computes horizontal (right - left) and vertical (bottom - top) central-difference gradients for
// 16 consecutive pixels of a 1-channel 8-bit image row and stores them interleaved as signed 8-bit
// values [dx0, dy0, dx1, dy1, ...], i.e. 32 bytes written to 'response'.
// Each difference is divided by two (rounded towards zero) so the result fits into [-127, 127].
// NOTE(review): despite the '8Elements' name, 32 response bytes (16 pixel pairs) are written - confirm with callers.
// 'source' must point to a pixel with valid neighbors at -1, +1, -width and +width.
inline void SSE::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
{
	ocean_assert(source && response && width >= 10u);

	// Load 16 unsigned 8-bit values; left/right/top/bottom pixels
	const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
	const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));

	const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
	const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i horizontalMinusLo = _mm_shuffle_epi8(horizontalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
	const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i horizontalPlusLo = _mm_shuffle_epi8(horizontalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull));
	const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
	const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);

	// Convert the low and high signed 16-bit differences to signed 8-bit and merge them into a single
	const __m128i horizontalGradient = _mm_or_si128(
		_mm_shuffle_epi8(horizontalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
		_mm_shuffle_epi8(horizontalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i verticalMinusLo = _mm_shuffle_epi8(verticalMinus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == a[7:0]
	const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	//const __m128i verticalPlusLo = _mm_shuffle_epi8(verticalPlus, set128i(0x8007800680058004ull, 0x8003800280018000ull)); // == b[7:0]
	const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
	const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);

	// Convert the differences to signed char and merge the high and low halves
	const __m128i verticalGradient = _mm_or_si128(
		_mm_shuffle_epi8(verticalGradientLo, set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
		_mm_shuffle_epi8(verticalGradientHi, set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));

	// Take the horizontal gradients, [dx0, dx1, dx2, ...], and the vertical gradient, [dy0, dy1, dy2, ...] and interleave them, [dx0, dy0, dx1, dy1, dx2, dy2, ...]
	const __m128i interleavedResponseLo = _mm_unpacklo_epi8(horizontalGradient, verticalGradient);
	const __m128i interleavedResponseHi = _mm_unpackhi_epi8(horizontalGradient, verticalGradient);

	ocean_assert(sizeof(char) == 1ull);
	_mm_storeu_si128((__m128i*)response, interleavedResponseLo);
	_mm_storeu_si128((__m128i*)(response + 16ull), interleavedResponseHi);
}
3168
// Computes the three squared-gradient products (dx*dx, dy*dy, dx*dy) for 16 consecutive pixels of a
// 1-channel 8-bit image and stores them interleaved as signed 16-bit triples
// [dxdx0, dydy0, dxdy0, dxdx1, ...], i.e. 96 bytes (48 int16 values) written to 'response'.
// The gradients are central differences divided by two (rounded towards zero), so each product fits into 16 bit.
// NOTE(review): despite the '8Elements' name, products for 16 pixels are written - confirm with callers.
// 'source' must point to a pixel with valid neighbors at -1, +1, -width and +width.
inline void SSE::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
{
	ocean_assert(source && response && width >= 10u);

	// Load 4x(16x8u) values: left/right/top/bottom pixels
	const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
	const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));

	const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
	const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (right - left) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i horizontalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusLo, horizontalMinusLo)), 1);
	const __m128i horizontalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(horizontalPlusHi, horizontalMinusHi)), 1);

	// Convert the above values to signed 16-bit values and split them into a low and high half (shuffle). Use zero padding to fill the 16-bit result (0x80).
	const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus); // Specialized function since SSE 4.1; no equivalent for the upper half
	const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus, set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));

	// Take the signed difference (bottom - top) and divide by two to fit values into the range [-128, 127]. (Integer) division by right shifting values by one position.
	const __m128i verticalGradientLo = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusLo, verticalMinusLo)), 1);
	const __m128i verticalGradientHi = _mm_srai_epi16(addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(_mm_sub_epi16(verticalPlusHi, verticalMinusHi)), 1);

	// Squared gradients: h*h, v*v, h*v
	const __m128i horizontalHorizontalLo = _mm_mullo_epi16(horizontalGradientLo, horizontalGradientLo);
	const __m128i horizontalHorizontalHi = _mm_mullo_epi16(horizontalGradientHi, horizontalGradientHi);

	const __m128i verticalVerticalLo = _mm_mullo_epi16(verticalGradientLo, verticalGradientLo);
	const __m128i verticalVerticalHi = _mm_mullo_epi16(verticalGradientHi, verticalGradientHi);

	const __m128i horzontalVerticalLo = _mm_mullo_epi16(horizontalGradientLo, verticalGradientLo);
	const __m128i horzontalVerticalHi = _mm_mullo_epi16(horizontalGradientHi, verticalGradientHi);

	// Interleave/pack the above squared gradient, 16S values
	//
	// a, b, c - Above variables ending in *Lo
	// d, e, f - Above variables ending in *Hi
	//
	// a = [a7, a6, a5, a4, a3, a2, a1, a0]
	// b = [b7, b6, b5, b4, b3, b2, b1, b0]
	// c = [c7, c6, c5, c4, c3, c2, c1, c0]
	//
	// d = [d7, d6, d5, d4, d3, d2, d1, d0]
	// e = [e7, e6, e5, e4, e3, e2, e1, e0]
	// f = [f7, f6, f5, f4, f3, f2, f1, f0]
	//
	// A = [b2, a2, c1, b1, a1, c0, b0, a0]
	// B = [a5, c4, b4, a4, c3, b3, a3, c2]
	// C = [c7, b7, a7, c6, b6, a6, c5, b5]
	//
	// D = [e2, d2, f1, e1, d1, f0, e0, d0]
	// E = [d5, f4, e4, d4, f3, e3, d3, f2]
	// F = [f7, e7, d7, f6, e6, d6, f5, e5]

	const __m128i block0Lo = _mm_or_si128( // == [b2, a2, c1, b1, a1, c0, b0, a0]
		_mm_or_si128( // == [b2, a2, 00, b1, a1, 00, b0, a0]
			_mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, a2, 00, 00, a1, 00, 00, a0]
			_mm_shuffle_epi8(verticalVerticalLo, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [b2, 00, 00, b1, 00, 00, b0, 00]
		_mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, c1, 00, 00, c0, 00, 00]

	const __m128i block1Lo = _mm_or_si128( // == [a5, c4, b4, a4, c3, b3, a3, c2]
		_mm_or_si128( // == [a5, 00, b4, a4, 00, b3, a3, 00]
			_mm_shuffle_epi8(horizontalHorizontalLo, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [a5, 00, 00, a4, 00, 00, a3, 00]
			_mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, b4, 00, 00, b3, 00, 00]
		_mm_shuffle_epi8(horzontalVerticalLo, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, c4, 00, 00, c3, 00, 00, c2]

	const __m128i block2Lo = _mm_or_si128( // == [c7, b7, a7, c6, b6, a6, c5, b5]
		_mm_or_si128( // == [00, b7, a7, 00, b6, a6, 00, b5]
			_mm_shuffle_epi8(horizontalHorizontalLo, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, a7, 00, 00, a6, 00, 00]
			_mm_shuffle_epi8(verticalVerticalLo, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, b7, 00, 00, b6, 00, 00, b5]
		_mm_shuffle_epi8(horzontalVerticalLo, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [c7, 00, 00, c6, 00, 00, c5, 00]

	const __m128i block0Hi = _mm_or_si128( // == [e2, d2, f1, e1, d1, f0, e0, d0]
		_mm_or_si128( // == [e2, d2, 00, e1, d1, 00, e0, d0]
			_mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)), // == [00, d2, 00, 00, d1, 00, 00, d0]
			_mm_shuffle_epi8(verticalVerticalHi, set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))), // == [e2, 00, 00, e1, 00, 00, e0, 00]
		_mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull))); // == [00, 00, f1, 00, 00, f0, 00, 00]

	const __m128i block1Hi = _mm_or_si128( // == [d5, f4, e4, d4, f3, e3, d3, f2]
		_mm_or_si128( // == [d5, 00, e4, d4, 00, e3, d3, 00]
			_mm_shuffle_epi8(horizontalHorizontalHi, set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)), // == [d5, 00, 00, d4, 00, 00, d3, 00]
			_mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))), // == [00, 00, e4, 00, 00, e3, 00, 00]
		_mm_shuffle_epi8(horzontalVerticalHi, set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull))); // == [00, f4, 00, 00, f3, 00, 00, f2]

	const __m128i block2Hi = _mm_or_si128( // == [f7, e7, d7, f6, e6, d6, f5, e5]
		_mm_or_si128( // == [00, e7, d7, 00, e6, d6, 00, e5]
			_mm_shuffle_epi8(horizontalHorizontalHi, set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)), // == [00, 00, d7, 00, 00, d6, 00, 00]
			_mm_shuffle_epi8(verticalVerticalHi, set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))), // == [00, e7, 00, 00, e6, 00, 00, e5]
		_mm_shuffle_epi8(horzontalVerticalHi, set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull))); // == [f7, 00, 00, f6, 00, 00, f5, 00]

	_mm_storeu_si128((__m128i*)response, block0Lo);
	_mm_storeu_si128((__m128i*)(response + 8ull), block1Lo);
	_mm_storeu_si128((__m128i*)(response + 16ull), block2Lo);
	_mm_storeu_si128((__m128i*)(response + 24ull), block0Hi);
	_mm_storeu_si128((__m128i*)(response + 32ull), block1Hi);
	_mm_storeu_si128((__m128i*)(response + 40ull), block2Hi);
}
3276
// De-interleaves the first 15 bytes (five 3-channel pixels) of 'interleaved' into channel-separated
// registers: the first two channels share 'channel01' (lower/upper 8 bytes), the third goes to 'channel2'.
// Lanes marked 'X' below receive zero from the 0xFF shuffle indices.
OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit15Elements(const __m128i& interleaved, __m128i& channel01, __m128i& channel2)
{
	// interleaved R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 X

	// channel01 R0 R1 R2 R3 R4 X X X G0 G1 G2 G3 G4 X X X
	// channel2 B0 B1 B2 B3 B4 X X X 0 0 0 0 0 0 0 0

	channel01 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFF0d0a070401ull, 0xFFFFFF0c09060300ull));

	channel2 = _mm_shuffle_epi8(interleaved, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull));
}
3288
// De-interleaves 24 bytes (eight 3-channel pixels) spread over two input registers into
// channel-separated output: the first two channels share 'channel01' (lower/upper 8 bytes),
// the third channel goes to the lower 8 bytes of 'channel2' (upper half zero).
OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit24Elements(const __m128i& interleavedA, const __m128i& interleavedB, __m128i& channel01, __m128i& channel2)
{
	// interleavedA R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
	// interleavedB G5 B5 R6 G6 B6 R7 G7 B7 X X X X X X X X

	// channel01 R0 R1 R2 R3 R4 R5 R6 R7 G0 G1 G2 G3 G4 G5 G6 G7
	// channel2 B0 B1 B2 B3 B4 B5 B6 B7 0 0 0 0 0 0 0 0

	// each output register is assembled by gathering bytes from both inputs and merging via OR
	channel01 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFF0d0a070401ull, 0xFFFF0f0c09060300ull)),
					_mm_shuffle_epi8(interleavedB, set128i(0x060300FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));

	channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
					_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFFFFFFull, 0x070401FFFFFFFFFFull)));
}
3303
// De-interleaves 48 bytes of 3-channel 8-bit data (16 pixels, spread over three registers) into
// three channel registers: channel0 = 16 first-channel bytes, channel1 = 16 second-channel bytes,
// channel2 = 16 third-channel bytes. Each output gathers its bytes from all three inputs via shuffle + OR.
OCEAN_FORCE_INLINE void SSE::deInterleave3Channel8Bit48Elements(const __m128i& interleavedA, const __m128i& interleavedB, const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2)
{
	channel0 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFF0f0c09060300ull)),
				_mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0e0b08ull, 0x0502FFFFFFFFFFFFull)),
								_mm_shuffle_epi8(interleavedC, set128i(0x0d0a070401FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	channel1 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
				_mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
								_mm_shuffle_epi8(interleavedC, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
				_mm_or_si128(_mm_shuffle_epi8(interleavedB, set128i(0xFFFFFFFFFFFF0d0aull, 0x070401FFFFFFFFFFull)),
								_mm_shuffle_epi8(interleavedC, set128i(0x0f0c09060300FFFFull, 0xFFFFFFFFFFFFFFFFull))));
}
3318
3319inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
3320{
3321 ocean_assert(interleaved != nullptr);
3322
3323 deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0, channel1, channel2);
3324}
3325
3326inline void SSE::deInterleave3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* channel0, uint8_t* channel1, uint8_t* channel2)
3327{
3328 ocean_assert(interleaved && channel0 && channel1 && channel2);
3329
3330 __m128i channel0_128, channel1_128, channel2_128;
3331 deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), channel0_128, channel1_128, channel2_128);
3332
3333 store128i(channel0_128, channel0);
3334 store128i(channel1_128, channel1);
3335 store128i(channel2_128, channel2);
3336}
3337
// De-interleaves 45 bytes (15 full 3-channel pixels) without reading past the 45-byte buffer:
// the last register is loaded from offset 29 (bytes [29, 45)) and shifted right by three bytes,
// placing bytes 32..44 in its lower 13 lanes with the three upper lanes zeroed - equivalent to the
// 48-element layout with the final three (missing) bytes set to zero.
inline void SSE::deInterleave3Channel8Bit45Elements(const uint8_t* interleaved, __m128i& channel0, __m128i& channel1, __m128i& channel2)
{
	ocean_assert(interleaved != nullptr);

	deInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3), channel0, channel1, channel2);
}
3344
// Interleaves three 16-element channel registers into 48 bytes of 3-channel pixel data:
// output byte stream = [c0[0] c1[0] c2[0] c0[1] c1[1] c2[1] ...], spread over interleavedA/B/C.
// Each output register merges bytes from all three channels via shuffle + OR (0xFF indices produce zero lanes).
OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const __m128i& channel0, const __m128i& channel1, const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC)
{
	interleavedA = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0x05FFFF04FFFF03FFull, 0xFF02FFFF01FFFF00ull)),
					_mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFFFF04FFFF03FFFFull, 0x02FFFF01FFFF00FFull)),
								_mm_shuffle_epi8(channel2, set128i(0xFF04FFFF03FFFF02ull, 0xFFFF01FFFF00FFFFull))));

	interleavedB = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFF0AFFFF09FFFF08ull, 0xFFFF07FFFF06FFFFull)),
					_mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0x0AFFFF09FFFF08FFull, 0xFF07FFFF06FFFF05ull)),
								_mm_shuffle_epi8(channel2, set128i(0xFFFF09FFFF08FFFFull, 0x07FFFF06FFFF05FFull))));

	interleavedC = _mm_or_si128(_mm_shuffle_epi8(channel0, set128i(0xFFFF0FFFFF0EFFFFull, 0x0DFFFF0CFFFF0BFFull)),
					_mm_or_si128(_mm_shuffle_epi8(channel1, set128i(0xFF0FFFFF0EFFFF0Dull, 0xFFFF0CFFFF0BFFFFull)),
								_mm_shuffle_epi8(channel2, set128i(0x0FFFFF0EFFFF0DFFull, 0xFF0CFFFF0BFFFF0Aull))));
}
3359
3360OCEAN_FORCE_INLINE void SSE::interleave3Channel8Bit48Elements(const uint8_t* const channel0, const uint8_t* const channel1, const uint8_t* const channel2, uint8_t* const interleaved)
3361{
3362 ocean_assert(channel0 && channel1 && channel2 && interleaved);
3363
3364 __m128i interleavedA_128, interleavedB_128, interleavedC_128;
3365 interleave3Channel8Bit48Elements(load128i(channel0), load128i(channel1), load128i(channel2), interleavedA_128, interleavedB_128, interleavedC_128);
3366
3367 store128i(interleavedA_128, interleaved + 0);
3368 store128i(interleavedB_128, interleaved + 16);
3369 store128i(interleavedC_128, interleaved + 32);
3370}
3371
3372OCEAN_FORCE_INLINE void SSE::reverseChannelOrder2Channel8Bit32Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3373{
3374 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3375
3376 // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3377 // Y A Y A Y A Y A Y A Y A Y A Y A
3378 // output: A Y A Y A Y A Y A Y A Y A Y A Y
3379 // 1 0 3 2 5 4 7 6 9 8 B A D C F E
3380
3381 const __m128i shuffleMask_u_16x8 = set128i(0x0E0F0C0D0A0B0809ull, 0x0607040502030001ull);
3382
3383 store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3384 store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3385}
3386
// Reverses the channel order of 48 interleaved 3-channel bytes (16 pixels), e.g. RGB -> BGR,
// while keeping the pixel order. Pixels straddling a register boundary are stitched together
// by combining shuffles of adjacent input registers via OR (0xFF shuffle indices yield zero lanes).
OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2)
{
	reversedInterleaved0 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFF0c0d0e090a0b06ull, 0x0708030405000102ull)),
							_mm_shuffle_epi8(interleaved1, set128i(0x01FFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull)));

	reversedInterleaved1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFF0fFFull)),
							_mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0x0fFF0b0c0d08090aull, 0x050607020304FF00ull)),
											_mm_shuffle_epi8(interleaved2, set128i(0xFF00FFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	reversedInterleaved2 = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFF0eull)),
							_mm_shuffle_epi8(interleaved2, set128i(0x0d0e0f0a0b0c0708ull, 0x09040506010203FFull)));
}
3399
3400OCEAN_FORCE_INLINE void SSE::reverseChannelOrder3Channel8Bit48Elements(const uint8_t* interleaved, uint8_t* const reversedInterleaved)
3401{
3402 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3403
3404 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3405 reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3406
3407 store128i(reversedInterleaved0, reversedInterleaved);
3408 store128i(reversedInterleaved1, reversedInterleaved + 16);
3409 store128i(reversedInterleaved2, reversedInterleaved + 32);
3410}
3411
3412OCEAN_FORCE_INLINE void SSE::reverseChannelOrder4Channel8Bit64Elements(const uint8_t* interleaved, uint8_t* reversedInterleaved)
3413{
3414 ocean_assert(interleaved != nullptr && reversedInterleaved != nullptr);
3415
3416 // input: 0 1 2 3 4 5 6 7 8 9 A B C D E F
3417 // R G B A R G B A R G B A R G B A
3418 // output: A B G R A B G R A B G R A B G R
3419 // 3 2 1 0 7 6 5 4 B A 9 8 F E D C
3420
3421 const __m128i shuffleMask_u_16x8 = set128i(0x0C0D0E0F08090A0Bull, 0x0405060700010203ull);
3422
3423 store128i(_mm_shuffle_epi8(load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3424 store128i(_mm_shuffle_epi8(load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3425 store128i(_mm_shuffle_epi8(load128i(interleaved + 32), shuffleMask_u_16x8), reversedInterleaved + 32);
3426 store128i(_mm_shuffle_epi8(load128i(interleaved + 48), shuffleMask_u_16x8), reversedInterleaved + 48);
3427}
3428
3429inline void SSE::reverseChannelOrder3Channel8Bit48Elements(uint8_t* interleaved)
3430{
3431 ocean_assert(interleaved);
3432
3433 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3434 reverseChannelOrder3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32), reversedInterleaved0, reversedInterleaved1, reversedInterleaved2);
3435
3436 store128i(reversedInterleaved0, interleaved);
3437 store128i(reversedInterleaved1, interleaved + 16);
3438 store128i(reversedInterleaved2, interleaved + 32);
3439}
3440
3441inline void SSE::swapReversedChannelOrder3Channel8Bit48Elements(uint8_t* first, uint8_t* second)
3442{
3443 ocean_assert(first && second && first != second);
3444
3445 __m128i first0, first1, first2;
3446 reverseChannelOrder3Channel8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3447
3448 __m128i second0, second1, second2;
3449 reverseChannelOrder3Channel8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3450
3451 store128i(first0, second);
3452 store128i(first1, second + 16);
3453 store128i(first2, second + 32);
3454
3455 store128i(second0, first);
3456 store128i(second1, first + 16);
3457 store128i(second2, first + 32);
3458}
3459
3460inline void SSE::reverseElements8Bit48Elements(const __m128i& elements0, const __m128i& elements1, const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2)
3461{
3462 const __m128i mask = set128i(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
3463
3464 reversedElements0 = _mm_shuffle_epi8(elements2, mask);
3465 reversedElements1 = _mm_shuffle_epi8(elements1, mask);
3466 reversedElements2 = _mm_shuffle_epi8(elements0, mask);
3467}
3468
3469inline void SSE::reverseElements8Bit48Elements(const uint8_t* elements, uint8_t* reversedElements)
3470{
3471 ocean_assert(elements && reversedElements);
3472
3473 __m128i reversedElements0, reversedElements1, reversedElements2;
3474 reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3475
3476 store128i(reversedElements0, reversedElements);
3477 store128i(reversedElements1, reversedElements + 16);
3478 store128i(reversedElements2, reversedElements + 32);
3479}
3480
3481inline void SSE::reverseElements8Bit48Elements(uint8_t* elements)
3482{
3483 ocean_assert(elements);
3484
3485 __m128i reversedElements0, reversedElements1, reversedElements2;
3486 reverseElements8Bit48Elements(load128i(elements), load128i(elements + 16), load128i(elements + 32), reversedElements0, reversedElements1, reversedElements2);
3487
3488 store128i(reversedElements0, elements);
3489 store128i(reversedElements1, elements + 16);
3490 store128i(reversedElements2, elements + 32);
3491}
3492
3493inline void SSE::swapReversedElements8Bit48Elements(uint8_t* first, uint8_t* second)
3494{
3495 ocean_assert(first && second && first != second);
3496
3497 __m128i first0, first1, first2;
3498 reverseElements8Bit48Elements(load128i(first), load128i(first + 16), load128i(first + 32), first0, first1, first2);
3499
3500 __m128i second0, second1, second2;
3501 reverseElements8Bit48Elements(load128i(second), load128i(second + 16), load128i(second + 32), second0, second1, second2);
3502
3503 store128i(first0, second);
3504 store128i(first1, second + 16);
3505 store128i(first2, second + 32);
3506
3507 store128i(second0, first);
3508 store128i(second1, first + 16);
3509 store128i(second2, first + 32);
3510}
3511
3512inline void SSE::shiftChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3513{
3514 ocean_assert(elements && shiftedElements);
3515
3516 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0c0f0e0d080b0a09ull, 0x0407060500030201ull)), shiftedElements);
3517}
3518
3519inline void SSE::shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3520{
3521 ocean_assert(elements && shiftedElements);
3522
3523 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0003020104070605ull, 0x080b0a090c0f0e0dull)), shiftedElements);
3524}
3525
3526inline void SSE::shiftChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3527{
3528 ocean_assert(elements && shiftedElements);
3529
3530 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0e0d0c0f0a09080bull, 0x0605040702010003ull)), shiftedElements);
3531}
3532
3533inline void SSE::shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t* elements, uint8_t* shiftedElements)
3534{
3535 ocean_assert(elements && shiftedElements);
3536
3537 store128i(_mm_shuffle_epi8(load128i(elements), set128i(0x0201000306050407ull, 0x0a09080b0e0d0c0full)), shiftedElements);
3538}
3539
3540inline __m128i SSE::sum1Channel8Bit16Elements(const __m128i& elements)
3541{
3542 const __m128i zero = _mm_setzero_si128();
3543 const __m128i sum = _mm_sad_epu8(elements, zero);
3544
3545 return _mm_add_epi32(_mm_srli_si128(sum, 8), sum);
3546}
3547
3548inline __m128i SSE::sum1Channel8Bit16Elements(const uint8_t* elements)
3549{
3550 ocean_assert(elements != nullptr);
3551
3552 return sum1Channel8Bit16Elements(load128i(elements));
3553}
3554
3555template <bool tBufferHas16Bytes>
3556inline __m128i SSE::sum1Channel8BitFront15Elements(const uint8_t* elements)
3557{
3558 ocean_assert(elements != nullptr);
3559 return sum1Channel8Bit16Elements(load_u8_15_upper_zero<tBufferHas16Bytes>(elements));
3560}
3561
3562inline __m128i SSE::sum1Channel8BitBack15Elements(const uint8_t* elements)
3563{
3564 ocean_assert(elements != nullptr);
3565 return sum1Channel8Bit16Elements(load_u8_16_and_shift_right<1u>(elements));
3566}
3567
// Sums the three channels of 48 interleaved 3-channel bytes (16 pixels) separately.
// Returns the per-channel sums as 32-bit values laid out as [0, sumChannel2, sumChannel1, sumChannel0]
// from high to low lane (i.e. lane 0 = first channel, lane 1 = second, lane 2 = third, lane 3 = 0).
inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const __m128i& interleaved0, const __m128i& interleaved1, const __m128i& interleaved2)
{
	// Interleaved0: R BGR BGR BGR BGR BGR
	// Interleaved1: GR BGR BGR BGR BGR BG
	// Interleaved2: BGR BGR BGR BGR BGR B

	// gather channel 0 and channel 2 bytes into the two halves of one register: BBBBBBBB RRRRRRRR
	const __m128i channel0_2First = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFF0e0b080502ull, 0xFFFF0f0c09060300ull)),
											_mm_shuffle_epi8(interleaved1, set128i(0x070401FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));

	// remaining channel 0 and channel 2 bytes: BBBBBBBB RRRRRRRR
	const __m128i channel0_2Second = _mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFFFF0d0aull, 0xFFFFFFFFFF0e0b08ull)),
											_mm_shuffle_epi8(interleaved2, set128i(0x0f0c09060300FFFFull, 0x0d0a070401FFFFFFull)));

	// all channel 1 bytes: GGGGGGGG GGGGGGGG
	const __m128i channel1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0, set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
									_mm_or_si128(_mm_shuffle_epi8(interleaved1, set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
													_mm_shuffle_epi8(interleaved2, set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));

	const __m128i zero = _mm_setzero_si128();

	// _mm_sad_epu8 sums eight bytes per 64-bit half: 0000 BBBB 0000 RRRR
	const __m128i sum0_2 = _mm_add_epi32(_mm_sad_epu8(channel0_2First, zero), _mm_sad_epu8(channel0_2Second, zero));

	// 0000 GGGG 0000 GGGG
	const __m128i sum1 = _mm_sad_epu8(channel1, zero);

	// merge the green total into the second 32-bit lane: 0000 BBBB GGGG RRRR
	return _mm_blend_epi16(sum0_2, _mm_add_epi32(_mm_slli_si128(sum1, 4), _mm_srli_si128(sum1, 4)), int(0xC));
}
3598
3599inline __m128i SSE::sumInterleave3Channel8Bit48Elements(const uint8_t* interleaved)
3600{
3601 ocean_assert(interleaved != nullptr);
3602
3603 return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), load128i(interleaved + 32));
3604}
3605
// Sums the three channels of 45 interleaved 3-channel bytes (15 pixels) separately without reading
// past the 45-byte buffer: the last register is loaded from offset 29 (bytes [29, 45)) and shifted
// right by three bytes, so bytes 32..44 occupy its lower 13 lanes and the upper three lanes are zero -
// the zero lanes do not contribute to the channel sums.
inline __m128i SSE::sumInterleave3Channel8Bit45Elements(const uint8_t* interleaved)
{
	ocean_assert(interleaved != nullptr);

	return sumInterleave3Channel8Bit48Elements(load128i(interleaved), load128i(interleaved + 16), _mm_srli_si128(load128i(interleaved + 29), 3));
}
3612
3613inline __m128i SSE::load128iLower64(const void* const buffer)
3614{
3615 ocean_assert(buffer != nullptr);
3616 return _mm_loadl_epi64((const __m128i*)(buffer));
3617}
3618
3619inline __m128i SSE::load128i(const void* const buffer)
3620{
3621 ocean_assert(buffer != nullptr);
3622 return _mm_lddqu_si128((const __m128i*)(buffer));
3623}
3624
// Loads 10 bytes from 'buffer' into the upper ten bytes of a 128-bit register while the lower six
// bytes are set to zero, without reading more than 10 bytes from the buffer.
// Generic (tBufferHas16Bytes == false) variant; the <true> specialization below may read 16 bytes.
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_10_upper_zero(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	__m128i result;

#ifdef OCEAN_COMPILER_MSC

	// MSC exposes the register's lanes directly; zero the lower eight bytes,
	// then copy bytes [0,2) to register bytes [6,8) and bytes [2,10) to register bytes [8,16)
	result.m128i_u64[0] = uint64_t(0);
	memcpy(result.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
	memcpy(result.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));

#else

	// other compilers: reinterpret the register through the M128i union wrapper to get lane access
	M128i& ourResult = *((M128i*)(&result));

	ourResult.m128i_u64[0] = uint64_t(0);
	memcpy(ourResult.m128i_u16 + 3, buffer + 0, sizeof(uint16_t));
	memcpy(ourResult.m128i_u64 + 1, buffer + 2, sizeof(uint64_t));

#endif

	return result;
}
3650
3651template <>
3652inline __m128i SSE::load_u8_10_upper_zero<true>(const uint8_t* const buffer)
3653{
3654 ocean_assert(buffer != nullptr);
3655
3656 // we load 16 bytes and shift the SSE register by 6 byte afterwards
3657 return _mm_slli_si128(SSE::load128i(buffer), 6);
3658}
3659
3660template <bool tBufferHas16Bytes>
3661inline __m128i SSE::load_u8_15_upper_zero(const uint8_t* const buffer)
3662{
3663 ocean_assert(buffer != nullptr);
3664
3665 __m128i intermediate;
3666 memcpy(&intermediate, buffer, 15);
3667
3668 // we shift the SSE register by 1 byte afterwards
3669 return _mm_slli_si128(intermediate, 1);
3670}
3671
3672template <>
3673inline __m128i SSE::load_u8_15_upper_zero<true>(const uint8_t* const buffer)
3674{
3675 ocean_assert(buffer != nullptr);
3676
3677 // we load 16 bytes and shift the SSE register by 1 byte afterwards
3678 return _mm_slli_si128(_mm_lddqu_si128((__m128i*)(buffer)), 1);
3679}
3680
3681template <bool tBufferHas16Bytes>
3682inline __m128i SSE::load_u8_13_lower_random(const uint8_t* const buffer)
3683{
3684 ocean_assert(buffer != nullptr);
3685
3686 __m128i result;
3687 memcpy(&result, buffer, 13);
3688
3689 return result;
3690}
3691
3692template <>
3693inline __m128i SSE::load_u8_13_lower_random<true>(const uint8_t* const buffer)
3694{
3695 ocean_assert(buffer != nullptr);
3696
3697 // we load the entire 16 bytes to the 128i value as this is the fastest way
3698 return _mm_lddqu_si128((__m128i*)(buffer));
3699}
3700
template <bool tBufferHas16Bytes>
inline __m128i SSE::load_u8_15_lower_zero(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// copies the 15 valid bytes into the lower part of the register
	__m128i result;
	memcpy(&result, buffer, 15);

	// the 16th byte is explicitly set to zero
#ifdef OCEAN_COMPILER_MSC
	result.m128i_u8[15] = 0u;
#else
	((M128i&)result).m128i_u8[15] = 0u;
#endif

	return result;
}
3717
template <>
inline __m128i SSE::load_u8_15_lower_zero<true>(const uint8_t* const buffer)
{
	ocean_assert(buffer != nullptr);

	// we load the entire 16 bytes to the 128i value as this is the fastest way
	__m128i result = _mm_lddqu_si128((__m128i*)(buffer));

	// the (invalid) 16th byte is explicitly overwritten with zero
#ifdef OCEAN_COMPILER_MSC
	result.m128i_u8[15] = 0u;
#else
	((M128i&)result).m128i_u8[15] = 0u;
#endif

	return result;
}
3734
3735template <bool tBufferHas16Bytes>
3736inline __m128i SSE::load_u8_15_lower_random(const uint8_t* const buffer)
3737{
3738 ocean_assert(buffer != nullptr);
3739
3740 __m128i result;
3741 memcpy(&result, buffer, 15);
3742
3743 return result;
3744}
3745
3746template <>
3747inline __m128i SSE::load_u8_15_lower_random<true>(const uint8_t* const buffer)
3748{
3749 ocean_assert(buffer != nullptr);
3750
3751 // we load the entire 16 bytes to the 128i value as this is the fastest way
3752 return _mm_lddqu_si128((__m128i*)(buffer));
3753}
3754
3755template <unsigned int tShiftBytes>
3756inline __m128i SSE::load_u8_16_and_shift_right(const uint8_t* const buffer)
3757{
3758 static_assert(tShiftBytes <= 16u, "Invalid shift!");
3759
3760 ocean_assert(buffer != nullptr);
3761 return _mm_srli_si128(_mm_lddqu_si128((__m128i*)(buffer)), tShiftBytes);
3762}
3763
3764inline void SSE::store128i(const __m128i& value, uint8_t* const buffer)
3765{
3766 ocean_assert(buffer != nullptr);
3767 _mm_storeu_si128((__m128i*)(buffer), value);
3768}
3769
3770inline __m128i SSE::set128i(const unsigned long long high64, const unsigned long long low64)
3771{
3772
3773#ifdef _WINDOWS
3774
3775 #ifdef _WIN64
3776 return _mm_set_epi64x(high64, low64);
3777 #else
3778 return _mm_set_epi32(*(((int*)&high64) + 1), *((int*)&high64), *(((int*)&low64) + 1), *((int*)&low64));
3779 #endif
3780
3781#else
3782
3783 return _mm_set_epi64x(high64, low64);
3784
3785#endif
3786
3787}
3788
3789inline __m128i SSE::removeHighBits32_16(const __m128i& value)
3790{
3791 return _mm_and_si128(value, _mm_set1_epi32(int(0x0000FFFFu)));
3792}
3793
3794inline __m128i SSE::removeLowBits32_16(const __m128i& value)
3795{
3796 return _mm_and_si128(value, _mm_set1_epi32(int(0xFFFF0000u)));
3797}
3798
3799inline __m128i SSE::removeHighBits16_8(const __m128i& value)
3800{
3801 return _mm_and_si128(value, _mm_set1_epi32(int(0x00FF00FFu)));
3802}
3803
inline __m128i SSE::removeHighBits16_8_7_lower(const __m128i& value)
{
	// keeps the lower 8 bits of the first seven 16 bit elements, the two top-most bytes are set to zero
	return _mm_and_si128(value, set128i(0x000000FF00FF00FFull, 0x00FF00FF00FF00FFull));
}
3808
inline __m128i SSE::removeHighBits16_8_7_upper(const __m128i& value)
{
	// keeps the lower 8 bits of the last seven 16 bit elements, the two bottom-most bytes are set to zero
	return _mm_and_si128(value, set128i(0x00FF00FF00FF00FFull, 0x00FF00FF00FF0000ull));
}
3813
inline __m128i SSE::moveLowBits16_8ToLow64(const __m128i& value)
{
	// packs the low byte of each 16 bit element into the lower 64 bits, the upper 64 bits become zero
	// (shuffle selectors with the high bit set, here 0xA0, produce a zero byte in _mm_shuffle_epi8)
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0E0C0A0806040200ull));
}
3818
inline __m128i SSE::moveLowBits32_8ToLow32(const __m128i& value)
{
	// packs the low byte of each 32 bit element (source bytes 0, 4, 8, 12) into the lower 32 bits,
	// the upper 96 bits become zero (0xA0 selectors produce zero bytes)
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A0A0A00C080400ull));
}
3823
inline __m128i SSE::moveLowBits32_16ToLow64(const __m128i& value)
{
	// packs the low 16 bits of each 32 bit element into the lower 64 bits, the upper 64 bits become zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A0A0ull, 0x0D0C090805040100ull));
}
3828
inline __m128i SSE::moveLowBits16_8ToHigh64(const __m128i& value)
{
	// packs the low byte of each 16 bit element into the upper 64 bits, the lower 64 bits become zero
	return _mm_shuffle_epi8(value, set128i(0x0E0C0A0806040200ull, 0xA0A0A0A0A0A0A0A0ull));
}
3833
inline __m128i SSE::moveHighBits32_16(const __m128i& value)
{
	// moves the upper 16 bits of each 32 bit element into its lower 16 bits:
	// shift the four 32 bit integers by 16 to the right and fill by zeros
	return _mm_srli_epi32(value, 16);
}
3839
inline __m128i SSE::moveHighBits16_8(const __m128i& value)
{
	// moves the high byte of each of the eight 16 bit elements into its low byte and zeros the high byte
	return _mm_shuffle_epi8(value, set128i(0xA00FA00DA00BA009ull, 0xA007A005A003A001ull));
}
3844
inline __m128i SSE::moveHighBits16_8_5(const __m128i& value)
{
	// like moveHighBits16_8 but only for the first five 16 bit elements, the remaining bytes become zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A0A0A0A0A009ull, 0xA007A005A003A001ull));
}
3849
inline __m128i SSE::moveHighBits16_8_6(const __m128i& value)
{
	// like moveHighBits16_8 but only for the first six 16 bit elements, the remaining bytes become zero
	// (0xFF selectors have the high bit set and therefore produce zero bytes, same as 0xA0)
	return _mm_shuffle_epi8(value, set128i(0xFFFFFFFFFF0bFF09ull, 0xFF07FF05FF03FF01ull));
}
3854
inline __m128i SSE::moveHighBits16_8_7(const __m128i& value)
{
	// like moveHighBits16_8 but only for the first seven 16 bit elements, the remaining bytes become zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A00DA00BA009ull, 0xA007A005A003A001ull));
}
3859
inline __m128i SSE::shuffleLow32ToLow32_8(const __m128i& value)
{
	// spreads the lowest four bytes into the low byte of each 32 bit element, the other bytes become zero
	return _mm_shuffle_epi8(value, set128i(0xA0A0A003A0A0A002ull, 0xA0A0A001A0A0A000ull));
}
3864
inline __m128i SSE::shuffleNeighbor4Low64BitsToLow16_8(const __m128i& value)
{
	// distributes source bytes 0..7 as pairs with distance 4 - (0,4), (1,5), (2,6), (3,7) -
	// into the low bytes of the eight 16 bit elements, the high bytes are zeroed

	// we could also use one of the following mask-defining possibility, all provide the same result
	// _mm_set_epi8(0x80, 7, 0x80, 3, 0x80, 6, 0x80, 2, 0x80, 5, 0x80, 1, 0x80, 4, 0x80, 0))
	// _mm_set_epi8(0xA0, 7, 0xA0, 3, 0xA0, 6, 0xA0, 2, 0xA0, 5, 0xA0, 1, 0xA0, 4, 0xA0, 0))
	// _mm_set_epi8(0xFF, 7, 0xFF, 3, 0xFF, 6, 0xFF, 2, 0xFF, 5, 0xFF, 1, 0xFF, 4, 0xFF, 0))

	return _mm_shuffle_epi8(value, set128i(0xA007A003A006A002ull, 0xA005A001A004A000ull));
}
3874
inline __m128i SSE::shuffleNeighbor4High64BitsToLow16_8(const __m128i& value)
{
	// same as shuffleNeighbor4Low64BitsToLow16_8 but operating on source bytes 8..15
	return _mm_shuffle_epi8(value, set128i(0xA00FA00BA00EA00Aull, 0xA00DA009A00CA008ull));
}
3879
inline __m128i SSE::shuffleNeighbor2Low64BitsToLow16_8(const __m128i& value)
{
	// distributes source bytes 0..7 as pairs with distance 2 - (0,2), (1,3), (4,6), (5,7) -
	// into the low bytes of the eight 16 bit elements, the high bytes are zeroed
	return _mm_shuffle_epi8(value, set128i(0xFF07FF05FF06FF04ull, 0xFF03FF01FF02FF00ull));
}
3884
inline __m128i SSE::shuffleNeighbor2High64BitsToLow16_8(const __m128i& value)
{
	// same as shuffleNeighbor2Low64BitsToLow16_8 but operating on source bytes 8..15
	return _mm_shuffle_epi8(value, set128i(0xFF0FFF0DFF0EFF0Cull, 0xFF0BFF09FF0AFF08ull));
}
3889
3891{
3892 return _mm_set1_epi32(int(0x00FF00FFu));
3893}
3894
3896{
3897 return _mm_set1_epi32(int(0x0000FFFFu));
3898}
3899
3900OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8(const __m128i& values0, const __m128i& values1, __m128i& products0, __m128i& products1)
3901{
3902 const __m128i lowProducts = _mm_mullo_epi16(values0, values1);
3903 const __m128i highProducts = _mm_mulhi_epi16(values0, values1);
3904
3905 products0 = _mm_unpacklo_epi16(lowProducts, highProducts);
3906 products1 = _mm_unpackhi_epi16(lowProducts, highProducts);
3907}
3908
3909OCEAN_FORCE_INLINE void SSE::multiplyInt8x16ToInt32x8AndAccumulate(const __m128i& values0, const __m128i& values1, __m128i& results0, __m128i& results1)
3910{
3911 __m128i products0;
3912 __m128i products1;
3913 multiplyInt8x16ToInt32x8(values0, values1, products0, products1);
3914
3915 results0 = _mm_add_epi32(results0, products0);
3916 results1 = _mm_add_epi32(results1, products1);
3917}
3918
3919inline unsigned int SSE::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
3920{
3921 ocean_assert(pixel);
3922 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
3923
3924 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
3925}
3926
inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
{
	ocean_assert(pixel0 && pixel1);

	// the interpolation factors must sum up to 128 * 128 (fixed point precision)
	ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);

	// squared difference between the plain pixel0 value and the bi-linearly interpolated pixel1 value
	return sqrDistance(*pixel0, (uint8_t)interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
}
3935
inline unsigned int SSE::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
{
	ocean_assert(pixel0 && pixel1);

	// both sets of interpolation factors must sum up to 128 * 128 (fixed point precision)
	ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
	ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);

	// squared difference between the two bi-linearly interpolated pixel values
	return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
}
3945
3946}
3947
3948}
3949
3950#endif // OCEAN_HARDWARE_SSE_VERSION >= 41
3951
3952#endif // META_OCEAN_CV_SSE_H
This class implements computer vision functions using SSE extensions.
Definition SSE.h:42
static __m128i divideByRightShiftSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight signed 32 bit values by applying a right shift.
Definition SSE.h:3108
static void average32Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2725
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 16 following pixels for a given 1 channel 8 ...
Definition SSE.h:3113
static unsigned int sum_u32_first_2(const __m128i &value)
Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1331
static void average24Elements3Channel24Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2808
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition SSE.h:1265
static void reverseElements8Bit48Elements(const __m128i &elements0, const __m128i &elements1, const __m128i &elements2, __m128i &reversedElements0, __m128i &reversedElements1, __m128i &reversedElements2)
Reverses the order of 48 elements with 8 bit per element.
Definition SSE.h:3460
static __m128i load128i(const void *const buffer)
Loads a 128i value from the memory.
Definition SSE.h:3619
static void average16Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2700
static __m128i load_u8_16_and_shift_right(const uint8_t *const buffer)
Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified ...
Definition SSE.h:3756
static __m128i moveLowBits32_16ToLow64(const __m128i &value)
Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3824
static __m128i moveLowBits32_8ToLow32(const __m128i &value)
Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0...
Definition SSE.h:3819
static __m128i moveHighBits16_8_6(const __m128i &value)
Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3850
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i &value)
Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right s...
Definition SSE.h:3071
static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d &value)
Adds the two (all two) individual 64 bit float of a m128 value and returns the result.
Definition SSE.h:1358
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of, e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3277
static void store128i(const __m128i &value, uint8_t *const buffer)
Stores a 128i value to the memory.
Definition SSE.h:3764
static __m128i sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
Definition SSE.h:1436
static __m128i sumInterleave3Channel8Bit45Elements(const uint8_t *interleaved)
Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3606
static __m128i moveLowBits16_8ToHigh64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with ...
Definition SSE.h:3829
static __m128i divideByRightShiftSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight signed 16 bit values by applying a right shift.
Definition SSE.h:3066
static __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3875
static void swapReversedElements8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
Definition SSE.h:3493
static __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit pr...
Definition SSE.h:1374
static void average8ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2468
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:1583
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i &values0, const __m128i &values1, __m128i &results0, __m128i &results1)
Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
Definition SSE.h:3909
static __m128i sumSquareDifference8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 13 elements of an 16 elements buffer with 8 bit prec...
Definition SSE.h:1463
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the second 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1340
static __m128i sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition SSE.h:1381
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2525
static __m128i moveHighBits16_8_5(const __m128i &value)
Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3845
static __m128i shuffleLow32ToLow32_8(const __m128i &value)
Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
Definition SSE.h:3860
static void shiftChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition SSE.h:3512
static __m128i moveHighBits16_8(const __m128i &value)
Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3840
static __m128i removeHighBits16_8_7_upper(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
Definition SSE.h:3809
static void deInterleave3Channel8Bit45Elements(const uint8_t *interleaved, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3338
static unsigned int value_u32(const __m128i &value)
Returns one specific 32 bit unsigned integer value of a m128i value object.
Definition SSE.h:1311
static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const __m128i &channel0, const __m128i &channel1, const __m128i &channel2, __m128i &interleavedA, __m128i &interleavedB, __m128i &interleavedC)
Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3345
static __m128i load_u8_15_upper_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3661
static __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3880
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition SSE.h:1260
static __m128i sum1Channel8Bit16Elements(const __m128i &elements)
Sums 16 elements with 8 bit per element.
Definition SSE.h:3540
static __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3865
static void average8Elements2Channel64Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
Definition SSE.h:2670
static __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative signed 16 bit value, so that each value can be right shifted to al...
Definition SSE.h:3047
static __m128i load_u8_15_lower_random(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3736
static __m128i removeHighBits16_8_7_lower(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
Definition SSE.h:3804
static void average8Elements4Channel128Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
Definition SSE.h:2868
static __m128i load_u8_10_upper_zero(const uint8_t *const buffer)
Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes,...
Definition SSE.h:3626
static __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for 16 elements of an 16 elements buffer with 8 bit precision.
Definition SSE.h:1543
static __m128i moveHighBits32_16(const __m128i &value)
Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
Definition SSE.h:3834
static void average16Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2896
static __m128i moveHighBits16_8_7(const __m128i &value)
Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3855
static __m128i bitMaskRemoveHigh32_16()
Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
Definition SSE.h:3895
static __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1550
static __m128i removeHighBits32_16(const __m128i &value)
Removes the higher 16 bits of four 32 bit elements.
Definition SSE.h:3789
static __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3885
static void average6Elements3Channel96Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
Definition SSE.h:2771
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition SSE.h:2264
static __m128i interpolation3Channel24Bit12Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2077
static __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to al...
Definition SSE.h:3089
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2117
static void average8Elements1Channel32Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
Definition SSE.h:2410
static void shiftChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3526
static void average8Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2444
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of, e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3289
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition SSE.h:1255
static __m128i interpolation1Channel8Bit15Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2025
static uint16_t value_u16(const __m128i &value)
Returns one specific 16 bit unsigned integer value of a m128i value object.
Definition SSE.h:1299
static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2, __m128i &reversedInterleaved0, __m128i &reversedInterleaved1, __m128i &reversedInterleaved2)
Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channe...
Definition SSE.h:3387
static __m128i sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition SSE.h:1367
static __m128i removeLowBits32_16(const __m128i &value)
Removes the lower 16 bits of four 32 bit elements.
Definition SSE.h:3794
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:1733
static uint8_t value_u8(const __m128i &value)
Returns one specific 8 bit unsigned integer value of a m128i value object.
Definition SSE.h:1276
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 16 fol...
Definition SSE.h:3169
static __m128i bitMaskRemoveHigh16_8()
Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
Definition SSE.h:3890
static __m128i removeHighBits16_8(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements.
Definition SSE.h:3799
static __m128i sum1Channel8BitBack15Elements(const uint8_t *elements)
Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is ...
Definition SSE.h:3562
static __m128i load_u8_15_lower_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3702
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of, e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3304
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1533
static __m128i sumInterleave3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2)
Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3568
static void average32Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2920
static void average30Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition SSE.h:2967
static __m128i sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1491
static __m128i sum1Channel8BitFront15Elements(const uint8_t *elements)
Sums the first 15 elements of a buffer with 8 bit per element.
Definition SSE.h:3556
static void average32ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 32 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2613
static void average32Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2547
static __m128i sumSquareDifference8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 12 elements of an 16 elements buffer with 8 bit prec...
Definition SSE.h:1408
static OCEAN_FORCE_INLINE float sum_f32_4(const __m128 &value)
Adds the four (all four) individual 32 bit float of a m128 value and returns the result.
Definition SSE.h:1349
static __m128i load_u8_13_lower_random(const uint8_t *const buffer)
Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes,...
Definition SSE.h:3682
static void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interl...
Definition SSE.h:3441
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition SSE.h:1322
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition SSE.h:1270
static __m128i moveLowBits16_8ToLow64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3814
static __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
Definition SSE.h:1518
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition SSE.h:3919
static void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3533
static __m128i load128iLower64(const void *const buffer)
Loads the lower 64 bit of a 128i value from the memory.
Definition SSE.h:3613
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition SSE.h:3927
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3770
static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels...
Definition SSE.h:3412
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i &value)
Adds 1 to each signed 16 bit value which is both, negative and odd, so that each value can be right shifted to obtain a correctly rounded division by two.
Definition SSE.h:3028
static void average8Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2645
static void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixel to the front and moves the front channel to the back position.
Definition SSE.h:3519
static __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1526
static void average16Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2490
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i &values0, const __m128i &values1, __m128i &products0, __m128i &products1)
Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
Definition SSE.h:3900
static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels...
Definition SSE.h:3372
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:1879
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
This union defines a wrapper for the __m128 SSE intrinsic data type.
Definition SSE.h:71
float m128_f32[4]
The four 32 bit elements.
Definition SSE.h:73
This union defines a wrapper for the __m128 SSE intrinsic data type.
Definition SSE.h:82
double m128d_f64[2]
The two 64 bit elements.
Definition SSE.h:84
This union defines a wrapper for the __m128i SSE intrinsic data type.
Definition SSE.h:51
uint64_t m128i_u64[2]
The two 64 bit elements.
Definition SSE.h:53
uint16_t m128i_u16[8]
The eight 16 bit elements.
Definition SSE.h:59
uint32_t m128i_u32[4]
The four 32 bit elements.
Definition SSE.h:56
uint8_t m128i_u8[16]
The sixteen 8 bit elements.
Definition SSE.h:62