Ocean
NEON.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_NEON_H
9 #define META_OCEAN_CV_NEON_H
10 
11 #include "ocean/cv/CV.h"
12 
13 #include "ocean/base/Utilities.h"
14 
15 #include "ocean/math/Math.h"
16 
17 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
18 
19 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
20  #include <arm_neon.h>
21 #endif // __ARM_NEON__
22 
23 namespace Ocean
24 {
25 
26 namespace CV
27 {
28 
29 /**
30  * This class implements computer vision functions using NEON extensions.
31  * @ingroup cv
32  */
33 class NEON
34 {
35  public:
36 
37  /**
38  * Prefetches a block of temporal memory into all cache levels.
39  * @param data Data to be prefetched
40  */
41  static inline void prefetchT0(const void* const data);
42 
43  /**
44  * Prefetches a block of temporal memory into all cache levels except the 0th cache level.
45  * @param data Data to be prefetched
46  */
47  static inline void prefetchT1(const void* const data);
48 
49  /**
50  * Prefetches a block of temporal memory into all cache levels, except the 0th and 1st cache levels.
51  * @param data Data to be prefetched
52  */
53  static inline void prefetchT2(const void* const data);
54 
55  /**
56  * Prefetches a block of non-temporal memory into a non-temporal cache structure.
57  * @param data Data to be prefetched
58  */
59  static inline void prefetchNTA(const void* const data);
60 
61  /**
62  * Sum square differences determination for the last 9 elements of a 16 elements buffer with 8 bit precision.
63  * @param image0 First 9 elements to determine the ssd for, may be non aligned
64  * @param image1 Second 9 elements to determine the ssd for, may be non aligned
65  * @return SSD result distributed over four terms of the sum
66  */
67  static inline uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1);
68 
69  /**
70  * Sum square differences determination for the last 10 elements of a 16 elements buffer with 8 bit precision.
71  * @param image0 First 10 elements to determine the ssd for, may be non aligned
72  * @param image1 Second 10 elements to determine the ssd for, may be non aligned
73  * @return SSD result distributed over four terms of the sum
74  */
75  static inline uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1);
76 
77  /**
78  * Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
79  * @param image0 First 11 elements to determine the ssd for, may be non aligned
80  * @param image1 Second 11 elements to determine the ssd for, may be non aligned
81  * @return SSD result distributed over four terms of the sum
82  */
83  static inline uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
84 
85  /**
86  * Sum square differences determination for the last 12 elements of a 16 elements buffer with 8 bit precision.
87  * @param image0 First 12 elements to determine the ssd for, may be non aligned
88  * @param image1 Second 12 elements to determine the ssd for, may be non aligned
89  * @return SSD result distributed over four terms of the sum
90  */
91  static inline uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
92 
93  /**
94  * Sum square differences determination for the last 13 elements of a 16 elements buffer with 8 bit precision.
95  * @param image0 First 13 elements to determine the ssd for, may be non aligned
96  * @param image1 Second 13 elements to determine the ssd for, may be non aligned
97  * @return SSD result distributed over four terms of the sum
98  */
99  static inline uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
100 
101  /**
102  * Sum square differences determination for the last 14 elements of a 16 elements buffer with 8 bit precision.
103  * @param image0 First 14 elements to determine the ssd for, may be non aligned
104  * @param image1 Second 14 elements to determine the ssd for, may be non aligned
105  * @return SSD result distributed over four terms of the sum
106  */
107  static inline uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1);
108 
109  /**
110  * Sum square differences determination for the last 15 elements of a 16 elements buffer with 8 bit precision.
111  * @param image0 First 15 elements to determine the ssd for, may be non aligned
112  * @param image1 Second 15 elements to determine the ssd for, may be non aligned
113  * @return SSD result distributed over four terms of the sum
114  */
115  static inline uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1);
116 
117  /**
118  * Sum square difference determination for the first 9 elements of a 16 elements buffer with 8 bit precision.
119  * @param image0 First 9 elements to determine the ssd for, may be non aligned
120  * @param image1 Second 9 elements to determine the ssd for, may be non aligned
121  * @return SSD result distributed over four terms of the sum
122  */
123  static inline uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1);
124 
125  /**
126  * Sum square difference determination for the first 10 elements of a 16 elements buffer with 8 bit precision.
127  * @param image0 First 10 elements to determine the ssd for, may be non aligned
128  * @param image1 Second 10 elements to determine the ssd for, may be non aligned
129  * @return SSD result distributed over four terms of the sum
130  */
131  static inline uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
132 
133  /**
134  * Sum square difference determination for the first 11 elements of a 16 elements buffer with 8 bit precision.
135  * @param image0 First 11 elements to determine the ssd for, may be non aligned
136  * @param image1 Second 11 elements to determine the ssd for, may be non aligned
137  * @return SSD result distributed over four terms of the sum
138  */
139  static inline uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1);
140 
141  /**
142  * Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit precision.
143  * @param image0 First 12 elements to determine the ssd for, may be non aligned
144  * @param image1 Second 12 elements to determine the ssd for, may be non aligned
145  * @return SSD result distributed over four terms of the sum
146  */
147  static inline uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
148 
149  /**
150  * Sum square difference determination for the first 13 elements of a 16 elements buffer with 8 bit precision.
151  * @param image0 First 13 elements to determine the ssd for, may be non aligned
152  * @param image1 Second 13 elements to determine the ssd for, may be non aligned
153  * @return SSD result distributed over four terms of the sum
154  */
155  static inline uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
156 
157  /**
158  * Sum square difference determination for the first 14 elements of a 16 elements buffer with 8 bit precision.
159  * @param image0 First 14 elements to determine the ssd for, may be non aligned
160  * @param image1 Second 14 elements to determine the ssd for, may be non aligned
161  * @return SSD result distributed over four terms of the sum
162  */
163  static inline uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1);
164 
165  /**
166  * Sum square difference determination for the first 15 elements of a 16 elements buffer with 8 bit precision.
167  * @param image0 First 15 elements to determine the ssd for, may be non aligned
168  * @param image1 Second 15 elements to determine the ssd for, may be non aligned
169  * @return SSD result distributed over four terms of the sum
170  */
171  static inline uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
172 
173  /**
174  * Sum square difference determination for 16 elements with 8 bit precision.
175  * @param image0 First 16 elements to determine the ssd for, may be non aligned
176  * @param image1 Second 16 elements to determine the ssd for, may be non aligned
177  * @return SSD result distributed over four terms of the sum
178  */
179  static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
180 
181  /**
182  * Sum square difference determination for 16 elements with 8 bit precision.
183  * @param row0 First 16 elements to determine the ssd for
184  * @param row1 Second 16 elements to determine the ssd for
185  * @return SSD result distributed over four terms of the sum
186  */
187  static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
188 
189  /**
190  * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
191  * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
192  * @param row0 First row of 16 elements (16 pixels), must be valid
193  * @param row1 Second row of 16 elements (16 pixels), must be valid
194  * @param result Resulting 8 average elements (8 pixels), must be valid
195  */
196  static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
197 
198  /**
199  * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
200  * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).
201  * @param row0 First row of 32 elements (32 pixels), must be valid
202  * @param row1 Second row of 32 elements (32 pixels), must be valid
203  * @param result Resulting 16 average elements (16 pixels), must be valid
204  */
205  static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
206 
207  /**
208  * Averages 16 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
209  * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
210  * @param image0 First row of 16 elements
211  * @param image1 Second row of 16 elements
212  * @param result Resulting 8 average elements
213  * @param threshold Minimal threshold to result in a pixel with value 255
214  */
215  static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold = 192u);
216 
217  /**
218  * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
219  * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).
220  * @param row0 First row of 32 elements (16 pixels), must be valid
221  * @param row1 Second row of 32 elements (16 pixels), must be valid
222  * @param result Resulting 16 average elements (8 pixels), must be valid
223  */
224  static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
225 
226  /**
227  * Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
228  * The function takes two rows of 64 elements and returns 32 average elements (16 averaged pixels, each with 2 channels).
229  * @param row0 First row of 64 elements (32 pixels), must be valid
230  * @param row1 Second row of 64 elements (32 pixels), must be valid
231  * @param result Resulting 32 average elements (16 pixels), must be valid
232  */
233  static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
234 
235  /**
236  * Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
237  * The function takes two rows of 48 elements and returns 24 average elements (8 averaged pixels, each with 3 channels).
238  * @param row0 First row of 48 elements (16 pixels), must be valid
239  * @param row1 Second row of 48 elements (16 pixels), must be valid
240  * @param result Resulting 24 average elements (8 pixels), must be valid
241  */
242  static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
243 
244  /**
245  * Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
246  * The function takes two rows of 64 elements and returns 32 average elements (8 averaged pixels, each with 4 channels).
247  * @param row0 First row of 64 elements (16 pixels), must be valid
248  * @param row1 Second row of 64 elements (16 pixels), must be valid
249  * @param result Resulting 32 average elements (8 pixels), must be valid
250  */
251  static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
252 
253  /**
254  * Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
255  * The function takes three rows of 24 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
256  * @param image0 First row of 24 elements
257  * @param image1 Second row of 24 elements
258  * @param image2 Third row of 24 elements
259  * @param result Resulting 8 average elements
260  */
261  static inline void average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
262 
263  /**
264  * Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
265  * The function takes three rows of 48 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).<br>
266  * Beware: This function calculates an approximation only.
267  * @param image0 First row of 48 elements
268  * @param image1 Second row of 48 elements
269  * @param image2 Third row of 48 elements
270  * @param result Resulting 16 average elements
271  */
272  static inline void average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
273 
274  /**
275  * Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 bit frame.
276  * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
277  * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
278  * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
279  * @param width The width of the original frame in pixel, with range [10, infinity)
280  */
281  static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
282 
283  /**
284  * Determines the squared horizontal and vertical gradients and the product of both gradients for 8 following pixels for a given 1 channel 8 bit frame.
285  * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
286  * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
287  * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
288  * @param width The width of the original frame in pixel, with range [10, infinity)
289  */
290  static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
291 
292  /**
293  * Sum square difference determination for 8 elements with 8 bit precision.
294  * @param image0 First 8 elements to determine the ssd for, may be non aligned
295  * @param image1 Second 8 elements to determine the ssd for, may be non aligned
296  * @return SSD result distributed over four terms of the sum
297  */
298  static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1);
299 
300  /**
301  * Sum square difference determination for 8 elements with 8 bit precision.
302  * @param row0 First 8 elements to determine the ssd for
303  * @param row1 Second 8 elements to determine the ssd for
304  * @return SSD result distributed over four terms of the sum
305  */
306  static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1);
307 
308  /**
309  * Sum absolute difference determination for 16 elements with 8 bit precision.
310  * @param image0 First 16 elements to determine the sad for, may be non aligned
311  * @param image1 Second 16 elements to determine the sad for, may be non aligned
312  * @return SAD result distributed over four terms of the sum
313  */
314  static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
315 
316  /**
317  * Sum absolute difference determination for 16 elements with 8 bit precision.
318  * @param row0 First 16 elements to determine the sad for
319  * @param row1 Second 16 elements to determine the sad for
320  * @return SAD result distributed over four terms of the sum
321  */
322  static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
323 
324  /**
325  * Sums the four 32 bit values and returns the result.
326  * Beware: This function is slow due to the usage of the individual lanes, providing a large target buffer is much faster.
327  * @param value The value holding the four 32 bit values
328  * @return Sum result
329  */
330  static OCEAN_FORCE_INLINE unsigned int sum32x4ByLanes(const uint32x4_t& value);
331 
332  /**
333  * Removes (sets to zero) the high 16 bits of four 32 bit elements.
334  * Given: PONM-LKJI-HGFE-DCBA<br>
335  * Result: 00NM-00JI-00FE-00BA
336  * @param value The value to remove the high bits for
337  * @return Result
338  */
339  static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t& value);
340 
341  /**
342  * Removes (sets to zero) the high 8 bits of four 16 bit elements.
343  * Given: HGFE-DCBA<br>
344  * Result: 0G0E-0C0A
345  * @param value The value to remove the high bits for
346  * @return Result
347  */
348  static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t& value);
349 
350  /**
351  * Removes (sets to zero) the high 8 bits of eight 16 bit elements.
352  * Given: PONM-LKJI-HGFE-DCBA<br>
353  * Result: 0O0M-0K0I-0G0E-0C0A
354  * @param value The value to remove the high bits for
355  * @return Result
356  */
357  static OCEAN_FORCE_INLINE uint16x8_t removeHighBits16_8(const uint16x8_t& value);
358 
359  /**
360  * Moves the high 16 bits of four 32 bit elements to the low 16 bits and fills the high bits with 0.
361  * Given: PONM-LKJI-HGFE-DCBA<br>
362  * Result: 00PO-00LK-00HG-00DC
363  * @param value The value whose high bits will be moved
364  * @return Result
365  */
366  static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t& value);
367 
368  /**
369  * Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
370  * Given: HGFE-DCBA<br>
371  * Result: 0H0F-0D0B
372  * @param value The value whose high bits will be moved
373  * @return Result
374  */
375  static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t& value);
376 
377  /**
378  * Moves the high 8 bits of eight 16 bit elements to the low 8 bits and fills the high bits with 0.
379  * Given: PONM-LKJI-HGFE-DCBA<br>
380  * Result: 0P0N-0L0J-0H0F-0D0B
381  * @param value The value whose high bits will be moved
382  * @return Result
383  */
384  static OCEAN_FORCE_INLINE uint16x8_t moveHighBits16_8(const uint16x8_t& value);
385 
386  /**
387  * Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
388  * Further, the combination is done with saturation (the 32 bit values will be clamped to 16 bit values before the combination is done).
389  * Given: 00DD-00CC-00BB-00AA (low)<br>
390  * Given: 00HH-00GG-00FF-00EE (high)<br>
391  * Result: HH-GG-FF-EE-DD-CC-BB-AA
392  * @param low The 128 bit register with the (resulting) lower 16 bit values
393  * @param high The 128 bit register with the (resulting) higher 16 bit values
394  * @return The resulting 128 bit register with 16 bit values
395  */
396  static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high);
397 
398  /**
399  * Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
400  * Further, the combination is done with saturation (the 16 bit values will be clamped to 8 bit values before the combination is done).
401  * Given: 0H0G-0F0E-0D0C-0B0A (low)<br>
402  * Given: 0P0O-0N0M-0L0K-0J0I (high)<br>
403  * Result: P-O-N-M-L-K-J-I-H-G-F-E-D-C-B-A
404  * @param low The 128 bit register with the (resulting) lower 8 bit values
405  * @param high The 128 bit register with the (resulting) higher 8 bit values
406  * @return The resulting 128 bit register with 8 bit values
407  */
408  static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high);
409 
410  /**
411  * Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
412  * @param rowTop The top row containing 6 short values, must be valid
413  * @param rowCenter The center row containing 6 short values, must be valid
414  * @param rowBottom The bottom row containing 6 short values, must be valid
415  * @return The resulting four sums of the four 3x3 blocks
416  */
417  static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom);
418 
419  /**
420  * Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t values.
421  * This function does not check whether the multiplication results in an overflow.
422  * @param value_u_64x2 The uint64x2_t value to multiply
423  * @param value_u_32x2 The uint32x2_t value to multiply
424  * @return The resulting multiplication result
425  */
426  static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2);
427 
428  /**
429  * Copies the sign of a given value to another one.
430  * @param signReceiver First value receiving the sign from the second value
431  * @param signProvider Second value providing the sign for the first one
432  * @return First value with the sign of the second one
433  */
434  static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t& signReceiver, const int32x4_t& signProvider);
435 
436  /**
437  * Casts 16 float elements to 16 uint8_t elements.
438  * @param sourceA_f_32x4 The first 4 float elements
439  * @param sourceB_f_32x4 The second 4 float elements
440  * @param sourceC_f_32x4 The third 4 float elements
441  * @param sourceD_f_32x4 The fourth 4 float elements
442  * @return The resulting 16 uint8_t elements
443  */
444  static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4);
445 
446  /**
447  * Casts 16 float elements to 16 uint8_t elements.
448  * @param source The 16 float elements, must be valid
449  * @return The resulting 16 uint8_t elements
450  */
451  static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float* const source);
452 
453  /**
454  * Casts 16 uint8_t elements to 16 float elements.
455  * @param source_u_8x16 The 16 uint8_t elements, must be valid
456  * @return The resulting 16 float elements
457  */
458  static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8x16_t& source_u_8x16);
459 
460  /**
461  * Casts 16 uint8_t elements to 16 float elements.
462  * @param source The 16 uint8_t elements, must be valid
463  * @return The resulting 16 float elements
464  */
465  static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8_t* const source);
466 
467  private:
468 
469  /**
470  * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
471  * @param pixel Upper left pixel in the frame
472  * @param size Size of one frame row in bytes
473  * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
474  * @param fxy_ Product of the fx and the inverse fy interpolation factor
475  * @param fx_y Product of the inverse fx and the fy interpolation factor
476  * @param fxy Product of the fx and the fy interpolation factor
477  * @return Interpolated pixel values
478  */
479  static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
480 
481  /**
482  * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
483  * @param pixel0 Upper left pixel in the first frame
484  * @param pixel1 Upper left pixel in the second frame
485  * @param size0 Size of one frame row in bytes
486  * @param size1 Size of one frame row in bytes
487  * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
488  * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
489  * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
490  * @param f1xy Product of the fx and the fy interpolation factor for the second image
491  * @return Interpolated sum of square difference
492  */
493  static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
494 
495  /**
496  * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
497  * @param pixel0 Upper left pixel in the first frame
498  * @param pixel1 Upper left pixel in the second frame
499  * @param size0 Size of one frame row in bytes
500  * @param size1 Size of one frame row in bytes
501  * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
502  * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
503  * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
504  * @param f0xy Product of the fx and the fy interpolation factor for the first image
505  * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
506  * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
507  * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
508  * @param f1xy Product of the fx and the fy interpolation factor for the second image
509  * @return Interpolated sum of square difference
510  */
511  static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
512 };
513 
514 inline void NEON::prefetchT0(const void* const data)
515 {
516  __builtin_prefetch(data, 0, 3); // read prefetch (rw = 0); locality 3 = high temporal locality, data is left in all cache levels (was 0, which requests no temporal locality)
517 }
518 
519 inline void NEON::prefetchT1(const void* const data)
520 {
521  __builtin_prefetch(data, 0, 2); // read prefetch (rw = 0); locality 2 = moderate temporal locality, one step below the all-cache-levels hint (was 1)
522 }
523 
524 inline void NEON::prefetchT2(const void* const data)
525 {
526  __builtin_prefetch(data, 0, 1); // read prefetch (rw = 0); locality 1 = low temporal locality, two steps below the all-cache-levels hint (was 2)
527 }
528 
529 inline void NEON::prefetchNTA(const void* const data)
530 {
531  __builtin_prefetch(data, 0, 0); // read prefetch (rw = 0); locality 0 = no temporal locality, data need not stay in the cache after the access (was 3, which keeps it in all cache levels)
532 }
533 
534 inline uint32x4_t NEON::sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1)
535 {
536  ocean_assert(image0 && image1);
537 
538  // the mask keeps lanes [7, 15] only: the top lane of the lower half and the entire upper half; all 16 bytes of both buffers are loaded
539  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(0xFF00000000000000ull), vdup_n_u8(0xFFu));
540  const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
541  const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);
542  return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
543 }
544 
545 inline uint32x4_t NEON::sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1)
546 {
547  ocean_assert(image0 && image1);
548 
549  // the mask keeps lanes [6, 15] only: the top two lanes of the lower half and the entire upper half; all 16 bytes of both buffers are loaded
550  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(0xFFFF000000000000ull), vdup_n_u8(0xFFu));
551  const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
552  const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);
553  return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
554 }
555 
556 inline uint32x4_t NEON::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
557 {
558  ocean_assert(image0 && image1);
559 
560  // the mask keeps lanes [5, 15] only: the top three lanes of the lower half and the entire upper half; all 16 bytes of both buffers are loaded
561  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(0xFFFFFF0000000000ull), vdup_n_u8(0xFFu));
562  const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
563  const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);
564  return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
565 }
566 
567 inline uint32x4_t NEON::sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
568 {
569  ocean_assert(image0 && image1);
570 
571  // the mask keeps lanes [4, 15] only: the top four lanes of the lower half and the entire upper half; all 16 bytes of both buffers are loaded
572  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(0xFFFFFFFF00000000ull), vdup_n_u8(0xFFu));
573  const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
574  const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);
575  return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
576 }
577 
578 inline uint32x4_t NEON::sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
579 {
580  ocean_assert(image0 && image1);
581 
582  // the mask keeps lanes [3, 15] only: the top five lanes of the lower half and the entire upper half; all 16 bytes of both buffers are loaded
583  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(0xFFFFFFFFFF000000ull), vdup_n_u8(0xFFu));
584  const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
585  const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);
586  return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
587 }
588 
589 inline uint32x4_t NEON::sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1)
590 {
591  ocean_assert(image0 && image1);
592 
593  // the mask keeps lanes [2, 15] only: the top six lanes of the lower half and the entire upper half; all 16 bytes of both buffers are loaded
594  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFF0000ull), vdup_n_u8(0xFFu));
595  const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
596  const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);
597  return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
598 }
599 
600 inline uint32x4_t NEON::sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1)
601 {
602  ocean_assert(image0 && image1);
603 
604  // the mask keeps lanes [1, 15] only: the top seven lanes of the lower half and the entire upper half; all 16 bytes of both buffers are loaded
605  const uint8x16_t mask_u_8x16 = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFFFF00ull), vdup_n_u8(0xFFu));
606  const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
607  const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);
608  return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
609 }
610 
/**
 * Determines the squared differences for the first 9 elements of two 16 element buffers (8 bit per element).
 * Both pointers are read with a full 16 byte load; the last 7 lanes are zeroed in both registers before the differences are formed.
 * @param image0 First buffer, must not be null
 * @param image1 Second buffer, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// lanes 0..8 are kept, lanes 9..15 are cleared (vcreate_u8 places the least significant byte of the constant in lane 0)
	const uint8x8_t maskLow_u_8x8 = vdup_n_u8(0xFFu);
	const uint8x8_t maskHigh_u_8x8 = vcreate_u8(0x00000000000000FFull);
	const uint8x16_t mask_u_8x16 = vcombine_u8(maskLow_u_8x8, maskHigh_u_8x8);

	const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
	const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);

	return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
}
621 
/**
 * Determines the squared differences for the first 10 elements of two 16 element buffers (8 bit per element).
 * Both pointers are read with a full 16 byte load; the last 6 lanes are zeroed in both registers before the differences are formed.
 * @param image0 First buffer, must not be null
 * @param image1 Second buffer, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// lanes 0..9 are kept, lanes 10..15 are cleared (vcreate_u8 places the least significant byte of the constant in lane 0)
	const uint8x8_t maskLow_u_8x8 = vdup_n_u8(0xFFu);
	const uint8x8_t maskHigh_u_8x8 = vcreate_u8(0x000000000000FFFFull);
	const uint8x16_t mask_u_8x16 = vcombine_u8(maskLow_u_8x8, maskHigh_u_8x8);

	const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
	const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);

	return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
}
632 
/**
 * Determines the squared differences for the first 11 elements of two 16 element buffers (8 bit per element).
 * Both pointers are read with a full 16 byte load; the last 5 lanes are zeroed in both registers before the differences are formed.
 * @param image0 First buffer, must not be null
 * @param image1 Second buffer, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// lanes 0..10 are kept, lanes 11..15 are cleared (vcreate_u8 places the least significant byte of the constant in lane 0)
	const uint8x8_t maskLow_u_8x8 = vdup_n_u8(0xFFu);
	const uint8x8_t maskHigh_u_8x8 = vcreate_u8(0x0000000000FFFFFFull);
	const uint8x16_t mask_u_8x16 = vcombine_u8(maskLow_u_8x8, maskHigh_u_8x8);

	const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
	const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);

	return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
}
643 
/**
 * Determines the squared differences for the first 12 elements of two 16 element buffers (8 bit per element).
 * Both pointers are read with a full 16 byte load; the last 4 lanes are zeroed in both registers before the differences are formed.
 * @param image0 First buffer, must not be null
 * @param image1 Second buffer, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// lanes 0..11 are kept, lanes 12..15 are cleared (vcreate_u8 places the least significant byte of the constant in lane 0)
	const uint8x8_t maskLow_u_8x8 = vdup_n_u8(0xFFu);
	const uint8x8_t maskHigh_u_8x8 = vcreate_u8(0x00000000FFFFFFFFull);
	const uint8x16_t mask_u_8x16 = vcombine_u8(maskLow_u_8x8, maskHigh_u_8x8);

	const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
	const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);

	return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
}
654 
/**
 * Determines the squared differences for the first 13 elements of two 16 element buffers (8 bit per element).
 * Both pointers are read with a full 16 byte load; the last 3 lanes are zeroed in both registers before the differences are formed.
 * @param image0 First buffer, must not be null
 * @param image1 Second buffer, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// lanes 0..12 are kept, lanes 13..15 are cleared (vcreate_u8 places the least significant byte of the constant in lane 0)
	const uint8x8_t maskLow_u_8x8 = vdup_n_u8(0xFFu);
	const uint8x8_t maskHigh_u_8x8 = vcreate_u8(0x000000FFFFFFFFFFull);
	const uint8x16_t mask_u_8x16 = vcombine_u8(maskLow_u_8x8, maskHigh_u_8x8);

	const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
	const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);

	return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
}
665 
/**
 * Determines the squared differences for the first 14 elements of two 16 element buffers (8 bit per element).
 * Both pointers are read with a full 16 byte load; the last 2 lanes are zeroed in both registers before the differences are formed.
 * @param image0 First buffer, must not be null
 * @param image1 Second buffer, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// lanes 0..13 are kept, lanes 14..15 are cleared (vcreate_u8 places the least significant byte of the constant in lane 0)
	const uint8x8_t maskLow_u_8x8 = vdup_n_u8(0xFFu);
	const uint8x8_t maskHigh_u_8x8 = vcreate_u8(0x0000FFFFFFFFFFFFull);
	const uint8x16_t mask_u_8x16 = vcombine_u8(maskLow_u_8x8, maskHigh_u_8x8);

	const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
	const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);

	return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
}
676 
/**
 * Determines the squared differences for the first 15 elements of two 16 element buffers (8 bit per element).
 * Both pointers are read with a full 16 byte load; the last lane is zeroed in both registers before the differences are formed.
 * @param image0 First buffer, must not be null
 * @param image1 Second buffer, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	// lanes 0..14 are kept, lane 15 is cleared (vcreate_u8 places the least significant byte of the constant in lane 0)
	const uint8x8_t maskLow_u_8x8 = vdup_n_u8(0xFFu);
	const uint8x8_t maskHigh_u_8x8 = vcreate_u8(0x00FFFFFFFFFFFFFFull);
	const uint8x16_t mask_u_8x16 = vcombine_u8(maskLow_u_8x8, maskHigh_u_8x8);

	const uint8x16_t masked0_u_8x16 = vandq_u8(vld1q_u8(image0), mask_u_8x16);
	const uint8x16_t masked1_u_8x16 = vandq_u8(vld1q_u8(image1), mask_u_8x16);

	return sumSquareDifference8Bit16Elements(masked0_u_8x16, masked1_u_8x16);
}
687 
/**
 * Determines the squared differences for 16 elements (8 bit per element) read from memory.
 * Loads 16 bytes from each pointer and forwards them to the register based overload.
 * @param image0 First 16 elements, must not be null
 * @param image1 Second 16 elements, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	const uint8x16_t first_u_8x16 = vld1q_u8(image0);
	const uint8x16_t second_u_8x16 = vld1q_u8(image1);

	return sumSquareDifference8Bit16Elements(first_u_8x16, second_u_8x16);
}
697 
/**
 * Determines the squared differences for 16 elements (8 bit per element) held in two registers.
 * @param row0 First 16 elements
 * @param row1 Second 16 elements
 * @return Four 32 bit partial sums, each holding the sum of four squared differences
 */
inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
{
	// |row0[i] - row1[i]| for all 16 lanes
	const uint8x16_t absDifference_u_8x16 = vabdq_u8(row0, row1);

	const uint8x8_t absDifferenceLow_u_8x8 = vget_low_u8(absDifference_u_8x16);
	const uint8x8_t absDifferenceHigh_u_8x8 = vget_high_u8(absDifference_u_8x16);

	// widening multiplication, 255 * 255 still fits into 16 bit
	const uint16x8_t squaredLow_u_16x8 = vmull_u8(absDifferenceLow_u_8x8, absDifferenceLow_u_8x8);
	const uint16x8_t squaredHigh_u_16x8 = vmull_u8(absDifferenceHigh_u_8x8, absDifferenceHigh_u_8x8);

	// widening additions, 16 bit + 16 bit -> 32 bit
	const uint32x4_t sumLowHalves_u_32x4 = vaddl_u16(vget_low_u16(squaredLow_u_16x8), vget_low_u16(squaredHigh_u_16x8));
	const uint32x4_t sumHighHalves_u_32x4 = vaddl_u16(vget_high_u16(squaredLow_u_16x8), vget_high_u16(squaredHigh_u_16x8));

	return vaddq_u32(sumLowHalves_u_32x4, sumHighHalves_u_32x4);
}
711 
/**
 * Determines the squared differences for 8 elements (8 bit per element) read from memory.
 * Loads 8 bytes from each pointer and forwards them to the register based overload.
 * @param image0 First 8 elements, must not be null
 * @param image1 Second 8 elements, must not be null
 * @return Squared differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	const uint8x8_t first_u_8x8 = vld1_u8(image0);
	const uint8x8_t second_u_8x8 = vld1_u8(image1);

	return sumSquareDifference8Bit8Elements(first_u_8x8, second_u_8x8);
}
721 
/**
 * Determines the squared differences for 8 elements (8 bit per element) held in two registers.
 * @param row0 First 8 elements
 * @param row1 Second 8 elements
 * @return Four 32 bit partial sums, each holding the sum of two squared differences
 */
inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1)
{
	// |row0[i] - row1[i]| for all 8 lanes
	const uint8x8_t absDifference_u_8x8 = vabd_u8(row0, row1);

	// the 8 differences are viewed as four 16 bit pairs, the low byte and the high byte of each pair are separated into individual 16 bit values
	const uint16x4_t pairs_u_16x4 = vreinterpret_u16_u8(absDifference_u_8x8);

	const uint16x4_t lowBytes_u_16x4 = removeHighBits16_8(pairs_u_16x4);
	const uint16x4_t highBytes_u_16x4 = moveHighBits16_8(pairs_u_16x4);

	const uint16x8_t differences_u_16x8 = vcombine_u16(lowBytes_u_16x4, highBytes_u_16x4);

	// squaring the 8 differences, 255 * 255 still fits into 16 bit
	const uint16x8_t squares_u_16x8 = vmulq_u16(differences_u_16x8, differences_u_16x8);

	// adding the two 16 bit squares stored in each 32 bit lane
	const uint32x4_t squarePairs_u_32x4 = vreinterpretq_u32_u16(squares_u_16x8);

	return vaddq_u32(removeHighBits32_16(squarePairs_u_32x4), moveHighBits32_16(squarePairs_u_32x4));
}
739 
/**
 * Determines the absolute differences for 16 elements (8 bit per element) read from memory.
 * Loads 16 bytes from each pointer and forwards them to the register based overload.
 * @param image0 First 16 elements, must not be null
 * @param image1 Second 16 elements, must not be null
 * @return Absolute differences accumulated into four 32 bit partial sums
 */
inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
{
	ocean_assert(image0 && image1);

	const uint8x16_t first_u_8x16 = vld1q_u8(image0);
	const uint8x16_t second_u_8x16 = vld1q_u8(image1);

	return sumAbsoluteDifference8Bit16Elements(first_u_8x16, second_u_8x16);
}
749 
/**
 * Determines the absolute differences for 16 elements (8 bit per element) held in two registers.
 * @param row0 First 16 elements
 * @param row1 Second 16 elements
 * @return Four 32 bit partial sums, each holding the sum of four absolute differences
 */
inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
{
	// |row0[i] - row1[i]| for all 16 lanes
	const uint8x16_t absDifference_u_8x16 = vabdq_u8(row0, row1);

	// widening addition of the lower and upper 8 differences, 8 bit + 8 bit -> 16 bit
	const uint16x8_t sums_u_16x8 = vaddl_u8(vget_low_u8(absDifference_u_8x16), vget_high_u8(absDifference_u_8x16));

	// widening addition of the lower and upper 4 sums, 16 bit + 16 bit -> 32 bit
	return vaddl_u16(vget_low_u16(sums_u_16x8), vget_high_u16(sums_u_16x8));
}
759 
/**
 * Averages 2x2 blocks of two rows with 16 elements each (1 channel, 8 bit), producing 8 averaged elements.
 * @param row0 First row, 16 bytes are read, must not be null
 * @param row1 Second row, 16 bytes are read, must not be null
 * @param result Resulting averages, 8 bytes are written, must not be null
 */
OCEAN_FORCE_INLINE void NEON::average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
{
	ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);

	// step 1, vertical: rounding halving add of the two rows
	// vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
	const uint8x16_t vertical_u_8x16 = vrhaddq_u8(vld1q_u8(row0), vld1q_u8(row1));

	// step 2, horizontal: long pairwise add, two neighboring 8 bit values are summed to one 16 bit value
	const uint16x8_t horizontal_u_16x8 = vpaddlq_u8(vertical_u_8x16);

	// step 3: rounding narrowing shift back to 8 bit
	// vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
	const uint8x8_t average_u_8x8 = vrshrn_n_u16(horizontal_u_16x8, 1);

	vst1_u8(result, average_u_8x8);
}
784 
/**
 * Averages 2x2 blocks of two rows with 32 elements each (1 channel, 8 bit), producing 16 averaged elements.
 * Each half of 16 elements is handled as: rounding halving add of both rows, long pairwise add, rounding narrowing shift by 1.
 * @param row0 First row, 32 bytes are read, must not be null
 * @param row1 Second row, 32 bytes are read, must not be null
 * @param result Resulting averages, 16 bytes are written, must not be null
 */
OCEAN_FORCE_INLINE void NEON::average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
{
	ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);

	// first 16 elements of both rows
	const uint8x16_t verticalFront_u_8x16 = vrhaddq_u8(vld1q_u8(row0 + 0), vld1q_u8(row1 + 0));
	const uint8x8_t averageFront_u_8x8 = vrshrn_n_u16(vpaddlq_u8(verticalFront_u_8x16), 1);

	// second 16 elements of both rows
	const uint8x16_t verticalBack_u_8x16 = vrhaddq_u8(vld1q_u8(row0 + 16), vld1q_u8(row1 + 16));
	const uint8x8_t averageBack_u_8x8 = vrshrn_n_u16(vpaddlq_u8(verticalBack_u_8x16), 1);

	vst1q_u8(result, vcombine_u8(averageFront_u_8x8, averageBack_u_8x8));
}
804 
/**
 * Averages 2x2 blocks of two rows with 16 elements each (1 channel, 8 bit) and thresholds the averages.
 * In contrast to the rounding averaging functions, both halving steps truncate.
 * @param image0 First row, 16 bytes are read, must not be null
 * @param image1 Second row, 16 bytes are read, must not be null
 * @param result Resulting 8 bytes, 0xFF where the average is >= threshold, 0x00 otherwise, must not be null
 * @param threshold The threshold the averages are compared against
 */
inline void NEON::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold)
{
	ocean_assert(image0 && image1 && result);

	// vertical: truncating halving add, (a + b) >> 1
	const uint8x16_t vertical_u_8x16 = vhaddq_u8(vld1q_u8(image0), vld1q_u8(image1));

	// horizontal: pairwise sum in 16 bit, truncating shift, and narrowing to 8 bit
	const uint16x8_t horizontal_u_16x8 = vshrq_n_u16(vpaddlq_u8(vertical_u_8x16), 1);
	const uint8x8_t average_u_8x8 = vmovn_u16(horizontal_u_16x8);

	// the comparison sets all bits of a lane for average >= threshold
	const uint8x8_t threshold_u_8x8 = vdup_n_u8(threshold);
	const uint8x8_t binary_u_8x8 = vcge_u8(average_u_8x8, threshold_u_8x8);

	vst1_u8(result, binary_u_8x8);
}
820 
/**
 * Averages 2x2 blocks of two rows with 32 elements each (2 interleaved channels, 8 bit per channel), producing 16 averaged elements.
 * @param row0 First row, 32 bytes are read, must not be null
 * @param row1 Second row, 32 bytes are read, must not be null
 * @param result Resulting interleaved averages, 16 bytes are written, must not be null
 */
OCEAN_FORCE_INLINE void NEON::average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
{
	ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);

	// de-interleaving load of 16 pixels per row, val[0] holds the 16 values of the first channel, val[1] the 16 values of the second channel
	const uint8x16x2_t top_u_8x16x2 = vld2q_u8(row0);
	const uint8x16x2_t bottom_u_8x16x2 = vld2q_u8(row1);

	uint8x8x2_t average_u_8x8x2;

	for (unsigned int channel = 0u; channel < 2u; ++channel)
	{
		// vertical: rounding halving add, (a + b + 1) >> 1
		const uint8x16_t vertical_u_8x16 = vrhaddq_u8(top_u_8x16x2.val[channel], bottom_u_8x16x2.val[channel]);

		// horizontal: long pairwise add (8 bit -> 16 bit), followed by a rounding narrowing shift, (sum + 1) >> 1
		average_u_8x8x2.val[channel] = vrshrn_n_u16(vpaddlq_u8(vertical_u_8x16), 1);
	}

	// the store interleaves the two channels again
	vst2_u8(result, average_u_8x8x2);
}
851 
/**
 * Averages 2x2 blocks of two rows with 64 elements each (2 interleaved channels, 8 bit per channel), producing 32 averaged elements.
 * Each channel is handled as: rounding halving add of both rows, long pairwise add, rounding narrowing shift by 1.
 * @param row0 First row, 64 bytes are read, must not be null
 * @param row1 Second row, 64 bytes are read, must not be null
 * @param result Resulting interleaved averages, 32 bytes are written, must not be null
 */
OCEAN_FORCE_INLINE void NEON::average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
{
	ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);

	// de-interleaving loads, 'front' covers the first 32 bytes of a row, 'back' the second 32 bytes
	const uint8x16x2_t topFront_u_8x16x2 = vld2q_u8(row0 + 0);
	const uint8x16x2_t topBack_u_8x16x2 = vld2q_u8(row0 + 32);

	const uint8x16x2_t bottomFront_u_8x16x2 = vld2q_u8(row1 + 0);
	const uint8x16x2_t bottomBack_u_8x16x2 = vld2q_u8(row1 + 32);

	uint8x16x2_t average_u_8x16x2;

	for (unsigned int channel = 0u; channel < 2u; ++channel)
	{
		const uint8x8_t front_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(topFront_u_8x16x2.val[channel], bottomFront_u_8x16x2.val[channel])), 1);
		const uint8x8_t back_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(topBack_u_8x16x2.val[channel], bottomBack_u_8x16x2.val[channel])), 1);

		average_u_8x16x2.val[channel] = vcombine_u8(front_u_8x8, back_u_8x8);
	}

	// the store interleaves the two channels again
	vst2q_u8(result, average_u_8x16x2);
}
876 
/**
 * Averages 2x2 blocks of two rows with 48 elements each (3 interleaved channels, 8 bit per channel), producing 24 averaged elements.
 * @param row0 First row, 48 bytes are read, must not be null
 * @param row1 Second row, 48 bytes are read, must not be null
 * @param result Resulting interleaved averages, 24 bytes are written, must not be null
 */
OCEAN_FORCE_INLINE void NEON::average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
{
	ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);

	// de-interleaving load of 16 pixels per row, val[c] holds the 16 values of channel c
	const uint8x16x3_t top_u_8x16x3 = vld3q_u8(row0);
	const uint8x16x3_t bottom_u_8x16x3 = vld3q_u8(row1);

	uint8x8x3_t average_u_8x8x3;

	for (unsigned int channel = 0u; channel < 3u; ++channel)
	{
		// vertical: rounding halving add, (a + b + 1) >> 1
		const uint8x16_t vertical_u_8x16 = vrhaddq_u8(top_u_8x16x3.val[channel], bottom_u_8x16x3.val[channel]);

		// horizontal: long pairwise add (8 bit -> 16 bit), followed by a rounding narrowing shift, (sum + 1) >> 1
		average_u_8x8x3.val[channel] = vrshrn_n_u16(vpaddlq_u8(vertical_u_8x16), 1);
	}

	// the store interleaves the three channels again
	vst3_u8(result, average_u_8x8x3);

	// NOTE: the result is rounded twice (once per step); a more precise rounding would accumulate all four values
	// of a block in 16 bit (e.g., with vpadalq_u8 starting at the constant 2) and shift the sum right by 2 only once
}
921 
/**
 * Averages 2x2 blocks of two rows with 64 elements each (4 interleaved channels, 8 bit per channel), producing 32 averaged elements.
 * @param row0 First row, 64 bytes are read, must not be null
 * @param row1 Second row, 64 bytes are read, must not be null
 * @param result Resulting interleaved averages, 32 bytes are written, must not be null
 */
OCEAN_FORCE_INLINE void NEON::average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
{
	ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);

	// de-interleaving load of 16 pixels per row, val[c] holds the 16 values of channel c
	const uint8x16x4_t top_u_8x16x4 = vld4q_u8(row0);
	const uint8x16x4_t bottom_u_8x16x4 = vld4q_u8(row1);

	uint8x8x4_t average_u_8x8x4;

	for (unsigned int channel = 0u; channel < 4u; ++channel)
	{
		// vertical: rounding halving add, (a + b + 1) >> 1
		const uint8x16_t vertical_u_8x16 = vrhaddq_u8(top_u_8x16x4.val[channel], bottom_u_8x16x4.val[channel]);

		// horizontal: long pairwise add (8 bit -> 16 bit), followed by a rounding narrowing shift, (sum + 1) >> 1
		average_u_8x8x4.val[channel] = vrshrn_n_u16(vpaddlq_u8(vertical_u_8x16), 1);
	}

	// the store interleaves the four channels again
	vst4_u8(result, average_u_8x8x4);
}
956 
/**
 * Averages 3x3 blocks of three rows with 24 elements each (1 channel, 8 bit), producing 8 averaged elements.
 * The applied weights are:
 * <pre>
 *        | 1 2 1 |
 * 1/16 * | 2 4 2 |
 *        | 1 2 1 |
 * </pre>
 * @param image0 First row, 24 bytes are read, must not be null
 * @param image1 Second row, 24 bytes are read, must not be null
 * @param image2 Third row, 24 bytes are read, must not be null
 * @param result Resulting averages, 8 bytes are written, must not be null
 */
inline void NEON::average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
{
	ocean_assert(image0 && image1 && image2 && result);

	// de-interleaving load of 8 triples per row, val[k] holds the k-th element of each triple
	const uint8x8x3_t top_u_8x8x3 = vld3_u8(image0);
	const uint8x8x3_t center_u_8x8x3 = vld3_u8(image1);
	const uint8x8x3_t bottom_u_8x8x3 = vld3_u8(image2);

	// vertical sums in 16 bit: top + bottom + 2 * center
	uint16x8_t vertical_u_16x8[3];

	for (unsigned int column = 0u; column < 3u; ++column)
	{
		const uint16x8_t outerRows_u_16x8 = vaddq_u16(vmovl_u8(top_u_8x8x3.val[column]), vmovl_u8(bottom_u_8x8x3.val[column]));
		const uint16x8_t doubledCenter_u_16x8 = vshlq_n_u16(vmovl_u8(center_u_8x8x3.val[column]), 1);

		vertical_u_16x8[column] = vaddq_u16(outerRows_u_16x8, doubledCenter_u_16x8);
	}

	// horizontal sums: first + third + 2 * second element of each triple, the result is at most 16 * 255
	const uint16x8_t sum_u_16x8 = vaddq_u16(vaddq_u16(vertical_u_16x8[0], vertical_u_16x8[2]), vshlq_n_u16(vertical_u_16x8[1], 1));

	// rounding narrowing shift: (sum + 8) >> 4
	const uint8x8_t average_u_8x8 = vrshrn_n_u16(sum_u_16x8, 4);

	vst1_u8(result, average_u_8x8);
}
987 
/**
 * Approximates the average of 3x3 blocks of three rows with 48 elements each (1 channel, 8 bit), producing 16 elements.
 * The targeted weights are:
 * <pre>
 *        | 1 2 1 |
 * 1/16 * | 2 4 2 |
 *        | 1 2 1 |
 * </pre>
 * The weights are realized with nested truncating halving adds in 8 bit, so the result is an approximation only.
 * @param image0 First row, 48 bytes are read, must not be null
 * @param image1 Second row, 48 bytes are read, must not be null
 * @param image2 Third row, 48 bytes are read, must not be null
 * @param result Resulting averages, 16 bytes are written, must not be null
 */
inline void NEON::average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
{
	ocean_assert(image0 && image1 && image2 && result);

	// de-interleaving load of 16 triples per row:
	//
	//           val[0]                  val[1]                   val[2]
	// top:      A0 A3 A6 A9 ... A45     A1 A4 A7 A10 ... A46     A2 A5 A8 A11 ... A47
	// center:   B0 B3 B6 B9 ... B45     B1 B4 B7 B10 ... B46     B2 B5 B8 B11 ... B47
	// bottom:   C0 C3 C6 C9 ... C45     C1 C4 C7 C10 ... C46     C2 C5 C8 C11 ... C47

	const uint8x16x3_t top_u_8x16x3 = vld3q_u8(image0);
	const uint8x16x3_t center_u_8x16x3 = vld3q_u8(image1);
	const uint8x16x3_t bottom_u_8x16x3 = vld3q_u8(image2);

	// vertical direction: the center row needs twice the weight of the outer rows,
	// so the first and third row are averaged first, and that average is then averaged with the center row

	uint8x16_t vertical_u_8x16[3];

	for (unsigned int column = 0u; column < 3u; ++column)
	{
		const uint8x16_t outerRows_u_8x16 = vhaddq_u8(top_u_8x16x3.val[column], bottom_u_8x16x3.val[column]);

		vertical_u_8x16[column] = vhaddq_u8(outerRows_u_8x16, center_u_8x16x3.val[column]);
	}

	// horizontal direction: the same idea as before, the second element of each triple needs twice the weight
	const uint8x16_t outerColumns_u_8x16 = vhaddq_u8(vertical_u_8x16[0], vertical_u_8x16[2]);
	const uint8x16_t average_u_8x16 = vhaddq_u8(outerColumns_u_8x16, vertical_u_8x16[1]);

	vst1q_u8(result, average_u_8x16);
}
1027 
/**
 * Determines the halved horizontal and vertical central differences for 8 elements of a 1 channel 8 bit buffer.
 * 8 bytes are read at each of the addresses source - 1, source + 1, source - width and source + width.
 * @param source The location of the first of the 8 elements, must not be null
 * @param response Resulting 16 values, interleaved as (horizontal, vertical) pairs, must not be null
 * @param width The offset between two successive rows in elements, asserted to be >= 10
 */
inline void NEON::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
{
	ocean_assert(source && response && width >= 10u);

	// the four neighborhoods are loaded as unsigned 8 bit values and widened to signed 16 bit values
	const int16x8_t left_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
	const int16x8_t right_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));

	const int16x8_t top_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
	const int16x8_t bottom_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));

	int8x8x2_t gradients_s_8x8x2;

	// horizontal: (right - left) >> 1, narrowed to 8 bit
	gradients_s_8x8x2.val[0] = vmovn_s16(vhsubq_s16(right_s_16x8, left_s_16x8));

	// vertical: (bottom - top) >> 1, narrowed to 8 bit
	gradients_s_8x8x2.val[1] = vmovn_s16(vhsubq_s16(bottom_s_16x8, top_s_16x8));

	// interleaved store: h0 v0 h1 v1 ...
	vst2_s8(response, gradients_s_8x8x2);
}
1052 
/**
 * Determines the three products of the halved horizontal and vertical central differences for 8 elements of a 1 channel 8 bit buffer.
 * 8 bytes are read at each of the addresses source - 1, source + 1, source - width and source + width.
 * @param source The location of the first of the 8 elements, must not be null
 * @param response Resulting 24 values, interleaved as (h * h, v * v, h * v) triples, must not be null
 * @param width The offset between two successive rows in elements, asserted to be >= 10
 */
inline void NEON::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
{
	ocean_assert(source && response && width >= 10u);

	// the four neighborhoods are loaded as unsigned 8 bit values and widened to signed 16 bit values
	const int16x8_t left_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
	const int16x8_t right_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));

	const int16x8_t top_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
	const int16x8_t bottom_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));

	// horizontal: (right - left) >> 1, vertical: (bottom - top) >> 1
	const int16x8_t gradientH_s_16x8 = vhsubq_s16(right_s_16x8, left_s_16x8);
	const int16x8_t gradientV_s_16x8 = vhsubq_s16(bottom_s_16x8, top_s_16x8);

	int16x8x3_t products_s_16x8x3;

	products_s_16x8x3.val[0] = vmulq_s16(gradientH_s_16x8, gradientH_s_16x8);
	products_s_16x8x3.val[1] = vmulq_s16(gradientV_s_16x8, gradientV_s_16x8);
	products_s_16x8x3.val[2] = vmulq_s16(gradientH_s_16x8, gradientV_s_16x8);

	// interleaved store: h*h, v*v, h*v, h*h, v*v, h*v, ...
	vst3q_s16(response, products_s_16x8x3);
}
1084 
/**
 * Sums the four 32 bit lanes of a register.
 * @param value The register holding the four values to sum
 * @return The sum of all four lanes (modulo 2^32)
 */
OCEAN_FORCE_INLINE unsigned int NEON::sum32x4ByLanes(const uint32x4_t& value)
{
	// [v0 + v2, v1 + v3]
	const uint32x2_t halves_u_32x2 = vadd_u32(vget_low_u32(value), vget_high_u32(value));

	// the pairwise add places (v0 + v2) + (v1 + v3) into lane 0
	return vget_lane_u32(vpadd_u32(halves_u_32x2, halves_u_32x2), 0);
}
1089 
/**
 * Clears the upper 16 bits of each of the four 32 bit lanes.
 * @param value The value to mask
 * @return The value with only the lower 16 bits of each lane kept
 */
OCEAN_FORCE_INLINE uint32x4_t NEON::removeHighBits32_16(const uint32x4_t& value)
{
	const uint32x4_t lowMask_u_32x4 = vdupq_n_u32(0x0000FFFFu);

	return vandq_u32(value, lowMask_u_32x4);
}
1094 
/**
 * Clears the upper 8 bits of each of the four 16 bit lanes.
 * @param value The value to mask
 * @return The value with only the lower 8 bits of each lane kept
 */
OCEAN_FORCE_INLINE uint16x4_t NEON::removeHighBits16_8(const uint16x4_t& value)
{
	const uint16x4_t lowMask_u_16x4 = vdup_n_u16(0x00FFu);

	return vand_u16(value, lowMask_u_16x4);
}
1099 
/**
 * Clears the upper 8 bits of each of the eight 16 bit lanes.
 * @param value The value to mask
 * @return The value with only the lower 8 bits of each lane kept
 */
OCEAN_FORCE_INLINE uint16x8_t NEON::removeHighBits16_8(const uint16x8_t& value)
{
	const uint16x8_t lowMask_u_16x8 = vdupq_n_u16(0x00FFu);

	return vandq_u16(value, lowMask_u_16x8);
}
1104 
/**
 * Moves the upper 16 bits of each of the four 32 bit lanes to the lower 16 bits, the upper 16 bits become zero.
 * @param value The value to shift
 * @return Each lane logically shifted right by 16 bits
 */
OCEAN_FORCE_INLINE uint32x4_t NEON::moveHighBits32_16(const uint32x4_t& value)
{
	const uint32x4_t shifted_u_32x4 = vshrq_n_u32(value, 16);

	return shifted_u_32x4;
}
1109 
/**
 * Moves the upper 8 bits of each of the four 16 bit lanes to the lower 8 bits, the upper 8 bits become zero.
 * @param value The value to shift
 * @return Each lane logically shifted right by 8 bits
 */
OCEAN_FORCE_INLINE uint16x4_t NEON::moveHighBits16_8(const uint16x4_t& value)
{
	const uint16x4_t shifted_u_16x4 = vshr_n_u16(value, 8);

	return shifted_u_16x4;
}
1114 
/**
 * Moves the upper 8 bits of each of the eight 16 bit lanes to the lower 8 bits, the upper 8 bits become zero.
 * @param value The value to shift
 * @return Each lane logically shifted right by 8 bits
 */
OCEAN_FORCE_INLINE uint16x8_t NEON::moveHighBits16_8(const uint16x8_t& value)
{
	const uint16x8_t shifted_u_16x8 = vshrq_n_u16(value, 8);

	return shifted_u_16x8;
}
1119 
/**
 * Narrows two registers with four 32 bit values each into one register with eight 16 bit values.
 * The narrowing saturates, lanes exceeding 0xFFFF are clamped to 0xFFFF.
 * @param low The four values forming the lower half of the result
 * @param high The four values forming the upper half of the result
 * @return The eight narrowed values
 */
OCEAN_FORCE_INLINE uint16x8_t NEON::combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high)
{
	const uint16x4_t narrowedLow_u_16x4 = vqmovn_u32(low);
	const uint16x4_t narrowedHigh_u_16x4 = vqmovn_u32(high);

	return vcombine_u16(narrowedLow_u_16x4, narrowedHigh_u_16x4);
}
1124 
/**
 * Narrows two registers with eight 16 bit values each into one register with sixteen 8 bit values.
 * The narrowing saturates, lanes exceeding 0xFF are clamped to 0xFF.
 * @param low The eight values forming the lower half of the result
 * @param high The eight values forming the upper half of the result
 * @return The sixteen narrowed values
 */
OCEAN_FORCE_INLINE uint8x16_t NEON::combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high)
{
	const uint8x8_t narrowedLow_u_8x8 = vqmovn_u16(low);
	const uint8x8_t narrowedHigh_u_8x8 = vqmovn_u16(high);

	return vcombine_u8(narrowedLow_u_8x8, narrowedHigh_u_8x8);
}
1129 
/**
 * Sums the elements of four horizontally neighboring (overlapping) 3x3 blocks of signed 16 bit values.
 * Block i (with i = 0..3) covers the elements [i, i + 2] of each of the three rows, so 6 elements are read per row.
 * @param rowTop The top row, must not be null
 * @param rowCenter The center row, must not be null
 * @param rowBottom The bottom row, must not be null
 * @return The four block sums, each with 32 bit precision
 */
OCEAN_FORCE_INLINE int32x4_t NEON::sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom)
{
	ocean_assert(rowTop != nullptr);
	ocean_assert(rowCenter != nullptr);
	ocean_assert(rowBottom != nullptr);

	// for each row, three loads shifted by one element are summed up,
	// the first addition widens from 16 bit to 32 bit, so the nine 16 bit values of a block cannot overflow

	const int32x4_t sumTop_s_32x4 = vaddw_s16(vaddl_s16(vld1_s16(rowTop + 0), vld1_s16(rowTop + 1)), vld1_s16(rowTop + 2));
	const int32x4_t sumCenter_s_32x4 = vaddw_s16(vaddl_s16(vld1_s16(rowCenter + 0), vld1_s16(rowCenter + 1)), vld1_s16(rowCenter + 2));
	const int32x4_t sumBottom_s_32x4 = vaddw_s16(vaddl_s16(vld1_s16(rowBottom + 0), vld1_s16(rowBottom + 1)), vld1_s16(rowBottom + 2));

	// combining the three rows
	return vaddq_s32(vaddq_s32(sumTop_s_32x4, sumCenter_s_32x4), sumBottom_s_32x4);
}
1180 
/**
 * Multiplies two 64 bit values with two 32 bit values lane by lane.
 * The upper 32 bits of the high partial product are dropped by the shift, so each result is the product modulo 2^64.
 * @param value_u_64x2 The two 64 bit factors
 * @param value_u_32x2 The two 32 bit factors
 * @return The two 64 bit products
 */
OCEAN_FORCE_INLINE uint64x2_t NEON::multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2)
{
	// uint64_t * uint32_t
	// = (high(uint64_t) * 2^32 + low(uint64_t)) * uint32_t
	// = ((high(uint64_t) * uint32_t) << 32) + low(uint64_t) * uint32_t

	const uint32x4_t words_u_32x4 = vreinterpretq_u32_u64(value_u_64x2);

	// [ valueA_u_64, valueB_u_64 ] -> val[0] = [ low(valueA_u_64), low(valueB_u_64) ], val[1] = [ high(valueA_u_64), high(valueB_u_64) ]
	const uint32x2x2_t lowHigh_u_32x2x2 = vtrn_u32(vget_low_u32(words_u_32x4), vget_high_u32(words_u_32x4));

	const uint64x2_t productLow_u_64x2 = vmull_u32(lowHigh_u_32x2x2.val[0], value_u_32x2);
	const uint64x2_t productHigh_u_64x2 = vmull_u32(lowHigh_u_32x2x2.val[1], value_u_32x2);

	return vaddq_u64(vshlq_n_u64(productHigh_u_64x2, 32), productLow_u_64x2);
}
1197 
/**
 * Applies the sign of one register to the values of another register, lane by lane.
 * @param signReceiver_u_32x4 The four unsigned values receiving the sign
 * @param signProvider_s_32x4 The four signed values providing the sign
 * @return The negated receiver in lanes in which the provider is negative, otherwise the receiver reinterpreted as signed value
 */
OCEAN_FORCE_INLINE int32x4_t NEON::copySign(const uint32x4_t& signReceiver_u_32x4, const int32x4_t& signProvider_s_32x4)
{
	const int32x4_t receiver_s_32x4 = vreinterpretq_s32_u32(signReceiver_u_32x4);
	const int32x4_t negatedReceiver_s_32x4 = vnegq_s32(receiver_s_32x4);

	// sign < 0 ? 0xFFFFFFFF : 0x00000000
	const uint32x4_t isNegativeMask_u_32x4 = vcltq_s32(signProvider_s_32x4, vdupq_n_s32(0));

	// bitwise select: the negated value where the mask is set, the original value everywhere else (sign >= 0)
	return vbslq_s32(isNegativeMask_u_32x4, negatedReceiver_s_32x4, receiver_s_32x4);
}
1207 
/**
 * Casts 16 float elements, provided in four registers, to 16 uint8_t elements.
 * The narrowing steps keep the lower bits only (no saturation).
 * @param sourceA_f_32x4 The elements 0..3
 * @param sourceB_f_32x4 The elements 4..7
 * @param sourceC_f_32x4 The elements 8..11
 * @param sourceD_f_32x4 The elements 12..15
 * @return The 16 converted elements
 */
OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4)
{
	// float -> uint32_t -> uint16_t
	const uint16x4_t narrowA_u_16x4 = vmovn_u32(vcvtq_u32_f32(sourceA_f_32x4));
	const uint16x4_t narrowB_u_16x4 = vmovn_u32(vcvtq_u32_f32(sourceB_f_32x4));
	const uint16x4_t narrowC_u_16x4 = vmovn_u32(vcvtq_u32_f32(sourceC_f_32x4));
	const uint16x4_t narrowD_u_16x4 = vmovn_u32(vcvtq_u32_f32(sourceD_f_32x4));

	// uint16_t -> uint8_t
	const uint8x8_t front_u_8x8 = vmovn_u16(vcombine_u16(narrowA_u_16x4, narrowB_u_16x4));
	const uint8x8_t back_u_8x8 = vmovn_u16(vcombine_u16(narrowC_u_16x4, narrowD_u_16x4));

	return vcombine_u8(front_u_8x8, back_u_8x8);
}
1220 
/**
 * Casts 16 float elements from memory to 16 uint8_t elements.
 * In debug builds, every element is asserted to be within the range [0, 256).
 * @param source The 16 float elements to cast, must not be null
 * @return The 16 converted elements
 */
OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float* const source)
{
	ocean_assert(source != nullptr);

#ifdef OCEAN_DEBUG
	for (unsigned int index = 0u; index < 16u; ++index)
	{
		ocean_assert(source[index] >= 0.0f && source[index] < 256.0f);
	}
#endif

	const float32x4_t sourceA_f_32x4 = vld1q_f32(source + 0);
	const float32x4_t sourceB_f_32x4 = vld1q_f32(source + 4);
	const float32x4_t sourceC_f_32x4 = vld1q_f32(source + 8);
	const float32x4_t sourceD_f_32x4 = vld1q_f32(source + 12);

	return cast16ElementsNEON(sourceA_f_32x4, sourceB_f_32x4, sourceC_f_32x4, sourceD_f_32x4);
}
1234 
/**
 * Casts 16 uint8_t elements to 16 float elements.
 * @param source_u_8x16 The 16 elements to cast
 * @return The 16 converted elements, val[0] holds the elements 0..3, val[1] the elements 4..7, val[2] the elements 8..11, val[3] the elements 12..15
 */
OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8x16_t& source_u_8x16)
{
	// uint8_t -> uint16_t
	const uint16x8_t front_u_16x8 = vmovl_u8(vget_low_u8(source_u_8x16));
	const uint16x8_t back_u_16x8 = vmovl_u8(vget_high_u8(source_u_8x16));

	// uint16_t -> uint32_t -> float
	float32x4x4_t result_f_32x4x4;

	result_f_32x4x4.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(front_u_16x8)));
	result_f_32x4x4.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(front_u_16x8)));
	result_f_32x4x4.val[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(back_u_16x8)));
	result_f_32x4x4.val[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(back_u_16x8)));

	return result_f_32x4x4;
}
1253 
/**
 * Casts 16 uint8_t elements from memory to 16 float elements.
 * @param source The 16 elements to cast, 16 bytes are read, must not be null
 * @return The 16 converted elements
 */
OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8_t* const source)
{
	ocean_assert(source != nullptr);

	const uint8x16_t source_u_8x16 = vld1q_u8(source);

	return cast16ElementsNEON(source_u_8x16);
}
1260 
1261 inline unsigned int NEON::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
1262 {
1263  ocean_assert(pixel);
1264  ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
1265 
1266  return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
1267 }
1268 
1269 inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1270 {
1271  ocean_assert(pixel0 && pixel1);
1272 
1273  ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1274 
1275  return sqrDistance((unsigned int)*pixel0, interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1276 }
1277 
1278 inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1279 {
1280  ocean_assert(pixel0 && pixel1);
1281 
1282  ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
1283  ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1284 
1285  return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1286 }
1287 
1288 }
1289 
1290 }
1291 
1292 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1293 
1294 #endif // META_OCEAN_CV_NEON_H
This class implements computer vision functions using NEON extensions.
Definition: NEON.h:34
static uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 12 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:567
static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: NEON.h:821
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint8_t threshold=192u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition: NEON.h:805
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition: NEON.h:519
static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: NEON.h:760
static uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:677
static uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 10 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:622
static void average48Elements1Channel8Bit3x3Approximation(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition: NEON.h:988
static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: NEON.h:852
static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t &value)
Moves the high 8 bits of four 16 bit elements to the low 8 bits and fill the high bits with 0.
Definition: NEON.h:1110
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 8 foll...
Definition: NEON.h:1053
static uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute difference determination for 16 elements with 8 bit precision.
Definition: NEON.h:740
static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t &low, const uint32x4_t &high)
Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
Definition: NEON.h:1120
static void average24Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition: NEON.h:957
static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: NEON.h:785
static uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 13 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:578
static uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:556
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition: NEON.h:529
static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t &value)
Removes (sets to zero) the high 8 bits of four 16 bit elements.
Definition: NEON.h:1095
static uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 14 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:666
static OCEAN_FORCE_INLINE unsigned int sum32x4ByLanes(const uint32x4_t &value)
Sums the four 32 bit values and returns the result.
Definition: NEON.h:1085
static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t &value)
Moves the high 16 bits of four 32 bit elements to the low 16 bits and fill the high bits with 0.
Definition: NEON.h:1105
static uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 9 elements of a 16 elements buffer with 8 bit prec...
Definition: NEON.h:534
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition: NEON.h:1208
static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t &low, const uint16x8_t &high)
Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
Definition: NEON.h:1125
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition: NEON.h:1261
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition: NEON.h:524
static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short *const rowTop, const short *const rowCenter, const short *const rowBottom)
Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
Definition: NEON.h:1130
static uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 11 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:633
static uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 14 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:589
static uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 9 elements of a 16 elements buffer with 8 bit prec...
Definition: NEON.h:611
static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition: NEON.h:922
static uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 15 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:600
static uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 10 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:545
static uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:644
static uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition: NEON.h:688
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: NEON.h:514
static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition: NEON.h:877
static uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 8 elements with 8 bit precision.
Definition: NEON.h:712
static uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of a 16 elements buffer with 8 bit pre...
Definition: NEON.h:655
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition: NEON.h:1269
static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t &signReceiver, const int32x4_t &signProvider)
Copies the sign of a given value to another one.
Definition: NEON.h:1198
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 b...
Definition: NEON.h:1028
static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t &value_u_64x2, const uint32x2_t &value_u_32x2)
Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t value...
Definition: NEON.h:1181
static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t &value)
Removes (sets to zero) the high 16 bits of four 32 bit elements.
Definition: NEON.h:1090
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15