Ocean
Loading...
Searching...
No Matches
NEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_NEON_H
9#define META_OCEAN_CV_NEON_H
10
11#include "ocean/cv/CV.h"
12
14
15#include "ocean/math/Math.h"
16
17#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
18
19#if defined(__ARM_NEON__) || defined(__ARM_NEON)
20 #include <arm_neon.h>
21#endif // __ARM_NEON__
22
23namespace Ocean
24{
25
26namespace CV
27{
28
29/**
30 * This class implements computer vision functions using NEON extensions.
31 * @ingroup cv
32 */
33class NEON
34{
35 public:
36
37 /**
38 * Prefetches a block of temporal memory into all cache levels.
39 * @param data Data to be prefetched
40 */
41 static inline void prefetchT0(const void* const data);
42
43 /**
44 * Prefetches a block of temporal memory in all cache levels except 0th cache level.
45 * @param data Data to be prefetched
46 */
47 static inline void prefetchT1(const void* const data);
48
49 /**
50 * Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
51 * @param data Data to be prefetched
52 */
53 static inline void prefetchT2(const void* const data);
54
55 /**
56 * Prefetches a block of non-temporal memory into non-temporal cache structure.
57 * @param data Data to be prefetched
58 */
59 static inline void prefetchNTA(const void* const data);
60
61 /**
62 * Sum square differences determination for the last 9 elements of a 16 elements buffer with 8 bit precision.
63 * @param image0 First buffer of 16 elements, the last 9 elements are used to determine the ssd, may be non aligned
64 * @param image1 Second buffer of 16 elements, the last 9 elements are used to determine the ssd, may be non aligned
65 * @return SSD result distributed over four terms of the sum
66 */
67 static inline uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1);
68
69 /**
70 * Sum square differences determination for the last 10 elements of a 16 elements buffer with 8 bit precision.
71 * @param image0 First buffer of 16 elements, the last 10 elements are used to determine the ssd, may be non aligned
72 * @param image1 Second buffer of 16 elements, the last 10 elements are used to determine the ssd, may be non aligned
73 * @return SSD result distributed over four terms of the sum
74 */
75 static inline uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1);
76
77 /**
78 * Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
79 * @param image0 First buffer of 16 elements, the last 11 elements are used to determine the ssd, may be non aligned
80 * @param image1 Second buffer of 16 elements, the last 11 elements are used to determine the ssd, may be non aligned
81 * @return SSD result distributed over four terms of the sum
82 */
83 static inline uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
84
85 /**
86 * Sum square differences determination for the last 12 elements of a 16 elements buffer with 8 bit precision.
87 * @param image0 First buffer of 16 elements, the last 12 elements are used to determine the ssd, may be non aligned
88 * @param image1 Second buffer of 16 elements, the last 12 elements are used to determine the ssd, may be non aligned
89 * @return SSD result distributed over four terms of the sum
90 */
91 static inline uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
92
93 /**
94 * Sum square differences determination for the last 13 elements of a 16 elements buffer with 8 bit precision.
95 * @param image0 First buffer of 16 elements, the last 13 elements are used to determine the ssd, may be non aligned
96 * @param image1 Second buffer of 16 elements, the last 13 elements are used to determine the ssd, may be non aligned
97 * @return SSD result distributed over four terms of the sum
98 */
99 static inline uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
100
101 /**
102 * Sum square differences determination for the last 14 elements of a 16 elements buffer with 8 bit precision.
103 * @param image0 First buffer of 16 elements, the last 14 elements are used to determine the ssd, may be non aligned
104 * @param image1 Second buffer of 16 elements, the last 14 elements are used to determine the ssd, may be non aligned
105 * @return SSD result distributed over four terms of the sum
106 */
107 static inline uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1);
108
109 /**
110 * Sum square differences determination for the last 15 elements of a 16 elements buffer with 8 bit precision.
111 * @param image0 First buffer of 16 elements, the last 15 elements are used to determine the ssd, may be non aligned
112 * @param image1 Second buffer of 16 elements, the last 15 elements are used to determine the ssd, may be non aligned
113 * @return SSD result distributed over four terms of the sum
114 */
115 static inline uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1);
116
117 /**
118 * Sum square difference determination for the first 9 elements of a 16 elements buffer with 8 bit precision.
119 * @param image0 First buffer of 16 elements, the first 9 elements are used to determine the ssd, may be non aligned
120 * @param image1 Second buffer of 16 elements, the first 9 elements are used to determine the ssd, may be non aligned
121 * @return SSD result distributed over four terms of the sum
122 */
123 static inline uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1);
124
125 /**
126 * Sum square difference determination for the first 10 elements of a 16 elements buffer with 8 bit precision.
127 * @param image0 First buffer of 16 elements, the first 10 elements are used to determine the ssd, may be non aligned
128 * @param image1 Second buffer of 16 elements, the first 10 elements are used to determine the ssd, may be non aligned
129 * @return SSD result distributed over four terms of the sum
130 */
131 static inline uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
132
133 /**
134 * Sum square difference determination for the first 11 elements of a 16 elements buffer with 8 bit precision.
135 * @param image0 First buffer of 16 elements, the first 11 elements are used to determine the ssd, may be non aligned
136 * @param image1 Second buffer of 16 elements, the first 11 elements are used to determine the ssd, may be non aligned
137 * @return SSD result distributed over four terms of the sum
138 */
139 static inline uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1);
140
141 /**
142 * Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit precision.
143 * @param image0 First buffer of 16 elements, the first 12 elements are used to determine the ssd, may be non aligned
144 * @param image1 Second buffer of 16 elements, the first 12 elements are used to determine the ssd, may be non aligned
145 * @return SSD result distributed over four terms of the sum
146 */
147 static inline uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
148
149 /**
150 * Sum square difference determination for the first 13 elements of a 16 elements buffer with 8 bit precision.
151 * @param image0 First buffer of 16 elements, the first 13 elements are used to determine the ssd, may be non aligned
152 * @param image1 Second buffer of 16 elements, the first 13 elements are used to determine the ssd, may be non aligned
153 * @return SSD result distributed over four terms of the sum
154 */
155 static inline uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
156
157 /**
158 * Sum square difference determination for the first 14 elements of a 16 elements buffer with 8 bit precision.
159 * @param image0 First buffer of 16 elements, the first 14 elements are used to determine the ssd, may be non aligned
160 * @param image1 Second buffer of 16 elements, the first 14 elements are used to determine the ssd, may be non aligned
161 * @return SSD result distributed over four terms of the sum
162 */
163 static inline uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1);
164
165 /**
166 * Sum square difference determination for the first 15 elements of a 16 elements buffer with 8 bit precision.
167 * @param image0 First buffer of 16 elements, the first 15 elements are used to determine the ssd, may be non aligned
168 * @param image1 Second buffer of 16 elements, the first 15 elements are used to determine the ssd, may be non aligned
169 * @return SSD result distributed over four terms of the sum
170 */
171 static inline uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
172
173 /**
174 * Sum square difference determination for 16 elements with 8 bit precision.
175 * @param image0 First 16 elements to determine the ssd for, may be non aligned
176 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
177 * @return SSD result distributed over four terms of the sum
178 */
179 static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
180
181 /**
182 * Sum square difference determination for 16 elements with 8 bit precision.
183 * @param row0 First 16 elements to determine the ssd for
184 * @param row1 Second 16 elements to determine the ssd for
185 * @return SSD result distributed over four terms of the sum
186 */
187 static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
188
189 /**
190 * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
191 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
192 * @param row0 First row of 16 elements (16 pixels), must be valid
193 * @param row1 Second row of 16 elements (16 pixels), must be valid
194 * @param result Resulting 8 average elements (8 pixels), must be valid
195 */
196 static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
197
198 /**
199 * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
200 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).
201 * @param row0 First row of 32 elements (32 pixels), must be valid
202 * @param row1 Second row of 32 elements (32 pixels), must be valid
203 * @param result Resulting 16 average elements (16 pixels), must be valid
204 */
205 static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
206
207 /**
208 * Averages 16 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
209 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
210 * @param image0 First row of 16 elements
211 * @param image1 Second row of 16 elements
212 * @param result Resulting 8 average elements
213 * @param threshold Minimal threshold to result in a pixel with value 255
214 */
215 static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold = 192u);
216
217 /**
218 * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
219 * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).
220 * @param row0 First row of 32 elements (16 pixels), must be valid
221 * @param row1 Second row of 32 elements (16 pixels), must be valid
222 * @param result Resulting 16 average elements (8 pixels), must be valid
223 */
224 static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
225
226 /**
227 * Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
228 * The function takes two rows of 64 elements and returns 32 average elements (16 averaged pixels, each with 2 channels).
229 * @param row0 First row of 64 elements (32 pixels), must be valid
230 * @param row1 Second row of 64 elements (32 pixels), must be valid
231 * @param result Resulting 32 average elements (16 pixels), must be valid
232 */
233 static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
234
235 /**
236 * Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
237 * The function takes two rows of 48 elements and returns 24 average elements (8 averaged pixels, each with 3 channels).
238 * @param row0 First row of 48 elements (16 pixels), must be valid
239 * @param row1 Second row of 48 elements (16 pixels), must be valid
240 * @param result Resulting 24 average elements (8 pixels), must be valid
241 */
242 static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
243
244 /**
245 * Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
246 * The function takes two rows of 64 elements and returns 32 average elements (8 averaged pixels, each with 4 channels).
247 * @param row0 First row of 64 elements (16 pixels), must be valid
248 * @param row1 Second row of 64 elements (16 pixels), must be valid
249 * @param result Resulting 32 average elements (8 pixels), must be valid
250 */
251 static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
252
253 /**
254 * Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
255 * The function takes three rows of 24 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
256 * @param image0 First row of 24 elements
257 * @param image1 Second row of 24 elements
258 * @param image2 Third row of 24 elements
259 * @param result Resulting 8 average elements
260 */
261 static inline void average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
262
263 /**
264 * Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
265 * The function takes three rows of 48 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).<br>
266 * Beware: This function calculates an approximation only.
267 * @param image0 First row of 48 elements
268 * @param image1 Second row of 48 elements
269 * @param image2 Third row of 48 elements
270 * @param result Resulting 16 average elements
271 */
272 static inline void average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
273
274 /**
275 * Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 bit frame.
276 * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
277 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
278 * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
279 * @param width The width of the original frame in pixel, with range [10, infinity)
280 */
281 static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
282
283 /**
284 * Determines the squared horizontal and vertical gradients and the product of both gradients for 8 following pixels for a given 1 channel 8 bit frame.
285 * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
286 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
287 * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
288 * @param width The width of the original frame in pixel, with range [10, infinity)
289 */
290 static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
291
292 /**
293 * Sum square difference determination for 8 elements with 8 bit precision.
294 * @param image0 First 8 elements to determine the ssd for, may be non aligned
295 * @param image1 Second 8 elements to determine the ssd for, may be non aligned
296 * @return SSD result distributed over four terms of the sum
297 */
298 static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1);
299
300 /**
301 * Sum square difference determination for 8 elements with 8 bit precision.
302 * @param row0 First 8 elements to determine the ssd for
303 * @param row1 Second 8 elements to determine the ssd for
304 * @return SSD result distributed over four terms of the sum
305 */
306 static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1);
307
308 /**
309 * Sum absolute difference determination for 16 elements with 8 bit precision.
310 * @param image0 First 16 elements to determine the sad for, may be non aligned
311 * @param image1 Second 16 elements to determine the sad for, may be non aligned
312 * @return SAD result distributed over four terms of the sum
313 */
314 static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
315
316 /**
317 * Sum absolute difference determination for 16 elements with 8 bit precision.
318 * @param row0 First 16 elements to determine the sad for
319 * @param row1 Second 16 elements to determine the sad for
320 * @return SAD result distributed over four terms of the sum
321 */
322 static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
323
324 /**
325 * Horizontally sums the four 32 bit values and returns the result.
326 * @param value The value holding the four 32 bit values
327 * @return The resulting sum
328 */
329 static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t& value);
330
331 /**
332 * Removes (sets to zero) the high 16 bits of four 32 bit elements.
333 * Given: PONM-LKJI-HGFE-DCBA<br>
334 * Result: 00NM-00JI-00FE-00BA
335 * @param value The value to remove the high bits for
336 * @return Result
337 */
338 static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t& value);
339
340 /**
341 * Removes (sets to zero) the high 8 bits of four 16 bit elements.
342 * Given: HGFE-DCBA<br>
343 * Result: 0G0E-0C0A
344 * @param value The value to remove the high bits for
345 * @return Result
346 */
347 static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t& value);
348
349 /**
350 * Removes (sets to zero) the high 8 bits of eight 16 bit elements.
351 * Given: PONM-LKJI-HGFE-DCBA<br>
352 * Result: 0O0M-0K0I-0G0E-0C0A
353 * @param value The value to remove the high bits for
354 * @return Result
355 */
356 static OCEAN_FORCE_INLINE uint16x8_t removeHighBits16_8(const uint16x8_t& value);
357
358 /**
359 * Moves the high 16 bits of four 32 bit elements to the low 16 bits and fills the high bits with 0.
360 * Given: PONM-LKJI-HGFE-DCBA<br>
361 * Result: 00PO-00LK-00HG-00DC
362 * @param value The value to move the high bits for
363 * @return Result
364 */
365 static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t& value);
366
367 /**
368 * Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
369 * Given: HGFE-DCBA<br>
370 * Result: 0H0F-0D0B
371 * @param value The value to move the high bits for
372 * @return Result
373 */
374 static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t& value);
375
376 /**
377 * Moves the high 8 bits of eight 16 bit elements to the low 8 bits and fills the high bits with 0.
378 * Given: PONM-LKJI-HGFE-DCBA<br>
379 * Result: 0P0N-0L0J-0H0F-0D0B
380 * @param value The value to move the high bits for
381 * @return Result
382 */
383 static OCEAN_FORCE_INLINE uint16x8_t moveHighBits16_8(const uint16x8_t& value);
384
385 /**
386 * Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
387 * Further, the combination is done with saturation (the 32 bit values will be clamped to 16 bit values before the combination is done).
388 * Given: 00DD-00CC-00BB-00AA (low)<br>
389 * Given: 00HH-00GG-00FF-00EE (high)<br>
390 * Result: HH-GG-FF-EE-DD-CC-BB-AA
391 * @param low The 128 bit register with the (resulting) lower 16 bit values
392 * @param high The 128 bit register with the (resulting) higher 16 bit values
393 * @return The resulting 128 bit register with 16 bit values
394 */
395 static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high);
396
397 /**
398 * Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
399 * Further, the combination is done with saturation (the 16 bit values will be clamped to 8 bit values before the combination is done).
400 * Given: 0H0G-0F0E-0D0C-0B0A (low)<br>
401 * Given: 0P0O-0N0M-0L0K-0J0I (high)<br>
402 * Result: P-O-N-M-L-K-J-I-H-G-F-E-D-C-B-A
403 * @param low The 128 bit register with the (resulting) lower 8 bit values
404 * @param high The 128 bit register with the (resulting) higher 8 bit values
405 * @return The resulting 128 bit register with 8 bit values
406 */
407 static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high);
408
409 /**
410 * Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
411 * @param rowTop The top row containing 6 short values, must be valid
412 * @param rowCenter The center row containing 6 short values, must be valid
413 * @param rowBottom The bottom row containing 6 short values, must be valid
414 * @return The resulting four sums of the four 3x3 blocks
415 */
416 static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom);
417
418 /**
419 * Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t values.
420 * This function does not check whether the multiplication results in an overflow.
421 * @param value_u_64x2 The uint64x2_t value to multiply
422 * @param value_u_32x2 The uint32x2_t value to multiply
423 * @return The resulting multiplication result
424 */
425 static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2);
426
427 /**
428 * Copies the sign of a given value to another one.
429 * @param signReceiver First value receiving the sign from the second value
430 * @param signProvider Second value providing the sign for the first one
431 * @return First value with the sign of the second one
432 */
433 static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t& signReceiver, const int32x4_t& signProvider);
434
435 /**
436 * Casts 16 float elements to 16 uint8_t elements.
437 * @param sourceA_f_32x4 The first 4 float elements
438 * @param sourceB_f_32x4 The second 4 float elements
439 * @param sourceC_f_32x4 The third 4 float elements
440 * @param sourceD_f_32x4 The fourth 4 float elements
441 * @return The resulting 16 uint8_t elements
442 */
443 static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4);
444
445 /**
446 * Casts 16 float elements to 16 uint8_t elements.
447 * @param source The 16 float elements, must be valid
448 * @return The resulting 16 uint8_t elements
449 */
450 static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float* const source);
451
452 /**
453 * Casts 16 uint8_t elements to 16 float elements.
454 * @param source_u_8x16 The 16 uint8_t elements, must be valid
455 * @return The resulting 16 float elements
456 */
457 static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8x16_t& source_u_8x16);
458
459 /**
460 * Casts 16 uint8_t elements to 16 float elements.
461 * @param source The 16 uint8_t elements, must be valid
462 * @return The resulting 16 float elements
463 */
464 static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8_t* const source);
465
466 private:
467
468 /**
469 * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
470 * @param pixel Upper left pixel in the frame
471 * @param size Size of one frame row in bytes
472 * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
473 * @param fxy_ Product of the fx and the inverse fy interpolation factor
474 * @param fx_y Product of the inverse fx and the fy interpolation factor
475 * @param fxy Product of the fx and the fy interpolation factor
476 * @return Interpolated pixel values
477 */
478 static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
479
480 /**
481 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
482 * @param pixel0 Upper left pixel in the first frame
483 * @param pixel1 Upper left pixel in the second frame
484 * @param size0 Size of one row of the first frame in bytes
485 * @param size1 Size of one row of the second frame in bytes
486 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
487 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
488 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
489 * @param f1xy Product of the fx and the fy interpolation factor for the second image
490 * @return Interpolated sum of square difference
491 */
492 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
493
494 /**
495 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
496 * @param pixel0 Upper left pixel in the first frame
497 * @param pixel1 Upper left pixel in the second frame
498 * @param size0 Size of one row of the first frame in bytes
499 * @param size1 Size of one row of the second frame in bytes
500 * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
501 * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
502 * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
503 * @param f0xy Product of the fx and the fy interpolation factor for the first image
504 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
505 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
506 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
507 * @param f1xy Product of the fx and the fy interpolation factor for the second image
508 * @return Interpolated sum of square difference
509 */
510 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
511};
512
513inline void NEON::prefetchT0(const void* const data)
514{
515 __builtin_prefetch(data, 0, 0);
516}
517
518inline void NEON::prefetchT1(const void* const data)
519{
520 __builtin_prefetch(data, 0, 1);
521}
522
523inline void NEON::prefetchT2(const void* const data)
524{
525 __builtin_prefetch(data, 0, 2);
526}
527
528inline void NEON::prefetchNTA(const void* const data)
529{
530 __builtin_prefetch(data, 0, 3);
531}
532
533inline uint32x4_t NEON::sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1)
534{
535 ocean_assert(image0 && image1);
536
537 const uint8x16_t row0 = vld1q_u8(image0);
538 const uint8x16_t row1 = vld1q_u8(image1);
539
540 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFF00000000000000ull), vdup_n_u8(0xFFu));
541 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
542}
543
544inline uint32x4_t NEON::sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1)
545{
546 ocean_assert(image0 && image1);
547
548 const uint8x16_t row0 = vld1q_u8(image0);
549 const uint8x16_t row1 = vld1q_u8(image1);
550
551 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFF000000000000ull), vdup_n_u8(0xFFu));
552 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
553}
554
555inline uint32x4_t NEON::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
556{
557 ocean_assert(image0 && image1);
558
559 const uint8x16_t row0 = vld1q_u8(image0);
560 const uint8x16_t row1 = vld1q_u8(image1);
561
562 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFF0000000000ull), vdup_n_u8(0xFFu));
563 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
564}
565
566inline uint32x4_t NEON::sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
567{
568 ocean_assert(image0 && image1);
569
570 const uint8x16_t row0 = vld1q_u8(image0);
571 const uint8x16_t row1 = vld1q_u8(image1);
572
573 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFF00000000ull), vdup_n_u8(0xFFu));
574 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
575}
576
577inline uint32x4_t NEON::sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
578{
579 ocean_assert(image0 && image1);
580
581 const uint8x16_t row0 = vld1q_u8(image0);
582 const uint8x16_t row1 = vld1q_u8(image1);
583
584 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFF000000ull), vdup_n_u8(0xFFu));
585 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
586}
587
588inline uint32x4_t NEON::sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1)
589{
590 ocean_assert(image0 && image1);
591
592 const uint8x16_t row0 = vld1q_u8(image0);
593 const uint8x16_t row1 = vld1q_u8(image1);
594
595 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFF0000ull), vdup_n_u8(0xFFu));
596 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
597}
598
599inline uint32x4_t NEON::sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1)
600{
601 ocean_assert(image0 && image1);
602
603 const uint8x16_t row0 = vld1q_u8(image0);
604 const uint8x16_t row1 = vld1q_u8(image1);
605
606 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFFFF00ull), vdup_n_u8(0xFFu));
607 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
608}
609
610inline uint32x4_t NEON::sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1)
611{
612 ocean_assert(image0 && image1);
613
614 const uint8x16_t row0 = vld1q_u8(image0);
615 const uint8x16_t row1 = vld1q_u8(image1);
616
617 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000000000FFull));
618 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
619}
620
621inline uint32x4_t NEON::sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
622{
623 ocean_assert(image0 && image1);
624
625 const uint8x16_t row0 = vld1q_u8(image0);
626 const uint8x16_t row1 = vld1q_u8(image1);
627
628 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000000000FFFFull));
629 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
630}
631
632inline uint32x4_t NEON::sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1)
633{
634 ocean_assert(image0 && image1);
635
636 const uint8x16_t row0 = vld1q_u8(image0);
637 const uint8x16_t row1 = vld1q_u8(image1);
638
639 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000000000FFFFFFull));
640 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
641}
642
643inline uint32x4_t NEON::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
644{
645 ocean_assert(image0 && image1);
646
647 const uint8x16_t row0 = vld1q_u8(image0);
648 const uint8x16_t row1 = vld1q_u8(image1);
649
650 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000FFFFFFFFull));
651 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
652}
653
654inline uint32x4_t NEON::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
655{
656 ocean_assert(image0 && image1);
657
658 const uint8x16_t row0 = vld1q_u8(image0);
659 const uint8x16_t row1 = vld1q_u8(image1);
660
661 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000FFFFFFFFFFull));
662 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
663}
664
665inline uint32x4_t NEON::sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1)
666{
667 ocean_assert(image0 && image1);
668
669 const uint8x16_t row0 = vld1q_u8(image0);
670 const uint8x16_t row1 = vld1q_u8(image1);
671
672 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000FFFFFFFFFFFFull));
673 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
674}
675
676inline uint32x4_t NEON::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
677{
678 ocean_assert(image0 && image1);
679
680 const uint8x16_t row0 = vld1q_u8(image0);
681 const uint8x16_t row1 = vld1q_u8(image1);
682
683 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00FFFFFFFFFFFFFFull));
684 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
685}
686
687inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
688{
689 ocean_assert(image0 && image1);
690
691 uint8x16_t row0 = vld1q_u8(image0);
692 uint8x16_t row1 = vld1q_u8(image1);
693
694 return sumSquareDifference8Bit16Elements(row0, row1);
695}
696
697inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
698{
699 // Absolute difference between the arguments
700 uint8x16_t subtract = vabdq_u8(row0, row1);
701
702 uint8x8_t subtractLow = vget_low_u8(subtract);
703 uint8x8_t subtractHigh = vget_high_u8(subtract);
704
705 uint16x8_t squareLow = vmull_u8(subtractLow, subtractLow);
706 uint16x8_t squareHigh = vmull_u8(subtractHigh, subtractHigh);
707
708 return vaddq_u32(vaddl_u16(vget_low_u16(squareLow), vget_low_u16(squareHigh)), vaddl_u16(vget_high_u16(squareLow), vget_high_u16(squareHigh)));
709}
710
711inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1)
712{
713 ocean_assert(image0 && image1);
714
715 const uint8x8_t row0 = vld1_u8(image0);
716 const uint8x8_t row1 = vld1_u8(image1);
717
718 return sumSquareDifference8Bit8Elements(row0, row1);
719}
720
721inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1)
722{
723 // subtract the 8 elements (usage of saturation and bitwise or operator)
724 const uint8x8_t subtract = vorr_u8(vqsub_u8(row0, row1), vqsub_u8(row1, row0));
725
726 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
727 const uint16x4_t subtractLow = removeHighBits16_8(vreinterpret_u16_u8(subtract));
728 const uint16x4_t subtractHigh = moveHighBits16_8(vreinterpret_u16_u8(subtract));
729
730 const uint16x8_t subtractCombined = vcombine_u16(subtractLow, subtractHigh);
731
732 // square the 16 elements
733 const uint16x8_t square = vmulq_u16(subtractCombined, subtractCombined);
734
735 // summing the 8 elements of 16 bit values
736 return vaddq_u32(removeHighBits32_16(vreinterpretq_u32_u16(square)), moveHighBits32_16(vreinterpretq_u32_u16(square)));
737}
738
739inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
740{
741 ocean_assert(image0 && image1);
742
743 uint8x16_t row0 = vld1q_u8(image0);
744 uint8x16_t row1 = vld1q_u8(image1);
745
746 return sumAbsoluteDifference8Bit16Elements(row0, row1);
747}
748
749inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
750{
751 // subtract the 16 elements (usage of saturation and bitwise or operator)
752 uint8x16_t subtract = vabdq_u8(row0, row1);
753
754 uint16x8_t add16 = vaddl_u8(vget_low_u8(subtract), vget_high_u8(subtract));
755
756 return vaddl_u16(vget_low_u16(add16), vget_high_u16(add16));
757}
758
759OCEAN_FORCE_INLINE void NEON::average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
760{
761 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
762
763 // we load 16 successive pixels (= 1 * 16 = 16 values)
764
765 const uint8x16_t m128_row0 = vld1q_u8(row0);
766 const uint8x16_t m128_row1 = vld1q_u8(row1);
767
768 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
769 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
770
771 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
772 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
773
774 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
775 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
776
777 const uint8x8_t average = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m128_row0, m128_row1)), 1);
778
779 // we write back the results
780
781 vst1_u8(result, average);
782}
783
784OCEAN_FORCE_INLINE void NEON::average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
785{
786 // @see average16Elements1Channel8Bit2x2() for a detailed documentation
787
788 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
789
790 const uint8x16_t row0A_u_8x16 = vld1q_u8(row0 + 0);
791 const uint8x16_t row0B_u_8x16 = vld1q_u8(row0 + 16);
792
793 const uint8x16_t row1A_u_8x16 = vld1q_u8(row1 + 0);
794 const uint8x16_t row1B_u_8x16 = vld1q_u8(row1 + 16);
795
796 const uint8x8_t averageA_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16, row1A_u_8x16)), 1);
797 const uint8x8_t averageB_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16, row1B_u_8x16)), 1);
798
799 const uint8x16_t average_u_8x16 = vcombine_u8(averageA_u_8x8, averageB_u_8x8);
800
801 vst1q_u8(result, average_u_8x16);
802}
803
804inline void NEON::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold)
805{
806 ocean_assert(image0 && image1 && result);
807
808 const uint8x16_t row0 = vld1q_u8(image0);
809 const uint8x16_t row1 = vld1q_u8(image1);
810
811 // calculate normal average
812 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vpaddlq_u8(vhaddq_u8(row0, row1)), 1));
813
814 // thresholding
815 const uint8x8_t thresholded = vcge_u8(average, vmov_n_u8(threshold));
816
817 vst1_u8(result, thresholded);
818}
819
820OCEAN_FORCE_INLINE void NEON::average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
821{
822 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
823
824 // we load 16 successive pixels (= 2 * 16 = 32 values) and directly deinterleave the 2 channels
825 // from YA YA YA YA ... so that we receive the following patterns:
826 // m2_128_row0.val[0]: Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y
827 // m2_128_row0.val[1]: A A A A A A A A A A A A A A A A
828
829 const uint8x16x2_t m2_128_row0 = vld2q_u8(row0);
830 const uint8x16x2_t m2_128_row1 = vld2q_u8(row1);
831
832 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
833 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
834
835 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
836 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
837
838 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
839 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
840
841 uint8x8x2_t average;
842
843 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[0], m2_128_row1.val[0])), 1);
844 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[1], m2_128_row1.val[1])), 1);
845
846 // we write back the results, this time we interleave the results again
847
848 vst2_u8(result, average);
849}
850
851OCEAN_FORCE_INLINE void NEON::average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
852{
853 // @see average32Elements2Channel16Bit2x2() for a detailed documentation
854
855 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
856
857 const uint8x16x2_t row0A_u_8x16x2 = vld2q_u8(row0 + 0);
858 const uint8x16x2_t row0B_u_8x16x2 = vld2q_u8(row0 + 32);
859
860 const uint8x16x2_t row1A_u_8x16x2 = vld2q_u8(row1 + 0);
861 const uint8x16x2_t row1B_u_8x16x2 = vld2q_u8(row1 + 32);
862
863 const uint8x8_t averageChannel0A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[0], row1A_u_8x16x2.val[0])), 1);
864 const uint8x8_t averageChannel1A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[1], row1A_u_8x16x2.val[1])), 1);
865 const uint8x8_t averageChannel0B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[0], row1B_u_8x16x2.val[0])), 1);
866 const uint8x8_t averageChannel1B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[1], row1B_u_8x16x2.val[1])), 1);
867
868 uint8x16x2_t average_u_8x16x2;
869
870 average_u_8x16x2.val[0] = vcombine_u8(averageChannel0A_u_8x8, averageChannel0B_u_8x8);
871 average_u_8x16x2.val[1] = vcombine_u8(averageChannel1A_u_8x8, averageChannel1B_u_8x8);
872
873 vst2q_u8(result, average_u_8x16x2);
874}
875
876OCEAN_FORCE_INLINE void NEON::average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
877{
878 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
879
880 // we load 16 successive pixels (= 3 * 16 = 48 values) and directly deinterleave the 3 channels
881 // from RGB RGB RGB RGB ... so that we receive the following patterns:
882 // m3_128_row0.val[0]: R R R R R R R R R R R R R R R R
883 // m3_128_row0.val[1]: G G G G G G G G G G G G G G G G
884 // m3_128_row0.val[2]: B B B B B B B B B B B B B B B B
885
886 const uint8x16x3_t m3_128_row0 = vld3q_u8(row0);
887 const uint8x16x3_t m3_128_row1 = vld3q_u8(row1);
888
889 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
890 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
891
892 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
893 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
894
895 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
896 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
897
898 uint8x8x3_t average;
899
900 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[0], m3_128_row1.val[0])), 1);
901 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[1], m3_128_row1.val[1])), 1);
902 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[2], m3_128_row1.val[2])), 1);
903
904 // we write back the results, this time we interleave the results again
905
906 vst3_u8(result, average);
907
908 /* the following code would provide a more precise rounding
909 uint16x8_t zero4 = vmovq_n_u16(0x0002u);
910
911 uint16x8_t redTmp = vpadalq_u8(zero4, row0.val[0]);
912 average.val[0] = vmovn_u16(vshrq_n_u16(vpadalq_u8(redTmp, row1.val[0]), 2));
913
914 uint16x8_t greenTmp = vpadalq_u8(zero4, row0.val[1]);
915 average.val[1] = vmovn_u16(vshrq_n_u16(vpadalq_u8(greenTmp, row1.val[1]), 2));
916
917 uint16x8_t blueTmp = vpadalq_u8(zero4, row0.val[2]);
918 average.val[2] = vmovn_u16(vshrq_n_u16(vpadalq_u8(blueTmp, row1.val[2]), 2));*/
919}
920
921OCEAN_FORCE_INLINE void NEON::average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
922{
923 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
924
925 // we load 16 successive pixels (= 4 * 16 = 64 values) and directly deinterleave the 4 channels
926 // from RGBA RGBA RGBA RGBA ... so that we receive the following patterns:
927 // m4_128_row0.val[0]: R R R R R R R R R R R R R R R R
928 // m4_128_row0.val[1]: G G G G G G G G G G G G G G G G
929 // m4_128_row0.val[2]: B B B B B B B B B B B B B B B B
930 // m4_128_row0.val[3]: A A A A A A A A A A A A A A A A
931
932 const uint8x16x4_t m4_128_row0 = vld4q_u8(row0);
933 const uint8x16x4_t m4_128_row1 = vld4q_u8(row1);
934
935 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
936 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
937
938 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
939 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
940
941 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
942 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
943
944 uint8x8x4_t average;
945
946 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[0], m4_128_row1.val[0])), 1);
947 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[1], m4_128_row1.val[1])), 1);
948 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[2], m4_128_row1.val[2])), 1);
949 average.val[3] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[3], m4_128_row1.val[3])), 1);
950
951 // we write back the results, this time we interleave the results again
952
953 vst4_u8(result, average);
954}
955
956inline void NEON::average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
957{
958 ocean_assert(image0 && image1 && image2 && result);
959
960 /**
961 * | 1 2 1 |
962 * 1/16 | 2 4 2 |
963 * | 1 2 1 |
964 */
965
966 // load 3 * 8 uchars
967 uint8x8x3_t row0 = vld3_u8(image0);
968 uint8x8x3_t row1 = vld3_u8(image1);
969 uint8x8x3_t row2 = vld3_u8(image2);
970
971 uint16x8x3_t sumPerRow;
972
973 // create sum across rows, middle row is summed twice
974 sumPerRow.val[0] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[0]), vmovl_u8(row2.val[0])), vshlq_n_u16(vmovl_u8(row1.val[0]), 1));
975 sumPerRow.val[1] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[1]), vmovl_u8(row2.val[1])), vshlq_n_u16(vmovl_u8(row1.val[1]), 1));
976 sumPerRow.val[2] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[2]), vmovl_u8(row2.val[2])), vshlq_n_u16(vmovl_u8(row1.val[2]), 1));
977
978 // create sum across neighbouring pixels, second element within trio is summed twice
979 const uint16x8_t sum = vaddq_u16(vaddq_u16(sumPerRow.val[0], sumPerRow.val[2]), vshlq_n_u16(sumPerRow.val[1], 1));
980
981 // calculate the average: (sum + 8u) >> 4
982 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vaddq_u16(sum, vmovq_n_u16(8u)), 4));
983
984 vst1_u8(result, average);
985}
986
987inline void NEON::average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
988{
989 ocean_assert(image0 && image1 && image2 && result);
990
991 /**
992 * | 1 2 1 |
993 * 1/16 | 2 4 2 |
994 * | 1 2 1 |
995 */
996
997 // load 3 * 16 uchars and de-interleave triples:
998 //
999 // row0: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 ... A44 A45 A46 A47
1000 // row1: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 ... B44 B45 B46 B47
1001 // row2: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 ... C44 C45 C46 C47
1002
1003 uint8x16x3_t row0 = vld3q_u8(image0);
1004 uint8x16x3_t row1 = vld3q_u8(image1);
1005 uint8x16x3_t row2 = vld3q_u8(image2);
1006
1007 // now de-interleaved:
1008 //
1009 // val[0] val[1] valu[2]
1010 // row0: A0 A3 A6 A9 ... A45 A1 A4 A7 A10 ... A46 A2 A5 A8 A11 ... A47
1011 // row1: B0 B3 B6 B9 ... B45 B1 B4 B7 B10 ... B46 B2 B5 B8 B11 ... B47
1012 // row2: C0 C3 C6 C9 ... C45 C1 C4 C7 C10 ... C46 C2 C5 C8 C11 ... C47
1013
1014 // now we need to 'multiply' row1 by 2 and val[1] by 2, we solve this by creating the average of the first and second row followed by the average with the middle row
1015
1016 uint8x16x3_t averagePerRow;
1017 averagePerRow.val[0] = vhaddq_u8(vhaddq_u8(row0.val[0], row2.val[0]), row1.val[0]);
1018 averagePerRow.val[1] = vhaddq_u8(vhaddq_u8(row0.val[1], row2.val[1]), row1.val[1]);
1019 averagePerRow.val[2] = vhaddq_u8(vhaddq_u8(row0.val[2], row2.val[2]), row1.val[2]);
1020
1021 // we apply the same idea as bevore in vertical direction
1022 const uint8x16_t average = vhaddq_u8(vhaddq_u8(averagePerRow.val[0], averagePerRow.val[2]), averagePerRow.val[1]);
1023
1024 vst1q_u8(result, average);
1025}
1026
1027inline void NEON::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
1028{
1029 ocean_assert(source && response && width >= 10u);
1030
1031 // we load the left 8 unsigned 8bit elements and store them has signed 16bit values
1032 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1033 // we load the right 8 unsigned 8bit elements and store them has signed 16bit values
1034 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1035
1036 // we load the top 8 unsigned 8bit elements and store them has signed 16bit values
1037 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1038 // we load the bottom 8 unsigned 8bit elements and store them has signed 16bit values
1039 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1040
1041 int8x8x2_t result;
1042
1043 // we subtract the horizontal values (right - left), and divide the result by 2, and narrow the results to 8 bit values
1044 result.val[0] = vmovn_s16(vhsubq_s16(horizontalPlus, horizontalMinus));
1045 // we subtract the vertical values (right - left), and divide the result by 2, and narrow the results to 8 bit values
1046 result.val[1] = vmovn_s16(vhsubq_s16(verticalPlus, verticalMinus));
1047
1048 // we store the determined results interleaved
1049 vst2_s8((int8_t*)response, result);
1050}
1051
1052inline void NEON::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
1053{
1054 ocean_assert(source && response && width >= 10u);
1055
1056 // we load the left 8 unsigned 8bit elements and store them has signed 16bit values
1057 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1058 // we load the right 8 unsigned 8bit elements and store them has signed 16bit values
1059 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1060
1061 // we load the top 8 unsigned 8bit elements and store them has signed 16bit values
1062 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1063 // we load the bottom 8 unsigned 8bit elements and store them has signed 16bit values
1064 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1065
1066 // we subtract the horizontal values (right - left) and divide the result by 2
1067 int16x8_t horizontal = vhsubq_s16(horizontalPlus, horizontalMinus);
1068 // we subtract the vertical values (bottom - top) and divide the result by 2
1069 int16x8_t vertical = vhsubq_s16(verticalPlus, verticalMinus);
1070
1071 int16x8x3_t result;
1072
1073 // we multiply horizontal with horizontal
1074 result.val[0] = vmulq_s16(horizontal, horizontal);
1075 // we multiply vertical with vertical
1076 result.val[1] = vmulq_s16(vertical, vertical);
1077 // we multiply horizontal with vertical
1078 result.val[2] = vmulq_s16(horizontal, vertical);
1079
1080 // we store the determined results interleaved (h*h, v*v, h*v, h*h, v*v, h*v, ...)
1081 vst3q_s16(response, result);
1082}
1083
1084OCEAN_FORCE_INLINE uint32_t NEON::sumHorizontal_u_32x4(const uint32x4_t& value_u_32x4)
1085{
1086#if defined(__aarch64__)
1087
1088 return vaddvq_u32(value_u_32x4);
1089
1090#else
1091
1092 const uint32x2_t sum_u_32x2 = vpadd_u32(vget_low_u32(value_u_32x4), vget_high_u32(value_u_32x4));
1093 return vget_lane_u32(vpadd_u32(sum_u_32x2, sum_u_32x2), 0);
1094
1095#endif // __aarch64__
1096}
1097
1098OCEAN_FORCE_INLINE uint32x4_t NEON::removeHighBits32_16(const uint32x4_t& value)
1099{
1100 return vandq_u32(value, vmovq_n_u32(0x0000FFFFu));
1101}
1102
1103OCEAN_FORCE_INLINE uint16x4_t NEON::removeHighBits16_8(const uint16x4_t& value)
1104{
1105 return vand_u16(value, vreinterpret_u16_u32(vmov_n_u32(0x00FF00FFu)));
1106}
1107
1108OCEAN_FORCE_INLINE uint16x8_t NEON::removeHighBits16_8(const uint16x8_t& value)
1109{
1110 return vandq_u16(value, vreinterpretq_u16_u32(vmovq_n_u32(0x00FF00FFu)));
1111}
1112
1113OCEAN_FORCE_INLINE uint32x4_t NEON::moveHighBits32_16(const uint32x4_t& value)
1114{
1115 return vshrq_n_u32(value, 16);
1116}
1117
1118OCEAN_FORCE_INLINE uint16x4_t NEON::moveHighBits16_8(const uint16x4_t& value)
1119{
1120 return vshr_n_u16(value, 8);
1121}
1122
1123OCEAN_FORCE_INLINE uint16x8_t NEON::moveHighBits16_8(const uint16x8_t& value)
1124{
1125 return vshrq_n_u16(value, 8);
1126}
1127
1128OCEAN_FORCE_INLINE uint16x8_t NEON::combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high)
1129{
1130 return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high));
1131}
1132
1133OCEAN_FORCE_INLINE uint8x16_t NEON::combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high)
1134{
1135 return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
1136}
1137
1138OCEAN_FORCE_INLINE int32x4_t NEON::sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom)
1139{
1140 ocean_assert(rowTop != nullptr);
1141 ocean_assert(rowCenter != nullptr);
1142 ocean_assert(rowBottom != nullptr);
1143
1144 // 1 1 1
1145 // 1 1 1
1146 // 1 1 1
1147
1148 // 1 1 1
1149 // 1 1 1
1150 // 1 1 1
1151
1152 // 1 1 1
1153 // 1 1 1
1154 // 1 1 1
1155
1156 // ...
1157
1158 // load the top row
1159 const int16x4_t top_0_s_16x4 = vld1_s16(rowTop + 0);
1160 const int16x4_t top_1_s_16x4 = vld1_s16(rowTop + 1);
1161 const int16x4_t top_2_s_16x4 = vld1_s16(rowTop + 2);
1162
1163 // load the center row
1164 const int16x4_t center_0_s_16x4 = vld1_s16(rowCenter + 0);
1165 const int16x4_t center_1_s_16x4 = vld1_s16(rowCenter + 1);
1166 const int16x4_t center_2_s_16x4 = vld1_s16(rowCenter + 2);
1167
1168 // load the bottom row
1169 const int16x4_t bottom_0_s_16x4 = vld1_s16(rowBottom + 0);
1170 const int16x4_t bottom_1_s_16x4 = vld1_s16(rowBottom + 1);
1171 const int16x4_t bottom_2_s_16x4 = vld1_s16(rowBottom + 2);
1172
1173 // summing up the individual elements (16 bit + 16 bit -> 32 bit)
1174 const int32x4_t result_A_s_32x4 = vaddl_s16(top_0_s_16x4, top_2_s_16x4);
1175 const int32x4_t result_B_s_32x4 = vaddl_s16(center_0_s_16x4, center_2_s_16x4);
1176 const int32x4_t result_C_s_32x4 = vaddl_s16(bottom_0_s_16x4, bottom_2_s_16x4);
1177 const int32x4_t result_D_s_32x4 = vaddl_s16(top_1_s_16x4, center_1_s_16x4);
1178
1179 // summing up the intermediate results
1180 const int32x4_t result_E_s_32x4 = vaddq_s32(result_A_s_32x4, result_B_s_32x4);
1181 const int32x4_t result_F_s_32x4 = vaddq_s32(result_C_s_32x4, result_D_s_32x4);
1182
1183 const int32x4_t result_G_s_32x4 = vaddq_s32(result_E_s_32x4, result_F_s_32x4);
1184
1185 // adding the last missing row
1186 return vaddw_s16(result_G_s_32x4, bottom_1_s_16x4);
1187}
1188
1189OCEAN_FORCE_INLINE uint64x2_t NEON::multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2)
1190{
1191 // uint64_t * uint32_t
1192 // = (high(uint64_t) + low(uint64_t)) * uint32_t
1193 // = (((high(uint64_t) >> 32) * uint32_t) << 32) + low(uint64_t) * uint32_t
1194
1195 // [ valueA_u_64, valueB_u64 ] -> [ high(valueA_u_64), high(valueB_u64) ], [ low(valueA_u_64), low(valueB_u64) ]
1196 const uint32x2x2_t value64_lowHigh_32x2x2 = vtrn_u32(vget_low_u32(vreinterpretq_u32_u64(value_u_64x2)), vget_high_u32(vreinterpretq_u32_u64(value_u_64x2)));
1197
1198 const uint64x2_t multiplication_low_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[0], value_u_32x2);
1199 const uint64x2_t multiplication_high_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[1], value_u_32x2);
1200
1201 const uint64x2_t shiftedMultiplication_high_64x2 = vshlq_n_u64(multiplication_high_64x2, 32);
1202
1203 return vaddq_u64(shiftedMultiplication_high_64x2, multiplication_low_64x2);
1204}
1205
1206OCEAN_FORCE_INLINE int32x4_t NEON::copySign(const uint32x4_t& signReceiver_u_32x4, const int32x4_t& signProvider_s_32x4)
1207{
1208 const int32x4_t negativeSignReceiver_u_32x4 = vnegq_s32(vreinterpretq_s32_u32(signReceiver_u_32x4));
1209
1210 const uint32x4_t isNegativeMask_u_32x4 = vcltq_s32(signProvider_s_32x4, vdupq_n_s32(0)); // sign < 0 ? 0xFF : 0x00;
1211 const uint32x4_t isPositiveMask_u_32x4 = vcgeq_s32(signProvider_s_32x4, vdupq_n_s32(0)); // sign >= 0 ? 0xFF : 0x00;
1212
1213 return vreinterpretq_s32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_s32(negativeSignReceiver_u_32x4), isNegativeMask_u_32x4), vandq_u32(signReceiver_u_32x4, isPositiveMask_u_32x4)));
1214}
1215
1216OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4)
1217{
1218 const uint32x4_t targetA_u_32x4 = vcvtq_u32_f32(sourceA_f_32x4);
1219 const uint32x4_t targetB_u_32x4 = vcvtq_u32_f32(sourceB_f_32x4);
1220 const uint32x4_t targetC_u_32x4 = vcvtq_u32_f32(sourceC_f_32x4);
1221 const uint32x4_t targetD_u_32x4 = vcvtq_u32_f32(sourceD_f_32x4);
1222
1223 const uint16x8_t targetA_u_16x8 = vcombine_u16(vmovn_u32(targetA_u_32x4), vmovn_u32(targetB_u_32x4));
1224 const uint16x8_t targetB_u_16x8 = vcombine_u16(vmovn_u32(targetC_u_32x4), vmovn_u32(targetD_u_32x4));
1225
1226 return vcombine_u8(vmovn_u16(targetA_u_16x8), vmovn_u16(targetB_u_16x8));
1227}
1228
1229OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float* const source)
1230{
1231 ocean_assert(source != nullptr);
1232
1233#ifdef OCEAN_DEBUG
1234 for (unsigned int n = 0u; n < 16u; ++n)
1235 {
1236 ocean_assert(source[n] >= 0.0f && source[n] < 256.0f);
1237 }
1238#endif
1239
1240 return cast16ElementsNEON(vld1q_f32(source + 0), vld1q_f32(source + 4), vld1q_f32(source + 8), vld1q_f32(source + 12));
1241}
1242
1243OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8x16_t& source_u_8x16)
1244{
1245 const uint16x8_t sourceA_u_16x8 = vmovl_u8(vget_low_u8(source_u_8x16));
1246 const uint16x8_t sourceB_u_16x8 = vmovl_u8(vget_high_u8(source_u_8x16));
1247
1248 const uint32x4_t sourceA_u_32x4 = vmovl_u16(vget_low_u16(sourceA_u_16x8));
1249 const uint32x4_t sourceB_u_32x4 = vmovl_u16(vget_high_u16(sourceA_u_16x8));
1250 const uint32x4_t sourceC_u_32x4 = vmovl_u16(vget_low_u16(sourceB_u_16x8));
1251 const uint32x4_t sourceD_u_32x4 = vmovl_u16(vget_high_u16(sourceB_u_16x8));
1252
1253 float32x4x4_t result_u_32x4x4;
1254 result_u_32x4x4.val[0] = vcvtq_f32_u32(sourceA_u_32x4);
1255 result_u_32x4x4.val[1] = vcvtq_f32_u32(sourceB_u_32x4);
1256 result_u_32x4x4.val[2] = vcvtq_f32_u32(sourceC_u_32x4);
1257 result_u_32x4x4.val[3] = vcvtq_f32_u32(sourceD_u_32x4);
1258
1259 return result_u_32x4x4;
1260}
1261
1262OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8_t* const source)
1263{
1264 ocean_assert(source != nullptr);
1265
1266 return cast16ElementsNEON(vld1q_u8(source));
1267}
1268
1269inline unsigned int NEON::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
1270{
1271 ocean_assert(pixel);
1272 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
1273
1274 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
1275}
1276
1277inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1278{
1279 ocean_assert(pixel0 && pixel1);
1280
1281 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1282
1283 return sqrDistance((unsigned int)*pixel0, interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1284}
1285
1286inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1287{
1288 ocean_assert(pixel0 && pixel1);
1289
1290 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
1291 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1292
1293 return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1294}
1295
1296}
1297
1298}
1299
1300#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1301
1302#endif // META_OCEAN_CV_NEON_H
This class implements computer vision functions using NEON extensions.
Definition NEON.h:34
static uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:566
static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:820
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1084
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint8_t threshold=192u)
Averages 16 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
Definition NEON.h:804
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition NEON.h:518
static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:759
static uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:676
static uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:621
static void average48Elements1Channel8Bit3x3Approximation(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:987
static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:851
static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t &value)
Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
Definition NEON.h:1118
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 8 foll...
Definition NEON.h:1052
static uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute difference determination for 16 elements with 8 bit precision.
Definition NEON.h:739
static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t &low, const uint32x4_t &high)
Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
Definition NEON.h:1128
static void average24Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:956
static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:784
static uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:577
static uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:555
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition NEON.h:528
static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t &value)
Removes (sets to zero) the high 8 bits of four 16 bit elements.
Definition NEON.h:1103
static uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:665
static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t &value)
Moves the high 16 bits of four 32 bit elements to the low 16 bits and fills the high bits with 0.
Definition NEON.h:1113
static uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:533
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition NEON.h:1216
static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t &low, const uint16x8_t &high)
Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
Definition NEON.h:1133
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition NEON.h:1269
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition NEON.h:523
static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short *const rowTop, const short *const rowCenter, const short *const rowBottom)
Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
Definition NEON.h:1138
static uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:632
static uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:588
static uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:610
static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition NEON.h:921
static uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:599
static uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:544
static uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:643
static uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition NEON.h:687
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition NEON.h:513
static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition NEON.h:876
static uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 8 elements with 8 bit precision.
Definition NEON.h:711
static uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:654
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition NEON.h:1277
static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t &signReceiver, const int32x4_t &signProvider)
Copies the sign of a given value to another one.
Definition NEON.h:1206
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 b...
Definition NEON.h:1027
static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t &value_u_64x2, const uint32x2_t &value_u_32x2)
Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t value...
Definition NEON.h:1189
static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t &value)
Removes (sets to zero) the high 16 bits of four 32 bit elements.
Definition NEON.h:1098
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15