Ocean
Loading...
Searching...
No Matches
NEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_NEON_H
9#define META_OCEAN_CV_NEON_H
10
11#include "ocean/cv/CV.h"
12
14
15#include "ocean/math/Math.h"
16
17#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
18
19#if defined(__ARM_NEON__) || defined(__ARM_NEON)
20 #include <arm_neon.h>
21#endif // __ARM_NEON__
22
23namespace Ocean
24{
25
26namespace CV
27{
28
29/**
30 * This class implements computer vision functions using NEON extensions.
31 * @ingroup cv
32 */
33class NEON
34{
35 public:
36
37 /**
38 * Prefetches a block of temporal memory into all cache levels.
39 * @param data Data to be prefetched
40 */
41 static inline void prefetchT0(const void* const data);
42
43 /**
44 * Prefetches a block of temporal memory in all cache levels except 0th cache level.
45 * @param data Data to be prefetched
46 */
47 static inline void prefetchT1(const void* const data);
48
49 /**
50 * Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
51 * @param data Data to be prefetched
52 */
53 static inline void prefetchT2(const void* const data);
54
55 /**
56 * Prefetches a block of non-temporal memory into non-temporal cache structure.
57 * @param data Data to be prefetched
58 */
59 static inline void prefetchNTA(const void* const data);
60
61 /**
62 * Sum square differences determination for the last 9 elements of a 16 elements buffer with 8 bit precision.
63 * @param image0 First buffer of 16 elements of which the last 9 are used to determine the ssd, may be non aligned
64 * @param image1 Second buffer of 16 elements of which the last 9 are used to determine the ssd, may be non aligned
65 * @return SSD result distributed over four terms of the sum
66 */
67 static inline uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1);
68
69 /**
70 * Sum square differences determination for the last 10 elements of a 16 elements buffer with 8 bit precision.
71 * @param image0 First buffer of 16 elements of which the last 10 are used to determine the ssd, may be non aligned
72 * @param image1 Second buffer of 16 elements of which the last 10 are used to determine the ssd, may be non aligned
73 * @return SSD result distributed over four terms of the sum
74 */
75 static inline uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1);
76
77 /**
78 * Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit precision.
79 * @param image0 First buffer of 16 elements of which the last 11 are used to determine the ssd, may be non aligned
80 * @param image1 Second buffer of 16 elements of which the last 11 are used to determine the ssd, may be non aligned
81 * @return SSD result distributed over four terms of the sum
82 */
83 static inline uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
84
85 /**
86 * Sum square differences determination for the last 12 elements of a 16 elements buffer with 8 bit precision.
87 * @param image0 First buffer of 16 elements of which the last 12 are used to determine the ssd, may be non aligned
88 * @param image1 Second buffer of 16 elements of which the last 12 are used to determine the ssd, may be non aligned
89 * @return SSD result distributed over four terms of the sum
90 */
91 static inline uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
92
93 /**
94 * Sum square differences determination for the last 13 elements of a 16 elements buffer with 8 bit precision.
95 * @param image0 First buffer of 16 elements of which the last 13 are used to determine the ssd, may be non aligned
96 * @param image1 Second buffer of 16 elements of which the last 13 are used to determine the ssd, may be non aligned
97 * @return SSD result distributed over four terms of the sum
98 */
99 static inline uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
100
101 /**
102 * Sum square differences determination for the last 14 elements of a 16 elements buffer with 8 bit precision.
103 * @param image0 First buffer of 16 elements of which the last 14 are used to determine the ssd, may be non aligned
104 * @param image1 Second buffer of 16 elements of which the last 14 are used to determine the ssd, may be non aligned
105 * @return SSD result distributed over four terms of the sum
106 */
107 static inline uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1);
108
109 /**
110 * Sum square differences determination for the last 15 elements of a 16 elements buffer with 8 bit precision.
111 * @param image0 First buffer of 16 elements of which the last 15 are used to determine the ssd, may be non aligned
112 * @param image1 Second buffer of 16 elements of which the last 15 are used to determine the ssd, may be non aligned
113 * @return SSD result distributed over four terms of the sum
114 */
115 static inline uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1);
116
117 /**
118 * Sum square difference determination for the first 9 elements of a 16 elements buffer with 8 bit precision.
119 * @param image0 First buffer of 16 elements of which the first 9 are used to determine the ssd, may be non aligned
120 * @param image1 Second buffer of 16 elements of which the first 9 are used to determine the ssd, may be non aligned
121 * @return SSD result distributed over four terms of the sum
122 */
123 static inline uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1);
124
125 /**
126 * Sum square difference determination for the first 10 elements of a 16 elements buffer with 8 bit precision.
127 * @param image0 First buffer of 16 elements of which the first 10 are used to determine the ssd, may be non aligned
128 * @param image1 Second buffer of 16 elements of which the first 10 are used to determine the ssd, may be non aligned
129 * @return SSD result distributed over four terms of the sum
130 */
131 static inline uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
132
133 /**
134 * Sum square difference determination for the first 11 elements of a 16 elements buffer with 8 bit precision.
135 * @param image0 First buffer of 16 elements of which the first 11 are used to determine the ssd, may be non aligned
136 * @param image1 Second buffer of 16 elements of which the first 11 are used to determine the ssd, may be non aligned
137 * @return SSD result distributed over four terms of the sum
138 */
139 static inline uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1);
140
141 /**
142 * Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit precision.
143 * @param image0 First buffer of 16 elements of which the first 12 are used to determine the ssd, may be non aligned
144 * @param image1 Second buffer of 16 elements of which the first 12 are used to determine the ssd, may be non aligned
145 * @return SSD result distributed over four terms of the sum
146 */
147 static inline uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
148
149 /**
150 * Sum square difference determination for the first 13 elements of a 16 elements buffer with 8 bit precision.
151 * @param image0 First buffer of 16 elements of which the first 13 are used to determine the ssd, may be non aligned
152 * @param image1 Second buffer of 16 elements of which the first 13 are used to determine the ssd, may be non aligned
153 * @return SSD result distributed over four terms of the sum
154 */
155 static inline uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
156
157 /**
158 * Sum square difference determination for the first 14 elements of a 16 elements buffer with 8 bit precision.
159 * @param image0 First buffer of 16 elements of which the first 14 are used to determine the ssd, may be non aligned
160 * @param image1 Second buffer of 16 elements of which the first 14 are used to determine the ssd, may be non aligned
161 * @return SSD result distributed over four terms of the sum
162 */
163 static inline uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1);
164
165 /**
166 * Sum square difference determination for the first 15 elements of a 16 elements buffer with 8 bit precision.
167 * @param image0 First buffer of 16 elements of which the first 15 are used to determine the ssd, may be non aligned
168 * @param image1 Second buffer of 16 elements of which the first 15 are used to determine the ssd, may be non aligned
169 * @return SSD result distributed over four terms of the sum
170 */
171 static inline uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
172
173 /**
174 * Sum square difference determination for 16 elements with 8 bit precision.
175 * @param image0 First 16 elements to determine the ssd for, may be non aligned
176 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
177 * @return SSD result distributed over four terms of the sum
178 */
179 static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
180
181 /**
182 * Sum square difference determination for 16 elements with 8 bit precision.
183 * @param row0 First 16 elements to determine the ssd for
184 * @param row1 Second 16 elements to determine the ssd for
185 * @return SSD result distributed over four terms of the sum
186 */
187 static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
188
189 /**
190 * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
191 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
192 * @param row0 First row of 16 elements (16 pixels), must be valid
193 * @param row1 Second row of 16 elements (16 pixels), must be valid
194 * @param result Resulting 8 average elements (8 pixels), must be valid
195 */
196 static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
197
198 /**
199 * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
200 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).
201 * @param row0 First row of 32 elements (32 pixels), must be valid
202 * @param row1 Second row of 32 elements (32 pixels), must be valid
203 * @param result Resulting 16 average elements (16 pixels), must be valid
204 */
205 static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
206
207 /**
208 * Averages 16 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
209 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
210 * @param image0 First row of 16 elements
211 * @param image1 Second row of 16 elements
212 * @param result Resulting 8 average elements
213 * @param threshold Minimal threshold to result in a pixel with value 255, 192 by default
214 */
215 static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold = 192u);
216
217 /**
218 * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
219 * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).
220 * @param row0 First row of 32 elements (16 pixels), must be valid
221 * @param row1 Second row of 32 elements (16 pixels), must be valid
222 * @param result Resulting 16 average elements (8 pixels), must be valid
223 */
224 static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
225
226 /**
227 * Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
228 * The function takes two rows of 64 elements and returns 32 average elements (16 averaged pixels, each with 2 channels).
229 * @param row0 First row of 64 elements (32 pixels), must be valid
230 * @param row1 Second row of 64 elements (32 pixels), must be valid
231 * @param result Resulting 32 average elements (16 pixels), must be valid
232 */
233 static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
234
235 /**
236 * Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
237 * The function takes two rows of 48 elements and returns 24 average elements (8 averaged pixels, each with 3 channels).
238 * @param row0 First row of 48 elements (16 pixels), must be valid
239 * @param row1 Second row of 48 elements (16 pixels), must be valid
240 * @param result Resulting 24 average elements (8 pixels), must be valid
241 */
242 static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
243
244 /**
245 * Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
246 * The function takes two rows of 64 elements and returns 32 average elements (8 averaged pixels, each with 4 channels).
247 * @param row0 First row of 64 elements (16 pixels), must be valid
248 * @param row1 Second row of 64 elements (16 pixels), must be valid
249 * @param result Resulting 32 average elements (8 pixels), must be valid
250 */
251 static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
252
253 /**
254 * Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
255 * The function takes three rows of 24 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
256 * @param image0 First row of 24 elements
257 * @param image1 Second row of 24 elements
258 * @param image2 Third row of 24 elements
259 * @param result Resulting 8 average elements
260 */
261 static inline void average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
262
263 /**
264 * Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
265 * The function takes three rows of 48 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).<br>
266 * Beware: This function calculates an approximation only.
267 * @param image0 First row of 48 elements
268 * @param image1 Second row of 48 elements
269 * @param image2 Third row of 48 elements
270 * @param result Resulting 16 average elements
271 */
272 static inline void average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
273
274 /**
275 * Determines the horizontal and the vertical gradients for 8 consecutive pixels for a given 1 channel 8 bit frame.
276 * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
277 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
278 * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
279 * @param width The width of the original frame in pixel, with range [10, infinity)
280 */
281 static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
282
283 /**
284 * Determines the squared horizontal and vertical gradients and the product of both gradients for 8 consecutive pixels for a given 1 channel 8 bit frame.
285 * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
286 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
287 * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
288 * @param width The width of the original frame in pixel, with range [10, infinity)
289 */
290 static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
291
292 /**
293 * Sum square difference determination for 8 elements with 8 bit precision.
294 * @param image0 First 8 elements to determine the ssd for, may be non aligned
295 * @param image1 Second 8 elements to determine the ssd for, may be non aligned
296 * @return SSD result distributed over four terms of the sum
297 */
298 static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1);
299
300 /**
301 * Sum square difference determination for 8 elements with 8 bit precision.
302 * @param row0 First 8 elements to determine the ssd for
303 * @param row1 Second 8 elements to determine the ssd for
304 * @return SSD result distributed over four terms of the sum
305 */
306 static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1);
307
308 /**
309 * Sum absolute difference determination for 16 elements with 8 bit precision.
310 * @param image0 First 16 elements to determine the sad for, may be non aligned
311 * @param image1 Second 16 elements to determine the sad for, may be non aligned
312 * @return SAD result distributed over four terms of the sum
313 */
314 static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
315
316 /**
317 * Sum absolute difference determination for 16 elements with 8 bit precision.
318 * @param row0 First 16 elements to determine the sad for
319 * @param row1 Second 16 elements to determine the sad for
320 * @return SAD result distributed over four terms of the sum
321 */
322 static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
323
324 /**
325 * Sums the four 32 bit values and returns the result.
326 * Beware: This function is slow due to the usage of the individual lanes, providing a large target buffer is much faster.
327 * @param value The value holding the four 32 bit values
328 * @return Sum result
329 */
330 static OCEAN_FORCE_INLINE unsigned int sum32x4ByLanes(const uint32x4_t& value);
331
332 /**
333 * Removes (sets to zero) the high 16 bits of four 32 bit elements.
334 * Given: PONM-LKJI-HGFE-DCBA<br>
335 * Result: 00NM-00JI-00FE-00BA
336 * @param value The value to remove the high bits for
337 * @return Result
338 */
339 static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t& value);
340
341 /**
342 * Removes (sets to zero) the high 8 bits of four 16 bit elements.
343 * Given: HGFE-DCBA<br>
344 * Result: 0G0E-0C0A
345 * @param value The value to remove the high bits for
346 * @return Result
347 */
348 static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t& value);
349
350 /**
351 * Removes (sets to zero) the high 8 bits of eight 16 bit elements.
352 * Given: PONM-LKJI-HGFE-DCBA<br>
353 * Result: 0O0M-0K0I-0G0E-0C0A
354 * @param value The value to remove the high bits for
355 * @return Result
356 */
357 static OCEAN_FORCE_INLINE uint16x8_t removeHighBits16_8(const uint16x8_t& value);
358
359 /**
360 * Moves the high 16 bits of four 32 bit elements to the low 16 bits and fills the high bits with 0.
361 * Given: PONM-LKJI-HGFE-DCBA<br>
362 * Result: 00PO-00LK-00HG-00DC
363 * @param value The value to move the high bits for
364 * @return Result
365 */
366 static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t& value);
367
368 /**
369 * Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
370 * Given: HGFE-DCBA<br>
371 * Result: 0H0F-0D0B
372 * @param value The value to move the high bits for
373 * @return Result
374 */
375 static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t& value);
376
377 /**
378 * Moves the high 8 bits of eight 16 bit elements to the low 8 bits and fills the high bits with 0.
379 * Given: PONM-LKJI-HGFE-DCBA<br>
380 * Result: 0P0N-0L0J-0H0F-0D0B
381 * @param value The value to move the high bits for
382 * @return Result
383 */
384 static OCEAN_FORCE_INLINE uint16x8_t moveHighBits16_8(const uint16x8_t& value);
385
386 /**
387 * Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
388 * Further, the combination is done with saturation (the 32 bit values will be clamped to 16 bit values before the combination is done).
389 * Given: 00DD-00CC-00BB-00AA (low)<br>
390 * Given: 00HH-00GG-00FF-00EE (high)<br>
391 * Result: HH-GG-FF-EE-DD-CC-BB-AA
392 * @param low The 128 bit register with the (resulting) lower 16 bit values
393 * @param high The 128 bit register with the (resulting) higher 16 bit values
394 * @return The resulting 128 bit register with 16 bit values
395 */
396 static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high);
397
398 /**
399 * Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
400 * Further, the combination is done with saturation (the 16 bit values will be clamped to 8 bit values before the combination is done).
401 * Given: 0H0G-0F0E-0D0C-0B0A (low)<br>
402 * Given: 0P0O-0N0M-0L0K-0J0I (high)<br>
403 * Result: P-O-N-M-L-K-J-I-H-G-F-E-D-C-B-A
404 * @param low The 128 bit register with the (resulting) lower 8 bit values
405 * @param high The 128 bit register with the (resulting) higher 8 bit values
406 * @return The resulting 128 bit register with 8 bit values
407 */
408 static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high);
409
410 /**
411 * Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
412 * @param rowTop The top row containing 6 short values, must be valid
413 * @param rowCenter The center row containing 6 short values, must be valid
414 * @param rowBottom The bottom row containing 6 short values, must be valid
415 * @return The resulting four sums of the four 3x3 blocks
416 */
417 static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom);
418
419 /**
420 * Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t values.
421 * This function does not check whether the multiplication results in an overflow.
422 * @param value_u_64x2 The uint64x2_t value to multiply
423 * @param value_u_32x2 The uint32x2_t value to multiply
424 * @return The resulting multiplication result
425 */
426 static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2);
427
428 /**
429 * Copies the sign of a given value to another one.
430 * @param signReceiver First value receiving the sign from the second value
431 * @param signProvider Second value providing the sign for the first one
432 * @return First value with the sign of the second one
433 */
434 static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t& signReceiver, const int32x4_t& signProvider);
435
436 /**
437 * Casts 16 float elements to 16 uint8_t elements.
438 * @param sourceA_f_32x4 The first 4 float elements
439 * @param sourceB_f_32x4 The second 4 float elements
440 * @param sourceC_f_32x4 The third 4 float elements
441 * @param sourceD_f_32x4 The fourth 4 float elements
442 * @return The resulting 16 uint8_t elements
443 */
444 static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4);
445
446 /**
447 * Casts 16 float elements to 16 uint8_t elements.
448 * @param source The 16 float elements, must be valid
449 * @return The resulting 16 uint8_t elements
450 */
451 static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float* const source);
452
453 /**
454 * Casts 16 uint8_t elements to 16 float elements.
455 * @param source_u_8x16 The 16 uint8_t elements, must be valid
456 * @return The resulting 16 float elements
457 */
458 static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8x16_t& source_u_8x16);
459
460 /**
461 * Casts 16 uint8_t elements to 16 float elements.
462 * @param source The 16 uint8_t elements, must be valid
463 * @return The resulting 16 float elements
464 */
465 static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8_t* const source);
466
467 private:
468
469 /**
470 * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
471 * @param pixel Upper left pixel in the frame
472 * @param size Size of one frame row in bytes
473 * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
474 * @param fxy_ Product of the fx and the inverse fy interpolation factor
475 * @param fx_y Product of the inverse fx and the fy interpolation factor
476 * @param fxy Product of the fx and the fy interpolation factor
477 * @return Interpolated pixel values
478 */
479 static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
480
481 /**
482 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
483 * @param pixel0 Upper left pixel in the first frame
484 * @param pixel1 Upper left pixel in the second frame
485 * @param size0 Size of one row of the first frame in bytes
486 * @param size1 Size of one row of the second frame in bytes
487 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
488 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
489 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
490 * @param f1xy Product of the fx and the fy interpolation factor for the second image
491 * @return Interpolated sum of square difference
492 */
493 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
494
495 /**
496 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
497 * @param pixel0 Upper left pixel in the first frame
498 * @param pixel1 Upper left pixel in the second frame
499 * @param size0 Size of one row of the first frame in bytes
500 * @param size1 Size of one row of the second frame in bytes
501 * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
502 * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
503 * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
504 * @param f0xy Product of the fx and the fy interpolation factor for the first image
505 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
506 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
507 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
508 * @param f1xy Product of the fx and the fy interpolation factor for the second image
509 * @return Interpolated sum of square difference
510 */
511 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
512};
513
514inline void NEON::prefetchT0(const void* const data)
515{
516 __builtin_prefetch(data, 0, 0);
517}
518
519inline void NEON::prefetchT1(const void* const data)
520{
521 __builtin_prefetch(data, 0, 1);
522}
523
524inline void NEON::prefetchT2(const void* const data)
525{
526 __builtin_prefetch(data, 0, 2);
527}
528
529inline void NEON::prefetchNTA(const void* const data)
530{
531 __builtin_prefetch(data, 0, 3);
532}
533
534inline uint32x4_t NEON::sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1)
535{
536 ocean_assert(image0 && image1);
537
538 const uint8x16_t row0 = vld1q_u8(image0);
539 const uint8x16_t row1 = vld1q_u8(image1);
540
541 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFF00000000000000ull), vdup_n_u8(0xFFu));
542 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
543}
544
545inline uint32x4_t NEON::sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1)
546{
547 ocean_assert(image0 && image1);
548
549 const uint8x16_t row0 = vld1q_u8(image0);
550 const uint8x16_t row1 = vld1q_u8(image1);
551
552 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFF000000000000ull), vdup_n_u8(0xFFu));
553 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
554}
555
556inline uint32x4_t NEON::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
557{
558 ocean_assert(image0 && image1);
559
560 const uint8x16_t row0 = vld1q_u8(image0);
561 const uint8x16_t row1 = vld1q_u8(image1);
562
563 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFF0000000000ull), vdup_n_u8(0xFFu));
564 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
565}
566
567inline uint32x4_t NEON::sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
568{
569 ocean_assert(image0 && image1);
570
571 const uint8x16_t row0 = vld1q_u8(image0);
572 const uint8x16_t row1 = vld1q_u8(image1);
573
574 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFF00000000ull), vdup_n_u8(0xFFu));
575 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
576}
577
578inline uint32x4_t NEON::sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
579{
580 ocean_assert(image0 && image1);
581
582 const uint8x16_t row0 = vld1q_u8(image0);
583 const uint8x16_t row1 = vld1q_u8(image1);
584
585 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFF000000ull), vdup_n_u8(0xFFu));
586 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
587}
588
589inline uint32x4_t NEON::sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1)
590{
591 ocean_assert(image0 && image1);
592
593 const uint8x16_t row0 = vld1q_u8(image0);
594 const uint8x16_t row1 = vld1q_u8(image1);
595
596 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFF0000ull), vdup_n_u8(0xFFu));
597 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
598}
599
600inline uint32x4_t NEON::sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1)
601{
602 ocean_assert(image0 && image1);
603
604 const uint8x16_t row0 = vld1q_u8(image0);
605 const uint8x16_t row1 = vld1q_u8(image1);
606
607 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFFFF00ull), vdup_n_u8(0xFFu));
608 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
609}
610
611inline uint32x4_t NEON::sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1)
612{
613 ocean_assert(image0 && image1);
614
615 const uint8x16_t row0 = vld1q_u8(image0);
616 const uint8x16_t row1 = vld1q_u8(image1);
617
618 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000000000FFull));
619 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
620}
621
622inline uint32x4_t NEON::sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
623{
624 ocean_assert(image0 && image1);
625
626 const uint8x16_t row0 = vld1q_u8(image0);
627 const uint8x16_t row1 = vld1q_u8(image1);
628
629 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000000000FFFFull));
630 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
631}
632
633inline uint32x4_t NEON::sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1)
634{
635 ocean_assert(image0 && image1);
636
637 const uint8x16_t row0 = vld1q_u8(image0);
638 const uint8x16_t row1 = vld1q_u8(image1);
639
640 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000000000FFFFFFull));
641 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
642}
643
644inline uint32x4_t NEON::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
645{
646 ocean_assert(image0 && image1);
647
648 const uint8x16_t row0 = vld1q_u8(image0);
649 const uint8x16_t row1 = vld1q_u8(image1);
650
651 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000FFFFFFFFull));
652 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
653}
654
655inline uint32x4_t NEON::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
656{
657 ocean_assert(image0 && image1);
658
659 const uint8x16_t row0 = vld1q_u8(image0);
660 const uint8x16_t row1 = vld1q_u8(image1);
661
662 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000FFFFFFFFFFull));
663 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
664}
665
666inline uint32x4_t NEON::sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1)
667{
668 ocean_assert(image0 && image1);
669
670 const uint8x16_t row0 = vld1q_u8(image0);
671 const uint8x16_t row1 = vld1q_u8(image1);
672
673 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000FFFFFFFFFFFFull));
674 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
675}
676
677inline uint32x4_t NEON::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
678{
679 ocean_assert(image0 && image1);
680
681 const uint8x16_t row0 = vld1q_u8(image0);
682 const uint8x16_t row1 = vld1q_u8(image1);
683
684 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00FFFFFFFFFFFFFFull));
685 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
686}
687
688inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
689{
690 ocean_assert(image0 && image1);
691
692 uint8x16_t row0 = vld1q_u8(image0);
693 uint8x16_t row1 = vld1q_u8(image1);
694
695 return sumSquareDifference8Bit16Elements(row0, row1);
696}
697
698inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
699{
700 // Absolute difference between the arguments
701 uint8x16_t subtract = vabdq_u8(row0, row1);
702
703 uint8x8_t subtractLow = vget_low_u8(subtract);
704 uint8x8_t subtractHigh = vget_high_u8(subtract);
705
706 uint16x8_t squareLow = vmull_u8(subtractLow, subtractLow);
707 uint16x8_t squareHigh = vmull_u8(subtractHigh, subtractHigh);
708
709 return vaddq_u32(vaddl_u16(vget_low_u16(squareLow), vget_low_u16(squareHigh)), vaddl_u16(vget_high_u16(squareLow), vget_high_u16(squareHigh)));
710}
711
712inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1)
713{
714 ocean_assert(image0 && image1);
715
716 const uint8x8_t row0 = vld1_u8(image0);
717 const uint8x8_t row1 = vld1_u8(image1);
718
719 return sumSquareDifference8Bit8Elements(row0, row1);
720}
721
722inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1)
723{
724 // subtract the 8 elements (usage of saturation and bitwise or operator)
725 const uint8x8_t subtract = vorr_u8(vqsub_u8(row0, row1), vqsub_u8(row1, row0));
726
727 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
728 const uint16x4_t subtractLow = removeHighBits16_8(vreinterpret_u16_u8(subtract));
729 const uint16x4_t subtractHigh = moveHighBits16_8(vreinterpret_u16_u8(subtract));
730
731 const uint16x8_t subtractCombined = vcombine_u16(subtractLow, subtractHigh);
732
733 // square the 16 elements
734 const uint16x8_t square = vmulq_u16(subtractCombined, subtractCombined);
735
736 // summing the 8 elements of 16 bit values
737 return vaddq_u32(removeHighBits32_16(vreinterpretq_u32_u16(square)), moveHighBits32_16(vreinterpretq_u32_u16(square)));
738}
739
740inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
741{
742 ocean_assert(image0 && image1);
743
744 uint8x16_t row0 = vld1q_u8(image0);
745 uint8x16_t row1 = vld1q_u8(image1);
746
747 return sumAbsoluteDifference8Bit16Elements(row0, row1);
748}
749
750inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
751{
752 // subtract the 16 elements (usage of saturation and bitwise or operator)
753 uint8x16_t subtract = vabdq_u8(row0, row1);
754
755 uint16x8_t add16 = vaddl_u8(vget_low_u8(subtract), vget_high_u8(subtract));
756
757 return vaddl_u16(vget_low_u16(add16), vget_high_u16(add16));
758}
759
760OCEAN_FORCE_INLINE void NEON::average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
761{
762 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
763
764 // we load 16 successive pixels (= 1 * 16 = 16 values)
765
766 const uint8x16_t m128_row0 = vld1q_u8(row0);
767 const uint8x16_t m128_row1 = vld1q_u8(row1);
768
769 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
770 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
771
772 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
773 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
774
775 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
776 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
777
778 const uint8x8_t average = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m128_row0, m128_row1)), 1);
779
780 // we write back the results
781
782 vst1_u8(result, average);
783}
784
785OCEAN_FORCE_INLINE void NEON::average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
786{
787 // @see average16Elements1Channel8Bit2x2() for a detailed documentation
788
789 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
790
791 const uint8x16_t row0A_u_8x16 = vld1q_u8(row0 + 0);
792 const uint8x16_t row0B_u_8x16 = vld1q_u8(row0 + 16);
793
794 const uint8x16_t row1A_u_8x16 = vld1q_u8(row1 + 0);
795 const uint8x16_t row1B_u_8x16 = vld1q_u8(row1 + 16);
796
797 const uint8x8_t averageA_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16, row1A_u_8x16)), 1);
798 const uint8x8_t averageB_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16, row1B_u_8x16)), 1);
799
800 const uint8x16_t average_u_8x16 = vcombine_u8(averageA_u_8x8, averageB_u_8x8);
801
802 vst1q_u8(result, average_u_8x16);
803}
804
805inline void NEON::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold)
806{
807 ocean_assert(image0 && image1 && result);
808
809 const uint8x16_t row0 = vld1q_u8(image0);
810 const uint8x16_t row1 = vld1q_u8(image1);
811
812 // calculate normal average
813 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vpaddlq_u8(vhaddq_u8(row0, row1)), 1));
814
815 // thresholding
816 const uint8x8_t thresholded = vcge_u8(average, vmov_n_u8(threshold));
817
818 vst1_u8(result, thresholded);
819}
820
821OCEAN_FORCE_INLINE void NEON::average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
822{
823 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
824
825 // we load 16 successive pixels (= 2 * 16 = 32 values) and directly deinterleave the 2 channels
826 // from YA YA YA YA ... so that we receive the following patterns:
827 // m2_128_row0.val[0]: Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y
828 // m2_128_row0.val[1]: A A A A A A A A A A A A A A A A
829
830 const uint8x16x2_t m2_128_row0 = vld2q_u8(row0);
831 const uint8x16x2_t m2_128_row1 = vld2q_u8(row1);
832
833 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
834 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
835
836 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
837 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
838
839 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
840 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
841
842 uint8x8x2_t average;
843
844 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[0], m2_128_row1.val[0])), 1);
845 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[1], m2_128_row1.val[1])), 1);
846
847 // we write back the results, this time we interleave the results again
848
849 vst2_u8(result, average);
850}
851
852OCEAN_FORCE_INLINE void NEON::average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
853{
854 // @see average32Elements2Channel16Bit2x2() for a detailed documentation
855
856 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
857
858 const uint8x16x2_t row0A_u_8x16x2 = vld2q_u8(row0 + 0);
859 const uint8x16x2_t row0B_u_8x16x2 = vld2q_u8(row0 + 32);
860
861 const uint8x16x2_t row1A_u_8x16x2 = vld2q_u8(row1 + 0);
862 const uint8x16x2_t row1B_u_8x16x2 = vld2q_u8(row1 + 32);
863
864 const uint8x8_t averageChannel0A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[0], row1A_u_8x16x2.val[0])), 1);
865 const uint8x8_t averageChannel1A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[1], row1A_u_8x16x2.val[1])), 1);
866 const uint8x8_t averageChannel0B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[0], row1B_u_8x16x2.val[0])), 1);
867 const uint8x8_t averageChannel1B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[1], row1B_u_8x16x2.val[1])), 1);
868
869 uint8x16x2_t average_u_8x16x2;
870
871 average_u_8x16x2.val[0] = vcombine_u8(averageChannel0A_u_8x8, averageChannel0B_u_8x8);
872 average_u_8x16x2.val[1] = vcombine_u8(averageChannel1A_u_8x8, averageChannel1B_u_8x8);
873
874 vst2q_u8(result, average_u_8x16x2);
875}
876
877OCEAN_FORCE_INLINE void NEON::average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
878{
879 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
880
881 // we load 16 successive pixels (= 3 * 16 = 48 values) and directly deinterleave the 3 channels
882 // from RGB RGB RGB RGB ... so that we receive the following patterns:
883 // m3_128_row0.val[0]: R R R R R R R R R R R R R R R R
884 // m3_128_row0.val[1]: G G G G G G G G G G G G G G G G
885 // m3_128_row0.val[2]: B B B B B B B B B B B B B B B B
886
887 const uint8x16x3_t m3_128_row0 = vld3q_u8(row0);
888 const uint8x16x3_t m3_128_row1 = vld3q_u8(row1);
889
890 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
891 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
892
893 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
894 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
895
896 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
897 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
898
899 uint8x8x3_t average;
900
901 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[0], m3_128_row1.val[0])), 1);
902 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[1], m3_128_row1.val[1])), 1);
903 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[2], m3_128_row1.val[2])), 1);
904
905 // we write back the results, this time we interleave the results again
906
907 vst3_u8(result, average);
908
909 /* the following code would provide a more precise rounding
910 uint16x8_t zero4 = vmovq_n_u16(0x0002u);
911
912 uint16x8_t redTmp = vpadalq_u8(zero4, row0.val[0]);
913 average.val[0] = vmovn_u16(vshrq_n_u16(vpadalq_u8(redTmp, row1.val[0]), 2));
914
915 uint16x8_t greenTmp = vpadalq_u8(zero4, row0.val[1]);
916 average.val[1] = vmovn_u16(vshrq_n_u16(vpadalq_u8(greenTmp, row1.val[1]), 2));
917
918 uint16x8_t blueTmp = vpadalq_u8(zero4, row0.val[2]);
919 average.val[2] = vmovn_u16(vshrq_n_u16(vpadalq_u8(blueTmp, row1.val[2]), 2));*/
920}
921
922OCEAN_FORCE_INLINE void NEON::average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
923{
924 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
925
926 // we load 16 successive pixels (= 4 * 16 = 64 values) and directly deinterleave the 4 channels
927 // from RGBA RGBA RGBA RGBA ... so that we receive the following patterns:
928 // m4_128_row0.val[0]: R R R R R R R R R R R R R R R R
929 // m4_128_row0.val[1]: G G G G G G G G G G G G G G G G
930 // m4_128_row0.val[2]: B B B B B B B B B B B B B B B B
931 // m4_128_row0.val[3]: A A A A A A A A A A A A A A A A
932
933 const uint8x16x4_t m4_128_row0 = vld4q_u8(row0);
934 const uint8x16x4_t m4_128_row1 = vld4q_u8(row1);
935
936 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
937 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
938
939 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
940 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
941
942 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
943 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
944
945 uint8x8x4_t average;
946
947 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[0], m4_128_row1.val[0])), 1);
948 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[1], m4_128_row1.val[1])), 1);
949 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[2], m4_128_row1.val[2])), 1);
950 average.val[3] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[3], m4_128_row1.val[3])), 1);
951
952 // we write back the results, this time we interleave the results again
953
954 vst4_u8(result, average);
955}
956
957inline void NEON::average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
958{
959 ocean_assert(image0 && image1 && image2 && result);
960
961 /**
962 * | 1 2 1 |
963 * 1/16 | 2 4 2 |
964 * | 1 2 1 |
965 */
966
967 // load 3 * 8 uchars
968 uint8x8x3_t row0 = vld3_u8(image0);
969 uint8x8x3_t row1 = vld3_u8(image1);
970 uint8x8x3_t row2 = vld3_u8(image2);
971
972 uint16x8x3_t sumPerRow;
973
974 // create sum across rows, middle row is summed twice
975 sumPerRow.val[0] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[0]), vmovl_u8(row2.val[0])), vshlq_n_u16(vmovl_u8(row1.val[0]), 1));
976 sumPerRow.val[1] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[1]), vmovl_u8(row2.val[1])), vshlq_n_u16(vmovl_u8(row1.val[1]), 1));
977 sumPerRow.val[2] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[2]), vmovl_u8(row2.val[2])), vshlq_n_u16(vmovl_u8(row1.val[2]), 1));
978
979 // create sum across neighbouring pixels, second element within trio is summed twice
980 const uint16x8_t sum = vaddq_u16(vaddq_u16(sumPerRow.val[0], sumPerRow.val[2]), vshlq_n_u16(sumPerRow.val[1], 1));
981
982 // calculate the average: (sum + 8u) >> 4
983 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vaddq_u16(sum, vmovq_n_u16(8u)), 4));
984
985 vst1_u8(result, average);
986}
987
988inline void NEON::average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
989{
990 ocean_assert(image0 && image1 && image2 && result);
991
992 /**
993 * | 1 2 1 |
994 * 1/16 | 2 4 2 |
995 * | 1 2 1 |
996 */
997
998 // load 3 * 16 uchars and de-interleave triples:
999 //
1000 // row0: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 ... A44 A45 A46 A47
1001 // row1: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 ... B44 B45 B46 B47
1002 // row2: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 ... C44 C45 C46 C47
1003
1004 uint8x16x3_t row0 = vld3q_u8(image0);
1005 uint8x16x3_t row1 = vld3q_u8(image1);
1006 uint8x16x3_t row2 = vld3q_u8(image2);
1007
1008 // now de-interleaved:
1009 //
1010 // val[0] val[1] valu[2]
1011 // row0: A0 A3 A6 A9 ... A45 A1 A4 A7 A10 ... A46 A2 A5 A8 A11 ... A47
1012 // row1: B0 B3 B6 B9 ... B45 B1 B4 B7 B10 ... B46 B2 B5 B8 B11 ... B47
1013 // row2: C0 C3 C6 C9 ... C45 C1 C4 C7 C10 ... C46 C2 C5 C8 C11 ... C47
1014
1015 // now we need to 'multiply' row1 by 2 and val[1] by 2, we solve this by creating the average of the first and second row followed by the average with the middle row
1016
1017 uint8x16x3_t averagePerRow;
1018 averagePerRow.val[0] = vhaddq_u8(vhaddq_u8(row0.val[0], row2.val[0]), row1.val[0]);
1019 averagePerRow.val[1] = vhaddq_u8(vhaddq_u8(row0.val[1], row2.val[1]), row1.val[1]);
1020 averagePerRow.val[2] = vhaddq_u8(vhaddq_u8(row0.val[2], row2.val[2]), row1.val[2]);
1021
1022 // we apply the same idea as bevore in vertical direction
1023 const uint8x16_t average = vhaddq_u8(vhaddq_u8(averagePerRow.val[0], averagePerRow.val[2]), averagePerRow.val[1]);
1024
1025 vst1q_u8(result, average);
1026}
1027
1028inline void NEON::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
1029{
1030 ocean_assert(source && response && width >= 10u);
1031
1032 // we load the left 8 unsigned 8bit elements and store them has signed 16bit values
1033 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1034 // we load the right 8 unsigned 8bit elements and store them has signed 16bit values
1035 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1036
1037 // we load the top 8 unsigned 8bit elements and store them has signed 16bit values
1038 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1039 // we load the bottom 8 unsigned 8bit elements and store them has signed 16bit values
1040 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1041
1042 int8x8x2_t result;
1043
1044 // we subtract the horizontal values (right - left), and divide the result by 2, and narrow the results to 8 bit values
1045 result.val[0] = vmovn_s16(vhsubq_s16(horizontalPlus, horizontalMinus));
1046 // we subtract the vertical values (right - left), and divide the result by 2, and narrow the results to 8 bit values
1047 result.val[1] = vmovn_s16(vhsubq_s16(verticalPlus, verticalMinus));
1048
1049 // we store the determined results interleaved
1050 vst2_s8((int8_t*)response, result);
1051}
1052
1053inline void NEON::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
1054{
1055 ocean_assert(source && response && width >= 10u);
1056
1057 // we load the left 8 unsigned 8bit elements and store them has signed 16bit values
1058 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1059 // we load the right 8 unsigned 8bit elements and store them has signed 16bit values
1060 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1061
1062 // we load the top 8 unsigned 8bit elements and store them has signed 16bit values
1063 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1064 // we load the bottom 8 unsigned 8bit elements and store them has signed 16bit values
1065 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1066
1067 // we subtract the horizontal values (right - left) and divide the result by 2
1068 int16x8_t horizontal = vhsubq_s16(horizontalPlus, horizontalMinus);
1069 // we subtract the vertical values (bottom - top) and divide the result by 2
1070 int16x8_t vertical = vhsubq_s16(verticalPlus, verticalMinus);
1071
1072 int16x8x3_t result;
1073
1074 // we multiply horizontal with horizontal
1075 result.val[0] = vmulq_s16(horizontal, horizontal);
1076 // we multiply vertical with vertical
1077 result.val[1] = vmulq_s16(vertical, vertical);
1078 // we multiply horizontal with vertical
1079 result.val[2] = vmulq_s16(horizontal, vertical);
1080
1081 // we store the determined results interleaved (h*h, v*v, h*v, h*h, v*v, h*v, ...)
1082 vst3q_s16(response, result);
1083}
1084
1085OCEAN_FORCE_INLINE unsigned int NEON::sum32x4ByLanes(const uint32x4_t& value)
1086{
1087 return vgetq_lane_u32(value, 0) + vgetq_lane_u32(value, 1) + vgetq_lane_u32(value, 2) + vgetq_lane_u32(value, 3);
1088}
1089
1090OCEAN_FORCE_INLINE uint32x4_t NEON::removeHighBits32_16(const uint32x4_t& value)
1091{
1092 return vandq_u32(value, vmovq_n_u32(0x0000FFFFu));
1093}
1094
1095OCEAN_FORCE_INLINE uint16x4_t NEON::removeHighBits16_8(const uint16x4_t& value)
1096{
1097 return vand_u16(value, vreinterpret_u16_u32(vmov_n_u32(0x00FF00FFu)));
1098}
1099
1100OCEAN_FORCE_INLINE uint16x8_t NEON::removeHighBits16_8(const uint16x8_t& value)
1101{
1102 return vandq_u16(value, vreinterpretq_u16_u32(vmovq_n_u32(0x00FF00FFu)));
1103}
1104
1105OCEAN_FORCE_INLINE uint32x4_t NEON::moveHighBits32_16(const uint32x4_t& value)
1106{
1107 return vshrq_n_u32(value, 16);
1108}
1109
1110OCEAN_FORCE_INLINE uint16x4_t NEON::moveHighBits16_8(const uint16x4_t& value)
1111{
1112 return vshr_n_u16(value, 8);
1113}
1114
1115OCEAN_FORCE_INLINE uint16x8_t NEON::moveHighBits16_8(const uint16x8_t& value)
1116{
1117 return vshrq_n_u16(value, 8);
1118}
1119
1120OCEAN_FORCE_INLINE uint16x8_t NEON::combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high)
1121{
1122 return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high));
1123}
1124
1125OCEAN_FORCE_INLINE uint8x16_t NEON::combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high)
1126{
1127 return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
1128}
1129
1130OCEAN_FORCE_INLINE int32x4_t NEON::sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom)
1131{
1132 ocean_assert(rowTop != nullptr);
1133 ocean_assert(rowCenter != nullptr);
1134 ocean_assert(rowBottom != nullptr);
1135
1136 // 1 1 1
1137 // 1 1 1
1138 // 1 1 1
1139
1140 // 1 1 1
1141 // 1 1 1
1142 // 1 1 1
1143
1144 // 1 1 1
1145 // 1 1 1
1146 // 1 1 1
1147
1148 // ...
1149
1150 // load the top row
1151 const int16x4_t top_0_s_16x4 = vld1_s16(rowTop + 0);
1152 const int16x4_t top_1_s_16x4 = vld1_s16(rowTop + 1);
1153 const int16x4_t top_2_s_16x4 = vld1_s16(rowTop + 2);
1154
1155 // load the center row
1156 const int16x4_t center_0_s_16x4 = vld1_s16(rowCenter + 0);
1157 const int16x4_t center_1_s_16x4 = vld1_s16(rowCenter + 1);
1158 const int16x4_t center_2_s_16x4 = vld1_s16(rowCenter + 2);
1159
1160 // load the bottom row
1161 const int16x4_t bottom_0_s_16x4 = vld1_s16(rowBottom + 0);
1162 const int16x4_t bottom_1_s_16x4 = vld1_s16(rowBottom + 1);
1163 const int16x4_t bottom_2_s_16x4 = vld1_s16(rowBottom + 2);
1164
1165 // summing up the individual elements (16 bit + 16 bit -> 32 bit)
1166 const int32x4_t result_A_s_32x4 = vaddl_s16(top_0_s_16x4, top_2_s_16x4);
1167 const int32x4_t result_B_s_32x4 = vaddl_s16(center_0_s_16x4, center_2_s_16x4);
1168 const int32x4_t result_C_s_32x4 = vaddl_s16(bottom_0_s_16x4, bottom_2_s_16x4);
1169 const int32x4_t result_D_s_32x4 = vaddl_s16(top_1_s_16x4, center_1_s_16x4);
1170
1171 // summing up the intermediate results
1172 const int32x4_t result_E_s_32x4 = vaddq_s32(result_A_s_32x4, result_B_s_32x4);
1173 const int32x4_t result_F_s_32x4 = vaddq_s32(result_C_s_32x4, result_D_s_32x4);
1174
1175 const int32x4_t result_G_s_32x4 = vaddq_s32(result_E_s_32x4, result_F_s_32x4);
1176
1177 // adding the last missing row
1178 return vaddw_s16(result_G_s_32x4, bottom_1_s_16x4);
1179}
1180
1181OCEAN_FORCE_INLINE uint64x2_t NEON::multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2)
1182{
1183 // uint64_t * uint32_t
1184 // = (high(uint64_t) + low(uint64_t)) * uint32_t
1185 // = (((high(uint64_t) >> 32) * uint32_t) << 32) + low(uint64_t) * uint32_t
1186
1187 // [ valueA_u_64, valueB_u64 ] -> [ high(valueA_u_64), high(valueB_u64) ], [ low(valueA_u_64), low(valueB_u64) ]
1188 const uint32x2x2_t value64_lowHigh_32x2x2 = vtrn_u32(vget_low_u32(vreinterpretq_u32_u64(value_u_64x2)), vget_high_u32(vreinterpretq_u32_u64(value_u_64x2)));
1189
1190 const uint64x2_t multiplication_low_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[0], value_u_32x2);
1191 const uint64x2_t multiplication_high_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[1], value_u_32x2);
1192
1193 const uint64x2_t shiftedMultiplication_high_64x2 = vshlq_n_u64(multiplication_high_64x2, 32);
1194
1195 return vaddq_u64(shiftedMultiplication_high_64x2, multiplication_low_64x2);
1196}
1197
1198OCEAN_FORCE_INLINE int32x4_t NEON::copySign(const uint32x4_t& signReceiver_u_32x4, const int32x4_t& signProvider_s_32x4)
1199{
1200 const int32x4_t negativeSignReceiver_u_32x4 = vnegq_s32(vreinterpretq_s32_u32(signReceiver_u_32x4));
1201
1202 const uint32x4_t isNegativeMask_u_32x4 = vcltq_s32(signProvider_s_32x4, vdupq_n_s32(0)); // sign < 0 ? 0xFF : 0x00;
1203 const uint32x4_t isPositiveMask_u_32x4 = vcgeq_s32(signProvider_s_32x4, vdupq_n_s32(0)); // sign >= 0 ? 0xFF : 0x00;
1204
1205 return vreinterpretq_s32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_s32(negativeSignReceiver_u_32x4), isNegativeMask_u_32x4), vandq_u32(signReceiver_u_32x4, isPositiveMask_u_32x4)));
1206}
1207
1208OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4)
1209{
1210 const uint32x4_t targetA_u_32x4 = vcvtq_u32_f32(sourceA_f_32x4);
1211 const uint32x4_t targetB_u_32x4 = vcvtq_u32_f32(sourceB_f_32x4);
1212 const uint32x4_t targetC_u_32x4 = vcvtq_u32_f32(sourceC_f_32x4);
1213 const uint32x4_t targetD_u_32x4 = vcvtq_u32_f32(sourceD_f_32x4);
1214
1215 const uint16x8_t targetA_u_16x8 = vcombine_u16(vmovn_u32(targetA_u_32x4), vmovn_u32(targetB_u_32x4));
1216 const uint16x8_t targetB_u_16x8 = vcombine_u16(vmovn_u32(targetC_u_32x4), vmovn_u32(targetD_u_32x4));
1217
1218 return vcombine_u8(vmovn_u16(targetA_u_16x8), vmovn_u16(targetB_u_16x8));
1219}
1220
1221OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float* const source)
1222{
1223 ocean_assert(source != nullptr);
1224
1225#ifdef OCEAN_DEBUG
1226 for (unsigned int n = 0u; n < 16u; ++n)
1227 {
1228 ocean_assert(source[n] >= 0.0f && source[n] < 256.0f);
1229 }
1230#endif
1231
1232 return cast16ElementsNEON(vld1q_f32(source + 0), vld1q_f32(source + 4), vld1q_f32(source + 8), vld1q_f32(source + 12));
1233}
1234
1235OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8x16_t& source_u_8x16)
1236{
1237 const uint16x8_t sourceA_u_16x8 = vmovl_u8(vget_low_u8(source_u_8x16));
1238 const uint16x8_t sourceB_u_16x8 = vmovl_u8(vget_high_u8(source_u_8x16));
1239
1240 const uint32x4_t sourceA_u_32x4 = vmovl_u16(vget_low_u16(sourceA_u_16x8));
1241 const uint32x4_t sourceB_u_32x4 = vmovl_u16(vget_high_u16(sourceA_u_16x8));
1242 const uint32x4_t sourceC_u_32x4 = vmovl_u16(vget_low_u16(sourceB_u_16x8));
1243 const uint32x4_t sourceD_u_32x4 = vmovl_u16(vget_high_u16(sourceB_u_16x8));
1244
1245 float32x4x4_t result_u_32x4x4;
1246 result_u_32x4x4.val[0] = vcvtq_f32_u32(sourceA_u_32x4);
1247 result_u_32x4x4.val[1] = vcvtq_f32_u32(sourceB_u_32x4);
1248 result_u_32x4x4.val[2] = vcvtq_f32_u32(sourceC_u_32x4);
1249 result_u_32x4x4.val[3] = vcvtq_f32_u32(sourceD_u_32x4);
1250
1251 return result_u_32x4x4;
1252}
1253
1254OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8_t* const source)
1255{
1256 ocean_assert(source != nullptr);
1257
1258 return cast16ElementsNEON(vld1q_u8(source));
1259}
1260
1261inline unsigned int NEON::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
1262{
1263 ocean_assert(pixel);
1264 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
1265
1266 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
1267}
1268
1269inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1270{
1271 ocean_assert(pixel0 && pixel1);
1272
1273 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1274
1275 return sqrDistance((unsigned int)*pixel0, interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1276}
1277
1278inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1279{
1280 ocean_assert(pixel0 && pixel1);
1281
1282 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
1283 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1284
1285 return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1286}
1287
1288}
1289
1290}
1291
1292#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1293
1294#endif // META_OCEAN_CV_NEON_H
This class implements computer vision functions using NEON extensions.
Definition NEON.h:34
static uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 12 elements of a 16 elements buffer with 8 bit pre...
Definition NEON.h:567
static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:821
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint8_t threshold=192u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition NEON.h:805
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition NEON.h:519
static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:760
static uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of a 16 elements buffer with 8 bit pre...
Definition NEON.h:677
static uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:622
static void average48Elements1Channel8Bit3x3Approximation(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:988
static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:852
static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t &value)
Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
Definition NEON.h:1110
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 8 foll...
Definition NEON.h:1053
static uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute difference determination for 16 elements with 8 bit precision.
Definition NEON.h:740
static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t &low, const uint32x4_t &high)
Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
Definition NEON.h:1120
static void average24Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:957
static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:785
static uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:578
static uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:556
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition NEON.h:529
static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t &value)
Removes (sets to zero) the high 8 bits of four 16 bit elements.
Definition NEON.h:1095
static uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:666
static OCEAN_FORCE_INLINE unsigned int sum32x4ByLanes(const uint32x4_t &value)
Sums the four 32 bit values and returns the result.
Definition NEON.h:1085
static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t &value)
Moves the high 16 bits of four 32 bit elements to the low 16 bits and fills the high bits with 0.
Definition NEON.h:1105
static uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:534
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition NEON.h:1208
static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t &low, const uint16x8_t &high)
Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
Definition NEON.h:1125
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition NEON.h:1261
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition NEON.h:524
static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short *const rowTop, const short *const rowCenter, const short *const rowBottom)
Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
Definition NEON.h:1130
static uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:633
static uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:589
static uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:611
static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition NEON.h:922
static uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:600
static uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:545
static uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:644
static uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition NEON.h:688
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition NEON.h:514
static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition NEON.h:877
static uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 8 elements with 8 bit precision.
Definition NEON.h:712
static uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:655
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition NEON.h:1269
static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t &signReceiver, const int32x4_t &signProvider)
Copies the sign of a given value to another one.
Definition NEON.h:1198
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 b...
Definition NEON.h:1028
static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t &value_u_64x2, const uint32x2_t &value_u_32x2)
Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t values...
Definition NEON.h:1181
static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t &value)
Removes (sets to zero) the high 16 bits of four 32 bit elements.
Definition NEON.h:1090
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition Accessor.h:15