Ocean
Loading...
Searching...
No Matches
NEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_NEON_H
9#define META_OCEAN_CV_NEON_H
10
11#include "ocean/cv/CV.h"
12
14
15#include "ocean/math/Math.h"
16
17#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
18
19#if defined(__ARM_NEON__) || defined(__ARM_NEON)
20 #include <arm_neon.h>
21#endif // __ARM_NEON__
22
23namespace Ocean
24{
25
26namespace CV
27{
28
29/**
30 * This class implements computer vision functions using NEON extensions.
31 * @ingroup cv
32 */
33class NEON
34{
35 public:
36
37 /**
38 * Creates a uint8x8_t vector from 8 individual uint8_t values.
39 * This function provides a portable way to initialize uint8x8_t vectors across compilers.
40 * @param v0 Element at index 0
41 * @param v1 Element at index 1
42 * @param v2 Element at index 2
43 * @param v3 Element at index 3
44 * @param v4 Element at index 4
45 * @param v5 Element at index 5
46 * @param v6 Element at index 6
47 * @param v7 Element at index 7
48 * @return A uint8x8_t vector containing the specified values
49 */
50 static constexpr uint8x8_t create_uint8x8(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7);
51
52 /**
53 * Creates a uint8x16_t vector from 16 individual uint8_t values.
54 * This function provides a portable way to initialize uint8x16_t vectors across compilers.
55 * @param v0 Element at index 0
56 * @param v1 Element at index 1
57 * @param v2 Element at index 2
58 * @param v3 Element at index 3
59 * @param v4 Element at index 4
60 * @param v5 Element at index 5
61 * @param v6 Element at index 6
62 * @param v7 Element at index 7
63 * @param v8 Element at index 8
64 * @param v9 Element at index 9
65 * @param v10 Element at index 10
66 * @param v11 Element at index 11
67 * @param v12 Element at index 12
68 * @param v13 Element at index 13
69 * @param v14 Element at index 14
70 * @param v15 Element at index 15
71 * @return A uint8x16_t vector containing the specified values
72 */
73 static constexpr uint8x16_t create_uint8x16(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7, const uint8_t v8, const uint8_t v9, const uint8_t v10, const uint8_t v11, const uint8_t v12, const uint8_t v13, const uint8_t v14, const uint8_t v15);
74
75 /**
76 * Creates an int8x16_t vector from 16 individual int8_t values.
77 * This function provides a portable way to initialize int8x16_t vectors across compilers.
78 * @param v0 Element at index 0
79 * @param v1 Element at index 1
80 * @param v2 Element at index 2
81 * @param v3 Element at index 3
82 * @param v4 Element at index 4
83 * @param v5 Element at index 5
84 * @param v6 Element at index 6
85 * @param v7 Element at index 7
86 * @param v8 Element at index 8
87 * @param v9 Element at index 9
88 * @param v10 Element at index 10
89 * @param v11 Element at index 11
90 * @param v12 Element at index 12
91 * @param v13 Element at index 13
92 * @param v14 Element at index 14
93 * @param v15 Element at index 15
94 * @return An int8x16_t vector containing the specified values
95 */
96 static constexpr int8x16_t create_int8x16(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3, const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7, const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11, const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15);
97
98 /**
99 * Creates an int16x8_t vector from 8 individual int16_t values.
100 * This function provides a portable way to initialize int16x8_t vectors across compilers.
101 * @param v0 Element at index 0
102 * @param v1 Element at index 1
103 * @param v2 Element at index 2
104 * @param v3 Element at index 3
105 * @param v4 Element at index 4
106 * @param v5 Element at index 5
107 * @param v6 Element at index 6
108 * @param v7 Element at index 7
109 * @return An int16x8_t vector containing the specified values
110 */
111 static constexpr int16x8_t create_int16x8(const int16_t v0, const int16_t v1, const int16_t v2, const int16_t v3, const int16_t v4, const int16_t v5, const int16_t v6, const int16_t v7);
112
113 /**
114 * Creates a uint32x4_t vector from 4 individual uint32_t values.
115 * This function provides a portable way to initialize uint32x4_t vectors across compilers.
116 * @param v0 Element at index 0
117 * @param v1 Element at index 1
118 * @param v2 Element at index 2
119 * @param v3 Element at index 3
120 * @return A uint32x4_t vector containing the specified values
121 */
122 static constexpr uint32x4_t create_uint32x4(const uint32_t v0, const uint32_t v1, const uint32_t v2, const uint32_t v3);
123
124 /**
125 * Creates a uint16x8_t vector from 8 individual uint16_t values.
126 * This function provides a portable way to initialize uint16x8_t vectors across compilers.
127 * @param v0 Element at index 0
128 * @param v1 Element at index 1
129 * @param v2 Element at index 2
130 * @param v3 Element at index 3
131 * @param v4 Element at index 4
132 * @param v5 Element at index 5
133 * @param v6 Element at index 6
134 * @param v7 Element at index 7
135 * @return A uint16x8_t vector containing the specified values
136 */
137 static constexpr uint16x8_t create_uint16x8(const uint16_t v0, const uint16_t v1, const uint16_t v2, const uint16_t v3, const uint16_t v4, const uint16_t v5, const uint16_t v6, const uint16_t v7);
138
139 /**
140 * Sum square differences determination for the last 9 elements of a 16 element buffer with 8 bit precision.
141 * @param image0 First 9 elements to determine the ssd for, may be non aligned
142 * @param image1 Second 9 elements to determine the ssd for, may be non aligned
143 * @return SSD result distributed over four terms of the sum
144 */
145 static inline uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1);
146
147 /**
148 * Sum square differences determination for the last 10 elements of a 16 element buffer with 8 bit precision.
149 * @param image0 First 10 elements to determine the ssd for, may be non aligned
150 * @param image1 Second 10 elements to determine the ssd for, may be non aligned
151 * @return SSD result distributed over four terms of the sum
152 */
153 static inline uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1);
154
155 /**
156 * Sum square differences determination for the last 11 elements of a 16 element buffer with 8 bit precision.
157 * @param image0 First 11 elements to determine the ssd for, may be non aligned
158 * @param image1 Second 11 elements to determine the ssd for, may be non aligned
159 * @return SSD result distributed over four terms of the sum
160 */
161 static inline uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1);
162
163 /**
164 * Sum square differences determination for the last 12 elements of a 16 element buffer with 8 bit precision.
165 * @param image0 First 12 elements to determine the ssd for, may be non aligned
166 * @param image1 Second 12 elements to determine the ssd for, may be non aligned
167 * @return SSD result distributed over four terms of the sum
168 */
169 static inline uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1);
170
171 /**
172 * Sum square differences determination for the last 13 elements of a 16 element buffer with 8 bit precision.
173 * @param image0 First 13 elements to determine the ssd for, may be non aligned
174 * @param image1 Second 13 elements to determine the ssd for, may be non aligned
175 * @return SSD result distributed over four terms of the sum
176 */
177 static inline uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1);
178
179 /**
180 * Sum square differences determination for the last 14 elements of a 16 element buffer with 8 bit precision.
181 * @param image0 First 14 elements to determine the ssd for, may be non aligned
182 * @param image1 Second 14 elements to determine the ssd for, may be non aligned
183 * @return SSD result distributed over four terms of the sum
184 */
185 static inline uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1);
186
187 /**
188 * Sum square differences determination for the last 15 elements of a 16 element buffer with 8 bit precision.
189 * @param image0 First 15 elements to determine the ssd for, may be non aligned
190 * @param image1 Second 15 elements to determine the ssd for, may be non aligned
191 * @return SSD result distributed over four terms of the sum
192 */
193 static inline uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1);
194
195 /**
196 * Sum square difference determination for the first 9 elements of a 16 element buffer with 8 bit precision.
197 * @param image0 First 9 elements to determine the ssd for, may be non aligned
198 * @param image1 Second 9 elements to determine the ssd for, may be non aligned
199 * @return SSD result distributed over four terms of the sum
200 */
201 static inline uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1);
202
203 /**
204 * Sum square difference determination for the first 10 elements of a 16 element buffer with 8 bit precision.
205 * @param image0 First 10 elements to determine the ssd for, may be non aligned
206 * @param image1 Second 10 elements to determine the ssd for, may be non aligned
207 * @return SSD result distributed over four terms of the sum
208 */
209 static inline uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1);
210
211 /**
212 * Sum square difference determination for the first 11 elements of a 16 element buffer with 8 bit precision.
213 * @param image0 First 11 elements to determine the ssd for, may be non aligned
214 * @param image1 Second 11 elements to determine the ssd for, may be non aligned
215 * @return SSD result distributed over four terms of the sum
216 */
217 static inline uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1);
218
219 /**
220 * Sum square difference determination for the first 12 elements of a 16 element buffer with 8 bit precision.
221 * @param image0 First 12 elements to determine the ssd for, may be non aligned
222 * @param image1 Second 12 elements to determine the ssd for, may be non aligned
223 * @return SSD result distributed over four terms of the sum
224 */
225 static inline uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1);
226
227 /**
228 * Sum square difference determination for the first 13 elements of a 16 element buffer with 8 bit precision.
229 * @param image0 First 13 elements to determine the ssd for, may be non aligned
230 * @param image1 Second 13 elements to determine the ssd for, may be non aligned
231 * @return SSD result distributed over four terms of the sum
232 */
233 static inline uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1);
234
235 /**
236 * Sum square difference determination for the first 14 elements of a 16 element buffer with 8 bit precision.
237 * @param image0 First 14 elements to determine the ssd for, may be non aligned
238 * @param image1 Second 14 elements to determine the ssd for, may be non aligned
239 * @return SSD result distributed over four terms of the sum
240 */
241 static inline uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1);
242
243 /**
244 * Sum square difference determination for the first 15 elements of a 16 element buffer with 8 bit precision.
245 * @param image0 First 15 elements to determine the ssd for, may be non aligned
246 * @param image1 Second 15 elements to determine the ssd for, may be non aligned
247 * @return SSD result distributed over four terms of the sum
248 */
249 static inline uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1);
250
251 /**
252 * Sum square difference determination for 16 elements with 8 bit precision.
253 * @param image0 First 16 elements to determine the ssd for, may be non aligned
254 * @param image1 Second 16 elements to determine the ssd for, may be non aligned
255 * @return SSD result distributed over four terms of the sum
256 */
257 static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
258
259 /**
260 * Sum square difference determination for 16 elements with 8 bit precision.
261 * @param row0 First 16 elements to determine the ssd for
262 * @param row1 Second 16 elements to determine the ssd for
263 * @return SSD result distributed over four terms of the sum
264 */
265 static inline uint32x4_t sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
266
267 /**
268 * Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
269 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
270 * @param row0 First row of 16 elements (16 pixels), must be valid
271 * @param row1 Second row of 16 elements (16 pixels), must be valid
272 * @param result Resulting 8 average elements (8 pixels), must be valid
273 */
274 static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
275
276 /**
277 * Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
278 * The function takes two rows of 32 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).
279 * @param row0 First row of 32 elements (32 pixels), must be valid
280 * @param row1 Second row of 32 elements (32 pixels), must be valid
281 * @param result Resulting 16 average elements (16 pixels), must be valid
282 */
283 static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
284
285 /**
286 * Averages 16 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
287 * The function takes two rows of 16 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
288 * @param image0 First row of 16 elements
289 * @param image1 Second row of 16 elements
290 * @param result Resulting 8 average elements
291 * @param threshold Minimal threshold to result in a pixel with value 255
292 */
293 static inline void average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold = 192u);
294
295 /**
296 * Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
297 * The function takes two rows of 32 elements and returns 16 average elements (8 averaged pixels, each with 2 channels).
298 * @param row0 First row of 32 elements (16 pixels), must be valid
299 * @param row1 Second row of 32 elements (16 pixels), must be valid
300 * @param result Resulting 16 average elements (8 pixels), must be valid
301 */
302 static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
303
304 /**
305 * Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
306 * The function takes two rows of 64 elements and returns 32 average elements (16 averaged pixels, each with 2 channels).
307 * @param row0 First row of 64 elements (32 pixels), must be valid
308 * @param row1 Second row of 64 elements (32 pixels), must be valid
309 * @param result Resulting 32 average elements (16 pixels), must be valid
310 */
311 static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
312
313 /**
314 * Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
315 * The function takes two rows of 48 elements and returns 24 average elements (8 averaged pixels, each with 3 channels).
316 * @param row0 First row of 48 elements (16 pixels), must be valid
317 * @param row1 Second row of 48 elements (16 pixels), must be valid
318 * @param result Resulting 24 average elements (8 pixels), must be valid
319 */
320 static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
321
322 /**
323 * Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
324 * The function takes two rows of 64 elements and returns 32 average elements (8 averaged pixels, each with 4 channels).
325 * @param row0 First row of 64 elements (16 pixels), must be valid
326 * @param row1 Second row of 64 elements (16 pixels), must be valid
327 * @param result Resulting 32 average elements (8 pixels), must be valid
328 */
329 static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result);
330
331 /**
332 * Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
333 * The function takes three rows of 24 elements and returns 8 average elements (8 averaged pixels, each with 1 channel).
334 * @param image0 First row of 24 elements
335 * @param image1 Second row of 24 elements
336 * @param image2 Third row of 24 elements
337 * @param result Resulting 8 average elements
338 */
339 static inline void average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
340
341 /**
342 * Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
343 * The function takes three rows of 48 elements and returns 16 average elements (16 averaged pixels, each with 1 channel).<br>
344 * Beware: This function calculates an approximation only.
345 * @param image0 First row of 48 elements
346 * @param image1 Second row of 48 elements
347 * @param image2 Third row of 48 elements
348 * @param result Resulting 16 average elements
349 */
350 static inline void average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result);
351
352 /**
353 * Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 bit frame.
354 * The resulting gradients are interleaved and each response is inside the range [-127, 127] as the standard response is divided by two.
355 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
356 * @param response Resulting gradient responses, first the horizontal response then the vertical response (zipped) for 8 pixels
357 * @param width The width of the original frame in pixel, with range [10, infinity)
358 */
359 static inline void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width);
360
361 /**
362 * Determines the squared horizontal and vertical gradients and the product of both gradients for 8 following pixels for a given 1 channel 8 bit frame.
363 * The resulting gradients are interleaved and each response is inside the range [-(127 * 127), 127 * 127] as the standard response is divided by two.
364 * @param source The source position of the first pixel to determine the gradient for, this pixel must not be a border pixel in the original frame
365 * @param response Resulting gradient responses, first the horizontal response then the vertical response and afterwards the product of horizontal and vertical response (zipped) for 8 pixels
366 * @param width The width of the original frame in pixel, with range [10, infinity)
367 */
368 static inline void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width);
369
370 /**
371 * Sum square difference determination for 8 elements with 8 bit precision.
372 * @param image0 First 8 elements to determine the ssd for, may be non aligned
373 * @param image1 Second 8 elements to determine the ssd for, may be non aligned
374 * @return SSD result distributed over four terms of the sum
375 */
376 static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1);
377
378 /**
379 * Sum square difference determination for 8 elements with 8 bit precision.
380 * @param row0 First 8 elements to determine the ssd for
381 * @param row1 Second 8 elements to determine the ssd for
382 * @return SSD result distributed over four terms of the sum
383 */
384 static inline uint32x4_t sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1);
385
386 /**
387 * Sum absolute difference determination for 16 elements with 8 bit precision.
388 * @param image0 First 16 elements to determine the sad for, may be non aligned
389 * @param image1 Second 16 elements to determine the sad for, may be non aligned
390 * @return SAD result distributed over four terms of the sum
391 */
392 static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1);
393
394 /**
395 * Sum absolute difference determination for 16 elements with 8 bit precision.
396 * @param row0 First 16 elements to determine the sad for
397 * @param row1 Second 16 elements to determine the sad for
398 * @return SAD result distributed over four terms of the sum
399 */
400 static inline uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1);
401
402 /**
403 * Horizontally sums the four 32 bit values and returns the result.
404 * @param value The value holding the four 32 bit values
405 * @return The resulting sum
406 */
407 static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t& value);
408
409 /**
410 * Removes (sets to zero) the high 16 bits of four 32 bit elements.
411 * Given: PONM-LKJI-HGFE-DCBA<br>
412 * Result: 00NM-00JI-00FE-00BA
413 * @param value The value to remove the high bits for
414 * @return Result
415 */
416 static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t& value);
417
418 /**
419 * Removes (sets to zero) the high 8 bits of four 16 bit elements.
420 * Given: HGFE-DCBA<br>
421 * Result: 0G0E-0C0A
422 * @param value The value to remove the high bits for
423 * @return Result
424 */
425 static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t& value);
426
427 /**
428 * Removes (sets to zero) the high 8 bits of eight 16 bit elements.
429 * Given: PONM-LKJI-HGFE-DCBA<br>
430 * Result: 0O0M-0K0I-0G0E-0C0A
431 * @param value The value to remove the high bits for
432 * @return Result
433 */
434 static OCEAN_FORCE_INLINE uint16x8_t removeHighBits16_8(const uint16x8_t& value);
435
436 /**
437 * Moves the high 16 bits of four 32 bit elements to the low 16 bits and fills the high bits with 0.
438 * Given: PONM-LKJI-HGFE-DCBA<br>
439 * Result: 00PO-00LK-00HG-00DC
440 * @param value The value to move the high bits for
441 * @return Result
442 */
443 static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t& value);
444
445 /**
446 * Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
447 * Given: HGFE-DCBA<br>
448 * Result: 0H0F-0D0B
449 * @param value The value to move the high bits for
450 * @return Result
451 */
452 static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t& value);
453
454 /**
455 * Moves the high 8 bits of eight 16 bit elements to the low 8 bits and fills the high bits with 0.
456 * Given: PONM-LKJI-HGFE-DCBA<br>
457 * Result: 0P0N-0L0J-0H0F-0D0B
458 * @param value The value to move the high bits for
459 * @return Result
460 */
461 static OCEAN_FORCE_INLINE uint16x8_t moveHighBits16_8(const uint16x8_t& value);
462
463 /**
464 * Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
465 * Further, the combination is done with saturation (the 32 bit values will be clamped to 16 bit values before the combination is done).
466 * Given: 00DD-00CC-00BB-00AA (low)<br>
467 * Given: 00HH-00GG-00FF-00EE (high)<br>
468 * Result: HH-GG-FF-EE-DD-CC-BB-AA
469 * @param low The 128 bit register with the (resulting) lower 16 bit values
470 * @param high The 128 bit register with the (resulting) higher 16 bit values
471 * @return The resulting 128 bit register with 16 bit values
472 */
473 static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high);
474
475 /**
476 * Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
477 * Further, the combination is done with saturation (the 16 bit values will be clamped to 8 bit values before the combination is done).
478 * Given: 0H0G-0F0E-0D0C-0B0A (low)<br>
479 * Given: 0P0O-0N0M-0L0K-0J0I (high)<br>
480 * Result: P-O-N-M-L-K-J-I-H-G-F-E-D-C-B-A
481 * @param low The 128 bit register with the (resulting) lower 8 bit values
482 * @param high The 128 bit register with the (resulting) higher 8 bit values
483 * @return The resulting 128 bit register with 8 bit values
484 */
485 static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high);
486
487 /**
488 * Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
489 * @param rowTop The top row containing 6 short values, must be valid
490 * @param rowCenter The center row containing 6 short values, must be valid
491 * @param rowBottom The bottom row containing 6 short values, must be valid
492 * @return The resulting four sums of the four 3x3 blocks
493 */
494 static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom);
495
496 /**
497 * Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t values.
498 * This function does not check whether the multiplication results in an overflow.
499 * @param value_u_64x2 The uint64x2_t value to multiply
500 * @param value_u_32x2 The uint32x2_t value to multiply
501 * @return The resulting multiplication result
502 */
503 static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2);
504
505 /**
506 * Copies the sign of a given value to another one.
507 * @param signReceiver First value receiving the sign from the second value
508 * @param signProvider Second value providing the sign for the first one
509 * @return First value with the sign of the second one
510 */
511 static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t& signReceiver, const int32x4_t& signProvider);
512
513 /**
514 * Casts 16 float elements to 16 uint8_t elements.
515 * @param sourceA_f_32x4 The first 4 float elements
516 * @param sourceB_f_32x4 The second 4 float elements
517 * @param sourceC_f_32x4 The third 4 float elements
518 * @param sourceD_f_32x4 The fourth 4 float elements
519 * @return The resulting 16 uint8_t elements
520 */
521 static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4);
522
523 /**
524 * Casts 16 float elements to 16 uint8_t elements.
525 * @param source The 16 float elements, must be valid
526 * @return The resulting 16 uint8_t elements
527 */
528 static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float* const source);
529
530 /**
531 * Casts 16 uint8_t elements to 16 float elements.
532 * @param source_u_8x16 The 16 uint8_t elements, must be valid
533 * @return The resulting 16 float elements
534 */
535 static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8x16_t& source_u_8x16);
536
537 /**
538 * Casts 16 uint8_t elements to 16 float elements.
539 * @param source The 16 uint8_t elements, must be valid
540 * @return The resulting 16 float elements
541 */
542 static OCEAN_FORCE_INLINE float32x4x4_t cast16ElementsNEON(const uint8_t* const source);
543
544 private:
545
546 /**
547 * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
548 * @param pixel Upper left pixel in the frame
549 * @param size Size of one frame row in bytes
550 * @param fx_y_ Product of the inverse fx and the inverse fy interpolation factor
551 * @param fxy_ Product of the fx and the inverse fy interpolation factor
552 * @param fx_y Product of the inverse fx and the fy interpolation factor
553 * @param fxy Product of the fx and the fy interpolation factor
554 * @return Interpolated pixel values
555 */
556 static inline unsigned int interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy);
557
558 /**
559 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
560 * @param pixel0 Upper left pixel in the first frame
561 * @param pixel1 Upper left pixel in the second frame
562 * @param size0 Size of one frame row in bytes
563 * @param size1 Size of one frame row in bytes
564 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
565 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
566 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
567 * @param f1xy Product of the fx and the fy interpolation factor for the second image
568 * @return Interpolated sum of square difference
569 */
570 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
571
572 /**
573 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
574 * @param pixel0 Upper left pixel in the first frame
575 * @param pixel1 Upper left pixel in the second frame
576 * @param size0 Size of one frame row in bytes
577 * @param size1 Size of one frame row in bytes
578 * @param f0x_y_ Product of the inverse fx and the inverse fy interpolation factor for the first image
579 * @param f0xy_ Product of the fx and the inverse fy interpolation factor for the first image
580 * @param f0x_y Product of the inverse fx and the fy interpolation factor for the first image
581 * @param f0xy Product of the fx and the fy interpolation factor for the first image
582 * @param f1x_y_ Product of the inverse fx and the inverse fy interpolation factor for the second image
583 * @param f1xy_ Product of the fx and the inverse fy interpolation factor for the second image
584 * @param f1x_y Product of the inverse fx and the fy interpolation factor for the second image
585 * @param f1xy Product of the fx and the fy interpolation factor for the second image
586 * @return Interpolated sum of square difference
587 */
588 static inline unsigned int ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy);
589};
590
591constexpr uint8x8_t NEON::create_uint8x8(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7)
592{
593#ifdef OCEAN_COMPILER_MSC
594 return uint8x8_t{{uint64_t(v0) | (uint64_t(v1) << 8) | (uint64_t(v2) << 16) | (uint64_t(v3) << 24) | (uint64_t(v4) << 32) | (uint64_t(v5) << 40) | (uint64_t(v6) << 48) | (uint64_t(v7) << 56)}};
595#else
596 return uint8x8_t{v0, v1, v2, v3, v4, v5, v6, v7};
597#endif
598}
599
600constexpr uint8x16_t NEON::create_uint8x16(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7, const uint8_t v8, const uint8_t v9, const uint8_t v10, const uint8_t v11, const uint8_t v12, const uint8_t v13, const uint8_t v14, const uint8_t v15)
601{
602#ifdef OCEAN_COMPILER_MSC
603 return uint8x16_t{{uint64_t(v0) | (uint64_t(v1) << 8) | (uint64_t(v2) << 16) | (uint64_t(v3) << 24) | (uint64_t(v4) << 32) | (uint64_t(v5) << 40) | (uint64_t(v6) << 48) | (uint64_t(v7) << 56), uint64_t(v8) | (uint64_t(v9) << 8) | (uint64_t(v10) << 16) | (uint64_t(v11) << 24) | (uint64_t(v12) << 32) | (uint64_t(v13) << 40) | (uint64_t(v14) << 48) | (uint64_t(v15) << 56)}};
604#else
605 return uint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
606#endif
607}
608
609constexpr int8x16_t NEON::create_int8x16(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3, const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7, const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11, const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
610{
611#ifdef OCEAN_COMPILER_MSC
612 return int8x16_t{create_uint8x16(uint8_t(v0), uint8_t(v1), uint8_t(v2), uint8_t(v3), uint8_t(v4), uint8_t(v5), uint8_t(v6), uint8_t(v7), uint8_t(v8), uint8_t(v9), uint8_t(v10), uint8_t(v11), uint8_t(v12), uint8_t(v13), uint8_t(v14), uint8_t(v15))};
613#else
614 return int8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
615#endif
616}
617
618constexpr int16x8_t NEON::create_int16x8(const int16_t v0, const int16_t v1, const int16_t v2, const int16_t v3, const int16_t v4, const int16_t v5, const int16_t v6, const int16_t v7)
619{
620#ifdef OCEAN_COMPILER_MSC
621 return int16x8_t{{uint64_t(uint16_t(v0)) | (uint64_t(uint16_t(v1)) << 16) | (uint64_t(uint16_t(v2)) << 32) | (uint64_t(uint16_t(v3)) << 48), uint64_t(uint16_t(v4)) | (uint64_t(uint16_t(v5)) << 16) | (uint64_t(uint16_t(v6)) << 32) | (uint64_t(uint16_t(v7)) << 48)}};
622#else
623 return int16x8_t{v0, v1, v2, v3, v4, v5, v6, v7};
624#endif
625}
626
627constexpr uint32x4_t NEON::create_uint32x4(const uint32_t v0, const uint32_t v1, const uint32_t v2, const uint32_t v3)
628{
629#ifdef OCEAN_COMPILER_MSC
630 return uint32x4_t{{uint64_t(v0) | (uint64_t(v1) << 32), uint64_t(v2) | (uint64_t(v3) << 32)}};
631#else
632 return uint32x4_t{v0, v1, v2, v3};
633#endif
634}
635
636constexpr uint16x8_t NEON::create_uint16x8(const uint16_t v0, const uint16_t v1, const uint16_t v2, const uint16_t v3, const uint16_t v4, const uint16_t v5, const uint16_t v6, const uint16_t v7)
637{
638#ifdef OCEAN_COMPILER_MSC
639 return uint16x8_t{{uint64_t(v0) | (uint64_t(v1) << 16) | (uint64_t(v2) << 32) | (uint64_t(v3) << 48), uint64_t(v4) | (uint64_t(v5) << 16) | (uint64_t(v6) << 32) | (uint64_t(v7) << 48)}};
640#else
641 return uint16x8_t{v0, v1, v2, v3, v4, v5, v6, v7};
642#endif
643}
644
645inline uint32x4_t NEON::sumSquareDifferences8BitBack9Elements(const uint8_t* const image0, const uint8_t* const image1)
646{
647 ocean_assert(image0 && image1);
648
649 const uint8x16_t row0 = vld1q_u8(image0);
650 const uint8x16_t row1 = vld1q_u8(image1);
651
652 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFF00000000000000ull), vdup_n_u8(0xFFu));
653 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
654}
655
656inline uint32x4_t NEON::sumSquareDifferences8BitBack10Elements(const uint8_t* const image0, const uint8_t* const image1)
657{
658 ocean_assert(image0 && image1);
659
660 const uint8x16_t row0 = vld1q_u8(image0);
661 const uint8x16_t row1 = vld1q_u8(image1);
662
663 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFF000000000000ull), vdup_n_u8(0xFFu));
664 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
665}
666
667inline uint32x4_t NEON::sumSquareDifferences8BitBack11Elements(const uint8_t* const image0, const uint8_t* const image1)
668{
669 ocean_assert(image0 && image1);
670
671 const uint8x16_t row0 = vld1q_u8(image0);
672 const uint8x16_t row1 = vld1q_u8(image1);
673
674 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFF0000000000ull), vdup_n_u8(0xFFu));
675 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
676}
677
678inline uint32x4_t NEON::sumSquareDifferences8BitBack12Elements(const uint8_t* const image0, const uint8_t* const image1)
679{
680 ocean_assert(image0 && image1);
681
682 const uint8x16_t row0 = vld1q_u8(image0);
683 const uint8x16_t row1 = vld1q_u8(image1);
684
685 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFF00000000ull), vdup_n_u8(0xFFu));
686 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
687}
688
689inline uint32x4_t NEON::sumSquareDifferences8BitBack13Elements(const uint8_t* const image0, const uint8_t* const image1)
690{
691 ocean_assert(image0 && image1);
692
693 const uint8x16_t row0 = vld1q_u8(image0);
694 const uint8x16_t row1 = vld1q_u8(image1);
695
696 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFF000000ull), vdup_n_u8(0xFFu));
697 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
698}
699
700inline uint32x4_t NEON::sumSquareDifferences8BitBack14Elements(const uint8_t* const image0, const uint8_t* const image1)
701{
702 ocean_assert(image0 && image1);
703
704 const uint8x16_t row0 = vld1q_u8(image0);
705 const uint8x16_t row1 = vld1q_u8(image1);
706
707 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFF0000ull), vdup_n_u8(0xFFu));
708 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
709}
710
711inline uint32x4_t NEON::sumSquareDifferences8BitBack15Elements(const uint8_t* const image0, const uint8_t* const image1)
712{
713 ocean_assert(image0 && image1);
714
715 const uint8x16_t row0 = vld1q_u8(image0);
716 const uint8x16_t row1 = vld1q_u8(image1);
717
718 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFFFF00ull), vdup_n_u8(0xFFu));
719 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
720}
721
722inline uint32x4_t NEON::sumSquareDifference8BitFront9Elements(const uint8_t* const image0, const uint8_t* const image1)
723{
724 ocean_assert(image0 && image1);
725
726 const uint8x16_t row0 = vld1q_u8(image0);
727 const uint8x16_t row1 = vld1q_u8(image1);
728
729 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000000000FFull));
730 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
731}
732
733inline uint32x4_t NEON::sumSquareDifference8BitFront10Elements(const uint8_t* const image0, const uint8_t* const image1)
734{
735 ocean_assert(image0 && image1);
736
737 const uint8x16_t row0 = vld1q_u8(image0);
738 const uint8x16_t row1 = vld1q_u8(image1);
739
740 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000000000FFFFull));
741 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
742}
743
744inline uint32x4_t NEON::sumSquareDifference8BitFront11Elements(const uint8_t* const image0, const uint8_t* const image1)
745{
746 ocean_assert(image0 && image1);
747
748 const uint8x16_t row0 = vld1q_u8(image0);
749 const uint8x16_t row1 = vld1q_u8(image1);
750
751 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000000000FFFFFFull));
752 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
753}
754
755inline uint32x4_t NEON::sumSquareDifference8BitFront12Elements(const uint8_t* const image0, const uint8_t* const image1)
756{
757 ocean_assert(image0 && image1);
758
759 const uint8x16_t row0 = vld1q_u8(image0);
760 const uint8x16_t row1 = vld1q_u8(image1);
761
762 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000FFFFFFFFull));
763 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
764}
765
766inline uint32x4_t NEON::sumSquareDifference8BitFront13Elements(const uint8_t* const image0, const uint8_t* const image1)
767{
768 ocean_assert(image0 && image1);
769
770 const uint8x16_t row0 = vld1q_u8(image0);
771 const uint8x16_t row1 = vld1q_u8(image1);
772
773 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000FFFFFFFFFFull));
774 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
775}
776
777inline uint32x4_t NEON::sumSquareDifference8BitFront14Elements(const uint8_t* const image0, const uint8_t* const image1)
778{
779 ocean_assert(image0 && image1);
780
781 const uint8x16_t row0 = vld1q_u8(image0);
782 const uint8x16_t row1 = vld1q_u8(image1);
783
784 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000FFFFFFFFFFFFull));
785 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
786}
787
788inline uint32x4_t NEON::sumSquareDifference8BitFront15Elements(const uint8_t* const image0, const uint8_t* const image1)
789{
790 ocean_assert(image0 && image1);
791
792 const uint8x16_t row0 = vld1q_u8(image0);
793 const uint8x16_t row1 = vld1q_u8(image1);
794
795 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00FFFFFFFFFFFFFFull));
796 return sumSquareDifference8Bit16Elements(vandq_u8(row0, mask), vandq_u8(row1, mask));
797}
798
799inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
800{
801 ocean_assert(image0 && image1);
802
803 uint8x16_t row0 = vld1q_u8(image0);
804 uint8x16_t row1 = vld1q_u8(image1);
805
806 return sumSquareDifference8Bit16Elements(row0, row1);
807}
808
809inline uint32x4_t NEON::sumSquareDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
810{
811 // Absolute difference between the arguments
812 uint8x16_t subtract = vabdq_u8(row0, row1);
813
814 uint8x8_t subtractLow = vget_low_u8(subtract);
815 uint8x8_t subtractHigh = vget_high_u8(subtract);
816
817 uint16x8_t squareLow = vmull_u8(subtractLow, subtractLow);
818 uint16x8_t squareHigh = vmull_u8(subtractHigh, subtractHigh);
819
820 return vaddq_u32(vaddl_u16(vget_low_u16(squareLow), vget_low_u16(squareHigh)), vaddl_u16(vget_high_u16(squareLow), vget_high_u16(squareHigh)));
821}
822
823inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8_t* const image0, const uint8_t* const image1)
824{
825 ocean_assert(image0 && image1);
826
827 const uint8x8_t row0 = vld1_u8(image0);
828 const uint8x8_t row1 = vld1_u8(image1);
829
830 return sumSquareDifference8Bit8Elements(row0, row1);
831}
832
833inline uint32x4_t NEON::sumSquareDifference8Bit8Elements(const uint8x8_t& row0, const uint8x8_t& row1)
834{
835 // subtract the 8 elements (usage of saturation and bitwise or operator)
836 const uint8x8_t subtract = vorr_u8(vqsub_u8(row0, row1), vqsub_u8(row1, row0));
837
838 // distribute the 16 elements of 8 bit values into 16 elements of 16 bit values (necessary for multiplication)
839 const uint16x4_t subtractLow = removeHighBits16_8(vreinterpret_u16_u8(subtract));
840 const uint16x4_t subtractHigh = moveHighBits16_8(vreinterpret_u16_u8(subtract));
841
842 const uint16x8_t subtractCombined = vcombine_u16(subtractLow, subtractHigh);
843
844 // square the 16 elements
845 const uint16x8_t square = vmulq_u16(subtractCombined, subtractCombined);
846
847 // summing the 8 elements of 16 bit values
848 return vaddq_u32(removeHighBits32_16(vreinterpretq_u32_u16(square)), moveHighBits32_16(vreinterpretq_u32_u16(square)));
849}
850
851inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8_t* const image0, const uint8_t* const image1)
852{
853 ocean_assert(image0 && image1);
854
855 uint8x16_t row0 = vld1q_u8(image0);
856 uint8x16_t row1 = vld1q_u8(image1);
857
858 return sumAbsoluteDifference8Bit16Elements(row0, row1);
859}
860
861inline uint32x4_t NEON::sumAbsoluteDifference8Bit16Elements(const uint8x16_t& row0, const uint8x16_t& row1)
862{
863 // subtract the 16 elements (usage of saturation and bitwise or operator)
864 uint8x16_t subtract = vabdq_u8(row0, row1);
865
866 uint16x8_t add16 = vaddl_u8(vget_low_u8(subtract), vget_high_u8(subtract));
867
868 return vaddl_u16(vget_low_u16(add16), vget_high_u16(add16));
869}
870
871OCEAN_FORCE_INLINE void NEON::average16Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
872{
873 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
874
875 // we load 16 successive pixels (= 1 * 16 = 16 values)
876
877 const uint8x16_t m128_row0 = vld1q_u8(row0);
878 const uint8x16_t m128_row1 = vld1q_u8(row1);
879
880 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
881 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
882
883 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
884 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
885
886 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
887 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
888
889 const uint8x8_t average = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m128_row0, m128_row1)), 1);
890
891 // we write back the results
892
893 vst1_u8(result, average);
894}
895
896OCEAN_FORCE_INLINE void NEON::average32Elements1Channel8Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
897{
898 // @see average16Elements1Channel8Bit2x2() for a detailed documentation
899
900 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
901
902 const uint8x16_t row0A_u_8x16 = vld1q_u8(row0 + 0);
903 const uint8x16_t row0B_u_8x16 = vld1q_u8(row0 + 16);
904
905 const uint8x16_t row1A_u_8x16 = vld1q_u8(row1 + 0);
906 const uint8x16_t row1B_u_8x16 = vld1q_u8(row1 + 16);
907
908 const uint8x8_t averageA_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16, row1A_u_8x16)), 1);
909 const uint8x8_t averageB_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16, row1B_u_8x16)), 1);
910
911 const uint8x16_t average_u_8x16 = vcombine_u8(averageA_u_8x8, averageB_u_8x8);
912
913 vst1q_u8(result, average_u_8x16);
914}
915
916inline void NEON::average16ElementsBinary1Channel8Bit2x2(const uint8_t* const image0, const uint8_t* const image1, uint8_t* const result, const uint8_t threshold)
917{
918 ocean_assert(image0 && image1 && result);
919
920 const uint8x16_t row0 = vld1q_u8(image0);
921 const uint8x16_t row1 = vld1q_u8(image1);
922
923 // calculate normal average
924 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vpaddlq_u8(vhaddq_u8(row0, row1)), 1));
925
926 // thresholding
927 const uint8x8_t thresholded = vcge_u8(average, vmov_n_u8(threshold));
928
929 vst1_u8(result, thresholded);
930}
931
932OCEAN_FORCE_INLINE void NEON::average32Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
933{
934 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
935
936 // we load 16 successive pixels (= 2 * 16 = 32 values) and directly deinterleave the 2 channels
937 // from YA YA YA YA ... so that we receive the following patterns:
938 // m2_128_row0.val[0]: Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y
939 // m2_128_row0.val[1]: A A A A A A A A A A A A A A A A
940
941 const uint8x16x2_t m2_128_row0 = vld2q_u8(row0);
942 const uint8x16x2_t m2_128_row1 = vld2q_u8(row1);
943
944 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
945 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
946
947 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
948 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
949
950 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
951 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
952
953 uint8x8x2_t average;
954
955 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[0], m2_128_row1.val[0])), 1);
956 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[1], m2_128_row1.val[1])), 1);
957
958 // we write back the results, this time we interleave the results again
959
960 vst2_u8(result, average);
961}
962
963OCEAN_FORCE_INLINE void NEON::average64Elements2Channel16Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
964{
965 // @see average32Elements2Channel16Bit2x2() for a detailed documentation
966
967 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
968
969 const uint8x16x2_t row0A_u_8x16x2 = vld2q_u8(row0 + 0);
970 const uint8x16x2_t row0B_u_8x16x2 = vld2q_u8(row0 + 32);
971
972 const uint8x16x2_t row1A_u_8x16x2 = vld2q_u8(row1 + 0);
973 const uint8x16x2_t row1B_u_8x16x2 = vld2q_u8(row1 + 32);
974
975 const uint8x8_t averageChannel0A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[0], row1A_u_8x16x2.val[0])), 1);
976 const uint8x8_t averageChannel1A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[1], row1A_u_8x16x2.val[1])), 1);
977 const uint8x8_t averageChannel0B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[0], row1B_u_8x16x2.val[0])), 1);
978 const uint8x8_t averageChannel1B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[1], row1B_u_8x16x2.val[1])), 1);
979
980 uint8x16x2_t average_u_8x16x2;
981
982 average_u_8x16x2.val[0] = vcombine_u8(averageChannel0A_u_8x8, averageChannel0B_u_8x8);
983 average_u_8x16x2.val[1] = vcombine_u8(averageChannel1A_u_8x8, averageChannel1B_u_8x8);
984
985 vst2q_u8(result, average_u_8x16x2);
986}
987
988OCEAN_FORCE_INLINE void NEON::average48Elements3Channel24Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
989{
990 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
991
992 // we load 16 successive pixels (= 3 * 16 = 48 values) and directly deinterleave the 3 channels
993 // from RGB RGB RGB RGB ... so that we receive the following patterns:
994 // m3_128_row0.val[0]: R R R R R R R R R R R R R R R R
995 // m3_128_row0.val[1]: G G G G G G G G G G G G G G G G
996 // m3_128_row0.val[2]: B B B B B B B B B B B B B B B B
997
998 const uint8x16x3_t m3_128_row0 = vld3q_u8(row0);
999 const uint8x16x3_t m3_128_row1 = vld3q_u8(row1);
1000
1001 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
1002 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
1003
1004 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
1005 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
1006
1007 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
1008 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
1009
1010 uint8x8x3_t average;
1011
1012 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[0], m3_128_row1.val[0])), 1);
1013 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[1], m3_128_row1.val[1])), 1);
1014 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[2], m3_128_row1.val[2])), 1);
1015
1016 // we write back the results, this time we interleave the results again
1017
1018 vst3_u8(result, average);
1019
1020 /* the following code would provide a more precise rounding
1021 uint16x8_t zero4 = vmovq_n_u16(0x0002u);
1022
1023 uint16x8_t redTmp = vpadalq_u8(zero4, row0.val[0]);
1024 average.val[0] = vmovn_u16(vshrq_n_u16(vpadalq_u8(redTmp, row1.val[0]), 2));
1025
1026 uint16x8_t greenTmp = vpadalq_u8(zero4, row0.val[1]);
1027 average.val[1] = vmovn_u16(vshrq_n_u16(vpadalq_u8(greenTmp, row1.val[1]), 2));
1028
1029 uint16x8_t blueTmp = vpadalq_u8(zero4, row0.val[2]);
1030 average.val[2] = vmovn_u16(vshrq_n_u16(vpadalq_u8(blueTmp, row1.val[2]), 2));*/
1031}
1032
1033OCEAN_FORCE_INLINE void NEON::average64Elements4Channel32Bit2x2(const uint8_t* const row0, const uint8_t* const row1, uint8_t* const result)
1034{
1035 ocean_assert(row0 != nullptr && row1 != nullptr && result != nullptr);
1036
1037 // we load 16 successive pixels (= 4 * 16 = 64 values) and directly deinterleave the 4 channels
1038 // from RGBA RGBA RGBA RGBA ... so that we receive the following patterns:
1039 // m4_128_row0.val[0]: R R R R R R R R R R R R R R R R
1040 // m4_128_row0.val[1]: G G G G G G G G G G G G G G G G
1041 // m4_128_row0.val[2]: B B B B B B B B B B B B B B B B
1042 // m4_128_row0.val[3]: A A A A A A A A A A A A A A A A
1043
1044 const uint8x16x4_t m4_128_row0 = vld4q_u8(row0);
1045 const uint8x16x4_t m4_128_row1 = vld4q_u8(row1);
1046
1047 // now we simply average the corresponding values of two rows by using NEON's rounding halving add function:
1048 // vrhaddq_u8: Vr[i] := (Va[i] + Vb[i] + 1) >> 1
1049
1050 // the next step will be to add successive pairs within the merged row by using NEON's long pairwise add function:
1051 // vpaddlq_u8 adds two neighboring 8 bit integers and creates 16 bit integer sums
1052
1053 // finally we use NEON's rounding narrowing shift function (converting 16 bit integers to 8 bit integers)
1054 // vrshrn_n_u16: Vr_8[i] := (Va_16[i] + 1) >> 1
1055
1056 uint8x8x4_t average;
1057
1058 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[0], m4_128_row1.val[0])), 1);
1059 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[1], m4_128_row1.val[1])), 1);
1060 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[2], m4_128_row1.val[2])), 1);
1061 average.val[3] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[3], m4_128_row1.val[3])), 1);
1062
1063 // we write back the results, this time we interleave the results again
1064
1065 vst4_u8(result, average);
1066}
1067
1068inline void NEON::average24Elements1Channel8Bit3x3(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
1069{
1070 ocean_assert(image0 && image1 && image2 && result);
1071
1072 /**
1073 * | 1 2 1 |
1074 * 1/16 | 2 4 2 |
1075 * | 1 2 1 |
1076 */
1077
1078 // load 3 * 8 uchars
1079 uint8x8x3_t row0 = vld3_u8(image0);
1080 uint8x8x3_t row1 = vld3_u8(image1);
1081 uint8x8x3_t row2 = vld3_u8(image2);
1082
1083 uint16x8x3_t sumPerRow;
1084
1085 // create sum across rows, middle row is summed twice
1086 sumPerRow.val[0] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[0]), vmovl_u8(row2.val[0])), vshlq_n_u16(vmovl_u8(row1.val[0]), 1));
1087 sumPerRow.val[1] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[1]), vmovl_u8(row2.val[1])), vshlq_n_u16(vmovl_u8(row1.val[1]), 1));
1088 sumPerRow.val[2] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[2]), vmovl_u8(row2.val[2])), vshlq_n_u16(vmovl_u8(row1.val[2]), 1));
1089
1090 // create sum across neighbouring pixels, second element within trio is summed twice
1091 const uint16x8_t sum = vaddq_u16(vaddq_u16(sumPerRow.val[0], sumPerRow.val[2]), vshlq_n_u16(sumPerRow.val[1], 1));
1092
1093 // calculate the average: (sum + 8u) >> 4
1094 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vaddq_u16(sum, vmovq_n_u16(8u)), 4));
1095
1096 vst1_u8(result, average);
1097}
1098
1099inline void NEON::average48Elements1Channel8Bit3x3Approximation(const uint8_t* const image0, const uint8_t* const image1, const uint8_t* const image2, uint8_t* const result)
1100{
1101 ocean_assert(image0 && image1 && image2 && result);
1102
1103 /**
1104 * | 1 2 1 |
1105 * 1/16 | 2 4 2 |
1106 * | 1 2 1 |
1107 */
1108
1109 // load 3 * 16 uchars and de-interleave triples:
1110 //
1111 // row0: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 ... A44 A45 A46 A47
1112 // row1: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 ... B44 B45 B46 B47
1113 // row2: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 ... C44 C45 C46 C47
1114
1115 uint8x16x3_t row0 = vld3q_u8(image0);
1116 uint8x16x3_t row1 = vld3q_u8(image1);
1117 uint8x16x3_t row2 = vld3q_u8(image2);
1118
1119 // now de-interleaved:
1120 //
1121 // val[0] val[1] valu[2]
1122 // row0: A0 A3 A6 A9 ... A45 A1 A4 A7 A10 ... A46 A2 A5 A8 A11 ... A47
1123 // row1: B0 B3 B6 B9 ... B45 B1 B4 B7 B10 ... B46 B2 B5 B8 B11 ... B47
1124 // row2: C0 C3 C6 C9 ... C45 C1 C4 C7 C10 ... C46 C2 C5 C8 C11 ... C47
1125
1126 // now we need to 'multiply' row1 by 2 and val[1] by 2, we solve this by creating the average of the first and second row followed by the average with the middle row
1127
1128 uint8x16x3_t averagePerRow;
1129 averagePerRow.val[0] = vhaddq_u8(vhaddq_u8(row0.val[0], row2.val[0]), row1.val[0]);
1130 averagePerRow.val[1] = vhaddq_u8(vhaddq_u8(row0.val[1], row2.val[1]), row1.val[1]);
1131 averagePerRow.val[2] = vhaddq_u8(vhaddq_u8(row0.val[2], row2.val[2]), row1.val[2]);
1132
1133 // we apply the same idea as bevore in vertical direction
1134 const uint8x16_t average = vhaddq_u8(vhaddq_u8(averagePerRow.val[0], averagePerRow.val[2]), averagePerRow.val[1]);
1135
1136 vst1q_u8(result, average);
1137}
1138
1139inline void NEON::gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t* source, int8_t* response, const unsigned int width)
1140{
1141 ocean_assert(source && response && width >= 10u);
1142
1143 // we load the left 8 unsigned 8bit elements and store them has signed 16bit values
1144 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1145 // we load the right 8 unsigned 8bit elements and store them has signed 16bit values
1146 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1147
1148 // we load the top 8 unsigned 8bit elements and store them has signed 16bit values
1149 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1150 // we load the bottom 8 unsigned 8bit elements and store them has signed 16bit values
1151 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1152
1153 int8x8x2_t result;
1154
1155 // we subtract the horizontal values (right - left), and divide the result by 2, and narrow the results to 8 bit values
1156 result.val[0] = vmovn_s16(vhsubq_s16(horizontalPlus, horizontalMinus));
1157 // we subtract the vertical values (right - left), and divide the result by 2, and narrow the results to 8 bit values
1158 result.val[1] = vmovn_s16(vhsubq_s16(verticalPlus, verticalMinus));
1159
1160 // we store the determined results interleaved
1161 vst2_s8((int8_t*)response, result);
1162}
1163
1164inline void NEON::gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t* source, int16_t* response, const unsigned int width)
1165{
1166 ocean_assert(source && response && width >= 10u);
1167
1168 // we load the left 8 unsigned 8bit elements and store them has signed 16bit values
1169 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1170 // we load the right 8 unsigned 8bit elements and store them has signed 16bit values
1171 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1172
1173 // we load the top 8 unsigned 8bit elements and store them has signed 16bit values
1174 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1175 // we load the bottom 8 unsigned 8bit elements and store them has signed 16bit values
1176 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1177
1178 // we subtract the horizontal values (right - left) and divide the result by 2
1179 int16x8_t horizontal = vhsubq_s16(horizontalPlus, horizontalMinus);
1180 // we subtract the vertical values (bottom - top) and divide the result by 2
1181 int16x8_t vertical = vhsubq_s16(verticalPlus, verticalMinus);
1182
1183 int16x8x3_t result;
1184
1185 // we multiply horizontal with horizontal
1186 result.val[0] = vmulq_s16(horizontal, horizontal);
1187 // we multiply vertical with vertical
1188 result.val[1] = vmulq_s16(vertical, vertical);
1189 // we multiply horizontal with vertical
1190 result.val[2] = vmulq_s16(horizontal, vertical);
1191
1192 // we store the determined results interleaved (h*h, v*v, h*v, h*h, v*v, h*v, ...)
1193 vst3q_s16(response, result);
1194}
1195
1196OCEAN_FORCE_INLINE uint32_t NEON::sumHorizontal_u_32x4(const uint32x4_t& value_u_32x4)
1197{
1198#if defined(__aarch64__)
1199
1200 return vaddvq_u32(value_u_32x4);
1201
1202#else
1203
1204 const uint32x2_t sum_u_32x2 = vpadd_u32(vget_low_u32(value_u_32x4), vget_high_u32(value_u_32x4));
1205 return vget_lane_u32(vpadd_u32(sum_u_32x2, sum_u_32x2), 0);
1206
1207#endif // __aarch64__
1208}
1209
1210OCEAN_FORCE_INLINE uint32x4_t NEON::removeHighBits32_16(const uint32x4_t& value)
1211{
1212 return vandq_u32(value, vmovq_n_u32(0x0000FFFFu));
1213}
1214
1215OCEAN_FORCE_INLINE uint16x4_t NEON::removeHighBits16_8(const uint16x4_t& value)
1216{
1217 return vand_u16(value, vreinterpret_u16_u32(vmov_n_u32(0x00FF00FFu)));
1218}
1219
1220OCEAN_FORCE_INLINE uint16x8_t NEON::removeHighBits16_8(const uint16x8_t& value)
1221{
1222 return vandq_u16(value, vreinterpretq_u16_u32(vmovq_n_u32(0x00FF00FFu)));
1223}
1224
1225OCEAN_FORCE_INLINE uint32x4_t NEON::moveHighBits32_16(const uint32x4_t& value)
1226{
1227 return vshrq_n_u32(value, 16);
1228}
1229
1230OCEAN_FORCE_INLINE uint16x4_t NEON::moveHighBits16_8(const uint16x4_t& value)
1231{
1232 return vshr_n_u16(value, 8);
1233}
1234
1235OCEAN_FORCE_INLINE uint16x8_t NEON::moveHighBits16_8(const uint16x8_t& value)
1236{
1237 return vshrq_n_u16(value, 8);
1238}
1239
1240OCEAN_FORCE_INLINE uint16x8_t NEON::combineLowBits32x4to16x8(const uint32x4_t& low, const uint32x4_t& high)
1241{
1242 return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high));
1243}
1244
1245OCEAN_FORCE_INLINE uint8x16_t NEON::combineLowBits16x8to8x16(const uint16x8_t& low, const uint16x8_t& high)
1246{
1247 return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
1248}
1249
1250OCEAN_FORCE_INLINE int32x4_t NEON::sum16Bit4Blocks3x3(const short* const rowTop, const short* const rowCenter, const short* const rowBottom)
1251{
1252 ocean_assert(rowTop != nullptr);
1253 ocean_assert(rowCenter != nullptr);
1254 ocean_assert(rowBottom != nullptr);
1255
1256 // 1 1 1
1257 // 1 1 1
1258 // 1 1 1
1259
1260 // 1 1 1
1261 // 1 1 1
1262 // 1 1 1
1263
1264 // 1 1 1
1265 // 1 1 1
1266 // 1 1 1
1267
1268 // ...
1269
1270 // load the top row
1271 const int16x4_t top_0_s_16x4 = vld1_s16(rowTop + 0);
1272 const int16x4_t top_1_s_16x4 = vld1_s16(rowTop + 1);
1273 const int16x4_t top_2_s_16x4 = vld1_s16(rowTop + 2);
1274
1275 // load the center row
1276 const int16x4_t center_0_s_16x4 = vld1_s16(rowCenter + 0);
1277 const int16x4_t center_1_s_16x4 = vld1_s16(rowCenter + 1);
1278 const int16x4_t center_2_s_16x4 = vld1_s16(rowCenter + 2);
1279
1280 // load the bottom row
1281 const int16x4_t bottom_0_s_16x4 = vld1_s16(rowBottom + 0);
1282 const int16x4_t bottom_1_s_16x4 = vld1_s16(rowBottom + 1);
1283 const int16x4_t bottom_2_s_16x4 = vld1_s16(rowBottom + 2);
1284
1285 // summing up the individual elements (16 bit + 16 bit -> 32 bit)
1286 const int32x4_t result_A_s_32x4 = vaddl_s16(top_0_s_16x4, top_2_s_16x4);
1287 const int32x4_t result_B_s_32x4 = vaddl_s16(center_0_s_16x4, center_2_s_16x4);
1288 const int32x4_t result_C_s_32x4 = vaddl_s16(bottom_0_s_16x4, bottom_2_s_16x4);
1289 const int32x4_t result_D_s_32x4 = vaddl_s16(top_1_s_16x4, center_1_s_16x4);
1290
1291 // summing up the intermediate results
1292 const int32x4_t result_E_s_32x4 = vaddq_s32(result_A_s_32x4, result_B_s_32x4);
1293 const int32x4_t result_F_s_32x4 = vaddq_s32(result_C_s_32x4, result_D_s_32x4);
1294
1295 const int32x4_t result_G_s_32x4 = vaddq_s32(result_E_s_32x4, result_F_s_32x4);
1296
1297 // adding the last missing row
1298 return vaddw_s16(result_G_s_32x4, bottom_1_s_16x4);
1299}
1300
1301OCEAN_FORCE_INLINE uint64x2_t NEON::multiply(const uint64x2_t& value_u_64x2, const uint32x2_t& value_u_32x2)
1302{
1303 // uint64_t * uint32_t
1304 // = (high(uint64_t) + low(uint64_t)) * uint32_t
1305 // = (((high(uint64_t) >> 32) * uint32_t) << 32) + low(uint64_t) * uint32_t
1306
1307 // [ valueA_u_64, valueB_u64 ] -> [ high(valueA_u_64), high(valueB_u64) ], [ low(valueA_u_64), low(valueB_u64) ]
1308 const uint32x2x2_t value64_lowHigh_32x2x2 = vtrn_u32(vget_low_u32(vreinterpretq_u32_u64(value_u_64x2)), vget_high_u32(vreinterpretq_u32_u64(value_u_64x2)));
1309
1310 const uint64x2_t multiplication_low_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[0], value_u_32x2);
1311 const uint64x2_t multiplication_high_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[1], value_u_32x2);
1312
1313 const uint64x2_t shiftedMultiplication_high_64x2 = vshlq_n_u64(multiplication_high_64x2, 32);
1314
1315 return vaddq_u64(shiftedMultiplication_high_64x2, multiplication_low_64x2);
1316}
1317
1318OCEAN_FORCE_INLINE int32x4_t NEON::copySign(const uint32x4_t& signReceiver_u_32x4, const int32x4_t& signProvider_s_32x4)
1319{
1320 const int32x4_t negativeSignReceiver_u_32x4 = vnegq_s32(vreinterpretq_s32_u32(signReceiver_u_32x4));
1321
1322 const uint32x4_t isNegativeMask_u_32x4 = vcltq_s32(signProvider_s_32x4, vdupq_n_s32(0)); // sign < 0 ? 0xFF : 0x00;
1323 const uint32x4_t isPositiveMask_u_32x4 = vcgeq_s32(signProvider_s_32x4, vdupq_n_s32(0)); // sign >= 0 ? 0xFF : 0x00;
1324
1325 return vreinterpretq_s32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_s32(negativeSignReceiver_u_32x4), isNegativeMask_u_32x4), vandq_u32(signReceiver_u_32x4, isPositiveMask_u_32x4)));
1326}
1327
1328OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float32x4_t& sourceA_f_32x4, const float32x4_t& sourceB_f_32x4, const float32x4_t& sourceC_f_32x4, const float32x4_t& sourceD_f_32x4)
1329{
1330 const uint32x4_t targetA_u_32x4 = vcvtq_u32_f32(sourceA_f_32x4);
1331 const uint32x4_t targetB_u_32x4 = vcvtq_u32_f32(sourceB_f_32x4);
1332 const uint32x4_t targetC_u_32x4 = vcvtq_u32_f32(sourceC_f_32x4);
1333 const uint32x4_t targetD_u_32x4 = vcvtq_u32_f32(sourceD_f_32x4);
1334
1335 const uint16x8_t targetA_u_16x8 = vcombine_u16(vmovn_u32(targetA_u_32x4), vmovn_u32(targetB_u_32x4));
1336 const uint16x8_t targetB_u_16x8 = vcombine_u16(vmovn_u32(targetC_u_32x4), vmovn_u32(targetD_u_32x4));
1337
1338 return vcombine_u8(vmovn_u16(targetA_u_16x8), vmovn_u16(targetB_u_16x8));
1339}
1340
1341OCEAN_FORCE_INLINE uint8x16_t NEON::cast16ElementsNEON(const float* const source)
1342{
1343 ocean_assert(source != nullptr);
1344
1345#ifdef OCEAN_DEBUG
1346 for (unsigned int n = 0u; n < 16u; ++n)
1347 {
1348 ocean_assert(source[n] >= 0.0f && source[n] < 256.0f);
1349 }
1350#endif
1351
1352 return cast16ElementsNEON(vld1q_f32(source + 0), vld1q_f32(source + 4), vld1q_f32(source + 8), vld1q_f32(source + 12));
1353}
1354
1355OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8x16_t& source_u_8x16)
1356{
1357 const uint16x8_t sourceA_u_16x8 = vmovl_u8(vget_low_u8(source_u_8x16));
1358 const uint16x8_t sourceB_u_16x8 = vmovl_u8(vget_high_u8(source_u_8x16));
1359
1360 const uint32x4_t sourceA_u_32x4 = vmovl_u16(vget_low_u16(sourceA_u_16x8));
1361 const uint32x4_t sourceB_u_32x4 = vmovl_u16(vget_high_u16(sourceA_u_16x8));
1362 const uint32x4_t sourceC_u_32x4 = vmovl_u16(vget_low_u16(sourceB_u_16x8));
1363 const uint32x4_t sourceD_u_32x4 = vmovl_u16(vget_high_u16(sourceB_u_16x8));
1364
1365 float32x4x4_t result_u_32x4x4;
1366 result_u_32x4x4.val[0] = vcvtq_f32_u32(sourceA_u_32x4);
1367 result_u_32x4x4.val[1] = vcvtq_f32_u32(sourceB_u_32x4);
1368 result_u_32x4x4.val[2] = vcvtq_f32_u32(sourceC_u_32x4);
1369 result_u_32x4x4.val[3] = vcvtq_f32_u32(sourceD_u_32x4);
1370
1371 return result_u_32x4x4;
1372}
1373
1374OCEAN_FORCE_INLINE float32x4x4_t NEON::cast16ElementsNEON(const uint8_t* const source)
1375{
1376 ocean_assert(source != nullptr);
1377
1378 return cast16ElementsNEON(vld1q_u8(source));
1379}
1380
1381inline unsigned int NEON::interpolation2Channel16Bit1x1(const uint8_t* const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
1382{
1383 ocean_assert(pixel);
1384 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
1385
1386 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
1387}
1388
1389inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int /*size0*/, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1390{
1391 ocean_assert(pixel0 && pixel1);
1392
1393 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1394
1395 return sqrDistance((unsigned int)*pixel0, interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1396}
1397
1398inline unsigned int NEON::ssd2Channel16Bit1x1(const uint8_t* const pixel0, const uint8_t* const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f0x_y_, const unsigned int f0xy_, const unsigned int f0x_y, const unsigned int f0xy, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
1399{
1400 ocean_assert(pixel0 && pixel1);
1401
1402 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
1403 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1404
1405 return sqrDistance(interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy), interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
1406}
1407
1408}
1409
1410}
1411
1412#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1413
1414#endif // META_OCEAN_CV_NEON_H
This class implements computer vision functions using NEON extensions.
Definition NEON.h:34
static uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:678
static constexpr int8x16_t create_int8x16(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3, const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7, const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11, const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
Creates an int8x16_t vector from 16 individual int8_t values.
Definition NEON.h:609
static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:932
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1196
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint8_t threshold=192u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition NEON.h:916
static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:871
static uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:788
static uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:733
static void average48Elements1Channel8Bit3x3Approximation(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:1099
static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:963
static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t &value)
Moves the high 8 bits of four 16 bit elements to the low 8 bits and fill the high bits with 0.
Definition NEON.h:1230
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 8 foll...
Definition NEON.h:1164
static uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute difference determination for 16 elements with 8 bit precision.
Definition NEON.h:851
static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t &low, const uint32x4_t &high)
Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
Definition NEON.h:1240
static void average24Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:1068
static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:896
static uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:689
static constexpr int16x8_t create_int16x8(const int16_t v0, const int16_t v1, const int16_t v2, const int16_t v3, const int16_t v4, const int16_t v5, const int16_t v6, const int16_t v7)
Creates an int16x8_t vector from 8 individual int16_t values.
Definition NEON.h:618
static uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:667
static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t &value)
Removes (sets to zero) the high 8 bits of four 16 bit elements.
Definition NEON.h:1215
static uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:777
static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t &value)
Moves the high 16 bits of four 32 bit elements to the low 16 bits and fill the high bits with 0.
Definition NEON.h:1225
static uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:645
static constexpr uint16x8_t create_uint16x8(const uint16_t v0, const uint16_t v1, const uint16_t v2, const uint16_t v3, const uint16_t v4, const uint16_t v5, const uint16_t v6, const uint16_t v7)
Creates a uint16x8_t vector from 8 individual uint16_t values.
Definition NEON.h:636
static constexpr uint8x8_t create_uint8x8(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7)
Creates a uint8x8_t vector from 8 individual uint8_t values.
Definition NEON.h:591
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition NEON.h:1328
static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t &low, const uint16x8_t &high)
Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
Definition NEON.h:1245
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition NEON.h:1381
static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short *const rowTop, const short *const rowCenter, const short *const rowBottom)
Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
Definition NEON.h:1250
static uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:744
static uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:700
static uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:722
static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition NEON.h:1033
static uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:711
static uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:656
static uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:755
static uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition NEON.h:799
static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition NEON.h:988
static constexpr uint32x4_t create_uint32x4(const uint32_t v0, const uint32_t v1, const uint32_t v2, const uint32_t v3)
Creates a uint32x4_t vector from 4 individual uint32_t values.
Definition NEON.h:627
static uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 8 elements with 8 bit precision.
Definition NEON.h:823
static uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:766
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition NEON.h:1389
static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t &signReceiver, const int32x4_t &signProvider)
Copies the sign of a given value to another one.
Definition NEON.h:1318
static constexpr uint8x16_t create_uint8x16(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7, const uint8_t v8, const uint8_t v9, const uint8_t v10, const uint8_t v11, const uint8_t v12, const uint8_t v13, const uint8_t v14, const uint8_t v15)
Creates a uint8x16_t vector from 16 individual uint8_t values.
Definition NEON.h:600
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 b...
Definition NEON.h:1139
static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t &value_u_64x2, const uint32x2_t &value_u_32x2)
Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t value...
Definition NEON.h:1301
static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t &value)
Removes (sets to zero) the high 16 bits of four 32 bit elements.
Definition NEON.h:1210
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15