Ocean
Loading...
Searching...
No Matches
FrameConverterY10_Packed.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_FRAME_CONVERTER_Y_10_PACKED_H
9#define META_OCEAN_CV_FRAME_CONVERTER_Y_10_PACKED_H
10
11#include "ocean/cv/CV.h"
14#include "ocean/cv/NEON.h"
15
16#include "ocean/base/Memory.h"
18#include "ocean/base/Worker.h"
19
20#include <unordered_map>
21
22namespace Ocean
23{
24
25namespace CV
26{
27
28/**
29 * This class provides functions to convert frames with Y10_PACKED pixel format.
30 * @ingroup cv
31 */
32class OCEAN_CV_EXPORT FrameConverterY10_Packed : public FrameConverter
33{
34 public:
35
36 /**
37 * This class implements the manager for lookup tables.
 * The tables are stored in a map using the gamma value as key.
38 */
39 class LookupTableManager : public Singleton<LookupTableManager>
40 {
41 protected:
42
43 /// Definition of a map mapping gamma values to the memory of lookup tables.
44 using LookupTables = std::unordered_map<float, Memory>;
45
46 public:
47
48 /**
49 * Returns the lookup table for a gamma compression/correction function.
50 * The gamma compression/correction is based on the following equation
51 * <pre>
52 * Y8 = 255 * (Y10 / 1023) ^ gamma
53 * </pre>
54 * @param gamma The gamma value for which the lookup table will be returned, with range (0, 2)
55 * @return The requested lookup table, will be valid
56 */
57 const uint8_t* lookupTable(const float gamma);
58
59 protected:
60
61 /// The lookup tables.
 // NOTE(review): the member declaration documented above is not visible in this listing (the generated member index names it `LookupTables lookupTables_`) - confirm against the original header.
63
64 /// The lock of the manager.
 // NOTE(review): the member declaration documented above is not visible in this listing (the generated member index names it `Lock lock_`) - confirm against the original header.
66 };
67
68 public:
69
70 /**
71 * Converts a Y10_PACKED frame to a Y8 frame.
72 * @param source The source frame buffer, must be valid
73 * @param target The target frame buffer, must be valid
74 * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
75 * @param height The height of the frame in pixel, with range [1, infinity)
76 * @param flag Determining the type of conversion
77 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
78 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
79 * @param worker Optional worker object to distribute the computational load
80 */
81 static inline void convertY10_PackedToY8Linear(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
82
83 /**
84 * Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a lookup table.
85 * The gamma compression/correction is based on the following equation
86 * <pre>
87 * Y8 = 255 * (Y10 / 1023) ^ gamma
88 * </pre>
89 * @param source The source frame buffer, must be valid
90 * @param target The target frame buffer, must be valid
91 * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
92 * @param height The height of the frame in pixel, with range [1, infinity)
93 * @param flag Determining the type of conversion
94 * @param gamma The gamma value to be applied, with range (0, 2)
95 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
96 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
97 * @param worker Optional worker object to distribute the computational load
98 */
99 static inline void convertY10_PackedToY8GammaLUT(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
100
101 /**
102 * Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a 3-step linear approximation.
103 * The gamma compression/correction is based on the following equation
104 * <pre>
105 * Y8 = 255 * (Y10 / 1023) ^ gamma
106 * </pre>
107 * @param source The source frame buffer, must be valid
108 * @param target The target frame buffer, must be valid
109 * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
110 * @param height The height of the frame in pixel, with range [1, infinity)
111 * @param flag Determining the type of conversion
112 * @param gamma The gamma value to be applied, with range (0, 2)
113 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
114 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
115 * @param worker Optional worker object to distribute the computational load
116 */
117 static void convertY10_PackedToY8GammaApproximated(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
118
119 /**
120 * Converts a Y10_PACKED frame to a Y10 frame, so that this function simply unpacks the 10 bits.
121 * @param source The source frame buffer, must be valid
122 * @param target The target frame buffer, must be valid
123 * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
124 * @param height The height of the frame in pixel, with range [1, infinity)
125 * @param flag Determining the type of conversion
126 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
127 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
128 * @param worker Optional worker object to distribute the computational load
129 */
130 static inline void convertY10_PackedToY10(const uint8_t* const source, uint16_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
131
132 /**
133 * Converts a Y10_PACKED frame to a BGR24 frame.
 * The implementation forwards to convertY10_PackedToRGB24().
134 * @param source The source frame buffer, must be valid
135 * @param target The target frame buffer, must be valid
136 * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
137 * @param height The height of the frame in pixel, with range [1, infinity)
138 * @param flag Determining the type of conversion
139 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
140 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
141 * @param worker Optional worker object to distribute the computational load
142 */
143 static inline void convertY10_PackedToBGR24(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
144
145 /**
146 * Converts a Y10_PACKED frame to a RGB24 frame.
147 * @param source The source frame buffer, must be valid
148 * @param target The target frame buffer, must be valid
149 * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
150 * @param height The height of the frame in pixel, with range [1, infinity)
151 * @param flag Determining the type of conversion
152 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
153 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
154 * @param worker Optional worker object to distribute the computational load
155 */
156 static inline void convertY10_PackedToRGB24(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
157
158 protected:
159
160 /**
161 * Converts a Y10_Packed row to a Y8 row.
162 * This function simply applies a linear bit reduction from 10 bits to 8 bits.
163 * @param source The pointer to the source pixels, must be valid
164 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
165 * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
166 * @param unusedParameters Unused parameters, must be nullptr
167 */
168 static void convertRowY10_PackedToY8Linear(const uint8_t* source, uint8_t* target, const size_t size, const void* unusedParameters = nullptr);
169
170 /**
171 * Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a lookup table.
172 * @param source The pointer to the source pixels, must be valid
173 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
174 * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
175 * @param parameters The pointer to the `uint8_t` lookup table to be used, must be valid
176 */
177 static void convertRowY10_PackedToY8GammaLUT(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
178
179 /**
180 * Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a 3-step linear interpolation.
181 * @param source The pointer to the source pixels, must be valid
182 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
183 * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
184 * @param parameters Three linear slope parameters and two intercept parameters, must be valid; read as five consecutive `int` values { m0 * 256, m1 * 256, m2 * 256, c1, c2 }
 * @tparam tStep01 The 10-bit value separating the first and the second linear segment, with range (0, tStep12)
 * @tparam tStep12 The 10-bit value separating the second and the third linear segment, with range (tStep01, 1023)
185 */
186 template <unsigned int tStep01, unsigned int tStep12>
187 static void convertRowY10_PackedToY8GammaApproximated(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
188
189 /**
190 * Converts a Y10_Packed row to a Y10 row.
191 * This function simply applies an unpacking of the 10 bits.
192 * @param source The pointer to the source pixels, must be valid
193 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
194 * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
195 * @param unusedParameters Unused parameters, must be nullptr
196 */
197 static void convertRowY10_PackedToY10(const uint8_t* source, uint16_t* target, const size_t size, const void* unusedParameters = nullptr);
198
199 /**
200 * Converts a Y10_Packed row to a RGB24 row, or BGR24 row.
201 * This function simply applies a linear bit reduction from 10 bits to 8 bits.
202 * @param source The pointer to the source pixels, must be valid
203 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
204 * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
205 * @param unusedParameters Unused parameters, must be nullptr
206 */
207 static void convertRowY10_PackedToYYY24Linear(const uint8_t* source, uint8_t* target, const size_t size, const void* unusedParameters = nullptr);
208
209#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
210
211 /**
212 * Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying a linear conversion.
213 * @param source The source buffer with 20 elements, must be valid
214 * @param target The resulting 16 Y8 pixels, must be valid
215 */
216 static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8LinearNEON(const uint8_t* const source, uint8_t* const target);
217
218 /**
219 * Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying gamma compression/correction with a 3-step linear interpolation.
220 * @param source The source buffer with 20 elements, must be valid
221 * @param m0_256_s_16x4 The slope of the first linear approximation (with multiple of 256), with range (-10 * 256, 10 * 256)
222 * @param m1_256_s_16x4 The slope of the second linear approximation (with multiple of 256), with range (-10 * 256, 10 * 256)
223 * @param m2_256_s_16x4 The slope of the third linear approximation (with multiple of 256), with range (-10 * 256, 10 * 256)
224 * @param c1_s_16x8 The intercept of the second linear approximation, with range (-255, 255)
225 * @param c2_s_16x8 The intercept of the third linear approximation, with range (-255, 255)
226 * @param target The resulting 16 Y8 pixels, must be valid
 * @tparam tStep01 The 10-bit value separating the first and the second linear segment, with range (0, tStep12)
 * @tparam tStep12 The 10-bit value separating the second and the third linear segment, with range (tStep01, 1023)
227 */
228 template <unsigned int tStep01, unsigned int tStep12>
229 static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8ApproximatedNEON(const uint8_t* const source, const int16x4_t& m0_256_s_16x4, const int16x4_t& m1_256_s_16x4, const int16x4_t& m2_256_s_16x4, const int16x8_t& c1_s_16x8, const int16x8_t& c2_s_16x8, uint8_t* const target);
230
231#endif
232};
233
234inline void FrameConverterY10_Packed::convertY10_PackedToY8Linear(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
235{
236 ocean_assert(source != nullptr && target != nullptr);
237 ocean_assert(width >= 4u && height >= 1u);
238 ocean_assert(width % 4u == 0u);
239
240 const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
241 const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
242
243 constexpr void* options = nullptr;
244
245 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
246
247 FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToY8Linear, CV::FrameChannels::reverseRowPixelOrderInPlace<uint8_t, 1u>, areContinuous, options, worker);
248}
249
250inline void FrameConverterY10_Packed::convertY10_PackedToY8GammaLUT(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
251{
252 ocean_assert(source != nullptr && target != nullptr);
253 ocean_assert(width >= 4u && height >= 1u);
254 ocean_assert(width % 4u == 0u);
255
256 ocean_assert(gamma > 0.0f && gamma < 2.0f);
257
258 const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
259 const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
260
261 const void* const options = LookupTableManager::get().lookupTable(gamma);
262
263 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
264
265 FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToY8GammaLUT, CV::FrameChannels::reverseRowPixelOrderInPlace<uint8_t, 1u>, areContinuous, options, worker);
266}
267
268inline void FrameConverterY10_Packed::convertY10_PackedToY10(const uint8_t* source, uint16_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
269{
270 ocean_assert(source != nullptr && target != nullptr);
271 ocean_assert(width >= 4u && height >= 1u);
272 ocean_assert(width % 4u == 0u);
273
274 const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
275 const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
276
277 constexpr void* options = nullptr;
278
279 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
280
281 FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToY10, CV::FrameChannels::reverseRowPixelOrderInPlace<uint16_t, 1u>, areContinuous, options, worker);
282}
283
284inline void FrameConverterY10_Packed::convertY10_PackedToBGR24(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
285{
286 convertY10_PackedToRGB24(source, target, width, height, flag, sourcePaddingElements, targetPaddingElements, worker);
287}
288
289inline void FrameConverterY10_Packed::convertY10_PackedToRGB24(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
290{
291 ocean_assert(source != nullptr && target != nullptr);
292 ocean_assert(width >= 4u && height >= 1u);
293 ocean_assert(width % 4u == 0u);
294
295 const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
296 const unsigned int targetStrideElements = width * 3u + targetPaddingElements;
297
298 constexpr void* options = nullptr;
299
300 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
301
302 FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToYYY24Linear, CV::FrameChannels::reverseRowPixelOrderInPlace<uint8_t, 3u>, areContinuous, options, worker);
303}
304
305template <unsigned int tStep01, unsigned int tStep12>
306void FrameConverterY10_Packed::convertRowY10_PackedToY8GammaApproximated(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters)
307{
308 static_assert(0u < tStep01 && tStep01 < tStep12 && tStep12 < 1023u, "Invalid steps");
309
310 ocean_assert(source != nullptr && target != nullptr);
311 ocean_assert(size >= 4 && size % 4 == 0);
312 ocean_assert(parameters != nullptr);
313
314 // applying a 3-step linear approximation
315 // https://www.desmos.com/calculator/pezgk5slux
316
317 const int* coefficients = reinterpret_cast<const int*>(parameters);
318
319 const int32_t m0_256 = coefficients[0];
320 const int32_t m1_256 = coefficients[1];
321 const int32_t m2_256 = coefficients[2];
322
323 const int32_t c1 = coefficients[3];
324 const int32_t c2 = coefficients[4];
325
326 size_t blocks4 = size / size_t(4);
327
328#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
329
330 const size_t blocks16 = size / size_t(16);
331
332 const int16x4_t m0_256_s_16x4 = vdup_n_s16(int16_t(m0_256));
333 const int16x4_t m1_256_s_16x4 = vdup_n_s16(int16_t(m1_256));
334 const int16x4_t m2_256_s_16x4 = vdup_n_s16(int16_t(m2_256));
335
336 const int16x8_t c1_s_16x8 = vdupq_n_s16(int16_t(c1));
337 const int16x8_t c2_s_16x8 = vdupq_n_s16(int16_t(c2));
338
339 for (size_t n = 0; n < blocks16; ++n)
340 {
341 convert16PixelY10_PackedToY8ApproximatedNEON<tStep01, tStep12>(source, m0_256_s_16x4, m1_256_s_16x4, m2_256_s_16x4, c1_s_16x8, c2_s_16x8, target);
342
343 target += 16;
344 source += 20;
345 }
346
347 blocks4 = (size - blocks16 * size_t(16)) / size_t(4);
348 ocean_assert(blocks4 <= size / size_t(4));
349
350#endif // OCEAN_HARDWARE_NEON_VERSION
351
352 int32_t result256;
353
354 const int32_t c1_256 = c1 * 256;
355 const int32_t c2_256 = c2 * 256;
356
357 for (size_t n = 0; n < blocks4; ++n)
358 {
359 const int32_t x[4] =
360 {
361 int32_t(uint16_t(source[0]) << uint16_t(2) | (uint16_t(source[4]) & uint16_t(0b00000011))),
362 int32_t(uint16_t(source[1]) << uint16_t(2) | ((uint16_t(source[4]) & uint16_t(0b00001100)) >> uint16_t(2))),
363 int32_t(uint16_t(source[2]) << uint16_t(2) | ((uint16_t(source[4]) & uint16_t(0b00110000)) >> uint16_t(4))),
364 int32_t(uint16_t(source[3]) << uint16_t(2) | (uint16_t(source[4]) >> uint16_t(6)))
365 };
366
367 for (unsigned int i = 0u; i < 4u; ++i)
368 {
369 const uint32_t& xx = x[i];
370
371 if (xx < tStep01)
372 {
373 result256 = (m0_256 * xx);
374 }
375 else if (xx <= tStep12)
376 {
377 result256 = (m1_256 * xx + c1_256);
378 }
379 else
380 {
381 result256 = (m2_256 * xx + c2_256);
382 }
383
384 ocean_assert(0 <= result256 && result256 <= 255 * 256);
385
386 target[i] = int8_t((uint32_t(result256) + 128u) >> 8u);
387 }
388
389 target += 4;
390 source += 5;
391 }
392}
393
394#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
395
// Converts 16 Y10_Packed pixels (20 source bytes) to 16 Y8 pixels.
// The result holds the source bytes 0-3, 5-8, 10-13 and 15-18, i.e., the first four bytes of each 5-byte group;
// the bytes 4, 9, 14 and 19 are not part of the result.
// Both code paths read source bytes within [0, 20) only and write 16 target bytes.
396OCEAN_FORCE_INLINE void FrameConverterY10_Packed::convert16PixelY10_PackedToY8LinearNEON(const uint8_t* const source, uint8_t* const target)
397{
398#ifdef __aarch64__
399
 // two overlapping loads: source bytes [0, 16) and source bytes [12, 20)
400 const uint8x16_t packedA_u_8x16 = vld1q_u8(source);
401 const uint8x8_t packedB_u_8x8 = vld1_u8(source + 12);
402
403 // F E D C B A 9 8 7 6 5 4 3 2 1 0
404 // D C B A 8 7 6 5 3 2 1 0 X X X X
 // the four leading table indices (16) are out of range, these lanes are placeholders which are removed by the vextq_u8() below
405 constexpr uint8x16_t shuffle_u_8x16 = NEON::create_uint8x16(16u, 16u, 16u, 16u, 0u, 1u, 2u, 3u, 5u, 6u, 7u, 8u, 10u, 11u, 12u, 13u);
406 const uint8x16_t intermediateA_u_8x16 = vqtbl1q_u8(packedA_u_8x16, shuffle_u_8x16);
407
 // rotating the second load by three elements so that it starts with the source bytes 15, 16, 17, 18
408 const uint8x8_t intermediateB_u_8x8 = vext_u8(packedB_u_8x8, packedB_u_8x8, 3);
409
 // dropping the four placeholder lanes and appending the source bytes 15-18
410 const uint8x16_t target_u_8x16 = vextq_u8(intermediateA_u_8x16, vcombine_u8(intermediateB_u_8x8, intermediateB_u_8x8), 4);
411
412#else
413
 // the mask keeps the last three lanes only
414 constexpr uint8x16_t mask_u_8x16 = NEON::create_uint8x16(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0xFFu, 0xFFu, 0xFFu);
415
 // two overlapping loads: source bytes [0, 16) and source bytes [11, 19)
416 const uint8x16_t packedA_u_8x16 = vld1q_u8(source);
417 const uint8x8_t packedB_u_8x8 = vld1_u8(source + 11);
418
419 const uint8x8_t packedAA_u_8x8 = vget_low_u8(packedA_u_8x16);
420 const uint8x8_t packedAB_u_8x8 = vget_high_u8(packedA_u_8x16);
421
 // the table index 8 is out of range for the 8-element lookups; the leading placeholder lane is removed by the vextq_u8() below
422 constexpr uint8x8_t shuffleA_u_8x8 = NEON::create_uint8x8(8u, 0u, 1u, 2u, 3u, 5u, 6u, 7u);
423 constexpr uint8x8_t shuffleB_u_8x8 = NEON::create_uint8x8(0u, 2u, 3u, 4u, 5u, 7u, 8u, 8u);
424 const uint8x16_t intermediateA_u_8x16 = vextq_u8(vcombine_u8(vtbl1_u8(packedAA_u_8x8, shuffleA_u_8x8), vtbl1_u8(packedAB_u_8x8, shuffleB_u_8x8)), mask_u_8x16, 1); // we use the first zero element of mask_u_8x16
425
 // the last three lanes receive the source bytes 16, 17, 18 (elements 5, 6, 7 of the second load), all other lanes are zero
426 const uint8x16_t intermediateB_u_8x16 = vcombine_u8(vget_low_u8(mask_u_8x16), vand_u8(packedB_u_8x8, vget_high_u8(mask_u_8x16)));
427
 // merging the first 13 result lanes with the last 3 result lanes
428 const uint8x16_t target_u_8x16 = vorrq_u8(intermediateA_u_8x16, intermediateB_u_8x16);
429
430#endif // __aarch64__
431
432 vst1q_u8(target, target_u_8x16);
433}
434
435template <unsigned int tStep01, unsigned int tStep12>
436OCEAN_FORCE_INLINE void FrameConverterY10_Packed::convert16PixelY10_PackedToY8ApproximatedNEON(const uint8_t* const source, const int16x4_t& m0_s_16x4, const int16x4_t& m1_s_16x4, const int16x4_t& m2_s_16x4, const int16x8_t& c1_s_16x8, const int16x8_t& c2_s_16x8, uint8_t* const target)
437{
438 static_assert(0u < tStep01 && tStep01 < tStep12 && tStep12 < 1023u, "Invalid steps");
439
440 constexpr int8x16_t leftShifts_s_8x16 = NEON::create_int8x16(6, 0, 4, 0, 2, 0, 0, 0, 6, 0, 4, 0, 2, 0, 0, 0);
441 constexpr int16x8_t rightShifts_s_16x8 = NEON::create_int16x8(-6, -6, -6, -6, -6, -6, -6, -6);
442
443#ifdef __aarch64__
444
445 const uint8x16_t packedAB_u_8x16 = vld1q_u8(source);
446 const uint8x16_t packedCD_u_8x16 = vld1q_u8(source + 4);
447
448 // F E D C B A 9 8 7 6 5 4 3 2 1 0
449 // 8 9 7 9 6 9 5 9 3 4 2 4 1 4 0 4
450 constexpr uint8x16_t shuffleAB_u_8x16 = NEON::create_uint8x16(4u, 0u, 4u, 1u, 4u, 2u, 4u, 3u, 9u, 5u, 9u, 6u, 9u, 7u, 9u, 8u);
451 const uint8x16_t intermediateAB_u_8x16 = vqtbl1q_u8(packedAB_u_8x16, shuffleAB_u_8x16);
452
453 constexpr uint8x16_t shuffleCD_u_8x16 = NEON::create_uint8x16(10u, 6u, 10u, 7u, 10u, 8u, 10u, 9u, 15u, 11u, 15u, 12u, 15u, 13u, 15u, 14u);
454 const uint8x16_t intermediateCD_u_8x16 = vqtbl1q_u8(packedCD_u_8x16, shuffleCD_u_8x16);
455
456#else
457
458 constexpr uint8x8_t shuffleAB_u_8x8 = NEON::create_uint8x8(4u, 0u, 4u, 1u, 4u, 2u, 4u, 3u);
459 constexpr uint8x8_t shuffleC_u_8x8 = NEON::create_uint8x8(6u, 2u, 6u, 3u, 6u, 4u, 6u, 5u);
460 constexpr uint8x8_t shuffleD_u_8x8 = NEON::create_uint8x8(7u, 3u, 7u, 4u, 7u, 5u, 7u, 6u);
461
462 const uint8x16_t packedAB_u_8x16 = vld1q_u8(source);
463 const uint8x8_t packedForD_u_8x8 = vld1_u8(source + 12);
464
465 const uint8x8_t packedForA_u_8x8 = vget_low_u8(packedAB_u_8x16);
466 const uint8x8_t packedForB_u_8x8 = vget_low_u8(vextq_u8(packedAB_u_8x16, packedAB_u_8x16, 5));
467 const uint8x8_t packedForC_u_8x8 = vget_high_u8(packedAB_u_8x16);
468
469 const uint8x16_t intermediateAB_u_8x16 = vcombine_u8(vtbl1_u8(packedForA_u_8x8, shuffleAB_u_8x8), vtbl1_u8(packedForB_u_8x8, shuffleAB_u_8x8));
470 const uint8x16_t intermediateCD_u_8x16 = vcombine_u8(vtbl1_u8(packedForC_u_8x8, shuffleC_u_8x8), vtbl1_u8(packedForD_u_8x8, shuffleD_u_8x8));
471
472#endif // __aarch64__
473
474
475 // ... XXXXXX99 33333333 44XXXXXX 22222222 XX44XXXX 11111111 XXXX44XX 00000000 XXXXXX44
476 // ... 99------ 33333333 44------ 22222222 44------ 11111111 44------ 00000000 44------
477 const uint16x8_t intermediateAB_u_16x8 = vreinterpretq_u16_u8(vshlq_u8(intermediateAB_u_8x16, leftShifts_s_8x16));
478 const uint16x8_t intermediateCD_u_16x8 = vreinterpretq_u16_u8(vshlq_u8(intermediateCD_u_8x16, leftShifts_s_8x16));
479
480
481 // ... 99------ 33333333 44------ 22222222 44------ 11111111 44------ 00000000 44------
482 // ... 55555599 ------33 33333344 ------22 22222244 ------11 11111144 ------00 00000044
483 const uint16x8_t unpackedAB_u_16x8 = vshlq_u16(intermediateAB_u_16x8, rightShifts_s_16x8);
484 const uint16x8_t unpackedCD_u_16x8 = vshlq_u16(intermediateCD_u_16x8, rightShifts_s_16x8);
485
486 // now, we have 16 uin16_t unpacked values for which we will approximate the gamma compression/correction
487
488 // approximation via three linear equations
489 // [ 0, step01]: f_0(x) = m_0 * x, with f_0(0) = 0
490 // [step01, step12]: f_1(x) = m_1 * x + c_1
491 // [step21, 1 ]: f_2(x) = m_2 * x + c_2, with f_2(1) = 1
492
493 constexpr int16x8_t step01_s_16x8 = NEON::create_int16x8(int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01));
494 constexpr int16x8_t step12_s_16x8 = NEON::create_int16x8(int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12));
495
496 // determining masks to switch between one of the tree linear equations
497
498 const uint16x8_t isWithin0AB_u_16x8 = vcleq_s16(vreinterpretq_s16_u16(unpackedAB_u_16x8), step01_s_16x8); // unpackedAB <= step01 ? 0xFFFFFFFF : 0x00000000
499 const uint16x8_t isWithin0CD_u_16x8 = vcleq_s16(vreinterpretq_s16_u16(unpackedCD_u_16x8), step01_s_16x8);
500 const uint8x16_t isWithin0_u_8x16 = vcombine_u8(vmovn_u16(isWithin0AB_u_16x8), vmovn_u16(isWithin0CD_u_16x8));
501
502 const uint16x8_t isWithin2AB_u_16x8 = vcgtq_s16(vreinterpretq_s16_u16(unpackedAB_u_16x8), step12_s_16x8); // unpackedAB > step12 ? 0xFFFFFFFF : 0x00000000
503 const uint16x8_t isWithin2CD_u_16x8 = vcgtq_s16(vreinterpretq_s16_u16(unpackedCD_u_16x8), step12_s_16x8);
504 const uint8x16_t isWithin2_u_8x16 = vcombine_u8(vmovn_u16(isWithin2AB_u_16x8), vmovn_u16(isWithin2CD_u_16x8));
505
506 const uint8x16_t isWithin1_u_8x16 = vmvnq_u8(vorrq_u8(isWithin0_u_8x16, isWithin2_u_8x16)); // unpacked > step01 && unpacked <= step02 ? 0xFFFFFFFF : 0x00000000
507
508
509 const int16x4_t unpackedA_s_16x4 = vreinterpret_s16_u16(vget_low_u8(unpackedAB_u_16x8));
510 const int16x4_t unpackedB_s_16x4 = vreinterpret_s16_u16(vget_high_u8(unpackedAB_u_16x8));
511 const int16x4_t unpackedC_s_16x4 = vreinterpret_s16_u16(vget_low_u8(unpackedCD_u_16x8));
512 const int16x4_t unpackedD_s_16x4 = vreinterpret_s16_u16(vget_high_u8(unpackedCD_u_16x8));
513
514 // result0 = (m0 * x) / 256)
515 const uint16x8_t resultAB0_u_16x8 = vcombine_u16(vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedA_s_16x4), 8), vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedB_s_16x4), 8));
516 const uint16x8_t resultCD0_u_16x8 = vcombine_u16(vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedC_s_16x4), 8), vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedD_s_16x4), 8));
517
518 // result1 = ((m1 * x) / 256 + c1)
519 const int16x8_t resultAB1_s_16x8 = vaddq_s16(c1_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedA_s_16x4), 8), vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedB_s_16x4), 8)));
520 const int16x8_t resultCD1_s_16x8 = vaddq_s16(c1_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedC_s_16x4), 8), vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedD_s_16x4), 8)));
521
522 // result2 = ((m2 * x) / 256 + c2)
523 const int16x8_t resultAB2_s_16x8 = vaddq_s16(c2_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedA_s_16x4), 8), vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedB_s_16x4), 8)));
524 const int16x8_t resultCD2_s_16x8 = vaddq_s16(c2_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedC_s_16x4), 8), vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedD_s_16x4), 8)));
525
526 const uint8x16_t result0_u_8x16 = vcombine_u8(vqmovn_u16(resultAB0_u_16x8), vqmovn_u16(resultCD0_u_16x8));
527 const uint8x16_t result1_u_8x16 = vcombine_u8(vqmovun_s16(resultAB1_s_16x8), vqmovun_s16(resultCD1_s_16x8));
528 const uint8x16_t result2_u_8x16 = vcombine_u8(vqmovun_s16(resultAB2_s_16x8), vqmovun_s16(resultCD2_s_16x8));
529
530
531 // result0 & isWithin0 | result1 & isWithin1 | result2 & isWithin2
532 const uint8x16_t result_u_8x16 = vorrq_u8(vorrq_u8(vandq_u8(result0_u_8x16, isWithin0_u_8x16), vandq_u8(result1_u_8x16, isWithin1_u_8x16)), vandq_u8(result2_u_8x16, isWithin2_u_8x16));
533
534 vst1q_u8(target, result_u_8x16);
535}
536
537#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
538
539}
540
541}
542
543#endif // META_OCEAN_CV_FRAME_CONVERTER_Y_10_PACKED_H
This is the base class for all frame converter classes.
Definition FrameConverter.h:33
ConversionFlag
Definition of individual conversion flags.
Definition FrameConverter.h:40
static void convertGenericPixelFormat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int sourceStrideElements, const unsigned int targetStrideElements, const ConversionFlag flag, const RowConversionFunction< TSource, TTarget > rowConversionFunction, const RowReversePixelOrderInPlaceFunction< TTarget > targetReversePixelOrderInPlaceFunction, const bool areContinuous, const void *options, Worker *worker)
Converts a frame with generic pixel format (e.g., RGBA32, BGR24, YUV24, ...) to a frame with generic ...
Definition FrameConverter.h:3484
This class implements the manager for lookup tables.
Definition FrameConverterY10_Packed.h:40
const uint8_t * lookupTable(const float gamma)
Returns the lookup table for a gamma compression/correction function.
LookupTables lookupTables_
The lookup tables.
Definition FrameConverterY10_Packed.h:62
Lock lock_
The lock of the manager.
Definition FrameConverterY10_Packed.h:65
std::unordered_map< float, Memory > LookupTables
Definition of a map mapping gamma values to the memory of lookup tables.
Definition FrameConverterY10_Packed.h:44
This class provides functions to convert frames with Y10_PACKED pixel format.
Definition FrameConverterY10_Packed.h:33
static void convertY10_PackedToBGR24(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a BGR24 frame.
Definition FrameConverterY10_Packed.h:284
static void convertY10_PackedToY10(const uint8_t *const source, uint16_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y10 frame, so that this function simply unpacks the 10 bits.
Definition FrameConverterY10_Packed.h:268
static void convertY10_PackedToY8GammaApproximated(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a 3-step lin...
static void convertRowY10_PackedToY8GammaLUT(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a lookup table.
static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8LinearNEON(const uint8_t *const source, uint8_t *const target)
Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying a linear conversion...
Definition FrameConverterY10_Packed.h:396
static void convertRowY10_PackedToY10(const uint8_t *source, uint16_t *target, const size_t size, const void *unusedParameters=nullptr)
Converts a Y10_Packed row to a Y10 row.
static void convertRowY10_PackedToY8Linear(const uint8_t *source, uint8_t *target, const size_t size, const void *unusedParameters=nullptr)
Converts a Y10_Packed row to a Y8 row.
static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8ApproximatedNEON(const uint8_t *const source, const int16x4_t &m0_256_s_16x4, const int16x4_t &m1_256_s_16x4, const int16x4_t &m2_256_s_16x4, const int16x8_t &c1_s_16x8, const int16x8_t &c2_s_16x8, uint8_t *const target)
Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying gamma compression...
Definition FrameConverterY10_Packed.h:436
static void convertY10_PackedToY8Linear(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y8 frame.
Definition FrameConverterY10_Packed.h:234
static void convertY10_PackedToY8GammaLUT(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a lookup tab...
Definition FrameConverterY10_Packed.h:250
static void convertRowY10_PackedToYYY24Linear(const uint8_t *source, uint8_t *target, const size_t size, const void *unusedParameters=nullptr)
Converts a Y10_Packed row to an RGB24 row or a BGR24 row.
static void convertRowY10_PackedToY8GammaApproximated(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a 3-step linear i...
Definition FrameConverterY10_Packed.h:306
static void convertY10_PackedToRGB24(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to an RGB24 frame.
Definition FrameConverterY10_Packed.h:289
static constexpr int8x16_t create_int8x16(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3, const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7, const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11, const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
Creates an int8x16_t vector from 16 individual int8_t values.
Definition NEON.h:609
static constexpr int16x8_t create_int16x8(const int16_t v0, const int16_t v1, const int16_t v2, const int16_t v3, const int16_t v4, const int16_t v5, const int16_t v6, const int16_t v7)
Creates an int16x8_t vector from 8 individual int16_t values.
Definition NEON.h:618
static constexpr uint8x8_t create_uint8x8(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7)
Creates a uint8x8_t vector from 8 individual uint8_t values.
Definition NEON.h:591
static constexpr uint8x16_t create_uint8x16(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7, const uint8_t v8, const uint8_t v9, const uint8_t v10, const uint8_t v11, const uint8_t v12, const uint8_t v13, const uint8_t v14, const uint8_t v15)
Creates a uint8x16_t vector from 16 individual uint8_t values.
Definition NEON.h:600
This class implements a recursive lock object.
Definition Lock.h:31
This template class is the base class for all singleton objects.
Definition Singleton.h:71
static LookupTableManager & get()
Returns a reference to the unique object.
Definition Singleton.h:115
This class implements a worker able to distribute function calls over different threads.
Definition Worker.h:33
The namespace covering the entire Ocean framework.
Definition Accessor.h:15