Ocean
FrameConverterY10_Packed.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_FRAME_CONVERTER_Y_10_PACKED_H
9 #define META_OCEAN_CV_FRAME_CONVERTER_Y_10_PACKED_H
10 
11 #include "ocean/cv/CV.h"
12 #include "ocean/cv/FrameChannels.h"
14 
15 #include "ocean/base/Memory.h"
16 #include "ocean/base/Singleton.h"
17 #include "ocean/base/Worker.h"
18 
19 #include <unordered_map>
20 
21 namespace Ocean
22 {
23 
24 namespace CV
25 {
26 
27 /**
28  * This class provides functions to convert frames with Y10_PACKED pixel format.
29  * @ingroup cv
30  */
31 class OCEAN_CV_EXPORT FrameConverterY10_Packed : public FrameConverter
32 {
33  public:
34 
35  /**
36  * This class implements the manager for lookup tables.
37  */
38  class LookupTableManager : public Singleton<LookupTableManager>
39  {
40  protected:
41 
42  /// Definition of a map mapping gamma values to the memory of lookup tables.
43  typedef std::unordered_map<float, Memory> LookupTables;
44 
45  public:
46 
47  /**
48  * Returns the lookup table for a gamma compression/correction function.
49  * The gamma compression/correction is based the following equation
50  * <pre>
51  * Y8 = 255 * (Y10 / 1023) ^ gamma
52  * </pre>
53  * @param gamma The gamma value for which the lookup table will be returned, with range (0, 2)
54  * @return The requested lookup table, will be valid
55  */
56  const uint8_t* lookupTable(const float gamma);
57 
58  protected:
59 
60  /// The lookup tables.
62 
63  /// The lock of the manager.
65  };
66 
67  public:
68 
69  /**
70  * Converts a Y10_PACKED frame to a Y8 frame.
71  * @param source The source frame buffer, must be valid
72  * @param target The target frame buffer, must be valid
73  * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
74  * @param height The height of the frame in pixel, with range [1, infinity)
75  * @param flag Determining the type of conversion
76  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
77  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
78  * @param worker Optional worker object to distribute the computational load
79  */
80  static inline void convertY10_PackedToY8Linear(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
81 
82  /**
83  * Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a lookup table.
84  * The gamma compression/correction is based the following equation
85  * <pre>
86  * Y8 = 255 * (Y10 / 1023) ^ gamma
87  * </pre>
88  * @param source The source frame buffer, must be valid
89  * @param target The target frame buffer, must be valid
90  * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
91  * @param height The height of the frame in pixel, with range [1, infinity)
92  * @param flag Determining the type of conversion
93  * @param gamma The gamma value to be applied, with range (0, 2)
94  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
95  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
96  * @param worker Optional worker object to distribute the computational load
97  */
98  static inline void convertY10_PackedToY8GammaLUT(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
99 
100  /**
101  * Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a 3-step linear approximation.
102  * The gamma compression/correction is based the following equation
103  * <pre>
104  * Y8 = 255 * (Y10 / 1023) ^ gamma
105  * </pre>
106  * @param source The source frame buffer, must be valid
107  * @param target The target frame buffer, must be valid
108  * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
109  * @param height The height of the frame in pixel, with range [1, infinity)
110  * @param flag Determining the type of conversion
111  * @param gamma The gamma value to be applied, with range (0, 2)
112  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
113  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
114  * @param worker Optional worker object to distribute the computational load
115  */
116  static void convertY10_PackedToY8GammaApproximated(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
117 
118  /**
119  * Converts a Y10_PACKED frame to a Y10 frame, so that this function simply unpacks the 10 bits.
120  * @param source The source frame buffer, must be valid
121  * @param target The target frame buffer, must be valid
122  * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
123  * @param height The height of the frame in pixel, with range [1, infinity)
124  * @param flag Determining the type of conversion
125  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
126  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
127  * @param worker Optional worker object to distribute the computational load
128  */
129  static inline void convertY10_PackedToY10(const uint8_t* const source, uint16_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
130 
131  /**
132  * Converts a Y10_PACKED frame to a RGB24 frame.
133  * @param source The source frame buffer, must be valid
134  * @param target The target frame buffer, must be valid
135  * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
136  * @param height The height of the frame in pixel, with range [1, infinity)
137  * @param flag Determining the type of conversion
138  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
139  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
140  * @param worker Optional worker object to distribute the computational load
141  */
142  static inline void convertY10_PackedToBGR24(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
143 
144  /**
145  * Converts a Y10_PACKED frame to a RGB24 frame.
146  * @param source The source frame buffer, must be valid
147  * @param target The target frame buffer, must be valid
148  * @param width The width of the frame in pixel, with range [4, infinity), must be a multiple of 4
149  * @param height The height of the frame in pixel, with range [1, infinity)
150  * @param flag Determining the type of conversion
151  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
152  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
153  * @param worker Optional worker object to distribute the computational load
154  */
155  static inline void convertY10_PackedToRGB24(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
156 
157  protected:
158 
159  /**
160  * Converts a Y10_Packed row to a Y8 row.
161  * This function simply applies a linear bit reduction from 10 bits to 8 bits.
162  * @param source The pointer to the source pixels, must be valid
163  * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
164  * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
165  * @param unusedParameters Unused parameters, must be nullptr
166  */
167  static void convertRowY10_PackedToY8Linear(const uint8_t* source, uint8_t* target, const size_t size, const void* unusedParameters = nullptr);
168 
169  /**
170  * Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a lookup table.
171  * @param source The pointer to the source pixels, must be valid
172  * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
173  * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
174  * @param parameters The pointer to the `uint8_t` lookup table to be used, must be valid
175  */
176  static void convertRowY10_PackedToY8GammaLUT(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
177 
178  /**
179  * Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a 3-step linear interpolation.
180  * @param source The pointer to the source pixels, must be valid
181  * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
182  * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
183  * @param parameters Three linear slope parameters and two intercept parameters, must be valid
184  */
185  template <unsigned int tStep01, unsigned int tStep12>
186  static void convertRowY10_PackedToY8GammaApproximated(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
187 
188  /**
189  * Converts a Y10_Packed row to a Y10 row.
190  * This function simply applies an unpacking of the 10 bits.
191  * @param source The pointer to the source pixels, must be valid
192  * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
193  * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
194  * @param unusedParameters Unused parameters, must be nullptr
195  */
196  static void convertRowY10_PackedToY10(const uint8_t* source, uint16_t* target, const size_t size, const void* unusedParameters = nullptr);
197 
198  /**
199  * Converts a Y10_Packed row to a RGB24 row, or BGR24 row.
200  * This function simply applies a linear bit reduction from 10 bits to 8 bits.
201  * @param source The pointer to the source pixels, must be valid
202  * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
203  * @param size The number of source (and target pixels) to convert, with range [4, infinity), must be a multiple of 4
204  * @param unusedParameters Unused parameters, must be nullptr
205  */
206  static void convertRowY10_PackedToYYY24Linear(const uint8_t* source, uint8_t* target, const size_t size, const void* unusedParameters = nullptr);
207 
208 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
209 
210  /**
211  * Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying a liner conversion.
212  * @param source The souce buffer with 16 elements, must be valid
213  * @param target The resulting 16 Y8 pixels, must be valid
214  */
215  static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8LinearNEON(const uint8_t* const source, uint8_t* const target);
216 
217  /**
218  * Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying gamma compression/correction with a 3-step linear interpolation.
219  * @param source The souce buffer with 16 elements, must be valid
220  * @param m0_256_s_16x4 The slope of the first linear approximation (with multiple of 256), with range (-10 * 256, 10 * 256)
221  * @param m1_256_s_16x4 The slope of the second linear approximation (with multiple of 256), with range (-10 * 256, 10 * 256)
222  * @param m2_256_s_16x4 The slope of the third linear approximation (with multiple of 256), with range (-10 * 256, 10 * 256)
223  * @param c1_s_16x8 The intercept of the second linear approximation, with range (-255, 255)
224  * @param c2_s_16x8 The intercept of the third linear approximation, with range (-255, 255)
225  * @param target The resulting 16 Y8 pixels, must be valid
226  */
227  template <unsigned int tStep01, unsigned int tStep12>
228  static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8ApproximatedNEON(const uint8_t* const source, const int16x4_t& m0_256_s_16x4, const int16x4_t& m1_256_s_16x4, const int16x4_t& m2_256_s_16x4, const int16x8_t& c1_s_16x8, const int16x8_t& c2_s_16x8, uint8_t* const target);
229 
230 #endif
231 };
232 
233 inline void FrameConverterY10_Packed::convertY10_PackedToY8Linear(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
234 {
235  ocean_assert(source != nullptr && target != nullptr);
236  ocean_assert(width >= 4u && height >= 1u);
237  ocean_assert(width % 4u == 0u);
238 
239  const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
240  const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
241 
242  constexpr void* options = nullptr;
243 
244  const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
245 
246  FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToY8Linear, CV::FrameChannels::reverseRowPixelOrderInPlace<uint8_t, 1u>, areContinuous, options, worker);
247 }
248 
249 inline void FrameConverterY10_Packed::convertY10_PackedToY8GammaLUT(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
250 {
251  ocean_assert(source != nullptr && target != nullptr);
252  ocean_assert(width >= 4u && height >= 1u);
253  ocean_assert(width % 4u == 0u);
254 
255  ocean_assert(gamma > 0.0f && gamma < 2.0f);
256 
257  const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
258  const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
259 
260  const void* const options = LookupTableManager::get().lookupTable(gamma);
261 
262  const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
263 
264  FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToY8GammaLUT, CV::FrameChannels::reverseRowPixelOrderInPlace<uint8_t, 1u>, areContinuous, options, worker);
265 }
266 
267 inline void FrameConverterY10_Packed::convertY10_PackedToY10(const uint8_t* source, uint16_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
268 {
269  ocean_assert(source != nullptr && target != nullptr);
270  ocean_assert(width >= 4u && height >= 1u);
271  ocean_assert(width % 4u == 0u);
272 
273  const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
274  const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
275 
276  constexpr void* options = nullptr;
277 
278  const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
279 
280  FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToY10, CV::FrameChannels::reverseRowPixelOrderInPlace<uint16_t, 1u>, areContinuous, options, worker);
281 }
282 
283 inline void FrameConverterY10_Packed::convertY10_PackedToBGR24(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
284 {
285  convertY10_PackedToRGB24(source, target, width, height, flag, sourcePaddingElements, targetPaddingElements, worker);
286 }
287 
288 inline void FrameConverterY10_Packed::convertY10_PackedToRGB24(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
289 {
290  ocean_assert(source != nullptr && target != nullptr);
291  ocean_assert(width >= 4u && height >= 1u);
292  ocean_assert(width % 4u == 0u);
293 
294  const unsigned int sourceStrideElements = width * 5u / 4u + sourcePaddingElements;
295  const unsigned int targetStrideElements = width * 3u + targetPaddingElements;
296 
297  constexpr void* options = nullptr;
298 
299  const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
300 
301  FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, flag, convertRowY10_PackedToYYY24Linear, CV::FrameChannels::reverseRowPixelOrderInPlace<uint8_t, 3u>, areContinuous, options, worker);
302 }
303 
304 template <unsigned int tStep01, unsigned int tStep12>
305 void FrameConverterY10_Packed::convertRowY10_PackedToY8GammaApproximated(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters)
306 {
307  static_assert(0u < tStep01 && tStep01 < tStep12 && tStep12 < 1023u, "Invalid steps");
308 
309  ocean_assert(source != nullptr && target != nullptr);
310  ocean_assert(size >= 4 && size % 4 == 0);
311  ocean_assert(parameters != nullptr);
312 
313  // applying a 3-step linear approximation
314  // https://www.desmos.com/calculator/pezgk5slux
315 
316  const int* coefficients = reinterpret_cast<const int*>(parameters);
317 
318  const int32_t m0_256 = coefficients[0];
319  const int32_t m1_256 = coefficients[1];
320  const int32_t m2_256 = coefficients[2];
321 
322  const int32_t c1 = coefficients[3];
323  const int32_t c2 = coefficients[4];
324 
325  size_t blocks4 = size / size_t(4);
326 
327 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
328 
329  const size_t blocks16 = size / size_t(16);
330 
331  const int16x4_t m0_256_s_16x4 = vdup_n_s16(int16_t(m0_256));
332  const int16x4_t m1_256_s_16x4 = vdup_n_s16(int16_t(m1_256));
333  const int16x4_t m2_256_s_16x4 = vdup_n_s16(int16_t(m2_256));
334 
335  const int16x8_t c1_s_16x8 = vdupq_n_s16(int16_t(c1));
336  const int16x8_t c2_s_16x8 = vdupq_n_s16(int16_t(c2));
337 
338  for (size_t n = 0; n < blocks16; ++n)
339  {
340  convert16PixelY10_PackedToY8ApproximatedNEON<tStep01, tStep12>(source, m0_256_s_16x4, m1_256_s_16x4, m2_256_s_16x4, c1_s_16x8, c2_s_16x8, target);
341 
342  target += 16;
343  source += 20;
344  }
345 
346  blocks4 = (size - blocks16 * size_t(16)) / size_t(4);
347  ocean_assert(blocks4 <= size / size_t(4));
348 
349 #endif // OCEAN_HARDWARE_NEON_VERSION
350 
351  int32_t result256;
352 
353  const int32_t c1_256 = c1 * 256;
354  const int32_t c2_256 = c2 * 256;
355 
356  for (size_t n = 0; n < blocks4; ++n)
357  {
358  const int32_t x[4] =
359  {
360  int32_t(uint16_t(source[0]) << uint16_t(2) | (uint16_t(source[4]) & uint16_t(0b00000011))),
361  int32_t(uint16_t(source[1]) << uint16_t(2) | ((uint16_t(source[4]) & uint16_t(0b00001100)) >> uint16_t(2))),
362  int32_t(uint16_t(source[2]) << uint16_t(2) | ((uint16_t(source[4]) & uint16_t(0b00110000)) >> uint16_t(4))),
363  int32_t(uint16_t(source[3]) << uint16_t(2) | (uint16_t(source[4]) >> uint16_t(6)))
364  };
365 
366  for (unsigned int i = 0u; i < 4u; ++i)
367  {
368  const uint32_t& xx = x[i];
369 
370  if (xx < tStep01)
371  {
372  result256 = (m0_256 * xx);
373  }
374  else if (xx <= tStep12)
375  {
376  result256 = (m1_256 * xx + c1_256);
377  }
378  else
379  {
380  result256 = (m2_256 * xx + c2_256);
381  }
382 
383  ocean_assert(0 <= result256 && result256 <= 255 * 256);
384 
385  target[i] = int8_t((uint32_t(result256) + 128u) >> 8u);
386  }
387 
388  target += 4;
389  source += 5;
390  }
391 }
392 
393 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
394 
395 OCEAN_FORCE_INLINE void FrameConverterY10_Packed::convert16PixelY10_PackedToY8LinearNEON(const uint8_t* const source, uint8_t* const target)
396 {
397 #ifdef __aarch64__
398 
399  const uint8x16_t packedA_u_8x16 = vld1q_u8(source);
400  const uint8x8_t packedB_u_8x8 = vld1_u8(source + 12);
401 
402  // F E D C B A 9 8 7 6 5 4 3 2 1 0
403  // D C B A 8 7 6 5 3 2 1 0 X X X X
404  constexpr uint8x16_t shuffle_u_8x16 = {16u, 16u, 16u, 16u, 0u, 1u, 2u, 3u, 5u, 6u, 7u, 8u, 10u, 11u, 12u, 13u};
405  const uint8x16_t intermediateA_u_8x16 = vqtbl1q_u8(packedA_u_8x16, shuffle_u_8x16);
406 
407  const uint8x8_t intermediateB_u_8x8 = vext_u8(packedB_u_8x8, packedB_u_8x8, 3);
408 
409  const uint8x16_t target_u_8x16 = vextq_u8(intermediateA_u_8x16, vcombine_u8(intermediateB_u_8x8, intermediateB_u_8x8), 4);
410 
411 #else
412 
413  constexpr uint8x16_t mask_u_8x16 = {0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0xFFu, 0xFFu, 0xFFu};
414 
415  const uint8x16_t packedA_u_8x16 = vld1q_u8(source);
416  const uint8x8_t packedB_u_8x8 = vld1_u8(source + 11);
417 
418  const uint8x8_t packedAA_u_8x8 = vget_low_u8(packedA_u_8x16);
419  const uint8x8_t packedAB_u_8x8 = vget_high_u8(packedA_u_8x16);
420 
421  constexpr uint8x8_t shuffleA_u_8x8 = {8u, 0u, 1u, 2u, 3u, 5u, 6u, 7u};
422  constexpr uint8x8_t shuffleB_u_8x8 = {0u, 2u, 3u, 4u, 5u, 7u, 8u, 8u};
423  const uint8x16_t intermediateA_u_8x16 = vextq_u8(vcombine_u8(vtbl1_u8(packedAA_u_8x8, shuffleA_u_8x8), vtbl1_u8(packedAB_u_8x8, shuffleB_u_8x8)), mask_u_8x16, 1); // we use the first zero element of mask_u_8x16
424 
425  const uint8x16_t intermediateB_u_8x16 = vcombine_u8(vget_low_u8(mask_u_8x16), vand_u8(packedB_u_8x8, vget_high_u8(mask_u_8x16)));
426 
427  const uint8x16_t target_u_8x16 = vorrq_u8(intermediateA_u_8x16, intermediateB_u_8x16);
428 
429 #endif // __aarch64__
430 
431  vst1q_u8(target, target_u_8x16);
432 }
433 
434 template <unsigned int tStep01, unsigned int tStep12>
435 OCEAN_FORCE_INLINE void FrameConverterY10_Packed::convert16PixelY10_PackedToY8ApproximatedNEON(const uint8_t* const source, const int16x4_t& m0_s_16x4, const int16x4_t& m1_s_16x4, const int16x4_t& m2_s_16x4, const int16x8_t& c1_s_16x8, const int16x8_t& c2_s_16x8, uint8_t* const target)
436 {
437  static_assert(0u < tStep01 && tStep01 < tStep12 && tStep12 < 1023u, "Invalid steps");
438 
439  constexpr int8x16_t leftShifts_s_8x16 = {6, 0, 4, 0, 2, 0, 0, 0, 6, 0, 4, 0, 2, 0, 0, 0};
440  constexpr int16x8_t rightShifts_s_16x8 = {-6, -6, -6, -6, -6, -6, -6, -6};
441 
442 #ifdef __aarch64__
443 
444  const uint8x16_t packedAB_u_8x16 = vld1q_u8(source);
445  const uint8x16_t packedCD_u_8x16 = vld1q_u8(source + 4);
446 
447  // F E D C B A 9 8 7 6 5 4 3 2 1 0
448  // 8 9 7 9 6 9 5 9 3 4 2 4 1 4 0 4
449  constexpr uint8x16_t shuffleAB_u_8x16 = {4u, 0u, 4u, 1u, 4u, 2u, 4u, 3u, 9u, 5u, 9u, 6u, 9u, 7u, 9u, 8u};
450  const uint8x16_t intermediateAB_u_8x16 = vqtbl1q_u8(packedAB_u_8x16, shuffleAB_u_8x16);
451 
452  constexpr uint8x16_t shuffleCD_u_8x16 = {10u, 6u, 10u, 7u, 10u, 8u, 10u, 9u, 15u, 11u, 15u, 12u, 15u, 13u, 15u, 14u};
453  const uint8x16_t intermediateCD_u_8x16 = vqtbl1q_u8(packedCD_u_8x16, shuffleCD_u_8x16);
454 
455 #else
456 
457  constexpr uint8x8_t shuffleAB_u_8x8 = {4u, 0u, 4u, 1u, 4u, 2u, 4u, 3u};
458  constexpr uint8x8_t shuffleC_u_8x8 = {6u, 2u, 6u, 3u, 6u, 4u, 6u, 5u};
459  constexpr uint8x8_t shuffleD_u_8x8 = {7u, 3u, 7u, 4u, 7u, 5u, 7u, 6u};
460 
461  const uint8x16_t packedAB_u_8x16 = vld1q_u8(source);
462  const uint8x8_t packedForD_u_8x8 = vld1_u8(source + 12);
463 
464  const uint8x8_t packedForA_u_8x8 = vget_low_u8(packedAB_u_8x16);
465  const uint8x8_t packedForB_u_8x8 = vget_low_u8(vextq_u8(packedAB_u_8x16, packedAB_u_8x16, 5));
466  const uint8x8_t packedForC_u_8x8 = vget_high_u8(packedAB_u_8x16);
467 
468  const uint8x16_t intermediateAB_u_8x16 = vcombine_u8(vtbl1_u8(packedForA_u_8x8, shuffleAB_u_8x8), vtbl1_u8(packedForB_u_8x8, shuffleAB_u_8x8));
469  const uint8x16_t intermediateCD_u_8x16 = vcombine_u8(vtbl1_u8(packedForC_u_8x8, shuffleC_u_8x8), vtbl1_u8(packedForD_u_8x8, shuffleD_u_8x8));
470 
471 #endif // __aarch64__
472 
473 
474  // ... XXXXXX99 33333333 44XXXXXX 22222222 XX44XXXX 11111111 XXXX44XX 00000000 XXXXXX44
475  // ... 99------ 33333333 44------ 22222222 44------ 11111111 44------ 00000000 44------
476  const uint16x8_t intermediateAB_u_16x8 = vreinterpretq_u16_u8(vshlq_u8(intermediateAB_u_8x16, leftShifts_s_8x16));
477  const uint16x8_t intermediateCD_u_16x8 = vreinterpretq_u16_u8(vshlq_u8(intermediateCD_u_8x16, leftShifts_s_8x16));
478 
479 
480  // ... 99------ 33333333 44------ 22222222 44------ 11111111 44------ 00000000 44------
481  // ... 55555599 ------33 33333344 ------22 22222244 ------11 11111144 ------00 00000044
482  const uint16x8_t unpackedAB_u_16x8 = vshlq_u16(intermediateAB_u_16x8, rightShifts_s_16x8);
483  const uint16x8_t unpackedCD_u_16x8 = vshlq_u16(intermediateCD_u_16x8, rightShifts_s_16x8);
484 
485  // now, we have 16 uin16_t unpacked values for which we will approximate the gamma compression/correction
486 
487  // approximation via three linear equations
488  // [ 0, step01]: f_0(x) = m_0 * x, with f_0(0) = 0
489  // [step01, step12]: f_1(x) = m_1 * x + c_1
490  // [step21, 1 ]: f_2(x) = m_2 * x + c_2, with f_2(1) = 1
491 
492  constexpr int16x8_t step01_s_16x8 = {int32_t(tStep01), int32_t(tStep01), int32_t(tStep01), int32_t(tStep01), int32_t(tStep01), int32_t(tStep01), int32_t(tStep01), int32_t(tStep01)};
493  constexpr int16x8_t step12_s_16x8 = {int32_t(tStep12), int32_t(tStep12), int32_t(tStep12), int32_t(tStep12), int32_t(tStep12), int32_t(tStep12), int32_t(tStep12), int32_t(tStep12)};
494 
495  // determining masks to switch between one of the tree linear equations
496 
497  const uint16x8_t isWithin0AB_u_16x8 = vcleq_s16(vreinterpretq_s16_u16(unpackedAB_u_16x8), step01_s_16x8); // unpackedAB <= step01 ? 0xFFFFFFFF : 0x00000000
498  const uint16x8_t isWithin0CD_u_16x8 = vcleq_s16(vreinterpretq_s16_u16(unpackedCD_u_16x8), step01_s_16x8);
499  const uint8x16_t isWithin0_u_8x16 = vcombine_u8(vmovn_u16(isWithin0AB_u_16x8), vmovn_u16(isWithin0CD_u_16x8));
500 
501  const uint16x8_t isWithin2AB_u_16x8 = vcgtq_s16(vreinterpretq_s16_u16(unpackedAB_u_16x8), step12_s_16x8); // unpackedAB > step12 ? 0xFFFFFFFF : 0x00000000
502  const uint16x8_t isWithin2CD_u_16x8 = vcgtq_s16(vreinterpretq_s16_u16(unpackedCD_u_16x8), step12_s_16x8);
503  const uint8x16_t isWithin2_u_8x16 = vcombine_u8(vmovn_u16(isWithin2AB_u_16x8), vmovn_u16(isWithin2CD_u_16x8));
504 
505  const uint8x16_t isWithin1_u_8x16 = vmvnq_u8(vorrq_u8(isWithin0_u_8x16, isWithin2_u_8x16)); // unpacked > step01 && unpacked <= step02 ? 0xFFFFFFFF : 0x00000000
506 
507 
508  const int16x4_t unpackedA_s_16x4 = vreinterpret_s16_u16(vget_low_u8(unpackedAB_u_16x8));
509  const int16x4_t unpackedB_s_16x4 = vreinterpret_s16_u16(vget_high_u8(unpackedAB_u_16x8));
510  const int16x4_t unpackedC_s_16x4 = vreinterpret_s16_u16(vget_low_u8(unpackedCD_u_16x8));
511  const int16x4_t unpackedD_s_16x4 = vreinterpret_s16_u16(vget_high_u8(unpackedCD_u_16x8));
512 
513  // result0 = (m0 * x) / 256)
514  const uint16x8_t resultAB0_u_16x8 = vcombine_u16(vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedA_s_16x4), 8), vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedB_s_16x4), 8));
515  const uint16x8_t resultCD0_u_16x8 = vcombine_u16(vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedC_s_16x4), 8), vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedD_s_16x4), 8));
516 
517  // result1 = ((m1 * x) / 256 + c1)
518  const int16x8_t resultAB1_s_16x8 = vaddq_s16(c1_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedA_s_16x4), 8), vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedB_s_16x4), 8)));
519  const int16x8_t resultCD1_s_16x8 = vaddq_s16(c1_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedC_s_16x4), 8), vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedD_s_16x4), 8)));
520 
521  // result2 = ((m2 * x) / 256 + c2)
522  const int16x8_t resultAB2_s_16x8 = vaddq_s16(c2_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedA_s_16x4), 8), vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedB_s_16x4), 8)));
523  const int16x8_t resultCD2_s_16x8 = vaddq_s16(c2_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedC_s_16x4), 8), vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedD_s_16x4), 8)));
524 
525  const uint8x16_t result0_u_8x16 = vcombine_u8(vqmovn_u16(resultAB0_u_16x8), vqmovn_u16(resultCD0_u_16x8));
526  const uint8x16_t result1_u_8x16 = vcombine_u8(vqmovun_s16(resultAB1_s_16x8), vqmovun_s16(resultCD1_s_16x8));
527  const uint8x16_t result2_u_8x16 = vcombine_u8(vqmovun_s16(resultAB2_s_16x8), vqmovun_s16(resultCD2_s_16x8));
528 
529 
530  // result0 & isWithin0 | result1 & isWithin1 | result2 & isWithin2
531  const uint8x16_t result_u_8x16 = vorrq_u8(vorrq_u8(vandq_u8(result0_u_8x16, isWithin0_u_8x16), vandq_u8(result1_u_8x16, isWithin1_u_8x16)), vandq_u8(result2_u_8x16, isWithin2_u_8x16));
532 
533  vst1q_u8(target, result_u_8x16);
534 }
535 
536 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
537 
538 }
539 
540 }
541 
542 #endif // META_OCEAN_CV_FRAME_CONVERTER_Y_10_PACKED_H
This is the base class for all frame converter classes.
Definition: FrameConverter.h:32
ConversionFlag
Definition of individual conversion flags.
Definition: FrameConverter.h:39
static void convertGenericPixelFormat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int sourceStrideElements, const unsigned int targetStrideElements, const ConversionFlag flag, const RowConversionFunction< TSource, TTarget > rowConversionFunction, const RowReversePixelOrderInPlaceFunction< TTarget > targetReversePixelOrderInPlaceFunction, const bool areContinuous, const void *options, Worker *worker)
Converts a frame with generic pixel format (e.g., RGBA32, BGR24, YUV24, ...) to a frame with generic ...
Definition: FrameConverter.h:3160
This class implements the manager for lookup tables.
Definition: FrameConverterY10_Packed.h:39
const uint8_t * lookupTable(const float gamma)
Returns the lookup table for a gamma compression/correction function.
std::unordered_map< float, Memory > LookupTables
Definition of a map mapping gamma values to the memory of lookup tables.
Definition: FrameConverterY10_Packed.h:43
LookupTables lookupTables_
The lookup tables.
Definition: FrameConverterY10_Packed.h:61
Lock lock_
The lock of the manager.
Definition: FrameConverterY10_Packed.h:64
This class provides functions to convert frames with Y10_PACKED pixel format.
Definition: FrameConverterY10_Packed.h:32
static void convertY10_PackedToBGR24(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a BGR24 frame.
Definition: FrameConverterY10_Packed.h:283
static void convertY10_PackedToY10(const uint8_t *const source, uint16_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y10 frame, so that this function simply unpacks the 10 bits.
Definition: FrameConverterY10_Packed.h:267
static void convertY10_PackedToY8GammaApproximated(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a 3-step lin...
static void convertRowY10_PackedToY8GammaLUT(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a lookup table.
static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8LinearNEON(const uint8_t *const source, uint8_t *const target)
Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying a linear conversion...
Definition: FrameConverterY10_Packed.h:395
static void convertRowY10_PackedToY10(const uint8_t *source, uint16_t *target, const size_t size, const void *unusedParameters=nullptr)
Converts a Y10_Packed row to a Y10 row.
static void convertRowY10_PackedToY8Linear(const uint8_t *source, uint8_t *target, const size_t size, const void *unusedParameters=nullptr)
Converts a Y10_Packed row to a Y8 row.
static OCEAN_FORCE_INLINE void convert16PixelY10_PackedToY8ApproximatedNEON(const uint8_t *const source, const int16x4_t &m0_256_s_16x4, const int16x4_t &m1_256_s_16x4, const int16x4_t &m2_256_s_16x4, const int16x8_t &c1_s_16x8, const int16x8_t &c2_s_16x8, uint8_t *const target)
Converts 16 pixels (20 elements) of a Y10_Packed buffer to 16 Y8 pixels by applying gamma compression...
Definition: FrameConverterY10_Packed.h:435
static void convertY10_PackedToY8Linear(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y8 frame.
Definition: FrameConverterY10_Packed.h:233
static void convertY10_PackedToY8GammaLUT(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const float gamma, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to a Y8 frame by applying gamma compression/correction using a lookup tab...
Definition: FrameConverterY10_Packed.h:249
static void convertRowY10_PackedToYYY24Linear(const uint8_t *source, uint8_t *target, const size_t size, const void *unusedParameters=nullptr)
Converts a Y10_Packed row to an RGB24 row or a BGR24 row.
static void convertRowY10_PackedToY8GammaApproximated(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a Y10_Packed row to a Y8 row by applying gamma compression/correction with a 3-step linear i...
Definition: FrameConverterY10_Packed.h:305
static void convertY10_PackedToRGB24(const uint8_t *const source, uint8_t *const target, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Converts a Y10_PACKED frame to an RGB24 frame.
Definition: FrameConverterY10_Packed.h:288
This class implements a recursive lock object.
Definition: Lock.h:31
This template class is the base class for all singleton objects.
Definition: Singleton.h:71
static LookupTableManager & get()
Returns a reference to the unique object.
Definition: Singleton.h:115
This class implements a worker able to distribute function calls over different threads.
Definition: Worker.h:33
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15