Ocean
Loading...
Searching...
No Matches
FrameChannels.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_FRAME_CHANNELS_H
9#define META_OCEAN_CV_FRAME_CHANNELS_H
10
11#include "ocean/cv/CV.h"
13#include "ocean/cv/NEON.h"
14#include "ocean/cv/SSE.h"
15
16#include "ocean/base/DataType.h"
17#include "ocean/base/Frame.h"
18#include "ocean/base/Worker.h"
19
20namespace Ocean
21{
22
23namespace CV
24{
25
26/**
27 * This class implements frame channel conversion, transformation and extraction functions.
28 * @ingroup cv
29 */
30class OCEAN_CV_EXPORT FrameChannels : public FrameConverter
31{
32 public:
33
34 /**
35 * Definition of a constant specifying that the number of channels is not known at compile time but at runtime only.
36 */
37 static constexpr unsigned int CHANNELS_NOT_KNOWN_AT_COMPILE_TIME = 0u;
38
39 /**
40 * Definition of a function pointer to a function able to operate on an entire image row; the function receives the source row, the target row, 'width', 'height', the index of the row, and the strides of source and target in elements.
41 */
42 template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels>
43 using RowOperatorFunction = void(*)(const TSource* sourceRow, TTarget* targetRow, const unsigned int width, const unsigned int height, unsigned int rowIndex, const unsigned int sourceStrideElements, const unsigned int targetStrideElements);
44
45 /**
46 * The following comfort class provides comfortable functions simplifying prototyping applications but also increasing binary size of the resulting applications.
47 * Best practice is to avoid using these functions if binary size matters,<br>
48 * as for every comfort function a corresponding function exists with specialized functionality not increasing binary size significantly.<br>
49 */
50 class OCEAN_CV_EXPORT Comfort
51 {
52 public:
53
54 /**
55 * Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24, FORMAT_BGRA32 into individual frames with one channel only.
56 * Usage:
57 * @code
58 * Frame rgbSourceFrame = ...;
59 *
60 * Frames targetFrames;
61 *
62 * if (separateTo1Channel(rgbSourceFrame, targetFrames))
63 * {
64 * ocean_assert(targetFrames.size() == 3);
65 *
66 * // do something with targetFrames
67 * }
68 * @endcode
69 * @param sourceFrame The frame to be separated, must be valid
70 * @param targetFrames The resulting frames each holding one channel of the source frame, will be set automatically
71 * @param targetPixelFormat Optional explicit pixel format of the target frames, must be a pixel format with 1 channel and must fit with the data type of the source pixel format, otherwise FORMAT_UNDEFINED
72 * @return True, if succeeded
73 */
74 static bool separateTo1Channel(const Frame& sourceFrame, Frames& targetFrames, const FrameType::PixelFormat targetPixelFormat = FrameType::FORMAT_UNDEFINED);
75
76 /**
77 * Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24, FORMAT_BGRA32 into individual frames with one channel only.
78 * Usage:
79 * @code
80 * Frame rgbSourceFrame = ...;
81 *
82 * Frame targetFrameA;
83 * Frame targetFrameB;
84 * Frame targetFrameC;
85 *
86 * if (separateTo1Channel(rgbSourceFrame, {&targetFrameA, &targetFrameB, &targetFrameC}))
87 * {
88 * // do something with targetFrameA, targetFrameB, and targetFrameC
89 * }
90 * @endcode
91 * @param sourceFrame The frame to be separated, must be valid
92 * @param targetFrames The pointers to the resulting frames each holding one channel of the source frame, one for each source channel
93 * @param targetPixelFormat Optional explicit pixel format of the target frames, must be a pixel format with 1 channel and must fit with the data type of the source pixel format, otherwise FORMAT_UNDEFINED
94 * @return True, if succeeded
95 */
96 static bool separateTo1Channel(const Frame& sourceFrame, const std::initializer_list<Frame*>& targetFrames, const FrameType::PixelFormat targetPixelFormat = FrameType::FORMAT_UNDEFINED);
97
98 /**
99 * Zips/interleaves 1-channel images into one image with n-channels.
100 * Usage:
101 * @code
102 * Frame sourceFrameA = ...;
103 * Frame sourceFrameB = ...;
104 * Frame sourceFrameC = ...;
105 *
106 * Frame targetFrame;
107 * if (zipChannels({sourceFrameA, sourceFrameB, sourceFrameC}, targetFrame))
108 * {
109 * ocean_assert(targetFrame.channels() == 3u);
110 *
111 * // do something with targetFrame
112 * }
113 * @endcode
114 * @param sourceFrames The frames to be zipped/interleaved, must be valid
115 * @param targetFrame The resulting frame holding n channels, will be set automatically
116 * @param targetPixelFormat Optional explicit pixel format of the target frame, must fit with the data type of the source pixel format, otherwise FORMAT_UNDEFINED; NOTE(review): the previous text required a pixel format 'with 1 channel', presumably a format with one channel per source frame is meant - TODO confirm
117 * @return True, if succeeded
118 */
119 static bool zipChannels(const std::initializer_list<Frame>& sourceFrames, Frame& targetFrame, const FrameType::PixelFormat targetPixelFormat = FrameType::FORMAT_UNDEFINED);
120
121 /**
122 * Zips/interleaves 1-channel images into one image with n-channels.
123 * Usage:
124 * @code
125 * Frames sourceFrames = ...;
126 *
127 * Frame targetFrame;
128 * if (zipChannels(sourceFrames, targetFrame))
129 * {
130 * ocean_assert(targetFrame.channels() == sourceFrames.size());
131 *
132 * // do something with targetFrame
133 * }
134 * @endcode
135 * @param sourceFrames The frames to be zipped/interleaved, must be valid
136 * @param targetFrame The resulting frame holding n channels, will be set automatically
137 * @param targetPixelFormat Optional explicit pixel format of the target frame, must fit with the data type of the source pixel format, otherwise FORMAT_UNDEFINED; NOTE(review): the previous text required a pixel format 'with 1 channel', presumably a format with one channel per source frame is meant - TODO confirm
138 * @return True, if succeeded
139 */
140 static bool zipChannels(const Frames& sourceFrames, Frame& targetFrame, const FrameType::PixelFormat targetPixelFormat = FrameType::FORMAT_UNDEFINED);
141
142 /**
143 * Converts an image with premultiplied alpha to a straight image (without premultiplied alpha).
144 * @param frame The image to convert, must be valid
145 * @param worker Optional worker object to distribute the computation
146 * @return True, if succeeded
147 * @see straightAlphaToPremultipliedAlpha().
148 */
149 static bool premultipliedAlphaToStraightAlpha(Frame& frame, Worker* worker = nullptr);
150
151 /**
152 * Converts an image with premultiplied alpha to a straight image (without premultiplied alpha).
153 * @param source The source image to convert, must be valid
154 * @param target The resulting converted target image, the frame type will be changed if it does not match the source frame
155 * @param worker Optional worker object to distribute the computation
156 * @return True, if succeeded
157 * @see straightAlphaToPremultipliedAlpha().
158 */
159 static bool premultipliedAlphaToStraightAlpha(const Frame& source, Frame& target, Worker* worker = nullptr);
160
161 /**
162 * Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied alpha.
163 * @param frame The image to convert, must be valid
164 * @param worker Optional worker object to distribute the computation
165 * @see premultipliedAlphaToStraightAlpha().
166 */
167 static bool straightAlphaToPremultipliedAlpha(Frame& frame, Worker* worker = nullptr);
168
169 /**
170 * Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied alpha.
171 * @param source The source image to convert, must be valid
172 * @param target The resulting converted target image, must be valid
173 * @param worker Optional worker object to distribute the computation
174 * @see premultipliedAlphaToStraightAlpha().
175 */
176 static bool straightAlphaToPremultipliedAlpha(const Frame& source, Frame& target, Worker* worker = nullptr);
177 };
178
179 /**
180 * Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24, FORMAT_BGRA32 into individual frames with one channel only.
181 * Usage:
182 * @code
183 * const unsigned int width = ...;
184 * const unsigned int height = ...;
185 *
186 * const uint8_t* sourceFrame = ...;
187 * const unsigned int sourceFramePaddingElements = ...;
188 *
189 * constexpr unsigned int channels = 2u;
190 *
191 * uint8_t* targetFrames[channels] = {..., ...}; // the target frames are written, so the pointers must not point to const memory
192 * const unsigned int targetFramesPaddingElements[channels] = {..., ...};
193 *
194 * separateTo1Channel<uint8_t, uint8_t, channels>(sourceFrame, targetFrames, width, height, channels, sourceFramePaddingElements, targetFramesPaddingElements);
195 * @endcode
196 * @param sourceFrame The frame to be separated, must be valid
197 * @param targetFrames The pointers to the resulting separated frames each holding one channel of the source frame, with already allocated memory
198 * @param width The width of the source frame in pixels, with range [1, infinity)
199 * @param height The height of the source frame in pixels, with range [1, infinity)
200 * @param channels The number of channels the source frame has, with range [1, infinity)
201 * @param sourceFramePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
202 * @param targetFramesPaddingElements The array of padding elements at the end of each target row, one for each target frame, in elements, with range [0, infinity), nullptr if all are zero
203 * @tparam TSource The data type of each source pixel channel, e.g., 'uint8_t', 'float', ...
204 * @tparam TTarget The data type of each target pixel channel, e.g., 'uint8_t', 'float', ...
205 * @tparam tChannels The number of source channels (and target frames) if known at compile time; otherwise CHANNELS_NOT_KNOWN_AT_COMPILE_TIME == 0, if known at compile time must be identical with 'channels'
206 */
207 template <typename TSource, typename TTarget, unsigned int tChannels = CHANNELS_NOT_KNOWN_AT_COMPILE_TIME>
208 static void separateTo1Channel(const TSource* const sourceFrame, TTarget* const* const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int* targetFramesPaddingElements);
209
210 /**
211 * Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24, FORMAT_BGRA32 into individual frames with one channel only.
212 * Usage:
213 * @code
214 * const unsigned int width = ...;
215 * const unsigned int height = ...;
216 *
217 * const uint8_t* sourceFrame = ...;
218 * const unsigned int sourceFramePaddingElements = ...;
219 *
220 * uint8_t* targetFrame0 = ...;
221 * uint8_t* targetFrame1 = ...;
222 * const unsigned int targetFramePaddingElements0 = ...;
223 * const unsigned int targetFramePaddingElements1 = ...;
224 *
225 * separateTo1Channel<uint8_t, uint8_t>(sourceFrame, {targetFrame0, targetFrame1}, width, height, sourceFramePaddingElements, {targetFramePaddingElements0, targetFramePaddingElements1});
226 * @endcode
227 * @param sourceFrame The frame to be separated, must be valid
228 * @param targetFrames The pointers to the resulting separated frames each holding one channel of the source frame, with already allocated memory; presumably the number of provided pointers defines the number of source channels - TODO confirm
229 * @param width The width of the source frame in pixels, with range [1, infinity)
230 * @param height The height of the source frame in pixels, with range [1, infinity)
231 * @param sourceFramePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
232 * @param targetFramesPaddingElements The list of padding elements at the end of each target row, one for each target frame, in elements, with range [0, infinity)
233 * @tparam TSource The data type of each source pixel channel, e.g., 'uint8_t', 'float', ...
234 * @tparam TTarget The data type of each target pixel channel, e.g., 'uint8_t', 'float', ...
235 */
236 template <typename TSource, typename TTarget>
237 static void separateTo1Channel(const TSource* const sourceFrame, const std::initializer_list<TTarget*>& targetFrames, const unsigned int width, const unsigned int height, const unsigned int sourceFramePaddingElements, const std::initializer_list<const unsigned int>& targetFramesPaddingElements);
238
239 /**
240 * Zips/interleaves 1-channel images into one image with n-channels.
241 * Usage:
242 * @code
243 * const unsigned int width = ...;
244 * const unsigned int height = ...;
245 *
246 * const uint8_t* sourceFrames[2] = {..., ...};
247 * const unsigned int sourceFramesPaddingElements[2] = {..., ...};
248 *
249 * uint8_t* targetFrame = ...;
250 * const unsigned int targetFramePaddingElements = ...;
251 *
252 * zipChannels<uint8_t, uint8_t>(sourceFrames, targetFrame, width, height, 2u, sourceFramesPaddingElements, targetFramePaddingElements);
253 * @endcode
254 * @param sourceFrames The pointers to the individual 1-channel frames, one for each image, must be valid
255 * @param targetFrame The pointer to the resulting zipped frame holding n-channels, must be valid
256 * @param width The width of the source frames in pixels, with range [1, infinity)
257 * @param height The height of the source frames in pixels, with range [1, infinity)
258 * @param channels The number of provided source frames (and the number of channels the target frame will have), with range [1, infinity)
259 * @param sourceFramesPaddingElements The array of padding elements at the end of each source row, one for each source frame, in elements, with range [0, infinity), nullptr if all are zero
260 * @param targetFramePaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
261 * @tparam TSource The data type of each source pixel channel, e.g., 'uint8_t', 'float', ...
262 * @tparam TTarget The data type of each target pixel channel, e.g., 'uint8_t', 'float', ...
263 * @tparam tChannels The number of source frames (and target channels) if known at compile time; otherwise CHANNELS_NOT_KNOWN_AT_COMPILE_TIME == 0, if known at compile time must be identical with 'channels'
264 */
265 template <typename TSource, typename TTarget, unsigned int tChannels = CHANNELS_NOT_KNOWN_AT_COMPILE_TIME>
266 static void zipChannels(const TSource* const* const sourceFrames, TTarget* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements);
267
268 /**
269 * Zips/interleaves 1-channel images into one image with n-channels.
270 * Usage:
271 * @code
272 * const unsigned int width = ...;
273 * const unsigned int height = ...;
274 *
275 * const uint8_t* sourceFrame0 = ...;
276 * const uint8_t* sourceFrame1 = ...;
277 * const unsigned int sourceFramePaddingElements0 = ...;
278 * const unsigned int sourceFramePaddingElements1 = ...;
279 *
280 * uint8_t* targetFrame = ...;
281 * const unsigned int targetFramePaddingElements = ...;
282 *
283 * zipChannels<uint8_t, uint8_t>({sourceFrame0, sourceFrame1}, targetFrame, width, height, {sourceFramePaddingElements0, sourceFramePaddingElements1}, targetFramePaddingElements);
284 * @endcode
285 * @param sourceFrames The pointers to the individual 1-channel frames, one for each image, must be valid
286 * @param targetFrame The pointer to the resulting zipped frame holding n-channels, must be valid
287 * @param width The width of the source frames in pixels, with range [1, infinity)
288 * @param height The height of the source frames in pixels, with range [1, infinity)
289 * @param sourceFramesPaddingElements The list of padding elements at the end of each source row, one for each source frame, in elements, with range [0, infinity)
290 * @param targetFramePaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
291 * @tparam TSource The data type of each source pixel channel, e.g., 'uint8_t', 'float', ...
292 * @tparam TTarget The data type of each target pixel channel, e.g., 'uint8_t', 'float', ...
293 */
294 template <typename TSource, typename TTarget>
295 static void zipChannels(const std::initializer_list<const TSource*>& sourceFrames, TTarget* const targetFrame, const unsigned int width, const unsigned int height, const std::initializer_list<unsigned int>& sourceFramesPaddingElements, const unsigned int targetFramePaddingElements);
296
297 /**
298 * Adds a new channel to a given frame with zipped pixel format, the new channel will be added to the front of all existing channels.
299 * @param source The source frame to which the new channel will be added, must be valid
300 * @param sourceNewChannel The 1-channel frame providing the new channel information, must be valid
301 * @param target The target frame receiving the joined channels, must be valid
302 * @param width The width of the frames in pixels, with range [1, infinity)
303 * @param height The height of the frames in pixels, with range [1, infinity)
304 * @param conversionFlag The conversion to be applied
305 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
306 * @param sourceNewChannelPaddingElements The number of padding elements at the end of each new-channel-source row, in elements, with range [0, infinity)
307 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
308 * @param worker Optional worker object to distribute the computational load
309 * @tparam T Data type of each channel pixel value
310 * @tparam tSourceChannels Number of channels of the source frame (without the new channel), with range [1, infinity)
311 */
312 template <typename T, unsigned int tSourceChannels>
313 static inline void addFirstChannel(const T* source, const T* sourceNewChannel, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int sourceNewChannelPaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
314
315 /**
316 * Adds a new channel to a given frame with zipped pixel format, the value of the new channel will be the same for each pixel.
317 * @param source The source frame providing the existing channels
318 * @param newChannelValue Value that will be assigned to the new channel for each pixel
319 * @param target The target frame receiving the existing channels and the new channel (as new first channel)
320 * @param width The width of the frames in pixels, with range [1, infinity)
321 * @param height The height of the frames in pixels, with range [1, infinity)
322 * @param conversionFlag The conversion to be applied
323 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
324 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
325 * @param worker Optional worker object to distribute the computational load
326 * @tparam T Data type of each channel pixel value
327 * @tparam tSourceChannels Number of channels of the source frame (without the new channel)
328 */
329 template <typename T, unsigned int tSourceChannels>
330 static inline void addFirstChannelValue(const T* source, const T newChannelValue, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
331
332 /**
333 * Adds a new channel to a given frame with zipped pixel format, the new channel will be added to the back of all existing channels.
334 * @param source The source frame to which the new channel will be added, must be valid
335 * @param sourceNewChannel The 1-channel frame providing the new channel information, must be valid
336 * @param target The target frame receiving the joined channels, must be valid
337 * @param width The width of the frames in pixels, with range [1, infinity)
338 * @param height The height of the frames in pixels, with range [1, infinity)
339 * @param conversionFlag The conversion to be applied
340 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
341 * @param sourceNewChannelPaddingElements The number of padding elements at the end of each new-channel-source row, in elements, with range [0, infinity)
342 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
343 * @param worker Optional worker object to distribute the computational load
344 * @tparam T Data type of each channel pixel value
345 * @tparam tSourceChannels Number of channels of the source frame (without the new channel), with range [1, infinity)
346 */
347 template <typename T, unsigned int tSourceChannels>
348 static inline void addLastChannel(const T* source, const T* sourceNewChannel, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int sourceNewChannelPaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
349
350 /**
351 * Adds a new channel to a given frame with zipped pixel format, the value of the new channel will be the same for each pixel.
352 * @param source The source frame providing the existing channels
353 * @param newChannelValue Value that will be assigned to the new channel for each pixel
354 * @param target The target frame receiving the existing channels and the new channel (as new last channel)
355 * @param width The width of the frames in pixels, with range [1, infinity)
356 * @param height The height of the frames in pixels, with range [1, infinity)
357 * @param conversionFlag The conversion to be applied
358 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
359 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
360 * @param worker Optional worker object to distribute the computational load
361 * @tparam T Data type of each channel pixel value
362 * @tparam tSourceChannels Number of channels of the source frame (without the new channel)
363 */
364 template <typename T, unsigned int tSourceChannels>
365 static inline void addLastChannelValue(const T* source, const T newChannelValue, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
366
367 /**
368 * Removes the first channel from a given frame with zipped (generic) pixel format.
369 * This function is mainly a wrapper around FrameChannels::shuffleChannels().
370 * @param source The source frame from which the first channel will be removed, must be valid
371 * @param target The target frame without the first channel, must be valid
372 * @param width The width of the frames in pixels, with range [1, infinity)
373 * @param height The height of the frames in pixels, with range [1, infinity)
374 * @param conversionFlag The conversion to be applied
375 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
376 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
377 * @param worker Optional worker object to distribute the computational load
378 * @tparam T Data type of each channel pixel value
379 * @tparam tSourceChannels Number of channels of the source frame (including the channel that will be removed), with range [2, infinity)
380 * @see FrameChannels::shuffleChannels<T, tSourceChannels, tTargetChannels, tShufflePattern>(), removeLastChannel().
381 */
382 template <typename T, unsigned int tSourceChannels>
383 static inline void removeFirstChannel(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
384
385 /**
386 * Removes the last channel from a given frame with zipped (generic) pixel format.
387 * This function is mainly a wrapper around FrameChannels::shuffleChannels().
388 * @param source The source frame from which the last channel will be removed, must be valid
389 * @param target The target frame without the last channel, must be valid
390 * @param width The width of the frames in pixels, with range [1, infinity)
391 * @param height The height of the frames in pixels, with range [1, infinity)
392 * @param conversionFlag The conversion to be applied
393 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
394 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
395 * @param worker Optional worker object to distribute the computational load
396 * @tparam T Data type of each channel pixel value
397 * @tparam tSourceChannels Number of channels of the source frame (including the channel that will be removed), with range [2, infinity)
398 * @see FrameChannels::shuffleChannels<T, tSourceChannels, tTargetChannels, tShufflePattern>(), removeFirstChannel().
399 */
400 template <typename T, unsigned int tSourceChannels>
401 static inline void removeLastChannel(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
402
403 /**
404 * Copies one channel from a given frame with zipped pixel format to another frame with zipped pixel format.
405 * @param source The source frame from which the channel will be copied, must be valid
406 * @param target The target frame to which the channel will be copied, must be valid
407 * @param width The width of both frames in pixels, with range [1, infinity)
408 * @param height The height of both frames in pixels, with range [1, infinity)
409 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
410 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
411 * @param worker Optional worker object to distribute the computational load
412 * @tparam T Data type of each channel pixel value
413 * @tparam tSourceChannels Number of channels in the source frame, with range [1, infinity)
414 * @tparam tTargetChannels Number of channels in the target frame, with range [1, infinity)
415 * @tparam tSourceChannelIndex The index of the source channel that will be copied, with range [0, tSourceChannels - 1]
416 * @tparam tTargetChannelIndex The index of the target channel that will receive the copied channel, with range [0, tTargetChannels - 1]
417 */
418 template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tSourceChannelIndex, unsigned int tTargetChannelIndex>
419 static inline void copyChannel(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
420
421 /**
422 * Sets one channel of a frame to a specific value, the same value is used for every pixel.
423 * @param frame The frame in which one channel of each pixel will be set
424 * @param width The width of the frame in pixels, with range [1, infinity)
425 * @param height The height of the frame in pixels, with range [1, infinity)
426 * @param value The value to be set
427 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
428 * @param worker Optional worker object to distribute the computation
429 * @tparam T Data type of each channel pixel value
430 * @tparam tChannel Index of the channel that will be set, with range [0, tChannels)
431 * @tparam tChannels Number of data channels of the frame, with range [1, infinity)
432 */
433 template <typename T, unsigned int tChannel, unsigned int tChannels>
434 static inline void setChannel(T* frame, const unsigned int width, const unsigned int height, const T value, const unsigned int framePaddingElements, Worker* worker = nullptr);
435
436 /**
437 * Reverses the order of the channels of a frame with zipped pixel format.
438 * The first channel will be exchanged with the last channel, the second channel will be exchanged with the second last channel and so on.
439 * @param source The source frame for which the channel order will be reversed, must be valid
440 * @param target The target frame that receives the channels in reversed order, must be valid
441 * @param width The width of the source frame in pixels, with range [1, infinity)
442 * @param height The height of the source frame in pixels, with range [1, infinity)
443 * @param conversionFlag The conversion to be applied
444 * @param sourcePaddingElements Optional padding at the end of each source row in elements, with range [0, infinity)
445 * @param targetPaddingElements Optional padding at the end of each target row in elements, with range [0, infinity)
446 * @param worker Optional worker object to distribute the computation
447 * @tparam T Data type of each channel pixel value
448 * @tparam tChannels Number of data channels, with range [1, infinity)
449 */
450 template <typename T, unsigned int tChannels>
451 static inline void reverseChannelOrder(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
452
453 /**
454 * Shuffles the channels of a frame by an arbitrary pattern.
455 * The shuffle pattern is defined in groups of four bits, each group holds the index of a source channel; the lowest four bits belong to the first target channel.<br>
456 * For the shuffling from e.g., an RGBA32 row to a BGRA32 row the pattern 0x3012u must be defined:
457 * <pre>
458 * source pixel R G B A
459 * 0 1 2 3
460 * target pixel B G R A
461 * 2 1 0 3
462 * pattern (with reversed order): 0x3012
463 * </pre>
464 * @param source The source frame for which the channels will be shuffled, must be valid
465 * @param target The target frame that receives the shuffled channels, must be valid
466 * @param width The width of the source frame in pixels, with range [1, infinity)
467 * @param height The height of the source frame in pixels, with range [1, infinity)
468 * @param conversionFlag The conversion to be applied
469 * @param sourcePaddingElements Optional padding at the end of each source row in elements, with range [0, infinity)
470 * @param targetPaddingElements Optional padding at the end of each target row in elements, with range [0, infinity)
471 * @param worker Optional worker object to distribute the computation
472 * @tparam T Data type of each channel pixel value
473 * @tparam tSourceChannels Number of source data channels, with range [1, 8u]
474 * @tparam tTargetChannels Number of target data channels, with range [1, 8u]
475 * @tparam tShufflePattern Groups of four bits define the source channel, e.g., 0x76543210 defines the identity transformation, 0x01234567 defines the reverse transformation
476 */
477 template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
478 static inline void shuffleChannels(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
479
480 /**
481 * Shuffles the channels of a source frame and sets the last channel of the target frame to a constant value.
482 * The shuffle pattern is defined in groups of four bits, each group defining the source channel of one target channel (the lowest four bits belong to the first target channel).<br>
483 * For the shuffling from e.g., an RGB24 frame to a BGRA32 frame the pattern 0x012u must be defined:
484 * <pre>
485 * source pixel     R G B
486 *                  0 1 2
487 * target pixel     B G R A
488 *                  2 1 0
489 * pattern (with reversed order): 0x012
490 * </pre>
491 * @param source The source frame for which the channels will be shuffled, must be valid
492 * @param newChannelValue The constant channel value which will be added as last channel to the target frame, with range [0, infinity)
493 * @param target The target frame that receives the shuffled channels, must be valid
494 * @param width The width of the source frame in pixel, with range [1, infinity)
495 * @param height The height of the source frame in pixel, with range [1, infinity)
496 * @param conversionFlag The conversion to be applied
497 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
498 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
499 * @param worker Optional worker object to distribute the computation
500 * @tparam T Data type of each channel pixel value
501 * @tparam tSourceChannels Number of source data channels, with range [1, 8u]
502 * @tparam tTargetChannels Number of target data channels, including the additional extra target channel, with range [2, 8u]
503 * @tparam tShufflePattern Groups of four bits define the source channel of each shuffled target channel, e.g., 0x76543210 defines the identity transformation, 0x01234567 defines the reverse transformation
504 */
505 template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
506 static inline void shuffleChannelsAndSetLastChannelValue(const T* source, const T newChannelValue, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
507
508 /**
509 * Narrows the 16 bit channels of a frame (uint16_t elements) to 8 bit channels (uint8_t elements).
510 * @param source The source frame for which the channels will be narrowed, must be valid
511 * @param target The target frame that receives the narrowed channels, must be valid
512 * @param width The width of the source frame in pixel, with range [1, infinity)
513 * @param height The height of the source frame in pixel, with range [1, infinity)
514 * @param conversionFlag The conversion to be applied
515 * @param sourcePaddingElements Optional padding at the end of each source row in elements, with range [0, infinity)
516 * @param targetPaddingElements Optional padding at the end of each target row in elements, with range [0, infinity)
517 * @param worker Optional worker object to distribute the computation
518 * @tparam tChannels Number of source data channels, with range [1, infinity)
519 */
520 template <unsigned int tChannels>
521 static inline void narrow16BitPerChannelTo8BitPerChannel(const uint16_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
522
523 /**
524 * Applies a specific modifier function on each pixel; in contrast to applyAdvancedPixelModifier(), source and target share one data type and one channel number, and no padding parameters are taken.
525 * @param source The source frame providing the pixel information, must be valid
526 * @param target The target frame receiving the pixel information, must be valid
527 * @param width The width of the source frame in pixel, with range [1, infinity)
528 * @param height The height of the source frame in pixel, with range [1, infinity)
529 * @param conversionFlag The conversion to be applied
530 * @param worker Optional worker object to distribute the computation
531 * @tparam T Data type of each channel pixel value
532 * @tparam tChannels Number of data channels, with range [1, infinity)
533 * @tparam tPixelFunction Pixel modification function, taking a constant pointer and a mutable pointer to elements of type T (presumably the source pixel and the target pixel)
534 */
535 template <typename T, unsigned int tChannels, void (*tPixelFunction)(const T*, T*)>
536 static void applyPixelModifier(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, Worker* worker = nullptr);
537
538 /**
539 * Applies a specific modifier function on each pixel, with individual data types and channel numbers for the source frame and the target frame.
540 * @param source The source frame providing the pixel information, must be valid
541 * @param target The target frame receiving the pixel information, must be valid
542 * @param width The width of the source frame in pixel, with range [1, infinity)
543 * @param height The height of the source frame in pixel, with range [1, infinity)
544 * @param sourcePaddingElements The number of padding elements at the end of each row of the source frame, in elements, with range [0, infinity)
545 * @param targetPaddingElements The number of padding elements at the end of each row of the target frame, in elements, with range [0, infinity)
546 * @param conversionFlag The conversion to be applied
547 * @param worker Optional worker object to distribute the computation
548 * @tparam TSource Data type of each source channel pixel value
549 * @tparam TTarget Data type of each target channel pixel value
550 * @tparam tSourceChannels Number of source data channels, with range [1, infinity)
551 * @tparam tTargetChannels Number of target data channels, with range [1, infinity)
552 * @tparam tPixelFunction Pixel modification function, taking a constant pointer to TSource elements and a mutable pointer to TTarget elements
553 */
554 template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tPixelFunction)(const TSource*, TTarget*)>
555 static void applyAdvancedPixelModifier(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, Worker* worker = nullptr);
556
557 /**
558 * Generic bivariate pixel operations.<br>
559 * Applies bivariate per-pixel operators: `C(y, x) = op(A(y, x), B(y, x))`. Input and output must have the same frame type and have a single plane.
560 * @param source0 First source frame
561 * @param source1 Second source frame
562 * @param target The target frame
563 * @param width The width of the source frame in pixel, with range [1, infinity)
564 * @param height The height of the source frame in pixel, with range [1, infinity)
565 * @param source0PaddingElements The number of padding elements at the end of each row of the first source, in elements, with range [0, infinity)
566 * @param source1PaddingElements The number of padding elements at the end of each row of the second source, in elements, with range [0, infinity)
567 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
568 * @param conversionFlag The conversion to be applied
569 * @param worker Optional worker object to distribute the computation
570 * @tparam TSource0 Type of the first data source
571 * @tparam TSource1 Type of the second data source
572 * @tparam TTarget Type of the target
573 * @tparam TIntermediate Data type that is used for the computation of intermediate results, e.g., if TSource0 and TSource1 are different
574 * @tparam tSourceChannels Number of channels of the two sources, with range [1, infinity)
575 * @tparam tTargetChannels Number of channels of the target, with range [1, infinity)
576 * @tparam tOperator The operation (function) that is applied on both sources to yield the value for the target (called per pixel)
577 */
578 template <typename TSource0, typename TSource1, typename TTarget, typename TIntermediate, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tOperator)(const TSource0*, const TSource1*, TTarget*)>
579 static void applyBivariateOperator(const TSource0* source0, const TSource1* source1, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int source0PaddingElements, const unsigned int source1PaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, Worker* worker = nullptr);
580
581 /**
582 * Applies a row operator to all rows of a source image.
583 * The row operator is given as function pointer and is intended to transform a source row to a target row.<br>
584 * The function allows implementing e.g., frame filters with few lines of code; source and target frame must have the same size.
585 * @param source The source frame to which the row operator is applied, must be valid
586 * @param target The target frame receiving the result of the row operator, must be valid
587 * @param width The width of the source frame and target frame in pixel, with range [1, infinity)
588 * @param height The height of the source frame and target frame in pixel, with range [1, infinity)
589 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
590 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
591 * @param rowOperatorFunction The pointer to the row operator function, must be valid, see RowOperatorFunction for the expected signature
592 * @param worker Optional worker object to distribute the computation
593 * @tparam TSource The data type of the source elements
594 * @tparam TTarget The data type of the target elements
595 * @tparam tSourceChannels The number of channels the source frame has, with range [1, infinity)
596 * @tparam tTargetChannels The number of channels the target frame has, with range [1, infinity)
597 */
598 template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels>
599 static void applyRowOperator(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const RowOperatorFunction<TSource, TTarget, tSourceChannels, tTargetChannels>& rowOperatorFunction, Worker* worker = nullptr);
600
601 /**
602 * Transforms a frame with generic pixel format (with zipped pixel information) like RGB24 or YUV24, to a frame with same pixel format and channel number.
603 * This function mainly mirrors or flips an image.
604 * @param source The source frame buffer, must be valid
605 * @param target The target frame buffer, must be valid
606 * @param width The width of the frame in pixel, with range [1, infinity)
607 * @param height The height of the frame in pixel, with range [1, infinity)
608 * @param conversionFlag The conversion to be applied
609 * @param sourcePaddingElements Optional padding at the end of each source row in elements, with range [0, infinity)
610 * @param targetPaddingElements Optional padding at the end of each target row in elements, with range [0, infinity)
611 * @param worker Optional worker object to distribute the computation; this declaration defines no default value, so the argument must always be passed explicitly
612 * @tparam T Data type of each channel pixel value, e.g., 'uint8_t', 'float', ...
613 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
614 */
615 template <typename T, unsigned int tChannels>
616 static inline void transformGeneric(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker);
617
618 /**
619 * Converts an image with premultiplied alpha to a straight image (without premultiplied alpha); the result is written back to the given frame (no separate target buffer is taken).
620 * @param frame The image to convert, must be valid
621 * @param width The width of the image in pixel, with range [1, infinity)
622 * @param height The height of the image in pixel, with range [1, infinity)
623 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
624 * @param worker Optional worker object to distribute the computation
625 * @tparam tChannels The number of frame channels, with range [2, infinity)
626 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
627 * @see straightAlphaToPremultipliedAlpha8BitPerChannel().
628 */
629 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
630 static inline void premultipliedAlphaToStraightAlpha8BitPerChannel(uint8_t* const frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, Worker* worker = nullptr);
631
632 /**
633 * Converts an image with premultiplied alpha to a straight image (without premultiplied alpha); the result is written to a separate target image.
634 * @param source The source image to convert, must be valid
635 * @param target The resulting converted target image, must be valid
636 * @param width The width of the image in pixel, with range [1, infinity)
637 * @param height The height of the image in pixel, with range [1, infinity)
638 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
639 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
640 * @param worker Optional worker object to distribute the computation
641 * @tparam tChannels The number of frame channels, with range [2, infinity)
642 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
643 * @see straightAlphaToPremultipliedAlpha8BitPerChannel().
644 */
645 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
646 static inline void premultipliedAlphaToStraightAlpha8BitPerChannel(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
647
648 /**
649 * Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied alpha; the result is written back to the given frame (no separate target buffer is taken).
650 * @param frame The image to convert, must be valid
651 * @param width The width of the image in pixel, with range [1, infinity)
652 * @param height The height of the image in pixel, with range [1, infinity)
653 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
654 * @param worker Optional worker object to distribute the computation
655 * @tparam tChannels The number of frame channels, with range [2, infinity)
656 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
657 * @see premultipliedAlphaToStraightAlpha8BitPerChannel().
658 */
659 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
660 static inline void straightAlphaToPremultipliedAlpha8BitPerChannel(uint8_t* const frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, Worker* worker = nullptr);
661
662 /**
663 * Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied alpha; the result is written to a separate target image.
664 * @param source The source image to convert, must be valid
665 * @param target The resulting converted target image, must be valid
666 * @param width The width of the image in pixel, with range [1, infinity)
667 * @param height The height of the image in pixel, with range [1, infinity)
668 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
669 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
670 * @param worker Optional worker object to distribute the computation
671 * @tparam tChannels The number of frame channels, with range [2, infinity)
672 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
673 * @see premultipliedAlphaToStraightAlpha8BitPerChannel().
674 */
675 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
676 static inline void straightAlphaToPremultipliedAlpha8BitPerChannel(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
677
678 /**
679 * Reverses/mirrors the order of pixels in a given row (or a memory block in general).
680 * @param source The pointer to the source pixels, must be valid
681 * @param target The pointer to the target pixels receiving the reversed/mirrored pixel data, must be valid
682 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
683 * @tparam T The data type of the pixel elements, e.g., 'uint8_t', 'int'
684 * @tparam tChannels The number of channels (the number of elements) each pixel has, with range [1, infinity)
685 */
686 template <typename T, unsigned int tChannels>
687 static void reverseRowPixelOrder(const T* source, T* target, const size_t size);
688
689 /**
690 * Reverses/mirrors the order of pixels in a given row (or a memory block in general) in place.
691 * @param data The pointer to the pixels which will be reversed/mirrored, must be valid
692 * @param size The number of pixels to reverse, with range [1, infinity)
693 * @tparam T The data type of the pixel elements, e.g., 'uint8_t', 'int'
694 * @tparam tChannels The number of channels (the number of elements) each pixel has, with range [1, infinity)
695 */
696 template <typename T, unsigned int tChannels>
697 static void reverseRowPixelOrderInPlace(T* data, const size_t size);
698
699 /**
700 * Reverses/mirrors the order of channels in a given row (or a memory block in general).
701 * @param source The pointer to the source pixels, must be valid
702 * @param target The pointer to the target pixels receiving the reversed/mirrored channels, must be valid
703 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
704 * @param unusedOptions An unused options parameter, must be nullptr
705 * @tparam T The data type of the pixel elements, e.g., 'uint8_t', 'int'
706 * @tparam tChannels The number of channels (the number of elements) each pixel has, with range [1, infinity)
707 */
708 template <typename T, unsigned int tChannels>
709 static void reverseRowChannelOrder(const T* source, T* target, const size_t size, const void* unusedOptions = nullptr);
710
711 /**
712 * Shuffles the channels of row pixels by application of a specified shuffle pattern.
713 * The shuffle pattern is defined in groups of four bits, each group defining the source channel of one target channel (the lowest four bits belong to the first target channel).<br>
714 * For the shuffling from e.g., an RGBA32 row to a BGRA32 row the pattern 0x3012u must be defined:
715 * <pre>
716 * source pixel     R G B A
717 *                  0 1 2 3
718 * target pixel     B G R A
719 *                  2 1 0 3
720 * pattern (with reversed order): 0x3012
721 * </pre>
722 * @param source The pointer to the source pixels, must be valid
723 * @param target The pointer to the target pixels, receiving the shuffled channels, must be valid
724 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
725 * @param unusedOptions An unused options parameter, must be nullptr
726 * @tparam T Data type of each channel pixel value, e.g., 'uint8_t' or 'float'
727 * @tparam tSourceChannels Number of source data channels, with range [1, 8u]
728 * @tparam tTargetChannels Number of target data channels, with range [1, 8u]
729 * @tparam tShufflePattern Groups of four bits define the source channel of each target channel, e.g., 0x76543210 defines the identity transformation, 0x01234567 defines the reverse transformation
730 */
731 template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
732 static inline void shuffleRowChannels(const T* source, T* target, const size_t size, const void* unusedOptions = nullptr);
733
734 /**
735 * Shuffles the channels of row pixels by application of a specified shuffle pattern and sets the last channel of the target row to a constant value.
736 * The shuffle pattern is defined in groups of four bits, each group defining the source channel of one target channel (the lowest four bits belong to the first target channel).<br>
737 * For the shuffling from e.g., an RGB24 row to a BGRA32 row the pattern 0x012u must be defined:
738 * <pre>
739 * source pixel     R G B
740 *                  0 1 2
741 * target pixel     B G R A
742 *                  2 1 0
743 * pattern (with reversed order): 0x012
744 * </pre>
745 * @param source The pointer to the source pixels, must be valid
746 * @param target The pointer to the target pixels, receiving the shuffled channels, must be valid
747 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
748 * @param options Pointer to the constant channel value which will be added to the end of the target channels, must be valid (the declaration's default value nullptr is therefore not a valid argument)
749 * @tparam T Data type of each channel pixel value, e.g., 'uint8_t' or 'float'
750 * @tparam tSourceChannels Number of source data channels, with range [1, 8u]
751 * @tparam tTargetChannels Number of target data channels, including the additional extra target channel, with range [2, 8u]
752 * @tparam tShufflePattern Groups of four bits define the source channel of each shuffled target channel, e.g., 0x76543210 defines the identity transformation, 0x01234567 defines the reverse transformation
753 */
754 template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
755 static inline void shuffleRowChannelsAndSetLastChannelValue(const T* source, T* target, const size_t size, const void* options = nullptr);
756
757 /**
758 * Converts a row of pixels with 3 channels to pixels with one channel by a linear combination of the three channels.
759 * This function can be used to e.g., convert RGB24 to Y8, or BGR24 to Y8.
760 * The linear combination is defined by one integer multiplication factor for each channel with 128 as denominator.<br>
761 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
762 * @param source The pointer to the source pixels, must be valid
763 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
764 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
765 * @param channelMultiplicationFactors_128 The three uint32_t multiplication factors, one for each channel, with range [0, 128], while the sum of all three factors must be 128, must be valid
766 * @tparam tUseFactorChannel0 True, if the multiplication factor of channel 0 is not zero; False, if the multiplication factor of channel 0 is zero
767 * @tparam tUseFactorChannel1 True, if the multiplication factor of channel 1 is not zero; False, if the multiplication factor of channel 1 is zero
768 * @tparam tUseFactorChannel2 True, if the multiplication factor of channel 2 is not zero; False, if the multiplication factor of channel 2 is zero
769 */
770 template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2>
771 static void convertRow3ChannelsTo1Channel8BitPerChannel7BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* channelMultiplicationFactors_128);
772
773 /**
774 * Converts a row of pixels with 3 channels to pixels with 3 channels by a linear combination of the three channels plus a translational part applied to the source data before applying the linear transformation.
775 * This function can be used to e.g., convert RGB24 to YUV24, or YUV24 to RGB24.
776 * The linear combination is defined by three integer multiplication factors for each source channel with 64 as denominator, plus one translation parameter for each source channel (with 1 as denominator).<br>
777 * Beware: As this function applies integer multiplication factors (with 6 bits precision) the conversion result has an accuracy of +/- 4 color intensities.<br>
778 * The transformation is based on the following pattern:
779 * <pre>
780 * t0 = clamp(0, f00 * (s0 - b0) + f01 * (s1 - b1) + f02 * (s2 - b2), 255)
781 * t1 = clamp(0, f10 * (s0 - b0) + f11 * (s1 - b1) + f12 * (s2 - b2), 255)
782 * t2 = clamp(0, f20 * (s0 - b0) + f21 * (s1 - b1) + f22 * (s2 - b2), 255)
783 * </pre>
784 * With t target, s source, f factor, and b bias/translation.<br>
785 * Factors must be specified in relation to a denominator of 64, bias values must be specified with a denominator of 1.
786 * @param source The pointer to the source pixels, must be valid
787 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
788 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
789 * @param parameters The 12 int32_t parameters of the column-aligned 3x3 transformation matrix, plus 3 translation parameters: f00_64, f10_64, f20_64, f01_64, f02_64, ..., f22_64, with ranges [-128, 128], b0, b1, b2, with ranges [0, 128]
790 */
791 static void convertRow3ChannelsTo3Channels8BitPerChannel6BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
792
793 /**
794 * Converts a row of pixels with 3 channels to pixels with 3 channels by a linear combination of the three channels plus a bias (translation) part.
795 * This function can be used to e.g., convert RGB24 to YUV24, or BGR24 to YVU24.
796 * The linear combination is defined by three integer multiplication factors for each source channel with 128 as denominator, plus one bias (translation) parameter for each target channel (with 1 as denominator).<br>
797 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
798 * The transformation is based on the following pattern:
799 * <pre>
800 * t0 = clamp(0, f00 * s0 + f01 * s1 + f02 * s2 + b0, 255)
801 * t1 = clamp(0, f10 * s0 + f11 * s1 + f12 * s2 + b1, 255)
802 * t2 = clamp(0, f20 * s0 + f21 * s1 + f22 * s2 + b2, 255)
803 * </pre>
804 * With t target, s source, f factor, and b bias.<br>
805 * Factors must be specified in relation to a denominator of 128, bias values must be specified with a denominator of 1.
806 * @param source The pointer to the source pixels, must be valid
807 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
808 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
809 * @param parameters The 12 int32_t parameters of the column-aligned 3x4 transformation matrix: f00_128, f10_128, f20_128, f01_128, f02_128, ..., f22_128, b0, b1, b2, with ranges [-127, 127]
810 */
811 static void convertRow3ChannelsTo3Channels8BitPerChannel7BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
812
813 /**
814 * Converts a row of pixels with 3 channels to pixels with 3 channels by a linear combination of the three channels plus a bias (translation) part.
815 * This function can be used to e.g., convert YUV24 to RGB24, or YVU24 to BGR24.
816 * The linear combination is defined by three integer multiplication factors for each source channel with 1024 as denominator, plus one bias (translation) parameter for each target channel (with 1 as denominator).<br>
817 * Beware: As this function applies integer multiplication factors (with 10 bits precision) the conversion result has an accuracy of +/- 1 color intensity.<br>
818 * The transformation is based on the following pattern:
819 * <pre>
820 * t0 = clamp(0, f00 * s0 + f01 * s1 + f02 * s2 + b0, 255)
821 * t1 = clamp(0, f10 * s0 + f11 * s1 + f12 * s2 + b1, 255)
822 * t2 = clamp(0, f20 * s0 + f21 * s1 + f22 * s2 + b2, 255)
823 * </pre>
824 * With t target, s source, f factor, and b bias.<br>
825 * Factors must be specified in relation to a denominator of 1024, bias values must be specified with a denominator of 1.
826 * @param source The pointer to the source pixels, must be valid
827 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
828 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
829 * @param parameters The 12 int32_t parameters of the column-aligned 3x4 transformation matrix: f00_1024, f10_1024, f20_1024, f01_1024, f02_1024, ..., f22_1024, b0, b1, b2, with ranges [-1024 * 16, 1024 * 16]
830 */
831 static void convertRow3ChannelsTo3Channels8BitPerChannel10BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
832
833 /**
834 * Converts a row of pixels with 3 channels to pixels with 4 channels by a linear combination of the three channels plus a translational part applied to the source data before applying the linear transformation (for the first three channels).
835 * The fourth channel is set to a constant value, e.g., for an alpha channel.<br>
836 * This function can be used to e.g., convert YUV24 to RGBA32, or YVU24 to BGRA32.<br>
837 * The linear combination is defined by three integer multiplication factors for each source channel with 64 as denominator, plus one translation parameter for each source channel (with 1 as denominator).<br>
838 * Beware: As this function applies integer multiplication factors (with 6 bits precision) the conversion result has an accuracy of +/- 4 color intensities.<br>
839 * The transformation is based on the following pattern:
840 * <pre>
841 * t0 = clamp(0, f00 * (s0 - b0) + f01 * (s1 - b1) + f02 * (s2 - b2), 255)
842 * t1 = clamp(0, f10 * (s0 - b0) + f11 * (s1 - b1) + f12 * (s2 - b2), 255)
843 * t2 = clamp(0, f20 * (s0 - b0) + f21 * (s1 - b1) + f22 * (s2 - b2), 255)
844 * t3 = valueChannel3
845 * </pre>
846 * With t target, s source, f factor, and b bias/translation.<br>
847 * Factors must be specified in relation to a denominator of 64, bias values must be specified with a denominator of 1.
848 * @param source The pointer to the source pixels, must be valid
849 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
850 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
851 * @param parameters The 13 int32_t parameters of the column-aligned 3x3 transformation matrix, plus 3 translation parameters, plus the constant value of the fourth target channel: f00_64, f10_64, f20_64, f01_64, f02_64, ..., f22_64, with ranges [-128, 128], b0, b1, b2, with ranges [0, 128], valueChannel3, with range [0, 255]
852 */
853 static void convertRow3ChannelsTo4Channels8BitPerChannel6BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
854
855 /**
856 * Converts a row of pixels with 4 channels to pixels with one channel by a linear combination of the four channels.
857 * This function can be used to e.g., convert RGBA32 to Y8, or ARGB32 to Y8, or RGB32 to Y8.
858 * The linear combination is defined by one integer multiplication factor for each channel with 128 as denominator.<br>
859 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
860 * <pre>
861 * t0 = f0 * s0 + f1 * s1 + f2 * s2 + f3 * s3
862 * </pre>
863 * @param source The pointer to the source pixels, must be valid
864 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
865 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
866 * @param channelMultiplicationFactors_128 The four uint32_t multiplication factors, one for each channel, with range [0, 127], while the sum of all four factors must be 128, must be valid
867 * @tparam tUseFactorChannel0 True, if the multiplication factor of channel 0 is not zero; False, if the multiplication factor of channel 0 is zero
868 * @tparam tUseFactorChannel1 True, if the multiplication factor of channel 1 is not zero; False, if the multiplication factor of channel 1 is zero
869 * @tparam tUseFactorChannel2 True, if the multiplication factor of channel 2 is not zero; False, if the multiplication factor of channel 2 is zero
870 * @tparam tUseFactorChannel3 True, if the multiplication factor of channel 3 is not zero; False, if the multiplication factor of channel 3 is zero
871 */
872 template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2, bool tUseFactorChannel3>
873 static void convertRow4ChannelsTo1Channel8BitPerChannel7BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* channelMultiplicationFactors_128);
874
875 /**
876 * Converts a row of pixels with 4 channels to pixels with two channels by a linear combination of the four channels.
877 * This function can be used to e.g., convert RGBA32 to YA16, or ARGB32 to AY16.
878 * The linear combination is defined by one integer multiplication factor for each channel with 128 as denominator.<br>
879 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
880 * The transformation is based on the following pattern:
881 * <pre>
882 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + f03 * s3
883 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + f13 * s3
884 * </pre>
885 * @param source The pointer to the source pixels, must be valid
886 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
887 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
888 * @param multiplicationFactors_128 The 8 int32_t parameters of the column-aligned 2x4 transformation matrix: f00_128, f10_128, f01_128, ..., f13_128, with range [0, 127], while the sum of the four factors of each row must be 128, must be valid
889 */
890 static void convertRow4ChannelsTo2Channels8BitPerChannel7BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* multiplicationFactors_128);
891
892 /**
893 * Converts a row of pixels with 4 channels to pixels with 3 channels by a linear combination of the four channels plus a bias (translation) part.
894 * This function can be used to e.g., convert RGBA32 to YUV24, or BGRA32 to YVU24.
895 * The linear combination is defined by three integer multiplication factors for each source channel with 128 as denominator, plus one bias (translation) parameter for each target channel (with 1 as denominator).<br>
896 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
897 * The transformation is based on the following pattern:
898 * <pre>
899 * t0 = clamp(0, f00 * s0 + f01 * s1 + f02 * s2 + f03 * s3 + b0, 255)
900 * t1 = clamp(0, f10 * s0 + f11 * s1 + f12 * s2 + f13 * s3 + b1, 255)
901 * t2 = clamp(0, f20 * s0 + f21 * s1 + f22 * s2 + f23 * s3 + b2, 255)
902 * </pre>
903 * With t target, s source, f factor, and b bias.<br>
904 * Factors must be specified in relation to a denominator of 128, bias values must be specified with a denominator of 1.
905 * @param source The pointer to the source pixels, must be valid
906 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
907 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
908 * @param parameters The int32_t parameters: the 12 factors of the column-aligned 3x4 transformation matrix f00_128, f10_128, f20_128, f01_128, f02_128, ..., f23_128, followed by the bias values b0, b1, b2 (presumably 15 values in total - TODO confirm against the implementation), with ranges [-127, 127]
909 */
910 static void convertRow4ChannelsTo3Channels8BitPerChannel7BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* parameters);
911
912 /**
913 * Narrows a row of pixels with 16 bit channels to pixels with 8 bit channels.
914 * @param source The pointer to the source pixels, must be valid
915 * @param target The pointer to the target pixels receiving the converted pixel data, must be valid
916 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
917 * @param unusedParameters Unused parameter, must be nullptr
918 * @tparam tChannels The number of channels the source (and target) frame has, with range [1, infinity)
919 */
920 template <unsigned int tChannels>
921 static void narrowRow16BitPerChannelTo8BitPerChannel(const uint16_t* source, uint8_t* target, const size_t size, const void* unusedParameters = nullptr);
922
923 /**
924 * Adds a channel to a given row with generic (zipped) pixel format and copies the information of the new channel from a one-channel image.
925 * The channel can be added as new first channel or as new last channel.
926 * @param sources The pointer to the multi-channel source frame and to the single-channel source frame, must be valid
927 * @param targets The one pointer to the target image, must be valid
928 * @param multipleRowIndex The index of the multiple-row to be handled, with range [0, height - 1]
929 * @param width The width of the frame in pixel, with range [1, infinity), must be even
930 * @param height The height of the frame in pixel, with range [1, infinity), must be even
931 * @param conversionFlag The conversion to be applied
932 * @param options The 1 options parameter: the padding parameter of the 1-channel source frame, must be valid
933 * @tparam T Data type of each channel pixel value, e.g., 'uint8_t' or 'float'
934 * @tparam tSourceChannels Number of channels of the source frame (without the new channel), with range [1, infinity)
935 * @tparam tAddToFront True, to add the channel to the front (as new first channel); False, to add the channel to the back (as new last channel).
936 */
937 template <typename T, unsigned int tSourceChannels, bool tAddToFront>
938 static void addChannelRow(const void** sources, void** targets, const unsigned int multipleRowIndex, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const void* options);
939
940 /**
941 * Adds a channel to a given row with generic (zipped) pixel format and sets all values of the new channel to a specified value.
942 * The channel can be added as new first channel or as new last channel.
943 * @param source The pointer to the source pixels, must be valid
944 * @param target The pointer to the target pixels, receiving the additional channel, must be valid
945 * @param size The number of source (and target) pixels to convert, with range [1, infinity)
946 * @param channelValueParameter The pointer to the value of the channel to be set (with data type 'T'), must be valid
947 * @tparam T Data type of each channel pixel value, e.g., 'uint8_t' or 'float'
948 * @tparam tSourceChannels Number of channels of the source frame (without the new channel), with range [1, infinity)
949 * @tparam tAddToFront True, to add the channel to the front (as new first channel); False, to add the channel to the back (as new last channel).
950 */
951 template <typename T, unsigned int tSourceChannels, bool tAddToFront>
952 static void addChannelValueRow(const T* source, T* target, const size_t size, const void* channelValueParameter);
953
954 /**
955 * Copies one channel from a source row to a target row with generic (zipped) pixel format.
956 * @param source The pointer to the source pixels, must be valid
957 * @param target The pointer to the target pixels, receiving the copied channel, must be valid
958 * @param size The number of source (and target) pixels to handle, with range [1, infinity)
959 * @param unusedParameters Unused parameters, must be nullptr
960 * @tparam T Data type of each channel pixel value, e.g., 'uint8_t' or 'float'
961 * @tparam tSourceChannels Number of channels of the source frame, with range [1, infinity)
962 * @tparam tTargetChannels Number of channels of the target frame, with range [1, infinity)
963 * @tparam tSourceChannelIndex The index of the source channel to be copied, with range [0, tSourceChannels - 1]
964 * @tparam tTargetChannelIndex The index of the target channel receiving the copy, with range [0, tTargetChannels - 1]
965 */
966 template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tSourceChannelIndex, unsigned int tTargetChannelIndex>
967 static void copyChannelRow(const T* source, T* target, const size_t size, const void* unusedParameters = nullptr);
968
969 protected:
970
971 /**
972 * Separates a given frame with zipped pixel format (e.g., FORMAT_RGB24, FORMAT_YUV24, FORMAT_BGRA32) into individual frames with one channel only.
973 * @param sourceFrame The frame to be separated, must be valid
974 * @param targetFrames The pointers to the resulting separated frames, each holding one channel of the source frame, with already allocated memory
975 * @param width The width of the source frame in pixel, with range [1, infinity)
976 * @param height The height of the source frame in pixel, with range [1, infinity)
977 * @param channels The number of channels the source frame has, with range [1, infinity)
978 * @param sourceFramePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
979 * @param targetFramesPaddingElements The array of padding elements at the end of each target row, one for each target frame, in elements, with range [0, infinity)
980 * @tparam TSource The data type of each source pixel channel, e.g., 'uint8_t', 'float', ...
981 * @tparam TTarget The data type of each target pixel channel, e.g., 'uint8_t', 'float', ...
982 */
983 template <typename TSource, typename TTarget>
984 static void separateTo1ChannelRuntime(const TSource* const sourceFrame, TTarget* const* const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int* targetFramesPaddingElements);
985
986 /**
987 * Zips/interleaves 1-channel images into one image with n channels.
988 * @param sourceFrames The pointers to the individual 1-channel frames, one for each channel of the target frame, must be valid
989 * @param targetFrame The pointer to the resulting zipped frame holding n channels, must be valid
990 * @param width The width of the source frames in pixel, with range [1, infinity)
991 * @param height The height of the source frames in pixel, with range [1, infinity)
992 * @param channels The number of provided source frames (and the number of channels the target frame will have), with range [1, infinity)
993 * @param sourceFramesPaddingElements The array of padding elements at the end of each source row, one for each source frame, in elements, with range [0, infinity)
994 * @param targetFramePaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
995 * @tparam TSource The data type of each source pixel channel, e.g., 'uint8_t', 'float', ...
996 * @tparam TTarget The data type of each target pixel channel, e.g., 'uint8_t', 'float', ...
997 */
998 template <typename TSource, typename TTarget>
999 static void zipChannelsRuntime(const TSource* const* const sourceFrames, TTarget* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements);
1000
1001 /**
1002 * Sets one channel of a subset of rows of a frame to one unique value.
1003 * @param frame The frame in which one channel of each pixel will be set, must be valid
1004 * @param width The width of the frame in pixel, with range [1, infinity)
1005 * @param value The value to be set
1006 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
1007 * @param firstRow First row to be handled
1008 * @param numberRows Number of rows to be handled
1009 * @tparam T Data type of each channel pixel value
1010 * @tparam tChannel Index of the channel that will be set, with range [0, tChannels)
1011 * @tparam tChannels Number of data channels of the frame, with range [1, infinity)
1012 */
1013 template <typename T, unsigned int tChannel, unsigned int tChannels>
1014 static void setChannelSubset(T* frame, const unsigned int width, const T value, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1015
1016 /**
1017 * Applies a specific modifier function on each pixel of a subset of rows, source and target share the same data type and channel number.
1018 * @param source The source frame providing the pixel information, must be valid
1019 * @param target The target frame receiving the pixel information, must be valid
1020 * @param width The width of the source frame in pixel
1021 * @param height The height of the source frame in pixel
1022 * @param conversionFlag The conversion to be applied
1023 * @param firstRow First row to be handled
1024 * @param numberRows Number of rows to be handled
1025 * @tparam T Data type of each channel pixel value
1026 * @tparam tChannels Number of data channels, with range [1, infinity)
1027 * @tparam tPixelFunction Pixel modification function, receiving the pointer to a source pixel and the pointer to a target pixel
1028 */
1029 template <typename T, unsigned int tChannels, void (*tPixelFunction)(const T*, T*)>
1030 static void applyPixelModifierSubset(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows);
1031
1032 /**
1033 * Applies a specific modifier function on each pixel of a subset of rows, source and target can have individual data types and channel numbers.
1034 * @param source The source frame providing the pixel information, must be valid
1035 * @param target The target frame receiving the pixel information, must be valid
1036 * @param width The width of the source frame in pixel, with range [1, infinity)
1037 * @param height The height of the source frame in pixel, with range [1, infinity)
1038 * @param sourcePaddingElements The number of padding elements at the end of each row of the source frame, in elements, with range [0, infinity)
1039 * @param targetPaddingElements The number of padding elements at the end of each row of the target frame, in elements, with range [0, infinity)
1040 * @param conversionFlag The conversion to be applied
1041 * @param firstRow First row to be handled
1042 * @param numberRows Number of rows to be handled
1043 * @tparam TSource Data type of each source channel pixel value
1044 * @tparam TTarget Data type of each target channel pixel value
1045 * @tparam tSourceChannels Number of source data channels, with range [1, infinity)
1046 * @tparam tTargetChannels Number of target data channels, with range [1, infinity)
1047 * @tparam tPixelFunction Pixel modification function, receiving the pointer to a source pixel and the pointer to a target pixel
1048 */
1049 template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tPixelFunction)(const TSource*, TTarget*)>
1050 static void applyAdvancedPixelModifierSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows);
1051
1052 /**
1053 * Applies a generic bivariate operator (an operator with two source frames and one target frame) to a subset of rows.
1054 * @param source0 The first source frame
1055 * @param source1 The second source frame
1056 * @param target The target frame
1057 * @param width The width of the source frame in pixel, with range [1, infinity)
1058 * @param height The height of the source frame in pixel, with range [1, infinity)
1059 * @param source0PaddingElements The number of padding elements at the end of each row of the first source, in elements, with range [0, infinity)
1060 * @param source1PaddingElements The number of padding elements at the end of each row of the second source, in elements, with range [0, infinity)
1061 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
1062 * @param conversionFlag The conversion to be applied
1063 * @param firstRow First row to be handled
1064 * @param numberRows Number of rows to be handled
1065 * @tparam TSource0 Type of the first data source
1066 * @tparam TSource1 Type of the second data source
1067 * @tparam TTarget Type of the target
1068 * @tparam TIntermediate Type for the computation of intermediate results, e.g., if TSource0 and TSource1 are different
1069 * @tparam tSourceChannels Number of channels of the two sources, with range [1, infinity)
1070 * @tparam tTargetChannels Number of channels of the target, with range [1, infinity)
1071 * @tparam tOperator The operation (function) that is applied on both sources to yield the value for the target (called per pixel)
1072 */
1073 template <typename TSource0, typename TSource1, typename TTarget, typename TIntermediate, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tOperator)(const TSource0*, const TSource1*, TTarget*)>
1074 static void applyBivariateOperatorSubset(const TSource0* source0, const TSource1* source1, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int source0PaddingElements, const unsigned int source1PaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows);
1075
1076 /**
1077 * Applies a row operator to a subset of all rows of a source image.
1078 * The row operator is given as function pointer and is intended to transform a source row to a target row.<br>
1079 * The function allows to implement e.g., frame filters with few lines of code, source and target frame must have the same size.
1080 * @param source The source frame to which the row operator is applied, must be valid
1081 * @param target The target frame receiving the result of the row operator, must be valid
1082 * @param width The width of the source frame and target frame in pixel, with range [1, infinity)
1083 * @param height The height of the source frame and target frame in pixel, with range [1, infinity)
1084 * @param sourceStrideElements The number of elements between the start points of two successive source rows, in elements, with range [width * tSourceChannels, infinity)
1085 * @param targetStrideElements The number of elements between the start points of two successive target rows, in elements, with range [width * tTargetChannels, infinity)
1086 * @param rowOperatorFunction The pointer to the row operator function, must be valid
1087 * @param firstRow The first row to be handled, with range [0, height - 1]
1088 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
1089 * @tparam TSource The data type of the source elements
1090 * @tparam TTarget The data type of the target elements
1091 * @tparam tSourceChannels The number of channels the source frame has, with range [1, infinity)
1092 * @tparam tTargetChannels The number of channels the target frame has, with range [1, infinity)
1093 */
1094 template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels>
1095 static void applyRowOperatorSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourceStrideElements, const unsigned int targetStrideElements, const RowOperatorFunction<TSource, TTarget, tSourceChannels, tTargetChannels> rowOperatorFunction, const unsigned int firstRow, const unsigned int numberRows);
1096
1097 /**
1098 * Transforms a subset of a frame with generic pixel format (with zipped pixel information) like RGB24 or YUV24, to a frame with the same pixel format and channel number.
1099 * @param source The source frame buffer, must be valid
1100 * @param target The target frame buffer, must be valid
1101 * @param width The width of the frame in pixel, with range [1, infinity)
1102 * @param height The height of the frame in pixel, with range [1, infinity)
1103 * @param conversionFlag The conversion to be applied
1104 * @param rowReversePixelOrderFunction The function able to reverse the pixel order, must be valid
1105 * @param bytesPerRow The actual number of bytes each row covers, not including optional padding bytes at the end of each row, with range [width, infinity)
1106 * @param sourceStrideBytes The number of bytes between the start points of two successive rows in the source frame, with range [0, infinity)
1107 * @param targetStrideBytes The number of bytes between the start points of two successive rows in the target frame, with range [0, infinity)
1108 * @param firstRow The first row to be handled, with range [0, height - 1]
1109 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
1110 */
1111 static void transformGenericSubset(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const RowReversePixelOrderFunction<void> rowReversePixelOrderFunction, const unsigned int bytesPerRow, const unsigned int sourceStrideBytes, const unsigned int targetStrideBytes, const unsigned int firstRow, const unsigned int numberRows);
1112
1113 /**
1114 * Converts a subset of rows of an image with premultiplied alpha to a straight image (without premultiplied alpha), the given image is modified directly.
1115 * @param frame The image to convert, must be valid
1116 * @param width The width of the image in pixel, with range [1, infinity)
1117 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
1118 * @param firstRow The first row to be handled, with range [0, height - 1]
1119 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
1120 * @tparam tChannels The number of frame channels, with range [2, infinity)
1121 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
1122 */
1123 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
1124 static void premultipliedAlphaToStraightAlpha8BitPerChannelSubset(uint8_t* const frame, const unsigned int width, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1125
1126 /**
1127 * Converts a subset of rows of a source image with premultiplied alpha to a straight target image (without premultiplied alpha).
1128 * @param source The source image to convert, must be valid
1129 * @param target The resulting converted target image, must be valid
1130 * @param width The width of the image in pixel, with range [1, infinity)
1131 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
1132 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
1133 * @param firstRow The first row to be handled, with range [0, height - 1]
1134 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
1135 * @tparam tChannels The number of frame channels, with range [2, infinity)
1136 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
1137 */
1138 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
1139 static void premultipliedAlphaToStraightAlpha8BitPerChannelSubset(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1140
1141 /**
1142 * Converts a subset of rows of an image with straight alpha (without premultiplied alpha) to an image with premultiplied alpha, the given image is modified directly.
1143 * @param frame The image to convert, must be valid
1144 * @param width The width of the image in pixel, with range [1, infinity)
1145 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
1146 * @param firstRow The first row to be handled, with range [0, height - 1]
1147 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
1148 * @tparam tChannels The number of frame channels, with range [2, infinity)
1149 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
1150 */
1151 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
1152 static void straightAlphaToPremultipliedAlpha8BitPerChannelSubset(uint8_t* const frame, const unsigned int width, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1153
1154 /**
1155 * Converts a subset of rows of a source image with straight alpha (without premultiplied alpha) to a target image with premultiplied alpha.
1156 * @param source The source image to convert, must be valid
1157 * @param target The resulting converted target image, must be valid
1158 * @param width The width of the image in pixel, with range [1, infinity)
1159 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
1160 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
1161 * @param firstRow The first row to be handled, with range [0, height - 1]
1162 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
1163 * @tparam tChannels The number of frame channels, with range [2, infinity)
1164 * @tparam tAlphaChannelIndex The index of the alpha channel, with range [0, tChannels - 1]
1165 */
1166 template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
1167 static void straightAlphaToPremultipliedAlpha8BitPerChannelSubset(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1168
1169#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
1170
1171 /**
1172 * Converts 16 pixels with 3 channels per pixel to 16 pixels with one channel per pixel by a linear combination of the three channels.
1173 * This function can be used to e.g., convert RGB24 to Y8, or BGR24 to Y8.
1174 * The linear combination is defined by one integer multiplication factor for each channel with 128 as denominator.<br>
1175 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
1176 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1177 * @param target The pointer to the 16 target pixels (with 1 channel = 16 bytes) receiving the converted pixel data, must be valid
1178 * @param multiplicationFactors0_128_u_16x8 The multiplication factor for the first channel (8 identical 16 bit values), with range [0, 128], while the sum of all three factors must be 128
1179 * @param multiplicationFactors1_128_u_16x8 The multiplication factor for the second channel (8 identical 16 bit values), with range [0, 128], while the sum of all three factors must be 128
1180 * @param multiplicationFactors2_128_u_16x8 The multiplication factor for the third channel (8 identical 16 bit values), with range [0, 128], while the sum of all three factors must be 128
1181 */
1182 static OCEAN_FORCE_INLINE void convert3ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& multiplicationFactors0_128_u_16x8, const __m128i& multiplicationFactors1_128_u_16x8, const __m128i& multiplicationFactors2_128_u_16x8);
1183
1184 /**
1185 * Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear combination of the three channels plus a bias (translation) parameter.
1186 * Thus, this function can be used to e.g., convert RGB24 to YUV24, or YUV24 to RGB24.
1187 * The linear combination is defined by three integer multiplication factors for each source channel with 128 as denominator, plus one bias (translation) parameter for each target channel (with 1 as denominator).<br>
1188 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
1189 * The transformation is based on the following pattern:
1190 * <pre>
1191 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + b0
1192 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + b1
1193 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + b2
1194 * </pre>
1195 * With t target, s source, f factor, and b bias.
1196 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1197 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1198 * @param factorChannel00_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1199 * @param factorChannel10_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1200 * @param factorChannel20_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1201 * @param factorChannel01_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1202 * @param factorChannel11_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1203 * @param factorChannel21_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1204 * @param factorChannel02_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1205 * @param factorChannel12_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1206 * @param factorChannel22_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1207 * @param biasChannel0_s_16x8 The bias (translation) value for the first target channel, with range [-127, 127]
1208 * @param biasChannel1_s_16x8 The bias (translation) value for the second target channel, with range [-127, 127]
1209 * @param biasChannel2_s_16x8 The bias (translation) value for the third target channel, with range [-127, 127]
1210 */
1211 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_128_s_16x8, const __m128i& factorChannel10_128_s_16x8, const __m128i& factorChannel20_128_s_16x8, const __m128i& factorChannel01_128_s_16x8, const __m128i& factorChannel11_128_s_16x8, const __m128i& factorChannel21_128_s_16x8, const __m128i& factorChannel02_128_s_16x8, const __m128i& factorChannel12_128_s_16x8, const __m128i& factorChannel22_128_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8);
1212
1213 /**
1214 * Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear combination of the three channels plus a bias (translation) parameter.
1215 * Thus, this function can be used to e.g., convert RGB24 to YUV24, or YUV24 to RGB24.
1216 * The linear combination is defined by three integer multiplication factors for each source channel with 1024 as denominator, plus one bias (translation) parameter for each target channel (NOTE(review): the `_1024_` infix of the bias parameter names suggests 1024 as denominator as well, not 1 - confirm against the implementation).<br>
1217 * Beware: As this function applies integer multiplication factors (with 10 bits precision) the conversion result has an accuracy of +/- 1 color intensities.<br>
1218 * The transformation is based on the following pattern:
1219 * <pre>
1220 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + b0
1221 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + b1
1222 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + b2
1223 * </pre>
1224 * With t target, s source, f factor, and b bias.
1225 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1226 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1227 * @param factorChannel00_1024_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-1024 * 16, 1024 * 16]
1228 * @param factorChannel10_1024_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-1024 * 16, 1024 * 16]
1229 * @param factorChannel20_1024_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-1024 * 16, 1024 * 16]
1230 * @param factorChannel01_1024_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-1024 * 16, 1024 * 16]
1231 * @param factorChannel11_1024_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-1024 * 16, 1024 * 16]
1232 * @param factorChannel21_1024_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-1024 * 16, 1024 * 16]
1233 * @param factorChannel02_1024_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-1024 * 16, 1024 * 16]
1234 * @param factorChannel12_1024_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-1024 * 16, 1024 * 16]
1235 * @param factorChannel22_1024_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-1024 * 16, 1024 * 16]
1236 * @param biasChannel0_1024_s_32x4 The bias (translation) value for the first target channel, with range [-1024 * 16, 1024 * 16]
1237 * @param biasChannel1_1024_s_32x4 The bias (translation) value for the second target channel, with range [-1024 * 16, 1024 * 16]
1238 * @param biasChannel2_1024_s_32x4 The bias (translation) value for the third target channel, with range [-1024 * 16, 1024 * 16]
1239 */
1240 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel10BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_1024_s_16x8, const __m128i& factorChannel10_1024_s_16x8, const __m128i& factorChannel20_1024_s_16x8, const __m128i& factorChannel01_1024_s_16x8, const __m128i& factorChannel11_1024_s_16x8, const __m128i& factorChannel21_1024_s_16x8, const __m128i& factorChannel02_1024_s_16x8, const __m128i& factorChannel12_1024_s_16x8, const __m128i& factorChannel22_1024_s_16x8, const __m128i& biasChannel0_1024_s_32x4, const __m128i& biasChannel1_1024_s_32x4, const __m128i& biasChannel2_1024_s_32x4);
1241
1242 /**
1243 * Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear combination of the three channels, with a bias (translation) parameter that is subtracted from the source channels in advance.
1244 * Thus, this function can be used to e.g., convert YUV24 to RGB24.
1245 * The linear combination is defined by three integer multiplication factors for each source channel with 64 as denominator, plus one bias (translation) parameter for each source channel (with 1 as denominator).<br>
1246 * Beware: As this function applies integer multiplication factors (with 6 bits precision) the conversion result has an accuracy of +/- 4 color intensities.<br>
1247 * The transformation is based on the following pattern:
1248 * <pre>
1249 * t0 = clamp(0, f00 * (s0 - b0) + f01 * (s1 - b1) + f02 * (s2 - b2), 255)
1250 * t1 = clamp(0, f10 * (s0 - b0) + f11 * (s1 - b1) + f12 * (s2 - b2), 255)
1251 * t2 = clamp(0, f20 * (s0 - b0) + f21 * (s1 - b1) + f22 * (s2 - b2), 255)
1252 * </pre>
1253 * With t target, s source, f factor, and b bias/translation.
1254 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1255 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1256 * @param factorChannel00_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1257 * @param factorChannel10_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1258 * @param factorChannel20_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1259 * @param factorChannel01_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1260 * @param factorChannel11_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1261 * @param factorChannel21_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1262 * @param factorChannel02_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1263 * @param factorChannel12_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1264 * @param factorChannel22_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1265 * @param biasChannel0_s_16x8 The bias (translation) value for the first source channel, with range [0, 128]
1266 * @param biasChannel1_s_16x8 The bias (translation) value for the second source channel, with range [0, 128]
1267 * @param biasChannel2_s_16x8 The bias (translation) value for the third source channel, with range [0, 128]
1268 */
1269 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel6BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_64_s_16x8, const __m128i& factorChannel10_64_s_16x8, const __m128i& factorChannel20_64_s_16x8, const __m128i& factorChannel01_64_s_16x8, const __m128i& factorChannel11_64_s_16x8, const __m128i& factorChannel21_64_s_16x8, const __m128i& factorChannel02_64_s_16x8, const __m128i& factorChannel12_64_s_16x8, const __m128i& factorChannel22_64_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8);
1270
1271 /**
1272 * Converts 16 pixels with 3 channels per pixel to 16 pixels with four channels per pixel by a linear combination of the three channels, with a bias (translation) parameter that is subtracted from the source channels in advance; the fourth target channel is set to a constant value.
1273 * Thus, this function can be used to e.g., convert YUV24 to RGBA32.
1274 * The linear combination is defined by three integer multiplication factors for each source channel with 64 as denominator, plus one bias (translation) parameter for each source channel (with 1 as denominator).<br>
1275 * Beware: As this function applies integer multiplication factors (with 6 bits precision) the conversion result has an accuracy of +/- 4 color intensities.<br>
1276 * The transformation is based on the following pattern:
1277 * <pre>
1278 * t0 = clamp(0, f00 * (s0 - b0) + f01 * (s1 - b1) + f02 * (s2 - b2), 255)
1279 * t1 = clamp(0, f10 * (s0 - b0) + f11 * (s1 - b1) + f12 * (s2 - b2), 255)
1280 * t2 = clamp(0, f20 * (s0 - b0) + f21 * (s1 - b1) + f22 * (s2 - b2), 255)
1281 * t3 = channelValue3
1282 * </pre>
1283 * With t target, s source, f factor, and b bias/translation.
1284 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1285 * @param target The pointer to the 16 target pixels (with 4 channels = 64 bytes) receiving the converted pixel data, must be valid
1286 * @param factorChannel00_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1287 * @param factorChannel10_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1288 * @param factorChannel20_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1289 * @param factorChannel01_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1290 * @param factorChannel11_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1291 * @param factorChannel21_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1292 * @param factorChannel02_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1293 * @param factorChannel12_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1294 * @param factorChannel22_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1295 * @param biasChannel0_s_16x8 The bias (translation) value for the first source channel, with range [0, 128]
1296 * @param biasChannel1_s_16x8 The bias (translation) value for the second source channel, with range [0, 128]
1297 * @param biasChannel2_s_16x8 The bias (translation) value for the third source channel, with range [0, 128]
1298 * @param channelValue3_u_8x16 The constant value for the fourth target channel, with range [0, 255]
1299 */
1300 static OCEAN_FORCE_INLINE void convert3ChannelsTo4Channels16Pixels8BitPerChannel6BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_64_s_16x8, const __m128i& factorChannel10_64_s_16x8, const __m128i& factorChannel20_64_s_16x8, const __m128i& factorChannel01_64_s_16x8, const __m128i& factorChannel11_64_s_16x8, const __m128i& factorChannel21_64_s_16x8, const __m128i& factorChannel02_64_s_16x8, const __m128i& factorChannel12_64_s_16x8, const __m128i& factorChannel22_64_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8, const __m128i& channelValue3_u_8x16);
1301
1302 /**
1303 * Converts 16 pixels with 4 channels per pixel to 16 pixels with three channels per pixel by a linear combination of the four channels plus a bias (translation) parameter.
1304 * Thus, this function can be used to e.g., convert YUVA32 to RGB24.
1305 * The linear combination is defined by four integer multiplication factors for each target channel (one per source channel) with 128 as denominator, plus one bias (translation) parameter for each target channel (with 1 as denominator).<br>
1306 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
1307 * The transformation is based on the following pattern:
1308 * <pre>
1309 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + f03 * s3 + b0
1310 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + f13 * s3 + b1
1311 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + f23 * s3 + b2
1312 * </pre>
1313 * With t target, s source, f factor, and b bias.
1314 * @param source The pointer to the 16 source pixels (with 4 channels = 64 bytes) to convert, must be valid
1315 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1316 * @param factorChannel00_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1317 * @param factorChannel10_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1318 * @param factorChannel20_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1319 * @param factorChannel01_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1320 * @param factorChannel11_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1321 * @param factorChannel21_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1322 * @param factorChannel02_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1323 * @param factorChannel12_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1324 * @param factorChannel22_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1325 * @param factorChannel03_128_s_16x8 The multiplication factor (8 identical factors) for the fourth source channel and for the first target channel, with range [-127, 127]
1326 * @param factorChannel13_128_s_16x8 The multiplication factor (8 identical factors) for the fourth source channel and for the second target channel, with range [-127, 127]
1327 * @param factorChannel23_128_s_16x8 The multiplication factor (8 identical factors) for the fourth source channel and for the third target channel, with range [-127, 127]
1328 * @param biasChannel0_s_16x8 The bias (translation) value for the first target channel, with range [-127, 127]
1329 * @param biasChannel1_s_16x8 The bias (translation) value for the second target channel, with range [-127, 127]
1330 * @param biasChannel2_s_16x8 The bias (translation) value for the third target channel, with range [-127, 127]
1331 */
1332 static OCEAN_FORCE_INLINE void convert4ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_128_s_16x8, const __m128i& factorChannel10_128_s_16x8, const __m128i& factorChannel20_128_s_16x8, const __m128i& factorChannel01_128_s_16x8, const __m128i& factorChannel11_128_s_16x8, const __m128i& factorChannel21_128_s_16x8, const __m128i& factorChannel02_128_s_16x8, const __m128i& factorChannel12_128_s_16x8, const __m128i& factorChannel22_128_s_16x8, const __m128i& factorChannel03_128_s_16x8, const __m128i& factorChannel13_128_s_16x8, const __m128i& factorChannel23_128_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8);
1333
1334 /**
1335 * Converts 16 pixels with 4 channels per pixel to 16 pixels with one channel per pixel by a linear combination of the four channels.
1336 * This function can be used to e.g., convert RGBA32 to Y8, or ARGB32 to Y8, or RGB32 to Y8.
1337 * The linear combination is defined by one integer multiplication factor for each source channel with 128 as denominator.<br>
1338 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
1339 * @param source The pointer to the 16 source pixels (with 4 channels = 64 bytes) to convert, must be valid
1340 * @param target The pointer to the 16 target pixels (with 1 channel = 16 bytes) receiving the converted pixel data, must be valid
1341 * @param multiplicationFactors0123_128_s_32x The four individual multiplication factors, one for each source channel, with ranges [0, 127], while the sum of all four factors must be 128
1342 */
1343 static OCEAN_FORCE_INLINE void convert4ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& multiplicationFactors0123_128_s_32x);
1344
1345 /**
1346 * Converts 16 pixels with 4 channels per pixel to 16 pixels with two channels per pixel by a linear combination of the four channels.
1347 * This function can be used to e.g., convert RGBA32 to YA16, or ARGB32 to AY16.
1348 * The linear combination is defined by one integer multiplication factor for each pair of source channel and target channel with 128 as denominator.<br>
1349 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
1350 * @param source The pointer to the 16 source pixels (with 4 channels = 64 bytes) to convert, must be valid
1351 * @param target The pointer to the 16 target pixels (with 2 channels = 32 bytes) receiving the converted pixel data, must be valid
1352 * @param multiplicationFactorsChannel0_0123_128_s_16x8 The four individual multiplication factors for the first target channel (two sets), one for each source channel, with ranges [0, 128], while the sum of all four factors must be 128
1353 * @param multiplicationFactorsChannel1_0123_128_s_16x8 The four individual multiplication factors for the second target channel (two sets), one for each source channel, with ranges [0, 128], while the sum of all four factors must be 128
1354 */
1355 static OCEAN_FORCE_INLINE void convert4ChannelsTo2Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& multiplicationFactorsChannel0_0123_128_s_16x8, const __m128i& multiplicationFactorsChannel1_0123_128_s_16x8);
1356
1357#endif // OCEAN_HARDWARE_SSE_VERSION >= 41
1358
1359#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1360
1361 /**
1362 * Converts 8 pixels with 3 channels per pixel to 8 pixels with one channel per pixel by a linear combination of the three channels.
1363 * Thus, this function can be used to e.g., convert RGB24 to Y8, or BGR24 to Y8.
1364 * The linear combination is defined by one integer multiplication factor for each source channel with 128 as denominator.<br>
1365 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
1366 * @param source The pointer to the 8 source pixels (with 3 channels = 24 bytes) to convert, must be valid
1367 * @param target The pointer to the 8 target pixels (with 1 channel = 8 bytes) receiving the converted pixel data, must be valid
1368 * @param factorChannel0_128_u_8x8 The multiplication factor (8 identical factors) for the first source channel, with range [0, 128]
1369 * @param factorChannel1_128_u_8x8 The multiplication factor (8 identical factors) for the second source channel, with range [0, 128 - factorChannel0 - factorChannel2]
1370 * @param factorChannel2_128_u_8x8 The multiplication factor (8 identical factors) for the third source channel, with range [0, 128 - factorChannel0 - factorChannel1]
1371 * @tparam tUseFactorChannel0 True, if the value(s) of factorChannel0 are not zero; False, if the value(s) of factorChannel0 are zero
1372 * @tparam tUseFactorChannel1 True, if the value(s) of factorChannel1 are not zero; False, if the value(s) of factorChannel1 are zero
1373 * @tparam tUseFactorChannel2 True, if the value(s) of factorChannel2 are not zero; False, if the value(s) of factorChannel2 are zero
1374 */
1375 template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2>
1376 static OCEAN_FORCE_INLINE void convert3ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const uint8x8_t& factorChannel0_128_u_8x8, const uint8x8_t& factorChannel1_128_u_8x8, const uint8x8_t& factorChannel2_128_u_8x8);
1377
1378 /**
1379 * Converts 8 pixels with 3 channels per pixel to 8 pixels with three channels per pixel by a linear combination of the three channels, with a bias (translation) parameter that is subtracted from the source channels in advance.
1380 * Thus, this function can be used to e.g., convert YUV24 to RGB24.
1381 * The linear combination is defined by three integer multiplication factors for each source channel with 64 as denominator, plus one bias (translation) parameter for each source channel (with 1 as denominator).<br>
1382 * Beware: As this function applies integer multiplication factors (with 6 bits precision) the conversion result has an accuracy of +/- 4 color intensities.<br>
1383 * The transformation is based on the following pattern:
1384 * <pre>
1385 * t0 = clamp(0, f00 * (s0 - b0) + f01 * (s1 - b1) + f02 * (s2 - b2), 255)
1386 * t1 = clamp(0, f10 * (s0 - b0) + f11 * (s1 - b1) + f12 * (s2 - b2), 255)
1387 * t2 = clamp(0, f20 * (s0 - b0) + f21 * (s1 - b1) + f22 * (s2 - b2), 255)
1388 * </pre>
1389 * With t target, s source, f factor, and b bias/translation.
1390 * @param source The pointer to the 8 source pixels (with 3 channels = 24 bytes) to convert, must be valid
1391 * @param target The pointer to the 8 target pixels (with 3 channels = 24 bytes) receiving the converted pixel data, must be valid
1392 * @param factorChannel00_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1393 * @param factorChannel10_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1394 * @param factorChannel20_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1395 * @param factorChannel01_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1396 * @param factorChannel11_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1397 * @param factorChannel21_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1398 * @param factorChannel02_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1399 * @param factorChannel12_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1400 * @param factorChannel22_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1401 * @param biasChannel0_u_8x8 The bias (translation) value for the first source channel, with range [0, 128]
1402 * @param biasChannel1_u_8x8 The bias (translation) value for the second source channel, with range [0, 128]
1403 * @param biasChannel2_u_8x8 The bias (translation) value for the third source channel, with range [0, 128]
1404 */
1405 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels8Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_64_s_16x8, const int16x8_t& factorChannel10_64_s_16x8, const int16x8_t& factorChannel20_64_s_16x8, const int16x8_t& factorChannel01_64_s_16x8, const int16x8_t& factorChannel11_64_s_16x8, const int16x8_t& factorChannel21_64_s_16x8, const int16x8_t& factorChannel02_64_s_16x8, const int16x8_t& factorChannel12_64_s_16x8, const int16x8_t& factorChannel22_64_s_16x8, const uint8x8_t& biasChannel0_u_8x8, const uint8x8_t& biasChannel1_u_8x8, const uint8x8_t& biasChannel2_u_8x8);
1406
1407 /**
1408 * Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear combination of the three channels, with a bias (translation) parameter that is subtracted from the source channels in advance.
1409 * Thus, this function can be used to e.g., convert YUV24 to RGB24.
1410 * The linear combination is defined by three integer multiplication factors for each source channel with 64 as denominator, plus one bias (translation) parameter for each source channel (with 1 as denominator).<br>
1411 * Beware: As this function applies integer multiplication factors (with 6 bits precision) the conversion result has an accuracy of +/- 4 color intensities.<br>
1412 * The transformation is based on the following pattern:
1413 * <pre>
1414 * t0 = clamp(0, f00 * (s0 - b0) + f01 * (s1 - b1) + f02 * (s2 - b2), 255)
1415 * t1 = clamp(0, f10 * (s0 - b0) + f11 * (s1 - b1) + f12 * (s2 - b2), 255)
1416 * t2 = clamp(0, f20 * (s0 - b0) + f21 * (s1 - b1) + f22 * (s2 - b2), 255)
1417 * </pre>
1418 * With t target, s source, f factor, and b bias/translation.
1419 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1420 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1421 * @param factorChannel00_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1422 * @param factorChannel10_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1423 * @param factorChannel20_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1424 * @param factorChannel01_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1425 * @param factorChannel11_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1426 * @param factorChannel21_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1427 * @param factorChannel02_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1428 * @param factorChannel12_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1429 * @param factorChannel22_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1430 * @param biasChannel0_u_8x8 The bias (translation) value for the first source channel, with range [0, 128]
1431 * @param biasChannel1_u_8x8 The bias (translation) value for the second source channel, with range [0, 128]
1432 * @param biasChannel2_u_8x8 The bias (translation) value for the third source channel, with range [0, 128]
1433 */
1434 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_64_s_16x8, const int16x8_t& factorChannel10_64_s_16x8, const int16x8_t& factorChannel20_64_s_16x8, const int16x8_t& factorChannel01_64_s_16x8, const int16x8_t& factorChannel11_64_s_16x8, const int16x8_t& factorChannel21_64_s_16x8, const int16x8_t& factorChannel02_64_s_16x8, const int16x8_t& factorChannel12_64_s_16x8, const int16x8_t& factorChannel22_64_s_16x8, const uint8x8_t& biasChannel0_u_8x8, const uint8x8_t& biasChannel1_u_8x8, const uint8x8_t& biasChannel2_u_8x8);
1435
1436 /**
1437 * Converts 8 pixels with 3 channels per pixel to 8 pixels with three channels per pixel by a linear combination of the three channels plus a bias (translation) parameter.
1438 * Thus, this function can be used to e.g., convert RGB24 to YUV24, or YUV24 to RGB24.
1439 * The linear combination is defined by three integer multiplication factors for each source channel with 128 as denominator, plus one bias (translation) parameter for each target channel (also with 128 as denominator).<br>
1440 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
1441 * The transformation is based on the following pattern:
1442 * <pre>
1443 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + b0
1444 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + b1
1445 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + b2
1446 * </pre>
1447 * With t target, s source, f factor, and b bias.
1448 * @param source The pointer to the 8 source pixels (with 3 channels = 24 bytes) to convert, must be valid
1449 * @param target The pointer to the 8 target pixels (with 3 channels = 24 bytes) receiving the converted pixel data, must be valid
1450 * @param factorChannel00_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1451 * @param factorChannel10_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1452 * @param factorChannel20_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1453 * @param factorChannel01_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1454 * @param factorChannel11_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1455 * @param factorChannel21_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1456 * @param factorChannel02_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1457 * @param factorChannel12_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1458 * @param factorChannel22_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1459 * @param biasChannel0_128_s_16x8 The bias (translation) value for the first target channel, with range [-128 * 128, 128 * 128]
1460 * @param biasChannel1_128_s_16x8 The bias (translation) value for the second target channel, with range [-128 * 128, 128 * 128]
1461 * @param biasChannel2_128_s_16x8 The bias (translation) value for the third target channel, with range [-128 * 128, 128 * 128]
1462 */
1463 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_128_s_16x8, const int16x8_t& factorChannel10_128_s_16x8, const int16x8_t& factorChannel20_128_s_16x8, const int16x8_t& factorChannel01_128_s_16x8, const int16x8_t& factorChannel11_128_s_16x8, const int16x8_t& factorChannel21_128_s_16x8, const int16x8_t& factorChannel02_128_s_16x8, const int16x8_t& factorChannel12_128_s_16x8, const int16x8_t& factorChannel22_128_s_16x8, const int16x8_t& biasChannel0_128_s_16x8, const int16x8_t& biasChannel1_128_s_16x8, const int16x8_t& biasChannel2_128_s_16x8);
1464
1465 /**
1466 * Converts 8 pixels with 3 channels per pixel to 8 pixels with three channels per pixel by a linear combination of the three channels plus a bias (translation) parameter.
1467 * Thus, this function can be used to e.g., convert YUV24 to RGB24, or YVU24 to BGR24.
1468 * The linear combination is defined by three integer multiplication factors for each source channel with 1024 as denominator, plus one bias (translation) parameter for each target channel (also with 1024 as denominator).<br>
1469 * Beware: As this function applies integer multiplication factors (with 10 bits precision) the conversion result has an accuracy of +/- 1 color intensities.<br>
1470 * The transformation is based on the following pattern:
1471 * <pre>
1472 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + b0
1473 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + b1
1474 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + b2
1475 * </pre>
1476 * With t target, s source, f factor, and b bias.
1477 * @param source The pointer to the 8 source pixels (with 3 channels = 24 bytes) to convert, must be valid
1478 * @param target The pointer to the 8 target pixels (with 3 channels = 24 bytes) receiving the converted pixel data, must be valid
1479 * @param factorChannel00_1024_s_16x4 The multiplication factor (4 identical factors) for the first source channel and for the first target channel, with range [-32767, 32767]
1480 * @param factorChannel10_1024_s_16x4 The multiplication factor (4 identical factors) for the first source channel and for the second target channel, with range [-32767, 32767]
1481 * @param factorChannel20_1024_s_16x4 The multiplication factor (4 identical factors) for the first source channel and for the third target channel, with range [-32767, 32767]
1482 * @param factorChannel01_1024_s_16x4 The multiplication factor (4 identical factors) for the second source channel and for the first target channel, with range [-32767, 32767]
1483 * @param factorChannel11_1024_s_16x4 The multiplication factor (4 identical factors) for the second source channel and for the second target channel, with range [-32767, 32767]
1484 * @param factorChannel21_1024_s_16x4 The multiplication factor (4 identical factors) for the second source channel and for the third target channel, with range [-32767, 32767]
1485 * @param factorChannel02_1024_s_16x4 The multiplication factor (4 identical factors) for the third source channel and for the first target channel, with range [-32767, 32767]
1486 * @param factorChannel12_1024_s_16x4 The multiplication factor (4 identical factors) for the third source channel and for the second target channel, with range [-32767, 32767]
1487 * @param factorChannel22_1024_s_16x4 The multiplication factor (4 identical factors) for the third source channel and for the third target channel, with range [-32767, 32767]
1488 * @param biasChannel0_1024_s_32x4 The bias (translation) value for the first target channel, with range [-32767, 32767]
1489 * @param biasChannel1_1024_s_32x4 The bias (translation) value for the second target channel, with range [-32767, 32767]
1490 * @param biasChannel2_1024_s_32x4 The bias (translation) value for the third target channel, with range [-32767, 32767]
1491 */
1492 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels8Pixels8BitPerChannel10BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x4_t& factorChannel00_1024_s_16x4, const int16x4_t& factorChannel10_1024_s_16x4, const int16x4_t& factorChannel20_1024_s_16x4, const int16x4_t& factorChannel01_1024_s_16x4, const int16x4_t& factorChannel11_1024_s_16x4, const int16x4_t& factorChannel21_1024_s_16x4, const int16x4_t& factorChannel02_1024_s_16x4, const int16x4_t& factorChannel12_1024_s_16x4, const int16x4_t& factorChannel22_1024_s_16x4, const int32x4_t& biasChannel0_1024_s_32x4, const int32x4_t& biasChannel1_1024_s_32x4, const int32x4_t& biasChannel2_1024_s_32x4);
1493
1494 /**
1495 * Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear combination of the three channels plus a bias (translation) parameter.
1496 * Thus, this function can be used to e.g., convert YUV24 to RGB24, or YVU24 to BGR24.
1497 * The linear combination is defined by three integer multiplication factors for each source channel with 1024 as denominator, plus one bias (translation) parameter for each target channel (also with 1024 as denominator).<br>
1498 * Beware: As this function applies integer multiplication factors (with 10 bits precision) the conversion result has an accuracy of +/- 1 color intensities.<br>
1499 * The transformation is based on the following pattern:
1500 * <pre>
1501 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + b0
1502 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + b1
1503 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + b2
1504 * </pre>
1505 * With t target, s source, f factor, and b bias.
1506 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1507 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1508 * @param factorChannel00_1024_s_16x4 The multiplication factor (4 identical factors) for the first source channel and for the first target channel, with range [-32767, 32767]
1509 * @param factorChannel10_1024_s_16x4 The multiplication factor (4 identical factors) for the first source channel and for the second target channel, with range [-32767, 32767]
1510 * @param factorChannel20_1024_s_16x4 The multiplication factor (4 identical factors) for the first source channel and for the third target channel, with range [-32767, 32767]
1511 * @param factorChannel01_1024_s_16x4 The multiplication factor (4 identical factors) for the second source channel and for the first target channel, with range [-32767, 32767]
1512 * @param factorChannel11_1024_s_16x4 The multiplication factor (4 identical factors) for the second source channel and for the second target channel, with range [-32767, 32767]
1513 * @param factorChannel21_1024_s_16x4 The multiplication factor (4 identical factors) for the second source channel and for the third target channel, with range [-32767, 32767]
1514 * @param factorChannel02_1024_s_16x4 The multiplication factor (4 identical factors) for the third source channel and for the first target channel, with range [-32767, 32767]
1515 * @param factorChannel12_1024_s_16x4 The multiplication factor (4 identical factors) for the third source channel and for the second target channel, with range [-32767, 32767]
1516 * @param factorChannel22_1024_s_16x4 The multiplication factor (4 identical factors) for the third source channel and for the third target channel, with range [-32767, 32767]
1517 * @param biasChannel0_1024_s_32x4 The bias (translation) value for the first target channel, with range [-32767, 32767]
1518 * @param biasChannel1_1024_s_32x4 The bias (translation) value for the second target channel, with range [-32767, 32767]
1519 * @param biasChannel2_1024_s_32x4 The bias (translation) value for the third target channel, with range [-32767, 32767]
1520 */
1521 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel10BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x4_t& factorChannel00_1024_s_16x4, const int16x4_t& factorChannel10_1024_s_16x4, const int16x4_t& factorChannel20_1024_s_16x4, const int16x4_t& factorChannel01_1024_s_16x4, const int16x4_t& factorChannel11_1024_s_16x4, const int16x4_t& factorChannel21_1024_s_16x4, const int16x4_t& factorChannel02_1024_s_16x4, const int16x4_t& factorChannel12_1024_s_16x4, const int16x4_t& factorChannel22_1024_s_16x4, const int32x4_t& biasChannel0_1024_s_32x4, const int32x4_t& biasChannel1_1024_s_32x4, const int32x4_t& biasChannel2_1024_s_32x4);
1522
1523 /**
1524 * Converts 16 pixels with 3 channels per pixel to 16 pixels with 3 channels per pixel by a linear combination of the three channels plus a bias (translation) parameter.
1525 * Thus, this function can be used to e.g., convert RGB24 to YUV24, or YUV24 to RGB24.
1526 * The linear combination is defined by three integer multiplication factors for each source channel with 128 as denominator, plus one bias (translation) parameter for each target channel (with 128 as denominator).<br>
1527 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
1528 * The transformation is based on the following pattern:
1529 * <pre>
1530 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + b0
1531 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + b1
1532 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + b2
1533 * </pre>
1534 * With t target, s source, f factor, and b bias.
1535 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1536 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1537 * @param factorChannel00_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1538 * @param factorChannel10_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1539 * @param factorChannel20_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1540 * @param factorChannel01_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1541 * @param factorChannel11_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1542 * @param factorChannel21_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1543 * @param factorChannel02_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1544 * @param factorChannel12_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1545 * @param factorChannel22_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1546 * @param biasChannel0_128_s_16x8 The bias (translation) value for the first target channel, with range [-128 * 128, 128 * 128]
1547 * @param biasChannel1_128_s_16x8 The bias (translation) value for the second target channel, with range [-128 * 128, 128 * 128]
1548 * @param biasChannel2_128_s_16x8 The bias (translation) value for the third target channel, with range [-128 * 128, 128 * 128]
1549 */
1550 static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_128_s_16x8, const int16x8_t& factorChannel10_128_s_16x8, const int16x8_t& factorChannel20_128_s_16x8, const int16x8_t& factorChannel01_128_s_16x8, const int16x8_t& factorChannel11_128_s_16x8, const int16x8_t& factorChannel21_128_s_16x8, const int16x8_t& factorChannel02_128_s_16x8, const int16x8_t& factorChannel12_128_s_16x8, const int16x8_t& factorChannel22_128_s_16x8, const int16x8_t& biasChannel0_128_s_16x8, const int16x8_t& biasChannel1_128_s_16x8, const int16x8_t& biasChannel2_128_s_16x8);
1551
1552 /**
1553 * Converts 16 pixels with 3 channels per pixel to 16 pixels with 4 channels per pixel by a linear combination of the three channels plus a bias (translation) parameter.
1554 * The fourth channel is set to a constant value, e.g., for an alpha channel.<br>
1555 * Thus, this function can be used to e.g., convert YUV24 to RGBA32, or YVU24 to BGRA32.<br>
1556 * The linear combination is defined by three integer multiplication factors for each source channel with 64 as denominator, plus one bias (translation) parameter for each target channel (NOTE(review): the bias parameters carry no denominator suffix, confirm their scaling against the implementation).<br>
1557 * Beware: As this function applies integer multiplication factors (with 6 bits precision) the conversion result has a limited accuracy (documented as +/- 2 color intensities, NOTE(review): confirm this bound for 6 bits precision).<br>
1558 * The transformation is based on the following pattern:
1559 * <pre>
1560 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + b0
1561 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + b1
1562 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + b2
1563 * t3 = channelValue3
1564 * </pre>
1565 * With t target, s source, f factor, and b bias.
1566 * @param source The pointer to the 16 source pixels (with 3 channels = 48 bytes) to convert, must be valid
1567 * @param target The pointer to the 16 target pixels (with 4 channels = 64 bytes) receiving the converted pixel data, must be valid
1568 * @param factorChannel00_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1569 * @param factorChannel10_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1570 * @param factorChannel20_64_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1571 * @param factorChannel01_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1572 * @param factorChannel11_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1573 * @param factorChannel21_64_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1574 * @param factorChannel02_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1575 * @param factorChannel12_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1576 * @param factorChannel22_64_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1577 * @param biasChannel0_u_8x8 The bias (translation) value for the first target channel, with range [0, 128]
1578 * @param biasChannel1_u_8x8 The bias (translation) value for the second target channel, with range [0, 128]
1579 * @param biasChannel2_u_8x8 The bias (translation) value for the third target channel, with range [0, 128] (NOTE(review): upper bound assumed to match the other two bias parameters, confirm)
1580 * @param channelValue3_u_8x16 The constant value for the fourth target channel, with range [0, 255]
1581 */
1582 static OCEAN_FORCE_INLINE void convert3ChannelsTo4Channels16Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_64_s_16x8, const int16x8_t& factorChannel10_64_s_16x8, const int16x8_t& factorChannel20_64_s_16x8, const int16x8_t& factorChannel01_64_s_16x8, const int16x8_t& factorChannel11_64_s_16x8, const int16x8_t& factorChannel21_64_s_16x8, const int16x8_t& factorChannel02_64_s_16x8, const int16x8_t& factorChannel12_64_s_16x8, const int16x8_t& factorChannel22_64_s_16x8, const uint8x8_t& biasChannel0_u_8x8, const uint8x8_t& biasChannel1_u_8x8, const uint8x8_t& biasChannel2_u_8x8, const uint8x16_t& channelValue3_u_8x16);
1583
1584 /**
1585 * Converts 8 pixels with 4 channels per pixel to 8 pixels with one channel per pixel by a linear combination of the four channels.
1586 * Thus, this function can be used to e.g., convert RGBA32 to Y8, or ARGB32 to Y8, or RGB32 to Y8.
1587 * The linear combination is defined by one integer multiplication factor for each channel with 128 as denominator.<br>
1588 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
1589 * @param source The pointer to the 8 source pixels (with 4 channels = 32 bytes) to convert, must be valid
1590 * @param target The pointer to the 8 target pixels (with 1 channel = 8 bytes) receiving the converted pixel data, must be valid
1591 * @param factorChannel0_128_u_8x8 The multiplication factor (8 identical factors) for the first channel, with range [0, 127]
1592 * @param factorChannel1_128_u_8x8 The multiplication factor (8 identical factors) for the second channel, with range [0, 127 - factorChannel0 - factorChannel2 - factorChannel3]
1593 * @param factorChannel2_128_u_8x8 The multiplication factor (8 identical factors) for the third channel, with range [0, 127 - factorChannel0 - factorChannel1 - factorChannel3]
1594 * @param factorChannel3_128_u_8x8 The multiplication factor (8 identical factors) for the fourth channel, with range [0, 127 - factorChannel0 - factorChannel1 - factorChannel2]
1595 * @tparam tUseFactorChannel0 True, if the value(s) of factorChannel0 are not zero; False, if the value(s) of factorChannel0 are zero
1596 * @tparam tUseFactorChannel1 True, if the value(s) of factorChannel1 are not zero; False, if the value(s) of factorChannel1 are zero
1597 * @tparam tUseFactorChannel2 True, if the value(s) of factorChannel2 are not zero; False, if the value(s) of factorChannel2 are zero
1598 * @tparam tUseFactorChannel3 True, if the value(s) of factorChannel3 are not zero; False, if the value(s) of factorChannel3 are zero
1599 */
1600 template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2, bool tUseFactorChannel3>
1601 static OCEAN_FORCE_INLINE void convert4ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const uint8x8_t& factorChannel0_128_u_8x8, const uint8x8_t& factorChannel1_128_u_8x8, const uint8x8_t& factorChannel2_128_u_8x8, const uint8x8_t& factorChannel3_128_u_8x8);
1602
1603 /**
1604 * Converts 8 pixels with 4 channels per pixel to 8 pixels with two channels per pixel by a linear combination of the four channels.
1605 * Thus, this function can be used to e.g., convert RGBA32 to YA16, or ARGB32 to AY16.
1606 * The linear combination is defined by one integer multiplication factor for each channel with 128 as denominator.<br>
1607 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.
1608 * @param source The pointer to the 8 source pixels (with 4 channels = 32 bytes) to convert, must be valid
1609 * @param target The pointer to the 8 target pixels (with 2 channels = 16 bytes) receiving the converted pixel data, must be valid
1610 * @param factorChannel00_128_u_8x8 The multiplication factor (8 identical factors) for the first target and first source channel, with range [0, 127]
1611 * @param factorChannel10_128_u_8x8 The multiplication factor (8 identical factors) for the second target and first source channel, with range [0, 127]
1612 * @param factorChannel01_128_u_8x8 The multiplication factor (8 identical factors) for the first target and second source channel, with range [0, 127 - factorChannel00 - factorChannel02 - factorChannel03]
1613 * @param factorChannel11_128_u_8x8 The multiplication factor (8 identical factors) for the second target and second source channel, with range [0, 127 - factorChannel10 - factorChannel12 - factorChannel13]
1614 * @param factorChannel02_128_u_8x8 The multiplication factor (8 identical factors) for the first target and third source channel, with range [0, 127 - factorChannel00 - factorChannel01 - factorChannel03]
1615 * @param factorChannel12_128_u_8x8 The multiplication factor (8 identical factors) for the second target and third source channel, with range [0, 127 - factorChannel10 - factorChannel11 - factorChannel13]
1616 * @param factorChannel03_128_u_8x8 The multiplication factor (8 identical factors) for the first target and fourth source channel, with range [0, 127 - factorChannel00 - factorChannel01 - factorChannel02]
1617 * @param factorChannel13_128_u_8x8 The multiplication factor (8 identical factors) for the second target and fourth source channel, with range [0, 127 - factorChannel10 - factorChannel11 - factorChannel12]
1618 */
1619 static OCEAN_FORCE_INLINE void convert4ChannelsTo2Channels8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const uint8x8_t& factorChannel00_128_u_8x8, const uint8x8_t& factorChannel10_128_u_8x8, const uint8x8_t& factorChannel01_128_u_8x8, const uint8x8_t& factorChannel11_128_u_8x8, const uint8x8_t& factorChannel02_128_u_8x8, const uint8x8_t& factorChannel12_128_u_8x8, const uint8x8_t& factorChannel03_128_u_8x8, const uint8x8_t& factorChannel13_128_u_8x8);
1620
1621 /**
1622 * Converts 16 pixels with 4 channels per pixel to 16 pixels with three channels per pixel by a linear combination of the four channels plus a bias (translation) parameter.
1623 * Thus, this function can be used to e.g., convert YUVA32 to RGB24.
1624 * The linear combination is defined by four integer multiplication factors for each target channel (one per source channel) with 128 as denominator, plus one bias (translation) parameter for each target channel (also with 128 as denominator).<br>
1625 * Beware: As this function applies integer multiplication factors (with 7 bits precision) the conversion result has an accuracy of +/- 2 color intensities.<br>
1626 * The transformation is based on the following pattern:
1627 * <pre>
1628 * t0 = f00 * s0 + f01 * s1 + f02 * s2 + f03 * s3 + b0
1629 * t1 = f10 * s0 + f11 * s1 + f12 * s2 + f13 * s3 + b1
1630 * t2 = f20 * s0 + f21 * s1 + f22 * s2 + f23 * s3 + b2
1631 * </pre>
1632 * With t target, s source, f factor, and b bias.
1633 * @param source The pointer to the 16 source pixels (with 4 channels = 64 bytes) to convert, must be valid
1634 * @param target The pointer to the 16 target pixels (with 3 channels = 48 bytes) receiving the converted pixel data, must be valid
1635 * @param factorChannel00_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the first target channel, with range [-127, 127]
1636 * @param factorChannel10_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the second target channel, with range [-127, 127]
1637 * @param factorChannel20_128_s_16x8 The multiplication factor (8 identical factors) for the first source channel and for the third target channel, with range [-127, 127]
1638 * @param factorChannel01_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the first target channel, with range [-127, 127]
1639 * @param factorChannel11_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the second target channel, with range [-127, 127]
1640 * @param factorChannel21_128_s_16x8 The multiplication factor (8 identical factors) for the second source channel and for the third target channel, with range [-127, 127]
1641 * @param factorChannel02_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the first target channel, with range [-127, 127]
1642 * @param factorChannel12_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the second target channel, with range [-127, 127]
1643 * @param factorChannel22_128_s_16x8 The multiplication factor (8 identical factors) for the third source channel and for the third target channel, with range [-127, 127]
1644 * @param factorChannel03_128_s_16x8 The multiplication factor (8 identical factors) for the fourth source channel and for the first target channel, with range [-127, 127]
1645 * @param factorChannel13_128_s_16x8 The multiplication factor (8 identical factors) for the fourth source channel and for the second target channel, with range [-127, 127]
1646 * @param factorChannel23_128_s_16x8 The multiplication factor (8 identical factors) for the fourth source channel and for the third target channel, with range [-127, 127]
1647 * @param biasChannel0_128_s_16x8 The bias (translation) value for the first target channel, multiplied by 128, with range [-128 * 128, 128 * 128]
1648 * @param biasChannel1_128_s_16x8 The bias (translation) value for the second target channel, multiplied by 128, with range [-128 * 128, 128 * 128]
1649 * @param biasChannel2_128_s_16x8 The bias (translation) value for the third target channel, multiplied by 128, with range [-128 * 128, 128 * 128]
1650 */
1651 static OCEAN_FORCE_INLINE void convert4ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_128_s_16x8, const int16x8_t& factorChannel10_128_s_16x8, const int16x8_t& factorChannel20_128_s_16x8, const int16x8_t& factorChannel01_128_s_16x8, const int16x8_t& factorChannel11_128_s_16x8, const int16x8_t& factorChannel21_128_s_16x8, const int16x8_t& factorChannel02_128_s_16x8, const int16x8_t& factorChannel12_128_s_16x8, const int16x8_t& factorChannel22_128_s_16x8, const int16x8_t& factorChannel03_128_s_16x8, const int16x8_t& factorChannel13_128_s_16x8, const int16x8_t& factorChannel23_128_s_16x8, const int16x8_t& biasChannel0_128_s_16x8, const int16x8_t& biasChannel1_128_s_16x8, const int16x8_t& biasChannel2_128_s_16x8);
1652
1653#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1654
1655};
1656
1657#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1658
1659template <>
1660inline void FrameChannels::separateTo1Channel<uint8_t, uint8_t, 2u>(const uint8_t* const sourceFrame, uint8_t* const* const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int* targetFramesPaddingElements)
1661{
1662 ocean_assert(sourceFrame != nullptr);
1663 ocean_assert(targetFrames != nullptr);
1664
1665 ocean_assert(width != 0u && height != 0u);
1666 ocean_assert(channels == 2u);
1667
1668 constexpr unsigned int tChannels = 2u;
1669
1670 bool allTargetFramesContinuous = true;
1671
1672 if (targetFramesPaddingElements != nullptr)
1673 {
1674 for (unsigned int n = 0u; n < tChannels; ++n)
1675 {
1676 if (targetFramesPaddingElements[n] != 0u)
1677 {
1678 allTargetFramesContinuous = false;
1679 break;
1680 }
1681 }
1682 }
1683
1684 const uint8_t* source = sourceFrame;
1685 uint8_t* target0 = targetFrames[0];
1686 uint8_t* target1 = targetFrames[1];
1687
1688 constexpr unsigned int tBlockSize = 16u;
1689
1690 uint8x16x2_t source_8x16x2;
1691
1692 if (allTargetFramesContinuous && sourceFramePaddingElements == 0u)
1693 {
1694 const unsigned int pixels = width * height;
1695 const unsigned int blocks = pixels / tBlockSize;
1696 const unsigned int remaining = pixels % tBlockSize;
1697
1698 for (unsigned int n = 0u; n < blocks; ++n)
1699 {
1700 source_8x16x2 = vld2q_u8(source);
1701
1702 vst1q_u8(target0, source_8x16x2.val[0]);
1703 vst1q_u8(target1, source_8x16x2.val[1]);
1704
1705 source += tBlockSize * tChannels;
1706
1707 target0 += tBlockSize;
1708 target1 += tBlockSize;
1709 }
1710
1711 for (unsigned int n = 0u; n < remaining; ++n)
1712 {
1713 target0[n] = source[n * tChannels + 0u];
1714 target1[n] = source[n * tChannels + 1u];
1715 }
1716 }
1717 else
1718 {
1719 const unsigned int targetFrame0PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[0];
1720 const unsigned int targetFrame1PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[1];
1721
1722 const unsigned int blocks = width / tBlockSize;
1723 const unsigned int remaining = width % tBlockSize;
1724
1725 for (unsigned int y = 0u; y < height; ++y)
1726 {
1727 for (unsigned int n = 0u; n < blocks; ++n)
1728 {
1729 source_8x16x2 = vld2q_u8(source);
1730
1731 vst1q_u8(target0, source_8x16x2.val[0]);
1732 vst1q_u8(target1, source_8x16x2.val[1]);
1733
1734 source += tBlockSize * tChannels;
1735
1736 target0 += tBlockSize;
1737 target1 += tBlockSize;
1738 }
1739
1740 for (unsigned int n = 0u; n < remaining; ++n)
1741 {
1742 target0[n] = source[n * tChannels + 0u];
1743 target1[n] = source[n * tChannels + 1u];
1744 }
1745
1746 source += remaining * tChannels + sourceFramePaddingElements;
1747 target0 += remaining + targetFrame0PaddingElements;
1748 target1 += remaining + targetFrame1PaddingElements;
1749 }
1750 }
1751}
1752
1753template <>
1754inline void FrameChannels::separateTo1Channel<uint8_t, uint8_t, 3u>(const uint8_t* const sourceFrame, uint8_t* const* const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int* targetFramesPaddingElements)
1755{
1756 ocean_assert(sourceFrame != nullptr);
1757 ocean_assert(targetFrames != nullptr);
1758
1759 ocean_assert(width != 0u && height != 0u);
1760 ocean_assert(channels == 3u);
1761
1762 constexpr unsigned int tChannels = 3u;
1763
1764 bool allTargetFramesContinuous = true;
1765
1766 if (targetFramesPaddingElements != nullptr)
1767 {
1768 for (unsigned int n = 0u; n < tChannels; ++n)
1769 {
1770 if (targetFramesPaddingElements[n] != 0u)
1771 {
1772 allTargetFramesContinuous = false;
1773 break;
1774 }
1775 }
1776 }
1777
1778 const uint8_t* source = sourceFrame;
1779 uint8_t* target0 = targetFrames[0];
1780 uint8_t* target1 = targetFrames[1];
1781 uint8_t* target2 = targetFrames[2];
1782
1783 constexpr unsigned int tBlockSize = 16u;
1784
1785 uint8x16x3_t source_8x16x3;
1786
1787 if (allTargetFramesContinuous && sourceFramePaddingElements == 0u)
1788 {
1789 const unsigned int pixels = width * height;
1790 const unsigned int blocks = pixels / tBlockSize;
1791 const unsigned int remaining = pixels % tBlockSize;
1792
1793 for (unsigned int n = 0u; n < blocks; ++n)
1794 {
1795 source_8x16x3 = vld3q_u8(source);
1796
1797 vst1q_u8(target0, source_8x16x3.val[0]);
1798 vst1q_u8(target1, source_8x16x3.val[1]);
1799 vst1q_u8(target2, source_8x16x3.val[2]);
1800
1801 source += tBlockSize * tChannels;
1802
1803 target0 += tBlockSize;
1804 target1 += tBlockSize;
1805 target2 += tBlockSize;
1806 }
1807
1808 for (unsigned int n = 0u; n < remaining; ++n)
1809 {
1810 target0[n] = source[n * tChannels + 0u];
1811 target1[n] = source[n * tChannels + 1u];
1812 target2[n] = source[n * tChannels + 2u];
1813 }
1814 }
1815 else
1816 {
1817 const unsigned int targetFrame0PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[0];
1818 const unsigned int targetFrame1PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[1];
1819 const unsigned int targetFrame2PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[2];
1820
1821 const unsigned int blocks = width / tBlockSize;
1822 const unsigned int remaining = width % tBlockSize;
1823
1824 for (unsigned int y = 0u; y < height; ++y)
1825 {
1826 for (unsigned int n = 0u; n < blocks; ++n)
1827 {
1828 source_8x16x3 = vld3q_u8(source);
1829
1830 vst1q_u8(target0, source_8x16x3.val[0]);
1831 vst1q_u8(target1, source_8x16x3.val[1]);
1832 vst1q_u8(target2, source_8x16x3.val[2]);
1833
1834 source += tBlockSize * tChannels;
1835
1836 target0 += tBlockSize;
1837 target1 += tBlockSize;
1838 target2 += tBlockSize;
1839 }
1840
1841 for (unsigned int n = 0u; n < remaining; ++n)
1842 {
1843 target0[n] = source[n * tChannels + 0u];
1844 target1[n] = source[n * tChannels + 1u];
1845 target2[n] = source[n * tChannels + 2u];
1846 }
1847
1848 source += remaining * tChannels + sourceFramePaddingElements;
1849 target0 += remaining + targetFrame0PaddingElements;
1850 target1 += remaining + targetFrame1PaddingElements;
1851 target2 += remaining + targetFrame2PaddingElements;
1852 }
1853 }
1854}
1855
1856template <>
1857inline void FrameChannels::separateTo1Channel<uint8_t, uint8_t, 4u>(const uint8_t* const sourceFrame, uint8_t* const* const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int* targetFramesPaddingElements)
1858{
1859 ocean_assert(sourceFrame != nullptr);
1860 ocean_assert(targetFrames != nullptr);
1861
1862 ocean_assert(width != 0u && height != 0u);
1863 ocean_assert(channels == 4u);
1864
1865 constexpr unsigned int tChannels = 4u;
1866
1867 bool allTargetFramesContinuous = true;
1868
1869 if (targetFramesPaddingElements != nullptr)
1870 {
1871 for (unsigned int n = 0u; n < tChannels; ++n)
1872 {
1873 if (targetFramesPaddingElements[n] != 0u)
1874 {
1875 allTargetFramesContinuous = false;
1876 break;
1877 }
1878 }
1879 }
1880
1881 const uint8_t* source = sourceFrame;
1882 uint8_t* target0 = targetFrames[0];
1883 uint8_t* target1 = targetFrames[1];
1884 uint8_t* target2 = targetFrames[2];
1885 uint8_t* target3 = targetFrames[3];
1886
1887 constexpr unsigned int tBlockSize = 16u;
1888
1889 uint8x16x4_t source_8x16x4;
1890
1891 if (allTargetFramesContinuous && sourceFramePaddingElements == 0u)
1892 {
1893 const unsigned int pixels = width * height;
1894 const unsigned int blocks = pixels / tBlockSize;
1895 const unsigned int remaining = pixels % tBlockSize;
1896
1897 for (unsigned int n = 0u; n < blocks; ++n)
1898 {
1899 source_8x16x4 = vld4q_u8(source);
1900
1901 vst1q_u8(target0, source_8x16x4.val[0]);
1902 vst1q_u8(target1, source_8x16x4.val[1]);
1903 vst1q_u8(target2, source_8x16x4.val[2]);
1904 vst1q_u8(target3, source_8x16x4.val[3]);
1905
1906 source += tBlockSize * tChannels;
1907
1908 target0 += tBlockSize;
1909 target1 += tBlockSize;
1910 target2 += tBlockSize;
1911 target3 += tBlockSize;
1912 }
1913
1914 for (unsigned int n = 0u; n < remaining; ++n)
1915 {
1916 target0[n] = source[n * tChannels + 0u];
1917 target1[n] = source[n * tChannels + 1u];
1918 target2[n] = source[n * tChannels + 2u];
1919 target3[n] = source[n * tChannels + 3u];
1920 }
1921 }
1922 else
1923 {
1924 const unsigned int targetFrame0PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[0];
1925 const unsigned int targetFrame1PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[1];
1926 const unsigned int targetFrame2PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[2];
1927 const unsigned int targetFrame3PaddingElements = targetFramesPaddingElements == nullptr ? 0u : targetFramesPaddingElements[3];
1928
1929 const unsigned int blocks = width / tBlockSize;
1930 const unsigned int remaining = width % tBlockSize;
1931
1932 for (unsigned int y = 0u; y < height; ++y)
1933 {
1934 for (unsigned int n = 0u; n < blocks; ++n)
1935 {
1936 source_8x16x4 = vld4q_u8(source);
1937
1938 vst1q_u8(target0, source_8x16x4.val[0]);
1939 vst1q_u8(target1, source_8x16x4.val[1]);
1940 vst1q_u8(target2, source_8x16x4.val[2]);
1941 vst1q_u8(target3, source_8x16x4.val[3]);
1942
1943 source += tBlockSize * tChannels;
1944
1945 target0 += tBlockSize;
1946 target1 += tBlockSize;
1947 target2 += tBlockSize;
1948 target3 += tBlockSize;
1949 }
1950
1951 for (unsigned int n = 0u; n < remaining; ++n)
1952 {
1953 target0[n] = source[n * tChannels + 0u];
1954 target1[n] = source[n * tChannels + 1u];
1955 target2[n] = source[n * tChannels + 2u];
1956 target3[n] = source[n * tChannels + 3u];
1957 }
1958
1959 source += remaining * tChannels + sourceFramePaddingElements;
1960 target0 += remaining + targetFrame0PaddingElements;
1961 target1 += remaining + targetFrame1PaddingElements;
1962 target2 += remaining + targetFrame2PaddingElements;
1963 target3 += remaining + targetFrame3PaddingElements;
1964 }
1965 }
1966}
1967
1968#endif // OCEAN_HARDWARE_NEON_VERSION
1969
1970template <typename TSource, typename TTarget, unsigned int tChannels>
1971void FrameChannels::separateTo1Channel(const TSource* const sourceFrame, TTarget* const* const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int* targetFramesPaddingElements)
1972{
1973 ocean_assert(sourceFrame != nullptr);
1974 ocean_assert(targetFrames != nullptr);
1975
1976 ocean_assert(width != 0u && height != 0u);
1977
1978 ocean_assert(tChannels == CHANNELS_NOT_KNOWN_AT_COMPILE_TIME || tChannels == channels);
1979
1980 if constexpr (tChannels == CHANNELS_NOT_KNOWN_AT_COMPILE_TIME)
1981 {
1982 separateTo1ChannelRuntime<TSource, TTarget>(sourceFrame, targetFrames, width, height, channels, sourceFramePaddingElements, targetFramesPaddingElements);
1983 return;
1984 }
1985
1986#ifdef OCEAN_DEBUG
1987 for (unsigned int c = 0u; c < tChannels; ++c)
1988 {
1989 ocean_assert(targetFrames[c] != nullptr);
1990 }
1991#endif
1992
1993 if (sourceFramePaddingElements == 0u && targetFramesPaddingElements == nullptr)
1994 {
1995 for (unsigned int n = 0u; n < width * height; ++n)
1996 {
1997 for (unsigned int c = 0u; c < tChannels; ++c)
1998 {
1999 targetFrames[c][n] = TTarget(sourceFrame[n * tChannels + c]);
2000 }
2001 }
2002 }
2003 else if (targetFramesPaddingElements == nullptr)
2004 {
2005 ocean_assert(sourceFramePaddingElements != 0u);
2006
2007 const unsigned int sourceFrameStrideElements = width * tChannels + sourceFramePaddingElements;
2008
2009 for (unsigned int y = 0u; y < height; ++y)
2010 {
2011 const TSource* const sourceRow = sourceFrame + y * sourceFrameStrideElements;
2012
2013 const unsigned int targetRowOffset = y * width;
2014
2015 for (unsigned int x = 0u; x < width; ++x)
2016 {
2017 for (unsigned int c = 0u; c < tChannels; ++c)
2018 {
2019 *(targetFrames[c] + targetRowOffset + x) = TTarget(*(sourceRow + x * tChannels + c));
2020 }
2021 }
2022 }
2023 }
2024 else
2025 {
2026 const unsigned int sourceFrameStrideElements = width * tChannels + sourceFramePaddingElements;
2027
2028 Indices32 targetFrameStrideElements(tChannels);
2029
2030 for (unsigned int c = 0u; c < tChannels; ++c)
2031 {
2032 targetFrameStrideElements[c] = width + targetFramesPaddingElements[c];
2033 }
2034
2035 for (unsigned int y = 0u; y < height; ++y)
2036 {
2037 const TSource* const sourceRow = sourceFrame + y * sourceFrameStrideElements;
2038
2039 for (unsigned int x = 0u; x < width; ++x)
2040 {
2041 for (unsigned int c = 0u; c < tChannels; ++c)
2042 {
2043 *(targetFrames[c] + y * targetFrameStrideElements[c] + x) = TTarget(*(sourceRow + x * tChannels + c));
2044 }
2045 }
2046 }
2047 }
2048}
2049
2050template <typename TSource, typename TTarget>
2051void FrameChannels::separateTo1Channel(const TSource* const sourceFrame, const std::initializer_list<TTarget*>& targetFrames, const unsigned int width, const unsigned int height, const unsigned int sourceFramePaddingElements, const std::initializer_list<const unsigned int>& targetFramesPaddingElements)
2052{
2053 ocean_assert(targetFrames.size() >= 1);
2054 ocean_assert(targetFramesPaddingElements.size() == 0 || targetFrames.size() == targetFramesPaddingElements.size());
2055
2056 if (targetFrames.size() == 2)
2057 {
2058 separateTo1Channel<TSource, TTarget, 2u>(sourceFrame, targetFrames.begin(), width, height, (unsigned int)(targetFrames.size()), sourceFramePaddingElements, targetFramesPaddingElements.size() == 0 ? nullptr : targetFramesPaddingElements.begin());
2059 }
2060 else if (targetFrames.size() == 3)
2061 {
2062 separateTo1Channel<TSource, TTarget, 3u>(sourceFrame, targetFrames.begin(), width, height, (unsigned int)(targetFrames.size()), sourceFramePaddingElements, targetFramesPaddingElements.size() == 0 ? nullptr : targetFramesPaddingElements.begin());
2063 }
2064 else if (targetFrames.size() == 4)
2065 {
2066 separateTo1Channel<TSource, TTarget, 4u>(sourceFrame, targetFrames.begin(), width, height, (unsigned int)(targetFrames.size()), sourceFramePaddingElements, targetFramesPaddingElements.size() == 0 ? nullptr : targetFramesPaddingElements.begin());
2067 }
2068 else
2069 {
2070 separateTo1Channel<TSource, TTarget, CHANNELS_NOT_KNOWN_AT_COMPILE_TIME>(sourceFrame, targetFrames.begin(), width, height, (unsigned int)(targetFrames.size()), sourceFramePaddingElements, targetFramesPaddingElements.size() == 0 ? nullptr : targetFramesPaddingElements.begin());
2071 }
2072}
2073
2074#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2075
2076template <>
2077inline void FrameChannels::zipChannels<uint8_t, uint8_t, 2u>(const uint8_t* const* sourceFrames, uint8_t* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
2078{
2079 ocean_assert(sourceFrames != nullptr);
2080 ocean_assert(targetFrame != nullptr);
2081
2082 ocean_assert(width != 0u && height != 0u);
2083 ocean_assert(channels == 2u);
2084
2085 constexpr unsigned int tChannels = 2u;
2086
2087 bool allSourceFramesContinuous = true;
2088
2089 if (sourceFramesPaddingElements != nullptr)
2090 {
2091 for (unsigned int n = 0u; n < tChannels; ++n)
2092 {
2093 if (sourceFramesPaddingElements[n] != 0u)
2094 {
2095 allSourceFramesContinuous = false;
2096 break;
2097 }
2098 }
2099 }
2100
2101 const uint8_t* source0 = sourceFrames[0];
2102 const uint8_t* source1 = sourceFrames[1];
2103 uint8_t* target = targetFrame;
2104
2105 constexpr unsigned int tBlockSize = 16u;
2106
2107 uint8x16x2_t source_8x16x2;
2108
2109 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
2110 {
2111 const unsigned int pixels = width * height;
2112 const unsigned int blocks = pixels / tBlockSize;
2113 const unsigned int remaining = pixels % tBlockSize;
2114
2115 for (unsigned int n = 0u; n < blocks; ++n)
2116 {
2117 source_8x16x2.val[0] = vld1q_u8(source0);
2118 source_8x16x2.val[1] = vld1q_u8(source1);
2119
2120 vst2q_u8(target, source_8x16x2);
2121
2122 source0 += tBlockSize;
2123 source1 += tBlockSize;
2124
2125 target += tBlockSize * tChannels;
2126 }
2127
2128 for (unsigned int n = 0u; n < remaining; ++n)
2129 {
2130 target[n * tChannels + 0u] = source0[n];
2131 target[n * tChannels + 1u] = source1[n];
2132 }
2133 }
2134 else
2135 {
2136 const unsigned int sourceFrame0PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[0];
2137 const unsigned int sourceFrame1PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[1];
2138
2139 const unsigned int blocks = width / tBlockSize;
2140 const unsigned int remaining = width % tBlockSize;
2141
2142 for (unsigned int y = 0u; y < height; ++y)
2143 {
2144 for (unsigned int n = 0u; n < blocks; ++n)
2145 {
2146 source_8x16x2.val[0] = vld1q_u8(source0);
2147 source_8x16x2.val[1] = vld1q_u8(source1);
2148
2149 vst2q_u8(target, source_8x16x2);
2150
2151 source0 += tBlockSize;
2152 source1 += tBlockSize;
2153
2154 target += tBlockSize * tChannels;
2155 }
2156
2157 for (unsigned int n = 0u; n < remaining; ++n)
2158 {
2159 target[n * tChannels + 0u] = source0[n];
2160 target[n * tChannels + 1u] = source1[n];
2161 }
2162
2163 source0 += remaining + sourceFrame0PaddingElements;
2164 source1 += remaining + sourceFrame1PaddingElements;
2165 target += remaining * tChannels + targetFramePaddingElements;
2166 }
2167 }
2168}
2169
2170template <>
2171inline void FrameChannels::zipChannels<uint8_t, uint8_t, 3u>(const uint8_t* const* sourceFrames, uint8_t* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
2172{
2173 ocean_assert(sourceFrames != nullptr);
2174 ocean_assert(targetFrame != nullptr);
2175
2176 ocean_assert(width != 0u && height != 0u);
2177 ocean_assert(channels == 3u);
2178
2179 constexpr unsigned int tChannels = 3u;
2180
2181 bool allSourceFramesContinuous = true;
2182
2183 if (sourceFramesPaddingElements != nullptr)
2184 {
2185 for (unsigned int n = 0u; n < tChannels; ++n)
2186 {
2187 if (sourceFramesPaddingElements[n] != 0u)
2188 {
2189 allSourceFramesContinuous = false;
2190 break;
2191 }
2192 }
2193 }
2194
2195 const uint8_t* source0 = sourceFrames[0];
2196 const uint8_t* source1 = sourceFrames[1];
2197 const uint8_t* source2 = sourceFrames[2];
2198 uint8_t* target = targetFrame;
2199
2200 constexpr unsigned int tBlockSize = 16u;
2201
2202 uint8x16x3_t source_8x16x3;
2203
2204 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
2205 {
2206 const unsigned int pixels = width * height;
2207 const unsigned int blocks = pixels / tBlockSize;
2208 const unsigned int remaining = pixels % tBlockSize;
2209
2210 for (unsigned int n = 0u; n < blocks; ++n)
2211 {
2212 source_8x16x3.val[0] = vld1q_u8(source0);
2213 source_8x16x3.val[1] = vld1q_u8(source1);
2214 source_8x16x3.val[2] = vld1q_u8(source2);
2215
2216 vst3q_u8(target, source_8x16x3);
2217
2218 source0 += tBlockSize;
2219 source1 += tBlockSize;
2220 source2 += tBlockSize;
2221
2222 target += tBlockSize * tChannels;
2223 }
2224
2225 for (unsigned int n = 0u; n < remaining; ++n)
2226 {
2227 target[n * tChannels + 0u] = source0[n];
2228 target[n * tChannels + 1u] = source1[n];
2229 target[n * tChannels + 2u] = source2[n];
2230 }
2231 }
2232 else
2233 {
2234 const unsigned int sourceFrame0PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[0];
2235 const unsigned int sourceFrame1PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[1];
2236 const unsigned int sourceFrame2PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[2];
2237
2238 const unsigned int blocks = width / tBlockSize;
2239 const unsigned int remaining = width % tBlockSize;
2240
2241 for (unsigned int y = 0u; y < height; ++y)
2242 {
2243 for (unsigned int n = 0u; n < blocks; ++n)
2244 {
2245 source_8x16x3.val[0] = vld1q_u8(source0);
2246 source_8x16x3.val[1] = vld1q_u8(source1);
2247 source_8x16x3.val[2] = vld1q_u8(source2);
2248
2249 vst3q_u8(target, source_8x16x3);
2250
2251 source0 += tBlockSize;
2252 source1 += tBlockSize;
2253 source2 += tBlockSize;
2254
2255 target += tBlockSize * tChannels;
2256 }
2257
2258 for (unsigned int n = 0u; n < remaining; ++n)
2259 {
2260 target[n * tChannels + 0u] = source0[n];
2261 target[n * tChannels + 1u] = source1[n];
2262 target[n * tChannels + 2u] = source2[n];
2263 }
2264
2265 source0 += remaining + sourceFrame0PaddingElements;
2266 source1 += remaining + sourceFrame1PaddingElements;
2267 source2 += remaining + sourceFrame2PaddingElements;
2268 target += remaining * tChannels + targetFramePaddingElements;
2269 }
2270 }
2271}
2272
2273template <>
2274inline void FrameChannels::zipChannels<uint8_t, uint8_t, 4u>(const uint8_t* const* sourceFrames, uint8_t* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
2275{
2276 ocean_assert(sourceFrames != nullptr);
2277 ocean_assert(targetFrame != nullptr);
2278
2279 ocean_assert(width != 0u && height != 0u);
2280 ocean_assert(channels == 4u);
2281
2282 constexpr unsigned int tChannels = 4u;
2283
2284 bool allSourceFramesContinuous = true;
2285
2286 if (sourceFramesPaddingElements != nullptr)
2287 {
2288 for (unsigned int n = 0u; n < tChannels; ++n)
2289 {
2290 if (sourceFramesPaddingElements[n] != 0u)
2291 {
2292 allSourceFramesContinuous = false;
2293 break;
2294 }
2295 }
2296 }
2297
2298 const uint8_t* source0 = sourceFrames[0];
2299 const uint8_t* source1 = sourceFrames[1];
2300 const uint8_t* source2 = sourceFrames[2];
2301 const uint8_t* source3 = sourceFrames[3];
2302 uint8_t* target = targetFrame;
2303
2304 constexpr unsigned int tBlockSize = 16u;
2305
2306 uint8x16x4_t source_8x16x4;
2307
2308 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
2309 {
2310 const unsigned int pixels = width * height;
2311 const unsigned int blocks = pixels / tBlockSize;
2312 const unsigned int remaining = pixels % tBlockSize;
2313
2314 for (unsigned int n = 0u; n < blocks; ++n)
2315 {
2316 source_8x16x4.val[0] = vld1q_u8(source0);
2317 source_8x16x4.val[1] = vld1q_u8(source1);
2318 source_8x16x4.val[2] = vld1q_u8(source2);
2319 source_8x16x4.val[3] = vld1q_u8(source3);
2320
2321 vst4q_u8(target, source_8x16x4);
2322
2323 source0 += tBlockSize;
2324 source1 += tBlockSize;
2325 source2 += tBlockSize;
2326 source3 += tBlockSize;
2327
2328 target += tBlockSize * tChannels;
2329 }
2330
2331 for (unsigned int n = 0u; n < remaining; ++n)
2332 {
2333 target[n * tChannels + 0u] = source0[n];
2334 target[n * tChannels + 1u] = source1[n];
2335 target[n * tChannels + 2u] = source2[n];
2336 target[n * tChannels + 3u] = source3[n];
2337 }
2338 }
2339 else
2340 {
2341 const unsigned int sourceFrame0PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[0];
2342 const unsigned int sourceFrame1PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[1];
2343 const unsigned int sourceFrame2PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[2];
2344 const unsigned int sourceFrame3PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[3];
2345
2346 const unsigned int blocks = width / tBlockSize;
2347 const unsigned int remaining = width % tBlockSize;
2348
2349 for (unsigned int y = 0u; y < height; ++y)
2350 {
2351 for (unsigned int n = 0u; n < blocks; ++n)
2352 {
2353 source_8x16x4.val[0] = vld1q_u8(source0);
2354 source_8x16x4.val[1] = vld1q_u8(source1);
2355 source_8x16x4.val[2] = vld1q_u8(source2);
2356 source_8x16x4.val[3] = vld1q_u8(source3);
2357
2358 vst4q_u8(target, source_8x16x4);
2359
2360 source0 += tBlockSize;
2361 source1 += tBlockSize;
2362 source2 += tBlockSize;
2363 source3 += tBlockSize;
2364
2365 target += tBlockSize * tChannels;
2366 }
2367
2368 for (unsigned int n = 0u; n < remaining; ++n)
2369 {
2370 target[n * tChannels + 0u] = source0[n];
2371 target[n * tChannels + 1u] = source1[n];
2372 target[n * tChannels + 2u] = source2[n];
2373 target[n * tChannels + 3u] = source3[n];
2374 }
2375
2376 source0 += remaining + sourceFrame0PaddingElements;
2377 source1 += remaining + sourceFrame1PaddingElements;
2378 source2 += remaining + sourceFrame2PaddingElements;
2379 source3 += remaining + sourceFrame3PaddingElements;
2380 target += remaining * tChannels + targetFramePaddingElements;
2381 }
2382 }
2383}
2384
2385template <>
2386inline void FrameChannels::zipChannels<float, uint8_t, 2u>(const float* const* sourceFrames, uint8_t* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
2387{
2388 ocean_assert(sourceFrames != nullptr);
2389 ocean_assert(targetFrame != nullptr);
2390
2391 ocean_assert(width != 0u && height != 0u);
2392 ocean_assert(channels == 2u);
2393
2394 constexpr unsigned int tChannels = 2u;
2395
2396 bool allSourceFramesContinuous = true;
2397
2398 if (sourceFramesPaddingElements != nullptr)
2399 {
2400 for (unsigned int n = 0u; n < tChannels; ++n)
2401 {
2402 if (sourceFramesPaddingElements[n] != 0u)
2403 {
2404 allSourceFramesContinuous = false;
2405 break;
2406 }
2407 }
2408 }
2409
2410 const float* source0 = sourceFrames[0];
2411 const float* source1 = sourceFrames[1];
2412 uint8_t* target = targetFrame;
2413
2414 constexpr unsigned int tBlockSize = 16u;
2415
2416 uint8x16x2_t target_8x16x2;
2417
2418 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
2419 {
2420 const unsigned int pixels = width * height;
2421 const unsigned int blocks = pixels / tBlockSize;
2422 const unsigned int remaining = pixels % tBlockSize;
2423
2424 for (unsigned int n = 0u; n < blocks; ++n)
2425 {
2426 target_8x16x2.val[0] = NEON::cast16ElementsNEON(source0);
2427 target_8x16x2.val[1] = NEON::cast16ElementsNEON(source1);
2428
2429 vst2q_u8(target, target_8x16x2);
2430
2431 source0 += tBlockSize;
2432 source1 += tBlockSize;
2433
2434 target += tBlockSize * tChannels;
2435 }
2436
2437 for (unsigned int n = 0u; n < remaining; ++n)
2438 {
2439 ocean_assert(source0[n] >= 0.0f && source0[n] < 256.0f);
2440 ocean_assert(source1[n] >= 0.0f && source1[n] < 256.0f);
2441
2442 target[n * tChannels + 0u] = uint8_t(source0[n]);
2443 target[n * tChannels + 1u] = uint8_t(source1[n]);
2444 }
2445 }
2446 else
2447 {
2448 const unsigned int sourceFrame0PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[0];
2449 const unsigned int sourceFrame1PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[1];
2450
2451 const unsigned int blocks = width / tBlockSize;
2452 const unsigned int remaining = width % tBlockSize;
2453
2454 for (unsigned int y = 0u; y < height; ++y)
2455 {
2456 for (unsigned int n = 0u; n < blocks; ++n)
2457 {
2458 target_8x16x2.val[0] = NEON::cast16ElementsNEON(source0);
2459 target_8x16x2.val[1] = NEON::cast16ElementsNEON(source1);
2460
2461 vst2q_u8(target, target_8x16x2);
2462
2463 source0 += tBlockSize;
2464 source1 += tBlockSize;
2465
2466 target += tBlockSize * tChannels;
2467 }
2468
2469 for (unsigned int n = 0u; n < remaining; ++n)
2470 {
2471 ocean_assert(source0[n] >= 0.0f && source0[n] < 256.0f);
2472 ocean_assert(source1[n] >= 0.0f && source1[n] < 256.0f);
2473
2474 target[n * tChannels + 0u] = uint8_t(source0[n]);
2475 target[n * tChannels + 1u] = uint8_t(source1[n]);
2476 }
2477
2478 source0 += remaining + sourceFrame0PaddingElements;
2479 source1 += remaining + sourceFrame1PaddingElements;
2480 target += remaining * tChannels + targetFramePaddingElements;
2481 }
2482 }
2483}
2484
2485template <>
2486inline void FrameChannels::zipChannels<float, uint8_t, 3u>(const float* const* sourceFrames, uint8_t* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
2487{
2488 ocean_assert(sourceFrames != nullptr);
2489 ocean_assert(targetFrame != nullptr);
2490
2491 ocean_assert(width != 0u && height != 0u);
2492 ocean_assert(channels == 3u);
2493
2494 constexpr unsigned int tChannels = 3u;
2495
2496 bool allSourceFramesContinuous = true;
2497
2498 if (sourceFramesPaddingElements != nullptr)
2499 {
2500 for (unsigned int n = 0u; n < tChannels; ++n)
2501 {
2502 if (sourceFramesPaddingElements[n] != 0u)
2503 {
2504 allSourceFramesContinuous = false;
2505 break;
2506 }
2507 }
2508 }
2509
2510 const float* source0 = sourceFrames[0];
2511 const float* source1 = sourceFrames[1];
2512 const float* source2 = sourceFrames[2];
2513 uint8_t* target = targetFrame;
2514
2515 constexpr unsigned int tBlockSize = 16u;
2516
2517 uint8x16x3_t target_8x16x3;
2518
2519 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
2520 {
2521 const unsigned int pixels = width * height;
2522 const unsigned int blocks = pixels / tBlockSize;
2523 const unsigned int remaining = pixels % tBlockSize;
2524
2525 for (unsigned int n = 0u; n < blocks; ++n)
2526 {
2527 target_8x16x3.val[0] = NEON::cast16ElementsNEON(source0);
2528 target_8x16x3.val[1] = NEON::cast16ElementsNEON(source1);
2529 target_8x16x3.val[2] = NEON::cast16ElementsNEON(source2);
2530
2531 vst3q_u8(target, target_8x16x3);
2532
2533 source0 += tBlockSize;
2534 source1 += tBlockSize;
2535 source2 += tBlockSize;
2536
2537 target += tBlockSize * tChannels;
2538 }
2539
2540 for (unsigned int n = 0u; n < remaining; ++n)
2541 {
2542 ocean_assert(source0[n] >= 0.0f && source0[n] < 256.0f);
2543 ocean_assert(source1[n] >= 0.0f && source1[n] < 256.0f);
2544 ocean_assert(source2[n] >= 0.0f && source2[n] < 256.0f);
2545
2546 target[n * tChannels + 0u] = uint8_t(source0[n]);
2547 target[n * tChannels + 1u] = uint8_t(source1[n]);
2548 target[n * tChannels + 2u] = uint8_t(source2[n]);
2549 }
2550 }
2551 else
2552 {
2553 const unsigned int sourceFrame0PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[0];
2554 const unsigned int sourceFrame1PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[1];
2555 const unsigned int sourceFrame2PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[2];
2556
2557 const unsigned int blocks = width / tBlockSize;
2558 const unsigned int remaining = width % tBlockSize;
2559
2560 for (unsigned int y = 0u; y < height; ++y)
2561 {
2562 for (unsigned int n = 0u; n < blocks; ++n)
2563 {
2564 target_8x16x3.val[0] = NEON::cast16ElementsNEON(source0);
2565 target_8x16x3.val[1] = NEON::cast16ElementsNEON(source1);
2566 target_8x16x3.val[2] = NEON::cast16ElementsNEON(source2);
2567
2568
2569 vst3q_u8(target, target_8x16x3);
2570
2571 source0 += tBlockSize;
2572 source1 += tBlockSize;
2573 source2 += tBlockSize;
2574
2575 target += tBlockSize * tChannels;
2576 }
2577
2578 for (unsigned int n = 0u; n < remaining; ++n)
2579 {
2580 ocean_assert(source0[n] >= 0.0f && source0[n] < 256.0f);
2581 ocean_assert(source1[n] >= 0.0f && source1[n] < 256.0f);
2582 ocean_assert(source2[n] >= 0.0f && source2[n] < 256.0f);
2583
2584 target[n * tChannels + 0u] = uint8_t(source0[n]);
2585 target[n * tChannels + 1u] = uint8_t(source1[n]);
2586 target[n * tChannels + 2u] = uint8_t(source2[n]);
2587 }
2588
2589 source0 += remaining + sourceFrame0PaddingElements;
2590 source1 += remaining + sourceFrame1PaddingElements;
2591 source2 += remaining + sourceFrame2PaddingElements;
2592 target += remaining * tChannels + targetFramePaddingElements;
2593 }
2594 }
2595}
2596
2597template <>
2598inline void FrameChannels::zipChannels<float, uint8_t, 4u>(const float* const* sourceFrames, uint8_t* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
2599{
2600 ocean_assert(sourceFrames != nullptr);
2601 ocean_assert(targetFrame != nullptr);
2602
2603 ocean_assert(width != 0u && height != 0u);
2604 ocean_assert(channels == 4u);
2605
2606 constexpr unsigned int tChannels = 4u;
2607
2608 bool allSourceFramesContinuous = true;
2609
2610 if (sourceFramesPaddingElements != nullptr)
2611 {
2612 for (unsigned int n = 0u; n < tChannels; ++n)
2613 {
2614 if (sourceFramesPaddingElements[n] != 0u)
2615 {
2616 allSourceFramesContinuous = false;
2617 break;
2618 }
2619 }
2620 }
2621
2622 const float* source0 = sourceFrames[0];
2623 const float* source1 = sourceFrames[1];
2624 const float* source2 = sourceFrames[2];
2625 const float* source3 = sourceFrames[3];
2626 uint8_t* target = targetFrame;
2627
2628 constexpr unsigned int tBlockSize = 16u;
2629
2630 uint8x16x4_t target_8x16x4;
2631
2632 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
2633 {
2634 const unsigned int pixels = width * height;
2635 const unsigned int blocks = pixels / tBlockSize;
2636 const unsigned int remaining = pixels % tBlockSize;
2637
2638 for (unsigned int n = 0u; n < blocks; ++n)
2639 {
2640 target_8x16x4.val[0] = NEON::cast16ElementsNEON(source0);
2641 target_8x16x4.val[1] = NEON::cast16ElementsNEON(source1);
2642 target_8x16x4.val[2] = NEON::cast16ElementsNEON(source2);
2643 target_8x16x4.val[3] = NEON::cast16ElementsNEON(source3);
2644
2645 vst4q_u8(target, target_8x16x4);
2646
2647 source0 += tBlockSize;
2648 source1 += tBlockSize;
2649 source2 += tBlockSize;
2650 source3 += tBlockSize;
2651
2652 target += tBlockSize * tChannels;
2653 }
2654
2655 for (unsigned int n = 0u; n < remaining; ++n)
2656 {
2657 ocean_assert(source0[n] >= 0.0f && source0[n] < 256.0f);
2658 ocean_assert(source1[n] >= 0.0f && source1[n] < 256.0f);
2659 ocean_assert(source2[n] >= 0.0f && source2[n] < 256.0f);
2660 ocean_assert(source3[n] >= 0.0f && source3[n] < 256.0f);
2661
2662 target[n * tChannels + 0u] = uint8_t(source0[n]);
2663 target[n * tChannels + 1u] = uint8_t(source1[n]);
2664 target[n * tChannels + 2u] = uint8_t(source2[n]);
2665 target[n * tChannels + 3u] = uint8_t(source3[n]);
2666 }
2667 }
2668 else
2669 {
2670 const unsigned int sourceFrame0PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[0];
2671 const unsigned int sourceFrame1PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[1];
2672 const unsigned int sourceFrame2PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[2];
2673 const unsigned int sourceFrame3PaddingElements = sourceFramesPaddingElements == nullptr ? 0u : sourceFramesPaddingElements[3];
2674
2675 const unsigned int blocks = width / tBlockSize;
2676 const unsigned int remaining = width % tBlockSize;
2677
2678 for (unsigned int y = 0u; y < height; ++y)
2679 {
2680 for (unsigned int n = 0u; n < blocks; ++n)
2681 {
2682 target_8x16x4.val[0] = NEON::cast16ElementsNEON(source0);
2683 target_8x16x4.val[1] = NEON::cast16ElementsNEON(source1);
2684 target_8x16x4.val[2] = NEON::cast16ElementsNEON(source2);
2685 target_8x16x4.val[3] = NEON::cast16ElementsNEON(source3);
2686
2687 vst4q_u8(target, target_8x16x4);
2688
2689 source0 += tBlockSize;
2690 source1 += tBlockSize;
2691 source2 += tBlockSize;
2692 source3 += tBlockSize;
2693
2694 target += tBlockSize * tChannels;
2695 }
2696
2697 for (unsigned int n = 0u; n < remaining; ++n)
2698 {
2699 ocean_assert(source0[n] >= 0.0f && source0[n] < 256.0f);
2700 ocean_assert(source1[n] >= 0.0f && source1[n] < 256.0f);
2701 ocean_assert(source2[n] >= 0.0f && source2[n] < 256.0f);
2702 ocean_assert(source3[n] >= 0.0f && source3[n] < 256.0f);
2703
2704 target[n * tChannels + 0u] = uint8_t(source0[n]);
2705 target[n * tChannels + 1u] = uint8_t(source1[n]);
2706 target[n * tChannels + 2u] = uint8_t(source2[n]);
2707 target[n * tChannels + 3u] = uint8_t(source3[n]);
2708 }
2709
2710 source0 += remaining + sourceFrame0PaddingElements;
2711 source1 += remaining + sourceFrame1PaddingElements;
2712 source2 += remaining + sourceFrame2PaddingElements;
2713 source3 += remaining + sourceFrame3PaddingElements;
2714 target += remaining * tChannels + targetFramePaddingElements;
2715 }
2716 }
2717}
2718
2719#endif // defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2720
2721template <typename TSource, typename TTarget, unsigned int tChannels>
2722void FrameChannels::zipChannels(const TSource* const* sourceFrames, TTarget* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
2723{
2724 ocean_assert(sourceFrames != nullptr);
2725 ocean_assert(targetFrame != nullptr);
2726
2727 ocean_assert(width != 0u && height != 0u);
2728
2729 ocean_assert(tChannels == CHANNELS_NOT_KNOWN_AT_COMPILE_TIME || tChannels == channels);
2730
2731 if constexpr (tChannels == CHANNELS_NOT_KNOWN_AT_COMPILE_TIME)
2732 {
2733 zipChannelsRuntime<TSource, TTarget>(sourceFrames, targetFrame, width, height, channels, sourceFramesPaddingElements, targetFramePaddingElements);
2734 return;
2735 }
2736
2737 bool allSourceFramesContinuous = true;
2738
2739 if (sourceFramesPaddingElements != nullptr)
2740 {
2741 for (unsigned int n = 0u; n < tChannels; ++n)
2742 {
2743 if (sourceFramesPaddingElements[n] != 0u)
2744 {
2745 allSourceFramesContinuous = false;
2746 break;
2747 }
2748 }
2749 }
2750
2751 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
2752 {
2753 for (unsigned int n = 0u; n < width * height; ++n)
2754 {
2755 for (unsigned int c = 0u; c < tChannels; ++c)
2756 {
2757 targetFrame[n * tChannels + c] = TTarget(sourceFrames[c][n]);
2758 }
2759 }
2760 }
2761 else
2762 {
2763 const unsigned int targetFrameStrideElements = width * tChannels + targetFramePaddingElements;
2764
2765 Indices32 sourceFrameStrideElements(tChannels);
2766
2767 for (unsigned int c = 0u; c < tChannels; ++c)
2768 {
2769 if (sourceFramesPaddingElements == nullptr)
2770 {
2771 sourceFrameStrideElements[c] = width;
2772 }
2773 else
2774 {
2775 sourceFrameStrideElements[c] = width + sourceFramesPaddingElements[c];
2776 }
2777 }
2778
2779 for (unsigned int y = 0u; y < height; ++y)
2780 {
2781 TTarget* const targetRow = targetFrame + y * targetFrameStrideElements;
2782
2783 for (unsigned int x = 0u; x < width; ++x)
2784 {
2785 for (unsigned int c = 0u; c < tChannels; ++c)
2786 {
2787 *(targetRow + x * tChannels + c) = TTarget(*(sourceFrames[c] + y * sourceFrameStrideElements[c] + x));
2788 }
2789 }
2790 }
2791 }
2792}
2793
2794template <typename TSource, typename TTarget>
2795void FrameChannels::zipChannels(const std::initializer_list<const TSource*>& sourceFrames, TTarget* const targetFrame, const unsigned int width, const unsigned int height, const std::initializer_list<unsigned int>& sourceFramePaddingElements, const unsigned int targetFramePaddingElements)
2796{
2797 ocean_assert(sourceFrames.size() >= 1);
2798 ocean_assert(sourceFramePaddingElements.size() == 0 || sourceFrames.size() == sourceFramePaddingElements.size());
2799
2800 if (sourceFrames.size() == 2)
2801 {
2802 zipChannels<TSource, TTarget, 2u>(sourceFrames.begin(), targetFrame, width, height, (unsigned int)(sourceFrames.size()), sourceFramePaddingElements.size() == 0 ? nullptr : sourceFramePaddingElements.begin(), targetFramePaddingElements);
2803 }
2804 else if (sourceFrames.size() == 3)
2805 {
2806 zipChannels<TSource, TTarget, 3u>(sourceFrames.begin(), targetFrame, width, height, (unsigned int)(sourceFrames.size()), sourceFramePaddingElements.size() == 0 ? nullptr : sourceFramePaddingElements.begin(), targetFramePaddingElements);
2807 }
2808 else if (sourceFrames.size() == 4)
2809 {
2810 zipChannels<TSource, TTarget, 4u>(sourceFrames.begin(), targetFrame, width, height, (unsigned int)(sourceFrames.size()), sourceFramePaddingElements.size() == 0 ? nullptr : sourceFramePaddingElements.begin(), targetFramePaddingElements);
2811 }
2812 else
2813 {
2814 zipChannels<TSource, TTarget, CHANNELS_NOT_KNOWN_AT_COMPILE_TIME>(sourceFrames.begin(), targetFrame, width, height, (unsigned int)(sourceFrames.size()), sourceFramePaddingElements.size() == 0 ? nullptr : sourceFramePaddingElements.begin(), targetFramePaddingElements);
2815 }
2816}
2817
2818template <typename T, unsigned int tSourceChannels>
2819inline void FrameChannels::addFirstChannel(const T* source, const T* sourceNewChannel, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int sourceNewChannelPaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2820{
2821 static_assert(tSourceChannels != 0u, "Invalid channel number!");
2822
2823 ocean_assert(source != nullptr && sourceNewChannel != nullptr && target != nullptr);
2824 ocean_assert(source != target);
2825 ocean_assert(width >= 1u && height >= 1u);
2826
2827 const unsigned int options[3] = {sourcePaddingElements, sourceNewChannelPaddingElements, targetPaddingElements};
2828
2829 const void* sources[2] = {source, sourceNewChannel};
2830
2831 FrameConverter::convertArbitraryPixelFormat(sources, (void**)&target, width, height, conversionFlag, 1u, FrameChannels::addChannelRow<T, tSourceChannels, true>, options, worker);
2832}
2833
2834template <typename T, unsigned int tSourceChannels>
2835inline void FrameChannels::addFirstChannelValue(const T* source, const T newChannelValue, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2836{
2837 static_assert(tSourceChannels >= 1u, "Invalid channel number!");
2838
2839 ocean_assert(source != nullptr && target != nullptr);
2840 ocean_assert(width >= 1u && height >= 1u);
2841
2842 const unsigned int targetChannels = tSourceChannels + 1u;
2843
2844 const unsigned int sourceStrideElements = width * tSourceChannels + sourcePaddingElements;
2845 const unsigned int targetStrideElements = width * targetChannels + targetPaddingElements;
2846
2847 const void* channelValueParameter = (const void*)(&newChannelValue);
2848
2849 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
2850
2851 FrameConverter::convertGenericPixelFormat<T>(source, target, width, height, sourceStrideElements, targetStrideElements, conversionFlag, FrameChannels::addChannelValueRow<T, tSourceChannels, true>, FrameChannels::reverseRowPixelOrderInPlace<T, targetChannels>, areContinuous, channelValueParameter, worker);
2852}
2853
2854template <typename T, unsigned int tSourceChannels>
2855inline void FrameChannels::addLastChannel(const T* source, const T* sourceNewChannel, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int sourceNewChannelPaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2856{
2857 static_assert(tSourceChannels != 0u, "Invalid channel number!");
2858
2859 ocean_assert(source != nullptr && sourceNewChannel != nullptr && target != nullptr);
2860 ocean_assert(source != target);
2861 ocean_assert(width >= 1u && height >= 1u);
2862
2863 const unsigned int options[3] = {sourcePaddingElements, sourceNewChannelPaddingElements, targetPaddingElements};
2864
2865 const void* sources[2] = {source, sourceNewChannel};
2866
2867 FrameConverter::convertArbitraryPixelFormat(sources, (void**)&target, width, height, conversionFlag, 1u, FrameChannels::addChannelRow<T, tSourceChannels, false>, options, worker);
2868}
2869
2870template <typename T, unsigned int tSourceChannels>
2871inline void FrameChannels::addLastChannelValue(const T* source, const T newChannelValue, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2872{
2873 static_assert(tSourceChannels >= 1u, "Invalid channel number!");
2874
2875 ocean_assert(source != nullptr && target != nullptr);
2876 ocean_assert(width >= 1u && height >= 1u);
2877
2878 const unsigned int targetChannels = tSourceChannels + 1u;
2879
2880 const unsigned int sourceStrideElements = width * tSourceChannels + sourcePaddingElements;
2881 const unsigned int targetStrideElements = width * targetChannels + targetPaddingElements;
2882
2883 const void* channelValueParameter = (const void*)(&newChannelValue);
2884
2885 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
2886
2887 FrameConverter::convertGenericPixelFormat<T>(source, target, width, height, sourceStrideElements, targetStrideElements, conversionFlag, FrameChannels::addChannelValueRow<T, tSourceChannels, false>, FrameChannels::reverseRowPixelOrderInPlace<T, targetChannels>, areContinuous, channelValueParameter, worker);
2888}
2889
2890template <typename T, unsigned int tSourceChannels>
2891inline void FrameChannels::removeFirstChannel(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2892{
2893 static_assert(tSourceChannels >= 2u && tSourceChannels <= 8u, "Invalid channel number!");
2894
2895 ocean_assert(source != nullptr && target != nullptr);
2896 ocean_assert(width >= 1u && height >= 1u);
2897
2898 const unsigned int shufflePatternMax = 0x07654321u;
2899 const unsigned int mask = 0xFFFFFFFFu >> ((8u - tSourceChannels + 1u) * 4u); // e.g., 0xFF for tChannels == 3u, 0xFFF for tChannels == 4u
2900
2901 const unsigned int shufflePattern = shufflePatternMax & mask;
2902
2903 FrameChannels::shuffleChannels<T, tSourceChannels, tSourceChannels - 1u, shufflePattern>(source, target, width, height, conversionFlag, sourcePaddingElements, targetPaddingElements, worker);
2904}
2905
2906template <typename T, unsigned int tSourceChannels>
2907inline void FrameChannels::removeLastChannel(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2908{
2909 static_assert(tSourceChannels >= 2u && tSourceChannels <= 8u, "Invalid channel number!");
2910
2911 ocean_assert(source != nullptr && target != nullptr);
2912 ocean_assert(width >= 1u && height >= 1u);
2913
2914 const unsigned int shufflePatternMax = 0x76543210u;
2915 const unsigned int mask = 0xFFFFFFFFu >> ((8u - tSourceChannels + 1u) * 4u); // e.g., 0xFF for tChannels == 3u, 0xFFF for tChannels == 4u
2916
2917 const unsigned int shufflePattern = shufflePatternMax & mask;
2918
2919 FrameChannels::shuffleChannels<T, tSourceChannels, tSourceChannels - 1u, shufflePattern>(source, target, width, height, conversionFlag, sourcePaddingElements, targetPaddingElements, worker);
2920}
2921
2922template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tSourceChannelIndex, unsigned int tTargetChannelIndex>
2923inline void FrameChannels::copyChannel(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2924{
2925 static_assert(tSourceChannels >= 1u, "Invalid number of channels!");
2926 static_assert(tTargetChannels >= 1u, "Invalid number of channels!");
2927
2928 static_assert(tSourceChannelIndex < tSourceChannels, "Invalid channel index!");
2929 static_assert(tTargetChannelIndex < tTargetChannels, "Invalid channel index!");
2930
2931 ocean_assert(source != nullptr && target != nullptr);
2932 ocean_assert(width >= 1u && height >= 1u);
2933
2934 const unsigned int sourceStrideElements = width * tSourceChannels + sourcePaddingElements;
2935 const unsigned int targetStrideElements = width * tTargetChannels + targetPaddingElements;
2936
2937 constexpr RowReversePixelOrderInPlaceFunction<T> reversePixelOrderRowInPlaceFunction = nullptr;
2938
2939 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
2940
2941 FrameConverter::convertGenericPixelFormat<T>(source, target, width, height, sourceStrideElements, targetStrideElements, CONVERT_NORMAL, FrameChannels::copyChannelRow<T, tSourceChannels, tTargetChannels, tSourceChannelIndex, tTargetChannelIndex>, reversePixelOrderRowInPlaceFunction, areContinuous, nullptr, worker);
2942}
2943
2944template <typename T, unsigned int tChannel, unsigned int tChannels>
2945inline void FrameChannels::setChannel(T* frame, const unsigned int width, const unsigned int height, const T value, const unsigned int framePaddingElements, Worker* worker)
2946{
2947 static_assert(tChannels >= 1u, "Invalid channel number!");
2948 static_assert(tChannel < tChannels, "Invalid channel index!");
2949
2950 ocean_assert(frame != nullptr);
2951 ocean_assert(width >= 1u && height >= 1u);
2952
2953 if (worker)
2954 {
2955 worker->executeFunction(Worker::Function::createStatic(&setChannelSubset<T, tChannel, tChannels>, frame, width, value, framePaddingElements, 0u, 0u), 0u, height);
2956 }
2957 else
2958 {
2959 setChannelSubset<T, tChannel, tChannels>(frame, width, value, framePaddingElements, 0u, height);
2960 }
2961}
2962
2963template <typename T, unsigned int tChannels>
2964inline void FrameChannels::reverseChannelOrder(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
2965{
2966 static_assert(tChannels >= 1u, "Invalid channel number!");
2967
2968 ocean_assert(source != nullptr && target != nullptr);
2969 ocean_assert(width >= 1u && height >= 1u);
2970
2971 const unsigned int sourceStrideElements = width * tChannels + sourcePaddingElements;
2972 const unsigned int targetStrideElements = width * tChannels + targetPaddingElements;
2973
2974 constexpr bool areContinuous = false; // even if both images are continuous, we must reverse each line by another
2975
2976 FrameConverter::convertGenericPixelFormat<T>(source, target, width, height, sourceStrideElements, targetStrideElements, conversionFlag, FrameChannels::reverseRowChannelOrder<T, tChannels>, FrameChannels::reverseRowPixelOrderInPlace<T, tChannels>, areContinuous, nullptr, worker);
2977}
2978
2979template <typename T, unsigned int tChannels>
2980void FrameChannels::reverseRowPixelOrder(const T* source, T* target, const size_t size)
2981{
2982 static_assert(tChannels >= 1u, "Invalid channel number!");
2983
2984 ocean_assert(source != nullptr && target != nullptr);
2985 ocean_assert(size >= 1);
2986
2987#ifdef OCEAN_DEBUG
2988 const T* const debugSourceStart = source;
2989 const T* const debugSourceEnd = debugSourceStart + size * tChannels;
2990
2991 const T* const debugTargetStart = target;
2992 const T* const debugTargetEnd = debugTargetStart + size * tChannels;
2993#endif
2994
2995 // moving target to the end of the memory block
2996 target += size * tChannels;
2997
2998 const T* const sourceEnd = source + size * tChannels;
2999
3000#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3001
3002 if constexpr (std::is_same<typename TypeMapper<T>::Type, uint8_t>::value)
3003 {
3004 const size_t blocks16 = size / size_t(16);
3005
3006 switch (tChannels)
3007 {
3008 case 1u:
3009 {
3010 for (size_t n = 0; n < blocks16; ++n)
3011 {
3012 target -= 16u * tChannels;
3013
3014 ocean_assert(source >= debugSourceStart && source + 16u * tChannels <= debugSourceEnd);
3015 ocean_assert(target >= debugTargetStart && target + 16u * tChannels <= debugTargetEnd);
3016
3017 const uint8x16_t source_u_8x16 = vld1q_u8((const uint8_t*)(source));
3018 uint8x16_t revSource_u_8x16 = vrev64q_u8(source_u_8x16);
3019 revSource_u_8x16 = vcombine_u8(vget_high_u8(revSource_u_8x16), vget_low_u8(revSource_u_8x16));
3020
3021 vst1q_u8((uint8_t*)(target), revSource_u_8x16);
3022
3023 source += 16u * tChannels;
3024 }
3025
3026 break;
3027 }
3028
3029 case 2u:
3030 {
3031 for (size_t n = 0; n < blocks16; ++n)
3032 {
3033 target -= 16u * tChannels;
3034
3035 ocean_assert(source >= debugSourceStart && source + 16u * tChannels <= debugSourceEnd);
3036 ocean_assert(target >= debugTargetStart && target + 16u * tChannels <= debugTargetEnd);
3037
3038 const uint8x16_t sourceA_u_8x16 = vld1q_u8((const uint8_t*)(source) + 0);
3039 const uint8x16_t sourceB_u_8x16 = vld1q_u8((const uint8_t*)(source) + 16);
3040
3041 const uint8x16_t revSourceA_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u8(sourceA_u_8x16)));
3042 const uint8x16_t revSourceB_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u8(sourceB_u_8x16)));
3043
3044 const uint8x16_t targetA_u_8x16 = vcombine_u8(vget_high_u8(revSourceA_u_8x16), vget_low_u8(revSourceA_u_8x16));
3045 const uint8x16_t targetB_u_8x16 = vcombine_u8(vget_high_u8(revSourceB_u_8x16), vget_low_u8(revSourceB_u_8x16));
3046
3047 vst1q_u8((uint8_t*)(target) + 0, targetB_u_8x16);
3048 vst1q_u8((uint8_t*)(target) + 16, targetA_u_8x16);
3049
3050 source += 16u * tChannels;
3051 }
3052
3053 break;
3054 }
3055
3056 case 3u:
3057 {
3058 for (size_t n = 0; n < blocks16; ++n)
3059 {
3060 target -= 16u * tChannels;
3061
3062 ocean_assert(source >= debugSourceStart && source + 16u * tChannels <= debugSourceEnd);
3063 ocean_assert(target >= debugTargetStart && target + 16u * tChannels <= debugTargetEnd);
3064
3065 const uint8x16x3_t source_u_8x16x3 = vld3q_u8((const uint8_t*)(source));
3066
3067 uint8x16x3_t revSource_u_8x16x3;
3068 revSource_u_8x16x3.val[0] = vcombine_u8(vrev64_u8(vget_high_u8(source_u_8x16x3.val[0])), vrev64_u8(vget_low_u8(source_u_8x16x3.val[0])));
3069 revSource_u_8x16x3.val[1] = vcombine_u8(vrev64_u8(vget_high_u8(source_u_8x16x3.val[1])), vrev64_u8(vget_low_u8(source_u_8x16x3.val[1])));
3070 revSource_u_8x16x3.val[2] = vcombine_u8(vrev64_u8(vget_high_u8(source_u_8x16x3.val[2])), vrev64_u8(vget_low_u8(source_u_8x16x3.val[2])));
3071
3072 vst3q_u8((uint8_t*)(target), revSource_u_8x16x3);
3073
3074 source += 16u * tChannels;
3075 }
3076
3077 break;
3078 }
3079
3080 case 4u:
3081 {
3082 for (size_t n = 0; n < blocks16; ++n)
3083 {
3084 target -= 16u * tChannels;
3085
3086 ocean_assert(source >= debugSourceStart && source + 16u * tChannels <= debugSourceEnd);
3087 ocean_assert(target >= debugTargetStart && target + 16u * tChannels <= debugTargetEnd);
3088
3089 const uint8x16_t sourceA_u_8x16 = vld1q_u8((const uint8_t*)(source) + 0);
3090 const uint8x16_t sourceB_u_8x16 = vld1q_u8((const uint8_t*)(source) + 16);
3091 const uint8x16_t sourceC_u_8x16 = vld1q_u8((const uint8_t*)(source) + 32);
3092 const uint8x16_t sourceD_u_8x16 = vld1q_u8((const uint8_t*)(source) + 48);
3093
3094 const uint8x16_t revSourceA_u_8x16 = vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(sourceA_u_8x16)));
3095 const uint8x16_t revSourceB_u_8x16 = vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(sourceB_u_8x16)));
3096 const uint8x16_t revSourceC_u_8x16 = vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(sourceC_u_8x16)));
3097 const uint8x16_t revSourceD_u_8x16 = vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(sourceD_u_8x16)));
3098
3099 const uint8x16_t targetA_u_8x16 = vcombine_u8(vget_high_u8(revSourceA_u_8x16), vget_low_u8(revSourceA_u_8x16));
3100 const uint8x16_t targetB_u_8x16 = vcombine_u8(vget_high_u8(revSourceB_u_8x16), vget_low_u8(revSourceB_u_8x16));
3101 const uint8x16_t targetC_u_8x16 = vcombine_u8(vget_high_u8(revSourceC_u_8x16), vget_low_u8(revSourceC_u_8x16));
3102 const uint8x16_t targetD_u_8x16 = vcombine_u8(vget_high_u8(revSourceD_u_8x16), vget_low_u8(revSourceD_u_8x16));
3103
3104 vst1q_u8((uint8_t*)(target) + 0, targetD_u_8x16);
3105 vst1q_u8((uint8_t*)(target) + 16, targetC_u_8x16);
3106 vst1q_u8((uint8_t*)(target) + 32, targetB_u_8x16);
3107 vst1q_u8((uint8_t*)(target) + 48, targetA_u_8x16);
3108
3109 source += 16u * tChannels;
3110 }
3111
3112 break;
3113 }
3114
3115 default:
3116 break;
3117 }
3118 }
3119
3120#endif // OCEAN_HARDWARE_NEON_VERSION
3121
3122 while (source != sourceEnd)
3123 {
3124 ocean_assert(source < sourceEnd);
3125
3126 for (unsigned int n = 0u; n < tChannels; ++n)
3127 {
3128 ocean_assert(source + tChannels - n - 1u >= debugSourceStart);
3129 ocean_assert(source + tChannels - n - 1u < debugSourceEnd);
3130
3131 ocean_assert(target > debugTargetStart && target <= debugTargetEnd);
3132
3133 *--target = source[tChannels - n - 1u];
3134 }
3135
3136 source += tChannels;
3137 }
3138}
3139
3140template <typename T, unsigned int tChannels>
3141void FrameChannels::reverseRowPixelOrderInPlace(T* data, const size_t size)
3142{
3143 static_assert(tChannels >= 1u, "Invalid channel number!");
3144
3145 ocean_assert(data != nullptr);
3146 ocean_assert(size >= 1);
3147
3148 using PixelType = typename DataType<T, tChannels>::Type;
3149
3150 size_t n = 0;
3151
3152#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3153
3154 if constexpr (std::is_same<typename TypeMapper<T>::Type, uint8_t>::value)
3155 {
3156 if (size >= 32)
3157 {
3158 const size_t blocks32 = size / size_t(32);
3159
3160 uint8_t* left = (uint8_t*)(data);
3161 uint8_t* right = (uint8_t*)(data) + (size - 16u) * tChannels;
3162
3163 switch (tChannels)
3164 {
3165 case 1u:
3166 {
3167 for (size_t nBlock = 0; nBlock < blocks32; ++nBlock)
3168 {
3169 const uint8x16_t left_u_8x16 = vld1q_u8(left);
3170 const uint8x16_t right_u_8x16 = vld1q_u8(right);
3171
3172 uint8x16_t revLeft_u_8x16 = vrev64q_u8(left_u_8x16);
3173 revLeft_u_8x16 = vcombine_u8(vget_high_u8(revLeft_u_8x16), vget_low_u8(revLeft_u_8x16));
3174
3175 uint8x16_t revRight_u_8x16 = vrev64q_u8(right_u_8x16);
3176 revRight_u_8x16 = vcombine_u8(vget_high_u8(revRight_u_8x16), vget_low_u8(revRight_u_8x16));
3177
3178 vst1q_u8(left, revRight_u_8x16);
3179 vst1q_u8(right, revLeft_u_8x16);
3180
3181 left += 16u * tChannels;
3182 right -= 16u * tChannels;
3183 }
3184
3185 n += blocks32 * 16u;
3186
3187 break;
3188 }
3189
3190 case 2u:
3191 {
3192 for (size_t nBlock = 0; nBlock < blocks32; ++nBlock)
3193 {
3194 const uint8x16x2_t left_u_8x16x2 = vld2q_u8(left);
3195 const uint8x16x2_t right_u_8x16x2 = vld2q_u8(right);
3196
3197 uint8x16x2_t revLeft_u_8x16x2;
3198 revLeft_u_8x16x2.val[0] = vrev64q_u8(left_u_8x16x2.val[0]);
3199 revLeft_u_8x16x2.val[1] = vrev64q_u8(left_u_8x16x2.val[1]);
3200 revLeft_u_8x16x2.val[0] = vcombine_u8(vget_high_u8(revLeft_u_8x16x2.val[0]), vget_low_u8(revLeft_u_8x16x2.val[0]));
3201 revLeft_u_8x16x2.val[1] = vcombine_u8(vget_high_u8(revLeft_u_8x16x2.val[1]), vget_low_u8(revLeft_u_8x16x2.val[1]));
3202
3203 uint8x16x2_t revRight_u_8x16x2;
3204 revRight_u_8x16x2.val[0] = vrev64q_u8(right_u_8x16x2.val[0]);
3205 revRight_u_8x16x2.val[1] = vrev64q_u8(right_u_8x16x2.val[1]);
3206 revRight_u_8x16x2.val[0] = vcombine_u8(vget_high_u8(revRight_u_8x16x2.val[0]), vget_low_u8(revRight_u_8x16x2.val[0]));
3207 revRight_u_8x16x2.val[1] = vcombine_u8(vget_high_u8(revRight_u_8x16x2.val[1]), vget_low_u8(revRight_u_8x16x2.val[1]));
3208
3209 vst2q_u8(left, revRight_u_8x16x2);
3210 vst2q_u8(right, revLeft_u_8x16x2);
3211
3212 left += 16u * tChannels;
3213 right -= 16u * tChannels;
3214 }
3215
3216 n += blocks32 * 16u;
3217
3218 break;
3219 }
3220
3221 case 3u:
3222 {
3223 for (size_t nBlock = 0; nBlock < blocks32; ++nBlock)
3224 {
3225 const uint8x16x3_t left_u_8x16x3 = vld3q_u8(left);
3226 const uint8x16x3_t right_u_8x16x3 = vld3q_u8(right);
3227
3228 uint8x16x3_t revLeft_u_8x16x3;
3229 revLeft_u_8x16x3.val[0] = vrev64q_u8(left_u_8x16x3.val[0]);
3230 revLeft_u_8x16x3.val[1] = vrev64q_u8(left_u_8x16x3.val[1]);
3231 revLeft_u_8x16x3.val[2] = vrev64q_u8(left_u_8x16x3.val[2]);
3232 revLeft_u_8x16x3.val[0] = vcombine_u8(vget_high_u8(revLeft_u_8x16x3.val[0]), vget_low_u8(revLeft_u_8x16x3.val[0]));
3233 revLeft_u_8x16x3.val[1] = vcombine_u8(vget_high_u8(revLeft_u_8x16x3.val[1]), vget_low_u8(revLeft_u_8x16x3.val[1]));
3234 revLeft_u_8x16x3.val[2] = vcombine_u8(vget_high_u8(revLeft_u_8x16x3.val[2]), vget_low_u8(revLeft_u_8x16x3.val[2]));
3235
3236 uint8x16x3_t revRight_u_8x16x3;
3237 revRight_u_8x16x3.val[0] = vrev64q_u8(right_u_8x16x3.val[0]);
3238 revRight_u_8x16x3.val[1] = vrev64q_u8(right_u_8x16x3.val[1]);
3239 revRight_u_8x16x3.val[2] = vrev64q_u8(right_u_8x16x3.val[2]);
3240 revRight_u_8x16x3.val[0] = vcombine_u8(vget_high_u8(revRight_u_8x16x3.val[0]), vget_low_u8(revRight_u_8x16x3.val[0]));
3241 revRight_u_8x16x3.val[1] = vcombine_u8(vget_high_u8(revRight_u_8x16x3.val[1]), vget_low_u8(revRight_u_8x16x3.val[1]));
3242 revRight_u_8x16x3.val[2] = vcombine_u8(vget_high_u8(revRight_u_8x16x3.val[2]), vget_low_u8(revRight_u_8x16x3.val[2]));
3243
3244 vst3q_u8(left, revRight_u_8x16x3);
3245 vst3q_u8(right, revLeft_u_8x16x3);
3246
3247 left += 16u * tChannels;
3248 right -= 16u * tChannels;
3249 }
3250
3251 n += blocks32 * 16u;
3252
3253 break;
3254 }
3255
3256 case 4u:
3257 {
3258 for (size_t nBlock = 0; nBlock < blocks32; ++nBlock)
3259 {
3260 const uint8x16x4_t left_u_8x16x4 = vld4q_u8(left);
3261 const uint8x16x4_t right_u_8x16x4 = vld4q_u8(right);
3262
3263 uint8x16x4_t revLeft_u_8x16x4;
3264 revLeft_u_8x16x4.val[0] = vrev64q_u8(left_u_8x16x4.val[0]);
3265 revLeft_u_8x16x4.val[1] = vrev64q_u8(left_u_8x16x4.val[1]);
3266 revLeft_u_8x16x4.val[2] = vrev64q_u8(left_u_8x16x4.val[2]);
3267 revLeft_u_8x16x4.val[3] = vrev64q_u8(left_u_8x16x4.val[3]);
3268 revLeft_u_8x16x4.val[0] = vcombine_u8(vget_high_u8(revLeft_u_8x16x4.val[0]), vget_low_u8(revLeft_u_8x16x4.val[0]));
3269 revLeft_u_8x16x4.val[1] = vcombine_u8(vget_high_u8(revLeft_u_8x16x4.val[1]), vget_low_u8(revLeft_u_8x16x4.val[1]));
3270 revLeft_u_8x16x4.val[2] = vcombine_u8(vget_high_u8(revLeft_u_8x16x4.val[2]), vget_low_u8(revLeft_u_8x16x4.val[2]));
3271 revLeft_u_8x16x4.val[3] = vcombine_u8(vget_high_u8(revLeft_u_8x16x4.val[3]), vget_low_u8(revLeft_u_8x16x4.val[3]));
3272
3273 uint8x16x4_t revRight_u_8x16x4;
3274 revRight_u_8x16x4.val[0] = vrev64q_u8(right_u_8x16x4.val[0]);
3275 revRight_u_8x16x4.val[1] = vrev64q_u8(right_u_8x16x4.val[1]);
3276 revRight_u_8x16x4.val[2] = vrev64q_u8(right_u_8x16x4.val[2]);
3277 revRight_u_8x16x4.val[3] = vrev64q_u8(right_u_8x16x4.val[3]);
3278 revRight_u_8x16x4.val[0] = vcombine_u8(vget_high_u8(revRight_u_8x16x4.val[0]), vget_low_u8(revRight_u_8x16x4.val[0]));
3279 revRight_u_8x16x4.val[1] = vcombine_u8(vget_high_u8(revRight_u_8x16x4.val[1]), vget_low_u8(revRight_u_8x16x4.val[1]));
3280 revRight_u_8x16x4.val[2] = vcombine_u8(vget_high_u8(revRight_u_8x16x4.val[2]), vget_low_u8(revRight_u_8x16x4.val[2]));
3281 revRight_u_8x16x4.val[3] = vcombine_u8(vget_high_u8(revRight_u_8x16x4.val[3]), vget_low_u8(revRight_u_8x16x4.val[3]));
3282
3283 vst4q_u8(left, revRight_u_8x16x4);
3284 vst4q_u8(right, revLeft_u_8x16x4);
3285
3286 left += 16u * tChannels;
3287 right -= 16u * tChannels;
3288 }
3289
3290 n += blocks32 * 16u;
3291
3292 break;
3293 }
3294
3295 default:
3296 break;
3297 }
3298 }
3299 }
3300
3301#endif
3302
3303 PixelType intermediate;
3304
3305 PixelType* const pixels = (PixelType*)(data);
3306
3307 while (n < size / 2)
3308 {
3309 intermediate = pixels[n];
3310
3311 pixels[n] = pixels[size - n - 1];
3312 pixels[size - n - 1] = intermediate;
3313
3314 ++n;
3315 }
3316}
3317
3318template <typename T, unsigned int tChannels>
3319void FrameChannels::reverseRowChannelOrder(const T* source, T* target, const size_t size, const void* /*options*/)
3320{
3321 ocean_assert(source != nullptr && target != nullptr);
3322 ocean_assert(source != target);
3323 ocean_assert(size >= 1);
3324
3325#ifdef OCEAN_DEBUG
3326 const T* const debugSourceStart = source;
3327 const T* const debugSourceEnd = debugSourceStart + size * tChannels;
3328
3329 const T* const debugTargetStart = target;
3330 const T* const debugTargetEnd = debugTargetStart + size * tChannels;
3331#endif
3332
3333 if constexpr (tChannels == 1)
3334 {
3335 // we actually copy the one channel
3336
3337 memcpy(target, source, sizeof(T) * size);
3338 return;
3339 }
3340
3341 const T* const sourceEnd = source + size * tChannels;
3342
3343#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
3344
3345 if ((std::is_same<typename TypeMapper<T>::Type, uint8_t>::value))
3346 {
3347 const size_t blocks16 = size / size_t(16);
3348
3349 switch (tChannels)
3350 {
3351 case 1u:
3352 ocean_assert(false && "This should have been handled above!");
3353 break;
3354
3355 case 2u:
3356 {
3357 for (size_t n = 0; n < blocks16; ++n)
3358 {
3359 SSE::reverseChannelOrder2Channel8Bit32Elements((const uint8_t*)source, (uint8_t*)target);
3360
3361 source += 16u * tChannels;
3362 target += 16u * tChannels;
3363 }
3364
3365 break;
3366 }
3367
3368 case 3u:
3369 {
3370 for (size_t n = 0; n < blocks16; ++n)
3371 {
3372 SSE::reverseChannelOrder3Channel8Bit48Elements((const uint8_t*)source, (uint8_t*)target);
3373
3374 source += 16u * tChannels;
3375 target += 16u * tChannels;
3376 }
3377
3378 break;
3379 }
3380
3381 case 4u:
3382 {
3383 for (size_t n = 0; n < blocks16; ++n)
3384 {
3385 SSE::reverseChannelOrder4Channel8Bit64Elements((const uint8_t*)source, (uint8_t*)target);
3386
3387 source += 16u * tChannels;
3388 target += 16u * tChannels;
3389 }
3390
3391 break;
3392 }
3393
3394 default:
3395 break;
3396 }
3397 }
3398
3399#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3400
3401 if ((std::is_same<typename TypeMapper<T>::Type, uint8_t>::value))
3402 {
3403 const size_t blocks16 = size / size_t(16);
3404
3405 switch (tChannels)
3406 {
3407 case 1u:
3408 ocean_assert(false && "This should have been handled above!");
3409 break;
3410
3411 case 2u:
3412 {
3413 for (size_t n = 0; n < blocks16; ++n)
3414 {
3415 ocean_assert(source >= debugSourceStart && source + 16u * tChannels <= debugSourceEnd);
3416 ocean_assert(target >= debugTargetStart && target + 16u * tChannels <= debugTargetEnd);
3417
3418 const uint8x16_t sourceA_u_8x16 = vld1q_u8((const uint8_t*)source + 0);
3419 const uint8x16_t sourceB_u_8x16 = vld1q_u8((const uint8_t*)source + 16);
3420
3421 const uint8x16_t revSourceA_u_8x16 = vrev16q_u8(sourceA_u_8x16);
3422 const uint8x16_t revSourceB_u_8x16 = vrev16q_u8(sourceB_u_8x16);
3423
3424 vst1q_u8((uint8_t*)target + 0, revSourceA_u_8x16);
3425 vst1q_u8((uint8_t*)target + 16, revSourceB_u_8x16);
3426
3427 source += 16u * tChannels;
3428 target += 16u * tChannels;
3429 }
3430
3431 break;
3432 }
3433
3434 case 3u:
3435 {
3436 for (size_t n = 0; n < blocks16; ++n)
3437 {
3438 ocean_assert(source >= debugSourceStart && source + 16u * tChannels <= debugSourceEnd);
3439 ocean_assert(target >= debugTargetStart && target + 16u * tChannels <= debugTargetEnd);
3440
3441 const uint8x16x3_t source_u_8x16x3 = vld3q_u8((const uint8_t*)source);
3442
3443 uint8x16x3_t revSource_u_8x16x3;
3444 revSource_u_8x16x3.val[0] = source_u_8x16x3.val[2];
3445 revSource_u_8x16x3.val[1] = source_u_8x16x3.val[1];
3446 revSource_u_8x16x3.val[2] = source_u_8x16x3.val[0];
3447
3448 vst3q_u8((uint8_t*)target, revSource_u_8x16x3);
3449
3450 source += 16u * tChannels;
3451 target += 16u * tChannels;
3452 }
3453
3454 break;
3455 }
3456
3457 case 4u:
3458 {
3459 for (size_t n = 0; n < blocks16; ++n)
3460 {
3461 ocean_assert(source >= debugSourceStart && source + 16u * tChannels <= debugSourceEnd);
3462 ocean_assert(target >= debugTargetStart && target + 16u * tChannels <= debugTargetEnd);
3463
3464 const uint8x16_t sourceA_u_8x16 = vld1q_u8((const uint8_t*)source + 0);
3465 const uint8x16_t sourceB_u_8x16 = vld1q_u8((const uint8_t*)source + 16);
3466 const uint8x16_t sourceC_u_8x16 = vld1q_u8((const uint8_t*)source + 32);
3467 const uint8x16_t sourceD_u_8x16 = vld1q_u8((const uint8_t*)source + 48);
3468
3469 const uint8x16_t revSourceA_u_8x16 = vrev32q_u8(sourceA_u_8x16);
3470 const uint8x16_t revSourceB_u_8x16 = vrev32q_u8(sourceB_u_8x16);
3471 const uint8x16_t revSourceC_u_8x16 = vrev32q_u8(sourceC_u_8x16);
3472 const uint8x16_t revSourceD_u_8x16 = vrev32q_u8(sourceD_u_8x16);
3473
3474 vst1q_u8((uint8_t*)target + 0, revSourceA_u_8x16);
3475 vst1q_u8((uint8_t*)target + 16, revSourceB_u_8x16);
3476 vst1q_u8((uint8_t*)target + 32, revSourceC_u_8x16);
3477 vst1q_u8((uint8_t*)target + 48, revSourceD_u_8x16);
3478
3479 source += 16u * tChannels;
3480 target += 16u * tChannels;
3481 }
3482
3483 break;
3484 }
3485
3486 default:
3487 break;
3488 }
3489 }
3490
3491#endif // OCEAN_HARDWARE_NEON_VERSION
3492
3493 while (source != sourceEnd)
3494 {
3495 ocean_assert(source < sourceEnd);
3496
3497 ocean_assert(source >= debugSourceStart && source + tChannels <= debugSourceEnd);
3498 ocean_assert(target >= debugTargetStart && target + tChannels <= debugTargetEnd);
3499
3500 for (unsigned int n = 0u; n < tChannels; ++n)
3501 {
3502 target[n] = source[tChannels - n - 1u];
3503 }
3504
3505 source += tChannels;
3506 target += tChannels;
3507 }
3508}
3509
3510template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
3511inline void FrameChannels::shuffleRowChannels(const T* source, T* target, const size_t size, const void* /*unusedOptions*/)
3512{
3513 static_assert(tSourceChannels >= 1u && tSourceChannels <= 8u, "Invalid channel number!");
3514 static_assert(tTargetChannels >= 1u && tTargetChannels <= 8u, "Invalid channel number!");
3515
3516 static_assert(tSourceChannels != 1u || tTargetChannels != 1u, "Invalid channel number!");
3517
3518 static_assert(((tShufflePattern & 0x0000000Fu) >> 0u) < tSourceChannels, "Invalid shuffle pattern!");
3519 static_assert(((tShufflePattern & 0x000000F0u) >> 4u) < tSourceChannels, "Invalid shuffle pattern!");
3520 static_assert(((tShufflePattern & 0x00000F00u) >> 8u) < tSourceChannels, "Invalid shuffle pattern!");
3521 static_assert(((tShufflePattern & 0x0000F000u) >> 12u) < tSourceChannels, "Invalid shuffle pattern!");
3522 static_assert(((tShufflePattern & 0x000F0000u) >> 16u) < tSourceChannels, "Invalid shuffle pattern!");
3523 static_assert(((tShufflePattern & 0x00F00000u) >> 20u) < tSourceChannels, "Invalid shuffle pattern!");
3524 static_assert(((tShufflePattern & 0x0F000000u) >> 24u) < tSourceChannels, "Invalid shuffle pattern!");
3525 static_assert(((tShufflePattern & 0xF0000000u) >> 28u) < tSourceChannels, "Invalid shuffle pattern!");
3526
3527 ocean_assert(source != nullptr && target != nullptr);
3528 ocean_assert(size != 0);
3529
3530 const T* const sourceEnd = source + size * tSourceChannels;
3531
3532#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
3533
3534 if ((std::is_same<typename TypeMapper<T>::Type, uint8_t>::value))
3535 {
3536 const size_t blocks16 = size / size_t(16);
3537
3538 switch (tSourceChannels | ((tTargetChannels) << 4u))
3539 {
3540 // 4 -> 4
3541 case (4u | (4u << 4u)):
3542 {
3543 // the following shuffle patterns are known during compile time
3544
3545 constexpr unsigned int offset1 = 0x04040404u;
3546 constexpr unsigned int offset2 = 0x08080808u;
3547 constexpr unsigned int offset3 = 0x0C0C0C0Cu;
3548
3549 // converting shufflePattern16 to shufflePattern16
3550 const unsigned int shufflePattern0 = ((tShufflePattern & 0xF000u) << 12u) | ((tShufflePattern & 0x0F00u) << 8u) | ((tShufflePattern & 0x00F0u) << 4u) | ((tShufflePattern & 0x000Fu) << 0u);
3551
3552 const unsigned int shufflePattern1 = shufflePattern0 + offset1;
3553 const unsigned int shufflePattern2 = shufflePattern0 + offset2;
3554 const unsigned int shufflePattern3 = shufflePattern0 + offset3;
3555
3556 const __m128i shufflePattern128 = SSE::set128i((((unsigned long long)shufflePattern3) << 32ull) | (unsigned long long)shufflePattern2, (((unsigned long long)shufflePattern1) << 32ull) | (unsigned long long)shufflePattern0);
3557
3558 for (size_t n = 0; n < blocks16; ++n)
3559 {
3560 SSE::store128i(_mm_shuffle_epi8(SSE::load128i((const uint8_t*)source + 0), shufflePattern128), (uint8_t*)target + 0);
3561 SSE::store128i(_mm_shuffle_epi8(SSE::load128i((const uint8_t*)source + 16), shufflePattern128), (uint8_t*)target + 16);
3562 SSE::store128i(_mm_shuffle_epi8(SSE::load128i((const uint8_t*)source + 32), shufflePattern128), (uint8_t*)target + 32);
3563 SSE::store128i(_mm_shuffle_epi8(SSE::load128i((const uint8_t*)source + 48), shufflePattern128), (uint8_t*)target + 48);
3564
3565 source += 16u * tSourceChannels;
3566 target += 16u * tTargetChannels;
3567 }
3568
3569 break;
3570 }
3571
3572 default:
3573 // we do not have a NEON-based optimization
3574 break;
3575 }
3576 }
3577
3578#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3579
3580 if ((std::is_same<typename TypeMapper<T>::Type, uint8_t>::value))
3581 {
3582 const size_t blocks16 = size / size_t(16);
3583
3584 switch (tSourceChannels | ((tTargetChannels) << 4u))
3585 {
3586 // 1 -> 3
3587 case (1u | (3u << 4u)):
3588 {
3589 static_assert(tSourceChannels != 1u || tShufflePattern == 0u, "Invalid shuffle patter!");
3590
3591 for (size_t n = 0; n < blocks16; ++n)
3592 {
3593 const uint8x16_t source_u_8x16 = vld1q_u8((const uint8_t*)source);
3594
3595 uint8x16x3_t target_u_8x16x3;
3596
3597 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3598 {
3599 target_u_8x16x3.val[nT] = source_u_8x16;
3600 }
3601
3602 vst3q_u8((uint8_t*)target, target_u_8x16x3);
3603
3604 source += 16u * tSourceChannels;
3605 target += 16u * tTargetChannels;
3606 }
3607
3608 break;
3609 }
3610
3611 // 2 -> 1
3612 case (2u | (1u << 4u)):
3613 {
3614 for (size_t n = 0; n < blocks16; ++n)
3615 {
3616 const uint8x16x2_t source_u_8x16x2 = vld2q_u8((const uint8_t*)source);
3617
3618 constexpr unsigned int sourceChannel = tShufflePattern & 0x00000001u; // possible index values {0, 1}
3619 static_assert(sourceChannel <= 1u, "Invalid shuffle pattern!");
3620 ocean_assert(sourceChannel == (tShufflePattern & 0x0000000Fu));
3621
3622 const uint8x16_t target_u_8x16 = source_u_8x16x2.val[sourceChannel];
3623
3624 vst1q_u8((uint8_t*)target, target_u_8x16);
3625
3626 source += 16u * tSourceChannels;
3627 target += 16u * tTargetChannels;
3628 }
3629
3630 break;
3631 }
3632
3633 // 2 -> 3
3634 case (2u | (3u << 4u)):
3635 {
3636 for (size_t n = 0; n < blocks16; ++n)
3637 {
3638 const uint8x16x2_t source_u_8x16x2 = vld2q_u8((const uint8_t*)source);
3639
3640 uint8x16x3_t target_u_8x16x3;
3641
3642 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3643 {
3644 ocean_assert(((tShufflePattern >> (nT * 4u)) & 0x00000001u) == ((tShufflePattern >> (nT * 4u)) & 0x0000000Fu));
3645
3646 target_u_8x16x3.val[nT] = source_u_8x16x2.val[(tShufflePattern >> (nT * 4u)) & 0x00000001u]; // possible index values {0, 1}
3647 }
3648
3649 vst3q_u8((uint8_t*)target, target_u_8x16x3);
3650
3651 source += 16u * tSourceChannels;
3652 target += 16u * tTargetChannels;
3653 }
3654
3655 break;
3656 }
3657
3658 // 2 -> 4
3659 case (2u | (4u << 4u)):
3660 {
3661 for (size_t n = 0; n < blocks16; ++n)
3662 {
3663 const uint8x16x2_t source_u_8x16x2 = vld2q_u8((const uint8_t*)source);
3664
3665 uint8x16x4_t target_u_8x16x4;
3666
3667 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3668 {
3669 ocean_assert(((tShufflePattern >> (nT * 4u)) & 0x00000001u) == ((tShufflePattern >> (nT * 4u)) & 0x0000000Fu));
3670
3671 target_u_8x16x4.val[nT] = source_u_8x16x2.val[(tShufflePattern >> (nT * 4u)) & 0x00000001u]; // possible index values {0, 1}
3672 }
3673
3674 vst4q_u8((uint8_t*)target, target_u_8x16x4);
3675
3676 source += 16u * tSourceChannels;
3677 target += 16u * tTargetChannels;
3678 }
3679
3680 break;
3681 }
3682
3683 // 3 -> 1
3684 case (3u | (1u << 4u)):
3685 {
3686 constexpr unsigned int sourceChannel = (tShufflePattern & 0x0000000Fu) <= 2u ? (tShufflePattern & 0x0000000Fu) : 2u; // possible index values {0, 1, 2}
3687 ocean_assert(sourceChannel == (tShufflePattern & 0x0000000Fu));
3688
3689 for (size_t n = 0; n < blocks16; ++n)
3690 {
3691 const uint8x16x3_t source_u_8x16x3 = vld3q_u8((const uint8_t*)source);
3692
3693 const uint8x16_t target_u_8x16 = source_u_8x16x3.val[sourceChannel];
3694
3695 vst1q_u8((uint8_t*)target, target_u_8x16);
3696
3697 source += 16u * tSourceChannels;
3698 target += 16u * tTargetChannels;
3699 }
3700
3701 break;
3702 }
3703
3704 // 3 -> 2
3705 case (3u | (2u << 4u)):
3706 {
3707 for (size_t n = 0; n < blocks16; ++n)
3708 {
3709 const uint8x16x3_t source_u_8x16x3 = vld3q_u8((const uint8_t*)source);
3710
3711 uint8x16x2_t target_u_8x16x2;
3712
3713 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3714 {
3715 target_u_8x16x2.val[nT] = source_u_8x16x3.val[std::min((tShufflePattern >> (nT * 4u)) & 0x0000000Fu, 2u)]; // possible index values {0, 1, 2}
3716 }
3717
3718 vst2q_u8((uint8_t*)target, target_u_8x16x2);
3719
3720 source += 16u * tSourceChannels;
3721 target += 16u * tTargetChannels;
3722 }
3723
3724 break;
3725 }
3726
3727 // 3 -> 3
3728 case (3u | (3u << 4u)):
3729 {
3730 for (size_t n = 0; n < blocks16; ++n)
3731 {
3732 const uint8x16x3_t source_u_8x16x3 = vld3q_u8((const uint8_t*)source);
3733
3734 uint8x16x3_t target_u_8x16x3;
3735
3736 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3737 {
3738 target_u_8x16x3.val[nT] = source_u_8x16x3.val[std::min((tShufflePattern >> (nT * 4u)) & 0x0000000Fu, 2u)]; // possible index values {0, 1, 2}
3739 }
3740
3741 vst3q_u8((uint8_t*)target, target_u_8x16x3);
3742
3743 source += 16u * tSourceChannels;
3744 target += 16u * tTargetChannels;
3745 }
3746
3747 break;
3748 }
3749
3750 // 4 -> 1
3751 case (4u | (1u << 4u)):
3752 {
3753 for (size_t n = 0; n < blocks16; ++n)
3754 {
3755 const uint8x16x4_t source_u_8x16x4 = vld4q_u8((const uint8_t*)source);
3756
3757 constexpr unsigned int sourceChannel = tShufflePattern & 0x00000003u; // possible index values {0, 1, 2, 3}
3758 static_assert(sourceChannel <= 3u, "Invalid shuffle pattern!");
3759
3760 ocean_assert(sourceChannel == (tShufflePattern & 0x0000000Fu));
3761
3762 const uint8x16_t target_u_8x16 = source_u_8x16x4.val[sourceChannel];
3763
3764 vst1q_u8((uint8_t*)target, target_u_8x16);
3765
3766 source += 16u * tSourceChannels;
3767 target += 16u * tTargetChannels;
3768 }
3769
3770 break;
3771 }
3772
3773 // 4 -> 2
3774 case (4u | (2u << 4u)):
3775 {
3776 for (size_t n = 0; n < blocks16; ++n)
3777 {
3778 const uint8x16x4_t source_u_8x16x4 = vld4q_u8((const uint8_t*)source);
3779
3780 uint8x16x2_t target_u_8x16x2;
3781
3782 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3783 {
3784 ocean_assert(((tShufflePattern >> (nT * 4u)) & 0x00000003u) == ((tShufflePattern >> (nT * 4u)) & 0x0000000Fu));
3785
3786 target_u_8x16x2.val[nT] = source_u_8x16x4.val[(tShufflePattern >> (nT * 4u)) & 0x00000003u]; // possible index values {0, 1, 2, 3}
3787 }
3788
3789 vst2q_u8((uint8_t*)target, target_u_8x16x2);
3790
3791 source += 16u * tSourceChannels;
3792 target += 16u * tTargetChannels;
3793 }
3794
3795 break;
3796 }
3797
3798 // 4 -> 3
3799 case (4u | (3u << 4u)):
3800 {
3801 for (size_t n = 0; n < blocks16; ++n)
3802 {
3803 const uint8x16x4_t source_u_8x16x4 = vld4q_u8((const uint8_t*)source);
3804
3805 uint8x16x3_t target_u_8x16x3;
3806
3807 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3808 {
3809 ocean_assert(((tShufflePattern >> (nT * 4u)) & 0x00000003u) == ((tShufflePattern >> (nT * 4u)) & 0x0000000Fu));
3810
3811 target_u_8x16x3.val[nT] = source_u_8x16x4.val[(tShufflePattern >> (nT * 4u)) & 0x00000003u]; // possible index values {0, 1, 2, 3}
3812 }
3813
3814 vst3q_u8((uint8_t*)target, target_u_8x16x3);
3815
3816 source += 16u * tSourceChannels;
3817 target += 16u * tTargetChannels;
3818 }
3819
3820 break;
3821 }
3822
3823 // 4 -> 4
3824 case (4u | (4u << 4u)):
3825 {
3826 for (size_t n = 0; n < blocks16; ++n)
3827 {
3828 const uint8x16x4_t source_u_8x16x4 = vld4q_u8((const uint8_t*)source);
3829
3830 uint8x16x4_t target_u_8x16x4;
3831
3832 for (unsigned int nT = 0u; nT < tTargetChannels; ++nT)
3833 {
3834 ocean_assert(((tShufflePattern >> (nT * 4u)) & 0x00000003u) == ((tShufflePattern >> (nT * 4u)) & 0x0000000Fu));
3835
3836 target_u_8x16x4.val[nT] = source_u_8x16x4.val[(tShufflePattern >> (nT * 4u)) & 0x00000003u]; // possible index values {0, 1, 2, 3}
3837 }
3838
3839 vst4q_u8((uint8_t*)target, target_u_8x16x4);
3840
3841 source += 16u * tSourceChannels;
3842 target += 16u * tTargetChannels;
3843 }
3844
3845 break;
3846 }
3847
3848 default:
3849 // we do not have a NEON-based optimization
3850 break;
3851 }
3852 }
3853
3854#endif
3855
3856 while (source != sourceEnd)
3857 {
3858 ocean_assert(source < sourceEnd);
3859
3860 for (unsigned int n = 0u; n < tTargetChannels; ++n)
3861 {
3862 target[n] = source[(tShufflePattern >> (n * 4u)) & 0x0000000Fu];
3863 }
3864
3865 source += tSourceChannels;
3866 target += tTargetChannels;
3867 }
3868}
3869
3870template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
3871inline void FrameChannels::shuffleRowChannelsAndSetLastChannelValue(const T* source, T* target, const size_t size, const void* options)
3872{
3873 static_assert(tSourceChannels >= 1u && tSourceChannels <= 8u, "Invalid channel number!");
3874 static_assert(tTargetChannels >= 2u && tTargetChannels <= 8u, "Invalid channel number!");
3875
3876 static_assert(((tShufflePattern & 0x0000000Fu) >> 0u) < tSourceChannels, "Invalid shuffle pattern!");
3877 static_assert(((tShufflePattern & 0x000000F0u) >> 4u) < tSourceChannels, "Invalid shuffle pattern!");
3878 static_assert(((tShufflePattern & 0x00000F00u) >> 8u) < tSourceChannels, "Invalid shuffle pattern!");
3879 static_assert(((tShufflePattern & 0x0000F000u) >> 12u) < tSourceChannels, "Invalid shuffle pattern!");
3880 static_assert(((tShufflePattern & 0x000F0000u) >> 16u) < tSourceChannels, "Invalid shuffle pattern!");
3881 static_assert(((tShufflePattern & 0x00F00000u) >> 20u) < tSourceChannels, "Invalid shuffle pattern!");
3882 static_assert(((tShufflePattern & 0x0F000000u) >> 24u) < tSourceChannels, "Invalid shuffle pattern!");
3883 static_assert(((tShufflePattern & 0xF0000000u) >> 28u) < tSourceChannels, "Invalid shuffle pattern!");
3884
3885 ocean_assert(source != nullptr && target != nullptr);
3886 ocean_assert(size != 0);
3887
3888 ocean_assert(options != nullptr);
3889
3890 const T lastChannelValue = *(const T*)(options);
3891
3892 const T* const sourceEnd = source + size * tSourceChannels;
3893
3894#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3895
3896 if ((std::is_same<typename TypeMapper<T>::Type, uint8_t>::value))
3897 {
3898 const size_t blocks16 = size / size_t(16);
3899
3900 switch (tSourceChannels | ((tTargetChannels) << 4u))
3901 {
3902 // 1 -> 4
3903 case (1u | (4u << 4u)):
3904 {
3905 ocean_assert(tShufflePattern == 0u);
3906
3907 const uint8x16_t lastChannelValue_u_8x16 = vmovq_n_u8(lastChannelValue);
3908
3909 uint8x16x4_t target_u_8x16x4;
3910 target_u_8x16x4.val[3] = lastChannelValue_u_8x16;
3911
3912 for (size_t n = 0; n < blocks16; ++n)
3913 {
3914 const uint8x16_t source_u_8x16 = vld1q_u8((const uint8_t*)source);
3915
3916 for (unsigned int nT = 0u; nT < tTargetChannels - 1u; ++nT)
3917 {
3918 target_u_8x16x4.val[nT] = source_u_8x16;
3919 }
3920
3921 vst4q_u8((uint8_t*)target, target_u_8x16x4);
3922
3923 source += 16u * tSourceChannels;
3924 target += 16u * tTargetChannels;
3925 }
3926
3927 break;
3928 }
3929
3930 // 3 -> 4
3931 case (3u | (4u << 4u)):
3932 {
3933 const uint8x16_t lastChannelValue_u_8x16 = vmovq_n_u8(lastChannelValue);
3934
3935 uint8x16x4_t target_u_8x16x4;
3936 target_u_8x16x4.val[3] = lastChannelValue_u_8x16;
3937
3938 for (size_t n = 0; n < blocks16; ++n)
3939 {
3940 const uint8x16x3_t source_u_8x16x3 = vld3q_u8((const uint8_t*)source);
3941
3942 for (unsigned int nT = 0u; nT < tTargetChannels - 1u; ++nT)
3943 {
3944 target_u_8x16x4.val[nT] = source_u_8x16x3.val[std::min((tShufflePattern >> (nT * 4u)) & 0x0000000Fu, 2u)]; // possible index values {0, 1, 2}
3945 }
3946
3947 vst4q_u8((uint8_t*)target, target_u_8x16x4);
3948
3949 source += 16u * tSourceChannels;
3950 target += 16u * tTargetChannels;
3951 }
3952
3953 break;
3954 }
3955
3956 // 4 -> 4
3957 case (4u | (4u << 4u)):
3958 {
3959 const uint8x16_t lastChannelValue_u_8x16 = vmovq_n_u8(lastChannelValue);
3960
3961 uint8x16x4_t target_u_8x16x4;
3962 target_u_8x16x4.val[3] = lastChannelValue_u_8x16;
3963
3964 for (size_t n = 0; n < blocks16; ++n)
3965 {
3966 const uint8x16x4_t source_u_8x16x4 = vld4q_u8((const uint8_t*)source);
3967
3968 for (unsigned int nT = 0u; nT < tTargetChannels - 1u; ++nT)
3969 {
3970 target_u_8x16x4.val[nT] = source_u_8x16x4.val[std::min((tShufflePattern >> (nT * 4u)) & 0x0000000Fu, 3u)]; // possible index values {0, 1, 2, 3}
3971 }
3972
3973 vst4q_u8((uint8_t*)target, target_u_8x16x4);
3974
3975 source += 16u * tSourceChannels;
3976 target += 16u * tTargetChannels;
3977 }
3978
3979 break;
3980 }
3981
3982 default:
3983 // we do not have a NEON-based optimization
3984 break;
3985 }
3986 }
3987
3988#endif
3989
3990 while (source != sourceEnd)
3991 {
3992 ocean_assert(source < sourceEnd);
3993
3994 for (unsigned int n = 0u; n < tTargetChannels - 1u; ++n)
3995 {
3996 target[n] = source[(tShufflePattern >> (n * 4u)) & 0x0000000Fu];
3997 target[tTargetChannels - 1u] = lastChannelValue;
3998 }
3999
4000 source += tSourceChannels;
4001 target += tTargetChannels;
4002 }
4003}
4004
4005template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
4006inline void FrameChannels::shuffleChannels(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
4007{
4008 static_assert(tSourceChannels >= 1u && tSourceChannels <= 8u, "Invalid channel number!");
4009 static_assert(tTargetChannels >= 1u && tTargetChannels <= 8u, "Invalid channel number!");
4010
4011 static_assert(tSourceChannels != 1u || tTargetChannels != 1u, "Invalid channel number!");
4012
4013 static_assert(((tShufflePattern & 0x0000000Fu) >> 0u) < tSourceChannels, "Invalid shuffle pattern!");
4014 static_assert(((tShufflePattern & 0x000000F0u) >> 4u) < tSourceChannels, "Invalid shuffle pattern!");
4015 static_assert(((tShufflePattern & 0x00000F00u) >> 8u) < tSourceChannels, "Invalid shuffle pattern!");
4016 static_assert(((tShufflePattern & 0x0000F000u) >> 12u) < tSourceChannels, "Invalid shuffle pattern!");
4017 static_assert(((tShufflePattern & 0x000F0000u) >> 16u) < tSourceChannels, "Invalid shuffle pattern!");
4018 static_assert(((tShufflePattern & 0x00F00000u) >> 20u) < tSourceChannels, "Invalid shuffle pattern!");
4019 static_assert(((tShufflePattern & 0x0F000000u) >> 24u) < tSourceChannels, "Invalid shuffle pattern!");
4020 static_assert(((tShufflePattern & 0xF0000000u) >> 28u) < tSourceChannels, "Invalid shuffle pattern!");
4021
4022 ocean_assert(source != nullptr && target != nullptr);
4023 ocean_assert(width >= 1u && height >= 1u);
4024
4025 const unsigned int sourceStrideElements = width * tSourceChannels + sourcePaddingElements;
4026 const unsigned int targetStrideElements = width * tTargetChannels + targetPaddingElements;
4027
4028 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
4029
4030 FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, conversionFlag, FrameChannels::shuffleRowChannels<T, tSourceChannels, tTargetChannels, tShufflePattern>, FrameChannels::reverseRowPixelOrderInPlace<T, tTargetChannels>, areContinuous, nullptr, worker);
4031}
4032
4033template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tShufflePattern>
4034inline void FrameChannels::shuffleChannelsAndSetLastChannelValue(const T* source, const T newChannelValue, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
4035{
4036 static_assert(tSourceChannels >= 1u && tSourceChannels <= 8u, "Invalid channel number!");
4037 static_assert(tTargetChannels >= 2u && tTargetChannels <= 8u, "Invalid channel number!");
4038
4039 static_assert(((tShufflePattern & 0x0000000Fu) >> 0u) < tSourceChannels, "Invalid shuffle pattern!");
4040 static_assert(((tShufflePattern & 0x000000F0u) >> 4u) < tSourceChannels, "Invalid shuffle pattern!");
4041 static_assert(((tShufflePattern & 0x00000F00u) >> 8u) < tSourceChannels, "Invalid shuffle pattern!");
4042 static_assert(((tShufflePattern & 0x0000F000u) >> 12u) < tSourceChannels, "Invalid shuffle pattern!");
4043 static_assert(((tShufflePattern & 0x000F0000u) >> 16u) < tSourceChannels, "Invalid shuffle pattern!");
4044 static_assert(((tShufflePattern & 0x00F00000u) >> 20u) < tSourceChannels, "Invalid shuffle pattern!");
4045 static_assert(((tShufflePattern & 0x0F000000u) >> 24u) < tSourceChannels, "Invalid shuffle pattern!");
4046 static_assert(((tShufflePattern & 0xF0000000u) >> 28u) < tSourceChannels, "Invalid shuffle pattern!");
4047
4048 ocean_assert(source != nullptr && target != nullptr);
4049 ocean_assert(width >= 1u && height >= 1u);
4050
4051 const unsigned int sourceStrideElements = width * tSourceChannels + sourcePaddingElements;
4052 const unsigned int targetStrideElements = width * tTargetChannels + targetPaddingElements;
4053
4054 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
4055
4056 const T options = newChannelValue;
4057
4058 FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, conversionFlag, FrameChannels::shuffleRowChannelsAndSetLastChannelValue<T, tSourceChannels, tTargetChannels, tShufflePattern>, FrameChannels::reverseRowPixelOrderInPlace<T, tTargetChannels>, areContinuous, &options, worker);
4059}
4060
4061template <unsigned int tChannels>
4062inline void FrameChannels::narrow16BitPerChannelTo8BitPerChannel(const uint16_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
4063{
4064 static_assert(tChannels >= 1u, "Invalid channel number!");
4065
4066 ocean_assert(source != nullptr && target != nullptr);
4067 ocean_assert(width >= 1u && height >= 1u);
4068
4069 const unsigned int sourceStrideElements = width * tChannels + sourcePaddingElements;
4070 const unsigned int targetStrideElements = width * tChannels + targetPaddingElements;
4071
4072 const bool areContinuous = sourcePaddingElements == 0u && targetPaddingElements == 0u;
4073
4074 FrameConverter::convertGenericPixelFormat(source, target, width, height, sourceStrideElements, targetStrideElements, conversionFlag, FrameChannels::narrowRow16BitPerChannelTo8BitPerChannel<tChannels>, FrameChannels::reverseRowPixelOrderInPlace<uint8_t, tChannels>, areContinuous, nullptr, worker);
4075}
4076
4077template <typename T, unsigned int tChannels, void (*tPixelFunction)(const T*, T*)>
4078void FrameChannels::applyPixelModifier(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, Worker* worker)
4079{
4080 static_assert(tChannels > 0u, "Invalid channel number!");
4081
4082 ocean_assert(source && target);
4083 ocean_assert(width != 0u && height != 0u);
4084
4085 if (worker)
4086 {
4087 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::applyPixelModifierSubset<T, tChannels, tPixelFunction>, source, target, width, height, conversionFlag, 0u, 0u), 0u, height);
4088 }
4089 else
4090 {
4091 applyPixelModifierSubset<T, tChannels, tPixelFunction>(source, target, width, height, conversionFlag, 0u, height);
4092 }
4093}
4094
4095template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tPixelFunction)(const TSource*, TTarget*)>
4096void FrameChannels::applyAdvancedPixelModifier(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, Worker* worker)
4097{
4098 static_assert(tSourceChannels > 0u, "Invalid source channel number!");
4099 static_assert(tTargetChannels > 0u, "Invalid target channel number!");
4100
4101 ocean_assert(source && target);
4102 ocean_assert(width != 0u && height != 0u);
4103
4104 if (worker)
4105 {
4106 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::applyAdvancedPixelModifierSubset<TSource, TTarget, tSourceChannels, tTargetChannels, tPixelFunction>, source, target, width, height, sourcePaddingElements, targetPaddingElements, conversionFlag, 0u, 0u), 0u, height);
4107 }
4108 else
4109 {
4110 applyAdvancedPixelModifierSubset<TSource, TTarget, tSourceChannels, tTargetChannels, tPixelFunction>(source, target, width, height, sourcePaddingElements, targetPaddingElements, conversionFlag, 0u, height);
4111 }
4112}
4113
4114template <typename TSource0, typename TSource1, typename TTarget, typename TIntermediate, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tOperator)(const TSource0*, const TSource1*, TTarget*)>
4115void FrameChannels::applyBivariateOperator(const TSource0* source0, const TSource1* source1, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int source0PaddingElements, const unsigned int source1PaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, Worker* worker)
4116{
4117 static_assert(tSourceChannels > 0u, "Invalid source channel number!");
4118 static_assert(tTargetChannels > 0u, "Invalid target channel number!");
4119
4120 ocean_assert(source0 && source1 && target);
4121 ocean_assert(width != 0u && height != 0u);
4122
4123 if (worker)
4124 {
4125 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::applyBivariateOperatorSubset<TSource0, TSource1, TTarget, TIntermediate, tSourceChannels, tTargetChannels, tOperator>, source0, source1, target, width, height, source0PaddingElements, source1PaddingElements, targetPaddingElements, conversionFlag, 0u, 0u), 0u, height);
4126 }
4127 else
4128 {
4129 FrameChannels::applyBivariateOperatorSubset<TSource0, TSource1, TTarget, TIntermediate, tSourceChannels, tTargetChannels, tOperator>(source0, source1, target, width, height, source0PaddingElements, source1PaddingElements, targetPaddingElements, conversionFlag, 0u, height);
4130 }
4131}
4132
4133template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels>
4134void FrameChannels::applyRowOperator(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const RowOperatorFunction<TSource, TTarget, tSourceChannels, tTargetChannels>& rowOperatorFunction, Worker* worker)
4135{
4136 static_assert(tSourceChannels > 0u, "Invalid source channel number!");
4137 static_assert(tTargetChannels > 0u, "Invalid target channel number!");
4138
4139 ocean_assert(source != nullptr && target != nullptr);
4140 ocean_assert(width != 0u && height != 0u);
4141
4142 const unsigned int sourceStrideElements = width * tSourceChannels + sourcePaddingElements;
4143 const unsigned int targetStrideElements = width * tTargetChannels + targetPaddingElements;
4144
4145 if (worker)
4146 {
4147 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::applyRowOperatorSubset<TSource, TTarget, tSourceChannels, tTargetChannels>, source, target, width, height, sourceStrideElements, targetStrideElements, rowOperatorFunction, 0u, 0u), 0u, height);
4148 }
4149 else
4150 {
4151 applyRowOperatorSubset<TSource, TTarget, tSourceChannels, tTargetChannels>(source, target, width, height, sourceStrideElements, targetStrideElements, rowOperatorFunction, 0u, height);
4152 }
4153}
4154
4155template <typename T, unsigned int tChannels>
4156inline void FrameChannels::transformGeneric(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
4157{
4158 ocean_assert(source != nullptr && target != nullptr);
4159 ocean_assert(width >= 1u && height >= 1u);
4160
4161 const unsigned int bytesPerRow = width * sizeof(T) * tChannels;
4162
4163 const unsigned int sourceStrideBytes = width * sizeof(T) * tChannels + sizeof(T) * sourcePaddingElements;
4164 const unsigned int targetStrideBytes = width * sizeof(T) * tChannels + sizeof(T) * targetPaddingElements;
4165
4166 using MappedType = typename TypeMapper<T>::Type;
4167
4168 const RowReversePixelOrderFunction<void> rowReversePixelOrderFunction = (const RowReversePixelOrderFunction<void>)(FrameChannels::reverseRowPixelOrder<MappedType, tChannels>);
4169
4170 if (worker && height > 200u)
4171 {
4172 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::transformGenericSubset, (const uint8_t*)(source), (uint8_t*)(target), width, height, conversionFlag, rowReversePixelOrderFunction, bytesPerRow, sourceStrideBytes, targetStrideBytes, 0u, 0u), 0u, height, 9u, 10u, 20u);
4173 }
4174 else
4175 {
4176 transformGenericSubset((const uint8_t*)(source), (uint8_t*)(target), width, height, conversionFlag, rowReversePixelOrderFunction, bytesPerRow, sourceStrideBytes, targetStrideBytes, 0u, height);
4177 }
4178}
4179
4180template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
4181void FrameChannels::premultipliedAlphaToStraightAlpha8BitPerChannel(uint8_t* const frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, Worker* worker)
4182{
4183 static_assert(tChannels >= 2u, "Invalid channel number!");
4184 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
4185
4186 ocean_assert(frame != nullptr);
4187 ocean_assert(width >= 1u && height >= 1u);
4188
4189 if (worker && height > 200u)
4190 {
4191 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::premultipliedAlphaToStraightAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>, frame, width, framePaddingElements, 0u, 0u), 0u, height, 3u, 4u, 20u);
4192 }
4193 else
4194 {
4195 premultipliedAlphaToStraightAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>(frame, width, framePaddingElements, 0u, height);
4196 }
4197}
4198
4199template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
4200void FrameChannels::premultipliedAlphaToStraightAlpha8BitPerChannel(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
4201{
4202 static_assert(tChannels >= 2u, "Invalid channel number!");
4203 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
4204
4205 ocean_assert(source != nullptr && target != nullptr);
4206 ocean_assert(width >= 1u && height >= 1u);
4207
4208 if (worker && height > 200u)
4209 {
4210 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::premultipliedAlphaToStraightAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>, source, target, width, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, height, 5u, 6u, 20u);
4211 }
4212 else
4213 {
4214 premultipliedAlphaToStraightAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>(source, target, width, sourcePaddingElements, targetPaddingElements, 0u, height);
4215 }
4216}
4217
4218template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
4219void FrameChannels::straightAlphaToPremultipliedAlpha8BitPerChannel(uint8_t* const frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, Worker* worker)
4220{
4221 static_assert(tChannels >= 2u, "Invalid channel number!");
4222 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
4223
4224 ocean_assert(frame != nullptr);
4225 ocean_assert(width >= 1u && height >= 1u);
4226
4227 if (worker && height > 200u)
4228 {
4229 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::straightAlphaToPremultipliedAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>, frame, width, framePaddingElements, 0u, 0u), 0u, height, 3u, 4u, 20u);
4230 }
4231 else
4232 {
4233 straightAlphaToPremultipliedAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>(frame, width, framePaddingElements, 0u, height);
4234 }
4235}
4236
4237template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
4238void FrameChannels::straightAlphaToPremultipliedAlpha8BitPerChannel(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
4239{
4240 static_assert(tChannels >= 2u, "Invalid channel number!");
4241 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
4242
4243 ocean_assert(source != nullptr && target != nullptr);
4244 ocean_assert(width >= 1u && height >= 1u);
4245
4246 if (worker && height > 200u)
4247 {
4248 worker->executeFunction(Worker::Function::createStatic(&FrameChannels::straightAlphaToPremultipliedAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>, source, target, width, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, height, 5u, 6u, 20u);
4249 }
4250 else
4251 {
4252 straightAlphaToPremultipliedAlpha8BitPerChannelSubset<tChannels, tAlphaChannelIndex>(source, target, width, sourcePaddingElements, targetPaddingElements, 0u, height);
4253 }
4254}
4255
4256template <unsigned int tChannels>
4257void FrameChannels::narrowRow16BitPerChannelTo8BitPerChannel(const uint16_t* source, uint8_t* target, const size_t size, const void* /* unusedParameters */)
4258{
4259 static_assert(tChannels >= 1u, "Invalid channel number!");
4260
4261 ocean_assert(source != nullptr && target != nullptr);
4262 ocean_assert(size > 0);
4263
4264 const uint16_t* const sourceEnd = source + size * tChannels;
4265
4266#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
4267
4268 const size_t blocks8 = size / size_t(8);
4269
4270 switch (tChannels)
4271 {
4272 case 4u:
4273 {
4274 for (size_t n = 0; n < blocks8; ++n)
4275 {
4276 const uint16x8_t sourceA_u_16x8 = vld1q_u16(source + 0);
4277 const uint16x8_t sourceB_u_16x8 = vld1q_u16(source + 8);
4278 const uint16x8_t sourceC_u_16x8 = vld1q_u16(source + 16);
4279 const uint16x8_t sourceD_u_16x8 = vld1q_u16(source + 24);
4280
4281 const uint8x16_t targetAB_u_8x16 = vcombine_u8(vqrshrn_n_u16(sourceA_u_16x8, 8), vqrshrn_n_u16(sourceB_u_16x8, 8)); // narrowing rounded right shift: target = (source + 128) / 256
4282 const uint8x16_t targetCD_u_8x16 = vcombine_u8(vqrshrn_n_u16(sourceC_u_16x8, 8), vqrshrn_n_u16(sourceD_u_16x8, 8));
4283
4284 vst1q_u8(target + 0, targetAB_u_8x16);
4285 vst1q_u8(target + 16, targetCD_u_8x16);
4286
4287 source += 8u * tChannels;
4288 target += 8u * tChannels;
4289 }
4290
4291 break;
4292 }
4293
4294 default:
4295 break;
4296 }
4297
4298#endif
4299
4300 while (source != sourceEnd)
4301 {
4302 ocean_assert(source < sourceEnd);
4303
4304 for (unsigned int n = 0u; n < tChannels; ++n)
4305 {
4306 ocean_assert((uint16_t)(source[n] >> 8u) <= 255u);
4307 target[n] = (uint8_t)(source[n] >> 8u);
4308 }
4309
4310 source += tChannels;
4311 target += tChannels;
4312 }
4313}
4314
4315template <typename T, unsigned int tSourceChannels, bool tAddToFront>
4316void FrameChannels::addChannelRow(const void** sources, void** targets, const unsigned int multipleRowIndex, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const void* options)
4317{
4318 static_assert(tSourceChannels != 0u, "Invalid channel number!");
4319 static_assert(sizeof(size_t) == sizeof(const T*), "Invalid pointer size!");
4320
4321 ocean_assert(sources != nullptr && targets != nullptr);
4322 ocean_assert(width != 0u && height != 0u);
4323 ocean_assert(multipleRowIndex < height);
4324 ocean_assert(options != nullptr);
4325
4326 const T* source = (const T*)(sources[0]);
4327 const T* sourceOneChannel = (const T*)(sources[1]);
4328 ocean_assert(source != nullptr && sourceOneChannel != nullptr);
4329
4330 T* target = (T*)(targets[0]);
4331 ocean_assert(target != nullptr);
4332
4333 const unsigned int* uintOptions = (const unsigned int*)options;
4334 ocean_assert(uintOptions != nullptr);
4335
4336 const unsigned int sourcePaddingElements = uintOptions[0];
4337 const unsigned int sourceOneChannelPaddingElements = uintOptions[1];
4338 const unsigned int targetPaddingElements = uintOptions[2];
4339
4340 const unsigned int targetChannels = tSourceChannels + 1u;
4341
4342 const unsigned int sourceStrideElements = tSourceChannels * width + sourcePaddingElements;
4343 const unsigned int sourceOneChannelStrideElements = width + sourceOneChannelPaddingElements;
4344 const unsigned int targetStrideElements = targetChannels * width + targetPaddingElements;
4345
4346 const bool flipTarget = conversionFlag == CONVERT_FLIPPED || conversionFlag == CONVERT_FLIPPED_AND_MIRRORED;
4347 const bool mirrorTarget = conversionFlag == CONVERT_MIRRORED || conversionFlag == CONVERT_FLIPPED_AND_MIRRORED;
4348
4349 const T* sourceRow = source + sourceStrideElements * multipleRowIndex;
4350 const T* sourceOneChannelRow = sourceOneChannel + sourceOneChannelStrideElements * multipleRowIndex;
4351 T* targetRow = flipTarget ? target + targetStrideElements * (height - multipleRowIndex - 1u) : target + targetStrideElements * multipleRowIndex;
4352
4353 if (mirrorTarget == false)
4354 {
4355 for (unsigned int n = 0u; n < width; ++n)
4356 {
4357 if constexpr (tAddToFront)
4358 {
4359 targetRow[0] = sourceOneChannelRow[0];
4360
4361 for (unsigned int c = 0u; c < tSourceChannels; ++c)
4362 {
4363 targetRow[c + 1u] = sourceRow[c];
4364 }
4365 }
4366 else
4367 {
4368 for (unsigned int c = 0u; c < tSourceChannels; ++c)
4369 {
4370 targetRow[c] = sourceRow[c];
4371 }
4372
4373 targetRow[tSourceChannels] = sourceOneChannelRow[0];
4374 }
4375
4376 sourceRow += tSourceChannels;
4377 sourceOneChannelRow++;
4378
4379 targetRow += targetChannels;
4380 }
4381 }
4382 else
4383 {
4384 targetRow += targetChannels * (width - 1u);
4385
4386 for (unsigned int n = 0u; n < width; ++n)
4387 {
4388 if constexpr (tAddToFront)
4389 {
4390 targetRow[0] = sourceOneChannelRow[0];
4391
4392 for (unsigned int c = 0u; c < tSourceChannels; ++c)
4393 {
4394 targetRow[c + 1u] = sourceRow[c];
4395 }
4396 }
4397 else
4398 {
4399 for (unsigned int c = 0u; c < tSourceChannels; ++c)
4400 {
4401 targetRow[c] = sourceRow[c];
4402 }
4403
4404 targetRow[tSourceChannels] = sourceOneChannelRow[0];
4405 }
4406
4407 sourceRow += tSourceChannels;
4408 sourceOneChannelRow++;
4409
4410 targetRow -= targetChannels;
4411 }
4412 }
4413}
4414
4415template <typename T, unsigned int tSourceChannels, bool tAddToFront>
4416void FrameChannels::addChannelValueRow(const T* source, T* target, const size_t size, const void* channelValueParameter)
4417{
4418 static_assert(tSourceChannels != 0u, "Invalid channel number!");
4419
4420 ocean_assert(source != nullptr && target != nullptr);
4421 ocean_assert(size > 0);
4422 ocean_assert(channelValueParameter != nullptr);
4423
4424 const T& channelValue = *((const T*)channelValueParameter);
4425
4426 const unsigned int targetChannels = tSourceChannels + 1u;
4427
4428 for (size_t n = 0; n < size; ++n)
4429 {
4430 if constexpr (tAddToFront)
4431 {
4432 target[0] = channelValue;
4433
4434 for (unsigned int c = 0u; c < tSourceChannels; ++c)
4435 {
4436 target[c + 1u] = source[c];
4437 }
4438 }
4439 else
4440 {
4441 for (unsigned int c = 0u; c < tSourceChannels; ++c)
4442 {
4443 target[c] = source[c];
4444 }
4445
4446 target[tSourceChannels] = channelValue;
4447 }
4448
4449 source += tSourceChannels;
4450 target += targetChannels;
4451 }
4452}
4453
4454template <typename T, unsigned int tSourceChannels, unsigned int tTargetChannels, unsigned int tSourceChannelIndex, unsigned int tTargetChannelIndex>
4455void FrameChannels::copyChannelRow(const T* source, T* target, const size_t size, const void* /*unusedParameters*/)
4456{
4457 static_assert(tSourceChannels != 0u, "Invalid channel number!");
4458 static_assert(tTargetChannels != 0u, "Invalid channel number!");
4459
4460 static_assert(tSourceChannelIndex < tSourceChannels, "Invalid channel number!");
4461 static_assert(tTargetChannelIndex < tTargetChannels, "Invalid channel number!");
4462
4463 ocean_assert(source != nullptr && target != nullptr);
4464 ocean_assert(size > 0);
4465
4466 for (size_t n = 0; n < size; ++n)
4467 {
4468 target[tTargetChannelIndex] = source[tSourceChannelIndex];
4469
4470 source += tSourceChannels;
4471 target += tTargetChannels;
4472 }
4473}
4474
4475template <typename TSource, typename TTarget>
4476void FrameChannels::separateTo1ChannelRuntime(const TSource* const sourceFrame, TTarget* const* const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int* targetFramesPaddingElements)
4477{
4478 ocean_assert(sourceFrame != nullptr);
4479 ocean_assert(targetFrames != nullptr);
4480
4481 ocean_assert(width != 0u && height != 0u);
4482 ocean_assert(channels != 0u);
4483
4484#ifdef OCEAN_DEBUG
4485 for (unsigned int c = 0u; c < channels; ++c)
4486 {
4487 ocean_assert(targetFrames[c] != nullptr);
4488 }
4489#endif
4490
4491 if (sourceFramePaddingElements == 0u && targetFramesPaddingElements == nullptr)
4492 {
4493 for (unsigned int n = 0u; n < width * height; ++n)
4494 {
4495 for (unsigned int c = 0u; c < channels; ++c)
4496 {
4497 targetFrames[c][n] = TTarget(sourceFrame[n * channels + c]);
4498 }
4499 }
4500 }
4501 else if (targetFramesPaddingElements == nullptr)
4502 {
4503 ocean_assert(sourceFramePaddingElements != 0u);
4504
4505 const unsigned int sourceFrameStrideElements = width * channels + sourceFramePaddingElements;
4506
4507 for (unsigned int y = 0u; y < height; ++y)
4508 {
4509 const TSource* const sourceRow = sourceFrame + y * sourceFrameStrideElements;
4510
4511 const unsigned int targetRowOffset = y * width;
4512
4513 for (unsigned int x = 0u; x < width; ++x)
4514 {
4515 for (unsigned int c = 0u; c < channels; ++c)
4516 {
4517 *(targetFrames[c] + targetRowOffset + x) = TTarget(*(sourceRow + x * channels + c));
4518 }
4519 }
4520 }
4521 }
4522 else
4523 {
4524 const unsigned int sourceFrameStrideElements = width * channels + sourceFramePaddingElements;
4525
4526 Indices32 targetFrameStrideElements(channels);
4527
4528 for (unsigned int c = 0u; c < channels; ++c)
4529 {
4530 targetFrameStrideElements[c] = width + targetFramesPaddingElements[c];
4531 }
4532
4533 for (unsigned int y = 0u; y < height; ++y)
4534 {
4535 const TSource* const sourceRow = sourceFrame + y * sourceFrameStrideElements;
4536
4537 for (unsigned int x = 0u; x < width; ++x)
4538 {
4539 for (unsigned int c = 0u; c < channels; ++c)
4540 {
4541 *(targetFrames[c] + y * targetFrameStrideElements[c] + x) = TTarget(*(sourceRow + x * channels + c));
4542 }
4543 }
4544 }
4545 }
4546}
4547
4548template <typename TSource, typename TTarget>
4549void FrameChannels::zipChannelsRuntime(const TSource* const* sourceFrames, TTarget* const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int* sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
4550{
4551 ocean_assert(sourceFrames != nullptr);
4552 ocean_assert(targetFrame != nullptr);
4553
4554 ocean_assert(width != 0u && height != 0u);
4555 ocean_assert(channels != 0u);
4556
4557 bool allSourceFramesContinuous = true;
4558
4559 if (sourceFramesPaddingElements != nullptr)
4560 {
4561 for (unsigned int n = 0u; n < channels; ++n)
4562 {
4563 if (sourceFramesPaddingElements[n] != 0u)
4564 {
4565 allSourceFramesContinuous = false;
4566 break;
4567 }
4568 }
4569 }
4570
4571 if (allSourceFramesContinuous && targetFramePaddingElements == 0u)
4572 {
4573 for (unsigned int n = 0u; n < width * height; ++n)
4574 {
4575 for (unsigned int c = 0u; c < channels; ++c)
4576 {
4577 targetFrame[n * channels + c] = TTarget(sourceFrames[c][n]);
4578 }
4579 }
4580 }
4581 else
4582 {
4583 const unsigned int targetFrameStrideElements = width * channels + targetFramePaddingElements;
4584
4585 Indices32 sourceFrameStrideElements(channels);
4586
4587 for (unsigned int c = 0u; c < channels; ++c)
4588 {
4589 if (sourceFramesPaddingElements == nullptr)
4590 {
4591 sourceFrameStrideElements[c] = width;
4592 }
4593 else
4594 {
4595 sourceFrameStrideElements[c] = width + sourceFramesPaddingElements[c];
4596 }
4597 }
4598
4599 for (unsigned int y = 0u; y < height; ++y)
4600 {
4601 TTarget* const targetRow = targetFrame + y * targetFrameStrideElements;
4602
4603 for (unsigned int x = 0u; x < width; ++x)
4604 {
4605 for (unsigned int c = 0u; c < channels; ++c)
4606 {
4607 *(targetRow + x * channels + c) = TTarget(*(sourceFrames[c] + y * sourceFrameStrideElements[c] + x));
4608 }
4609 }
4610 }
4611 }
4612}
4613
4614template <typename T, unsigned int tChannel, unsigned int tChannels>
4615void FrameChannels::setChannelSubset(T* frame, const unsigned int width, const T value, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows)
4616{
4617 static_assert(tChannels >= 1u, "Invalid channel number!");
4618 static_assert(tChannel < tChannels, "Invalid channel index!");
4619
4620 ocean_assert(frame != nullptr);
4621
4622 const unsigned int frameStrideElements = width * tChannels + framePaddingElements;
4623
4624 frame += firstRow * frameStrideElements + tChannel;
4625
4626 for (unsigned int n = 0u; n < numberRows; ++n)
4627 {
4628 for (unsigned int x = 0u; x < width; ++x)
4629 {
4630 frame[x * tChannels] = value;
4631 }
4632
4633 frame += frameStrideElements;
4634 }
4635}
4636
4637template <typename T, unsigned int tChannels, void (*tPixelFunction)(const T*, T*)>
4638void FrameChannels::applyPixelModifierSubset(const T* source, T* target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows)
4639{
4640 static_assert(tChannels >= 1u, "Invalid channel number");
4641
4642 ocean_assert(source && target);
4643 ocean_assert(source != target);
4644
4645 ocean_assert(numberRows > 0u);
4646 ocean_assert(firstRow + numberRows <= height);
4647
4648 const unsigned int widthElements = width * tChannels;
4649 const unsigned int targetBlockSize = widthElements * numberRows;
4650
4651 switch (conversionFlag)
4652 {
4653 case CONVERT_NORMAL:
4654 {
4655 source += firstRow * widthElements;
4656 target += firstRow * widthElements;
4657
4658 const T* const targetEnd = target + targetBlockSize;
4659
4660 while (target != targetEnd)
4661 {
4662 tPixelFunction(source, target);
4663
4664 source += tChannels;
4665 target += tChannels;
4666 }
4667
4668 break;
4669 }
4670
4671 case CONVERT_FLIPPED:
4672 {
4673 source += firstRow * widthElements;
4674 target += width * height * tChannels - (firstRow + 1u) * widthElements;
4675
4676 const T* const targetEnd = target - targetBlockSize;
4677
4678 while (target != targetEnd)
4679 {
4680 const T* const targetRowEnd = target + widthElements;
4681
4682 while (target != targetRowEnd)
4683 {
4684 tPixelFunction(source, target);
4685
4686 source += tChannels;
4687 target += tChannels;
4688 }
4689
4690 target -= (widthElements << 1); // width * tChannels * 2
4691 }
4692
4693 break;
4694 }
4695
4696 case CONVERT_MIRRORED:
4697 {
4698 source += firstRow * widthElements;
4699 target += (firstRow + 1u) * widthElements;
4700
4701 const T* const targetEnd = target + targetBlockSize;
4702
4703 while (target != targetEnd)
4704 {
4705 const T* const targetRowEnd = target - widthElements;
4706
4707 while (target != targetRowEnd)
4708 {
4709 tPixelFunction(source, target -= tChannels);
4710
4711 source += tChannels;
4712 }
4713
4714 target += widthElements << 1; // width * tChannels * 2;
4715 }
4716
4717 break;
4718 }
4719
4721 {
4722 source += firstRow * widthElements;
4723 target += width * height * tChannels - firstRow * widthElements;
4724
4725 const T* const targetEnd = target - targetBlockSize;
4726
4727 while (target != targetEnd)
4728 {
4729 tPixelFunction(source, target -= tChannels);
4730
4731 source += tChannels;
4732 }
4733
4734 break;
4735 }
4736
4737 default: // this case is not handled
4738 break;
4739 }
4740}
4741
4742template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tPixelFunction)(const TSource*, TTarget*)>
4743void FrameChannels::applyAdvancedPixelModifierSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows)
4744{
4745 static_assert(tSourceChannels >= 1u, "Invalid source channel number");
4746 static_assert(tTargetChannels >= 1u, "Invalid target channel number");
4747
4748 ocean_assert(source && target);
4749 ocean_assert((void*)source != (void*)target);
4750
4751 ocean_assert(numberRows != 0u);
4752 ocean_assert(firstRow + numberRows <= height);
4753
4754 const unsigned int sourceWidthElements = width * tSourceChannels;
4755 const unsigned int targetWidthElements = width * tTargetChannels;
4756
4757 const unsigned int sourceStrideElements = sourceWidthElements + sourcePaddingElements;
4758 const unsigned int targetStrideElements = targetWidthElements + targetPaddingElements;
4759
4760 switch (conversionFlag)
4761 {
4762 case CONVERT_NORMAL:
4763 {
4764 for (unsigned int rowIndex = firstRow; rowIndex < firstRow + numberRows; ++rowIndex)
4765 {
4766 const TSource* sourcePixel = source + rowIndex * sourceStrideElements;
4767 TTarget* targetPixel = target + rowIndex * targetStrideElements;
4768
4769 for (unsigned int x = 0u; x < width; ++x)
4770 {
4771 tPixelFunction(sourcePixel, targetPixel);
4772
4773 sourcePixel += tSourceChannels;
4774 targetPixel += tTargetChannels;
4775 }
4776 }
4777
4778 break;
4779 }
4780
4781 case CONVERT_FLIPPED:
4782 {
4783 for (unsigned int rowIndex = firstRow; rowIndex < firstRow + numberRows; ++rowIndex)
4784 {
4785 const TSource* sourcePixel = source + rowIndex * sourceStrideElements;
4786 TTarget* targetPixel = target + (height - rowIndex - 1u) * targetStrideElements;
4787
4788 for (unsigned int x = 0u; x < width; ++x)
4789 {
4790 tPixelFunction(sourcePixel, targetPixel);
4791
4792 sourcePixel += tSourceChannels;
4793 targetPixel += tTargetChannels;
4794 }
4795 }
4796
4797 break;
4798 }
4799
4800 case CONVERT_MIRRORED:
4801 {
4802 for (unsigned int rowIndex = firstRow; rowIndex < firstRow + numberRows; ++rowIndex)
4803 {
4804 const TSource* sourcePixel = source + rowIndex * sourceStrideElements;
4805
4806 TTarget* const targetRowBegin = target + rowIndex * targetStrideElements;
4807 TTarget* targetPixel = targetRowBegin + targetWidthElements - tTargetChannels;
4808
4809 for (unsigned int x = 0u; x < width; ++x)
4810 {
4811 ocean_assert(targetPixel >= targetRowBegin);
4812 tPixelFunction(sourcePixel, targetPixel);
4813
4814 sourcePixel += tSourceChannels;
4815 targetPixel -= tTargetChannels;
4816 }
4817 }
4818
4819 break;
4820 }
4821
4823 {
4824 for (unsigned int rowIndex = firstRow; rowIndex < firstRow + numberRows; ++rowIndex)
4825 {
4826 const TSource* sourcePixel = source + rowIndex * sourceStrideElements;
4827
4828 TTarget* const targetRowBegin = target + (height - rowIndex - 1u) * targetStrideElements;
4829 TTarget* targetPixel = targetRowBegin + targetWidthElements - tTargetChannels;
4830
4831 for (unsigned int x = 0u; x < width; ++x)
4832 {
4833 ocean_assert(targetPixel >= targetRowBegin);
4834 tPixelFunction(sourcePixel, targetPixel);
4835
4836 sourcePixel += tSourceChannels;
4837 targetPixel -= tTargetChannels;
4838 }
4839 }
4840
4841 break;
4842 }
4843
4844 default: // this case is not handled
4845 break;
4846 }
4847}
4848
4849template <typename TSource0, typename TSource1, typename TTarget, typename TIntermediate, unsigned int tSourceChannels, unsigned int tTargetChannels, void (*tOperator)(const TSource0*, const TSource1*, TTarget*)>
4850void FrameChannels::applyBivariateOperatorSubset(const TSource0* source0, const TSource1* source1, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int source0PaddingElements, const unsigned int source1PaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows)
4851{
4852 static_assert(tSourceChannels >= 1u, "Invalid source channel number");
4853 static_assert(tTargetChannels >= 1u, "Invalid target channel number");
4854 static_assert(tOperator, "Invalid operator function");
4855
4856 ocean_assert(source0 != nullptr && source1 != nullptr && target != nullptr);
4857 ocean_assert((const void*)(source0) != (const void*)(target));
4858 ocean_assert((const void*)(source1) != (const void*)(target));
4859
4860 ocean_assert(numberRows != 0u);
4861 ocean_assert(firstRow + numberRows <= height);
4862
4863 const unsigned int source0StrideElements = width * tSourceChannels + source0PaddingElements;
4864 const unsigned int source1StrideElements = width * tSourceChannels + source1PaddingElements;
4865
4866 const unsigned int targetWidthElements = width * tTargetChannels;
4867
4868 const unsigned int targetStrideElements = targetWidthElements + targetPaddingElements;
4869
4870 switch (conversionFlag)
4871 {
4872 case CONVERT_NORMAL:
4873 {
4874 for (unsigned int rowIndex = firstRow; rowIndex < (firstRow + numberRows); ++rowIndex)
4875 {
4876 const TSource0* rowSource0 = source0 + rowIndex * source0StrideElements;
4877 const TSource1* rowSource1 = source1 + rowIndex * source1StrideElements;
4878
4879 TTarget* rowTarget = target + rowIndex * targetStrideElements;
4880 const TTarget* const rowTargetEnd = rowTarget + targetWidthElements;
4881
4882 while (rowTarget != rowTargetEnd)
4883 {
4884 ocean_assert(rowTarget < rowTargetEnd);
4885
4886 tOperator(rowSource0, rowSource1, rowTarget);
4887
4888 rowSource0 += tSourceChannels;
4889 rowSource1 += tSourceChannels;
4890
4891 rowTarget += tTargetChannels;
4892 }
4893 }
4894
4895 return;
4896 }
4897
4898 case CONVERT_FLIPPED:
4899 {
4900 for (unsigned int rowIndex = firstRow; rowIndex < (firstRow + numberRows); ++rowIndex)
4901 {
4902 const TSource0* rowSource0 = source0 + rowIndex * source0StrideElements;
4903 const TSource1* rowSource1 = source1 + rowIndex * source1StrideElements;
4904
4905 TTarget* rowTarget = target + (height - rowIndex - 1u) * targetStrideElements;
4906 const TTarget* const rowTargetEnd = rowTarget + targetWidthElements;
4907
4908 while (rowTarget != rowTargetEnd)
4909 {
4910 ocean_assert(rowTarget < rowTargetEnd);
4911
4912 tOperator(rowSource0, rowSource1, rowTarget);
4913
4914 rowSource0 += tSourceChannels;
4915 rowSource1 += tSourceChannels;
4916
4917 rowTarget += tTargetChannels;
4918 }
4919 }
4920
4921 return;
4922 }
4923
4924 case CONVERT_MIRRORED:
4925 {
4926 for (unsigned int rowIndex = firstRow; rowIndex < (firstRow + numberRows); ++rowIndex)
4927 {
4928 const TSource0* rowSource0 = source0 + rowIndex * source0StrideElements;
4929 const TSource1* rowSource1 = source1 + rowIndex * source1StrideElements;
4930
4931 TTarget* rowTarget = target + rowIndex * targetStrideElements + targetWidthElements - tTargetChannels;
4932 const TTarget* const rowTargetEnd = rowTarget - targetWidthElements;
4933
4934 while (rowTarget != rowTargetEnd)
4935 {
4936 ocean_assert(rowTarget > rowTargetEnd);
4937
4938 tOperator(rowSource0, rowSource1, rowTarget);
4939
4940 rowSource0 += tSourceChannels;
4941 rowSource1 += tSourceChannels;
4942
4943 rowTarget -= tTargetChannels;
4944 }
4945 }
4946
4947 return;
4948 }
4949
4951 {
4952 for (unsigned int rowIndex = firstRow; rowIndex < (firstRow + numberRows); ++rowIndex)
4953 {
4954 const TSource0* rowSource0 = source0 + rowIndex * source0StrideElements;
4955 const TSource1* rowSource1 = source1 + rowIndex * source1StrideElements;
4956
4957 TTarget* rowTarget = target + (height - rowIndex - 1u) * targetStrideElements + targetWidthElements - tTargetChannels;
4958 const TTarget* const rowTargetEnd = rowTarget - targetWidthElements;
4959
4960 while (rowTarget != rowTargetEnd)
4961 {
4962 ocean_assert(rowTarget > rowTargetEnd);
4963
4964 tOperator(rowSource0, rowSource1, rowTarget);
4965
4966 rowSource0 += tSourceChannels;
4967 rowSource1 += tSourceChannels;
4968
4969 rowTarget -= tTargetChannels;
4970 }
4971 }
4972
4973 return;
4974 }
4975
4976 default:
4977 ocean_assert(false && "This should never happen!");
4978 break;
4979 }
4980}
4981
4982template <typename TSource, typename TTarget, unsigned int tSourceChannels, unsigned int tTargetChannels>
4983void FrameChannels::applyRowOperatorSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int sourceStrideElements, const unsigned int targetStrideElements, const RowOperatorFunction<TSource, TTarget, tSourceChannels, tTargetChannels> rowOperatorFunction, const unsigned int firstRow, const unsigned int numberRows)
4984{
4985 static_assert(tSourceChannels >= 1u, "Invalid source channel number");
4986 static_assert(tTargetChannels >= 1u, "Invalid target channel number");
4987
4988 ocean_assert(source != nullptr && target != nullptr);
4989 ocean_assert((const void*)source != (const void*)target);
4990
4991 ocean_assert(width * tSourceChannels <= sourceStrideElements);
4992 ocean_assert(width * tTargetChannels <= targetStrideElements);
4993
4994 ocean_assert(rowOperatorFunction != nullptr);
4995
4996 ocean_assert(numberRows != 0u);
4997 ocean_assert(firstRow + numberRows <= height);
4998
4999 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
5000 {
5001 rowOperatorFunction(source + y * sourceStrideElements, target + y * targetStrideElements, width, height, y, sourceStrideElements, targetStrideElements);
5002 }
5003}
5004
5005template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2>
5006void FrameChannels::convertRow3ChannelsTo1Channel8BitPerChannel7BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* channelMultiplicationFactors_128)
5007{
5008 static_assert(tUseFactorChannel0 || tUseFactorChannel1 || tUseFactorChannel2, "Invalid channel factors!");
5009
5010 ocean_assert(channelMultiplicationFactors_128 != nullptr);
5011 const unsigned int* channelFactors_128 = reinterpret_cast<const unsigned int*>(channelMultiplicationFactors_128);
5012 ocean_assert(channelFactors_128 != nullptr);
5013
5014 const unsigned int factorChannel0_128 = channelFactors_128[0];
5015 const unsigned int factorChannel1_128 = channelFactors_128[1];
5016 const unsigned int factorChannel2_128 = channelFactors_128[2];
5017
5018 ocean_assert(factorChannel0_128 <= 128u && factorChannel1_128 <= 128u && factorChannel2_128 <= 128u);
5019 ocean_assert(factorChannel0_128 + factorChannel1_128 + factorChannel2_128 == 128u);
5020
5021 ocean_assert(tUseFactorChannel0 == (factorChannel0_128 != 0u));
5022 ocean_assert(tUseFactorChannel1 == (factorChannel1_128 != 0u));
5023 ocean_assert(tUseFactorChannel2 == (factorChannel2_128 != 0u));
5024
5025 ocean_assert(source != nullptr && target != nullptr && size >= 1);
5026
5027 const uint8_t* const targetEnd = target + size;
5028
5029#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
5030
5031 constexpr size_t blockSize = 16;
5032 const size_t blocks = size / blockSize;
5033
5034 const __m128i multiplicationFactors0_128_u_16x8 = _mm_set1_epi16(int16_t(factorChannel0_128));
5035 const __m128i multiplicationFactors1_128_u_16x8 = _mm_set1_epi16(int16_t(factorChannel1_128));
5036 const __m128i multiplicationFactors2_128_u_16x8 = _mm_set1_epi16(int16_t(factorChannel2_128));
5037
5038 for (size_t n = 0; n < blocks; ++n)
5039 {
5040 convert3ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(source, target, multiplicationFactors0_128_u_16x8, multiplicationFactors1_128_u_16x8, multiplicationFactors2_128_u_16x8);
5041
5042 source += blockSize * size_t(3);
5043 target += blockSize;
5044 }
5045
5046#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
5047
5048 constexpr size_t blockSize = 8;
5049 const size_t blocks = size / blockSize;
5050
5051 const uint8x8_t factorChannel0_128_u_8x8 = vdup_n_u8((uint8_t)factorChannel0_128);
5052 const uint8x8_t factorChannel1_128_u_8x8 = vdup_n_u8((uint8_t)factorChannel1_128);
5053 const uint8x8_t factorChannel2_128_u_8x8 = vdup_n_u8((uint8_t)factorChannel2_128);
5054
5055 for (size_t n = 0; n < blocks; ++n)
5056 {
5057 convert3ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON<tUseFactorChannel0, tUseFactorChannel1, tUseFactorChannel2>(source, target, factorChannel0_128_u_8x8, factorChannel1_128_u_8x8, factorChannel2_128_u_8x8);
5058
5059 source += blockSize * size_t(3);
5060 target += blockSize;
5061 }
5062
5063#endif
5064
5065 while (target != targetEnd)
5066 {
5067 ocean_assert(target < targetEnd);
5068
5069 const unsigned int channel0 = tUseFactorChannel0 ? (source[0] * factorChannel0_128) : 0u;
5070 const unsigned int channel1 = tUseFactorChannel1 ? (source[1] * factorChannel1_128) : 0u;
5071 const unsigned int channel2 = tUseFactorChannel2 ? (source[2] * factorChannel2_128) : 0u;
5072
5073 *target++ = (uint8_t)((channel0 + channel1 + channel2 + 64u) >> 7u);
5074 source += 3;
5075 }
5076}
5077
5078template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2, bool tUseFactorChannel3>
5079void FrameChannels::convertRow4ChannelsTo1Channel8BitPerChannel7BitPrecision(const uint8_t* source, uint8_t* target, const size_t size, const void* channelMultiplicationFactors_128)
5080{
5081 static_assert(tUseFactorChannel0 || tUseFactorChannel1 || tUseFactorChannel2 || tUseFactorChannel3, "Invalid channel factors!");
5082
5083 ocean_assert(channelMultiplicationFactors_128 != nullptr);
5084 const unsigned int* channelFactors_128 = reinterpret_cast<const unsigned int*>(channelMultiplicationFactors_128);
5085 ocean_assert(channelFactors_128 != nullptr);
5086
5087 const unsigned int factorChannel0_128 = channelFactors_128[0];
5088 const unsigned int factorChannel1_128 = channelFactors_128[1];
5089 const unsigned int factorChannel2_128 = channelFactors_128[2];
5090 const unsigned int factorChannel3_128 = channelFactors_128[3];
5091
5092 ocean_assert(factorChannel0_128 <= 127u && factorChannel1_128 <= 127u && factorChannel2_128 <= 127u && factorChannel3_128 <= 127u);
5093 ocean_assert(factorChannel0_128 + factorChannel1_128 + factorChannel2_128 + factorChannel3_128 == 128u);
5094
5095 ocean_assert(tUseFactorChannel0 == (factorChannel0_128 != 0u));
5096 ocean_assert(tUseFactorChannel1 == (factorChannel1_128 != 0u));
5097 ocean_assert(tUseFactorChannel2 == (factorChannel2_128 != 0u));
5098 ocean_assert(tUseFactorChannel3 == (factorChannel3_128 != 0u));
5099
5100 ocean_assert(source != nullptr && target != nullptr && size >= 1);
5101
5102 const uint8_t* const targetEnd = target + size;
5103
5104#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
5105
5106 constexpr size_t blockSize = 16;
5107 const size_t blocks = size / blockSize;
5108
5109 const __m128i m128_multiplicationFactors = _mm_set1_epi32(int(factorChannel0_128 | (factorChannel1_128 << 8u) | (factorChannel2_128 << 16u) | (factorChannel3_128 << 24u)));
5110
5111 for (size_t n = 0; n < blocks; ++n)
5112 {
5113 convert4ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(source, target, m128_multiplicationFactors);
5114
5115 source += blockSize * size_t(4);
5116 target += blockSize;
5117 }
5118
5119#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
5120
5121 constexpr size_t blockSize = 8;
5122 const size_t blocks = size / blockSize;
5123
5124 const uint8x8_t factorChannel0_128_u_8x8 = vdup_n_u8((uint8_t)factorChannel0_128);
5125 const uint8x8_t factorChannel1_128_u_8x8 = vdup_n_u8((uint8_t)factorChannel1_128);
5126 const uint8x8_t factorChannel2_128_u_8x8 = vdup_n_u8((uint8_t)factorChannel2_128);
5127 const uint8x8_t factorChannel3_128_u_8x8 = vdup_n_u8((uint8_t)factorChannel3_128);
5128
5129 for (size_t n = 0; n < blocks; ++n)
5130 {
5131 convert4ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON<tUseFactorChannel0, tUseFactorChannel1, tUseFactorChannel2, tUseFactorChannel3>(source, target, factorChannel0_128_u_8x8, factorChannel1_128_u_8x8, factorChannel2_128_u_8x8, factorChannel3_128_u_8x8);
5132
5133 source += blockSize * size_t(4);
5134 target += blockSize;
5135 }
5136
5137#endif
5138
5139 while (target != targetEnd)
5140 {
5141 ocean_assert(target < targetEnd);
5142
5143 const unsigned int channel0 = tUseFactorChannel0 ? (source[0] * factorChannel0_128) : 0u;
5144 const unsigned int channel1 = tUseFactorChannel1 ? (source[1] * factorChannel1_128) : 0u;
5145 const unsigned int channel2 = tUseFactorChannel2 ? (source[2] * factorChannel2_128) : 0u;
5146 const unsigned int channel3 = tUseFactorChannel3 ? (source[3] * factorChannel3_128) : 0u;
5147
5148 *target++ = (uint8_t)((channel0 + channel1 + channel2 + channel3 + 64u) >> 7u);
5149 source += 4;
5150 }
5151}
5152
5153template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
5154void FrameChannels::premultipliedAlphaToStraightAlpha8BitPerChannelSubset(uint8_t* const frame, const unsigned int width, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows)
5155{
5156 static_assert(tChannels >= 2u, "Invalid channel number!");
5157 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
5158
5159 ocean_assert(frame != nullptr);
5160 ocean_assert(width >= 1u);
5161
5162 const unsigned int frameStrideElements = width * tChannels + framePaddingElements;
5163
5164 uint8_t* frameRow = frame + frameStrideElements * firstRow;
5165
5166 for (unsigned int y = 0u; y < numberRows; ++y)
5167 {
5168 for (unsigned int x = 0u; x < width; ++x)
5169 {
5170 if (frameRow[tAlphaChannelIndex])
5171 {
5172 const uint8_t alpha_2 = frameRow[tAlphaChannelIndex] / 2u;
5173
5174 for (unsigned int channelIndex = 0u; channelIndex < tChannels; ++channelIndex)
5175 {
5176 if (channelIndex != tAlphaChannelIndex)
5177 {
5178 frameRow[channelIndex] = uint8_t(std::min((frameRow[channelIndex] * 255u + alpha_2) / frameRow[tAlphaChannelIndex], 255u));
5179 }
5180 }
5181 }
5182
5183 frameRow += tChannels;
5184 }
5185
5186 frameRow += framePaddingElements;
5187 }
5188}
5189
5190template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
5191void FrameChannels::premultipliedAlphaToStraightAlpha8BitPerChannelSubset(const uint8_t* const source, uint8_t* target, const unsigned int width, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
5192{
5193 static_assert(tChannels >= 2u, "Invalid channel number!");
5194 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
5195
5196 ocean_assert(source != nullptr && target != nullptr);
5197 ocean_assert(width >= 1u);
5198
5199 const unsigned int sourceStrideElements = width * tChannels + sourcePaddingElements;
5200 const unsigned int targetStrideElements = width * tChannels + targetPaddingElements;
5201
5202 const uint8_t* sourceRow = source + sourceStrideElements * firstRow;
5203 uint8_t* targetRow = target + targetStrideElements * firstRow;
5204
5205 for (unsigned int y = 0u; y < numberRows; ++y)
5206 {
5207 for (unsigned int x = 0u; x < width; ++x)
5208 {
5209 if (sourceRow[tAlphaChannelIndex])
5210 {
5211 const uint8_t alpha_2 = sourceRow[tAlphaChannelIndex] / 2u;
5212
5213 for (unsigned int channelIndex = 0u; channelIndex < tChannels; ++channelIndex)
5214 {
5215 if (channelIndex != tAlphaChannelIndex)
5216 {
5217 targetRow[channelIndex] = uint8_t(std::max((sourceRow[channelIndex] * 255u + alpha_2) / sourceRow[tAlphaChannelIndex], 255u));
5218 }
5219 else
5220 {
5221 targetRow[channelIndex] = sourceRow[channelIndex];
5222 }
5223 }
5224 }
5225 else
5226 {
5227 for (unsigned int channelIndex = 0u; channelIndex < tChannels; ++channelIndex)
5228 {
5229 targetRow[channelIndex] = sourceRow[channelIndex];
5230 }
5231 }
5232
5233 sourceRow += tChannels;
5234 targetRow += tChannels;
5235 }
5236
5237 sourceRow += sourcePaddingElements;
5238 targetRow += targetPaddingElements;
5239 }
5240}
5241
5242template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
5243void FrameChannels::straightAlphaToPremultipliedAlpha8BitPerChannelSubset(uint8_t* const frame, const unsigned int width, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows)
5244{
5245 static_assert(tChannels >= 2u, "Invalid channel number!");
5246 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
5247
5248 ocean_assert(frame != nullptr);
5249 ocean_assert(width >= 1u);
5250
5251 const unsigned int frameStrideElements = width * tChannels + framePaddingElements;
5252
5253 uint8_t* frameRow = frame + frameStrideElements * firstRow;
5254
5255 for (unsigned int y = 0u; y < numberRows; ++y)
5256 {
5257 for (unsigned int x = 0u; x < width; ++x)
5258 {
5259 for (unsigned int channelIndex = 0u; channelIndex < tChannels; ++channelIndex)
5260 {
5261 if (channelIndex != tAlphaChannelIndex)
5262 {
5263 frameRow[channelIndex] = (frameRow[channelIndex] * frameRow[tAlphaChannelIndex] + 127u) / 255u;
5264 }
5265 }
5266
5267 frameRow += tChannels;
5268 }
5269
5270 frameRow += framePaddingElements;
5271 }
5272}
5273
5274template <unsigned int tChannels, unsigned int tAlphaChannelIndex>
5275void FrameChannels::straightAlphaToPremultipliedAlpha8BitPerChannelSubset(const uint8_t* const source, uint8_t* const target, const unsigned int width, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
5276{
5277 static_assert(tChannels >= 2u, "Invalid channel number!");
5278 static_assert(tAlphaChannelIndex < tChannels, "Invalid alpha channel index!");
5279
5280 ocean_assert(source != nullptr && target != nullptr);
5281 ocean_assert(width >= 1u);
5282
5283 const unsigned int sourceStrideElements = width * tChannels + sourcePaddingElements;
5284 const unsigned int targetStrideElements = width * tChannels + targetPaddingElements;
5285
5286 const uint8_t* sourceRow = source + sourceStrideElements * firstRow;
5287 uint8_t* targetRow = target + targetStrideElements * firstRow;
5288
5289 for (unsigned int y = 0u; y < numberRows; ++y)
5290 {
5291 for (unsigned int x = 0u; x < width; ++x)
5292 {
5293 for (unsigned int channelIndex = 0u; channelIndex < tChannels; ++channelIndex)
5294 {
5295 if (channelIndex != tAlphaChannelIndex)
5296 {
5297 targetRow[channelIndex] = (sourceRow[channelIndex] * sourceRow[tAlphaChannelIndex] + 127u) / 255u;
5298 }
5299 else
5300 {
5301 targetRow[channelIndex] = sourceRow[channelIndex];
5302 }
5303 }
5304
5305 sourceRow += tChannels;
5306 targetRow += tChannels;
5307 }
5308
5309 sourceRow += sourcePaddingElements;
5310 targetRow += targetPaddingElements;
5311 }
5312}
5313
5314#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
5315
5316OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& multiplicationFactors0_128_u_16x8, const __m128i& multiplicationFactors1_128_u_16x8, const __m128i& multiplicationFactors2_128_u_16x8)
5317{
5318 ocean_assert(source != nullptr && target != nullptr);
5319
5320 // the documentation of this function is designed for RGB24 to Y8 conversion
5321 // however, in general this function can be used to apply a linear combination on the four source channels
5322 // to create one output channel
5323
5324 // precise color space conversion:
5325 // Y = 0.299 * R + 0.587 * G + 0.114 * B
5326
5327 // approximation:
5328 // Y = (38 * R + 75 * G + 15 * B) / 128
5329
5330 // we expect the following input pattern (for here RGB24):
5331 // FEDC BA98 7654 3210 FEDC BA98 7654 3210 FEDC BA98 7654 3210
5332 // BGRB GRBG RBGR BGRB GRBG RBGR BGRB GRBG RBGR BGRB GRBG RBGR
5333
5334 // we store eight 16 bit values holding 64 for rounding purpose:
5335 const __m128i constant64_u_16x8 = _mm_set1_epi32(0x00400040);
5336
5337 const __m128i sourceA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
5338 const __m128i sourceB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
5339 const __m128i sourceC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
5340
5341 __m128i channel0_u_8x16;
5342 __m128i channel1_u_8x16;
5343 __m128i channel2_u_8x16;
5344 SSE::deInterleave3Channel8Bit48Elements(sourceA_u_8x16, sourceB_u_8x16, sourceC_u_8x16, channel0_u_8x16, channel1_u_8x16, channel2_u_8x16);
5345
5346 // now we need 16 bit values instead of 8 bit values
5347
5348 const __m128i channel0_low_u_8x16 = SSE::removeHighBits16_8(channel0_u_8x16);
5349 const __m128i channel1_low_u_8x16 = SSE::removeHighBits16_8(channel1_u_8x16);
5350 const __m128i channel2_low_u_8x16 = SSE::removeHighBits16_8(channel2_u_8x16);
5351
5352 const __m128i channel0_high_u_8x16 = _mm_srli_epi16(channel0_u_8x16, 8);
5353 const __m128i channel1_high_u_8x16 = _mm_srli_epi16(channel1_u_8x16, 8);
5354 const __m128i channel2_high_u_8x16 = _mm_srli_epi16(channel2_u_8x16, 8);
5355
5356 // we multiply each channel with the corresponding multiplication factors
5357
5358 const __m128i result0_low_u_8x16 = _mm_mullo_epi16(channel0_low_u_8x16, multiplicationFactors0_128_u_16x8);
5359 const __m128i result0_high_u_8x16 = _mm_mullo_epi16(channel0_high_u_8x16, multiplicationFactors0_128_u_16x8);
5360
5361 const __m128i result1_low_u_8x16 = _mm_mullo_epi16(channel1_low_u_8x16, multiplicationFactors1_128_u_16x8);
5362 const __m128i result1_high_u_8x16 = _mm_mullo_epi16(channel1_high_u_8x16, multiplicationFactors1_128_u_16x8);
5363
5364 const __m128i result2_low_u_8x16 = _mm_mullo_epi16(channel2_low_u_8x16, multiplicationFactors2_128_u_16x8);
5365 const __m128i result2_high_u_8x16 = _mm_mullo_epi16(channel2_high_u_8x16, multiplicationFactors2_128_u_16x8);
5366
5367 // we sum up all results and add 64 for rounding purpose
5368 const __m128i result128_low_u_8x16 = _mm_adds_epu16(_mm_adds_epu16(result0_low_u_8x16, result1_low_u_8x16), _mm_adds_epu16(result2_low_u_8x16, constant64_u_16x8));
5369 const __m128i result128_high_u_8x16 = _mm_adds_epu16(_mm_adds_epu16(result0_high_u_8x16, result1_high_u_8x16), _mm_adds_epu16(result2_high_u_8x16, constant64_u_16x8));
5370
5371 // we shift the multiplication results by 7 bits (= 128)
5372 const __m128i result_low_u_8x16 = _mm_srli_epi16(result128_low_u_8x16, 7);
5373 const __m128i result_high_u_8x16 = _mm_srli_epi16(result128_high_u_8x16, 7);
5374
5375 // finally, we have to get rid of the upper zero bits by combining two 128 bit registers to one:
5376 const __m128i result_u_8x16 = _mm_or_si128(result_low_u_8x16, _mm_slli_epi16(result_high_u_8x16, 8));
5377
5378 // and we can store the result
5379 _mm_storeu_si128((__m128i*)target, result_u_8x16);
5380}
5381
5382OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_128_s_16x8, const __m128i& factorChannel10_128_s_16x8, const __m128i& factorChannel20_128_s_16x8, const __m128i& factorChannel01_128_s_16x8, const __m128i& factorChannel11_128_s_16x8, const __m128i& factorChannel21_128_s_16x8, const __m128i& factorChannel02_128_s_16x8, const __m128i& factorChannel12_128_s_16x8, const __m128i& factorChannel22_128_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8)
5383{
5384 ocean_assert(source != nullptr && target != nullptr);
5385
5386 // the documentation of this function designed for RGB24 to YUV24 conversion
5387
5388 // precise color space conversion:
5389 // | Y | | 0.2578125 0.5039063 0.09765625 16.0 | | R |
5390 // | U | = | -0.1484375 -0.2890625 0.4375 128.0 | * | G |
5391 // | V | | 0.4375 -0.3671875 -0.0703125 128.0 | | B |
5392 // | 1 |
5393
5394 // approximation:
5395 // Y = ( 33 * R + 64 * G + 13 * B) / 128 + 16
5396 // U = (-19 * R - 37 * G + 56 * B) / 128 + 128
5397 // V = ( 56 * R - 47 * G - 9 * B) / 128 + 128
5398
5399 // we expect the following input pattern (for here RGB24):
5400 // FEDC BA98 7654 3210 FEDC BA98 7654 3210 FEDC BA98 7654 3210
5401 // BGRB GRBG RBGR BGRB GRBG RBGR BGRB GRBG RBGR BGRB GRBG RBGR
5402
5403 const __m128i sourceA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
5404 const __m128i sourceB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
5405 const __m128i sourceC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
5406
5407 __m128i channel0_u_8x16;
5408 __m128i channel1_u_8x16;
5409 __m128i channel2_u_8x16;
5410 SSE::deInterleave3Channel8Bit48Elements(sourceA_u_8x16, sourceB_u_8x16, sourceC_u_8x16, channel0_u_8x16, channel1_u_8x16, channel2_u_8x16);
5411
5412 // now we need 16 bit values instead of 8 bit values
5413
5414 const __m128i channel0_low_u_8x16 = SSE::removeHighBits16_8(channel0_u_8x16);
5415 const __m128i channel1_low_u_8x16 = SSE::removeHighBits16_8(channel1_u_8x16);
5416 const __m128i channel2_low_u_8x16 = SSE::removeHighBits16_8(channel2_u_8x16);
5417
5418 const __m128i channel0_high_u_8x16 = _mm_srli_epi16(channel0_u_8x16, 8);
5419 const __m128i channel1_high_u_8x16 = _mm_srli_epi16(channel1_u_8x16, 8);
5420 const __m128i channel2_high_u_8x16 = _mm_srli_epi16(channel2_u_8x16, 8);
5421
5422 // we multiply each channel with the corresponding multiplication factors
5423
5424 __m128i result0_low_u_8x16 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_u_8x16, factorChannel00_128_s_16x8), _mm_mullo_epi16(channel1_low_u_8x16, factorChannel01_128_s_16x8)), _mm_mullo_epi16(channel2_low_u_8x16, factorChannel02_128_s_16x8));
5425 __m128i result1_low_u_8x16 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_u_8x16, factorChannel10_128_s_16x8), _mm_mullo_epi16(channel1_low_u_8x16, factorChannel11_128_s_16x8)), _mm_mullo_epi16(channel2_low_u_8x16, factorChannel12_128_s_16x8));
5426 __m128i result2_low_u_8x16 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_u_8x16, factorChannel20_128_s_16x8), _mm_mullo_epi16(channel1_low_u_8x16, factorChannel21_128_s_16x8)), _mm_mullo_epi16(channel2_low_u_8x16, factorChannel22_128_s_16x8));
5427
5428 __m128i result0_high_u_8x16 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_u_8x16, factorChannel00_128_s_16x8), _mm_mullo_epi16(channel1_high_u_8x16, factorChannel01_128_s_16x8)), _mm_mullo_epi16(channel2_high_u_8x16, factorChannel02_128_s_16x8));
5429 __m128i result1_high_u_8x16 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_u_8x16, factorChannel10_128_s_16x8), _mm_mullo_epi16(channel1_high_u_8x16, factorChannel11_128_s_16x8)), _mm_mullo_epi16(channel2_high_u_8x16, factorChannel12_128_s_16x8));
5430 __m128i result2_high_u_8x16 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_u_8x16, factorChannel20_128_s_16x8), _mm_mullo_epi16(channel1_high_u_8x16, factorChannel21_128_s_16x8)), _mm_mullo_epi16(channel2_high_u_8x16, factorChannel22_128_s_16x8));
5431
5432 // we normalize the result by 128 and add the bias
5433
5434 result0_low_u_8x16 = _mm_add_epi16(SSE::divideByRightShiftSigned16Bit(result0_low_u_8x16, 7), biasChannel0_s_16x8);
5435 result1_low_u_8x16 = _mm_add_epi16(SSE::divideByRightShiftSigned16Bit(result1_low_u_8x16, 7), biasChannel1_s_16x8);
5436 result2_low_u_8x16 = _mm_add_epi16(SSE::divideByRightShiftSigned16Bit(result2_low_u_8x16, 7), biasChannel2_s_16x8);
5437
5438 result0_high_u_8x16 = _mm_add_epi16(SSE::divideByRightShiftSigned16Bit(result0_high_u_8x16, 7), biasChannel0_s_16x8);
5439 result1_high_u_8x16 = _mm_add_epi16(SSE::divideByRightShiftSigned16Bit(result1_high_u_8x16, 7), biasChannel1_s_16x8);
5440 result2_high_u_8x16 = _mm_add_epi16(SSE::divideByRightShiftSigned16Bit(result2_high_u_8x16, 7), biasChannel2_s_16x8);
5441
5442 // from here, we need values within the range [0, 255], so that we clamp the results
5443
5444 const __m128i constant255_s_16x8 = _mm_set1_epi16(255);
5445
5446 result0_low_u_8x16 = _mm_min_epi16(_mm_max_epi16(result0_low_u_8x16, _mm_setzero_si128()), constant255_s_16x8);
5447 result1_low_u_8x16 = _mm_min_epi16(_mm_max_epi16(result1_low_u_8x16, _mm_setzero_si128()), constant255_s_16x8);
5448 result2_low_u_8x16 = _mm_min_epi16(_mm_max_epi16(result2_low_u_8x16, _mm_setzero_si128()), constant255_s_16x8);
5449
5450 result0_high_u_8x16 = _mm_min_epi16(_mm_max_epi16(result0_high_u_8x16, _mm_setzero_si128()), constant255_s_16x8);
5451 result1_high_u_8x16 = _mm_min_epi16(_mm_max_epi16(result1_high_u_8x16, _mm_setzero_si128()), constant255_s_16x8);
5452 result2_high_u_8x16 = _mm_min_epi16(_mm_max_epi16(result2_high_u_8x16, _mm_setzero_si128()), constant255_s_16x8);
5453
5454 // finally, we have to get rid of the upper zero bits by combining two 128 bit registers to one:
5455 const __m128i result0_u_8x16 = _mm_or_si128(result0_low_u_8x16, _mm_slli_epi16(result0_high_u_8x16, 8));
5456 const __m128i result1_u_8x16 = _mm_or_si128(result1_low_u_8x16, _mm_slli_epi16(result1_high_u_8x16, 8));
5457 const __m128i result2_u_8x16 = _mm_or_si128(result2_low_u_8x16, _mm_slli_epi16(result2_high_u_8x16, 8));
5458
5459 __m128i resultA_u_8x16;
5460 __m128i resultB_u_8x16;
5461 __m128i resultC_u_8x16;
5462 SSE::interleave3Channel8Bit48Elements(result0_u_8x16, result1_u_8x16, result2_u_8x16, resultA_u_8x16, resultB_u_8x16, resultC_u_8x16);
5463
5464 // and we can store the result
5465 _mm_storeu_si128((__m128i*)target + 0, resultA_u_8x16);
5466 _mm_storeu_si128((__m128i*)target + 1, resultB_u_8x16);
5467 _mm_storeu_si128((__m128i*)target + 2, resultC_u_8x16);
5468}
5469
5470OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels16Pixels8BitPerChannel10BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_1024_s_16x8, const __m128i& factorChannel10_1024_s_16x8, const __m128i& factorChannel20_1024_s_16x8, const __m128i& factorChannel01_1024_s_16x8, const __m128i& factorChannel11_1024_s_16x8, const __m128i& factorChannel21_1024_s_16x8, const __m128i& factorChannel02_1024_s_16x8, const __m128i& factorChannel12_1024_s_16x8, const __m128i& factorChannel22_1024_s_16x8, const __m128i& biasChannel0_1024_s_32x4, const __m128i& biasChannel1_1024_s_32x4, const __m128i& biasChannel2_1024_s_32x4)
5471{
5472 ocean_assert(source != nullptr && target != nullptr);
5473
5474 // the documentation of this function designed for RGB24 to YUV24 conversion
5475
5476 /// precise color space conversion:
5477 // | R | | 1.1639404296875 0.0 1.595947265625 -222.904296875 | | Y |
5478 // | G | = | 1.1639404296875 -0.3909912109375 -0.81298828125 135.486328125 | * | U |
5479 // | B | | 1.1639404296875 2.0179443359375 0.0 -276.919921875 | | V |
5480 // | 1 |
5481
5482 // approximation:
5483 // | R | | 1192 0 1634 -223 | | Y |
5484 // | G | = | 1192 -400 -833 135 | * | U |
5485 // | B | | 1192 2066 0 -277 | | V |
5486 // | 1 |
5487
5488 // we expect the following input pattern (for here RGB24):
5489 // FEDC BA98 7654 3210 FEDC BA98 7654 3210 FEDC BA98 7654 3210
5490 // BGRB GRBG RBGR BGRB GRBG RBGR BGRB GRBG RBGR BGRB GRBG RBGR
5491
5492 const __m128i sourceA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
5493 const __m128i sourceB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
5494 const __m128i sourceC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
5495
5496 __m128i channel0_u_8x16;
5497 __m128i channel1_u_8x16;
5498 __m128i channel2_u_8x16;
5499 SSE::deInterleave3Channel8Bit48Elements(sourceA_u_8x16, sourceB_u_8x16, sourceC_u_8x16, channel0_u_8x16, channel1_u_8x16, channel2_u_8x16);
5500
5501
5502 // now we need 16 bit values instead of 8 bit values
5503
5504 const __m128i channel0_low_u_16x8 = SSE::removeHighBits16_8(channel0_u_8x16);
5505 const __m128i channel1_low_u_16x8 = SSE::removeHighBits16_8(channel1_u_8x16);
5506 const __m128i channel2_low_u_16x8 = SSE::removeHighBits16_8(channel2_u_8x16);
5507
5508 const __m128i channel0_high_u_16x8 = _mm_srli_epi16(channel0_u_8x16, 8);
5509 const __m128i channel1_high_u_16x8 = _mm_srli_epi16(channel1_u_8x16, 8);
5510 const __m128i channel2_high_u_16x8 = _mm_srli_epi16(channel2_u_8x16, 8);
5511
5512
5513 // we multiply each channel with the corresponding multiplication factors (int16_t * int16_t = int32_t), and we normalize the result by 1024
5514
5515 __m128i result0_low_A_s_32x4;
5516 __m128i result0_low_B_s_32x4;
5517 __m128i result0_high_A_s_32x4;
5518 __m128i result0_high_B_s_32x4;
5519
5520 SSE::multiplyInt8x16ToInt32x8(channel0_low_u_16x8, factorChannel00_1024_s_16x8, result0_low_A_s_32x4, result0_low_B_s_32x4);
5521 SSE::multiplyInt8x16ToInt32x8(channel0_high_u_16x8, factorChannel00_1024_s_16x8, result0_high_A_s_32x4, result0_high_B_s_32x4);
5522
5523 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_low_u_16x8, factorChannel01_1024_s_16x8, result0_low_A_s_32x4, result0_low_B_s_32x4);
5524 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_high_u_16x8, factorChannel01_1024_s_16x8, result0_high_A_s_32x4, result0_high_B_s_32x4);
5525
5526 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_low_u_16x8, factorChannel02_1024_s_16x8, result0_low_A_s_32x4, result0_low_B_s_32x4);
5527 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_high_u_16x8, factorChannel02_1024_s_16x8, result0_high_A_s_32x4, result0_high_B_s_32x4);
5528
5529 result0_low_A_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result0_low_A_s_32x4, biasChannel0_1024_s_32x4), 10);
5530 result0_low_B_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result0_low_B_s_32x4, biasChannel0_1024_s_32x4), 10);
5531 result0_high_A_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result0_high_A_s_32x4, biasChannel0_1024_s_32x4), 10);
5532 result0_high_B_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result0_high_B_s_32x4, biasChannel0_1024_s_32x4), 10);
5533
5534
5535 __m128i result1_low_A_s_32x4;
5536 __m128i result1_low_B_s_32x4;
5537 __m128i result1_high_A_s_32x4;
5538 __m128i result1_high_B_s_32x4;
5539
5540 SSE::multiplyInt8x16ToInt32x8(channel0_low_u_16x8, factorChannel10_1024_s_16x8, result1_low_A_s_32x4, result1_low_B_s_32x4);
5541 SSE::multiplyInt8x16ToInt32x8(channel0_high_u_16x8, factorChannel10_1024_s_16x8, result1_high_A_s_32x4, result1_high_B_s_32x4);
5542
5543 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_low_u_16x8, factorChannel11_1024_s_16x8, result1_low_A_s_32x4, result1_low_B_s_32x4);
5544 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_high_u_16x8, factorChannel11_1024_s_16x8, result1_high_A_s_32x4, result1_high_B_s_32x4);
5545
5546 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_low_u_16x8, factorChannel12_1024_s_16x8, result1_low_A_s_32x4, result1_low_B_s_32x4);
5547 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_high_u_16x8, factorChannel12_1024_s_16x8, result1_high_A_s_32x4, result1_high_B_s_32x4);
5548
5549 result1_low_A_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result1_low_A_s_32x4, biasChannel1_1024_s_32x4), 10);
5550 result1_low_B_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result1_low_B_s_32x4, biasChannel1_1024_s_32x4), 10);
5551 result1_high_A_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result1_high_A_s_32x4, biasChannel1_1024_s_32x4), 10);
5552 result1_high_B_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result1_high_B_s_32x4, biasChannel1_1024_s_32x4), 10);
5553
5554
5555 __m128i result2_low_A_s_32x4;
5556 __m128i result2_low_B_s_32x4;
5557 __m128i result2_high_A_s_32x4;
5558 __m128i result2_high_B_s_32x4;
5559
5560 SSE::multiplyInt8x16ToInt32x8(channel0_low_u_16x8, factorChannel20_1024_s_16x8, result2_low_A_s_32x4, result2_low_B_s_32x4);
5561 SSE::multiplyInt8x16ToInt32x8(channel0_high_u_16x8, factorChannel20_1024_s_16x8, result2_high_A_s_32x4, result2_high_B_s_32x4);
5562
5563 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_low_u_16x8, factorChannel21_1024_s_16x8, result2_low_A_s_32x4, result2_low_B_s_32x4);
5564 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_high_u_16x8, factorChannel21_1024_s_16x8, result2_high_A_s_32x4, result2_high_B_s_32x4);
5565
5566 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_low_u_16x8, factorChannel22_1024_s_16x8, result2_low_A_s_32x4, result2_low_B_s_32x4);
5567 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_high_u_16x8, factorChannel22_1024_s_16x8, result2_high_A_s_32x4, result2_high_B_s_32x4);
5568
5569 result2_low_A_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result2_low_A_s_32x4, biasChannel2_1024_s_32x4), 10);
5570 result2_low_B_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result2_low_B_s_32x4, biasChannel2_1024_s_32x4), 10);
5571 result2_high_A_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result2_high_A_s_32x4, biasChannel2_1024_s_32x4), 10);
5572 result2_high_B_s_32x4 = SSE::divideByRightShiftSigned32Bit(_mm_add_epi32(result2_high_B_s_32x4, biasChannel2_1024_s_32x4), 10);
5573
5574
5575 // now we have int32_t values with 0x0000 or 0xFFFF in the high 16 bits
5576 // thus we can merge 8 int32_t values to 8 int16_t values
5577
5578 const __m128i mask_0000FFFF_32x4 = _mm_set1_epi32(0x0000FFFF);
5579
5580 __m128i result0_A_s_16x8 = _mm_or_si128(_mm_and_si128(result0_low_A_s_32x4, mask_0000FFFF_32x4), _mm_slli_epi32(result0_high_A_s_32x4, 16));
5581 __m128i result0_B_s_16x8 = _mm_or_si128(_mm_and_si128(result0_low_B_s_32x4, mask_0000FFFF_32x4), _mm_slli_epi32(result0_high_B_s_32x4, 16));
5582
5583 __m128i result1_A_s_16x8 = _mm_or_si128(_mm_and_si128(result1_low_A_s_32x4, mask_0000FFFF_32x4), _mm_slli_epi32(result1_high_A_s_32x4, 16));
5584 __m128i result1_B_s_16x8 = _mm_or_si128(_mm_and_si128(result1_low_B_s_32x4, mask_0000FFFF_32x4), _mm_slli_epi32(result1_high_B_s_32x4, 16));
5585
5586 __m128i result2_A_s_16x8 = _mm_or_si128(_mm_and_si128(result2_low_A_s_32x4, mask_0000FFFF_32x4), _mm_slli_epi32(result2_high_A_s_32x4, 16));
5587 __m128i result2_B_s_16x8 = _mm_or_si128(_mm_and_si128(result2_low_B_s_32x4, mask_0000FFFF_32x4), _mm_slli_epi32(result2_high_B_s_32x4, 16));
5588
5589
5590 // we combine 16 int16_t values to 16 uint8_t values (saturated)
5591
5592 const __m128i result0_u_8x16 = _mm_packus_epi16(result0_A_s_16x8, result0_B_s_16x8);
5593 const __m128i result1_u_8x16 = _mm_packus_epi16(result1_A_s_16x8, result1_B_s_16x8);
5594 const __m128i result2_u_8x16 = _mm_packus_epi16(result2_A_s_16x8, result2_B_s_16x8);
5595
5596 __m128i resultA_u_8x16;
5597 __m128i resultB_u_8x16;
5598 __m128i resultC_u_8x16;
5599 SSE::interleave3Channel8Bit48Elements(result0_u_8x16, result1_u_8x16, result2_u_8x16, resultA_u_8x16, resultB_u_8x16, resultC_u_8x16);
5600
5601 // and we can store the result
5602 _mm_storeu_si128((__m128i*)target + 0, resultA_u_8x16);
5603 _mm_storeu_si128((__m128i*)target + 1, resultB_u_8x16);
5604 _mm_storeu_si128((__m128i*)target + 2, resultC_u_8x16);
5605}
5606
5607OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels16Pixels8BitPerChannel6BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_64_s_16x8, const __m128i& factorChannel10_64_s_16x8, const __m128i& factorChannel20_64_s_16x8, const __m128i& factorChannel01_64_s_16x8, const __m128i& factorChannel11_64_s_16x8, const __m128i& factorChannel21_64_s_16x8, const __m128i& factorChannel02_64_s_16x8, const __m128i& factorChannel12_64_s_16x8, const __m128i& factorChannel22_64_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8)
5608{
5609 ocean_assert(source != nullptr && target != nullptr);
5610
5611 // the documentation of this function designed for YUV24 to RGB24 conversion
5612
5613 // precise color space conversion:
5614 // | R | | 1 0.0 1.370705 -175.45024 | | Y |
5615 // | G | = | 1 -0.3376335 -0.698001 132.561152 | * | U |
5616 // | B | | 1 1.732446 0.0 -221.753088 | | V |
5617 // | 1 |
5618
5619 // approximation:
5620 // R = 64 * Y + 0 * (U - 128) + 88 * (V - 128)
5621 // G = 64 * Y - 22 * (U - 128) - 45 * (V - 128)
5622 // B = 64 * Y + 111 * (U - 128) + 0 * (V - 128)
5623
5624 // we expect the following input pattern (for here YUV24):
5625 // FEDC BA98 7654 3210 FEDC BA98 7654 3210 FEDC BA98 7654 3210
5626 // VUYX VUYX VUYX VUYX VUYX VUYX VUYX VUYX VUYX VUYX VUYX VUYX
5627
5628 const __m128i sourceA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
5629 const __m128i sourceB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
5630 const __m128i sourceC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
5631
5632 __m128i channel0_u_8x16;
5633 __m128i channel1_u_8x16;
5634 __m128i channel2_u_8x16;
5635 SSE::deInterleave3Channel8Bit48Elements(sourceA_u_8x16, sourceB_u_8x16, sourceC_u_8x16, channel0_u_8x16, channel1_u_8x16, channel2_u_8x16);
5636
5637 // subtract the bias values and convert to signed 16 bit
5638
5639 const __m128i channel0_low_s_16x8 = _mm_sub_epi16(_mm_unpacklo_epi8(channel0_u_8x16, _mm_setzero_si128()), biasChannel0_s_16x8);
5640 const __m128i channel1_low_s_16x8 = _mm_sub_epi16(_mm_unpacklo_epi8(channel1_u_8x16, _mm_setzero_si128()), biasChannel1_s_16x8);
5641 const __m128i channel2_low_s_16x8 = _mm_sub_epi16(_mm_unpacklo_epi8(channel2_u_8x16, _mm_setzero_si128()), biasChannel2_s_16x8);
5642
5643 const __m128i channel0_high_s_16x8 = _mm_sub_epi16(_mm_unpackhi_epi8(channel0_u_8x16, _mm_setzero_si128()), biasChannel0_s_16x8);
5644 const __m128i channel1_high_s_16x8 = _mm_sub_epi16(_mm_unpackhi_epi8(channel1_u_8x16, _mm_setzero_si128()), biasChannel1_s_16x8);
5645 const __m128i channel2_high_s_16x8 = _mm_sub_epi16(_mm_unpackhi_epi8(channel2_u_8x16, _mm_setzero_si128()), biasChannel2_s_16x8);
5646
5647 // we multiply each channel with the corresponding multiplication factors
5648
5649 __m128i result0_low_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_s_16x8, factorChannel00_64_s_16x8), _mm_mullo_epi16(channel1_low_s_16x8, factorChannel01_64_s_16x8)), _mm_mullo_epi16(channel2_low_s_16x8, factorChannel02_64_s_16x8));
5650 __m128i result1_low_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_s_16x8, factorChannel10_64_s_16x8), _mm_mullo_epi16(channel1_low_s_16x8, factorChannel11_64_s_16x8)), _mm_mullo_epi16(channel2_low_s_16x8, factorChannel12_64_s_16x8));
5651 __m128i result2_low_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_s_16x8, factorChannel20_64_s_16x8), _mm_mullo_epi16(channel1_low_s_16x8, factorChannel21_64_s_16x8)), _mm_mullo_epi16(channel2_low_s_16x8, factorChannel22_64_s_16x8));
5652
5653 __m128i result0_high_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_s_16x8, factorChannel00_64_s_16x8), _mm_mullo_epi16(channel1_high_s_16x8, factorChannel01_64_s_16x8)), _mm_mullo_epi16(channel2_high_s_16x8, factorChannel02_64_s_16x8));
5654 __m128i result1_high_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_s_16x8, factorChannel10_64_s_16x8), _mm_mullo_epi16(channel1_high_s_16x8, factorChannel11_64_s_16x8)), _mm_mullo_epi16(channel2_high_s_16x8, factorChannel12_64_s_16x8));
5655 __m128i result2_high_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_s_16x8, factorChannel20_64_s_16x8), _mm_mullo_epi16(channel1_high_s_16x8, factorChannel21_64_s_16x8)), _mm_mullo_epi16(channel2_high_s_16x8, factorChannel22_64_s_16x8));
5656
5657 // we normalize the result by 64
5658
5659 result0_low_s_16x8 = SSE::divideByRightShiftSigned16Bit(result0_low_s_16x8, 6);
5660 result1_low_s_16x8 = SSE::divideByRightShiftSigned16Bit(result1_low_s_16x8, 6);
5661 result2_low_s_16x8 = SSE::divideByRightShiftSigned16Bit(result2_low_s_16x8, 6);
5662
5663 result0_high_s_16x8 = SSE::divideByRightShiftSigned16Bit(result0_high_s_16x8, 6);
5664 result1_high_s_16x8 = SSE::divideByRightShiftSigned16Bit(result1_high_s_16x8, 6);
5665 result2_high_s_16x8 = SSE::divideByRightShiftSigned16Bit(result2_high_s_16x8, 6);
5666
5667 // we combine 16 int16_t values to 16 uint8_t values (saturated to [0, 255])
5668 const __m128i result0_u_8x16 = _mm_packus_epi16(result0_low_s_16x8, result0_high_s_16x8);
5669 const __m128i result1_u_8x16 = _mm_packus_epi16(result1_low_s_16x8, result1_high_s_16x8);
5670 const __m128i result2_u_8x16 = _mm_packus_epi16(result2_low_s_16x8, result2_high_s_16x8);
5671
5672 __m128i resultA_u_8x16;
5673 __m128i resultB_u_8x16;
5674 __m128i resultC_u_8x16;
5675 SSE::interleave3Channel8Bit48Elements(result0_u_8x16, result1_u_8x16, result2_u_8x16, resultA_u_8x16, resultB_u_8x16, resultC_u_8x16);
5676
5677 // and we can store the result
5678 _mm_storeu_si128((__m128i*)target + 0, resultA_u_8x16);
5679 _mm_storeu_si128((__m128i*)target + 1, resultB_u_8x16);
5680 _mm_storeu_si128((__m128i*)target + 2, resultC_u_8x16);
5681}
5682
5683OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo4Channels16Pixels8BitPerChannel6BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_64_s_16x8, const __m128i& factorChannel10_64_s_16x8, const __m128i& factorChannel20_64_s_16x8, const __m128i& factorChannel01_64_s_16x8, const __m128i& factorChannel11_64_s_16x8, const __m128i& factorChannel21_64_s_16x8, const __m128i& factorChannel02_64_s_16x8, const __m128i& factorChannel12_64_s_16x8, const __m128i& factorChannel22_64_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8, const __m128i& channelValue3_u_8x16)
5684{
5685 ocean_assert(source != nullptr && target != nullptr);
5686
5687 // the documentation of this function designed for YUV24 to RGBA32 conversion
5688
5689 // precise color space conversion:
5690 // | R | | 1 0.0 1.370705 -175.45024 | | Y |
5691 // | G | = | 1 -0.3376335 -0.698001 132.561152 | * | U |
5692 // | B | | 1 1.732446 0.0 -221.753088 | | V |
5693 // | 1 |
5694
5695 // approximation:
5696 // R = 64 * Y + 0 * (U - 128) + 88 * (V - 128)
5697 // G = 64 * Y - 22 * (U - 128) - 45 * (V - 128)
5698 // B = 64 * Y + 111 * (U - 128) + 0 * (V - 128)
5699
5700 const __m128i sourceA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
5701 const __m128i sourceB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
5702 const __m128i sourceC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
5703
5704 __m128i channel0_u_8x16;
5705 __m128i channel1_u_8x16;
5706 __m128i channel2_u_8x16;
5707 SSE::deInterleave3Channel8Bit48Elements(sourceA_u_8x16, sourceB_u_8x16, sourceC_u_8x16, channel0_u_8x16, channel1_u_8x16, channel2_u_8x16);
5708
5709 // subtract the bias values and convert to signed 16 bit
5710
5711 const __m128i channel0_low_s_16x8 = _mm_sub_epi16(_mm_unpacklo_epi8(channel0_u_8x16, _mm_setzero_si128()), biasChannel0_s_16x8);
5712 const __m128i channel1_low_s_16x8 = _mm_sub_epi16(_mm_unpacklo_epi8(channel1_u_8x16, _mm_setzero_si128()), biasChannel1_s_16x8);
5713 const __m128i channel2_low_s_16x8 = _mm_sub_epi16(_mm_unpacklo_epi8(channel2_u_8x16, _mm_setzero_si128()), biasChannel2_s_16x8);
5714
5715 const __m128i channel0_high_s_16x8 = _mm_sub_epi16(_mm_unpackhi_epi8(channel0_u_8x16, _mm_setzero_si128()), biasChannel0_s_16x8);
5716 const __m128i channel1_high_s_16x8 = _mm_sub_epi16(_mm_unpackhi_epi8(channel1_u_8x16, _mm_setzero_si128()), biasChannel1_s_16x8);
5717 const __m128i channel2_high_s_16x8 = _mm_sub_epi16(_mm_unpackhi_epi8(channel2_u_8x16, _mm_setzero_si128()), biasChannel2_s_16x8);
5718
5719 // we multiply each channel with the corresponding multiplication factors
5720
5721 __m128i result0_low_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_s_16x8, factorChannel00_64_s_16x8), _mm_mullo_epi16(channel1_low_s_16x8, factorChannel01_64_s_16x8)), _mm_mullo_epi16(channel2_low_s_16x8, factorChannel02_64_s_16x8));
5722 __m128i result1_low_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_s_16x8, factorChannel10_64_s_16x8), _mm_mullo_epi16(channel1_low_s_16x8, factorChannel11_64_s_16x8)), _mm_mullo_epi16(channel2_low_s_16x8, factorChannel12_64_s_16x8));
5723 __m128i result2_low_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_low_s_16x8, factorChannel20_64_s_16x8), _mm_mullo_epi16(channel1_low_s_16x8, factorChannel21_64_s_16x8)), _mm_mullo_epi16(channel2_low_s_16x8, factorChannel22_64_s_16x8));
5724
5725 __m128i result0_high_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_s_16x8, factorChannel00_64_s_16x8), _mm_mullo_epi16(channel1_high_s_16x8, factorChannel01_64_s_16x8)), _mm_mullo_epi16(channel2_high_s_16x8, factorChannel02_64_s_16x8));
5726 __m128i result1_high_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_s_16x8, factorChannel10_64_s_16x8), _mm_mullo_epi16(channel1_high_s_16x8, factorChannel11_64_s_16x8)), _mm_mullo_epi16(channel2_high_s_16x8, factorChannel12_64_s_16x8));
5727 __m128i result2_high_s_16x8 = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(channel0_high_s_16x8, factorChannel20_64_s_16x8), _mm_mullo_epi16(channel1_high_s_16x8, factorChannel21_64_s_16x8)), _mm_mullo_epi16(channel2_high_s_16x8, factorChannel22_64_s_16x8));
5728
5729 // we normalize the result by 64
5730
5731 result0_low_s_16x8 = SSE::divideByRightShiftSigned16Bit(result0_low_s_16x8, 6);
5732 result1_low_s_16x8 = SSE::divideByRightShiftSigned16Bit(result1_low_s_16x8, 6);
5733 result2_low_s_16x8 = SSE::divideByRightShiftSigned16Bit(result2_low_s_16x8, 6);
5734
5735 result0_high_s_16x8 = SSE::divideByRightShiftSigned16Bit(result0_high_s_16x8, 6);
5736 result1_high_s_16x8 = SSE::divideByRightShiftSigned16Bit(result1_high_s_16x8, 6);
5737 result2_high_s_16x8 = SSE::divideByRightShiftSigned16Bit(result2_high_s_16x8, 6);
5738
5739 // we combine 16 int16_t values to 16 uint8_t values (saturated to [0, 255])
5740 const __m128i result0_u_8x16 = _mm_packus_epi16(result0_low_s_16x8, result0_high_s_16x8);
5741 const __m128i result1_u_8x16 = _mm_packus_epi16(result1_low_s_16x8, result1_high_s_16x8);
5742 const __m128i result2_u_8x16 = _mm_packus_epi16(result2_low_s_16x8, result2_high_s_16x8);
5743
5744 // interleave 4 channels with constant alpha
5745
5746 const __m128i result01_low_u_8x16 = _mm_unpacklo_epi8(result0_u_8x16, result1_u_8x16);
5747 const __m128i result01_high_u_8x16 = _mm_unpackhi_epi8(result0_u_8x16, result1_u_8x16);
5748 const __m128i result23_low_u_8x16 = _mm_unpacklo_epi8(result2_u_8x16, channelValue3_u_8x16);
5749 const __m128i result23_high_u_8x16 = _mm_unpackhi_epi8(result2_u_8x16, channelValue3_u_8x16);
5750
5751 const __m128i resultA_u_8x16 = _mm_unpacklo_epi16(result01_low_u_8x16, result23_low_u_8x16);
5752 const __m128i resultB_u_8x16 = _mm_unpackhi_epi16(result01_low_u_8x16, result23_low_u_8x16);
5753 const __m128i resultC_u_8x16 = _mm_unpacklo_epi16(result01_high_u_8x16, result23_high_u_8x16);
5754 const __m128i resultD_u_8x16 = _mm_unpackhi_epi16(result01_high_u_8x16, result23_high_u_8x16);
5755
5756 // and we can store the result
5757 _mm_storeu_si128((__m128i*)target + 0, resultA_u_8x16);
5758 _mm_storeu_si128((__m128i*)target + 1, resultB_u_8x16);
5759 _mm_storeu_si128((__m128i*)target + 2, resultC_u_8x16);
5760 _mm_storeu_si128((__m128i*)target + 3, resultD_u_8x16);
5761}
5762
5763OCEAN_FORCE_INLINE void FrameChannels::convert4ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& factorChannel00_128_s_16x8, const __m128i& factorChannel10_128_s_16x8, const __m128i& factorChannel20_128_s_16x8, const __m128i& factorChannel01_128_s_16x8, const __m128i& factorChannel11_128_s_16x8, const __m128i& factorChannel21_128_s_16x8, const __m128i& factorChannel02_128_s_16x8, const __m128i& factorChannel12_128_s_16x8, const __m128i& factorChannel22_128_s_16x8, const __m128i& factorChannel03_128_s_16x8, const __m128i& factorChannel13_128_s_16x8, const __m128i& factorChannel23_128_s_16x8, const __m128i& biasChannel0_s_16x8, const __m128i& biasChannel1_s_16x8, const __m128i& biasChannel2_s_16x8)
5764{
5765 ocean_assert(source != nullptr && target != nullptr);
5766
5767 // the documentation of this function designed for YUVA32 to RGB24 conversion
5768
5769 // we expect the following input pattern (for here YUVA32):
5770 // FEDC BA98 7654 3210
5771 // AYUV AYUV AYUV AYUV
5772
5773 const __m128i pixelsA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
5774 const __m128i pixelsB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
5775 const __m128i pixelsC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
5776 const __m128i pixelsD_u_8x16 = _mm_loadu_si128((const __m128i*)source + 3);
5777
5778 // deinterleave the 4 channels to get separated channels
5779 // deinterleaving 4 channels from 64 bytes to 4x16 bytes
5780 // The high 64 bits of the shuffle mask use 0xFF to produce zeros,
5781 // so only the low 4 16-bit positions contain valid data
5782 const __m128i shuffle0 = SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0xFF0cFF08FF04FF00ull);
5783 const __m128i shuffle1 = SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0xFF0dFF09FF05FF01ull);
5784 const __m128i shuffle2 = SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0xFF0eFF0aFF06FF02ull);
5785 const __m128i shuffle3 = SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0xFF0fFF0bFF07FF03ull);
5786
5787 // extract channel 0 values: 0x 0x 0x 0x 0x 0x 0x 0x
5788 const __m128i channel0A_u_16x8 = _mm_shuffle_epi8(pixelsA_u_8x16, shuffle0);
5789 const __m128i channel0B_u_16x8 = _mm_shuffle_epi8(pixelsB_u_8x16, shuffle0);
5790 const __m128i channel0C_u_16x8 = _mm_shuffle_epi8(pixelsC_u_8x16, shuffle0);
5791 const __m128i channel0D_u_16x8 = _mm_shuffle_epi8(pixelsD_u_8x16, shuffle0);
5792
5793 // extract channel 1 values
5794 const __m128i channel1A_u_16x8 = _mm_shuffle_epi8(pixelsA_u_8x16, shuffle1);
5795 const __m128i channel1B_u_16x8 = _mm_shuffle_epi8(pixelsB_u_8x16, shuffle1);
5796 const __m128i channel1C_u_16x8 = _mm_shuffle_epi8(pixelsC_u_8x16, shuffle1);
5797 const __m128i channel1D_u_16x8 = _mm_shuffle_epi8(pixelsD_u_8x16, shuffle1);
5798
5799 // extract channel 2 values
5800 const __m128i channel2A_u_16x8 = _mm_shuffle_epi8(pixelsA_u_8x16, shuffle2);
5801 const __m128i channel2B_u_16x8 = _mm_shuffle_epi8(pixelsB_u_8x16, shuffle2);
5802 const __m128i channel2C_u_16x8 = _mm_shuffle_epi8(pixelsC_u_8x16, shuffle2);
5803 const __m128i channel2D_u_16x8 = _mm_shuffle_epi8(pixelsD_u_8x16, shuffle2);
5804
5805 // extract channel 3 values
5806 const __m128i channel3A_u_16x8 = _mm_shuffle_epi8(pixelsA_u_8x16, shuffle3);
5807 const __m128i channel3B_u_16x8 = _mm_shuffle_epi8(pixelsB_u_8x16, shuffle3);
5808 const __m128i channel3C_u_16x8 = _mm_shuffle_epi8(pixelsC_u_8x16, shuffle3);
5809 const __m128i channel3D_u_16x8 = _mm_shuffle_epi8(pixelsD_u_8x16, shuffle3);
5810
5811 // combine to 8 values each: 0c 0c 0c 0c 0a 0a 0a 0a
5812 const __m128i channel0_low_u_16x8 = _mm_or_si128(channel0A_u_16x8, _mm_slli_si128(channel0B_u_16x8, 8));
5813 const __m128i channel0_high_u_16x8 = _mm_or_si128(channel0C_u_16x8, _mm_slli_si128(channel0D_u_16x8, 8));
5814
5815 const __m128i channel1_low_u_16x8 = _mm_or_si128(channel1A_u_16x8, _mm_slli_si128(channel1B_u_16x8, 8));
5816 const __m128i channel1_high_u_16x8 = _mm_or_si128(channel1C_u_16x8, _mm_slli_si128(channel1D_u_16x8, 8));
5817
5818 const __m128i channel2_low_u_16x8 = _mm_or_si128(channel2A_u_16x8, _mm_slli_si128(channel2B_u_16x8, 8));
5819 const __m128i channel2_high_u_16x8 = _mm_or_si128(channel2C_u_16x8, _mm_slli_si128(channel2D_u_16x8, 8));
5820
5821 const __m128i channel3_low_u_16x8 = _mm_or_si128(channel3A_u_16x8, _mm_slli_si128(channel3B_u_16x8, 8));
5822 const __m128i channel3_high_u_16x8 = _mm_or_si128(channel3C_u_16x8, _mm_slli_si128(channel3D_u_16x8, 8));
5823
5824 // We need to use 32-bit intermediate results to avoid overflow in the 4-channel case
5825 // when factors have opposing signs, the sum of 4 products can exceed INT16_MAX
5826
5827 // Process result channel 0 with 32-bit intermediates
5828 __m128i result0_low_A_s_32x4;
5829 __m128i result0_low_B_s_32x4;
5830 SSE::multiplyInt8x16ToInt32x8(channel0_low_u_16x8, factorChannel00_128_s_16x8, result0_low_A_s_32x4, result0_low_B_s_32x4);
5831 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_low_u_16x8, factorChannel01_128_s_16x8, result0_low_A_s_32x4, result0_low_B_s_32x4);
5832 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_low_u_16x8, factorChannel02_128_s_16x8, result0_low_A_s_32x4, result0_low_B_s_32x4);
5833 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel3_low_u_16x8, factorChannel03_128_s_16x8, result0_low_A_s_32x4, result0_low_B_s_32x4);
5834
5835 __m128i result0_high_A_s_32x4;
5836 __m128i result0_high_B_s_32x4;
5837 SSE::multiplyInt8x16ToInt32x8(channel0_high_u_16x8, factorChannel00_128_s_16x8, result0_high_A_s_32x4, result0_high_B_s_32x4);
5838 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_high_u_16x8, factorChannel01_128_s_16x8, result0_high_A_s_32x4, result0_high_B_s_32x4);
5839 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_high_u_16x8, factorChannel02_128_s_16x8, result0_high_A_s_32x4, result0_high_B_s_32x4);
5840 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel3_high_u_16x8, factorChannel03_128_s_16x8, result0_high_A_s_32x4, result0_high_B_s_32x4);
5841
5842 // Process result channel 1 with 32-bit intermediates
5843 __m128i result1_low_A_s_32x4;
5844 __m128i result1_low_B_s_32x4;
5845 SSE::multiplyInt8x16ToInt32x8(channel0_low_u_16x8, factorChannel10_128_s_16x8, result1_low_A_s_32x4, result1_low_B_s_32x4);
5846 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_low_u_16x8, factorChannel11_128_s_16x8, result1_low_A_s_32x4, result1_low_B_s_32x4);
5847 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_low_u_16x8, factorChannel12_128_s_16x8, result1_low_A_s_32x4, result1_low_B_s_32x4);
5848 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel3_low_u_16x8, factorChannel13_128_s_16x8, result1_low_A_s_32x4, result1_low_B_s_32x4);
5849
5850 __m128i result1_high_A_s_32x4;
5851 __m128i result1_high_B_s_32x4;
5852 SSE::multiplyInt8x16ToInt32x8(channel0_high_u_16x8, factorChannel10_128_s_16x8, result1_high_A_s_32x4, result1_high_B_s_32x4);
5853 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_high_u_16x8, factorChannel11_128_s_16x8, result1_high_A_s_32x4, result1_high_B_s_32x4);
5854 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_high_u_16x8, factorChannel12_128_s_16x8, result1_high_A_s_32x4, result1_high_B_s_32x4);
5855 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel3_high_u_16x8, factorChannel13_128_s_16x8, result1_high_A_s_32x4, result1_high_B_s_32x4);
5856
5857 // Process result channel 2 with 32-bit intermediates
5858 __m128i result2_low_A_s_32x4;
5859 __m128i result2_low_B_s_32x4;
5860 SSE::multiplyInt8x16ToInt32x8(channel0_low_u_16x8, factorChannel20_128_s_16x8, result2_low_A_s_32x4, result2_low_B_s_32x4);
5861 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_low_u_16x8, factorChannel21_128_s_16x8, result2_low_A_s_32x4, result2_low_B_s_32x4);
5862 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_low_u_16x8, factorChannel22_128_s_16x8, result2_low_A_s_32x4, result2_low_B_s_32x4);
5863 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel3_low_u_16x8, factorChannel23_128_s_16x8, result2_low_A_s_32x4, result2_low_B_s_32x4);
5864
5865 __m128i result2_high_A_s_32x4;
5866 __m128i result2_high_B_s_32x4;
5867 SSE::multiplyInt8x16ToInt32x8(channel0_high_u_16x8, factorChannel20_128_s_16x8, result2_high_A_s_32x4, result2_high_B_s_32x4);
5868 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel1_high_u_16x8, factorChannel21_128_s_16x8, result2_high_A_s_32x4, result2_high_B_s_32x4);
5869 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel2_high_u_16x8, factorChannel22_128_s_16x8, result2_high_A_s_32x4, result2_high_B_s_32x4);
5870 SSE::multiplyInt8x16ToInt32x8AndAccumulate(channel3_high_u_16x8, factorChannel23_128_s_16x8, result2_high_A_s_32x4, result2_high_B_s_32x4);
5871
5872 // Convert bias from 16-bit to 32-bit
5873 const __m128i biasChannel0_s_32x4 = _mm_cvtepi16_epi32(biasChannel0_s_16x8);
5874 const __m128i biasChannel1_s_32x4 = _mm_cvtepi16_epi32(biasChannel1_s_16x8);
5875 const __m128i biasChannel2_s_32x4 = _mm_cvtepi16_epi32(biasChannel2_s_16x8);
5876
5877 // Normalize by 128 (divide) and then add bias
5878 result0_low_A_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result0_low_A_s_32x4, 7), biasChannel0_s_32x4);
5879 result0_low_B_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result0_low_B_s_32x4, 7), biasChannel0_s_32x4);
5880 result0_high_A_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result0_high_A_s_32x4, 7), biasChannel0_s_32x4);
5881 result0_high_B_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result0_high_B_s_32x4, 7), biasChannel0_s_32x4);
5882
5883 result1_low_A_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result1_low_A_s_32x4, 7), biasChannel1_s_32x4);
5884 result1_low_B_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result1_low_B_s_32x4, 7), biasChannel1_s_32x4);
5885 result1_high_A_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result1_high_A_s_32x4, 7), biasChannel1_s_32x4);
5886 result1_high_B_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result1_high_B_s_32x4, 7), biasChannel1_s_32x4);
5887
5888 result2_low_A_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result2_low_A_s_32x4, 7), biasChannel2_s_32x4);
5889 result2_low_B_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result2_low_B_s_32x4, 7), biasChannel2_s_32x4);
5890 result2_high_A_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result2_high_A_s_32x4, 7), biasChannel2_s_32x4);
5891 result2_high_B_s_32x4 = _mm_add_epi32(SSE::divideByRightShiftSigned32Bit(result2_high_B_s_32x4, 7), biasChannel2_s_32x4);
5892
5893 // Pack 32-bit results to 16-bit using signed saturation
5894 // _mm_packs_epi32 takes two __m128i with 4 int32 each and produces one __m128i with 8 int16
5895 // Order: A[0], A[1], A[2], A[3], B[0], B[1], B[2], B[3]
5896 const __m128i result0_low_s_16x8 = _mm_packs_epi32(result0_low_A_s_32x4, result0_low_B_s_32x4);
5897 const __m128i result0_high_s_16x8 = _mm_packs_epi32(result0_high_A_s_32x4, result0_high_B_s_32x4);
5898
5899 const __m128i result1_low_s_16x8 = _mm_packs_epi32(result1_low_A_s_32x4, result1_low_B_s_32x4);
5900 const __m128i result1_high_s_16x8 = _mm_packs_epi32(result1_high_A_s_32x4, result1_high_B_s_32x4);
5901
5902 const __m128i result2_low_s_16x8 = _mm_packs_epi32(result2_low_A_s_32x4, result2_low_B_s_32x4);
5903 const __m128i result2_high_s_16x8 = _mm_packs_epi32(result2_high_A_s_32x4, result2_high_B_s_32x4);
5904
5905 // we combine 16 int16_t values to 16 uint8_t values (saturated to [0, 255])
5906 const __m128i result0_u_8x16 = _mm_packus_epi16(result0_low_s_16x8, result0_high_s_16x8);
5907 const __m128i result1_u_8x16 = _mm_packus_epi16(result1_low_s_16x8, result1_high_s_16x8);
5908 const __m128i result2_u_8x16 = _mm_packus_epi16(result2_low_s_16x8, result2_high_s_16x8);
5909
5910 __m128i resultA_u_8x16;
5911 __m128i resultB_u_8x16;
5912 __m128i resultC_u_8x16;
5913 SSE::interleave3Channel8Bit48Elements(result0_u_8x16, result1_u_8x16, result2_u_8x16, resultA_u_8x16, resultB_u_8x16, resultC_u_8x16);
5914
5915 // and we can store the result
5916 _mm_storeu_si128((__m128i*)target + 0, resultA_u_8x16);
5917 _mm_storeu_si128((__m128i*)target + 1, resultB_u_8x16);
5918 _mm_storeu_si128((__m128i*)target + 2, resultC_u_8x16);
5919}
5920
5921OCEAN_FORCE_INLINE void FrameChannels::convert4ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& multiplicationFactors0123_128_s_32x4)
5922{
5923 ocean_assert(source != nullptr && target != nullptr);
5924
5925 // the documentation of this function is designed for RGBA32 to Y8 conversion
5926 // however, in general this function can be used to apply a linear combination on the four source channels
5927 // to create one output channel
5928
5929 // precise color space conversion:
5930 // Y = 0.299 * R + 0.587 * G + 0.114 * B
5931
5932 // approximation:
5933 // Y = (38 * R + 75 * G + 15 * B) / 128
5934
5935 // we expect the following input pattern (for here RGBA32):
5936 // FEDC BA98 7654 3210
5937 // ABGR ABGR ABGR ABGR
5938
5939 // we calculate:
5940 // (int16_t)((uint8_t)R * (signed char)38) + (int16_t)((uint8_t)G * (signed char)75) for the first 16 bits
5941 // (int16_t)((uint8_t)B * (signed char)15) + (int16_t)((uint8_t)A * (signed char)0) for the second 16 bits
5942
5943 // we store eight 16 bit values holding 64 for rounding purpose:
5944 // FE DC BA 98 76 54 32 10
5945 // 64 64 64 64 64 64 64 64
5946 const __m128i constant64_u_8x16 = _mm_set1_epi32(0x00400040);
5947
5948 const __m128i pixelsA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
5949 const __m128i pixelsB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
5950 const __m128i pixelsC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
5951 const __m128i pixelsD_u_8x16 = _mm_loadu_si128((const __m128i*)source + 3);
5952
5953 // we get the following pattern
5954 // FE DC BA 98 76 54 32 10
5955 // 0b gr 0b gr 0b gr 0b gr
5956 const __m128i intermediateResults0_u_16x8 = _mm_maddubs_epi16(pixelsA_u_8x16, multiplicationFactors0123_128_s_32x4);
5957 const __m128i intermediateResults1_u_16x8 = _mm_maddubs_epi16(pixelsB_u_8x16, multiplicationFactors0123_128_s_32x4);
5958 const __m128i intermediateResults2_u_16x8 = _mm_maddubs_epi16(pixelsC_u_8x16, multiplicationFactors0123_128_s_32x4);
5959 const __m128i intermediateResults3_u_16x8 = _mm_maddubs_epi16(pixelsD_u_8x16, multiplicationFactors0123_128_s_32x4);
5960
5961 // now we sum the pairs of neighboring 16 bit intermediate results
5962 __m128i grayA_u_16x8 = _mm_hadd_epi16(intermediateResults0_u_16x8, intermediateResults1_u_16x8);
5963 __m128i grayB_u_16x8 = _mm_hadd_epi16(intermediateResults2_u_16x8, intermediateResults3_u_16x8);
5964
5965 // we add 64 for rounding purpose
5966 grayA_u_16x8 = _mm_add_epi16(grayA_u_16x8, constant64_u_8x16);
5967 grayB_u_16x8 = _mm_add_epi16(grayB_u_16x8, constant64_u_8x16);
5968
5969 // we shift the multiplication results by 7 bits (= 128)
5970 grayA_u_16x8 = _mm_srli_epi16(grayA_u_16x8, 7);
5971 grayB_u_16x8 = _mm_srli_epi16(grayB_u_16x8, 7);
5972
5973 // now we have the following pattern (in two 128 bit registers):
5974 // FEDCBA9876543210
5975 // 0Y0Y0Y0Y0Y0Y0Y0Y
5976
5977 // finally, we have to get rid of the upper zero bits by combining two 128 bit registers to one:
5978 const __m128i gray_u_8x16 = _mm_packus_epi16(grayA_u_16x8, grayB_u_16x8);
5979
5980 // and we can store the result
5981 _mm_storeu_si128((__m128i*)target, gray_u_8x16);
5982}
5983
5984void FrameChannels::convert4ChannelsTo2Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t* const source, uint8_t* const target, const __m128i& multiplicationFactorsChannel0_0123_128_s_16x8, const __m128i& multiplicationFactorsChannel1_0123_128_s_16x8)
5985{
5986 ocean_assert(source != nullptr && target != nullptr);
5987
5988 // the documentation of this function is designed for RGBA32 to YA16 conversion
5989 // however, in general this function can be used to apply a linear combination on the four source channels
5990 // to create one output channel
5991
5992 // precise color space conversion:
5993 // Y = 0.299 * R + 0.587 * G + 0.114 * B + 0.0 * A
5994 // A = 0.0 * R + 0.0 * G + 0.0 * B + 1.0 * A
5995
5996 // approximation:
5997 // Y = (38 * R + 75 * G + 15 * B + 0 * A) / 128
5998 // A = (128 * A) / 128
5999
6000 // we expect the following input pattern (for here RGBA32):
6001 // FEDC BA98 7654 3210
6002 // ABGR ABGR ABGR ABGR
6003
6004 // we store eight 16 bit values holding 64 for rounding purpose:
6005 // FE DC BA 98 76 54 32 10
6006 // 64 64 64 64 64 64 64 64
6007 const __m128i constant64_u_8x16 = _mm_set1_epi32(0x00400040);
6008
6009 const __m128i pixelsA_u_8x16 = _mm_loadu_si128((const __m128i*)source + 0);
6010 const __m128i pixelsB_u_8x16 = _mm_loadu_si128((const __m128i*)source + 1);
6011 const __m128i pixelsC_u_8x16 = _mm_loadu_si128((const __m128i*)source + 2);
6012 const __m128i pixelsD_u_8x16 = _mm_loadu_si128((const __m128i*)source + 3);
6013
6014 // we convert the 8 bit values to 16 bit values
6015
6016 const __m128i pixelsA_u_16x8 = _mm_unpacklo_epi8(pixelsA_u_8x16, _mm_setzero_si128());
6017 const __m128i pixelsB_u_16x8 = _mm_unpackhi_epi8(pixelsA_u_8x16, _mm_setzero_si128());
6018
6019 const __m128i pixelsC_u_16x8 = _mm_unpacklo_epi8(pixelsB_u_8x16, _mm_setzero_si128());
6020 const __m128i pixelsD_u_16x8 = _mm_unpackhi_epi8(pixelsB_u_8x16, _mm_setzero_si128());
6021
6022 const __m128i pixelsE_u_16x8 = _mm_unpacklo_epi8(pixelsC_u_8x16, _mm_setzero_si128());
6023 const __m128i pixelsF_u_16x8 = _mm_unpackhi_epi8(pixelsC_u_8x16, _mm_setzero_si128());
6024
6025 const __m128i pixelsG_u_16x8 = _mm_unpacklo_epi8(pixelsD_u_8x16, _mm_setzero_si128());
6026 const __m128i pixelsH_u_16x8 = _mm_unpackhi_epi8(pixelsD_u_8x16, _mm_setzero_si128());
6027
6028 // now we have the following pattern
6029 // FE DC BA 98 76 54 32 10
6030 // 0a 0b 0g 0r 0a 0b 0g 0r
6031
6032 const __m128i intermediateResultsChannel0_0_u_32x4 = _mm_madd_epi16(pixelsA_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8); // r * f00 + g * f01 | b * f02 + a * f03 | ...
6033 const __m128i intermediateResultsChannel0_1_u_32x4 = _mm_madd_epi16(pixelsB_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8);
6034 const __m128i intermediateResultsChannel0_2_u_32x4 = _mm_madd_epi16(pixelsC_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8);
6035 const __m128i intermediateResultsChannel0_3_u_32x4 = _mm_madd_epi16(pixelsD_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8);
6036 const __m128i intermediateResultsChannel0_4_u_32x4 = _mm_madd_epi16(pixelsE_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8);
6037 const __m128i intermediateResultsChannel0_5_u_32x4 = _mm_madd_epi16(pixelsF_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8);
6038 const __m128i intermediateResultsChannel0_6_u_32x4 = _mm_madd_epi16(pixelsG_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8);
6039 const __m128i intermediateResultsChannel0_7_u_32x4 = _mm_madd_epi16(pixelsH_u_16x8, multiplicationFactorsChannel0_0123_128_s_16x8);
6040
6041 const __m128i resultsChannel0_A_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel0_0_u_32x4, intermediateResultsChannel0_1_u_32x4); // r * f00 + g * f01 + b * f02 + a * f03 | ...
6042 const __m128i resultsChannel0_B_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel0_2_u_32x4, intermediateResultsChannel0_3_u_32x4);
6043 const __m128i resultsChannel0_C_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel0_4_u_32x4, intermediateResultsChannel0_5_u_32x4);
6044 const __m128i resultsChannel0_D_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel0_6_u_32x4, intermediateResultsChannel0_7_u_32x4);
6045
6046
6047 const __m128i intermediateResultsChannel1_0_u_32x4 = _mm_madd_epi16(pixelsA_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8); // r * f10 + g * f11 | b * f12 + a * f13 | ...
6048 const __m128i intermediateResultsChannel1_1_u_32x4 = _mm_madd_epi16(pixelsB_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8);
6049 const __m128i intermediateResultsChannel1_2_u_32x4 = _mm_madd_epi16(pixelsC_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8);
6050 const __m128i intermediateResultsChannel1_3_u_32x4 = _mm_madd_epi16(pixelsD_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8);
6051 const __m128i intermediateResultsChannel1_4_u_32x4 = _mm_madd_epi16(pixelsE_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8);
6052 const __m128i intermediateResultsChannel1_5_u_32x4 = _mm_madd_epi16(pixelsF_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8);
6053 const __m128i intermediateResultsChannel1_6_u_32x4 = _mm_madd_epi16(pixelsG_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8);
6054 const __m128i intermediateResultsChannel1_7_u_32x4 = _mm_madd_epi16(pixelsH_u_16x8, multiplicationFactorsChannel1_0123_128_s_16x8);
6055
6056 const __m128i resultsChannel1_A_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel1_0_u_32x4, intermediateResultsChannel1_1_u_32x4); // r * f10 + g * f11 + b * f12 + a * f13 | ...
6057 const __m128i resultsChannel1_B_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel1_2_u_32x4, intermediateResultsChannel1_3_u_32x4);
6058 const __m128i resultsChannel1_C_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel1_4_u_32x4, intermediateResultsChannel1_5_u_32x4);
6059 const __m128i resultsChannel1_D_u_32x4 = _mm_hadd_epi32(intermediateResultsChannel1_6_u_32x4, intermediateResultsChannel1_7_u_32x4);
6060
6061 // now we interleave the results of first and second channel (as both results fit into 16 bit)
6062
6063 __m128i resultA_u_16x8 = _mm_or_si128(resultsChannel0_A_u_32x4, _mm_slli_epi32(resultsChannel1_A_u_32x4, 16));
6064 __m128i resultB_u_16x8 = _mm_or_si128(resultsChannel0_B_u_32x4, _mm_slli_epi32(resultsChannel1_B_u_32x4, 16));
6065 __m128i resultC_u_16x8 = _mm_or_si128(resultsChannel0_C_u_32x4, _mm_slli_epi32(resultsChannel1_C_u_32x4, 16));
6066 __m128i resultD_u_16x8 = _mm_or_si128(resultsChannel0_D_u_32x4, _mm_slli_epi32(resultsChannel1_D_u_32x4, 16));
6067
6068 // we add 64 for rounding purpose
6069 resultA_u_16x8 = _mm_add_epi16(resultA_u_16x8, constant64_u_8x16);
6070 resultB_u_16x8 = _mm_add_epi16(resultB_u_16x8, constant64_u_8x16);
6071 resultC_u_16x8 = _mm_add_epi16(resultC_u_16x8, constant64_u_8x16);
6072 resultD_u_16x8 = _mm_add_epi16(resultD_u_16x8, constant64_u_8x16);
6073
6074 // we shift the multiplication results by 7 bits (= 128)
6075 resultA_u_16x8 = _mm_srli_epi16(resultA_u_16x8, 7);
6076 resultB_u_16x8 = _mm_srli_epi16(resultB_u_16x8, 7);
6077 resultC_u_16x8 = _mm_srli_epi16(resultC_u_16x8, 7);
6078 resultD_u_16x8 = _mm_srli_epi16(resultD_u_16x8, 7);
6079
6080 // now we have the following pattern (in two 128 bit registers):
6081 // FEDCBA9876543210
6082 // 0A0Y0A0Y0A0Y0A0Y
6083
6084 // finally, we have to get rid of the upper zero bits by combining two 128 bit registers to one:
6085 const __m128i resultAB_u_8x16 = _mm_packus_epi16(resultA_u_16x8, resultB_u_16x8);
6086 const __m128i resultCD_u_8x16 = _mm_packus_epi16(resultC_u_16x8, resultD_u_16x8);
6087
6088 // and we can store the result
6089 _mm_storeu_si128((__m128i*)target + 0, resultAB_u_8x16);
6090 _mm_storeu_si128((__m128i*)target + 1, resultCD_u_8x16);
6091}
6092
6093#endif // OCEAN_HARDWARE_SSE_VERSION
6094
6095#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
6096
6097template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2>
6098void FrameChannels::convert3ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const uint8x8_t& factorChannel0_128_u_8x8, const uint8x8_t& factorChannel1_128_u_8x8, const uint8x8_t& factorChannel2_128_u_8x8)
6099{
6100 static_assert(tUseFactorChannel0 || tUseFactorChannel1 || tUseFactorChannel2, "Invalid multiplication factors!");
6101
6102 ocean_assert(source != nullptr && target != nullptr);
6103
6104 // the documentation of this function designed for RGB24 to Y8 conversion
6105
6106 // precise color space conversion:
6107 // Y = 0.299 * R + 0.587 * G + 0.114 * B
6108
6109 // approximation:
6110 // Y = (38 * R + 75 * G + 15 * B) / 128
6111
6112 // we expect the following input pattern (for here RGB24):
6113 // FEDC BA98 7654 3210
6114 // RBGR BGRB GRBG RBGR
6115
6116 // we load 8 pixels (= 3 * 8 values) and directly deinterleave the 3 channels so that we receive the following patterns:
6117 // source_u_8x8x3.val[0]: R R R R R R R R
6118 // source_u_8x8x3.val[1]: G G G G G G G G
6119 // source_u_8x8x3.val[2]: B B B B B B B B
6120
6121 uint8x8x3_t source_u_8x8x3 = vld3_u8(source);
6122
6123 uint16x8_t intermediateResults_u_16x8;
6124
6125 // we multiply the first channel with the specified factor (unless zero)
6126
6127 if constexpr (tUseFactorChannel0)
6128 {
6129 intermediateResults_u_16x8 = vmull_u8(source_u_8x8x3.val[0], factorChannel0_128_u_8x8);
6130 }
6131 else
6132 {
6133 intermediateResults_u_16x8 = vdupq_n_u16(0u);
6134 }
6135
6136 // we multiply the second channel with the specified factor (unless zero) and accumulate the results
6137
6138 if constexpr (tUseFactorChannel1)
6139 {
6140 intermediateResults_u_16x8 = vmlal_u8(intermediateResults_u_16x8, source_u_8x8x3.val[1], factorChannel1_128_u_8x8);
6141 }
6142
6143 // we multiply the third channel with the specified factor (unless zero) and accumulate the results
6144
6145 if constexpr (tUseFactorChannel2)
6146 {
6147 intermediateResults_u_16x8 = vmlal_u8(intermediateResults_u_16x8, source_u_8x8x3.val[2], factorChannel2_128_u_8x8);
6148 }
6149
6150 // we shift the 16 bit values by 7 bits (= 128), apply rounding, and narrow the 16 bit integers to 8 bit integers within one operation
6151 uint8x8_t results_u_8x8 = vqrshrn_n_u16(intermediateResults_u_16x8, 7); // results_u_8x8 = (intermediateResults_u_16x8 + 2^6) >> 2^7
6152
6153 // and we can store the result
6154 vst1_u8(target, results_u_8x8);
6155}
6156
6157OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels8Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_64_s_16x8, const int16x8_t& factorChannel10_64_s_16x8, const int16x8_t& factorChannel20_64_s_16x8, const int16x8_t& factorChannel01_64_s_16x8, const int16x8_t& factorChannel11_64_s_16x8, const int16x8_t& factorChannel21_64_s_16x8, const int16x8_t& factorChannel02_64_s_16x8, const int16x8_t& factorChannel12_64_s_16x8, const int16x8_t& factorChannel22_64_s_16x8, const uint8x8_t& biasChannel0_u_8x8, const uint8x8_t& biasChannel1_u_8x8, const uint8x8_t& biasChannel2_u_8x8)
6158{
6159 ocean_assert(source != nullptr && target != nullptr);
6160
6161 // the documentation of this function designed for YUV24 to RGB24 conversion
6162
6163 // precise color space conversion:
6164 // | R | | 1 0.0 1.370705 -175.45024 | | Y |
6165 // | G | = | 1 -0.3376335 -0.698001 132.561152 | * | U |
6166 // | B | | 1 1.732446 0.0 -221.753088 | | V |
6167 // | 1 |
6168
6169 // approximation:
6170 // R = 64 * Y + 0 * (U - 128) + 88 * (V - 128)
6171 // G = 64 * Y - 22 * (U - 128) - 45 * (V - 128)
6172 // B = 64 * Y + 111 * (U - 128) + 0 * (V - 128)
6173
6174 // we load 8 pixels (= 3 * 8 values) and directly deinterleave the 3 channels so that we receive the following patterns:
6175 // source_u_8x8x3.val[0]: R R R R R R R R
6176 // source_u_8x8x3.val[1]: G G G G G G G G
6177 // source_u_8x8x3.val[2]: B B B B B B B B
6178
6179 const uint8x8x3_t source_u_8x8x3 = vld3_u8(source);
6180
6181 // Y' = Y - bias0, U' = U - bias1, V' = V - bias2
6182 const int16x8_t source0_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(source_u_8x8x3.val[0], biasChannel0_u_8x8));
6183 const int16x8_t source1_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(source_u_8x8x3.val[1], biasChannel1_u_8x8));
6184 const int16x8_t source2_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(source_u_8x8x3.val[2], biasChannel2_u_8x8));
6185
6186 // now we apply the 3x3 matrix multiplication
6187
6188 int16x8_t intermediateResults0_s_16x8 = vmulq_s16(source0_s_16x8, factorChannel00_64_s_16x8);
6189 int16x8_t intermediateResults1_s_16x8 = vmulq_s16(source0_s_16x8, factorChannel10_64_s_16x8);
6190 int16x8_t intermediateResults2_s_16x8 = vmulq_s16(source0_s_16x8, factorChannel20_64_s_16x8);
6191
6192 intermediateResults0_s_16x8 = vqaddq_s16(intermediateResults0_s_16x8, vmulq_s16(source1_s_16x8, factorChannel01_64_s_16x8)); // intermediateResults0 = saturated(intermediateResults0 + source10_low * factorChannel01)
6193 intermediateResults1_s_16x8 = vqaddq_s16(intermediateResults1_s_16x8, vmulq_s16(source1_s_16x8, factorChannel11_64_s_16x8));
6194 intermediateResults2_s_16x8 = vqaddq_s16(intermediateResults2_s_16x8, vmulq_s16(source1_s_16x8, factorChannel21_64_s_16x8));
6195
6196 intermediateResults0_s_16x8 = vqaddq_s16(intermediateResults0_s_16x8, vmulq_s16(source2_s_16x8, factorChannel02_64_s_16x8));
6197 intermediateResults1_s_16x8 = vqaddq_s16(intermediateResults1_s_16x8, vmulq_s16(source2_s_16x8, factorChannel12_64_s_16x8));
6198 intermediateResults2_s_16x8 = vqaddq_s16(intermediateResults2_s_16x8, vmulq_s16(source2_s_16x8, factorChannel22_64_s_16x8));
6199
6200 uint8x8x3_t results_u_8x8x3;
6201
6202 // saturated narrow signed to unsigned, normalized by 2^6
6203 results_u_8x8x3.val[0] = vqrshrun_n_s16(intermediateResults0_s_16x8, 6);
6204 results_u_8x8x3.val[1] = vqrshrun_n_s16(intermediateResults1_s_16x8, 6);
6205 results_u_8x8x3.val[2] = vqrshrun_n_s16(intermediateResults2_s_16x8, 6);
6206
6207 // and we can store the result
6208 vst3_u8(target, results_u_8x8x3);
6209}
6210
6211OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels16Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_64_s_16x8, const int16x8_t& factorChannel10_64_s_16x8, const int16x8_t& factorChannel20_64_s_16x8, const int16x8_t& factorChannel01_64_s_16x8, const int16x8_t& factorChannel11_64_s_16x8, const int16x8_t& factorChannel21_64_s_16x8, const int16x8_t& factorChannel02_64_s_16x8, const int16x8_t& factorChannel12_64_s_16x8, const int16x8_t& factorChannel22_64_s_16x8, const uint8x8_t& biasChannel0_u_8x8, const uint8x8_t& biasChannel1_u_8x8, const uint8x8_t& biasChannel2_u_8x8)
6212{
6213 ocean_assert(source != nullptr && target != nullptr);
6214
6215 // the documentation of this function designed for YUV24 to RGB24 conversion
6216
6217 // precise color space conversion:
6218 // | R | | 1 0.0 1.370705 -175.45024 | | Y |
6219 // | G | = | 1 -0.3376335 -0.698001 132.561152 | * | U |
6220 // | B | | 1 1.732446 0.0 -221.753088 | | V |
6221 // | 1 |
6222
6223 // approximation:
6224 // R = 64 * Y + 0 * (U - 128) + 88 * (V - 128)
6225 // G = 64 * Y - 22 * (U - 128) - 45 * (V - 128)
6226 // B = 64 * Y + 111 * (U - 128) + 0 * (V - 128)
6227
6228 const uint8x16x3_t source_u_8x16x3 = vld3q_u8(source);
6229
6230 // Y' = Y - bias0, U' = U - bias1, V' = V - bias2
6231 const int16x8_t source0_low_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(source_u_8x16x3.val[0]), biasChannel0_u_8x8));
6232 const int16x8_t source1_low_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(source_u_8x16x3.val[1]), biasChannel1_u_8x8));
6233 const int16x8_t source2_low_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(source_u_8x16x3.val[2]), biasChannel2_u_8x8));
6234
6235 const int16x8_t source0_high_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(source_u_8x16x3.val[0]), biasChannel0_u_8x8));
6236 const int16x8_t source1_high_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(source_u_8x16x3.val[1]), biasChannel1_u_8x8));
6237 const int16x8_t source2_high_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(source_u_8x16x3.val[2]), biasChannel2_u_8x8));
6238
6239 // now we mulitply apply the 3x3 matrix multiplication
6240
6241 int16x8_t intermediateResults0_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel00_64_s_16x8);
6242 int16x8_t intermediateResults1_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel10_64_s_16x8);
6243 int16x8_t intermediateResults2_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel20_64_s_16x8);
6244
6245 int16x8_t intermediateResults0_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel00_64_s_16x8);
6246 int16x8_t intermediateResults1_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel10_64_s_16x8);
6247 int16x8_t intermediateResults2_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel20_64_s_16x8);
6248
6249 intermediateResults0_low_s_16x8 = vqaddq_s16(intermediateResults0_low_s_16x8, vmulq_s16(source1_low_s_16x8, factorChannel01_64_s_16x8)); // intermediateResults0 = saturated(intermediateResults0 + source10_low * factorChannel01)
6250 intermediateResults1_low_s_16x8 = vqaddq_s16(intermediateResults1_low_s_16x8, vmulq_s16(source1_low_s_16x8, factorChannel11_64_s_16x8));
6251 intermediateResults2_low_s_16x8 = vqaddq_s16(intermediateResults2_low_s_16x8, vmulq_s16(source1_low_s_16x8, factorChannel21_64_s_16x8));
6252
6253 intermediateResults0_high_s_16x8 = vqaddq_s16(intermediateResults0_high_s_16x8, vmulq_s16(source1_high_s_16x8, factorChannel01_64_s_16x8));
6254 intermediateResults1_high_s_16x8 = vqaddq_s16(intermediateResults1_high_s_16x8, vmulq_s16(source1_high_s_16x8, factorChannel11_64_s_16x8));
6255 intermediateResults2_high_s_16x8 = vqaddq_s16(intermediateResults2_high_s_16x8, vmulq_s16(source1_high_s_16x8, factorChannel21_64_s_16x8));
6256
6257 intermediateResults0_low_s_16x8 = vqaddq_s16(intermediateResults0_low_s_16x8, vmulq_s16(source2_low_s_16x8, factorChannel02_64_s_16x8));
6258 intermediateResults1_low_s_16x8 = vqaddq_s16(intermediateResults1_low_s_16x8, vmulq_s16(source2_low_s_16x8, factorChannel12_64_s_16x8));
6259 intermediateResults2_low_s_16x8 = vqaddq_s16(intermediateResults2_low_s_16x8, vmulq_s16(source2_low_s_16x8, factorChannel22_64_s_16x8));
6260
6261 intermediateResults0_high_s_16x8 = vqaddq_s16(intermediateResults0_high_s_16x8, vmulq_s16(source2_high_s_16x8, factorChannel02_64_s_16x8));
6262 intermediateResults1_high_s_16x8 = vqaddq_s16(intermediateResults1_high_s_16x8, vmulq_s16(source2_high_s_16x8, factorChannel12_64_s_16x8));
6263 intermediateResults2_high_s_16x8 = vqaddq_s16(intermediateResults2_high_s_16x8, vmulq_s16(source2_high_s_16x8, factorChannel22_64_s_16x8));
6264
6265 uint8x16x3_t results_u_8x16x3;
6266
6267 // saturated narrow signed to unsigned, normalized by 2^6
6268 results_u_8x16x3.val[0] = vcombine_u8(vqrshrun_n_s16(intermediateResults0_low_s_16x8, 6), vqrshrun_n_s16(intermediateResults0_high_s_16x8, 6));
6269 results_u_8x16x3.val[1] = vcombine_u8(vqrshrun_n_s16(intermediateResults1_low_s_16x8, 6), vqrshrun_n_s16(intermediateResults1_high_s_16x8, 6));
6270 results_u_8x16x3.val[2] = vcombine_u8(vqrshrun_n_s16(intermediateResults2_low_s_16x8, 6), vqrshrun_n_s16(intermediateResults2_high_s_16x8, 6));
6271
6272 // and we can store the result
6273 vst3q_u8(target, results_u_8x16x3);
6274}
6275
6276OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_128_s_16x8, const int16x8_t& factorChannel10_128_s_16x8, const int16x8_t& factorChannel20_128_s_16x8, const int16x8_t& factorChannel01_128_s_16x8, const int16x8_t& factorChannel11_128_s_16x8, const int16x8_t& factorChannel21_128_s_16x8, const int16x8_t& factorChannel02_128_s_16x8, const int16x8_t& factorChannel12_128_s_16x8, const int16x8_t& factorChannel22_128_s_16x8, const int16x8_t& biasChannel0_128_s_16x8, const int16x8_t& biasChannel1_128_s_16x8, const int16x8_t& biasChannel2_128_s_16x8)
6277{
6278 ocean_assert(source != nullptr && target != nullptr);
6279
6280 // the documentation of this function designed for RGB24 to YUV24 conversion
6281
6282 // precise color space conversion:
6283 // | Y | | 0.2578125 0.5039063 0.09765625 16.0 | | R |
6284 // | U | = | -0.1484375 -0.2890625 0.4375 128.0 | * | G |
6285 // | V | | 0.4375 -0.3671875 -0.0703125 128.0 | | B |
6286 // | 1 |
6287
6288 // approximation:
6289 // Y = ( 33 * R + 64 * G + 13 * B) / 128 + 16
6290 // U = (-19 * R - 37 * G + 56 * B) / 128 + 128
6291 // V = ( 56 * R - 47 * G - 9 * B) / 128 + 128
6292
6293 // we load 8 pixels (= 3 * 8 values) and directly deinterleave the 3 channels so that we receive the following patterns:
6294 // source_u_8x8x3.val[0]: R R R R R R R R
6295 // source_u_8x8x3.val[1]: G G G G G G G G
6296 // source_u_8x8x3.val[2]: B B B B B B B B
6297
6298 const uint8x8x3_t source_u_8x8x3 = vld3_u8(source);
6299
6300 const int16x8_t source0_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(source_u_8x8x3.val[0]));
6301 const int16x8_t source1_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(source_u_8x8x3.val[1]));
6302 const int16x8_t source2_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(source_u_8x8x3.val[2]));
6303
6304 int16x8_t intermediateResults0_s_16x8 = vmulq_s16(source0_s_16x8, factorChannel00_128_s_16x8);
6305 int16x8_t intermediateResults1_s_16x8 = vmulq_s16(source0_s_16x8, factorChannel10_128_s_16x8);
6306 int16x8_t intermediateResults2_s_16x8 = vmulq_s16(source0_s_16x8, factorChannel20_128_s_16x8);
6307
6308 intermediateResults0_s_16x8 = vmlaq_s16(intermediateResults0_s_16x8, source1_s_16x8, factorChannel01_128_s_16x8);
6309 intermediateResults1_s_16x8 = vmlaq_s16(intermediateResults1_s_16x8, source1_s_16x8, factorChannel11_128_s_16x8);
6310 intermediateResults2_s_16x8 = vmlaq_s16(intermediateResults2_s_16x8, source1_s_16x8, factorChannel21_128_s_16x8);
6311
6312 intermediateResults0_s_16x8 = vmlaq_s16(intermediateResults0_s_16x8, source2_s_16x8, factorChannel02_128_s_16x8);
6313 intermediateResults1_s_16x8 = vmlaq_s16(intermediateResults1_s_16x8, source2_s_16x8, factorChannel12_128_s_16x8);
6314 intermediateResults2_s_16x8 = vmlaq_s16(intermediateResults2_s_16x8, source2_s_16x8, factorChannel22_128_s_16x8);
6315
6316 // now we add the bias values (saturated)
6317
6318 intermediateResults0_s_16x8 = vqaddq_s16(intermediateResults0_s_16x8, biasChannel0_128_s_16x8);
6319 intermediateResults1_s_16x8 = vqaddq_s16(intermediateResults1_s_16x8, biasChannel1_128_s_16x8);
6320 intermediateResults2_s_16x8 = vqaddq_s16(intermediateResults2_s_16x8, biasChannel2_128_s_16x8);
6321
6322 uint8x8x3_t results_u_8x8x3;
6323
6324 // saturated narrow signed to unsigned
6325 results_u_8x8x3.val[0] = vqrshrun_n_s16(intermediateResults0_s_16x8, 7);
6326 results_u_8x8x3.val[1] = vqrshrun_n_s16(intermediateResults1_s_16x8, 7);
6327 results_u_8x8x3.val[2] = vqrshrun_n_s16(intermediateResults2_s_16x8, 7);
6328
6329 // and we can store the result
6330 vst3_u8(target, results_u_8x8x3);
6331}
6332
6333OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels8Pixels8BitPerChannel10BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x4_t& factorChannel00_1024_s_16x4, const int16x4_t& factorChannel10_1024_s_16x4, const int16x4_t& factorChannel20_1024_s_16x4, const int16x4_t& factorChannel01_1024_s_16x4, const int16x4_t& factorChannel11_1024_s_16x4, const int16x4_t& factorChannel21_1024_s_16x4, const int16x4_t& factorChannel02_1024_s_16x4, const int16x4_t& factorChannel12_1024_s_16x4, const int16x4_t& factorChannel22_1024_s_16x4, const int32x4_t& biasChannel0_1024_s_32x4, const int32x4_t& biasChannel1_1024_s_32x4, const int32x4_t& biasChannel2_1024_s_32x4)
6334{
6335 ocean_assert(source != nullptr && target != nullptr);
6336
6337 // the documentation of this function designed for YUV24 to RGB24 conversion
6338
6339 // precise color space conversion:
6340 // | R | | 1.1639404296875 0.0 1.595947265625 -222.904296875 | | Y |
6341 // | G | = | 1.1639404296875 -0.3909912109375 -0.81298828125 135.486328125 | * | U |
6342 // | B | | 1.1639404296875 2.0179443359375 0.0 -276.919921875 | | V |
6343 // | 1 |
6344
6345 // approximation:
6346 // | R | | 1192 0 1634 -223 | | Y |
6347 // | G | = | 1192 -400 -833 135 | * | U |
6348 // | B | | 1192 2066 0 -277 | | V |
6349 // | 1 |
6350
6351 // we load 8 pixels (= 3 * 8 values) and directly deinterleave the 3 channels so that we receive the following patterns:
6352 // source_u_8x8x3.val[0]: R R R R R R R R
6353 // source_u_8x8x3.val[1]: G G G G G G G G
6354 // source_u_8x8x3.val[2]: B B B B B B B B
6355
6356 const uint8x8x3_t source_u_8x8x3 = vld3_u8(source);
6357
6358 const int16x8_t source0_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(source_u_8x8x3.val[0]));
6359 const int16x8_t source1_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(source_u_8x8x3.val[1]));
6360 const int16x8_t source2_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(source_u_8x8x3.val[2]));
6361
6362 const int16x4_t source0_low_s_16x4 = vget_low_s16(source0_s_16x8);
6363 const int16x4_t source0_high_s_16x4 = vget_high_s16(source0_s_16x8);
6364
6365 int32x4_t intermediateResults0_low_s_32x4 = vmull_s16(source0_low_s_16x4, factorChannel00_1024_s_16x4);
6366 int32x4_t intermediateResults0_high_s_32x4 = vmull_s16(source0_high_s_16x4, factorChannel00_1024_s_16x4);
6367
6368 int32x4_t intermediateResults1_low_s_32x4 = vmull_s16(source0_low_s_16x4, factorChannel10_1024_s_16x4);
6369 int32x4_t intermediateResults1_high_s_32x4 = vmull_s16(source0_high_s_16x4, factorChannel10_1024_s_16x4);
6370
6371 int32x4_t intermediateResults2_low_s_32x4 = vmull_s16(source0_low_s_16x4, factorChannel20_1024_s_16x4);
6372 int32x4_t intermediateResults2_high_s_32x4 = vmull_s16(source0_high_s_16x4, factorChannel20_1024_s_16x4);
6373
6374
6375 const int16x4_t source1_low_s_16x4 = vget_low_s16(source1_s_16x8);
6376 const int16x4_t source1_high_s_16x4 = vget_high_s16(source1_s_16x8);
6377
6378 intermediateResults0_low_s_32x4 = vmlal_s16(intermediateResults0_low_s_32x4, source1_low_s_16x4, factorChannel01_1024_s_16x4);
6379 intermediateResults0_high_s_32x4 = vmlal_s16(intermediateResults0_high_s_32x4, source1_high_s_16x4, factorChannel01_1024_s_16x4);
6380
6381 intermediateResults1_low_s_32x4 = vmlal_s16(intermediateResults1_low_s_32x4, source1_low_s_16x4, factorChannel11_1024_s_16x4);
6382 intermediateResults1_high_s_32x4 = vmlal_s16(intermediateResults1_high_s_32x4, source1_high_s_16x4, factorChannel11_1024_s_16x4);
6383
6384 intermediateResults2_low_s_32x4 = vmlal_s16(intermediateResults2_low_s_32x4, source1_low_s_16x4, factorChannel21_1024_s_16x4);
6385 intermediateResults2_high_s_32x4 = vmlal_s16(intermediateResults2_high_s_32x4, source1_high_s_16x4, factorChannel21_1024_s_16x4);
6386
6387
6388 const int16x4_t source2_low_s_16x4 = vget_low_s16(source2_s_16x8);
6389 const int16x4_t source2_high_s_16x4 = vget_high_s16(source2_s_16x8);
6390
6391 intermediateResults0_low_s_32x4 = vmlal_s16(intermediateResults0_low_s_32x4, source2_low_s_16x4, factorChannel02_1024_s_16x4);
6392 intermediateResults0_high_s_32x4 = vmlal_s16(intermediateResults0_high_s_32x4, source2_high_s_16x4, factorChannel02_1024_s_16x4);
6393
6394 intermediateResults1_low_s_32x4 = vmlal_s16(intermediateResults1_low_s_32x4, source2_low_s_16x4, factorChannel12_1024_s_16x4);
6395 intermediateResults1_high_s_32x4 = vmlal_s16(intermediateResults1_high_s_32x4, source2_high_s_16x4, factorChannel12_1024_s_16x4);
6396
6397 intermediateResults2_low_s_32x4 = vmlal_s16(intermediateResults2_low_s_32x4, source2_low_s_16x4, factorChannel22_1024_s_16x4);
6398 intermediateResults2_high_s_32x4 = vmlal_s16(intermediateResults2_high_s_32x4, source2_high_s_16x4, factorChannel22_1024_s_16x4);
6399
6400
6401 // now we add the bias values (saturated)
6402
6403 intermediateResults0_low_s_32x4 = vaddq_s32(intermediateResults0_low_s_32x4, biasChannel0_1024_s_32x4);
6404 intermediateResults0_high_s_32x4 = vaddq_s32(intermediateResults0_high_s_32x4, biasChannel0_1024_s_32x4);
6405
6406 intermediateResults1_low_s_32x4 = vaddq_s32(intermediateResults1_low_s_32x4, biasChannel1_1024_s_32x4);
6407 intermediateResults1_high_s_32x4 = vaddq_s32(intermediateResults1_high_s_32x4, biasChannel1_1024_s_32x4);
6408
6409 intermediateResults2_low_s_32x4 = vaddq_s32(intermediateResults2_low_s_32x4, biasChannel2_1024_s_32x4);
6410 intermediateResults2_high_s_32x4 = vaddq_s32(intermediateResults2_high_s_32x4, biasChannel2_1024_s_32x4);
6411
6412
6413 uint8x8x3_t results_u_8x8x3;
6414
6415 // saturated narrow signed to unsigned
6416 results_u_8x8x3.val[0] = vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults0_low_s_32x4, 10), vqrshrun_n_s32(intermediateResults0_high_s_32x4, 10)));
6417 results_u_8x8x3.val[1] = vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults1_low_s_32x4, 10), vqrshrun_n_s32(intermediateResults1_high_s_32x4, 10)));
6418 results_u_8x8x3.val[2] = vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults2_low_s_32x4, 10), vqrshrun_n_s32(intermediateResults2_high_s_32x4, 10)));
6419
6420 // and we can store the result
6421 vst3_u8(target, results_u_8x8x3);
6422}
6423
6424OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels16Pixels8BitPerChannel10BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x4_t& factorChannel00_1024_s_16x4, const int16x4_t& factorChannel10_1024_s_16x4, const int16x4_t& factorChannel20_1024_s_16x4, const int16x4_t& factorChannel01_1024_s_16x4, const int16x4_t& factorChannel11_1024_s_16x4, const int16x4_t& factorChannel21_1024_s_16x4, const int16x4_t& factorChannel02_1024_s_16x4, const int16x4_t& factorChannel12_1024_s_16x4, const int16x4_t& factorChannel22_1024_s_16x4, const int32x4_t& biasChannel0_1024_s_32x4, const int32x4_t& biasChannel1_1024_s_32x4, const int32x4_t& biasChannel2_1024_s_32x4)
6425{
6426 ocean_assert(source != nullptr && target != nullptr);
6427
6428 // the documentation of this function designed for YUV24 to RGB24 conversion
6429
6430 // precise color space conversion:
6431 // | R | | 1.1639404296875 0.0 1.595947265625 -222.904296875 | | Y |
6432 // | G | = | 1.1639404296875 -0.3909912109375 -0.81298828125 135.486328125 | * | U |
6433 // | B | | 1.1639404296875 2.0179443359375 0.0 -276.919921875 | | V |
6434 // | 1 |
6435
6436 // approximation:
6437 // | R | | 1192 0 1634 -223 | | Y |
6438 // | G | = | 1192 -400 -833 135 | * | U |
6439 // | B | | 1192 2066 0 -277 | | V |
6440 // | 1 |
6441
6442 // we load 8 pixels (= 3 * 8 values) and directly deinterleave the 3 channels so that we receive the following patterns:
6443 // source_u_8x8x3.val[0]: R R R R R R R R
6444 // source_u_8x8x3.val[1]: G G G G G G G G
6445 // source_u_8x8x3.val[2]: B B B B B B B B
6446
6447 const uint8x16x3_t source_u_8x16x3 = vld3q_u8(source);
6448
6449 const int16x8_t source0_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x3.val[0])));
6450 const int16x8_t source1_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x3.val[1])));
6451 const int16x8_t source2_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x3.val[2])));
6452
6453 const int16x8_t source0_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x3.val[0])));
6454 const int16x8_t source1_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x3.val[1])));
6455 const int16x8_t source2_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x3.val[2])));
6456
6457 const int16x4_t source0_A_s_16x4 = vget_low_s16(source0_low_s_16x8);
6458 const int16x4_t source0_B_s_16x4 = vget_high_s16(source0_low_s_16x8);
6459 const int16x4_t source0_C_s_16x4 = vget_low_s16(source0_high_s_16x8);
6460 const int16x4_t source0_D_s_16x4 = vget_high_s16(source0_high_s_16x8);
6461
6462 int32x4_t intermediateResults0_A_s_32x4 = vmull_s16(source0_A_s_16x4, factorChannel00_1024_s_16x4);
6463 int32x4_t intermediateResults0_B_s_32x4 = vmull_s16(source0_B_s_16x4, factorChannel00_1024_s_16x4);
6464 int32x4_t intermediateResults0_C_s_32x4 = vmull_s16(source0_C_s_16x4, factorChannel00_1024_s_16x4);
6465 int32x4_t intermediateResults0_D_s_32x4 = vmull_s16(source0_D_s_16x4, factorChannel00_1024_s_16x4);
6466
6467 int32x4_t intermediateResults1_A_s_32x4 = vmull_s16(source0_A_s_16x4, factorChannel10_1024_s_16x4);
6468 int32x4_t intermediateResults1_B_s_32x4 = vmull_s16(source0_B_s_16x4, factorChannel10_1024_s_16x4);
6469 int32x4_t intermediateResults1_C_s_32x4 = vmull_s16(source0_C_s_16x4, factorChannel10_1024_s_16x4);
6470 int32x4_t intermediateResults1_D_s_32x4 = vmull_s16(source0_D_s_16x4, factorChannel10_1024_s_16x4);
6471
6472 int32x4_t intermediateResults2_A_s_32x4 = vmull_s16(source0_A_s_16x4, factorChannel20_1024_s_16x4);
6473 int32x4_t intermediateResults2_B_s_32x4 = vmull_s16(source0_B_s_16x4, factorChannel20_1024_s_16x4);
6474 int32x4_t intermediateResults2_C_s_32x4 = vmull_s16(source0_C_s_16x4, factorChannel20_1024_s_16x4);
6475 int32x4_t intermediateResults2_D_s_32x4 = vmull_s16(source0_D_s_16x4, factorChannel20_1024_s_16x4);
6476
6477
6478 const int16x4_t source1_A_s_16x4 = vget_low_s16(source1_low_s_16x8);
6479 const int16x4_t source1_B_s_16x4 = vget_high_s16(source1_low_s_16x8);
6480 const int16x4_t source1_C_s_16x4 = vget_low_s16(source1_high_s_16x8);
6481 const int16x4_t source1_D_s_16x4 = vget_high_s16(source1_high_s_16x8);
6482
6483 intermediateResults0_A_s_32x4 = vmlal_s16(intermediateResults0_A_s_32x4, source1_A_s_16x4, factorChannel01_1024_s_16x4);
6484 intermediateResults0_B_s_32x4 = vmlal_s16(intermediateResults0_B_s_32x4, source1_B_s_16x4, factorChannel01_1024_s_16x4);
6485 intermediateResults0_C_s_32x4 = vmlal_s16(intermediateResults0_C_s_32x4, source1_C_s_16x4, factorChannel01_1024_s_16x4);
6486 intermediateResults0_D_s_32x4 = vmlal_s16(intermediateResults0_D_s_32x4, source1_D_s_16x4, factorChannel01_1024_s_16x4);
6487
6488 intermediateResults1_A_s_32x4 = vmlal_s16(intermediateResults1_A_s_32x4, source1_A_s_16x4, factorChannel11_1024_s_16x4);
6489 intermediateResults1_B_s_32x4 = vmlal_s16(intermediateResults1_B_s_32x4, source1_B_s_16x4, factorChannel11_1024_s_16x4);
6490 intermediateResults1_C_s_32x4 = vmlal_s16(intermediateResults1_C_s_32x4, source1_C_s_16x4, factorChannel11_1024_s_16x4);
6491 intermediateResults1_D_s_32x4 = vmlal_s16(intermediateResults1_D_s_32x4, source1_D_s_16x4, factorChannel11_1024_s_16x4);
6492
6493 intermediateResults2_A_s_32x4 = vmlal_s16(intermediateResults2_A_s_32x4, source1_A_s_16x4, factorChannel21_1024_s_16x4);
6494 intermediateResults2_B_s_32x4 = vmlal_s16(intermediateResults2_B_s_32x4, source1_B_s_16x4, factorChannel21_1024_s_16x4);
6495 intermediateResults2_C_s_32x4 = vmlal_s16(intermediateResults2_C_s_32x4, source1_C_s_16x4, factorChannel21_1024_s_16x4);
6496 intermediateResults2_D_s_32x4 = vmlal_s16(intermediateResults2_D_s_32x4, source1_D_s_16x4, factorChannel21_1024_s_16x4);
6497
6498
6499 const int16x4_t source2_A_s_16x4 = vget_low_s16(source2_low_s_16x8);
6500 const int16x4_t source2_B_s_16x4 = vget_high_s16(source2_low_s_16x8);
6501 const int16x4_t source2_C_s_16x4 = vget_low_s16(source2_high_s_16x8);
6502 const int16x4_t source2_D_s_16x4 = vget_high_s16(source2_high_s_16x8);
6503
6504 intermediateResults0_A_s_32x4 = vmlal_s16(intermediateResults0_A_s_32x4, source2_A_s_16x4, factorChannel02_1024_s_16x4);
6505 intermediateResults0_B_s_32x4 = vmlal_s16(intermediateResults0_B_s_32x4, source2_B_s_16x4, factorChannel02_1024_s_16x4);
6506 intermediateResults0_C_s_32x4 = vmlal_s16(intermediateResults0_C_s_32x4, source2_C_s_16x4, factorChannel02_1024_s_16x4);
6507 intermediateResults0_D_s_32x4 = vmlal_s16(intermediateResults0_D_s_32x4, source2_D_s_16x4, factorChannel02_1024_s_16x4);
6508
6509 intermediateResults1_A_s_32x4 = vmlal_s16(intermediateResults1_A_s_32x4, source2_A_s_16x4, factorChannel12_1024_s_16x4);
6510 intermediateResults1_B_s_32x4 = vmlal_s16(intermediateResults1_B_s_32x4, source2_B_s_16x4, factorChannel12_1024_s_16x4);
6511 intermediateResults1_C_s_32x4 = vmlal_s16(intermediateResults1_C_s_32x4, source2_C_s_16x4, factorChannel12_1024_s_16x4);
6512 intermediateResults1_D_s_32x4 = vmlal_s16(intermediateResults1_D_s_32x4, source2_D_s_16x4, factorChannel12_1024_s_16x4);
6513
6514 intermediateResults2_A_s_32x4 = vmlal_s16(intermediateResults2_A_s_32x4, source2_A_s_16x4, factorChannel22_1024_s_16x4);
6515 intermediateResults2_B_s_32x4 = vmlal_s16(intermediateResults2_B_s_32x4, source2_B_s_16x4, factorChannel22_1024_s_16x4);
6516 intermediateResults2_C_s_32x4 = vmlal_s16(intermediateResults2_C_s_32x4, source2_C_s_16x4, factorChannel22_1024_s_16x4);
6517 intermediateResults2_D_s_32x4 = vmlal_s16(intermediateResults2_D_s_32x4, source2_D_s_16x4, factorChannel22_1024_s_16x4);
6518
6519
6520 // now we add the bias values (saturated)
6521
6522 intermediateResults0_A_s_32x4 = vaddq_s32(intermediateResults0_A_s_32x4, biasChannel0_1024_s_32x4);
6523 intermediateResults0_B_s_32x4 = vaddq_s32(intermediateResults0_B_s_32x4, biasChannel0_1024_s_32x4);
6524 intermediateResults0_C_s_32x4 = vaddq_s32(intermediateResults0_C_s_32x4, biasChannel0_1024_s_32x4);
6525 intermediateResults0_D_s_32x4 = vaddq_s32(intermediateResults0_D_s_32x4, biasChannel0_1024_s_32x4);
6526
6527 intermediateResults1_A_s_32x4 = vaddq_s32(intermediateResults1_A_s_32x4, biasChannel1_1024_s_32x4);
6528 intermediateResults1_B_s_32x4 = vaddq_s32(intermediateResults1_B_s_32x4, biasChannel1_1024_s_32x4);
6529 intermediateResults1_C_s_32x4 = vaddq_s32(intermediateResults1_C_s_32x4, biasChannel1_1024_s_32x4);
6530 intermediateResults1_D_s_32x4 = vaddq_s32(intermediateResults1_D_s_32x4, biasChannel1_1024_s_32x4);
6531
6532 intermediateResults2_A_s_32x4 = vaddq_s32(intermediateResults2_A_s_32x4, biasChannel2_1024_s_32x4);
6533 intermediateResults2_B_s_32x4 = vaddq_s32(intermediateResults2_B_s_32x4, biasChannel2_1024_s_32x4);
6534 intermediateResults2_C_s_32x4 = vaddq_s32(intermediateResults2_C_s_32x4, biasChannel2_1024_s_32x4);
6535 intermediateResults2_D_s_32x4 = vaddq_s32(intermediateResults2_D_s_32x4, biasChannel2_1024_s_32x4);
6536
6537
6538 uint8x16x3_t results_u_8x16x3;
6539
6540 // saturated narrow signed to unsigned
6541 results_u_8x16x3.val[0] = vcombine_u8(vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults0_A_s_32x4, 10), vqrshrun_n_s32(intermediateResults0_B_s_32x4, 10))), vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults0_C_s_32x4, 10), vqrshrun_n_s32(intermediateResults0_D_s_32x4, 10))));
6542
6543 results_u_8x16x3.val[1] = vcombine_u8(vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults1_A_s_32x4, 10), vqrshrun_n_s32(intermediateResults1_B_s_32x4, 10))), vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults1_C_s_32x4, 10), vqrshrun_n_s32(intermediateResults1_D_s_32x4, 10))));
6544 results_u_8x16x3.val[2] = vcombine_u8(vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults2_A_s_32x4, 10), vqrshrun_n_s32(intermediateResults2_B_s_32x4, 10))), vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults2_C_s_32x4, 10), vqrshrun_n_s32(intermediateResults2_D_s_32x4, 10))));
6545
6546 // and we can store the result
6547 vst3q_u8(target, results_u_8x16x3);
6548}
6549
6550OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_128_s_16x8, const int16x8_t& factorChannel10_128_s_16x8, const int16x8_t& factorChannel20_128_s_16x8, const int16x8_t& factorChannel01_128_s_16x8, const int16x8_t& factorChannel11_128_s_16x8, const int16x8_t& factorChannel21_128_s_16x8, const int16x8_t& factorChannel02_128_s_16x8, const int16x8_t& factorChannel12_128_s_16x8, const int16x8_t& factorChannel22_128_s_16x8, const int16x8_t& biasChannel0_128_s_16x8, const int16x8_t& biasChannel1_128_s_16x8, const int16x8_t& biasChannel2_128_s_16x8)
6551{
6552 ocean_assert(source != nullptr && target != nullptr);
6553
6554 // the documentation of this function designed for RGB24 to YUV24 conversion
6555
6556 // precise color space conversion:
6557 // | Y | | 0.2578125 0.5039063 0.09765625 16.0 | | R |
6558 // | U | = | -0.1484375 -0.2890625 0.4375 128.0 | * | G |
6559 // | V | | 0.4375 -0.3671875 -0.0703125 128.0 | | B |
6560 // | 1 |
6561
6562 // approximation:
6563 // Y = ( 33 * R + 64 * G + 13 * B) / 128 + 16
6564 // U = (-19 * R - 37 * G + 56 * B) / 128 + 128
6565 // V = ( 56 * R - 47 * G - 9 * B) / 128 + 128
6566
6567 // we load 8 pixels (= 3 * 8 values) and directly deinterleave the 3 channels so that we receive the following patterns:
6568 // source_u_8x8x3.val[0]: R R R R R R R R
6569 // source_u_8x8x3.val[1]: G G G G G G G G
6570 // source_u_8x8x3.val[2]: B B B B B B B B
6571
6572 const uint8x16x3_t source_u_8x16x3 = vld3q_u8(source);
6573
6574 const int16x8_t source0_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x3.val[0])));
6575 const int16x8_t source1_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x3.val[1])));
6576 const int16x8_t source2_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x3.val[2])));
6577
6578 const int16x8_t source0_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x3.val[0])));
6579 const int16x8_t source1_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x3.val[1])));
6580 const int16x8_t source2_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x3.val[2])));
6581
6582
6583 int16x8_t intermediateResults0_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel00_128_s_16x8);
6584 int16x8_t intermediateResults1_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel10_128_s_16x8);
6585 int16x8_t intermediateResults2_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel20_128_s_16x8);
6586
6587 int16x8_t intermediateResults0_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel00_128_s_16x8);
6588 int16x8_t intermediateResults1_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel10_128_s_16x8);
6589 int16x8_t intermediateResults2_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel20_128_s_16x8);
6590
6591
6592 intermediateResults0_low_s_16x8 = vmlaq_s16(intermediateResults0_low_s_16x8, source1_low_s_16x8, factorChannel01_128_s_16x8);
6593 intermediateResults1_low_s_16x8 = vmlaq_s16(intermediateResults1_low_s_16x8, source1_low_s_16x8, factorChannel11_128_s_16x8);
6594 intermediateResults2_low_s_16x8 = vmlaq_s16(intermediateResults2_low_s_16x8, source1_low_s_16x8, factorChannel21_128_s_16x8);
6595
6596 intermediateResults0_high_s_16x8 = vmlaq_s16(intermediateResults0_high_s_16x8, source1_high_s_16x8, factorChannel01_128_s_16x8);
6597 intermediateResults1_high_s_16x8 = vmlaq_s16(intermediateResults1_high_s_16x8, source1_high_s_16x8, factorChannel11_128_s_16x8);
6598 intermediateResults2_high_s_16x8 = vmlaq_s16(intermediateResults2_high_s_16x8, source1_high_s_16x8, factorChannel21_128_s_16x8);
6599
6600
6601 intermediateResults0_low_s_16x8 = vmlaq_s16(intermediateResults0_low_s_16x8, source2_low_s_16x8, factorChannel02_128_s_16x8);
6602 intermediateResults1_low_s_16x8 = vmlaq_s16(intermediateResults1_low_s_16x8, source2_low_s_16x8, factorChannel12_128_s_16x8);
6603 intermediateResults2_low_s_16x8 = vmlaq_s16(intermediateResults2_low_s_16x8, source2_low_s_16x8, factorChannel22_128_s_16x8);
6604
6605 intermediateResults0_high_s_16x8 = vmlaq_s16(intermediateResults0_high_s_16x8, source2_high_s_16x8, factorChannel02_128_s_16x8);
6606 intermediateResults1_high_s_16x8 = vmlaq_s16(intermediateResults1_high_s_16x8, source2_high_s_16x8, factorChannel12_128_s_16x8);
6607 intermediateResults2_high_s_16x8 = vmlaq_s16(intermediateResults2_high_s_16x8, source2_high_s_16x8, factorChannel22_128_s_16x8);
6608
6609 // now we add the bias values (saturated)
6610
6611 intermediateResults0_low_s_16x8 = vqaddq_s16(intermediateResults0_low_s_16x8, biasChannel0_128_s_16x8);
6612 intermediateResults0_high_s_16x8 = vqaddq_s16(intermediateResults0_high_s_16x8, biasChannel0_128_s_16x8);
6613
6614 intermediateResults1_low_s_16x8 = vqaddq_s16(intermediateResults1_low_s_16x8, biasChannel1_128_s_16x8);
6615 intermediateResults1_high_s_16x8 = vqaddq_s16(intermediateResults1_high_s_16x8, biasChannel1_128_s_16x8);
6616
6617 intermediateResults2_low_s_16x8 = vqaddq_s16(intermediateResults2_low_s_16x8, biasChannel2_128_s_16x8);
6618 intermediateResults2_high_s_16x8 = vqaddq_s16(intermediateResults2_high_s_16x8, biasChannel2_128_s_16x8);
6619
6620
6621 uint8x16x3_t results_u_8x16x3;
6622
6623 // saturated narrow signed to unsigned shift with rounding
6624 results_u_8x16x3.val[0] = vcombine_u8(vqrshrun_n_s16(intermediateResults0_low_s_16x8, 7), vqrshrun_n_s16(intermediateResults0_high_s_16x8, 7));
6625 results_u_8x16x3.val[1] = vcombine_u8(vqrshrun_n_s16(intermediateResults1_low_s_16x8, 7), vqrshrun_n_s16(intermediateResults1_high_s_16x8, 7));
6626 results_u_8x16x3.val[2] = vcombine_u8(vqrshrun_n_s16(intermediateResults2_low_s_16x8, 7), vqrshrun_n_s16(intermediateResults2_high_s_16x8, 7));
6627
6628 // and we can store the result
6629 vst3q_u8(target, results_u_8x16x3);
6630}
6631
/**
 * Converts 16 zipped pixels with 3 channels (8 bit each) to 16 zipped pixels with 4 channels (8 bit each).
 * Each source channel is first reduced by its bias value, the three results are then combined by a 3x3 matrix of signed 16 bit factors,
 * and the sums are normalized by 2^6 with rounding and saturation to [0, 255].
 * The fourth target channel is copied unchanged from channelValue3_u_8x16.
 * The function reads 48 bytes from 'source' (vld3q_u8) and writes 64 bytes to 'target' (vst4q_u8).
 * Factor naming: factorChannel<T><S> is the factor applied to source channel S when computing target channel T.
 */
6632OCEAN_FORCE_INLINE void FrameChannels::convert3ChannelsTo4Channels16Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_64_s_16x8, const int16x8_t& factorChannel10_64_s_16x8, const int16x8_t& factorChannel20_64_s_16x8, const int16x8_t& factorChannel01_64_s_16x8, const int16x8_t& factorChannel11_64_s_16x8, const int16x8_t& factorChannel21_64_s_16x8, const int16x8_t& factorChannel02_64_s_16x8, const int16x8_t& factorChannel12_64_s_16x8, const int16x8_t& factorChannel22_64_s_16x8, const uint8x8_t& biasChannel0_u_8x8, const uint8x8_t& biasChannel1_u_8x8, const uint8x8_t& biasChannel2_u_8x8, const uint8x16_t& channelValue3_u_8x16)
6633{
6634 ocean_assert(source != nullptr && target != nullptr);
6635
6636 // the following documentation uses a YUV24 to RGB conversion as example (the fourth target channel is a constant value)
6637
6638 // precise color space conversion:
6639 // | R | | 1 0.0 1.370705 -175.45024 | | Y |
6640 // | G | = | 1 -0.3376335 -0.698001 132.561152 | * | U |
6641 // | B | | 1 1.732446 0.0 -221.753088 | | V |
6642 // | 1 |
6643
6644 // approximation (all values scaled by 64, the normalization by 64 is applied at the very end):
6645 // R = 64 * Y + 0 * (U - 128) + 88 * (V - 128)
6646 // G = 64 * Y - 22 * (U - 128) - 45 * (V - 128)
6647 // B = 64 * Y + 111 * (U - 128) + 0 * (V - 128)
6648
 // de-interleaving load: val[0] holds the first channel of all 16 pixels, val[1] the second, val[2] the third
6649 const uint8x16x3_t source_u_8x16x3 = vld3q_u8(source);
6650
6651 // Y' = Y - bias0, U' = U - bias1, V' = V - bias2
 // vsubl_u8 widens to 16 bit and wraps around for negative differences,
 // re-interpreting the 16 bit pattern as signed value yields the correct signed difference
6652 const int16x8_t source0_low_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(source_u_8x16x3.val[0]), biasChannel0_u_8x8));
6653 const int16x8_t source1_low_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(source_u_8x16x3.val[1]), biasChannel1_u_8x8));
6654 const int16x8_t source2_low_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(source_u_8x16x3.val[2]), biasChannel2_u_8x8));
6655
6656 const int16x8_t source0_high_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(source_u_8x16x3.val[0]), biasChannel0_u_8x8));
6657 const int16x8_t source1_high_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(source_u_8x16x3.val[1]), biasChannel1_u_8x8));
6658 const int16x8_t source2_high_s_16x8 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(source_u_8x16x3.val[2]), biasChannel2_u_8x8));
6659
6660 // now we apply the 3x3 matrix multiplication
 // NOTE(review): vmulq_s16 does not saturate, so every individual product is presumably expected to fit into 16 bit - confirm against the factors used by the callers
6661
6662 int16x8_t intermediateResults0_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel00_64_s_16x8);
6663 int16x8_t intermediateResults1_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel10_64_s_16x8);
6664 int16x8_t intermediateResults2_low_s_16x8 = vmulq_s16(source0_low_s_16x8, factorChannel20_64_s_16x8);
6665
6666 int16x8_t intermediateResults0_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel00_64_s_16x8);
6667 int16x8_t intermediateResults1_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel10_64_s_16x8);
6668 int16x8_t intermediateResults2_high_s_16x8 = vmulq_s16(source0_high_s_16x8, factorChannel20_64_s_16x8);
6669
 // the products of the second source channel are accumulated with saturation
6670 intermediateResults0_low_s_16x8 = vqaddq_s16(intermediateResults0_low_s_16x8, vmulq_s16(source1_low_s_16x8, factorChannel01_64_s_16x8)); // intermediateResults0 = saturated(intermediateResults0 + source1_low * factorChannel01)
6671 intermediateResults1_low_s_16x8 = vqaddq_s16(intermediateResults1_low_s_16x8, vmulq_s16(source1_low_s_16x8, factorChannel11_64_s_16x8));
6672 intermediateResults2_low_s_16x8 = vqaddq_s16(intermediateResults2_low_s_16x8, vmulq_s16(source1_low_s_16x8, factorChannel21_64_s_16x8));
6673
6674 intermediateResults0_high_s_16x8 = vqaddq_s16(intermediateResults0_high_s_16x8, vmulq_s16(source1_high_s_16x8, factorChannel01_64_s_16x8));
6675 intermediateResults1_high_s_16x8 = vqaddq_s16(intermediateResults1_high_s_16x8, vmulq_s16(source1_high_s_16x8, factorChannel11_64_s_16x8));
6676 intermediateResults2_high_s_16x8 = vqaddq_s16(intermediateResults2_high_s_16x8, vmulq_s16(source1_high_s_16x8, factorChannel21_64_s_16x8));
6677
 // the products of the third source channel are accumulated with saturation
6678 intermediateResults0_low_s_16x8 = vqaddq_s16(intermediateResults0_low_s_16x8, vmulq_s16(source2_low_s_16x8, factorChannel02_64_s_16x8));
6679 intermediateResults1_low_s_16x8 = vqaddq_s16(intermediateResults1_low_s_16x8, vmulq_s16(source2_low_s_16x8, factorChannel12_64_s_16x8));
6680 intermediateResults2_low_s_16x8 = vqaddq_s16(intermediateResults2_low_s_16x8, vmulq_s16(source2_low_s_16x8, factorChannel22_64_s_16x8));
6681
6682 intermediateResults0_high_s_16x8 = vqaddq_s16(intermediateResults0_high_s_16x8, vmulq_s16(source2_high_s_16x8, factorChannel02_64_s_16x8));
6683 intermediateResults1_high_s_16x8 = vqaddq_s16(intermediateResults1_high_s_16x8, vmulq_s16(source2_high_s_16x8, factorChannel12_64_s_16x8));
6684 intermediateResults2_high_s_16x8 = vqaddq_s16(intermediateResults2_high_s_16x8, vmulq_s16(source2_high_s_16x8, factorChannel22_64_s_16x8));
6685
6686 uint8x16x4_t results_u_8x16x4;
6687
6688 // saturated narrow signed to unsigned with rounding, normalized by 2^6: result = saturate_u8((value + 2^5) >> 6)
6689 results_u_8x16x4.val[0] = vcombine_u8(vqrshrun_n_s16(intermediateResults0_low_s_16x8, 6), vqrshrun_n_s16(intermediateResults0_high_s_16x8, 6));
6690 results_u_8x16x4.val[1] = vcombine_u8(vqrshrun_n_s16(intermediateResults1_low_s_16x8, 6), vqrshrun_n_s16(intermediateResults1_high_s_16x8, 6));
6691 results_u_8x16x4.val[2] = vcombine_u8(vqrshrun_n_s16(intermediateResults2_low_s_16x8, 6), vqrshrun_n_s16(intermediateResults2_high_s_16x8, 6));
 // the fourth target channel is not computed but taken as provided by the caller
6692 results_u_8x16x4.val[3] = channelValue3_u_8x16;
6693
6694 // and we can store the result (interleaving store of 16 pixels with 4 channels)
6695 vst4q_u8(target, results_u_8x16x4);
6696}
6697
6698template <bool tUseFactorChannel0, bool tUseFactorChannel1, bool tUseFactorChannel2, bool tUseFactorChannel3>
6699void FrameChannels::convert4ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const uint8x8_t& factorChannel0_128_u_8x8, const uint8x8_t& factorChannel1_128_u_8x8, const uint8x8_t& factorChannel2_128_u_8x8, const uint8x8_t& factorChannel3_128_u_8x8)
6700{
6701 static_assert(tUseFactorChannel0 || tUseFactorChannel1 || tUseFactorChannel2 || tUseFactorChannel3, "Invalid multiplication factors!");
6702
6703 ocean_assert(source != nullptr && target != nullptr);
6704
6705 // the documentation of this function designed for RGBA32 to Y8 conversion
6706
6707 // precise color space conversion:
6708 // Y = 0.299 * R + 0.587 * G + 0.114 * B
6709
6710 // approximation:
6711 // Y = (38 * R + 75 * G + 15 * B) / 128
6712
6713 // we expect the following input pattern (for here RGBA32):
6714 // FEDC BA98 7654 3210
6715 // ABGR ABGR ABGR ABGR
6716
6717 // we load 8 pixels (= 4 * 8 values) and directly deinterleave the 4 channels so that we receive the following patterns:
6718 // m4_64_pixels.val[0]: R R R R R R R R
6719 // m4_64_pixels.val[1]: G G G G G G G G
6720 // m4_64_pixels.val[2]: B B B B B B B B
6721 // m4_64_pixels.val[3]: A A A A A A A A
6722
6723 uint8x8x4_t pixels_u_8x8x4 = vld4_u8(source);
6724
6725 uint16x8_t intermediateResults_16x8;
6726
6727 // we multiply the first channel with the specified factor (unless zero)
6728
6729 if constexpr (tUseFactorChannel0)
6730 {
6731 intermediateResults_16x8 = vmull_u8(pixels_u_8x8x4.val[0], factorChannel0_128_u_8x8);
6732 }
6733 else
6734 {
6735 intermediateResults_16x8 = vdupq_n_u16(0u);
6736 }
6737
6738 // we multiply the second channel with the specified factor (unless zero) and accumulate the results
6739
6740 if constexpr (tUseFactorChannel1)
6741 {
6742 intermediateResults_16x8 = vmlal_u8(intermediateResults_16x8, pixels_u_8x8x4.val[1], factorChannel1_128_u_8x8);
6743 }
6744
6745 // we multiply the third channel with the specified factor (unless zero) and accumulate the results
6746
6747 if constexpr (tUseFactorChannel2)
6748 {
6749 intermediateResults_16x8 = vmlal_u8(intermediateResults_16x8, pixels_u_8x8x4.val[2], factorChannel2_128_u_8x8);
6750 }
6751
6752 // we multiply the fourth channel with the specified factor (unless zero) and accumulate the results
6753
6754 if constexpr (tUseFactorChannel3)
6755 {
6756 intermediateResults_16x8 = vmlal_u8(intermediateResults_16x8, pixels_u_8x8x4.val[3], factorChannel3_128_u_8x8);
6757 }
6758
6759 // we shift the 16 bit values by 7 bits (= 128), apply rounding, and narrow the 16 bit integers to 8 bit integers within one operation
6760 uint8x8_t results_u_8x8 = vqrshrn_n_u16(intermediateResults_16x8, 7); // pixels_u_8x8x4 = (intermediateResults_16x8 + 2^6) >> 2^7
6761
6762 // and we can store the result
6763 vst1_u8(target, results_u_8x8);
6764}
6765
6766OCEAN_FORCE_INLINE void FrameChannels::convert4ChannelsTo2Channels8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const uint8x8_t& factorChannel00_128_u_8x8, const uint8x8_t& factorChannel10_128_u_8x8, const uint8x8_t& factorChannel01_128_u_8x8, const uint8x8_t& factorChannel11_128_u_8x8, const uint8x8_t& factorChannel02_128_u_8x8, const uint8x8_t& factorChannel12_128_u_8x8, const uint8x8_t& factorChannel03_128_u_8x8, const uint8x8_t& factorChannel13_128_u_8x8)
6767{
6768 ocean_assert(source != nullptr && target != nullptr);
6769
6770 // the documentation of this function designed for RGBA32 to YA16 conversion
6771
6772 // precise color space conversion:
6773 // Y = 0.299 * R + 0.587 * G + 0.114 * B + 0.0 * A
6774 // A = 0.0 * R + 0.0 * G + 0.0 * B + 1.0 * A
6775
6776 // approximation:
6777 // Y = (38 * R + 75 * G + 15 * B + 0 * A) / 128
6778 // A = (128 * A) / 128
6779
6780 // we expect the following input pattern (for here RGBA32):
6781 // FEDC BA98 7654 3210
6782 // ABGR ABGR ABGR ABGR
6783
6784 // we load 8 pixels (= 4 * 8 values) and directly deinterleave the 4 channels so that we receive the following patterns:
6785 // m4_64_pixels.val[0]: R R R R R R R R
6786 // m4_64_pixels.val[1]: G G G G G G G G
6787 // m4_64_pixels.val[2]: B B B B B B B B
6788 // m4_64_pixels.val[3]: A A A A A A A A
6789
6790 uint8x8x4_t pixels_u_8x8x4 = vld4_u8(source);
6791
6792 uint16x8_t intermediateResultsChannel0_16x8 = vmull_u8(pixels_u_8x8x4.val[0], factorChannel00_128_u_8x8);
6793 uint16x8_t intermediateResultsChannel1_16x8 = vmull_u8(pixels_u_8x8x4.val[0], factorChannel10_128_u_8x8);
6794
6795 intermediateResultsChannel0_16x8 = vmlal_u8(intermediateResultsChannel0_16x8, pixels_u_8x8x4.val[1], factorChannel01_128_u_8x8);
6796 intermediateResultsChannel1_16x8 = vmlal_u8(intermediateResultsChannel1_16x8, pixels_u_8x8x4.val[1], factorChannel11_128_u_8x8);
6797
6798 intermediateResultsChannel0_16x8 = vmlal_u8(intermediateResultsChannel0_16x8, pixels_u_8x8x4.val[2], factorChannel02_128_u_8x8);
6799 intermediateResultsChannel1_16x8 = vmlal_u8(intermediateResultsChannel1_16x8, pixels_u_8x8x4.val[2], factorChannel12_128_u_8x8);
6800
6801 intermediateResultsChannel0_16x8 = vmlal_u8(intermediateResultsChannel0_16x8, pixels_u_8x8x4.val[3], factorChannel03_128_u_8x8);
6802 intermediateResultsChannel1_16x8 = vmlal_u8(intermediateResultsChannel1_16x8, pixels_u_8x8x4.val[3], factorChannel13_128_u_8x8);
6803
6804 uint8x8x2_t results_u_8x8x2;
6805
6806 // we shift the 16 bit values by 7 bits (= 128), apply rounding, and narrow the 16 bit integers to 8 bit integers within one operation
6807
6808 results_u_8x8x2.val[0] = vqrshrn_n_u16(intermediateResultsChannel0_16x8, 7); // results_u_8x8x2.val[0] = (intermediateResultsChannel0_16x8 + 2^6) >> 2^7
6809 results_u_8x8x2.val[1] = vqrshrn_n_u16(intermediateResultsChannel1_16x8, 7);
6810
6811 // and we can store the result
6812 vst2_u8(target, results_u_8x8x2);
6813}
6814
/**
 * Converts 16 zipped pixels with 4 channels (8 bit each) to 16 zipped pixels with 3 channels (8 bit each).
 * Each target channel is a linear combination of the four source channels plus a bias value.
 * The products are accumulated with 32 bit precision, the sums are normalized by 2^7 with rounding and saturated to [0, 255].
 * The function reads 64 bytes from 'source' (vld4q_u8) and writes 48 bytes to 'target' (vst3q_u8).
 * Factor naming: factorChannel<T><S> is the factor applied to source channel S when computing target channel T.
 */
6815OCEAN_FORCE_INLINE void FrameChannels::convert4ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t* const source, uint8_t* const target, const int16x8_t& factorChannel00_128_s_16x8, const int16x8_t& factorChannel10_128_s_16x8, const int16x8_t& factorChannel20_128_s_16x8, const int16x8_t& factorChannel01_128_s_16x8, const int16x8_t& factorChannel11_128_s_16x8, const int16x8_t& factorChannel21_128_s_16x8, const int16x8_t& factorChannel02_128_s_16x8, const int16x8_t& factorChannel12_128_s_16x8, const int16x8_t& factorChannel22_128_s_16x8, const int16x8_t& factorChannel03_128_s_16x8, const int16x8_t& factorChannel13_128_s_16x8, const int16x8_t& factorChannel23_128_s_16x8, const int16x8_t& biasChannel0_128_s_16x8, const int16x8_t& biasChannel1_128_s_16x8, const int16x8_t& biasChannel2_128_s_16x8)
6816{
6817 ocean_assert(source != nullptr && target != nullptr);
6818
6819 // the following documentation uses a YUVA32 to RGB24 conversion as example
6820
6821 // approximation (the sums below are normalized by 2^7 at the very end):
6822 // R = f00 * s0 + f01 * s1 + f02 * s2 + f03 * s3 + b0
6823 // G = f10 * s0 + f11 * s1 + f12 * s2 + f13 * s3 + b1
6824 // B = f20 * s0 + f21 * s1 + f22 * s2 + f23 * s3 + b2
6825
6826 // we load 16 pixels (= 4 * 16 values) and directly deinterleave the 4 channels so that we receive the following patterns:
6827 // source_u_8x16x4.val[0]: s0 s0 s0 s0 s0 s0 s0 s0 s0 s0 s0 s0 s0 s0 s0 s0
6828 // source_u_8x16x4.val[1]: s1 s1 s1 s1 s1 s1 s1 s1 s1 s1 s1 s1 s1 s1 s1 s1
6829 // source_u_8x16x4.val[2]: s2 s2 s2 s2 s2 s2 s2 s2 s2 s2 s2 s2 s2 s2 s2 s2
6830 // source_u_8x16x4.val[3]: s3 s3 s3 s3 s3 s3 s3 s3 s3 s3 s3 s3 s3 s3 s3 s3
6831
6832 const uint8x16x4_t source_u_8x16x4 = vld4q_u8(source);
6833
6834 // widen 8 bit unsigned to 16 bit signed (values stay in [0, 255], so the re-interpretation as signed is lossless)
6835
6836 const int16x8_t source0_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x4.val[0])));
6837 const int16x8_t source1_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x4.val[1])));
6838 const int16x8_t source2_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x4.val[2])));
6839 const int16x8_t source3_low_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(source_u_8x16x4.val[3])));
6840
6841 const int16x8_t source0_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x4.val[0])));
6842 const int16x8_t source1_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x4.val[1])));
6843 const int16x8_t source2_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x4.val[2])));
6844 const int16x8_t source3_high_s_16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(source_u_8x16x4.val[3])));
6845
6846 // We need to use 32-bit intermediate results to avoid overflow in the 4-channel case
6847 // Split the 16x8 source vectors into 4 sets of 4 elements for 32-bit multiplication
6848
 // NOTE(review): only the lower four lanes of each factor vector are used (for all 16 pixels),
 // presumably all eight lanes hold the same value - confirm with the callers
6849 const int16x4_t factorChannel00_128_s_16x4 = vget_low_s16(factorChannel00_128_s_16x8);
6850 const int16x4_t factorChannel10_128_s_16x4 = vget_low_s16(factorChannel10_128_s_16x8);
6851 const int16x4_t factorChannel20_128_s_16x4 = vget_low_s16(factorChannel20_128_s_16x8);
6852
6853 const int16x4_t factorChannel01_128_s_16x4 = vget_low_s16(factorChannel01_128_s_16x8);
6854 const int16x4_t factorChannel11_128_s_16x4 = vget_low_s16(factorChannel11_128_s_16x8);
6855 const int16x4_t factorChannel21_128_s_16x4 = vget_low_s16(factorChannel21_128_s_16x8);
6856
6857 const int16x4_t factorChannel02_128_s_16x4 = vget_low_s16(factorChannel02_128_s_16x8);
6858 const int16x4_t factorChannel12_128_s_16x4 = vget_low_s16(factorChannel12_128_s_16x8);
6859 const int16x4_t factorChannel22_128_s_16x4 = vget_low_s16(factorChannel22_128_s_16x8);
6860
6861 const int16x4_t factorChannel03_128_s_16x4 = vget_low_s16(factorChannel03_128_s_16x8);
6862 const int16x4_t factorChannel13_128_s_16x4 = vget_low_s16(factorChannel13_128_s_16x8);
6863 const int16x4_t factorChannel23_128_s_16x4 = vget_low_s16(factorChannel23_128_s_16x8);
6864
6865 // Process 8 pixels in low part with widening 32-bit multiply-accumulate
 // naming: '_low_low' covers pixels 0-3, '_low_high' covers pixels 4-7
6866
6867 const int16x4_t source0_low_low_s_16x4 = vget_low_s16(source0_low_s_16x8);
6868 const int16x4_t source0_low_high_s_16x4 = vget_high_s16(source0_low_s_16x8);
6869 const int16x4_t source1_low_low_s_16x4 = vget_low_s16(source1_low_s_16x8);
6870 const int16x4_t source1_low_high_s_16x4 = vget_high_s16(source1_low_s_16x8);
6871 const int16x4_t source2_low_low_s_16x4 = vget_low_s16(source2_low_s_16x8);
6872 const int16x4_t source2_low_high_s_16x4 = vget_high_s16(source2_low_s_16x8);
6873 const int16x4_t source3_low_low_s_16x4 = vget_low_s16(source3_low_s_16x8);
6874 const int16x4_t source3_low_high_s_16x4 = vget_high_s16(source3_low_s_16x8);
6875
 // first source channel: widening multiplication initializes the 32 bit accumulators
6876 int32x4_t intermediateResults0_low_low_s_32x4 = vmull_s16(source0_low_low_s_16x4, factorChannel00_128_s_16x4);
6877 int32x4_t intermediateResults0_low_high_s_32x4 = vmull_s16(source0_low_high_s_16x4, factorChannel00_128_s_16x4);
6878 int32x4_t intermediateResults1_low_low_s_32x4 = vmull_s16(source0_low_low_s_16x4, factorChannel10_128_s_16x4);
6879 int32x4_t intermediateResults1_low_high_s_32x4 = vmull_s16(source0_low_high_s_16x4, factorChannel10_128_s_16x4);
6880 int32x4_t intermediateResults2_low_low_s_32x4 = vmull_s16(source0_low_low_s_16x4, factorChannel20_128_s_16x4);
6881 int32x4_t intermediateResults2_low_high_s_32x4 = vmull_s16(source0_low_high_s_16x4, factorChannel20_128_s_16x4);
6882
 // second source channel: widening multiply-accumulate
6883 intermediateResults0_low_low_s_32x4 = vmlal_s16(intermediateResults0_low_low_s_32x4, source1_low_low_s_16x4, factorChannel01_128_s_16x4);
6884 intermediateResults0_low_high_s_32x4 = vmlal_s16(intermediateResults0_low_high_s_32x4, source1_low_high_s_16x4, factorChannel01_128_s_16x4);
6885 intermediateResults1_low_low_s_32x4 = vmlal_s16(intermediateResults1_low_low_s_32x4, source1_low_low_s_16x4, factorChannel11_128_s_16x4);
6886 intermediateResults1_low_high_s_32x4 = vmlal_s16(intermediateResults1_low_high_s_32x4, source1_low_high_s_16x4, factorChannel11_128_s_16x4);
6887 intermediateResults2_low_low_s_32x4 = vmlal_s16(intermediateResults2_low_low_s_32x4, source1_low_low_s_16x4, factorChannel21_128_s_16x4);
6888 intermediateResults2_low_high_s_32x4 = vmlal_s16(intermediateResults2_low_high_s_32x4, source1_low_high_s_16x4, factorChannel21_128_s_16x4);
6889
 // third source channel
6890 intermediateResults0_low_low_s_32x4 = vmlal_s16(intermediateResults0_low_low_s_32x4, source2_low_low_s_16x4, factorChannel02_128_s_16x4);
6891 intermediateResults0_low_high_s_32x4 = vmlal_s16(intermediateResults0_low_high_s_32x4, source2_low_high_s_16x4, factorChannel02_128_s_16x4);
6892 intermediateResults1_low_low_s_32x4 = vmlal_s16(intermediateResults1_low_low_s_32x4, source2_low_low_s_16x4, factorChannel12_128_s_16x4);
6893 intermediateResults1_low_high_s_32x4 = vmlal_s16(intermediateResults1_low_high_s_32x4, source2_low_high_s_16x4, factorChannel12_128_s_16x4);
6894 intermediateResults2_low_low_s_32x4 = vmlal_s16(intermediateResults2_low_low_s_32x4, source2_low_low_s_16x4, factorChannel22_128_s_16x4);
6895 intermediateResults2_low_high_s_32x4 = vmlal_s16(intermediateResults2_low_high_s_32x4, source2_low_high_s_16x4, factorChannel22_128_s_16x4);
6896
 // fourth source channel
6897 intermediateResults0_low_low_s_32x4 = vmlal_s16(intermediateResults0_low_low_s_32x4, source3_low_low_s_16x4, factorChannel03_128_s_16x4);
6898 intermediateResults0_low_high_s_32x4 = vmlal_s16(intermediateResults0_low_high_s_32x4, source3_low_high_s_16x4, factorChannel03_128_s_16x4);
6899 intermediateResults1_low_low_s_32x4 = vmlal_s16(intermediateResults1_low_low_s_32x4, source3_low_low_s_16x4, factorChannel13_128_s_16x4);
6900 intermediateResults1_low_high_s_32x4 = vmlal_s16(intermediateResults1_low_high_s_32x4, source3_low_high_s_16x4, factorChannel13_128_s_16x4);
6901 intermediateResults2_low_low_s_32x4 = vmlal_s16(intermediateResults2_low_low_s_32x4, source3_low_low_s_16x4, factorChannel23_128_s_16x4);
6902 intermediateResults2_low_high_s_32x4 = vmlal_s16(intermediateResults2_low_high_s_32x4, source3_low_high_s_16x4, factorChannel23_128_s_16x4);
6903
6904 // Process 8 pixels in high part
 // naming: '_high_low' covers pixels 8-11, '_high_high' covers pixels 12-15; the steps mirror the low part
6905
6906 const int16x4_t source0_high_low_s_16x4 = vget_low_s16(source0_high_s_16x8);
6907 const int16x4_t source0_high_high_s_16x4 = vget_high_s16(source0_high_s_16x8);
6908 const int16x4_t source1_high_low_s_16x4 = vget_low_s16(source1_high_s_16x8);
6909 const int16x4_t source1_high_high_s_16x4 = vget_high_s16(source1_high_s_16x8);
6910 const int16x4_t source2_high_low_s_16x4 = vget_low_s16(source2_high_s_16x8);
6911 const int16x4_t source2_high_high_s_16x4 = vget_high_s16(source2_high_s_16x8);
6912 const int16x4_t source3_high_low_s_16x4 = vget_low_s16(source3_high_s_16x8);
6913 const int16x4_t source3_high_high_s_16x4 = vget_high_s16(source3_high_s_16x8);
6914
6915 int32x4_t intermediateResults0_high_low_s_32x4 = vmull_s16(source0_high_low_s_16x4, factorChannel00_128_s_16x4);
6916 int32x4_t intermediateResults0_high_high_s_32x4 = vmull_s16(source0_high_high_s_16x4, factorChannel00_128_s_16x4);
6917 int32x4_t intermediateResults1_high_low_s_32x4 = vmull_s16(source0_high_low_s_16x4, factorChannel10_128_s_16x4);
6918 int32x4_t intermediateResults1_high_high_s_32x4 = vmull_s16(source0_high_high_s_16x4, factorChannel10_128_s_16x4);
6919 int32x4_t intermediateResults2_high_low_s_32x4 = vmull_s16(source0_high_low_s_16x4, factorChannel20_128_s_16x4);
6920 int32x4_t intermediateResults2_high_high_s_32x4 = vmull_s16(source0_high_high_s_16x4, factorChannel20_128_s_16x4);
6921
6922 intermediateResults0_high_low_s_32x4 = vmlal_s16(intermediateResults0_high_low_s_32x4, source1_high_low_s_16x4, factorChannel01_128_s_16x4);
6923 intermediateResults0_high_high_s_32x4 = vmlal_s16(intermediateResults0_high_high_s_32x4, source1_high_high_s_16x4, factorChannel01_128_s_16x4);
6924 intermediateResults1_high_low_s_32x4 = vmlal_s16(intermediateResults1_high_low_s_32x4, source1_high_low_s_16x4, factorChannel11_128_s_16x4);
6925 intermediateResults1_high_high_s_32x4 = vmlal_s16(intermediateResults1_high_high_s_32x4, source1_high_high_s_16x4, factorChannel11_128_s_16x4);
6926 intermediateResults2_high_low_s_32x4 = vmlal_s16(intermediateResults2_high_low_s_32x4, source1_high_low_s_16x4, factorChannel21_128_s_16x4);
6927 intermediateResults2_high_high_s_32x4 = vmlal_s16(intermediateResults2_high_high_s_32x4, source1_high_high_s_16x4, factorChannel21_128_s_16x4);
6928
6929 intermediateResults0_high_low_s_32x4 = vmlal_s16(intermediateResults0_high_low_s_32x4, source2_high_low_s_16x4, factorChannel02_128_s_16x4);
6930 intermediateResults0_high_high_s_32x4 = vmlal_s16(intermediateResults0_high_high_s_32x4, source2_high_high_s_16x4, factorChannel02_128_s_16x4);
6931 intermediateResults1_high_low_s_32x4 = vmlal_s16(intermediateResults1_high_low_s_32x4, source2_high_low_s_16x4, factorChannel12_128_s_16x4);
6932 intermediateResults1_high_high_s_32x4 = vmlal_s16(intermediateResults1_high_high_s_32x4, source2_high_high_s_16x4, factorChannel12_128_s_16x4);
6933 intermediateResults2_high_low_s_32x4 = vmlal_s16(intermediateResults2_high_low_s_32x4, source2_high_low_s_16x4, factorChannel22_128_s_16x4);
6934 intermediateResults2_high_high_s_32x4 = vmlal_s16(intermediateResults2_high_high_s_32x4, source2_high_high_s_16x4, factorChannel22_128_s_16x4);
6935
6936 intermediateResults0_high_low_s_32x4 = vmlal_s16(intermediateResults0_high_low_s_32x4, source3_high_low_s_16x4, factorChannel03_128_s_16x4);
6937 intermediateResults0_high_high_s_32x4 = vmlal_s16(intermediateResults0_high_high_s_32x4, source3_high_high_s_16x4, factorChannel03_128_s_16x4);
6938 intermediateResults1_high_low_s_32x4 = vmlal_s16(intermediateResults1_high_low_s_32x4, source3_high_low_s_16x4, factorChannel13_128_s_16x4);
6939 intermediateResults1_high_high_s_32x4 = vmlal_s16(intermediateResults1_high_high_s_32x4, source3_high_high_s_16x4, factorChannel13_128_s_16x4);
6940 intermediateResults2_high_low_s_32x4 = vmlal_s16(intermediateResults2_high_low_s_32x4, source3_high_low_s_16x4, factorChannel23_128_s_16x4);
6941 intermediateResults2_high_high_s_32x4 = vmlal_s16(intermediateResults2_high_high_s_32x4, source3_high_high_s_16x4, factorChannel23_128_s_16x4);
6942
6943 // Convert bias from 16-bit to 32-bit for addition
 // NOTE(review): as for the factors, only the lower four lanes of each bias vector are used - presumably all lanes are identical, confirm with the callers
6944 const int32x4_t biasChannel0_128_s_32x4 = vmovl_s16(vget_low_s16(biasChannel0_128_s_16x8));
6945 const int32x4_t biasChannel1_128_s_32x4 = vmovl_s16(vget_low_s16(biasChannel1_128_s_16x8));
6946 const int32x4_t biasChannel2_128_s_32x4 = vmovl_s16(vget_low_s16(biasChannel2_128_s_16x8));
6947
6948 // Add bias (bias is pre-scaled by 128 in the calling code)
 // the bias is added before the normalization below, so it takes part in the shift by 7 bits
6949 intermediateResults0_low_low_s_32x4 = vaddq_s32(intermediateResults0_low_low_s_32x4, biasChannel0_128_s_32x4);
6950 intermediateResults0_low_high_s_32x4 = vaddq_s32(intermediateResults0_low_high_s_32x4, biasChannel0_128_s_32x4);
6951 intermediateResults1_low_low_s_32x4 = vaddq_s32(intermediateResults1_low_low_s_32x4, biasChannel1_128_s_32x4);
6952 intermediateResults1_low_high_s_32x4 = vaddq_s32(intermediateResults1_low_high_s_32x4, biasChannel1_128_s_32x4);
6953 intermediateResults2_low_low_s_32x4 = vaddq_s32(intermediateResults2_low_low_s_32x4, biasChannel2_128_s_32x4);
6954 intermediateResults2_low_high_s_32x4 = vaddq_s32(intermediateResults2_low_high_s_32x4, biasChannel2_128_s_32x4);
6955
6956 intermediateResults0_high_low_s_32x4 = vaddq_s32(intermediateResults0_high_low_s_32x4, biasChannel0_128_s_32x4);
6957 intermediateResults0_high_high_s_32x4 = vaddq_s32(intermediateResults0_high_high_s_32x4, biasChannel0_128_s_32x4);
6958 intermediateResults1_high_low_s_32x4 = vaddq_s32(intermediateResults1_high_low_s_32x4, biasChannel1_128_s_32x4);
6959 intermediateResults1_high_high_s_32x4 = vaddq_s32(intermediateResults1_high_high_s_32x4, biasChannel1_128_s_32x4);
6960 intermediateResults2_high_low_s_32x4 = vaddq_s32(intermediateResults2_high_low_s_32x4, biasChannel2_128_s_32x4);
6961 intermediateResults2_high_high_s_32x4 = vaddq_s32(intermediateResults2_high_high_s_32x4, biasChannel2_128_s_32x4);
6962
6963 // Shift and narrow from 32-bit to 16-bit, then from 16-bit to 8-bit
6964 uint8x16x3_t results_u_8x16x3;
6965
6966 // vqrshrun_n_s32: rounding shift right by 7 and narrow signed 32-bit to unsigned 16-bit with saturation
6967 // vqmovn_u16: narrow unsigned 16-bit to unsigned 8-bit with saturation
 // the four partial vectors are combined in pixel order: low_low, low_high, high_low, high_high
6968 results_u_8x16x3.val[0] = vcombine_u8(
6969 vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults0_low_low_s_32x4, 7), vqrshrun_n_s32(intermediateResults0_low_high_s_32x4, 7))),
6970 vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults0_high_low_s_32x4, 7), vqrshrun_n_s32(intermediateResults0_high_high_s_32x4, 7))));
6971 results_u_8x16x3.val[1] = vcombine_u8(
6972 vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults1_low_low_s_32x4, 7), vqrshrun_n_s32(intermediateResults1_low_high_s_32x4, 7))),
6973 vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults1_high_low_s_32x4, 7), vqrshrun_n_s32(intermediateResults1_high_high_s_32x4, 7))));
6974 results_u_8x16x3.val[2] = vcombine_u8(
6975 vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults2_low_low_s_32x4, 7), vqrshrun_n_s32(intermediateResults2_low_high_s_32x4, 7))),
6976 vqmovn_u16(vcombine_u16(vqrshrun_n_s32(intermediateResults2_high_low_s_32x4, 7), vqrshrun_n_s32(intermediateResults2_high_high_s_32x4, 7))));
6977
6978 // and we can store the result (interleaving store of 16 pixels with 3 channels)
6979 vst3q_u8(target, results_u_8x16x3);
6980}
6981
6982#endif // OCEAN_HARDWARE_NEON_VERSION
6983
6984}
6985
6986}
6987
6988#endif // META_OCEAN_CV_FRAME_CHANNELS_H
The following comfort class provides comfortable functions simplifying prototyping applications but a...
Definition FrameChannels.h:51
static bool premultipliedAlphaToStraightAlpha(const Frame &source, Frame &target, Worker *worker=nullptr)
Converts an image with premultiplied alpha to a straight image (without premultiplied alpha).
static bool zipChannels(const Frames &sourceFrames, Frame &targetFrame, const FrameType::PixelFormat targetPixelFormat=FrameType::FORMAT_UNDEFINED)
Zips/interleaves 1-channel images into one image with n-channels.
static bool separateTo1Channel(const Frame &sourceFrame, Frames &targetFrames, const FrameType::PixelFormat targetPixelFormat=FrameType::FORMAT_UNDEFINED)
Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24,...
static bool premultipliedAlphaToStraightAlpha(Frame &frame, Worker *worker=nullptr)
Converts an image with premultiplied alpha to a straight image (without premultiplied alpha).
static bool separateTo1Channel(const Frame &sourceFrame, const std::initializer_list< Frame * > &targetFrames, const FrameType::PixelFormat targetPixelFormat=FrameType::FORMAT_UNDEFINED)
Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24,...
static bool zipChannels(const std::initializer_list< Frame > &sourceFrames, Frame &targetFrame, const FrameType::PixelFormat targetPixelFormat=FrameType::FORMAT_UNDEFINED)
Zips/interleaves 1-channel images into one image with n-channels.
static bool straightAlphaToPremultipliedAlpha(Frame &frame, Worker *worker=nullptr)
Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied al...
static bool straightAlphaToPremultipliedAlpha(const Frame &source, Frame &target, Worker *worker=nullptr)
Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied al...
This class implements frame channel conversion, transformation and extraction functions.
Definition FrameChannels.h:31
static void reverseChannelOrder(const T *source, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Reverses the order of the channels of a frame with zipped pixel format.
Definition FrameChannels.h:2964
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel10BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &factorChannel00_1024_s_16x8, const __m128i &factorChannel10_1024_s_16x8, const __m128i &factorChannel20_1024_s_16x8, const __m128i &factorChannel01_1024_s_16x8, const __m128i &factorChannel11_1024_s_16x8, const __m128i &factorChannel21_1024_s_16x8, const __m128i &factorChannel02_1024_s_16x8, const __m128i &factorChannel12_1024_s_16x8, const __m128i &factorChannel22_1024_s_16x8, const __m128i &biasChannel0_1024_s_32x4, const __m128i &biasChannel1_1024_s_32x4, const __m128i &biasChannel2_1024_s_32x4)
Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear co...
Definition FrameChannels.h:5470
static void addChannelValueRow(const T *source, T *target, const size_t size, const void *channelValueParameter)
Adds a channel to a given row with generic (zipped) pixel format and sets all values to a specified v...
Definition FrameChannels.h:4416
static void shuffleRowChannelsAndSetLastChannelValue(const T *source, T *target, const size_t size, const void *options=nullptr)
Shuffles the channels of row pixels by application of a specified shuffle pattern and sets the last c...
Definition FrameChannels.h:3871
static OCEAN_FORCE_INLINE void convert3ChannelsTo4Channels16Pixels8BitPerChannel6BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &factorChannel00_64_s_16x8, const __m128i &factorChannel10_64_s_16x8, const __m128i &factorChannel20_64_s_16x8, const __m128i &factorChannel01_64_s_16x8, const __m128i &factorChannel11_64_s_16x8, const __m128i &factorChannel21_64_s_16x8, const __m128i &factorChannel02_64_s_16x8, const __m128i &factorChannel12_64_s_16x8, const __m128i &factorChannel22_64_s_16x8, const __m128i &biasChannel0_s_16x8, const __m128i &biasChannel1_s_16x8, const __m128i &biasChannel2_s_16x8, const __m128i &channelValue3_u_8x16)
Converts 16 pixels with 3 channels per pixel to 16 pixels with four channels per pixel by a linear com...
Definition FrameChannels.h:5683
static void separateTo1Channel(const TSource *const sourceFrame, TTarget *const *const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int *targetFramesPaddingElements)
Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24,...
Definition FrameChannels.h:1971
static OCEAN_FORCE_INLINE void convert3ChannelsTo4Channels16Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x8_t &factorChannel00_64_s_16x8, const int16x8_t &factorChannel10_64_s_16x8, const int16x8_t &factorChannel20_64_s_16x8, const int16x8_t &factorChannel01_64_s_16x8, const int16x8_t &factorChannel11_64_s_16x8, const int16x8_t &factorChannel21_64_s_16x8, const int16x8_t &factorChannel02_64_s_16x8, const int16x8_t &factorChannel12_64_s_16x8, const int16x8_t &factorChannel22_64_s_16x8, const uint8x8_t &biasChannel0_u_8x8, const uint8x8_t &biasChannel1_u_8x8, const uint8x8_t &biasChannel2_u_8x8, const uint8x16_t &channelValue3_u_8x16)
Converts 16 pixels with 3 channels per pixel to 16 pixels with 4 channels per pixel by a linear combi...
Definition FrameChannels.h:6632
static void addChannelRow(const void **sources, void **targets, const unsigned int multipleRowIndex, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const void *options)
Adds a channel to a given row with generic (zipped) pixel format and copies the information of the ne...
Definition FrameChannels.h:4316
static void shuffleChannelsAndSetLastChannelValue(const T *source, const T newChannelValue, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Shuffles the channels of source frame and sets the last channel with constant value in the target fra...
Definition FrameChannels.h:4034
static OCEAN_FORCE_INLINE void convert3ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &multiplicationFactors0_128_u_16x8, const __m128i &multiplicationFactors1_128_u_16x8, const __m128i &multiplicationFactors2_128_u_16x8)
Converts 16 pixels with 3 channels per pixel to 16 pixels with one channel per pixel by a linear comb...
Definition FrameChannels.h:5316
static void shuffleChannels(const T *source, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Shuffles the channels of a frame by an arbitrary pattern.
Definition FrameChannels.h:4006
static void convertRow3ChannelsTo1Channel8BitPerChannel7BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *channelMultiplicationFactors_128)
Converts a row of pixels with 3 channels to pixels with one channel by a linear combination of the fo...
Definition FrameChannels.h:5006
static void convertRow4ChannelsTo1Channel8BitPerChannel7BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *channelMultiplicationFactors_128)
Converts a row of pixels with 4 channels to pixels with one channel by a linear combination of the fo...
Definition FrameChannels.h:5079
static void copyChannel(const T *source, T *target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Copies one channel from a given frame with zipped pixel format to another frame with zipped pixel for...
Definition FrameChannels.h:2923
static void zipChannels(const TSource *const *const sourceFrames, TTarget *const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int *sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
Zips/interleaves 1-channel images into one image with n-channels.
Definition FrameChannels.h:2722
static void straightAlphaToPremultipliedAlpha8BitPerChannel(uint8_t *const frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, Worker *worker=nullptr)
Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied al...
Definition FrameChannels.h:4219
static void applyRowOperator(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const RowOperatorFunction< TSource, TTarget, tSourceChannels, tTargetChannels > &rowOperatorFunction, Worker *worker=nullptr)
Applies a row operator to all rows of a source image.
Definition FrameChannels.h:4134
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x8_t &factorChannel00_64_s_16x8, const int16x8_t &factorChannel10_64_s_16x8, const int16x8_t &factorChannel20_64_s_16x8, const int16x8_t &factorChannel01_64_s_16x8, const int16x8_t &factorChannel11_64_s_16x8, const int16x8_t &factorChannel21_64_s_16x8, const int16x8_t &factorChannel02_64_s_16x8, const int16x8_t &factorChannel12_64_s_16x8, const int16x8_t &factorChannel22_64_s_16x8, const uint8x8_t &biasChannel0_u_8x8, const uint8x8_t &biasChannel1_u_8x8, const uint8x8_t &biasChannel2_u_8x8)
Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear co...
Definition FrameChannels.h:6211
static void convertRow4ChannelsTo2Channels8BitPerChannel7BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *multiplicationFactors_128)
Converts a row of pixels with 4 channels to pixels with two channels by a linear combination of the fo...
static void setChannelSubset(T *frame, const unsigned int width, const T value, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Sets one channel of a frame with one unique value.
Definition FrameChannels.h:4615
static void applyBivariateOperatorSubset(const TSource0 *source0, const TSource1 *source1, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int source0PaddingElements, const unsigned int source1PaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows)
Generic bivariate pixel operations.
Definition FrameChannels.h:4850
static void applyAdvancedPixelModifier(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, Worker *worker=nullptr)
Applies a specific modifier function on each pixel.
Definition FrameChannels.h:4096
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels8Pixels8BitPerChannel6BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x8_t &factorChannel00_64_s_16x8, const int16x8_t &factorChannel10_64_s_16x8, const int16x8_t &factorChannel20_64_s_16x8, const int16x8_t &factorChannel01_64_s_16x8, const int16x8_t &factorChannel11_64_s_16x8, const int16x8_t &factorChannel21_64_s_16x8, const int16x8_t &factorChannel02_64_s_16x8, const int16x8_t &factorChannel12_64_s_16x8, const int16x8_t &factorChannel22_64_s_16x8, const uint8x8_t &biasChannel0_u_8x8, const uint8x8_t &biasChannel1_u_8x8, const uint8x8_t &biasChannel2_u_8x8)
Converts 8 pixels with 3 channels per pixel to 8 pixels with three channels per pixel by a linear comb...
Definition FrameChannels.h:6157
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel10BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x4_t &factorChannel00_1024_s_16x4, const int16x4_t &factorChannel10_1024_s_16x4, const int16x4_t &factorChannel20_1024_s_16x4, const int16x4_t &factorChannel01_1024_s_16x4, const int16x4_t &factorChannel11_1024_s_16x4, const int16x4_t &factorChannel21_1024_s_16x4, const int16x4_t &factorChannel02_1024_s_16x4, const int16x4_t &factorChannel12_1024_s_16x4, const int16x4_t &factorChannel22_1024_s_16x4, const int32x4_t &biasChannel0_1024_s_32x4, const int32x4_t &biasChannel1_1024_s_32x4, const int32x4_t &biasChannel2_1024_s_32x4)
Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear com...
Definition FrameChannels.h:6424
static OCEAN_FORCE_INLINE void convert4ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const uint8x8_t &factorChannel0_128_u_8x8, const uint8x8_t &factorChannel1_128_u_8x8, const uint8x8_t &factorChannel2_128_u_8x8, const uint8x8_t &factorChannel3_128_u_8x8)
Converts 8 pixels with 4 channels per pixel to 8 pixels with one channel per pixel by a linear combin...
static void addFirstChannelValue(const T *source, const T newChannelValue, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Adds a new channel to a given frame with zipped pixel format, the value of the new channel will be th...
Definition FrameChannels.h:2835
static void addLastChannel(const T *source, const T *sourceNewChannel, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int sourceNewChannelPaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Adds a new channel to a given frame with zipped pixel format, the new channel will be added to the ba...
Definition FrameChannels.h:2855
static void removeFirstChannel(const T *source, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Removes the first channel from a given frame with zipped (generic) pixel format.
Definition FrameChannels.h:2891
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels8Pixels8BitPerChannel10BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x4_t &factorChannel00_1024_s_16x4, const int16x4_t &factorChannel10_1024_s_16x4, const int16x4_t &factorChannel20_1024_s_16x4, const int16x4_t &factorChannel01_1024_s_16x4, const int16x4_t &factorChannel11_1024_s_16x4, const int16x4_t &factorChannel21_1024_s_16x4, const int16x4_t &factorChannel02_1024_s_16x4, const int16x4_t &factorChannel12_1024_s_16x4, const int16x4_t &factorChannel22_1024_s_16x4, const int32x4_t &biasChannel0_1024_s_32x4, const int32x4_t &biasChannel1_1024_s_32x4, const int32x4_t &biasChannel2_1024_s_32x4)
Converts 8 pixels with 3 channels per pixel to 8 pixels with three channels per pixel by a linear comb...
Definition FrameChannels.h:6333
static void addLastChannelValue(const T *source, const T newChannelValue, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Adds a new channel to a given frame with zipped pixel format, the value of the new channel will be th...
Definition FrameChannels.h:2871
static void convertRow3ChannelsTo3Channels8BitPerChannel6BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a row of pixels with 3 channels to pixels with 3 channels by a linear combination of the thr...
static void reverseRowPixelOrderInPlace(T *data, const size_t size)
Reverses/mirrors the order of pixels in a given row (or a memory block in general) in place.
Definition FrameChannels.h:3141
static void applyRowOperatorSubset(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int sourceStrideElements, const unsigned int targetStrideElements, const RowOperatorFunction< TSource, TTarget, tSourceChannels, tTargetChannels > rowOperatorFunction, const unsigned int firstRow, const unsigned int numberRows)
Applies a row operator to a subset of all rows of a source image.
Definition FrameChannels.h:4983
static void applyPixelModifier(const T *source, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, Worker *worker=nullptr)
Applies a specific modifier function on each pixel.
Definition FrameChannels.h:4078
static void narrowRow16BitPerChannelTo8BitPerChannel(const uint16_t *source, uint8_t *target, const size_t size, const void *unusedParameters=nullptr)
Narrows a row of pixels with 16 bit channels to pixels with 8 bit channels.
Definition FrameChannels.h:4257
static void applyAdvancedPixelModifierSubset(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows)
Applies a specific modifier function on each pixel.
Definition FrameChannels.h:4743
static void convertRow4ChannelsTo3Channels8BitPerChannel7BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a row of pixels with 4 channels to pixels with 3 channels by a linear combination of the fou...
static OCEAN_FORCE_INLINE void convert4ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &factorChannel00_128_s_16x8, const __m128i &factorChannel10_128_s_16x8, const __m128i &factorChannel20_128_s_16x8, const __m128i &factorChannel01_128_s_16x8, const __m128i &factorChannel11_128_s_16x8, const __m128i &factorChannel21_128_s_16x8, const __m128i &factorChannel02_128_s_16x8, const __m128i &factorChannel12_128_s_16x8, const __m128i &factorChannel22_128_s_16x8, const __m128i &factorChannel03_128_s_16x8, const __m128i &factorChannel13_128_s_16x8, const __m128i &factorChannel23_128_s_16x8, const __m128i &biasChannel0_s_16x8, const __m128i &biasChannel1_s_16x8, const __m128i &biasChannel2_s_16x8)
Converts 16 pixels with 4 channels per pixel to 16 pixels with three channels per pixel by a linear co...
Definition FrameChannels.h:5763
static void shuffleRowChannels(const T *source, T *target, const size_t size, const void *unusedOptions=nullptr)
Shuffles the channels of row pixels by application of a specified shuffle pattern.
Definition FrameChannels.h:3511
static void premultipliedAlphaToStraightAlpha8BitPerChannel(uint8_t *const frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, Worker *worker=nullptr)
Converts an image with premultiplied alpha to a straight image (without premultiplied alpha).
Definition FrameChannels.h:4181
static void convertRow3ChannelsTo3Channels8BitPerChannel7BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a row of pixels with 3 channels to pixels with 3 channels by a linear combination of the thr...
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x8_t &factorChannel00_128_s_16x8, const int16x8_t &factorChannel10_128_s_16x8, const int16x8_t &factorChannel20_128_s_16x8, const int16x8_t &factorChannel01_128_s_16x8, const int16x8_t &factorChannel11_128_s_16x8, const int16x8_t &factorChannel21_128_s_16x8, const int16x8_t &factorChannel02_128_s_16x8, const int16x8_t &factorChannel12_128_s_16x8, const int16x8_t &factorChannel22_128_s_16x8, const int16x8_t &biasChannel0_128_s_16x8, const int16x8_t &biasChannel1_128_s_16x8, const int16x8_t &biasChannel2_128_s_16x8)
Converts 16 pixels with 3 channels per pixel to 16 pixels with 3 channels per pixel by a linear combi...
Definition FrameChannels.h:6550
static void convertRow3ChannelsTo4Channels8BitPerChannel6BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a row of pixels with 3 channels to pixels with 4 channels by a linear combination of the thr...
static constexpr unsigned int CHANNELS_NOT_KNOWN_AT_COMPILE_TIME
Definition of a constant to specify that the number of channels is not known at compile time but at ...
Definition FrameChannels.h:37
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x8_t &factorChannel00_128_s_16x8, const int16x8_t &factorChannel10_128_s_16x8, const int16x8_t &factorChannel20_128_s_16x8, const int16x8_t &factorChannel01_128_s_16x8, const int16x8_t &factorChannel11_128_s_16x8, const int16x8_t &factorChannel21_128_s_16x8, const int16x8_t &factorChannel02_128_s_16x8, const int16x8_t &factorChannel12_128_s_16x8, const int16x8_t &factorChannel22_128_s_16x8, const int16x8_t &biasChannel0_128_s_16x8, const int16x8_t &biasChannel1_128_s_16x8, const int16x8_t &biasChannel2_128_s_16x8)
Converts 8 pixels with 3 channels per pixel to 8 pixels with three channels per pixel by a linear comb...
Definition FrameChannels.h:6276
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel6BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &factorChannel00_64_s_16x8, const __m128i &factorChannel10_64_s_16x8, const __m128i &factorChannel20_64_s_16x8, const __m128i &factorChannel01_64_s_16x8, const __m128i &factorChannel11_64_s_16x8, const __m128i &factorChannel21_64_s_16x8, const __m128i &factorChannel02_64_s_16x8, const __m128i &factorChannel12_64_s_16x8, const __m128i &factorChannel22_64_s_16x8, const __m128i &biasChannel0_s_16x8, const __m128i &biasChannel1_s_16x8, const __m128i &biasChannel2_s_16x8)
Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear co...
Definition FrameChannels.h:5607
static void copyChannelRow(const T *source, T *target, const size_t size, const void *unusedParameters=nullptr)
Copies one channel from a source row to a target row with generic (zipped) pixel format.
Definition FrameChannels.h:4455
static void reverseRowPixelOrder(const T *source, T *target, const size_t size)
Reverses/mirrors the order of pixels in a given row (or a memory block in general).
Definition FrameChannels.h:2980
static OCEAN_FORCE_INLINE void convert3ChannelsTo1Channel8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const uint8x8_t &factorChannel0_128_u_8x8, const uint8x8_t &factorChannel1_128_u_8x8, const uint8x8_t &factorChannel2_128_u_8x8)
Converts 8 pixels with 3 channels per pixel to 8 pixels with one channel per pixel by a linear combin...
static OCEAN_FORCE_INLINE void convert4ChannelsTo1Channel16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &multiplicationFactors0123_128_s_32x)
Converts 16 pixels with 4 channels per pixel to 16 pixels with one channel per pixel by a linear comb...
Definition FrameChannels.h:5921
static void removeLastChannel(const T *source, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Removes the last channel from a given frame with zipped (generic) pixel format.
Definition FrameChannels.h:2907
static void transformGeneric(const T *source, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker)
Transforms a frame with generic pixel format (with zipped pixel information) like RGB24 or YUV24,...
Definition FrameChannels.h:4156
static void setChannel(T *frame, const unsigned int width, const unsigned int height, const T value, const unsigned int framePaddingElements, Worker *worker=nullptr)
Sets one channel of a frame with a specific unique value.
Definition FrameChannels.h:2945
static void straightAlphaToPremultipliedAlpha8BitPerChannelSubset(uint8_t *const frame, const unsigned int width, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Converts an image with straight alpha (without premultiplied alpha) to an image with premultiplied al...
Definition FrameChannels.h:5243
static void narrow16BitPerChannelTo8BitPerChannel(const uint16_t *source, uint8_t *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Narrows 16 bit channels of a frame to 8 bit channels.
Definition FrameChannels.h:4062
static OCEAN_FORCE_INLINE void convert4ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const int16x8_t &factorChannel00_128_s_16x8, const int16x8_t &factorChannel10_128_s_16x8, const int16x8_t &factorChannel20_128_s_16x8, const int16x8_t &factorChannel01_128_s_16x8, const int16x8_t &factorChannel11_128_s_16x8, const int16x8_t &factorChannel21_128_s_16x8, const int16x8_t &factorChannel02_128_s_16x8, const int16x8_t &factorChannel12_128_s_16x8, const int16x8_t &factorChannel22_128_s_16x8, const int16x8_t &factorChannel03_128_s_16x8, const int16x8_t &factorChannel13_128_s_16x8, const int16x8_t &factorChannel23_128_s_16x8, const int16x8_t &biasChannel0_128_s_16x8, const int16x8_t &biasChannel1_128_s_16x8, const int16x8_t &biasChannel2_128_s_16x8)
Converts 16 pixels with 4 channels per pixel to 16 pixels with three channels per pixel by a linear c...
Definition FrameChannels.h:6815
static void transformGenericSubset(const uint8_t *source, uint8_t *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const RowReversePixelOrderFunction< void > rowReversePixelOrderFunction, const unsigned int bytesPerRow, const unsigned int sourceStrideBytes, const unsigned int targetStrideBytes, const unsigned int firstRow, const unsigned int numberRows)
Transforms a subset of a frame with generic pixel format (with zipped pixel information) like RGB24 o...
static OCEAN_FORCE_INLINE void convert3ChannelsTo3Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &factorChannel00_128_s_16x8, const __m128i &factorChannel10_128_s_16x8, const __m128i &factorChannel20_128_s_16x8, const __m128i &factorChannel01_128_s_16x8, const __m128i &factorChannel11_128_s_16x8, const __m128i &factorChannel21_128_s_16x8, const __m128i &factorChannel02_128_s_16x8, const __m128i &factorChannel12_128_s_16x8, const __m128i &factorChannel22_128_s_16x8, const __m128i &biasChannel0_s_16x8, const __m128i &biasChannel1_s_16x8, const __m128i &biasChannel2_s_16x8)
Converts 16 pixels with 3 channels per pixel to 16 pixels with three channels per pixel by a linear co...
Definition FrameChannels.h:5382
static void reverseRowChannelOrder(const T *source, T *target, const size_t size, const void *unusedOptions=nullptr)
Reverses/mirrors the order of channels in a given row (or a memory block in general).
Definition FrameChannels.h:3319
static void convertRow3ChannelsTo3Channels8BitPerChannel10BitPrecision(const uint8_t *source, uint8_t *target, const size_t size, const void *parameters)
Converts a row of pixels with 3 channels to pixels with 3 channels by a linear combination of the thr...
static void applyBivariateOperator(const TSource0 *source0, const TSource1 *source1, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int source0PaddingElements, const unsigned int source1PaddingElements, const unsigned int targetPaddingElements, const ConversionFlag conversionFlag, Worker *worker=nullptr)
Generic bivariate pixel operations. Applies bivariate per-pixel operators: C(y, x) = op(A(y,...
Definition FrameChannels.h:4115
static void addFirstChannel(const T *source, const T *sourceNewChannel, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int sourcePaddingElements, const unsigned int sourceNewChannelPaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Adds a new channel to a given frame with zipped pixel format, the new channel will be added to the fr...
Definition FrameChannels.h:2819
static OCEAN_FORCE_INLINE void convert4ChannelsTo2Channels8Pixels8BitPerChannel7BitPrecisionNEON(const uint8_t *const source, uint8_t *const target, const uint8x8_t &factorChannel00_128_u_8x8, const uint8x8_t &factorChannel10_128_u_8x8, const uint8x8_t &factorChannel01_128_u_8x8, const uint8x8_t &factorChannel11_128_u_8x8, const uint8x8_t &factorChannel02_128_u_8x8, const uint8x8_t &factorChannel12_128_u_8x8, const uint8x8_t &factorChannel03_128_u_8x8, const uint8x8_t &factorChannel13_128_u_8x8)
Converts 8 pixels with 4 channels per pixel to 8 pixels with two channels per pixel by a linear combi...
Definition FrameChannels.h:6766
static void separateTo1ChannelRuntime(const TSource *const sourceFrame, TTarget *const *const targetFrames, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourceFramePaddingElements, const unsigned int *targetFramesPaddingElements)
Separates a given frame with zipped pixel format e.g., FORMAT_RGB24, FORMAT_YUV24,...
Definition FrameChannels.h:4476
static void zipChannelsRuntime(const TSource *const *const sourceFrames, TTarget *const targetFrame, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int *sourceFramesPaddingElements, const unsigned int targetFramePaddingElements)
Zips/interleaves 1-channel images into one image with n-channels.
Definition FrameChannels.h:4549
void(*)(const TSource *sourceRow, TTarget *targetRow, const unsigned int width, const unsigned int height, unsigned int rowIndex, const unsigned int sourceStrideElements, const unsigned int targetStrideElements) RowOperatorFunction
Definition of a function pointer to a function able to operate on an entire image row.
Definition FrameChannels.h:43
static void applyPixelModifierSubset(const T *source, T *target, const unsigned int width, const unsigned int height, const ConversionFlag conversionFlag, const unsigned int firstRow, const unsigned int numberRows)
Applies a specific modifier function on each pixel.
Definition FrameChannels.h:4638
static void premultipliedAlphaToStraightAlpha8BitPerChannelSubset(uint8_t *const frame, const unsigned int width, const unsigned int framePaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Converts an image with premultiplied alpha to a straight image (without premultiplied alpha).
Definition FrameChannels.h:5154
static OCEAN_FORCE_INLINE void convert4ChannelsTo2Channels16Pixels8BitPerChannel7BitPrecisionSSE(const uint8_t *const source, uint8_t *const target, const __m128i &multiplicationFactorsChannel0_0123_128_s_16x8, const __m128i &multiplicationFactorsChannel1_0123_128_s_16x8)
Converts 16 pixels with 4 channels per pixel to 16 pixels with two channels per pixel by a linear comb...
Definition FrameChannels.h:5984
This is the base class for all frame converter classes.
Definition FrameConverter.h:32
ConversionFlag
Definition of individual conversion flags.
Definition FrameConverter.h:39
@ CONVERT_NORMAL
Normal conversion, neither flips nor mirrors the image.
Definition FrameConverter.h:49
@ CONVERT_FLIPPED_AND_MIRRORED
Rotated conversion, rotates the image by 180.0 degrees with anchor in the center of the image.
Definition FrameConverter.h:82
@ CONVERT_MIRRORED
Mirrored conversion, exchanges left and right of the image (like in a mirror, mirroring around the y-...
Definition FrameConverter.h:71
@ CONVERT_FLIPPED
Flipped conversion, exchanges top and bottom of the image (flipping around the x-axis).
Definition FrameConverter.h:60
static void convertGenericPixelFormat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int sourceStrideElements, const unsigned int targetStrideElements, const ConversionFlag flag, const RowConversionFunction< TSource, TTarget > rowConversionFunction, const RowReversePixelOrderInPlaceFunction< TTarget > targetReversePixelOrderInPlaceFunction, const bool areContinuous, const void *options, Worker *worker)
Converts a frame with generic pixel format (e.g., RGBA32, BGR24, YUV24, ...) to a frame with generic ...
Definition FrameConverter.h:3483
void(*)(T *row, const size_t width) RowReversePixelOrderInPlaceFunction
Definition of a function pointer to a function able to reverse the order of pixels in an image row wi...
Definition FrameConverter.h:603
void(*)(const T *inputRow, T *targetRow, const size_t width) RowReversePixelOrderFunction
Definition of a function pointer to a function able to reverse the order of pixels in an image row wi...
Definition FrameConverter.h:594
static void convertArbitraryPixelFormat(const void **sources, void **targets, const unsigned int width, const unsigned int height, const ConversionFlag flag, const unsigned int multipleRowsPerIteration, const MultipleRowsConversionFunction multipleRowsConversionFunction, const void *options, Worker *worker)
Converts a frame with arbitrary pixel format (e.g., Y_UV12, Y_VU12, YUYV16, ...) to a frame with arbi...
Definition FrameConverter.h:3506
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition NEON.h:1216
static __m128i divideByRightShiftSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Divides four signed 32 bit values by applying a right shift.
Definition SSE.h:3173
static __m128i load128i(const void *const buffer)
Loads a 128i value from the memory.
Definition SSE.h:3724
static void store128i(const __m128i &value, uint8_t *const buffer)
Stores a 128i value to the memory.
Definition SSE.h:3869
static __m128i divideByRightShiftSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight int16_t values by applying a right shift.
Definition SSE.h:3104
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i &values0, const __m128i &values1, __m128i &results0, __m128i &results1)
Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
Definition SSE.h:4014
static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const __m128i &channel0, const __m128i &channel1, const __m128i &channel2, __m128i &interleavedA, __m128i &interleavedB, __m128i &interleavedC)
Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3410
static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2, __m128i &reversedInterleaved0, __m128i &reversedInterleaved1, __m128i &reversedInterleaved2)
Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channe...
Definition SSE.h:3492
static __m128i removeHighBits16_8(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements.
Definition SSE.h:3904
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3369
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3875
static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels...
Definition SSE.h:3517
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i &values0, const __m128i &values1, __m128i &products0, __m128i &products1)
Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
Definition SSE.h:4005
static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels...
Definition SSE.h:3477
static Caller< void > createStatic(typename StaticFunctionPointerMaker< void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a static function with no function parameter.
Definition Caller.h:2877
This class implements Ocean's image class.
Definition Frame.h:1879
PixelFormat
Definition of all pixel formats available in the Ocean framework.
Definition Frame.h:183
typename TypeMapperBySize< sizeof(T)>::Type Type
Definition of the mapped data type, selected by the size of T.
Definition DataType.h:508
This class implements a worker able to distribute function calls over different threads.
Definition Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
std::vector< Index32 > Indices32
Definition of a vector holding 32 bit index values.
Definition Base.h:96
std::vector< Frame > Frames
Definition of a vector holding frames.
Definition Frame.h:1842
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
Default definition of a type with tBytes bytes.
Definition DataType.h:32