Ocean
FrameTransposer.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_FRAME_TRANSPOSER_H
9 #define META_OCEAN_CV_FRAME_TRANSPOSER_H
10 
11 #include "ocean/cv/CV.h"
12 #include "ocean/cv/NEON.h"
13 #include "ocean/cv/FrameChannels.h"
14 
15 #include "ocean/base/DataType.h"
16 #include "ocean/base/Frame.h"
17 #include "ocean/base/Worker.h"
18 
19 namespace Ocean
20 {
21 
22 namespace CV
23 {
24 
25 /**
26  * This class implements a frame transposer.
27  * @ingroup cv
28  */
29 class OCEAN_CV_EXPORT FrameTransposer
30 {
31  public:
32 
33  /**
34  * The following comfort class provides comfortable functions simplifying prototyping applications but also increasing binary size of the resulting applications.
35  * Best practice is to avoid using these functions if binary size matters,<br>
36  * as for every comfort function a corresponding function exists with specialized functionality not increasing binary size significantly.<br>
37  */
38  class OCEAN_CV_EXPORT Comfort
39  {
40  public:
41 
42  /**
43  * Rotates a given frame either clockwise or counter-clockwise by 90 degrees.
44  * @param input The input frame which will be rotated, must be valid
45  * @param output The resulting rotated output frame, the frame type will be set automatically
46  * @param clockwise True, to rotate the frame clockwise; False, to rotate the frame counter-clockwise
47  * @param worker Optional worker object to distribute the computation
48  * @return True, if succeeded
49  */
50  static bool rotate90(const Frame& input, Frame& output, const bool clockwise, Worker* worker = nullptr);
51 
52  /**
53  * Rotates a given frame either clockwise or counter-clockwise by 90 degrees.
54  * @param frame The frame to rotate, must be valid
55  * @param clockwise True, to rotate the frame clockwise; False, to rotate the frame counter-clockwise
56  * @param worker Optional worker object to distribute the computation
57  * @return True, if succeeded
58  */
59  static inline bool rotate90(Frame& frame, const bool clockwise, Worker* worker = nullptr);
60 
61  /**
62  * Rotates a given frame by 180 degrees.
63  * @param input The input frame which will be rotated, must be valid
64  * @param output The resulting rotated output frame, the frame type will be set automatically
65  * @param worker Optional worker object to distribute the computation
66  * @return True, if succeeded
67  */
68  static bool rotate180(const Frame& input, Frame& output, Worker* worker = nullptr);
69 
70  /**
71  * Rotates a given frame by 180 degrees.
72  * @param frame The frame to rotate, must be valid
73  * @param worker Optional worker object to distribute the computation
74  * @return True, if succeeded
75  */
76  static inline bool rotate180(Frame& frame, Worker* worker = nullptr);
77 
78  /**
79  * Rotates a given frame with 90 degree steps.
80  * @param input The input frame which will be rotated, must be valid
81  * @param output The resulting rotated output frame, the frame type will be set automatically
82  * @param angle The clockwise rotation angle to be used, must be a multiple of +/- 90, with range (-infinity, infinity)
83  * @param worker Optional worker object to distribute the computation
84  * @return True, if succeeded
85  */
86  static bool rotate(const Frame& input, Frame& output, const int angle, Worker* worker = nullptr);
87 
88  /**
89  * Rotates a given frame with 90 degree steps.
90  * @param frame The frame to rotate, must be valid
91  * @param angle The clockwise rotation angle to be used, must be a multiple of +/- 90, with range (-infinity, infinity)
92  * @param worker Optional worker object to distribute the computation
93  * @return True, if succeeded
94  */
95  static inline bool rotate(Frame& frame, const int angle, Worker* worker = nullptr);
96  };
97 
98  protected:
99 
100  /**
101  * Definition of individual flip directions which can be applied to a transposed frame.
102  * Flipping the transposed result allows to rotate the image by 90 degree (clockwise and counter clockwise).
103  */
105  {
106  /// Applying no flip.
108  /// Applying a left-right flip like a mirror, combined with a transpose operation an image can be rotated clockwise.
110  /// Applying a top-bottom flip, combined with a transpose operation an image can be rotated counter clockwise.
111  FD_TOP_BOTTOM
112  };
113 
114  /**
115  * Helper class for functions transposing blocks.
116  * The class is necessary to allow a partially specialization of template parameters.
117  * @tparam T The data type of each elements, e.g., 'uint8_t', 'int8_t', 'float'
118  * @tparam tChannels The number of channels the given data has, with range [1, infinity)
119  */
120  template <typename T, unsigned int tChannels>
122  {
123  public:
124 
125  /**
126  * Transposes a block of 8x8 pixels.
127  * @param sourceBlock The pointer to the start location of the source block, must be valid
128  * @param targetBlock The pointer to the start location of the target block, must be valid
129  * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [8 * tChannels, infinity)
130  * @param targetStrideElements The number of elements between two successive rows, in elements, with range [8 * tChannels, infinity)
131  * @tparam tFlipDirection The flip direction to be applied after transposing the block
132  * @see transposeBlock().
133  */
134  template <FlipDirection tFlipDirection>
135  static OCEAN_FORCE_INLINE void transposeBlock8x8(const T* sourceBlock, T* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements);
136 
137  /**
138  * Transposes a block of n x m pixels.
139  * This function should be used for blocks smaller than 8x8.
140  * @param sourceBlock The pointer to the start location of the source block, must be valid
141  * @param targetBlock The pointer to the start location of the target block, must be valid
142  * @param blockWidth The width of the block to transpose, with range [1, 7]
143  * @param blockHeight The height of the block to transpose, with range [1, 7]
144  * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [8 * tChannels, infinity)
145  * @param targetStrideElements The number of elements between two successive rows, in elements, with range [8 * tChannels, infinity)
146  * @tparam tFlipDirection The flip direction to be applied after transposing the block
147  * @see transposeBlock8x8().
148  */
149  template <FlipDirection tFlipDirection>
150  static OCEAN_FORCE_INLINE void transposeBlock(const T* sourceBlock, T* targetBlock, const unsigned int blockWidth, const unsigned int blockHeight, const unsigned int sourceStrideElements, const unsigned int targetStrideElements);
151 
152 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
153 
154  /**
155  * Transposes a block of 4x4 pixels.
156  * @param sourceBlock The pointer to the start location of the source block, must be valid
157  * @param targetBlock The pointer to the start location of the target block, must be valid
158  * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [4 * tChannels, infinity)
159  * @param targetStrideElements The number of elements between two successive rows, in elements, with range [4 * tChannels, infinity)
160  * @tparam tFlipDirection The flip direction to be applied after transposing the block
161  * @see transposeBlock().
162  */
163  template <FlipDirection tFlipDirection>
164  static OCEAN_FORCE_INLINE void transposeBlock4x4NEON(const T* sourceBlock, T* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements);
165 
166 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
167  };
168 
169  public:
170 
171  /**
172  * Transposes a given frame.
173  * Beware: This function has a significantly bigger binary size impact than the corresponding template-based function.
174  * @param source The source frame to transpose, must be valid
175  * @param target The target frame receiving the transposed image, if the frame type of the target frame does not match the transposed source frame, the target frame will be adjusted accordingly, must not be 'source'
176  * @param worker Optional worker to distribute the computation
177  * @return True, if succeeded
178  */
179  static bool transpose(const Frame& source, Frame& target, Worker* worker = nullptr);
180 
181  /**
182  * Transposes a given frame.
183  * Beware: This function has a significantly bigger binary size impact than the corresponding template-based function.
184  * @param frame The frame to transpose, must be valid
185  * @param worker Optional worker to distribute the computation
186  * @return True, if succeeded
187  */
188  static inline bool transpose(Frame& frame, Worker* worker = nullptr);
189 
190  /**
191  * Transposes a given image buffer.
192  * @param source The source buffer to transpose, must be valid
193  * @param target The target buffer receiving the transposed image, must not be 'source', must be valid
194  * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
195  * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
196  * @param sourcePaddingElements The number of padding elements at the end of each row of the source frame, in elements, with range [0, infinity)
197  * @param targetPaddingElements The number of padding elements at the end of each row of the target frame, in elements, with range [0, infinity)
198  * @param worker Optional worker to distribute the computation
199  * @tparam T The data type of each channel
200  * @tparam tChannels The number of frame channels, with range [1, infinity)
201  */
202  template <typename T, unsigned int tChannels>
203  static void transpose(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
204 
205  /**
206  * Rotates a given image buffer 90 degrees clockwise or counter clockwise.
207  * @param source The source buffer to transpose, must be valid
208  * @param target The target buffer receiving the rotated image, must not be 'source', must be valid
209  * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
210  * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
211  * @param clockwise True, to rotate the source image clockwise; False, to rotate the image counter clockwise
212  * @param worker Optional worker to distribute the computation
213  * @param sourcePaddingElements The optional number of padding elements at the end of each row of the source frame, in elements, with range [0, infinity)
214  * @param targetPaddingElements The optional number of padding elements at the end of each row of the target frame, in elements, with range [0, infinity)
215  * @tparam T The data type of each channel
216  * @tparam tChannels The number of frame channels, with range [1, infinity)
217  */
218  template <typename T, unsigned int tChannels>
219  static void rotate90(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const bool clockwise, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
220 
221  /**
222  * Rotates a given image buffer 180 degrees.
223  * @param source The source buffer to transpose, must be valid
224  * @param target The target buffer receiving the rotated image, must not be 'source', must be valid
225  * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
226  * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
227  * @param worker Optional worker to distribute the computation
228  * @param sourcePaddingElements The optional number of padding elements at the end of each row of the source frame, in elements, with range [0, infinity)
229  * @param targetPaddingElements The optional number of padding elements at the end of each row of the target frame, in elements, with range [0, infinity)
230  * @tparam T The data type of each channel
231  * @tparam tChannels The number of frame channels, with range [1, infinity)
232  */
233  template <typename T, unsigned int tChannels>
234  static void rotate180(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
235 
236  /**
237  * Rotates a given image with 90 degree steps.
238  * @param source The source buffer to rotated, must be valid
239  * @param target The target buffer receiving the rotated image, must not be 'source', must be valid
240  * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
241  * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
242  * @param angle The clockwise rotation angle to be used, must be a multiple of +/- 90, with range (-infinity, infinity)
243  * @param worker Optional worker to distribute the computation
244  * @param sourcePaddingElements The optional number of padding elements at the end of each row of the source frame, in elements, with range [0, infinity)
245  * @param targetPaddingElements The optional number of padding elements at the end of each row of the target frame, in elements, with range [0, infinity)
246  * @return True, if succeeded
247  * @tparam T The data type of each channel
248  * @tparam tChannels The number of frame channels, with range [1, infinity)
249  */
250  template <typename T, unsigned int tChannels>
251  static bool rotate(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const int angle, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
252 
253  protected:
254 
255  /**
256  * Transposes the subset of a given image buffer.
257  * @param source The source buffer to transpose
258  * @param target The target buffer receiving the transposed image
259  * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
260  * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
261  * @param sourcePaddingElements The optional number of padding elements at the end of each row of the source frame, in elements, with range [0, infinity)
262  * @param targetPaddingElements The optional number of padding elements at the end of each row of the target frame, in elements, with range [0, infinity)
263  * @param firstSourceRow First source row to be handled
264  * @param numberSourceRows The number of source rows to be handled
265  * @tparam T The data type of each channel
266  * @tparam tChannels The number of frame channels, with range [1, infinity)
267  * @tparam tFlipDirection The flip direction to be applied after transposing
268  * @see transposeBlock8x8(), transposeBlock().
269  */
270  template <typename T, unsigned int tChannels, FlipDirection tFlipDirection>
271  static void transposeSubset(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstSourceRow, const unsigned int numberSourceRows);
272 
273  /**
274  * Rotates a subset of a given frame either clockwise or counter-clockwise by 90 degree.
275  * @param source The source frame which will be rotated, must be valid
276  * @param target The resulting rotated target frame, must be valid and must have the same buffer size as the source frame
277  * @param sourceWidth The width of the source frame in pixel, with range [1, infinity)
278  * @param sourceHeight The height of the source frame in pixel, with range [1, infinity)
279  * @param clockwise True, to rotate the frame clockwise; False, to rotate the frame counter-clockwise
280  * @param sourcePaddingElements Number of padding elements in the source frame, range: [0, infinity)
281  * @param targetPaddingElements Number of padding elements in the target frame, range: [0, infinity)
282  * @param firstTargetRow The first target row to be handled, with range [0, sourceWidth)
283  * @param numberTargetRows The number of target rows to be handled, with range [1, sourceWidth - firstTargetRow]
284  * @tparam TElementType Data type of the elements of the image pixels
285  * @tparam tChannels Number of data channels, with range [1, infinity)
286  */
287  template <typename TElementType, unsigned int tChannels>
288  static void rotate90Subset(const TElementType* source, TElementType* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const bool clockwise, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows);
289 };
290 
291 inline bool FrameTransposer::Comfort::rotate90(Frame& frame, const bool clockwise, Worker* worker)
292 {
293  ocean_assert(frame.isValid());
294 
295  Frame tmpFrame;
296  if (!rotate90(frame, tmpFrame, clockwise, worker))
297  {
298  return false;
299  }
300 
301  tmpFrame.setTimestamp(frame.timestamp());
302  tmpFrame.setRelativeTimestamp(frame.relativeTimestamp());
303 
304  frame = std::move(tmpFrame);
305 
306  return true;
307 }
308 
310 {
311  ocean_assert(frame.isValid());
312 
313  Frame tmpFrame;
314  if (!rotate180(frame, tmpFrame, worker))
315  {
316  return false;
317  }
318 
319  tmpFrame.setTimestamp(frame.timestamp());
320  tmpFrame.setRelativeTimestamp(frame.relativeTimestamp());
321 
322  frame = std::move(tmpFrame);
323 
324  return true;
325 }
326 
327 inline bool FrameTransposer::Comfort::rotate(Frame& frame, const int angle, Worker* worker)
328 {
329  ocean_assert(frame.isValid());
330 
331  if (angle == 0)
332  {
333  return frame.isValid();
334  }
335 
336  Frame tmpFrame;
337  if (!rotate(frame, tmpFrame, angle, worker))
338  {
339  return false;
340  }
341 
342  tmpFrame.setTimestamp(frame.timestamp());
343  tmpFrame.setRelativeTimestamp(frame.relativeTimestamp());
344 
345  frame = std::move(tmpFrame);
346 
347  return true;
348 }
349 
350 inline bool FrameTransposer::transpose(Frame& frame, Worker* worker)
351 {
352  ocean_assert(frame);
353 
354  Frame tmpFrame;
355 
356  if (!transpose(frame, tmpFrame, worker))
357  {
358  return false;
359  }
360 
361  tmpFrame.setTimestamp(frame.timestamp());
362  tmpFrame.setRelativeTimestamp(frame.relativeTimestamp());
363 
364  frame = std::move(tmpFrame);
365  return true;
366 }
367 
368 template <typename T, unsigned int tChannels>
369 void FrameTransposer::transpose(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
370 {
371  static_assert(tChannels != 0u, "Invalid channel number!");
372 
373  ocean_assert(source && target);
374  ocean_assert(source != target);
375  ocean_assert(sourceWidth != 0u && sourceHeight != 0u);
376 
377  const unsigned int xBlocks8 = (sourceWidth + 7u) / 8u;
378  const unsigned int yBlocks8 = (sourceHeight + 7u) / 8u;
379 
380  const unsigned int blocks8 = xBlocks8 * yBlocks8;
381 
382  typedef typename TypeMapper<T>::Type MappedType;
383 
384  if (worker && blocks8 >= 800u)
385  {
386  worker->executeFunction(Worker::Function::createStatic(&transposeSubset<MappedType, tChannels, FD_NONE>, (const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, blocks8);
387  }
388  else
389  {
390  transposeSubset<MappedType, tChannels, FD_NONE>((const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements, 0u, blocks8);
391  }
392 }
393 
394 template <typename T, unsigned int tChannels>
395 void FrameTransposer::rotate90(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const bool clockwise, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
396 {
397  static_assert(tChannels != 0u, "Invalid channel number!");
398 
399  ocean_assert(source && target);
400  ocean_assert(source != target);
401  ocean_assert(sourceWidth != 0u && sourceHeight != 0u);
402 
403  typedef typename TypeMapper<T>::Type MappedType;
404 
405 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION > 0
406 
407  // on x86 CPUs, the SIMD implementation is slower than the non-SIMD implementation
408  // therefore, using a function without explicit SIMD instructions
409 
410  if (worker)
411  {
412  worker->executeFunction(Worker::Function::createStatic(rotate90Subset<MappedType, tChannels>, (const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, clockwise, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, sourceWidth, 7u, 8u, 20u);
413  }
414  else
415  {
416  rotate90Subset<MappedType, tChannels>((const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, clockwise, sourcePaddingElements, targetPaddingElements, 0u, sourceWidth);
417  }
418 
419 #else
420 
421  // on non-x86 CPUs (e.g., ARM), the SIMD implementation is significantly faster
422 
423  const unsigned int xBlocks8 = (sourceWidth + 7u) / 8u;
424  const unsigned int yBlocks8 = (sourceHeight + 7u) / 8u;
425 
426  const unsigned int blocks8 = xBlocks8 * yBlocks8;
427 
428  if (worker && blocks8 >= 800u)
429  {
430  if (clockwise)
431  {
432  worker->executeFunction(Worker::Function::createStatic(&transposeSubset<MappedType, tChannels, FD_LEFT_RIGHT>, (const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, blocks8);
433  }
434  else
435  {
436  worker->executeFunction(Worker::Function::createStatic(&transposeSubset<MappedType, tChannels, FD_TOP_BOTTOM>, (const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, blocks8);
437  }
438  }
439  else
440  {
441  if (clockwise)
442  {
443  transposeSubset<MappedType, tChannels, FD_LEFT_RIGHT>((const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements, 0u, blocks8);
444  }
445  else
446  {
447  transposeSubset<MappedType, tChannels, FD_TOP_BOTTOM>((const MappedType*)(source), (MappedType*)(target), sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements, 0u, blocks8);
448  }
449  }
450 
451 #endif
452 }
453 
454 template <typename T, unsigned int tChannels>
455 void FrameTransposer::rotate180(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
456 {
457  static_assert(tChannels != 0u, "Invalid channel number!");
458 
459  ocean_assert(source != nullptr);
460  ocean_assert(target != nullptr);
461 
462  ocean_assert(sourceWidth != 0u && sourceHeight != 0u);
463 
464  FrameChannels::transformGeneric<T, tChannels>(source, target, sourceWidth, sourceHeight, CV::FrameConverter::CONVERT_FLIPPED_AND_MIRRORED, sourcePaddingElements, targetPaddingElements, worker);
465 }
466 
467 template <typename T, unsigned int tChannels>
468 bool FrameTransposer::rotate(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const int angle, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
469 {
470  static_assert(tChannels != 0u, "Invalid channel number!");
471 
472  ocean_assert(source != nullptr);
473  ocean_assert(target != nullptr);
474 
475  ocean_assert(sourceWidth != 0u && sourceHeight != 0u);
476 
477  if (angle % 90 != 0)
478  {
479  ocean_assert(false && "Angle must be multiple of +/- 90");
480  return false;
481  }
482 
483  int adjustedAngle = angle % 360;
484 
485  if (adjustedAngle < 0)
486  {
487  adjustedAngle = 360 + adjustedAngle;
488  }
489 
490  ocean_assert(adjustedAngle == 0 || adjustedAngle == 90 || adjustedAngle == 180 || adjustedAngle == 270);
491 
492  switch (adjustedAngle)
493  {
494  case 0:
495  CV::FrameChannels::subFrame<T>(source, target, sourceWidth, sourceHeight, sourceWidth, sourceHeight, tChannels, 0u, 0u, 0u, 0u, sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements);
496  return true;
497 
498  case 90:
499  rotate90<T, tChannels>(source, target, sourceWidth, sourceHeight, true /*clockwise*/, sourcePaddingElements, targetPaddingElements, worker);
500  return true;
501 
502  case 180:
503  rotate180<T, tChannels>(source, target, sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements, worker);
504  return true;
505 
506  case 270:
507  rotate90<T, tChannels>(source, target, sourceWidth, sourceHeight, false /*clockwise*/, sourcePaddingElements, targetPaddingElements, worker);
508  return true;
509 
510  default:
511  break;
512  }
513 
514  ocean_assert(false && "This should never happen!");
515  return false;
516 }
517 
518 template <typename T, unsigned int tChannels, FrameTransposer::FlipDirection tFlipDirection>
519 void FrameTransposer::transposeSubset(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstBlock8, const unsigned int numberBlocks8)
520 {
521  static_assert(sizeof(T) != 0, "Invalid data type!");
522  static_assert(tChannels != 0u, "Invalid channel number!");
523 
524  ocean_assert(source && target);
525  ocean_assert(sourceWidth != 0u && sourceHeight != 0u);
526 
527  const unsigned int sourceStrideElements = sourceWidth * tChannels + sourcePaddingElements;
528  const unsigned int targetStrideElements = sourceHeight * tChannels + targetPaddingElements;
529 
530  const unsigned int xBlocks8 = (sourceWidth + 7u) / 8u;
531  const unsigned int yBlocks8 = (sourceHeight + 7u) / 8u;
532  ocean_assert(firstBlock8 + numberBlocks8 <= xBlocks8 * yBlocks8);
533 
534  const unsigned int xSmallBlockIndex = xBlocks8 * 8u == sourceWidth ? (unsigned int)(-1) : (xBlocks8 - 1u);
535  const unsigned int ySmallBlockIndex = yBlocks8 * 8u == sourceHeight ? (unsigned int)(-1) : (yBlocks8 - 1u);
536 
537  for (unsigned int block8 = firstBlock8; block8 < firstBlock8 + numberBlocks8; ++block8)
538  {
539  const unsigned int yBlock8 = block8 / xBlocks8;
540  const unsigned int xBlock8 = block8 % xBlocks8;
541 
542  const T* sourceBlockTopLeft = nullptr;
543  T* targetBlockTopLeft = nullptr;
544 
545  switch (tFlipDirection)
546  {
547  case FD_NONE:
548  {
549  // simply transposing the block
550 
551  sourceBlockTopLeft = source + sourceStrideElements * yBlock8 * 8u + xBlock8 * 8u * tChannels;
552  targetBlockTopLeft = target + targetStrideElements * xBlock8 * 8u + yBlock8 * 8u * tChannels;
553 
554  break;
555  }
556 
557  case FD_LEFT_RIGHT:
558  {
559  // transposing the block and applying a left-right flip like a mirror, actually a 90 degree clockwise rotation
560 
561  const unsigned int xTarget = (unsigned int)(std::max(0, int(sourceHeight) - int((yBlock8 + 1u) * 8u)));
562 
563  sourceBlockTopLeft = source + sourceStrideElements * yBlock8 * 8u + xBlock8 * 8u * tChannels;
564  targetBlockTopLeft = target + targetStrideElements * xBlock8 * 8u + xTarget * tChannels;
565 
566  break;
567  }
568 
569  case FD_TOP_BOTTOM:
570  {
571  // transposing the block and applying a top-bottom flip, actually a 90 degree counter clockwise rotation
572 
573  const unsigned int yTarget = (unsigned int)(std::max(0, int(sourceWidth) - int((xBlock8 + 1u) * 8u)));
574 
575  sourceBlockTopLeft = source + sourceStrideElements * yBlock8 * 8u + xBlock8 * 8u * tChannels;
576  targetBlockTopLeft = target + targetStrideElements * yTarget + yBlock8 * 8u * tChannels;
577 
578  break;
579  }
580 
581  default:
582  ocean_assert(false && "Invalid flip direction!");
583  }
584 
585  ocean_assert(sourceBlockTopLeft != nullptr);
586  ocean_assert(targetBlockTopLeft != nullptr);
587 
588  if (xBlock8 != xSmallBlockIndex && yBlock8 != ySmallBlockIndex)
589  {
590  BlockTransposer<T, tChannels>::template transposeBlock8x8<tFlipDirection>(sourceBlockTopLeft, targetBlockTopLeft, sourceStrideElements, targetStrideElements);
591  }
592  else
593  {
594  const unsigned int blockWidth = min(sourceWidth - xBlock8 * 8u, 8u);
595  const unsigned int blockHeight = min(sourceHeight - yBlock8 * 8u, 8u);
596 
597  BlockTransposer<T, tChannels>::template transposeBlock<tFlipDirection>(sourceBlockTopLeft, targetBlockTopLeft, blockWidth, blockHeight, sourceStrideElements, targetStrideElements);
598  }
599  }
600 }
601 
602 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
603 
604 template <>
605 template <FrameTransposer::FlipDirection tFlipDirection>
606 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<uint8_t, 1u>::transposeBlock8x8(const uint8_t* sourceBlock, uint8_t* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
607 {
608  ocean_assert(sourceBlock && targetBlock);
609  ocean_assert(sourceStrideElements >= 8u && targetStrideElements >= 8u);
610 
611  // A B C D E F G H
612  // a b c d e f g h
613  // 0 1 2 3 4 5 6 7
614  // ! @ # $ % ^ & *
615  // ...
616 
617  __m128 line02_f_32x4 = _mm_setzero_ps(); // A B C D E F G H 0 1 2 3 4 5 6 7
618  __m128 line13_f_32x4 = _mm_setzero_ps(); // a b c d e f g h ! @ # $ % ^ & *
619 
620  line02_f_32x4 = _mm_loadl_pi(line02_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 0u));
621  line13_f_32x4 = _mm_loadl_pi(line13_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 1u));
622  line02_f_32x4 = _mm_loadh_pi(line02_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 2u));
623  line13_f_32x4 = _mm_loadh_pi(line13_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 3u));
624 
625  const __m128i line01_u_8x16 = _mm_unpacklo_epi8(_mm_castps_si128(line02_f_32x4), _mm_castps_si128(line13_f_32x4)); // A a B b C c D d E e F f G g H h
626  const __m128i line23_u_8x16 = _mm_unpackhi_epi8(_mm_castps_si128(line02_f_32x4), _mm_castps_si128(line13_f_32x4)); // 0 ! 1 @ 2 # 3 $ 4 % 5 ^ 6 & 7 *
627 
628  const __m128i intermediateA_03_u_8x16 = _mm_unpacklo_epi16(line01_u_8x16, line23_u_8x16); // A a 0 ! B b 1 @ C c 2 # D d 3 $
629  const __m128i intermediateB_03_u_8x16 = _mm_unpackhi_epi16(line01_u_8x16, line23_u_8x16); // E e 4 % F f 5 ^ G g 6 & H h 7 *
630 
631  __m128 line46_f_32x4 = _mm_setzero_ps();
632  __m128 line57_f_32x4 = _mm_setzero_ps();
633  line46_f_32x4 = _mm_loadl_pi(line46_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 4u));
634  line57_f_32x4 = _mm_loadl_pi(line57_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 5u));
635  line46_f_32x4 = _mm_loadh_pi(line46_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 6u));
636  line57_f_32x4 = _mm_loadh_pi(line57_f_32x4, (const __m64*)(sourceBlock + sourceStrideElements * 7u));
637 
638  const __m128i line45_u_8x16 = _mm_unpacklo_epi8(_mm_castps_si128(line46_f_32x4), _mm_castps_si128(line57_f_32x4));
639  const __m128i line67_u_8x16 = _mm_unpackhi_epi8(_mm_castps_si128(line46_f_32x4), _mm_castps_si128(line57_f_32x4));
640 
641  const __m128i intermediateA_47_u_8x16 = _mm_unpacklo_epi16(line45_u_8x16, line67_u_8x16);
642  const __m128i intermediateB_47_u_8x16 = _mm_unpackhi_epi16(line45_u_8x16, line67_u_8x16);
643 
644  __m128i transposed01 = _mm_unpacklo_epi32(intermediateA_03_u_8x16, intermediateA_47_u_8x16);
645  __m128i transposed23 = _mm_unpackhi_epi32(intermediateA_03_u_8x16, intermediateA_47_u_8x16);
646  __m128i transposed45 = _mm_unpacklo_epi32(intermediateB_03_u_8x16, intermediateB_47_u_8x16);
647  __m128i transposed67 = _mm_unpackhi_epi32(intermediateB_03_u_8x16, intermediateB_47_u_8x16);
648 
649  switch (tFlipDirection)
650  {
651  case FD_LEFT_RIGHT:
652  {
653  const __m128i reverseSuffleMask_u_16x8 = _mm_set_epi64x(0x08090A0B0C0D0E0Fll, 0x0001020304050607ll);
654 
655  transposed01 = _mm_shuffle_epi8(transposed01, reverseSuffleMask_u_16x8);
656  transposed23 = _mm_shuffle_epi8(transposed23, reverseSuffleMask_u_16x8);
657  transposed45 = _mm_shuffle_epi8(transposed45, reverseSuffleMask_u_16x8);
658  transposed67 = _mm_shuffle_epi8(transposed67, reverseSuffleMask_u_16x8);
659 
660  // no break, as we use the store function from FD_NONE
661  [[fallthrough]];
662  }
663 
664  case FD_NONE:
665  {
666  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 0u), _mm_castsi128_ps(transposed01));
667  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 1u), _mm_castsi128_ps(transposed01));
668  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 2u), _mm_castsi128_ps(transposed23));
669  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 3u), _mm_castsi128_ps(transposed23));
670  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 4u), _mm_castsi128_ps(transposed45));
671  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 5u), _mm_castsi128_ps(transposed45));
672  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 6u), _mm_castsi128_ps(transposed67));
673  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 7u), _mm_castsi128_ps(transposed67));
674 
675  break;
676  }
677 
678  case FD_TOP_BOTTOM:
679  {
680  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 0u), _mm_castsi128_ps(transposed67));
681  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 1u), _mm_castsi128_ps(transposed67));
682  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 2u), _mm_castsi128_ps(transposed45));
683  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 3u), _mm_castsi128_ps(transposed45));
684  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 4u), _mm_castsi128_ps(transposed23));
685  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 5u), _mm_castsi128_ps(transposed23));
686  _mm_storeh_pi((__m64*)(targetBlock + targetStrideElements * 6u), _mm_castsi128_ps(transposed01));
687  _mm_storel_pi((__m64*)(targetBlock + targetStrideElements * 7u), _mm_castsi128_ps(transposed01));
688 
689  break;
690  }
691 
692  default:
693  ocean_assert(false && "Invalid flip direction!");
694  }
695 }
696 
697 template <>
698 template <FrameTransposer::FlipDirection tFlipDirection>
699 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<uint8_t, 2u>::transposeBlock8x8(const uint8_t* sourceBlock, uint8_t* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
700 {
701  ocean_assert(sourceBlock && targetBlock);
702  ocean_assert(sourceStrideElements >= 8u && targetStrideElements >= 8u);
703 
704  // AA BB CC DD EE FF GG HH
705  // aa bb cc dd ee ff gg hh
706  // 00 11 22 33 44 55 66 77
707  // !! @@ ## $$ %% ^^ && **
708  // ...
709 
710  const __m128i line0_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 0u)); // AA BB CC DD EE FF GG HH
711  const __m128i line1_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 1u)); // aa bb cc dd ee ff gg hh
712  const __m128i line2_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 2u));
713  const __m128i line3_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 3u));
714  const __m128i line4_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 4u));
715  const __m128i line5_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 5u));
716  const __m128i line6_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 6u));
717  const __m128i line7_u_8x16 = _mm_loadu_si128((const __m128i*)(sourceBlock + sourceStrideElements * 7u));
718 
719  const __m128i line01_A_u_8x16 = _mm_unpacklo_epi16(line0_u_8x16, line1_u_8x16); // AA aa BB bb CC cc DD dd
720  const __m128i line01_B_u_8x16 = _mm_unpackhi_epi16(line0_u_8x16, line1_u_8x16); // EE ee FF ff GG gg HH hh
721  const __m128i line23_A_u_8x16 = _mm_unpacklo_epi16(line2_u_8x16, line3_u_8x16); // 00 !! 11 @@ ...
722  const __m128i line23_B_u_8x16 = _mm_unpackhi_epi16(line2_u_8x16, line3_u_8x16); // 44 %% 55 ^^ ...
723  const __m128i line45_A_u_8x16 = _mm_unpacklo_epi16(line4_u_8x16, line5_u_8x16);
724  const __m128i line45_B_u_8x16 = _mm_unpackhi_epi16(line4_u_8x16, line5_u_8x16);
725  const __m128i line67_A_u_8x16 = _mm_unpacklo_epi16(line6_u_8x16, line7_u_8x16);
726  const __m128i line67_B_u_8x16 = _mm_unpackhi_epi16(line6_u_8x16, line7_u_8x16);
727 
728  const __m128i intermediateAA_03_u_8x16 = _mm_unpacklo_epi32(line01_A_u_8x16, line23_A_u_8x16); // AA aa 00 !! BB bb 11 @@
729  const __m128i intermediateAB_03_u_8x16 = _mm_unpackhi_epi32(line01_A_u_8x16, line23_A_u_8x16); // CC cc 22 ## DD dd 33 $$
730  const __m128i intermediateBA_03_u_8x16 = _mm_unpacklo_epi32(line01_B_u_8x16, line23_B_u_8x16);
731  const __m128i intermediateBB_03_u_8x16 = _mm_unpackhi_epi32(line01_B_u_8x16, line23_B_u_8x16);
732  const __m128i intermediateAA_47_u_8x16 = _mm_unpacklo_epi32(line45_A_u_8x16, line67_A_u_8x16);
733  const __m128i intermediateAB_47_u_8x16 = _mm_unpackhi_epi32(line45_A_u_8x16, line67_A_u_8x16);
734  const __m128i intermediateBA_47_u_8x16 = _mm_unpacklo_epi32(line45_B_u_8x16, line67_B_u_8x16);
735  const __m128i intermediateBB_47_u_8x16 = _mm_unpackhi_epi32(line45_B_u_8x16, line67_B_u_8x16);
736 
737  __m128i transposed0 = _mm_unpacklo_epi64(intermediateAA_03_u_8x16, intermediateAA_47_u_8x16);
738  __m128i transposed1 = _mm_unpackhi_epi64(intermediateAA_03_u_8x16, intermediateAA_47_u_8x16);
739  __m128i transposed2 = _mm_unpacklo_epi64(intermediateAB_03_u_8x16, intermediateAB_47_u_8x16);
740  __m128i transposed3 = _mm_unpackhi_epi64(intermediateAB_03_u_8x16, intermediateAB_47_u_8x16);
741  __m128i transposed4 = _mm_unpacklo_epi64(intermediateBA_03_u_8x16, intermediateBA_47_u_8x16);
742  __m128i transposed5 = _mm_unpackhi_epi64(intermediateBA_03_u_8x16, intermediateBA_47_u_8x16);
743  __m128i transposed6 = _mm_unpacklo_epi64(intermediateBB_03_u_8x16, intermediateBB_47_u_8x16);
744  __m128i transposed7 = _mm_unpackhi_epi64(intermediateBB_03_u_8x16, intermediateBB_47_u_8x16);
745 
746  switch (tFlipDirection)
747  {
748  case FD_LEFT_RIGHT:
749  {
750  const __m128i reverseSuffleMask_u_16x8 = _mm_set_epi64x(0x0100030205040706ll, 0x09080B0A0D0C0F0Ell);
751 
752  transposed0 = _mm_shuffle_epi8(transposed0, reverseSuffleMask_u_16x8);
753  transposed1 = _mm_shuffle_epi8(transposed1, reverseSuffleMask_u_16x8);
754  transposed2 = _mm_shuffle_epi8(transposed2, reverseSuffleMask_u_16x8);
755  transposed3 = _mm_shuffle_epi8(transposed3, reverseSuffleMask_u_16x8);
756  transposed4 = _mm_shuffle_epi8(transposed4, reverseSuffleMask_u_16x8);
757  transposed5 = _mm_shuffle_epi8(transposed5, reverseSuffleMask_u_16x8);
758  transposed6 = _mm_shuffle_epi8(transposed6, reverseSuffleMask_u_16x8);
759  transposed7 = _mm_shuffle_epi8(transposed7, reverseSuffleMask_u_16x8);
760 
761  // no break, as we use the store function from FD_NONE
762  [[fallthrough]];
763  }
764 
765  case FD_NONE:
766  {
767  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 0u), transposed0);
768  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 1u), transposed1);
769  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 2u), transposed2);
770  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 3u), transposed3);
771  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 4u), transposed4);
772  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 5u), transposed5);
773  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 6u), transposed6);
774  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 7u), transposed7);
775 
776  break;
777  }
778 
779  case FD_TOP_BOTTOM:
780  {
781  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 0u), transposed7);
782  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 1u), transposed6);
783  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 2u), transposed5);
784  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 3u), transposed4);
785  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 4u), transposed3);
786  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 5u), transposed2);
787  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 6u), transposed1);
788  _mm_storeu_si128((__m128i*)(targetBlock + targetStrideElements * 7u), transposed0);
789 
790  break;
791  }
792 
793  default:
794  ocean_assert(false && "Invalid flip direction!");
795  }
796 }
797 
798 #endif // defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
799 
800 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
801 
802 template <>
803 template <FrameTransposer::FlipDirection tFlipDirection>
804 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<uint8_t, 4u>::transposeBlock4x4NEON(const uint8_t* sourceBlock, uint8_t* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
805 {
806  ocean_assert(sourceBlock && targetBlock);
807  ocean_assert(sourceStrideElements >= 4u * 3u && targetStrideElements >= 4u * 3u);
808 
809  // the NEON code is straight forward simply using the VTRN (transpose) instruction
810 
811  const uint32x4_t line0_u_32x4 = vreinterpretq_u32_u8(vld1q_u8(sourceBlock + sourceStrideElements * 0u));
812  const uint32x4_t line1_u_32x4 = vreinterpretq_u32_u8(vld1q_u8(sourceBlock + sourceStrideElements * 1u));
813 
814  // A B C D A a C c
815  // a b c d -> B b D d
816  const uint32x4x2_t line01_u_32x4x2 = vtrnq_u32(line0_u_32x4, line1_u_32x4);
817 
818  const uint32x4_t line2_u_32x4 = vreinterpretq_u32_u8(vld1q_u8(sourceBlock + sourceStrideElements * 2u));
819  const uint32x4_t line3_u_32x4 = vreinterpretq_u32_u8(vld1q_u8(sourceBlock + sourceStrideElements * 3u));
820 
821  // 0 1 2 3 0 ! 2 #
822  // ! @ # $ -> 1 @ 3 $
823  const uint32x4x2_t line23_u_32x4x2 = vtrnq_u32(line2_u_32x4, line3_u_32x4);
824 
825  // Aa Cc Aa 0!
826  // Bb Dd -> Bb 1@
827  // 0! 2# Cc 2#
828  // 1@ 3$ Dd 3$
829  const uint32x4_t result0_u_32x4 = vcombine_u32(vget_low_u32(line01_u_32x4x2.val[0]), vget_low_u32(line23_u_32x4x2.val[0]));
830  const uint32x4_t result1_u_32x4 = vcombine_u32(vget_low_u32(line01_u_32x4x2.val[1]), vget_low_u32(line23_u_32x4x2.val[1]));
831  const uint32x4_t result2_u_32x4 = vcombine_u32(vget_high_u32(line01_u_32x4x2.val[0]), vget_high_u32(line23_u_32x4x2.val[0]));
832  const uint32x4_t result3_u_32x4 = vcombine_u32(vget_high_u32(line01_u_32x4x2.val[1]), vget_high_u32(line23_u_32x4x2.val[1]));
833 
834  switch (tFlipDirection)
835  {
836  case FD_NONE:
837  {
838  vst1q_u8(targetBlock + targetStrideElements * 0u, vreinterpretq_u8_u32(result0_u_32x4));
839  vst1q_u8(targetBlock + targetStrideElements * 1u, vreinterpretq_u8_u32(result1_u_32x4));
840  vst1q_u8(targetBlock + targetStrideElements * 2u, vreinterpretq_u8_u32(result2_u_32x4));
841  vst1q_u8(targetBlock + targetStrideElements * 3u, vreinterpretq_u8_u32(result3_u_32x4));
842 
843  break;
844  }
845 
846  case FD_LEFT_RIGHT:
847  {
848  const uint32x4_t halfReverseResult0_u_32x4 = vrev64q_u32(result0_u_32x4);
849  const uint8x16_t reverseResult0_u_32x4 = vreinterpretq_u8_u32(vcombine_u32(vget_high_u32(halfReverseResult0_u_32x4), vget_low_u32(halfReverseResult0_u_32x4)));
850  vst1q_u8(targetBlock + targetStrideElements * 0u, reverseResult0_u_32x4);
851 
852  const uint32x4_t halfReverseResult1_u_32x4 = vrev64q_u32(result1_u_32x4);
853  const uint8x16_t reverseResult1_u_32x4 = vreinterpretq_u8_u32(vcombine_u32(vget_high_u32(halfReverseResult1_u_32x4), vget_low_u32(halfReverseResult1_u_32x4)));
854  vst1q_u8(targetBlock + targetStrideElements * 1u, reverseResult1_u_32x4);
855 
856  const uint32x4_t halfReverseResult2_u_32x4 = vrev64q_u32(result2_u_32x4);
857  const uint8x16_t reverseResult2_u_32x4 = vreinterpretq_u8_u32(vcombine_u32(vget_high_u32(halfReverseResult2_u_32x4), vget_low_u32(halfReverseResult2_u_32x4)));
858  vst1q_u8(targetBlock + targetStrideElements * 2u, reverseResult2_u_32x4);
859 
860  const uint32x4_t halfReverseResult3_u_32x4 = vrev64q_u32(result3_u_32x4);
861  const uint8x16_t reverseResult3_u_32x4 = vreinterpretq_u8_u32(vcombine_u32(vget_high_u32(halfReverseResult3_u_32x4), vget_low_u32(halfReverseResult3_u_32x4)));
862  vst1q_u8(targetBlock + targetStrideElements * 3u, reverseResult3_u_32x4);
863 
864  break;
865  }
866 
867  case FD_TOP_BOTTOM:
868  {
869  vst1q_u8(targetBlock + targetStrideElements * 0u, vreinterpretq_u8_u32(result3_u_32x4));
870  vst1q_u8(targetBlock + targetStrideElements * 1u, vreinterpretq_u8_u32(result2_u_32x4));
871  vst1q_u8(targetBlock + targetStrideElements * 2u, vreinterpretq_u8_u32(result1_u_32x4));
872  vst1q_u8(targetBlock + targetStrideElements * 3u, vreinterpretq_u8_u32(result0_u_32x4));
873 
874  break;
875  }
876 
877  default:
878  ocean_assert(false && "Invalid flip direction!");
879  }
880 }
881 
882 template <>
883 template <FrameTransposer::FlipDirection tFlipDirection>
884 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<uint8_t, 1u>::transposeBlock8x8(const uint8_t* sourceBlock, uint8_t* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
885 {
886  ocean_assert(sourceBlock && targetBlock);
887  ocean_assert(sourceStrideElements >= 8u && targetStrideElements >= 8u);
888 
889  // the NEON code is straight forward simply using the VTRN (transpose) instruction
890 
891  const uint8x8_t line0_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 0u);
892  const uint8x8_t line1_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 1u);
893 
894  // A B C D E F G H A a C c E e G g
895  // a b c d e f g h -> B b D d F f H h
896  const uint8x8x2_t line01_u_8x8x2 = vtrn_u8(line0_u_8x8, line1_u_8x8);
897 
898  const uint8x8_t line2_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 2u);
899  const uint8x8_t line3_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 3u);
900 
901  // 0 1 2 3 4 5 6 7 0 ! 2 # 4 % 6 &
902  // ! @ # $ % ^ & * -> 1 @ 3 $ 5 ^ 7 *
903  const uint8x8x2_t line23_u_8x8x2 = vtrn_u8(line2_u_8x8, line3_u_8x8);
904 
905  // Aa Cc Ee Gg Aa 0! Ee 4%
906  // 0! 2# 4% 6& -> Cc 2# Gg 6&
907  const uint16x4x2_t line02_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_u_8x8x2.val[0]), vreinterpret_u16_u8(line23_u_8x8x2.val[0]));
908 
909  // Bb Dd Ff Hh Bb 1@ Ef 5^
910  // 1@ 3$ 5^ 7* -> Dd 3$ Hh 7*
911  const uint16x4x2_t line13_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_u_8x8x2.val[1]), vreinterpret_u16_u8(line23_u_8x8x2.val[1]));
912 
913  const uint8x8_t line4_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 4u);
914  const uint8x8_t line5_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 5u);
915 
916  const uint8x8x2_t line45_u_8x8x2 = vtrn_u8(line4_u_8x8, line5_u_8x8);
917 
918  const uint8x8_t line6_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 6u);
919  const uint8x8_t line7_u_8x8 = vld1_u8(sourceBlock + sourceStrideElements * 7u);
920 
921  const uint8x8x2_t line67_u_8x8x2 = vtrn_u8(line6_u_8x8, line7_u_8x8);
922 
923  const uint16x4x2_t line46_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_u_8x8x2.val[0]), vreinterpret_u16_u8(line67_u_8x8x2.val[0]));
924  const uint16x4x2_t line57_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_u_8x8x2.val[1]), vreinterpret_u16_u8(line67_u_8x8x2.val[1]));
925 
926  const uint32x2x2_t line04_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_u_16x4x2.val[0]), vreinterpret_u32_u16(line46_u_16x4x2.val[0]));
927  const uint32x2x2_t line26_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_u_16x4x2.val[1]), vreinterpret_u32_u16(line46_u_16x4x2.val[1]));
928 
929  const uint32x2x2_t line15_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_u_16x4x2.val[0]), vreinterpret_u32_u16(line57_u_16x4x2.val[0]));
930  const uint32x2x2_t line37_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_u_16x4x2.val[1]), vreinterpret_u32_u16(line57_u_16x4x2.val[1]));
931 
932  switch (tFlipDirection)
933  {
934  case FD_NONE:
935  {
936  vst1_u8(targetBlock + targetStrideElements * 0u, vreinterpret_u8_u32(line04_u_32x2x2.val[0]));
937  vst1_u8(targetBlock + targetStrideElements * 1u, vreinterpret_u8_u32(line15_u_32x2x2.val[0]));
938  vst1_u8(targetBlock + targetStrideElements * 2u, vreinterpret_u8_u32(line26_u_32x2x2.val[0]));
939  vst1_u8(targetBlock + targetStrideElements * 3u, vreinterpret_u8_u32(line37_u_32x2x2.val[0]));
940  vst1_u8(targetBlock + targetStrideElements * 4u, vreinterpret_u8_u32(line04_u_32x2x2.val[1]));
941  vst1_u8(targetBlock + targetStrideElements * 5u, vreinterpret_u8_u32(line15_u_32x2x2.val[1]));
942  vst1_u8(targetBlock + targetStrideElements * 6u, vreinterpret_u8_u32(line26_u_32x2x2.val[1]));
943  vst1_u8(targetBlock + targetStrideElements * 7u, vreinterpret_u8_u32(line37_u_32x2x2.val[1]));
944 
945  break;
946  }
947 
948  case FD_LEFT_RIGHT:
949  {
950  vst1_u8(targetBlock + targetStrideElements * 0u, vrev64_u8(vreinterpret_u8_u32(line04_u_32x2x2.val[0])));
951  vst1_u8(targetBlock + targetStrideElements * 1u, vrev64_u8(vreinterpret_u8_u32(line15_u_32x2x2.val[0])));
952  vst1_u8(targetBlock + targetStrideElements * 2u, vrev64_u8(vreinterpret_u8_u32(line26_u_32x2x2.val[0])));
953  vst1_u8(targetBlock + targetStrideElements * 3u, vrev64_u8(vreinterpret_u8_u32(line37_u_32x2x2.val[0])));
954  vst1_u8(targetBlock + targetStrideElements * 4u, vrev64_u8(vreinterpret_u8_u32(line04_u_32x2x2.val[1])));
955  vst1_u8(targetBlock + targetStrideElements * 5u, vrev64_u8(vreinterpret_u8_u32(line15_u_32x2x2.val[1])));
956  vst1_u8(targetBlock + targetStrideElements * 6u, vrev64_u8(vreinterpret_u8_u32(line26_u_32x2x2.val[1])));
957  vst1_u8(targetBlock + targetStrideElements * 7u, vrev64_u8(vreinterpret_u8_u32(line37_u_32x2x2.val[1])));
958 
959  break;
960  }
961 
962  case FD_TOP_BOTTOM:
963  {
964  vst1_u8(targetBlock + targetStrideElements * 0u, vreinterpret_u8_u32(line37_u_32x2x2.val[1]));
965  vst1_u8(targetBlock + targetStrideElements * 1u, vreinterpret_u8_u32(line26_u_32x2x2.val[1]));
966  vst1_u8(targetBlock + targetStrideElements * 2u, vreinterpret_u8_u32(line15_u_32x2x2.val[1]));
967  vst1_u8(targetBlock + targetStrideElements * 3u, vreinterpret_u8_u32(line04_u_32x2x2.val[1]));
968  vst1_u8(targetBlock + targetStrideElements * 4u, vreinterpret_u8_u32(line37_u_32x2x2.val[0]));
969  vst1_u8(targetBlock + targetStrideElements * 5u, vreinterpret_u8_u32(line26_u_32x2x2.val[0]));
970  vst1_u8(targetBlock + targetStrideElements * 6u, vreinterpret_u8_u32(line15_u_32x2x2.val[0]));
971  vst1_u8(targetBlock + targetStrideElements * 7u, vreinterpret_u8_u32(line04_u_32x2x2.val[0]));
972 
973  break;
974  }
975 
976  default:
977  ocean_assert(false && "Invalid flip direction!");
978  }
979 }
980 
981 template <>
982 template <FrameTransposer::FlipDirection tFlipDirection>
983 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<uint8_t, 2u>::transposeBlock8x8(const uint8_t* sourceBlock, uint8_t* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
984 {
985  ocean_assert(sourceBlock && targetBlock);
986  ocean_assert(sourceStrideElements >= 8u * 2u && targetStrideElements >= 8u * 2u);
987 
988  // the NEON code is straight forward simply using the VTRN (transpose) instruction
989  // the 2-channel code is similar to the 1-channel code but simply transposes 16 bit values instead of 8 bit values
990 
991  const uint16x8_t line0_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 0u));
992  const uint16x8_t line1_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 1u));
993 
994  // A B C D E F G H A a C c E e G g
995  // a b c d e f g h -> B b D d F f H h
996  const uint16x8x2_t line01_u_16x8x2 = vtrnq_u16(line0_u_16x8, line1_u_16x8);
997 
998  const uint16x8_t line2_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 2u));
999  const uint16x8_t line3_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 3u));
1000 
1001  // 0 1 2 3 4 5 6 7 0 ! 2 # 4 % 6 &
1002  // ! @ # $ % ^ & * -> 1 @ 3 $ 5 ^ 7 *
1003  const uint16x8x2_t line23_u_16x8x2 = vtrnq_u16(line2_u_16x8, line3_u_16x8);
1004 
1005  // Aa Cc Ee Gg Aa 0! Ee 4%
1006  // 0! 2# 4% 6& -> Cc 2# Gg 6&
1007  const uint32x4x2_t line02_u_32x4x2 = vtrnq_u32(vreinterpretq_u32_u16(line01_u_16x8x2.val[0]), vreinterpretq_u32_u16(line23_u_16x8x2.val[0]));
1008 
1009  // Bb Dd Ff Hh Bb 1@ Ef 5^
1010  // 1@ 3$ 5^ 7* -> Dd 3$ Hh 7*
1011  const uint32x4x2_t line13_u_32x4x2 = vtrnq_u32(vreinterpretq_u32_u16(line01_u_16x8x2.val[1]), vreinterpretq_u32_u16(line23_u_16x8x2.val[1]));
1012 
1013  const uint16x8_t line4_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 4u));
1014  const uint16x8_t line5_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 5u));
1015 
1016  const uint16x8x2_t line45_u_16x8x2 = vtrnq_u16(line4_u_16x8, line5_u_16x8);
1017 
1018  const uint16x8_t line6_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 6u));
1019  const uint16x8_t line7_u_16x8 = vreinterpretq_u16_u8(vld1q_u8(sourceBlock + sourceStrideElements * 7u));
1020 
1021  const uint16x8x2_t line67_u_16x8x2 = vtrnq_u16(line6_u_16x8, line7_u_16x8);
1022 
1023  const uint32x4x2_t line46_u_32x4x2 = vtrnq_u32(vreinterpretq_u32_u16(line45_u_16x8x2.val[0]), vreinterpretq_u32_u16(line67_u_16x8x2.val[0]));
1024  const uint32x4x2_t line57_u_32x4x2 = vtrnq_u32(vreinterpretq_u32_u16(line45_u_16x8x2.val[1]), vreinterpretq_u32_u16(line67_u_16x8x2.val[1]));
1025 
1026  const uint32x4_t result0_u_32x4 = vcombine_u32(vget_low_u32(line02_u_32x4x2.val[0]), vget_low_u32(line46_u_32x4x2.val[0]));
1027  const uint32x4_t result1_u_32x4 = vcombine_u32(vget_low_u32(line13_u_32x4x2.val[0]), vget_low_u32(line57_u_32x4x2.val[0]));
1028 
1029  const uint32x4_t result2_u_32x4 = vcombine_u32(vget_low_u32(line02_u_32x4x2.val[1]), vget_low_u32(line46_u_32x4x2.val[1]));
1030  const uint32x4_t result3_u_32x4 = vcombine_u32(vget_low_u32(line13_u_32x4x2.val[1]), vget_low_u32(line57_u_32x4x2.val[1]));
1031 
1032  const uint32x4_t result4_u_32x4 = vcombine_u32(vget_high_u32(line02_u_32x4x2.val[0]), vget_high_u32(line46_u_32x4x2.val[0]));
1033  const uint32x4_t result5_u_32x4 = vcombine_u32(vget_high_u32(line13_u_32x4x2.val[0]), vget_high_u32(line57_u_32x4x2.val[0]));
1034 
1035  const uint32x4_t result6_u_32x4 = vcombine_u32(vget_high_u32(line02_u_32x4x2.val[1]), vget_high_u32(line46_u_32x4x2.val[1]));
1036  const uint32x4_t result7_u_32x4 = vcombine_u32(vget_high_u32(line13_u_32x4x2.val[1]), vget_high_u32(line57_u_32x4x2.val[1]));
1037 
1038  switch (tFlipDirection)
1039  {
1040  case FD_NONE:
1041  {
1042  vst1q_u8(targetBlock + targetStrideElements * 0u, vreinterpretq_u8_u32(result0_u_32x4));
1043  vst1q_u8(targetBlock + targetStrideElements * 1u, vreinterpretq_u8_u32(result1_u_32x4));
1044  vst1q_u8(targetBlock + targetStrideElements * 2u, vreinterpretq_u8_u32(result2_u_32x4));
1045  vst1q_u8(targetBlock + targetStrideElements * 3u, vreinterpretq_u8_u32(result3_u_32x4));
1046  vst1q_u8(targetBlock + targetStrideElements * 4u, vreinterpretq_u8_u32(result4_u_32x4));
1047  vst1q_u8(targetBlock + targetStrideElements * 5u, vreinterpretq_u8_u32(result5_u_32x4));
1048  vst1q_u8(targetBlock + targetStrideElements * 6u, vreinterpretq_u8_u32(result6_u_32x4));
1049  vst1q_u8(targetBlock + targetStrideElements * 7u, vreinterpretq_u8_u32(result7_u_32x4));
1050 
1051  break;
1052  }
1053 
1054  case FD_LEFT_RIGHT:
1055  {
1056  const uint8x16_t targetHalfReverse0_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result0_u_32x4)));
1057  vst1q_u8(targetBlock + targetStrideElements * 0u, vcombine_u8(vget_high_u8(targetHalfReverse0_u_8x16), vget_low_u8(targetHalfReverse0_u_8x16)));
1058 
1059  const uint8x16_t targetHalfReverse1_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result1_u_32x4)));
1060  vst1q_u8(targetBlock + targetStrideElements * 1u, vcombine_u8(vget_high_u8(targetHalfReverse1_u_8x16), vget_low_u8(targetHalfReverse1_u_8x16)));
1061 
1062  const uint8x16_t targetHalfReverse2_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result2_u_32x4)));
1063  vst1q_u8(targetBlock + targetStrideElements * 2u, vcombine_u8(vget_high_u8(targetHalfReverse2_u_8x16), vget_low_u8(targetHalfReverse2_u_8x16)));
1064 
1065  const uint8x16_t targetHalfReverse3_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result3_u_32x4)));
1066  vst1q_u8(targetBlock + targetStrideElements * 3u, vcombine_u8(vget_high_u8(targetHalfReverse3_u_8x16), vget_low_u8(targetHalfReverse3_u_8x16)));
1067 
1068  const uint8x16_t targetHalfReverse4_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result4_u_32x4)));
1069  vst1q_u8(targetBlock + targetStrideElements * 4u, vcombine_u8(vget_high_u8(targetHalfReverse4_u_8x16), vget_low_u8(targetHalfReverse4_u_8x16)));
1070 
1071  const uint8x16_t targetHalfReverse5_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result5_u_32x4)));
1072  vst1q_u8(targetBlock + targetStrideElements * 5u, vcombine_u8(vget_high_u8(targetHalfReverse5_u_8x16), vget_low_u8(targetHalfReverse5_u_8x16)));
1073 
1074  const uint8x16_t targetHalfReverse6_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result6_u_32x4)));
1075  vst1q_u8(targetBlock + targetStrideElements * 6u, vcombine_u8(vget_high_u8(targetHalfReverse6_u_8x16), vget_low_u8(targetHalfReverse6_u_8x16)));
1076 
1077  const uint8x16_t targetHalfReverse7_u_8x16 = vreinterpretq_u8_u16(vrev64q_u16(vreinterpretq_u16_u32(result7_u_32x4)));
1078  vst1q_u8(targetBlock + targetStrideElements * 7u, vcombine_u8(vget_high_u8(targetHalfReverse7_u_8x16), vget_low_u8(targetHalfReverse7_u_8x16)));
1079 
1080  break;
1081  }
1082 
1083  case FD_TOP_BOTTOM:
1084  {
1085  vst1q_u8(targetBlock + targetStrideElements * 0u, vreinterpretq_u8_u32(result7_u_32x4));
1086  vst1q_u8(targetBlock + targetStrideElements * 1u, vreinterpretq_u8_u32(result6_u_32x4));
1087  vst1q_u8(targetBlock + targetStrideElements * 2u, vreinterpretq_u8_u32(result5_u_32x4));
1088  vst1q_u8(targetBlock + targetStrideElements * 3u, vreinterpretq_u8_u32(result4_u_32x4));
1089  vst1q_u8(targetBlock + targetStrideElements * 4u, vreinterpretq_u8_u32(result3_u_32x4));
1090  vst1q_u8(targetBlock + targetStrideElements * 5u, vreinterpretq_u8_u32(result2_u_32x4));
1091  vst1q_u8(targetBlock + targetStrideElements * 6u, vreinterpretq_u8_u32(result1_u_32x4));
1092  vst1q_u8(targetBlock + targetStrideElements * 7u, vreinterpretq_u8_u32(result0_u_32x4));
1093 
1094  break;
1095  }
1096 
1097  default:
1098  ocean_assert(false && "Invalid flip direction!");
1099  }
1100 }
1101 
1102 template <>
1103 template <FrameTransposer::FlipDirection tFlipDirection>
1104 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<uint8_t, 3u>::transposeBlock8x8(const uint8_t* sourceBlock, uint8_t* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
1105 {
1106  ocean_assert(sourceBlock && targetBlock);
1107  ocean_assert(sourceStrideElements >= 8u * 3u && targetStrideElements >= 8u * 3u);
1108 
1109  // the NEON code is straight forward simply using the VTRN (transpose) instruction
1110  // the 3-channel code is similar to the 1-channel code but uses vld3_u8/vst3_u8 instead of vld1_u8/vst1_u8
1111 
1112  const uint8x8x3_t line0_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 0u);
1113  const uint8x8x3_t line1_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 1u);
1114 
1115  // A B C D E F G H A a C c E e G g
1116  // a b c d e f g h -> B b D d F f H h
1117  const uint8x8x2_t line01_channel0_u_8x8x2 = vtrn_u8(line0_u_8x8x3.val[0], line1_u_8x8x3.val[0]);
1118  const uint8x8x2_t line01_channel1_u_8x8x2 = vtrn_u8(line0_u_8x8x3.val[1], line1_u_8x8x3.val[1]);
1119  const uint8x8x2_t line01_channel2_u_8x8x2 = vtrn_u8(line0_u_8x8x3.val[2], line1_u_8x8x3.val[2]);
1120 
1121  const uint8x8x3_t line2_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 2u);
1122  const uint8x8x3_t line3_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 3u);
1123 
1124  // 0 1 2 3 4 5 6 7 0 ! 2 # 4 % 6 &
1125  // ! @ # $ % ^ & * -> 1 @ 3 $ 5 ^ 7 *
1126  const uint8x8x2_t line23_channel0_u_8x8x2 = vtrn_u8(line2_u_8x8x3.val[0], line3_u_8x8x3.val[0]);
1127  const uint8x8x2_t line23_channel1_u_8x8x2 = vtrn_u8(line2_u_8x8x3.val[1], line3_u_8x8x3.val[1]);
1128  const uint8x8x2_t line23_channel2_u_8x8x2 = vtrn_u8(line2_u_8x8x3.val[2], line3_u_8x8x3.val[2]);
1129 
1130  // Aa Cc Ee Gg Aa 0! Ee 4%
1131  // 0! 2# 4% 6& -> Cc 2# Gg 6&
1132  const uint16x4x2_t line02_channel0_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_channel0_u_8x8x2.val[0]), vreinterpret_u16_u8(line23_channel0_u_8x8x2.val[0]));
1133  const uint16x4x2_t line02_channel1_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_channel1_u_8x8x2.val[0]), vreinterpret_u16_u8(line23_channel1_u_8x8x2.val[0]));
1134  const uint16x4x2_t line02_channel2_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_channel2_u_8x8x2.val[0]), vreinterpret_u16_u8(line23_channel2_u_8x8x2.val[0]));
1135 
1136  // Bb Dd Ff Hh Bb 1@ Ef 5^
1137  // 1@ 3$ 5^ 7* -> Dd 3$ Hh 7*
1138  const uint16x4x2_t line13_channel0_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_channel0_u_8x8x2.val[1]), vreinterpret_u16_u8(line23_channel0_u_8x8x2.val[1]));
1139  const uint16x4x2_t line13_channel1_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_channel1_u_8x8x2.val[1]), vreinterpret_u16_u8(line23_channel1_u_8x8x2.val[1]));
1140  const uint16x4x2_t line13_channel2_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line01_channel2_u_8x8x2.val[1]), vreinterpret_u16_u8(line23_channel2_u_8x8x2.val[1]));
1141 
1142  const uint8x8x3_t line4_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 4u);
1143  const uint8x8x3_t line5_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 5u);
1144 
1145  const uint8x8x2_t line45_channel0_u_8x8x2 = vtrn_u8(line4_u_8x8x3.val[0], line5_u_8x8x3.val[0]);
1146  const uint8x8x2_t line45_channel1_u_8x8x2 = vtrn_u8(line4_u_8x8x3.val[1], line5_u_8x8x3.val[1]);
1147  const uint8x8x2_t line45_channel2_u_8x8x2 = vtrn_u8(line4_u_8x8x3.val[2], line5_u_8x8x3.val[2]);
1148 
1149  const uint8x8x3_t line6_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 6u);
1150  const uint8x8x3_t line7_u_8x8x3 = vld3_u8(sourceBlock + sourceStrideElements * 7u);
1151 
1152  const uint8x8x2_t line67_channel0_u_8x8x2 = vtrn_u8(line6_u_8x8x3.val[0], line7_u_8x8x3.val[0]);
1153  const uint8x8x2_t line67_channel1_u_8x8x2 = vtrn_u8(line6_u_8x8x3.val[1], line7_u_8x8x3.val[1]);
1154  const uint8x8x2_t line67_channel2_u_8x8x2 = vtrn_u8(line6_u_8x8x3.val[2], line7_u_8x8x3.val[2]);
1155 
1156  const uint16x4x2_t line46_channel0_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_channel0_u_8x8x2.val[0]), vreinterpret_u16_u8(line67_channel0_u_8x8x2.val[0]));
1157  const uint16x4x2_t line46_channel1_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_channel1_u_8x8x2.val[0]), vreinterpret_u16_u8(line67_channel1_u_8x8x2.val[0]));
1158  const uint16x4x2_t line46_channel2_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_channel2_u_8x8x2.val[0]), vreinterpret_u16_u8(line67_channel2_u_8x8x2.val[0]));
1159 
1160  const uint16x4x2_t line57_channel0_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_channel0_u_8x8x2.val[1]), vreinterpret_u16_u8(line67_channel0_u_8x8x2.val[1]));
1161  const uint16x4x2_t line57_channel1_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_channel1_u_8x8x2.val[1]), vreinterpret_u16_u8(line67_channel1_u_8x8x2.val[1]));
1162  const uint16x4x2_t line57_channel2_u_16x4x2 = vtrn_u16(vreinterpret_u16_u8(line45_channel2_u_8x8x2.val[1]), vreinterpret_u16_u8(line67_channel2_u_8x8x2.val[1]));
1163 
1164  const uint32x2x2_t line04_channel0_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_channel0_u_16x4x2.val[0]), vreinterpret_u32_u16(line46_channel0_u_16x4x2.val[0]));
1165  const uint32x2x2_t line04_channel1_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_channel1_u_16x4x2.val[0]), vreinterpret_u32_u16(line46_channel1_u_16x4x2.val[0]));
1166  const uint32x2x2_t line04_channel2_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_channel2_u_16x4x2.val[0]), vreinterpret_u32_u16(line46_channel2_u_16x4x2.val[0]));
1167 
1168  const uint32x2x2_t line26_channel0_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_channel0_u_16x4x2.val[1]), vreinterpret_u32_u16(line46_channel0_u_16x4x2.val[1]));
1169  const uint32x2x2_t line26_channel1_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_channel1_u_16x4x2.val[1]), vreinterpret_u32_u16(line46_channel1_u_16x4x2.val[1]));
1170  const uint32x2x2_t line26_channel2_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line02_channel2_u_16x4x2.val[1]), vreinterpret_u32_u16(line46_channel2_u_16x4x2.val[1]));
1171 
1172  const uint32x2x2_t line15_channel0_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_channel0_u_16x4x2.val[0]), vreinterpret_u32_u16(line57_channel0_u_16x4x2.val[0]));
1173  const uint32x2x2_t line15_channel1_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_channel1_u_16x4x2.val[0]), vreinterpret_u32_u16(line57_channel1_u_16x4x2.val[0]));
1174  const uint32x2x2_t line15_channel2_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_channel2_u_16x4x2.val[0]), vreinterpret_u32_u16(line57_channel2_u_16x4x2.val[0]));
1175 
1176  const uint32x2x2_t line37_channel0_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_channel0_u_16x4x2.val[1]), vreinterpret_u32_u16(line57_channel0_u_16x4x2.val[1]));
1177  const uint32x2x2_t line37_channel1_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_channel1_u_16x4x2.val[1]), vreinterpret_u32_u16(line57_channel1_u_16x4x2.val[1]));
1178  const uint32x2x2_t line37_channel2_u_32x2x2 = vtrn_u32(vreinterpret_u32_u16(line13_channel2_u_16x4x2.val[1]), vreinterpret_u32_u16(line57_channel2_u_16x4x2.val[1]));
1179 
1180  switch (tFlipDirection)
1181  {
1182  case FD_NONE:
1183  {
1184  uint8x8x3_t result0_u_8x8x3;
1185  result0_u_8x8x3.val[0] = vreinterpret_u8_u32(line04_channel0_u_32x2x2.val[0]);
1186  result0_u_8x8x3.val[1] = vreinterpret_u8_u32(line04_channel1_u_32x2x2.val[0]);
1187  result0_u_8x8x3.val[2] = vreinterpret_u8_u32(line04_channel2_u_32x2x2.val[0]);
1188  vst3_u8(targetBlock + targetStrideElements * 0u, result0_u_8x8x3);
1189 
1190  uint8x8x3_t result1_u_8x8x3;
1191  result1_u_8x8x3.val[0] = vreinterpret_u8_u32(line15_channel0_u_32x2x2.val[0]);
1192  result1_u_8x8x3.val[1] = vreinterpret_u8_u32(line15_channel1_u_32x2x2.val[0]);
1193  result1_u_8x8x3.val[2] = vreinterpret_u8_u32(line15_channel2_u_32x2x2.val[0]);
1194  vst3_u8(targetBlock + targetStrideElements * 1u, result1_u_8x8x3);
1195 
1196  uint8x8x3_t result2_u_8x8x3;
1197  result2_u_8x8x3.val[0] = vreinterpret_u8_u32(line26_channel0_u_32x2x2.val[0]);
1198  result2_u_8x8x3.val[1] = vreinterpret_u8_u32(line26_channel1_u_32x2x2.val[0]);
1199  result2_u_8x8x3.val[2] = vreinterpret_u8_u32(line26_channel2_u_32x2x2.val[0]);
1200  vst3_u8(targetBlock + targetStrideElements * 2u, result2_u_8x8x3);
1201 
1202  uint8x8x3_t result3_u_8x8x3;
1203  result3_u_8x8x3.val[0] = vreinterpret_u8_u32(line37_channel0_u_32x2x2.val[0]);
1204  result3_u_8x8x3.val[1] = vreinterpret_u8_u32(line37_channel1_u_32x2x2.val[0]);
1205  result3_u_8x8x3.val[2] = vreinterpret_u8_u32(line37_channel2_u_32x2x2.val[0]);
1206  vst3_u8(targetBlock + targetStrideElements * 3u, result3_u_8x8x3);
1207 
1208  uint8x8x3_t result4_u_8x8x3;
1209  result4_u_8x8x3.val[0] = vreinterpret_u8_u32(line04_channel0_u_32x2x2.val[1]);
1210  result4_u_8x8x3.val[1] = vreinterpret_u8_u32(line04_channel1_u_32x2x2.val[1]);
1211  result4_u_8x8x3.val[2] = vreinterpret_u8_u32(line04_channel2_u_32x2x2.val[1]);
1212  vst3_u8(targetBlock + targetStrideElements * 4u, result4_u_8x8x3);
1213 
1214  uint8x8x3_t result5_u_8x8x3;
1215  result5_u_8x8x3.val[0] = vreinterpret_u8_u32(line15_channel0_u_32x2x2.val[1]);
1216  result5_u_8x8x3.val[1] = vreinterpret_u8_u32(line15_channel1_u_32x2x2.val[1]);
1217  result5_u_8x8x3.val[2] = vreinterpret_u8_u32(line15_channel2_u_32x2x2.val[1]);
1218  vst3_u8(targetBlock + targetStrideElements * 5u, result5_u_8x8x3);
1219 
1220  uint8x8x3_t result6_u_8x8x3;
1221  result6_u_8x8x3.val[0] = vreinterpret_u8_u32(line26_channel0_u_32x2x2.val[1]);
1222  result6_u_8x8x3.val[1] = vreinterpret_u8_u32(line26_channel1_u_32x2x2.val[1]);
1223  result6_u_8x8x3.val[2] = vreinterpret_u8_u32(line26_channel2_u_32x2x2.val[1]);
1224  vst3_u8(targetBlock + targetStrideElements * 6u, result6_u_8x8x3);
1225 
1226  uint8x8x3_t result7_u_8x8x3;
1227  result7_u_8x8x3.val[0] = vreinterpret_u8_u32(line37_channel0_u_32x2x2.val[1]);
1228  result7_u_8x8x3.val[1] = vreinterpret_u8_u32(line37_channel1_u_32x2x2.val[1]);
1229  result7_u_8x8x3.val[2] = vreinterpret_u8_u32(line37_channel2_u_32x2x2.val[1]);
1230  vst3_u8(targetBlock + targetStrideElements * 7u, result7_u_8x8x3);
1231 
1232  break;
1233  }
1234 
1235  case FD_LEFT_RIGHT:
1236  {
1237  uint8x8x3_t result0_u_8x8x3;
1238  result0_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line04_channel0_u_32x2x2.val[0]));
1239  result0_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line04_channel1_u_32x2x2.val[0]));
1240  result0_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line04_channel2_u_32x2x2.val[0]));
1241  vst3_u8(targetBlock + targetStrideElements * 0u, result0_u_8x8x3);
1242 
1243  uint8x8x3_t result1_u_8x8x3;
1244  result1_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line15_channel0_u_32x2x2.val[0]));
1245  result1_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line15_channel1_u_32x2x2.val[0]));
1246  result1_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line15_channel2_u_32x2x2.val[0]));
1247  vst3_u8(targetBlock + targetStrideElements * 1u, result1_u_8x8x3);
1248 
1249  uint8x8x3_t result2_u_8x8x3;
1250  result2_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line26_channel0_u_32x2x2.val[0]));
1251  result2_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line26_channel1_u_32x2x2.val[0]));
1252  result2_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line26_channel2_u_32x2x2.val[0]));
1253  vst3_u8(targetBlock + targetStrideElements * 2u, result2_u_8x8x3);
1254 
1255  uint8x8x3_t result3_u_8x8x3;
1256  result3_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line37_channel0_u_32x2x2.val[0]));
1257  result3_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line37_channel1_u_32x2x2.val[0]));
1258  result3_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line37_channel2_u_32x2x2.val[0]));
1259  vst3_u8(targetBlock + targetStrideElements * 3u, result3_u_8x8x3);
1260 
1261  uint8x8x3_t result4_u_8x8x3;
1262  result4_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line04_channel0_u_32x2x2.val[1]));
1263  result4_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line04_channel1_u_32x2x2.val[1]));
1264  result4_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line04_channel2_u_32x2x2.val[1]));
1265  vst3_u8(targetBlock + targetStrideElements * 4u, result4_u_8x8x3);
1266 
1267  uint8x8x3_t result5_u_8x8x3;
1268  result5_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line15_channel0_u_32x2x2.val[1]));
1269  result5_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line15_channel1_u_32x2x2.val[1]));
1270  result5_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line15_channel2_u_32x2x2.val[1]));
1271  vst3_u8(targetBlock + targetStrideElements * 5u, result5_u_8x8x3);
1272 
1273  uint8x8x3_t result6_u_8x8x3;
1274  result6_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line26_channel0_u_32x2x2.val[1]));
1275  result6_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line26_channel1_u_32x2x2.val[1]));
1276  result6_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line26_channel2_u_32x2x2.val[1]));
1277  vst3_u8(targetBlock + targetStrideElements * 6u, result6_u_8x8x3);
1278 
1279  uint8x8x3_t result7_u_8x8x3;
1280  result7_u_8x8x3.val[0] = vrev64_u8(vreinterpret_u8_u32(line37_channel0_u_32x2x2.val[1]));
1281  result7_u_8x8x3.val[1] = vrev64_u8(vreinterpret_u8_u32(line37_channel1_u_32x2x2.val[1]));
1282  result7_u_8x8x3.val[2] = vrev64_u8(vreinterpret_u8_u32(line37_channel2_u_32x2x2.val[1]));
1283  vst3_u8(targetBlock + targetStrideElements * 7u, result7_u_8x8x3);
1284 
1285  break;
1286  }
1287 
1288  case FD_TOP_BOTTOM:
1289  {
1290  uint8x8x3_t result7_u_8x8x3;
1291  result7_u_8x8x3.val[0] = vreinterpret_u8_u32(line37_channel0_u_32x2x2.val[1]);
1292  result7_u_8x8x3.val[1] = vreinterpret_u8_u32(line37_channel1_u_32x2x2.val[1]);
1293  result7_u_8x8x3.val[2] = vreinterpret_u8_u32(line37_channel2_u_32x2x2.val[1]);
1294  vst3_u8(targetBlock + targetStrideElements * 0u, result7_u_8x8x3);
1295 
1296  uint8x8x3_t result6_u_8x8x3;
1297  result6_u_8x8x3.val[0] = vreinterpret_u8_u32(line26_channel0_u_32x2x2.val[1]);
1298  result6_u_8x8x3.val[1] = vreinterpret_u8_u32(line26_channel1_u_32x2x2.val[1]);
1299  result6_u_8x8x3.val[2] = vreinterpret_u8_u32(line26_channel2_u_32x2x2.val[1]);
1300  vst3_u8(targetBlock + targetStrideElements * 1u, result6_u_8x8x3);
1301 
1302  uint8x8x3_t result5_u_8x8x3;
1303  result5_u_8x8x3.val[0] = vreinterpret_u8_u32(line15_channel0_u_32x2x2.val[1]);
1304  result5_u_8x8x3.val[1] = vreinterpret_u8_u32(line15_channel1_u_32x2x2.val[1]);
1305  result5_u_8x8x3.val[2] = vreinterpret_u8_u32(line15_channel2_u_32x2x2.val[1]);
1306  vst3_u8(targetBlock + targetStrideElements * 2u, result5_u_8x8x3);
1307 
1308  uint8x8x3_t result4_u_8x8x3;
1309  result4_u_8x8x3.val[0] = vreinterpret_u8_u32(line04_channel0_u_32x2x2.val[1]);
1310  result4_u_8x8x3.val[1] = vreinterpret_u8_u32(line04_channel1_u_32x2x2.val[1]);
1311  result4_u_8x8x3.val[2] = vreinterpret_u8_u32(line04_channel2_u_32x2x2.val[1]);
1312  vst3_u8(targetBlock + targetStrideElements * 3u, result4_u_8x8x3);
1313 
1314  uint8x8x3_t result3_u_8x8x3;
1315  result3_u_8x8x3.val[0] = vreinterpret_u8_u32(line37_channel0_u_32x2x2.val[0]);
1316  result3_u_8x8x3.val[1] = vreinterpret_u8_u32(line37_channel1_u_32x2x2.val[0]);
1317  result3_u_8x8x3.val[2] = vreinterpret_u8_u32(line37_channel2_u_32x2x2.val[0]);
1318  vst3_u8(targetBlock + targetStrideElements * 4u, result3_u_8x8x3);
1319 
1320  uint8x8x3_t result2_u_8x8x3;
1321  result2_u_8x8x3.val[0] = vreinterpret_u8_u32(line26_channel0_u_32x2x2.val[0]);
1322  result2_u_8x8x3.val[1] = vreinterpret_u8_u32(line26_channel1_u_32x2x2.val[0]);
1323  result2_u_8x8x3.val[2] = vreinterpret_u8_u32(line26_channel2_u_32x2x2.val[0]);
1324  vst3_u8(targetBlock + targetStrideElements * 5u, result2_u_8x8x3);
1325 
1326  uint8x8x3_t result1_u_8x8x3;
1327  result1_u_8x8x3.val[0] = vreinterpret_u8_u32(line15_channel0_u_32x2x2.val[0]);
1328  result1_u_8x8x3.val[1] = vreinterpret_u8_u32(line15_channel1_u_32x2x2.val[0]);
1329  result1_u_8x8x3.val[2] = vreinterpret_u8_u32(line15_channel2_u_32x2x2.val[0]);
1330  vst3_u8(targetBlock + targetStrideElements * 6u, result1_u_8x8x3);
1331 
1332  uint8x8x3_t result0_u_8x8x3;
1333  result0_u_8x8x3.val[0] = vreinterpret_u8_u32(line04_channel0_u_32x2x2.val[0]);
1334  result0_u_8x8x3.val[1] = vreinterpret_u8_u32(line04_channel1_u_32x2x2.val[0]);
1335  result0_u_8x8x3.val[2] = vreinterpret_u8_u32(line04_channel2_u_32x2x2.val[0]);
1336  vst3_u8(targetBlock + targetStrideElements * 7u, result0_u_8x8x3);
1337 
1338  break;
1339  }
1340 
1341  default:
1342  ocean_assert(false && "Invalid flip direction!");
1343  }
1344 }
1345 
1346 template <>
1347 template <FrameTransposer::FlipDirection tFlipDirection>
1348 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<uint8_t, 4u>::transposeBlock8x8(const uint8_t* sourceBlock, uint8_t* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
1349 {
1350  ocean_assert(sourceBlock && targetBlock);
1351  ocean_assert(sourceStrideElements >= 8u * 4u && targetStrideElements >= 8u * 4u);
1352 
1353  // we simply tranpose four blocks of 4x4 pixels
1354 
1355  switch (tFlipDirection)
1356  {
1357  case FD_NONE:
1358  {
1359  transposeBlock4x4NEON<tFlipDirection>(sourceBlock, targetBlock, sourceStrideElements, targetStrideElements);
1360  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 16, targetBlock + 4 * targetStrideElements, sourceStrideElements, targetStrideElements);
1361  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 4 * sourceStrideElements, targetBlock + 16, sourceStrideElements, targetStrideElements);
1362  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 4 * sourceStrideElements + 16, targetBlock + 4 * targetStrideElements + 16, sourceStrideElements, targetStrideElements);
1363 
1364  break;
1365  }
1366 
1367  case FD_LEFT_RIGHT:
1368  {
1369  transposeBlock4x4NEON<tFlipDirection>(sourceBlock, targetBlock + 16, sourceStrideElements, targetStrideElements);
1370  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 16, targetBlock + 4 * targetStrideElements + 16, sourceStrideElements, targetStrideElements);
1371  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 4 * sourceStrideElements, targetBlock, sourceStrideElements, targetStrideElements);
1372  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 4 * sourceStrideElements + 16, targetBlock + 4 * targetStrideElements, sourceStrideElements, targetStrideElements);
1373 
1374  break;
1375  }
1376 
1377  case FD_TOP_BOTTOM:
1378  {
1379  transposeBlock4x4NEON<tFlipDirection>(sourceBlock, targetBlock + 4 * targetStrideElements, sourceStrideElements, targetStrideElements);
1380  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 16, targetBlock, sourceStrideElements, targetStrideElements);
1381  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 4 * sourceStrideElements, targetBlock + 4 * targetStrideElements + 16, sourceStrideElements, targetStrideElements);
1382  transposeBlock4x4NEON<tFlipDirection>(sourceBlock + 4 * sourceStrideElements + 16, targetBlock + 16, sourceStrideElements, targetStrideElements);
1383 
1384  break;
1385  }
1386 
1387  default:
1388  ocean_assert(false && "Invalid flip direction!");
1389  }
1390 }
1391 
1392 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1393 
1394 template <typename T, unsigned int tChannels>
1395 template <FrameTransposer::FlipDirection tFlipDirection>
1396 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<T, tChannels>::transposeBlock8x8(const T* sourceBlock, T* targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
1397 {
1398  ocean_assert(sourceBlock && targetBlock);
1399  ocean_assert(sourceStrideElements >= 8u && targetStrideElements >= 8u);
1400 
1401  typedef typename DataType<T, tChannels>::Type PixelType;
1402 
1403  switch (tFlipDirection)
1404  {
1405  case FD_NONE:
1406  {
1407  // simply transposing the block
1408 
1409  for (unsigned int y = 0u; y < 8u; ++y)
1410  {
1411  const PixelType* const sourcePixel = (const PixelType*)sourceBlock;
1412 
1413  for (unsigned int x = 0u; x < 8u; ++x)
1414  {
1415  *((PixelType*)(targetBlock + targetStrideElements * x)) = sourcePixel[x];
1416  }
1417 
1418  sourceBlock += sourceStrideElements;
1419  targetBlock += tChannels;
1420  }
1421 
1422  break;
1423  }
1424 
1425  case FD_LEFT_RIGHT:
1426  {
1427  // transposing the block and applying a left-right flip like a mirror, actually a 90 degree clockwise rotation
1428 
1429  for (unsigned int y = 0u; y < 8u; ++y)
1430  {
1431  const PixelType* const sourcePixel = (const PixelType*)sourceBlock;
1432 
1433  for (unsigned int x = 0u; x < 8u; ++x)
1434  {
1435  *((PixelType*)(targetBlock + targetStrideElements * x) + (8u - y - 1u)) = sourcePixel[x];
1436  }
1437 
1438  sourceBlock += sourceStrideElements;
1439  }
1440 
1441  break;
1442  }
1443 
1444  case FD_TOP_BOTTOM:
1445  {
1446  // transposing the block and applying a top-bottom flip, actually a 90 degree counter clockwise rotation
1447 
1448  for (unsigned int y = 0u; y < 8u; ++y)
1449  {
1450  const PixelType* const sourcePixel = (const PixelType*)sourceBlock;
1451 
1452  for (unsigned int x = 0u; x < 8u; ++x)
1453  {
1454  *((PixelType*)(targetBlock + targetStrideElements * (8u - x - 1u)) + y) = sourcePixel[x];
1455  }
1456 
1457  sourceBlock += sourceStrideElements;
1458  }
1459 
1460  break;
1461  }
1462 
1463  default:
1464  ocean_assert(false && "Invalid flip direction!");
1465  }
1466 }
1467 
1468 template <typename T, unsigned int tChannels>
1469 template <FrameTransposer::FlipDirection tFlipDirection>
1470 OCEAN_FORCE_INLINE void FrameTransposer::BlockTransposer<T, tChannels>::transposeBlock(const T* sourceBlock, T* targetBlock, const unsigned int blockWidth, const unsigned int blockHeight, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
1471 {
1472  ocean_assert(sourceBlock && targetBlock);
1473 
1474  ocean_assert(blockWidth >= 1u && blockHeight >= 1u);
1475  ocean_assert(blockWidth < 8u || blockHeight < 8u);
1476 
1477  ocean_assert(sourceStrideElements >= blockWidth);
1478  ocean_assert(targetStrideElements >= blockHeight);
1479 
1480  typedef typename DataType<T, tChannels>::Type PixelType;
1481 
1482  switch (tFlipDirection)
1483  {
1484  case FD_NONE:
1485  {
1486  // simply transposing the block
1487 
1488  for (unsigned int y = 0u; y < blockHeight; ++y)
1489  {
1490  const PixelType* const sourcePixel = (const PixelType*)sourceBlock;
1491 
1492  for (unsigned int x = 0u; x < blockWidth; ++x)
1493  {
1494  *((PixelType*)(targetBlock + targetStrideElements * x)) = sourcePixel[x];
1495  }
1496 
1497  sourceBlock += sourceStrideElements;
1498  targetBlock += tChannels;
1499  }
1500 
1501  break;
1502  }
1503 
1504  case FD_LEFT_RIGHT:
1505  {
1506  // transposing the block and applying a left-right flip like a mirror, actually a 90 degree clockwise rotation
1507 
1508  for (unsigned int y = 0u; y < blockHeight; ++y)
1509  {
1510  const PixelType* const sourcePixel = (const PixelType*)sourceBlock;
1511 
1512  for (unsigned int x = 0u; x < blockWidth; ++x)
1513  {
1514  *((PixelType*)(targetBlock + targetStrideElements * x) + (blockHeight - y - 1u)) = sourcePixel[x];
1515  }
1516 
1517  sourceBlock += sourceStrideElements;
1518  }
1519 
1520  break;
1521  }
1522 
1523  case FD_TOP_BOTTOM:
1524  {
1525  // transposing the block and applying a top-bottom flip, actually a 90 degree counter clockwise rotation
1526 
1527  for (unsigned int y = 0u; y < blockHeight; ++y)
1528  {
1529  const PixelType* const sourcePixel = (const PixelType*)sourceBlock;
1530 
1531  for (unsigned int x = 0u; x < blockWidth; ++x)
1532  {
1533  *((PixelType*)(targetBlock + targetStrideElements * (blockWidth - x - 1u)) + y) = sourcePixel[x];
1534  }
1535 
1536  sourceBlock += sourceStrideElements;
1537  }
1538 
1539  break;
1540  }
1541 
1542  default:
1543  ocean_assert(false && "Invalid flip direction!");
1544  }
1545 }
1546 
1547 template <typename TElementType, unsigned int tChannels>
1548 inline void FrameTransposer::rotate90Subset(const TElementType* source, TElementType* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const bool clockwise, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
1549 {
1550  static_assert(tChannels >= 1u, "Invalid channel number!");
1551 
1552  ocean_assert(source && target);
1553  ocean_assert(sourceWidth >= 1u && sourceHeight >= 1u);
1554 
1555  ocean_assert(firstTargetRow + numberTargetRows <= sourceWidth);
1556 
1557  const unsigned int& targetWidth = sourceHeight;
1558 
1559  // Clockwise: Counter-Clockwise:
1560  // Source: Source:
1561  // ^ ^ ^ ^ ... D C B A
1562  // | | | | | | | |
1563  // | | | | | | | |
1564  // | | | | | | | |
1565  // A B C D ... v v v v
1566  // Target: Target:
1567  // A ------> A ------>
1568  // B ------> B ------>
1569  // C ------> C ------>
1570  // D ... D ...
1571  // E ... E ...
1572 
1573  const unsigned int sourceStrideElements = sourceWidth * tChannels + sourcePaddingElements;
1574  const unsigned int targetStrideElements = targetWidth * tChannels + targetPaddingElements;
1575 
1576  TElementType* targetRowStartElement = target + firstTargetRow * targetStrideElements;
1577  const TElementType* const targetEndElement = targetRowStartElement + numberTargetRows * targetStrideElements - targetPaddingElements;
1578  ocean_assert_and_suppress_unused(targetRowStartElement < targetEndElement || numberTargetRows == 0u, targetEndElement);
1579 
1580  if (clockwise)
1581  {
1582  const TElementType* sourceColumnStartElement = source + (sourceHeight - 1u) * sourceStrideElements + tChannels * firstTargetRow;
1583 
1584  for (unsigned row = 0u; row < numberTargetRows; ++row)
1585  {
1586  const TElementType* sourceElement = sourceColumnStartElement;
1587 
1588  TElementType* targetElement = targetRowStartElement;
1589  const TElementType* const targetRowEndElement = targetRowStartElement + tChannels * targetWidth;
1590  ocean_assert(targetRowEndElement <= targetEndElement);
1591 
1592  while (targetElement != targetRowEndElement)
1593  {
1594  ocean_assert(sourceElement < source + sourceHeight * sourceStrideElements - sourcePaddingElements);
1595  ocean_assert(targetElement < targetEndElement);
1596  ocean_assert(targetElement < targetRowEndElement);
1597 
1598  for (unsigned int c = 0u; c < tChannels; ++c)
1599  {
1600  targetElement[c] = sourceElement[c];
1601  }
1602 
1603  sourceElement -= sourceStrideElements;
1604  targetElement += tChannels;
1605  }
1606 
1607  sourceColumnStartElement += tChannels;
1608  targetRowStartElement += targetStrideElements;
1609  }
1610  }
1611  else
1612  {
1613  const TElementType* sourceColumnStartElement = source + tChannels * (sourceWidth - firstTargetRow - 1u);
1614 
1615  for (unsigned row = 0u; row < numberTargetRows; ++row)
1616  {
1617  const TElementType* sourceElement = sourceColumnStartElement;
1618  ocean_assert(sourceElement >= source);
1619 
1620  TElementType* targetElement = targetRowStartElement;
1621  const TElementType* const targetRowEndElement = targetRowStartElement + tChannels * targetWidth;
1622  ocean_assert(targetRowEndElement <= targetEndElement);
1623 
1624  while (targetElement != targetRowEndElement)
1625  {
1626  ocean_assert(sourceElement < source + sourceHeight * sourceStrideElements - sourcePaddingElements);
1627  ocean_assert(targetElement < targetEndElement);
1628  ocean_assert(targetElement < targetRowEndElement);
1629 
1630  for (unsigned int c = 0u; c < tChannels; ++c)
1631  {
1632  targetElement[c] = sourceElement[c];
1633  }
1634 
1635  sourceElement += sourceStrideElements;
1636  targetElement += tChannels;
1637  }
1638 
1639  sourceColumnStartElement -= tChannels;
1640  targetRowStartElement += targetStrideElements;
1641  }
1642  }
1643 }
1644 
1645 }
1646 
1647 }
1648 
1649 #endif // META_OCEAN_CV_FRAME_TRANSPOSER_H
@ CONVERT_FLIPPED_AND_MIRRORED
Rotated conversion, rotates the image by 180.0 degrees with anchor in the center of the image.
Definition: FrameConverter.h:82
Helper class for functions transposing blocks.
Definition: FrameTransposer.h:122
static OCEAN_FORCE_INLINE void transposeBlock(const T *sourceBlock, T *targetBlock, const unsigned int blockWidth, const unsigned int blockHeight, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
Transposes a block of n x m pixels.
Definition: FrameTransposer.h:1470
static OCEAN_FORCE_INLINE void transposeBlock4x4NEON(const T *sourceBlock, T *targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
Transposes a block of 4x4 pixels.
static OCEAN_FORCE_INLINE void transposeBlock8x8(const T *sourceBlock, T *targetBlock, const unsigned int sourceStrideElements, const unsigned int targetStrideElements)
Transposes a block of 8x8 pixels.
Definition: FrameTransposer.h:1396
The following comfort class provides comfortable functions simplifying prototyping applications but a...
Definition: FrameTransposer.h:39
static bool rotate(const Frame &input, Frame &output, const int angle, Worker *worker=nullptr)
Rotates a given frame with 90 degree steps.
static bool rotate180(const Frame &input, Frame &output, Worker *worker=nullptr)
Rotates a given frame by 180 degrees.
static bool rotate90(const Frame &input, Frame &output, const bool clockwise, Worker *worker=nullptr)
Rotates a given frame either clockwise or counter-clockwise by 90 degrees.
This class implements a frame transposer.
Definition: FrameTransposer.h:30
static void rotate90Subset(const TElementType *source, TElementType *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const bool clockwise, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
Rotates a subset of a given frame either clockwise or counter-clockwise by 90 degrees.
Definition: FrameTransposer.h:1548
static void transposeSubset(const T *source, T *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstSourceRow, const unsigned int numberSourceRows)
Transposes the subset of a given image buffer.
Definition: FrameTransposer.h:519
static bool transpose(const Frame &source, Frame &target, Worker *worker=nullptr)
Transposes a given frame.
FlipDirection
Definition of individual flip directions which can be applied to a transposed frame.
Definition: FrameTransposer.h:105
@ FD_NONE
Applying no flip.
Definition: FrameTransposer.h:107
@ FD_TOP_BOTTOM
Applying a top-bottom flip, combined with a transpose operation an image can be rotated counter clock...
Definition: FrameTransposer.h:111
@ FD_LEFT_RIGHT
Applying a left-right flip like a mirror, combined with a transpose operation an image can be rotated...
Definition: FrameTransposer.h:109
static void rotate180(const T *source, T *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Rotates a given image buffer 180 degrees.
Definition: FrameTransposer.h:455
static void rotate90(const T *source, T *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const bool clockwise, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Rotates a given image buffer 90 degrees clockwise or counter clockwise.
Definition: FrameTransposer.h:395
static bool rotate(const T *source, T *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const int angle, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Rotates a given image with 90 degree steps.
Definition: FrameTransposer.h:468
static Caller< void > createStatic(typename StaticFunctionPointerMaker< void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a static function with no function parameter.
Definition: Caller.h:2876
This class implements Ocean's image class.
Definition: Frame.h:1760
void setRelativeTimestamp(const Timestamp &relative)
Sets the relative timestamp of this frame.
Definition: Frame.h:4121
bool isValid() const
Returns whether this frame is valid.
Definition: Frame.h:4416
void setTimestamp(const Timestamp &timestamp)
Sets the timestamp of this frame.
Definition: Frame.h:4116
const Timestamp & timestamp() const
Returns the timestamp of this frame.
Definition: Frame.h:4106
const Timestamp & relativeTimestamp() const
Returns the relative timestamp of this frame.
Definition: Frame.h:4111
TypeMapperBySize< sizeof(T)>::Type Type
Definition of an invalid mapped data type.
Definition: DataType.h:501
This class implements a worker able to distribute function calls over different threads.
Definition: Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15
Default definition of a type with tBytes bytes.
Definition: DataType.h:32