Ocean
SquareMatrix4.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_MATH_SQUARE_MATRIX_4_H
9 #define META_OCEAN_MATH_SQUARE_MATRIX_4_H
10 
11 #include "ocean/math/Math.h"
12 #include "ocean/math/Equation.h"
13 #include "ocean/math/Vector2.h"
14 #include "ocean/math/Vector3.h"
15 #include "ocean/math/Vector4.h"
16 
17 #include <vector>
18 
19 namespace Ocean
20 {
21 
22 // Forward declaration.
23 template <typename T> class AnyCameraT;
24 
25 // Forward declaration.
26 template <typename T> class HomogenousMatrixT4;
27 
28 // Forward declaration.
29 template <typename T> class SquareMatrixT3;
30 
31 // Forward declaration.
32 template <typename T> class SquareMatrixT4;
33 
34 /**
35  * Definition of the SquareMatrix4 object, depending on the OCEAN_MATH_USE_SINGLE_PRECISION either with single or double precision float data type.
36  * @see SquareMatrixT4
37  * @ingroup math
38  */
40 
41 /**
42  * Instantiation of the SquareMatrixT4 template class using a double precision float data type.
43  * @see SquareMatrixT4
44  * @ingroup math
45  */
47 
48 /**
49  * Instantiation of the SquareMatrixT4 template class using a double precision float data type.
50  * @see SquareMatrixT4
51  * @ingroup math
52  */
54 
55 /**
56  * Definition of a typename alias for vectors with SquareMatrixT4 objects.
57  * @see SquareMatrixT4
58  * @ingroup math
59  */
60 template <typename T>
61 using SquareMatricesT4 = std::vector<SquareMatrixT4<T>>;
62 
63 /**
64  * Definition of a vector holding SquareMatrix4 objects.
65  * @see SquareMatrix4
66  * @ingroup math
67  */
68 typedef std::vector<SquareMatrix4> SquareMatrices4;
69 
70 /**
71  * This class implements a 4x4 square matrix.
72  * The values are stored in a column aligned order with indices:
73  * <pre>
74  * | 0 4 8 12 |
75  * | 1 5 9 13 |
76  * | 2 6 10 14 |
77  * | 3 7 11 15 |
78  * </pre>
79  * @tparam T Data type of matrix elements
80  * @see SquareMatrix4, SquareMatrixF4, SquareMatrixD4.
81  * @ingroup math
82  */
83 template <typename T>
85 {
86  template <typename U> friend class SquareMatrixT4;
87 
88  public:
89 
90  /**
91  * Definition of the used data type.
92  */
93  typedef T Type;
94 
95  public:
96 
97  /**
98  * Creates a new SquareMatrixT4 object with undefined elements.
99  * Beware: This matrix is neither a zero nor an entity matrix!
100  */
101  inline SquareMatrixT4();
102 
103  /**
104  * Copy constructor.
105  * @param matrix The matrix to copy
106  */
107  SquareMatrixT4(const SquareMatrixT4<T>& matrix) = default;
108 
109  /**
110  * Copy constructor for a matrix with difference element data type than T.
111  * @param matrix The matrix to copy
112  * @tparam U The element data type of the second matrix
113  */
114  template <typename U>
115  inline explicit SquareMatrixT4(const SquareMatrixT4<U>& matrix);
116 
117  /**
118  * Creates a new SquareMatrixT4 object.
119  * @param setToIdentity Determines whether a entity matrix will be created
120  */
121  explicit SquareMatrixT4(const bool setToIdentity);
122 
123  /**
124  * Creates a new SquareMatrixT4 object by an array of at least sixteen elements of float type U.
125  * @param arrayValues The sixteen matrix elements defining the new matrix, must be valid
126  * @tparam U The floating point type of the given elements
127  */
128  template <typename U>
129  explicit SquareMatrixT4(const U* arrayValues);
130 
131  /**
132  * Creates a new SquareMatrixT4 object by an array of at least sixteen elements.
133  * @param arrayValues The sixteen matrix elements defining the new matrix, must be valid
134  */
135  explicit SquareMatrixT4(const T* arrayValues);
136 
137  /**
138  * Creates a new SquareMatrixT4 object by an array of at least sixteen elements of float type U.
139  * @param arrayValues The sixteen matrix elements defining the new matrix, must be valid
140  * @param valuesRowAligned True, if the given values are stored in a row aligned order; False, if the values are stored in a column aligned order (which is the default case for this matrix)
141  * @tparam U The floating point type of the given elements
142  */
143  template <typename U>
144  SquareMatrixT4(const U* arrayValues, const bool valuesRowAligned);
145 
146  /**
147  * Creates a new SquareMatrixT4 object by an array of at least sixteen elements.
148  * @param valuesRowAligned True, if the given values are stored in a row aligned order; False, if the values are stored in a column aligned order (which is the default case for this matrix)
149  * @param arrayValues The sixteen matrix elements defining the new matrix, must be valid
150  */
151  SquareMatrixT4(const T* arrayValues, const bool valuesRowAligned);
152 
153  /**
154  * Creates a new SquareMatrixT4 object by given transformation matrix.
155  * @param transformation The transformation matrix to copy
156  */
157  explicit SquareMatrixT4(const HomogenousMatrixT4<T>& transformation);
158 
159  /**
160  * Creates a new SquareMatrixT4 object by given 3x3 sub matrix.
161  * The lower and right elements of the 4x4 square matrix are set to zero.
162  * @param subMatrix 3x3 subMatrix defining the upper left elements of the new matrix
163  */
164  explicit SquareMatrixT4(const SquareMatrixT3<T>& subMatrix);
165 
166  /**
167  * Creates a new SquareMatrixT4 object by a given diagonal vector.
168  * @param diagonal The diagonal vector for the new matrix
169  */
170  explicit SquareMatrixT4(const VectorT4<T>& diagonal);
171 
172  /**
173  * Returns the transposed of this matrix.
174  * @return Transposed matrix
175  */
177 
178  /**
179  * Transposes the matrix.
180  */
181  void transpose();
182 
183  /**
184  * Returns the inverted matrix of this matrix.
185  * This matrix must not be singular.<br>
186  * Beware: This function does not throw an exception if the matrix cannot be inverted.<br>
187  * Thus, ensure that the matrix is invertible before calling this function.<br>
188  * Even better: avoid the usage of this function and call invert() instead.<br>
189  * In case, this matrix is not invertible, this matrix will be returned instead.
190  * @return The inverted matrix
191  * @see invert(), isSingular().
192  */
194 
195  /**
196  * Inverts this matrix in place.
197  * @return True, if the matrix is not singular and could be inverted
198  * @see inverted().
199  */
200  bool invert();
201 
202  /**
203  * Inverts the matrix and returns the result.
204  * @param invertedMatrix The resulting inverted matrix
205  * @return True, if the matrix is not singular and could be inverted
206  * @see inverted().
207  */
208  bool invert(SquareMatrixT4<T>& invertedMatrix) const;
209 
210  /**
211  * Returns the determinant of the matrix.
212  * @return Matrix determinant
213  */
214  T determinant() const;
215 
216  /**
217  * Returns the trace of the matrix which is the sum of the diagonal elements.
218  * @return Trace of the matrix
219  */
220  T trace() const;
221 
222  /**
223  * Sets the matrix to the identity matrix.
224  */
225  inline void toIdentity();
226 
227  /**
228  * Sets the matrix to a zero matrix.
229  * @see isNull();
230  */
231  inline void toNull();
232 
233  /**
234  * Returns whether this matrix is a null matrix.
235  * @return True, if so
236  */
237  bool isNull() const;
238 
239  /**
240  * Returns whether this matrix is the identity matrix.
241  * @return True, if so
242  */
243  bool isIdentity() const;
244 
245  /**
246  * Returns whether this matrix is singular (and thus cannot be inverted).
247  * A matrix is singular if the determinant of a matrix is 0.<br>
248  * @return True, if so
249  */
250  inline bool isSingular() const;
251 
252  /**
253  * Returns whether this matrix is symmetric.
254  * @param epsilon The epsilon threshold to be used, with range [0, infinity)
255  * @return True, if so
256  */
257  bool isSymmetric(const T epsilon = NumericT<T>::eps()) const;
258 
259  /**
260  * Returns whether two matrices are almost identical up to a specified epsilon.
261  * @param matrix Second matrix that will be checked
262  * @param eps The epsilon threshold to be used, with range [0, infinity)
263  * @return True, if so
264  */
265  inline bool isEqual(const SquareMatrixT4<T>& matrix, const T eps = NumericT<T>::eps()) const;
266 
267  /**
268  * Performs an eigen value analysis.
269  * @param eigenValues Vector holding the three eigen values
270  * @param eigenVectors Matrix holding the three corresponding eigen vectors
271  * @return True, if succeeded
272  */
273  bool eigenSystem(VectorT4<T>& eigenValues, SquareMatrixT4<T>& eigenVectors);
274 
275  /**
276  * Returns a pointer to the internal values.
277  * @return Pointer to the internal values
278  */
279  inline const T* data() const;
280 
281  /**
282  * Returns a pointer to the internal values.
283  * @return Pointer to the internal values
284  */
285  inline T* data();
286 
287  /**
288  * Copies the elements of this matrix to an array with floating point values of type U.
289  * @param arrayValues Array with 16 floating point values of type U receiving the elements of this matrix
290  * @tparam U Floating point type
291  */
292  template <typename U>
293  void copyElements(U* arrayValues) const;
294 
295  /**
296  * Copies the elements of this matrix to an array with floating point values.
297  * @param arrayValues Array with floating point values receiving the elements of this matrix
298  */
299  void copyElements(T* arrayValues) const;
300 
301  /**
302  * Default assign operator.
303  * @return Reference to this object
304  */
306 
307  /**
308  * Returns whether two matrices are identical up to a small epsilon.
309  * @param matrix Right operand
310  * @return True, if so
311  */
312  bool operator==(const SquareMatrixT4<T>& matrix) const;
313 
314  /**
315  * Returns whether two matrices are not identical up to a small epsilon.
316  * @param matrix Right operand
317  * @return True, if so
318  */
319  inline bool operator!=(const SquareMatrixT4<T>& matrix) const;
320 
321  /**
322  * Adds two matrices.
323  * @param matrix Right operand
324  * @return Sum matrix
325  */
327 
328  /**
329  * Adds and assigns two matrices.
330  * @param matrix Right operand
331  * @return Reference to this object
332  */
334 
335  /**
336  * Subtracts two matrices.
337  * @param matrix Right operand
338  * @return Difference matrix
339  */
341 
342  /**
343  * Subtracts and assigns two matrices.
344  * @param matrix Right operand
345  * @return Reference to this object
346  */
348 
349  /**
350  * Returns the negative matrix of this matrix (all matrix elements are multiplied by -1).
351  * @return Resulting negative matrix
352  */
353  inline SquareMatrixT4<T> operator-() const;
354 
355  /**
356  * Multiplies two matrices.
357  * @param matrix Right operand
358  * @return Product matrix
359  */
360  OCEAN_FORCE_INLINE SquareMatrixT4<T> operator*(const SquareMatrixT4<T>& matrix) const;
361 
362  /**
363  * Multiplies two matrices.
364  * @param matrix Right operand
365  * @return Product matrix
366  */
367  OCEAN_FORCE_INLINE SquareMatrixT4<T> operator*(const HomogenousMatrixT4<T>& matrix) const;
368 
369  /**
370  * Multiplies and assigns two matrices.
371  * @param matrix Right operand
372  * @return Reference to this object
373  */
374  OCEAN_FORCE_INLINE SquareMatrixT4<T>& operator*=(const SquareMatrixT4<T>& matrix);
375 
376  /**
377  * Multiplies and assigns two matrices.
378  * @param matrix Right operand
379  * @return Reference to this object
380  */
381  OCEAN_FORCE_INLINE SquareMatrixT4<T>& operator*=(const HomogenousMatrixT4<T>& matrix);
382 
383  /**
384  * Multiply operator for a 3D vector.
385  * The 3D vector is interpreted as a 4D vector with fourth component equal to 1.<br>
386  * The final result will be de-homogenizated to provide a 3D vector result.<br>
387  * Beware the dot product of the last row with the vector must not be zero!
388  * @param vector Right operand
389  * @return Resulting 3D vector
390  */
391  OCEAN_FORCE_INLINE VectorT3<T> operator*(const VectorT3<T>& vector) const;
392 
393  /**
394  * Multiply operator for a 4D vector.
395  * @param vector Right operand
396  * @return Resulting 4D vector
397  */
398  OCEAN_FORCE_INLINE VectorT4<T> operator*(const VectorT4<T>& vector) const;
399 
400  /**
401  * Multiplies this matrix with a scalar value.
402  * @param value Right operand
403  * @return Resulting matrix
404  */
405  SquareMatrixT4<T> operator*(const T value) const;
406 
407  /**
408  * Multiplies and assigns this matrix with a scalar value.
409  * @param value right operand
410  * @return Reference to this object
411  */
412  SquareMatrixT4<T>& operator*=(const T value);
413 
414  /**
415  * Element operator.
416  * Beware: No range check will be done!
417  * @param index The index of the element to return [0, 15]
418  * @return Specified element
419  */
420  inline T operator[](const unsigned int index) const;
421 
422  /**
423  * Element operator.
424  * Beware: No range check will be done!
425  * @param index The index of the element to return [0, 15]
426  * @return Specified element
427  */
428  inline T& operator[](const unsigned int index);
429 
430  /**
431  * Element operator.
432  * Beware: No range check will be done!
433  * @param row The row of the element to return [0, 3]
434  * @param column The column of the element to return [0, 3]
435  * @return Specified element
436  */
437  inline T operator()(const unsigned int row, const unsigned int column) const;
438 
439  /**
440  * Element operator.
441  * Beware: No range check will be done!
442  * @param row The row of the element to return [0, 3]
443  * @param column The column of the element to return [0, 3]
444  * @return Specified element
445  */
446  inline T& operator()(const unsigned int row, const unsigned int column);
447 
448  /**
449  * Element operator.
450  * Beware: No range check will be done!
451  * @param index The index of the element to return [0, 15]
452  * @return Specified element
453  */
454  inline T operator()(const unsigned int index) const;
455 
456  /**
457  * Element operator.
458  * Beware: No range check will be done!
459  * @param index The index of the element to return [0, 15]
460  * @return Specified element
461  */
462  inline T& operator()(const unsigned int index);
463 
464  /**
465  * Access operator.
466  * @return Pointer to the internal values
467  */
468  inline const T* operator()() const;
469 
470  /**
471  * Access operator.
472  * @return Pointer to the internal values
473  */
474  inline T* operator()();
475 
476  /**
477  * Hash function.
478  * @param matrix The matrix for which the hash value will be determined
479  * @return The resulting hash value
480  */
481  inline size_t operator()(const SquareMatrixT4<T>& matrix) const;
482 
483  /**
484  * Returns the number of elements this matrix has.
485  * @return The number of elements, always 16
486  */
487  static inline size_t elements();
488 
489  /**
490  * Multiplies several 4D vectors with a given matrix.
491  * @param matrix The matrix to be used for multiplication, may be nullptr if number is 0
492  * @param vectors The input vectors that will be multiplied, may be nullptr if number is 0
493  * @param results The resulting output (multiplied/transformed) vectors, with same number as the provided input vectors
494  * @param number The number of provided vectors (input and output), with range [0, infinity)
495  */
496  static void multiply(const SquareMatrixT4<T>& matrix, const VectorT4<T>* vectors, VectorT4<T>* results, const size_t number);
497 
498  /**
499  * Creates a projection matrix defined by the horizontal field of view, the aspect ratio and the near and far clipping plane.
500  * @param fovX Horizontal field of view, in radian and a range of [0, Pi]
501  * @param aspectRatio View aspect ratio which is width divided by the height of the projection window
502  * @param nearDistance Positive distance to the near clipping plane
503  * @param farDistance Positive distance to the far clipping lane
504  * @return Specified projection matrix
505  */
506  static SquareMatrixT4<T> projectionMatrix(const T fovX, const T aspectRatio, const T nearDistance, const T farDistance);
507 
508  /**
509  * Creates a projection matrix defined by a camera profile of a pinhole camera and the near and far clipping plane.
510  * @param anyCamera The camera profile of a pinhole camera, without distortion parameters, must be valid
511  * @param nearDistance Positive distance to the near clipping plane
512  * @param farDistance Positive distance to the far clipping lane
513  * @return Specified projection matrix
514  */
515  static SquareMatrixT4<T> projectionMatrix(const AnyCameraT<T>& anyCamera, const T nearDistance, const T farDistance);
516 
517  /**
518  * Creates a projection matrix defined by an asymmetric viewing frustum.
519  * The shape of the frustum is defined by the rectangle on the near plane.<br>
520  * Afterwards, the field of view is defined by the (positive) distance to the near clipping plane.<br>
521  * Followed by the (positive) far clipping plane to determine the entire frustum.
522  * @param left Position of the left border of the rectangle on the near plane
523  * @param right Position of the right border of the rectangle on the near plane
524  * @param top Position of the top border of the rectangle on the near plane
525  * @param bottom Position of the bottom border of the rectangle on the near plane
526  * @param nearDistance Positive distance to the near clipping plane
527  * @param farDistance Positive distance to the far clipping plane
528  * @return Specified frustum projection matrix
529  */
530  static SquareMatrixT4<T> frustumMatrix(const T left, const T right, const T top, const T bottom, const T nearDistance, const T farDistance);
531 
532  /**
533  * Creates a project matrix defined by an asymmetric viewing frustum.
534  * The shape of the frustum is defined by the rectangle on the near plane.<br>
535  * The viewing position is defined by the given view matrix while the near plane is expected to lie in the origin of the coordinate system.
536  * @param width The width of the near plane, with range (0, infinity)
537  * @param height The height of the near plane, with range (0, infinity)
538  * @param viewingMatrix Viewing matrix transforming point defined in the camera coordinate system into points defined in the world coordinate system, must be invertible
539  * @param nearDistance Positive distance to the near clipping plane, with range (0, infinity)
540  * @param farDistance Positive distance to the far clipping plane, with range (nearDistance, infinity)
541  * @return Specified frustum projection matrix
542  */
543  static SquareMatrixT4<T> frustumMatrix(const T width, const T height, const HomogenousMatrixT4<T>& viewingMatrix, const T nearDistance, const T farDistance);
544 
545  /**
546  * Converts matrices with specific data type to matrices with different data type.
547  * @param matrices The matrices to convert
548  * @return The converted matrices
549  * @tparam U The element data type of the matrices to convert
550  */
551  template <typename U>
552  static inline std::vector< SquareMatrixT4<T> > matrices2matrices(const std::vector< SquareMatrixT4<U> >& matrices);
553 
554  /**
555  * Converts matrices with specific data type to matrices with different data type.
556  * @param matrices The matrices to convert
557  * @param size The number of matrices to convert
558  * @return The converted matrices
559  * @tparam U The element data type of the matrices to convert
560  */
561  template <typename U>
562  static inline std::vector< SquareMatrixT4<T> > matrices2matrices(const SquareMatrixT4<U>* matrices, const size_t size);
563 
564  protected:
565 
566  /**
567  * Swaps two rows of this matrix.
568  * @param row0 The index of the first row ,with range [0, 3]
569  * @param row1 The index of the second row, with range [0, 3]
570  */
571  void swapRows(const unsigned int row0, const unsigned int row1);
572 
573  /**
574  * Multiplies a row with a scalar value.
575  * @param row The index of the row to multiply, with range [0, 3]
576  * @param scalar The scalar to multiply
577  */
578  void multiplyRow(const unsigned int row, const T scalar);
579 
580  /**
581  * Multiplies elements from a specific row with a scalar and adds them to another row.
582  * @param targetRow The index of the target row, with range [0, 3]
583  * @param sourceRow The index of the source row, with range [0, 3]
584  * @param scalar The scalar to multiply the source elements with
585  */
586  void addRows(const unsigned int targetRow, const unsigned int sourceRow, const T scalar);
587 
588  protected:
589 
590  /// The sixteen values of the matrix.
591  T values[16];
592 };
593 
594 template <typename T>
596 {
597  // nothing to do here
598 }
599 
600 template <typename T>
601 template <typename U>
603 {
604  values[ 0] = T(matrix.values[ 0]);
605  values[ 1] = T(matrix.values[ 1]);
606  values[ 2] = T(matrix.values[ 2]);
607  values[ 3] = T(matrix.values[ 3]);
608  values[ 4] = T(matrix.values[ 4]);
609  values[ 5] = T(matrix.values[ 5]);
610  values[ 6] = T(matrix.values[ 6]);
611  values[ 7] = T(matrix.values[ 7]);
612  values[ 8] = T(matrix.values[ 8]);
613  values[ 9] = T(matrix.values[ 9]);
614  values[10] = T(matrix.values[10]);
615  values[11] = T(matrix.values[11]);
616  values[12] = T(matrix.values[12]);
617  values[13] = T(matrix.values[13]);
618  values[14] = T(matrix.values[14]);
619  values[15] = T(matrix.values[15]);
620 }
621 
622 template <typename T>
623 SquareMatrixT4<T>::SquareMatrixT4(const bool setToIdentity)
624 {
625  if (setToIdentity)
626  {
627  values[ 0] = T(1.0);
628  values[ 1] = T(0.0);
629  values[ 2] = T(0.0);
630  values[ 3] = T(0.0);
631  values[ 4] = T(0.0);
632  values[ 5] = T(1.0);
633  values[ 6] = T(0.0);
634  values[ 7] = T(0.0);
635  values[ 8] = T(0.0);
636  values[ 9] = T(0.0);
637  values[10] = T(1.0);
638  values[11] = T(0.0);
639  values[12] = T(0.0);
640  values[13] = T(0.0);
641  values[14] = T(0.0);
642  values[15] = T(1.0);
643  }
644  else
645  {
646  values[ 0] = T(0.0);
647  values[ 1] = T(0.0);
648  values[ 2] = T(0.0);
649  values[ 3] = T(0.0);
650  values[ 4] = T(0.0);
651  values[ 5] = T(0.0);
652  values[ 6] = T(0.0);
653  values[ 7] = T(0.0);
654  values[ 8] = T(0.0);
655  values[ 9] = T(0.0);
656  values[10] = T(0.0);
657  values[11] = T(0.0);
658  values[12] = T(0.0);
659  values[13] = T(0.0);
660  values[14] = T(0.0);
661  values[15] = T(0.0);
662  }
663 }
664 
665 template <typename T>
666 template <typename U>
668 {
669  ocean_assert(arrayValues);
670 
671  for (unsigned int n = 0u; n < 16u; ++n)
672  {
673  values[n] = T(arrayValues[n]);
674  }
675 }
676 
677 template <typename T>
679 {
680  ocean_assert(arrayValues);
681  memcpy(values, arrayValues, sizeof(T) * 16);
682 }
683 
684 template <typename T>
685 template <typename U>
686 SquareMatrixT4<T>::SquareMatrixT4(const U* arrayValues, const bool valuesRowAligned)
687 {
688  ocean_assert(arrayValues);
689 
690  if (valuesRowAligned)
691  {
692  values[ 0] = T(arrayValues[ 0]);
693  values[ 1] = T(arrayValues[ 4]);
694  values[ 2] = T(arrayValues[ 8]);
695  values[ 3] = T(arrayValues[12]);
696  values[ 4] = T(arrayValues[ 1]);
697  values[ 5] = T(arrayValues[ 5]);
698  values[ 6] = T(arrayValues[ 9]);
699  values[ 7] = T(arrayValues[13]);
700  values[ 8] = T(arrayValues[ 2]);
701  values[ 9] = T(arrayValues[ 6]);
702  values[10] = T(arrayValues[10]);
703  values[11] = T(arrayValues[14]);
704  values[12] = T(arrayValues[ 3]);
705  values[13] = T(arrayValues[ 7]);
706  values[14] = T(arrayValues[11]);
707  values[15] = T(arrayValues[15]);
708  }
709  else
710  {
711  for (unsigned int n = 0u; n < 16u; ++n)
712  values[n] = T(arrayValues[n]);
713  }
714 }
715 
716 template <typename T>
717 SquareMatrixT4<T>::SquareMatrixT4(const T* arrayValues, const bool valuesRowAligned)
718 {
719  ocean_assert(arrayValues);
720 
721  if (valuesRowAligned)
722  {
723  values[ 0] = arrayValues[ 0];
724  values[ 1] = arrayValues[ 4];
725  values[ 2] = arrayValues[ 8];
726  values[ 3] = arrayValues[12];
727  values[ 4] = arrayValues[ 1];
728  values[ 5] = arrayValues[ 5];
729  values[ 6] = arrayValues[ 9];
730  values[ 7] = arrayValues[13];
731  values[ 8] = arrayValues[ 2];
732  values[ 9] = arrayValues[ 6];
733  values[10] = arrayValues[10];
734  values[11] = arrayValues[14];
735  values[12] = arrayValues[ 3];
736  values[13] = arrayValues[ 7];
737  values[14] = arrayValues[11];
738  values[15] = arrayValues[15];
739  }
740  else
741  {
742  memcpy(values, arrayValues, sizeof(T) * 16);
743  }
744 }
745 
746 template <typename T>
748 {
749  memcpy(values, transformation(), sizeof(T) * 16);
750 }
751 
752 template <typename T>
754 {
755  values[ 3] = T(0.0);
756  values[ 7] = T(0.0);
757  values[11] = T(0.0);
758  values[12] = T(0.0);
759  values[13] = T(0.0);
760  values[14] = T(0.0);
761  values[15] = T(0.0);
762 
763  memcpy(values, subMatrix(), sizeof(T) * 3); // Set values[0] - values[2]
764  memcpy(values + 4, subMatrix() + 3, sizeof(T) * 3); // Set values[4] - values[6]
765  memcpy(values + 8, subMatrix() + 6, sizeof(T) * 3); // Set values[8] - values[10]
766 }
767 
768 template <typename T>
770 {
771  values[ 0] = diagonal[0];
772  values[ 1] = T(0.0);
773  values[ 2] = T(0.0);
774  values[ 3] = T(0.0);
775  values[ 4] = T(0.0);
776  values[ 5] = diagonal[1];
777  values[ 6] = T(0.0);
778  values[ 7] = T(0.0);
779  values[ 8] = T(0.0);
780  values[ 9] = T(0.0);
781  values[10] = diagonal[2];
782  values[11] = T(0.0);
783  values[12] = T(0.0);
784  values[13] = T(0.0);
785  values[14] = T(0.0);
786  values[15] = diagonal[3];
787 }
788 
789 template <typename T>
790 inline const T* SquareMatrixT4<T>::data() const
791 {
792  return values;
793 }
794 
795 template <typename T>
797 {
798  return values;
799 }
800 
801 template <typename T>
803 {
804  SquareMatrixT4<T> result(*this);
805 
806  result.values[1] = values[4];
807  result.values[4] = values[1];
808 
809  result.values[2] = values[8];
810  result.values[8] = values[2];
811 
812  result.values[3] = values[12];
813  result.values[12] = values[3];
814 
815  result.values[6] = values[9];
816  result.values[9] = values[6];
817 
818  result.values[7] = values[13];
819  result.values[13] = values[7];
820 
821  result.values[11] = values[14];
822  result.values[14] = values[11];
823 
824  return result;
825 }
826 
827 template <typename T>
829 {
830  SquareMatrixT4<T> tmp(*this);
831 
832  values[4] = tmp.values[1];
833  values[1] = tmp.values[4];
834 
835  values[8] = tmp.values[2];
836  values[2] = tmp.values[8];
837 
838  values[12] = tmp.values[3];
839  values[3] = tmp.values[12];
840 
841  values[9] = tmp.values[6];
842  values[6] = tmp.values[9];
843 
844  values[13] = tmp.values[7];
845  values[7] = tmp.values[13];
846 
847  values[14] = tmp.values[11];
848  values[11] = tmp.values[14];
849 }
850 
851 template <typename T>
853 {
854  SquareMatrixT4 invertedMatrix;
855 
856  if (!invert(invertedMatrix))
857  {
858  ocean_assert(false && "Could not invert matrix.");
859  return *this;
860  }
861 
862  return invertedMatrix;
863 }
864 
865 template <typename T>
867 {
868  SquareMatrixT4<T> invertedMatrix;
869 
870  if (!invert(invertedMatrix))
871  {
872  return false;
873  }
874 
875  *this = invertedMatrix;
876 
877  return true;
878 }
879 
880 template <typename T>
881 bool SquareMatrixT4<T>::invert(SquareMatrixT4<T>& invertedMatrix) const
882 {
883  // implements the Gauss-Jordon Elimination
884 
885  SquareMatrixT4<T> source(*this);
886  invertedMatrix.toIdentity();
887 
888  for (unsigned int col = 0; col < 4u; ++col)
889  {
890  // find largest absolute value in the col-th column,
891  // to remove zeros from the main diagonal and to provide numerical stability
892  T absolute = 0.0;
893  unsigned int selectedRow = 0u;
894 
895  for (unsigned int row = col; row < 4u; ++row)
896  {
897  const T value = NumericT<T>::abs(source(row, col));
898  if (absolute < value)
899  {
900  absolute = value;
901  selectedRow = row;
902  }
903  }
904 
905  // if there was no greater absolute value than 0 this matrix is singular
906 
907  if (NumericT<T>::isEqualEps(absolute))
908  {
909  return false;
910  }
911 
912  // exchange the two rows
913  if (selectedRow != col)
914  {
915  source.swapRows(col, selectedRow);
916  invertedMatrix.swapRows(col, selectedRow);
917  }
918 
919  // now the element at (col, col) will be 1.0
920  if (NumericT<T>::isNotEqual(source(col, col), T(1.0)))
921  {
922  const T divisor = T(1.0) / source(col, col);
923  ocean_assert(divisor != T(0.0));
924 
925  source.multiplyRow(col, divisor);
926  invertedMatrix.multiplyRow(col, divisor);
927  }
928 
929  // clear each entry above and below the selected row and column to zero
930  for (unsigned int row = 0; row < 4u; ++row)
931  {
932  if (row != col)
933  {
934  const T value = -source(row, col);
935 
936  source.addRows(row, col, value);
937  invertedMatrix.addRows(row, col, value);
938  }
939  }
940  }
941 
942  return true;
943 }
944 
945 template <typename T>
947 {
948  const T v6_15 = values[6] * values[15];
949  const T v10_15 = values[10] * values[15];
950  const T v11_14 = values[11] * values[14];
951  const T v7_10 = values[7] * values[10];
952  const T v9_14 = values[9] * values[14];
953  const T v6_13 = values[6] * values[13];
954  const T v2_13 = values[2] * values[13];
955  const T v2_9 = values[2] * values[9];
956  const T v3_10 = values[3] * values[10];
957  const T v2_5 = values[2] * values[5];
958 
959  return values[0] * (values[5] * v10_15 - values[13] * v7_10
960  + v9_14 * values[7] - values[5] * v11_14
961  + v6_13 * values[11] - values[9] * v6_15)
962  - values[4] * (v9_14 * values[3] - values[1] * v11_14
963  + v2_13 * values[11] - v2_9 * values[15]
964  + values[1] * v10_15 - values[13] * v3_10)
965  + values[8] * (values[1] * v6_15 - v6_13 * values[3]
966  + values[5] * values[14] * values[3] - values[1] * values[14] * values[7]
967  + v2_13 * values[7] - v2_5 * values[15])
968  - values[12] * (values[1] * values[6] * values[11] - values[9] * values[6] * values[3]
969  + values[5] * v3_10 - values[1] * v7_10
970  + v2_9 * values[7] - v2_5 * values[11]);
971 }
972 
973 template <typename T>
975 {
976  return values[0] + values[5] + values[10] + values[15];
977 }
978 
979 template <typename T>
981 {
982  values[ 0] = T(1.0);
983  values[ 1] = T(0.0);
984  values[ 2] = T(0.0);
985  values[ 3] = T(0.0);
986  values[ 4] = T(0.0);
987  values[ 5] = T(1.0);
988  values[ 6] = T(0.0);
989  values[ 7] = T(0.0);
990  values[ 8] = T(0.0);
991  values[ 9] = T(0.0);
992  values[10] = T(1.0);
993  values[11] = T(0.0);
994  values[12] = T(0.0);
995  values[13] = T(0.0);
996  values[14] = T(0.0);
997  values[15] = T(1.0);
998 }
999 
1000 template <typename T>
1002 {
1003  values[ 0] = T(0.0);
1004  values[ 1] = T(0.0);
1005  values[ 2] = T(0.0);
1006  values[ 3] = T(0.0);
1007  values[ 4] = T(0.0);
1008  values[ 5] = T(0.0);
1009  values[ 6] = T(0.0);
1010  values[ 7] = T(0.0);
1011  values[ 8] = T(0.0);
1012  values[ 9] = T(0.0);
1013  values[10] = T(0.0);
1014  values[11] = T(0.0);
1015  values[12] = T(0.0);
1016  values[13] = T(0.0);
1017  values[14] = T(0.0);
1018  values[15] = T(0.0);
1019 }
1020 
1021 template <typename T>
1023 {
1024  return NumericT<T>::isEqual(values[0], 1) && NumericT<T>::isEqualEps(values[1]) && NumericT<T>::isEqualEps(values[2]) && NumericT<T>::isEqualEps(values[3])
1025  && NumericT<T>::isEqualEps(values[4]) && NumericT<T>::isEqual(values[5], 1) && NumericT<T>::isEqualEps(values[6]) && NumericT<T>::isEqualEps(values[7])
1026  && NumericT<T>::isEqualEps(values[8]) && NumericT<T>::isEqualEps(values[9]) && NumericT<T>::isEqual(values[10], 1) && NumericT<T>::isEqualEps(values[11])
1027  && NumericT<T>::isEqualEps(values[12]) && NumericT<T>::isEqualEps(values[13]) && NumericT<T>::isEqualEps(values[14]) && NumericT<T>::isEqual(values[15], 1);
1028 }
1029 
1030 template <typename T>
1032 {
1033  return NumericT<T>::isEqualEps(values[0]) && NumericT<T>::isEqualEps(values[1]) && NumericT<T>::isEqualEps(values[2]) && NumericT<T>::isEqualEps(values[3])
1034  && NumericT<T>::isEqualEps(values[4]) && NumericT<T>::isEqualEps(values[5]) && NumericT<T>::isEqualEps(values[6]) && NumericT<T>::isEqualEps(values[7])
1035  && NumericT<T>::isEqualEps(values[8]) && NumericT<T>::isEqualEps(values[9]) && NumericT<T>::isEqualEps(values[10]) && NumericT<T>::isEqualEps(values[11])
1036  && NumericT<T>::isEqualEps(values[12]) && NumericT<T>::isEqualEps(values[13]) && NumericT<T>::isEqualEps(values[14]) && NumericT<T>::isEqualEps(values[15]);
1037 }
1038 
1039 template <typename T>
1041 {
1042  return NumericT<T>::isEqualEps(determinant());
1043 }
1044 
1045 template <typename T>
1046 bool SquareMatrixT4<T>::isSymmetric(const T epsilon) const
1047 {
1048  ocean_assert(epsilon >= T(0));
1049 
1050  return NumericT<T>::isEqual(values[1], values[4], epsilon) && NumericT<T>::isEqual(values[2], values[8], epsilon) && NumericT<T>::isEqual(values[3], values[12], epsilon)
1051  && NumericT<T>::isEqual(values[6], values[9], epsilon) && NumericT<T>::isEqual(values[7], values[13], epsilon) && NumericT<T>::isEqual(values[11], values[14], epsilon);
1052 }
1053 
1054 template <typename T>
1055 inline bool SquareMatrixT4<T>::isEqual(const SquareMatrixT4<T>& matrix, const T eps) const
1056 {
1057  return NumericT<T>::isEqual(values[0], matrix.values[0], eps) && NumericT<T>::isEqual(values[1], matrix.values[1], eps)
1058  && NumericT<T>::isEqual(values[2], matrix.values[2], eps) && NumericT<T>::isEqual(values[3], matrix.values[3], eps)
1059  && NumericT<T>::isEqual(values[4], matrix.values[4], eps) && NumericT<T>::isEqual(values[5], matrix.values[5], eps)
1060  && NumericT<T>::isEqual(values[6], matrix.values[6], eps) && NumericT<T>::isEqual(values[7], matrix.values[7], eps)
1061  && NumericT<T>::isEqual(values[8], matrix.values[8], eps) && NumericT<T>::isEqual(values[9], matrix.values[9], eps)
1062  && NumericT<T>::isEqual(values[10], matrix.values[10], eps) && NumericT<T>::isEqual(values[11], matrix.values[11], eps)
1063  && NumericT<T>::isEqual(values[12], matrix.values[12], eps) && NumericT<T>::isEqual(values[13], matrix.values[13], eps)
1064  && NumericT<T>::isEqual(values[14], matrix.values[14], eps) && NumericT<T>::isEqual(values[15], matrix.values[15], eps);
1065 }
1066 
1067 template <typename T>
1069 {
1070  /**
1071  * <pre>
1072  * Computation of the characteristic polynomial
1073  *
1074  * [ a b c d ]
1075  * A = [ e f g h ]
1076  * [ i j k l ]
1077  * [ m n o p ]
1078  *
1079  * [ a-x b c d ]
1080  * A - x * E = [ e f-x g h ]
1081  * [ i j k-x l ]
1082  * [ m n o p-x ]
1083  *
1084  * Polynomial = Det|A - x * E| = 0
1085  * = x^4 + (-a - f - k - p)x^3 + (-be + af - ci - gj + ak + fk - dm - hn - lo + ap + fp + kp)x^2
1086  * + (cfi - bgi - cej + agj + bek - afk + dfm - bhm + dkm - clm - den + ahn + hkn - gln - dio - hjo + alo + flo + bep - afp + cip + gjp - akp - fkp)x
1087  * + (dgjm - chjm - dfkm + bhkm + cflm - bglm)
1088  * + (-dgin + chin + dekn - ahkn - celn + agln)
1089  * + (dfio - bhio - dejo + ahjo + belo - aflo)
1090  * + (-cfip + bgip + cejp - agjp - bekp + afkp)
1091  * = a1x^4 + a2x^3 + a3x^2 + a4x + a5 = 0
1092  * </pre>
1093  */
1094 
1095  const T a = values[0];
1096  const T b = values[1];
1097  const T c = values[2];
1098  const T d = values[3];
1099  const T e = values[4];
1100  const T f = values[5];
1101  const T g = values[6];
1102  const T h = values[7];
1103  const T i = values[8];
1104  const T j = values[9];
1105  const T k = values[10];
1106  const T l = values[11];
1107  const T m = values[12];
1108  const T n = values[13];
1109  const T o = values[14];
1110  const T p = values[15];
1111 
1112  const T a1 = 1.0;
1113  const T a2 = -a - f - k - p;
1114  const T a3 = -b * e + a * f - c * i - g * j + a * k + f * k - d * m - h * n - l * o + a * p + f * p + k * p;
1115  const T a4 = c * f * i - b * g * i - c * e * j + a * g * j + b * e * k - a * f * k + d * f * m - b * h * m
1116  + d * k * m - c * l * m - d * e * n + a * h * n + h * k * n - g * l * n - d * i * o - h * j * o
1117  + a * l * o + f * l * o + b * e * p - a * f * p + c * i * p + g * j * p - a * k * p - f * k * p;
1118  const T a5 = d * g * j * m - c * h * j * m - d * f * k * m + b * h * k * m + c * f * l * m - b * g * l * m
1119  - d * g * i * n + c * h * i * n + d * e * k * n - a * h * k * n - c * e * l * n + a * g * l * n
1120  + d * f * i * o - b * h * i * o - d * e * j * o + a * h * j * o + b * e * l * o - a * f * l * o
1121  - c * f * i * p + b * g * i * p + c * e * j * p - a * g * j * p - b * e * k * p + a * f * k * p;
1122 
1123 #if defined(SQUARE_MATRIX_DISABLED_MISSING_IMPLEMENTATION)
1124  T x[4];
1125  unsigned int solutions = EquationT<T>::solveQuartic(a1, a2, a3, a4, a5, x);
1126 #endif // SQUARE_MATRIX_DISABLED_MISSING_IMPLEMENTATION
1127 
1128  ocean_assert(false && "Missing implementation, calculate eigen vectors");
1129  OCEAN_SUPPRESS_UNUSED_WARNING(eigenValues);
1130  OCEAN_SUPPRESS_UNUSED_WARNING(eigenVectors);
1131 
1132  return false;
1133 }
1134 
1135 template <typename T>
1136 template <typename U>
1137 void SquareMatrixT4<T>::copyElements(U* arrayValues) const
1138 {
1139  ocean_assert(arrayValues);
1140 
1141  arrayValues[ 0] = U(values[ 0]);
1142  arrayValues[ 1] = U(values[ 1]);
1143  arrayValues[ 2] = U(values[ 2]);
1144  arrayValues[ 3] = U(values[ 3]);
1145  arrayValues[ 4] = U(values[ 4]);
1146  arrayValues[ 5] = U(values[ 5]);
1147  arrayValues[ 6] = U(values[ 6]);
1148  arrayValues[ 7] = U(values[ 7]);
1149  arrayValues[ 8] = U(values[ 8]);
1150  arrayValues[ 9] = U(values[ 9]);
1151  arrayValues[10] = U(values[10]);
1152  arrayValues[11] = U(values[11]);
1153  arrayValues[12] = U(values[12]);
1154  arrayValues[13] = U(values[13]);
1155  arrayValues[14] = U(values[14]);
1156  arrayValues[15] = U(values[15]);
1157 }
1158 
1159 template <typename T>
1160 void SquareMatrixT4<T>::copyElements(T* arrayValues) const
1161 {
1162  ocean_assert(arrayValues);
1163 
1164  memcpy(arrayValues, values, sizeof(T) * 16);
1165 }
1166 
1167 template <typename T>
1168 inline bool SquareMatrixT4<T>::operator!=(const SquareMatrixT4<T>& matrix) const
1169 {
1170  return !(*this == matrix);
1171 }
1172 
1173 template <typename T>
1175 {
1176  *this = *this * matrix;
1177  return *this;
1178 }
1179 
1180 template <typename T>
1182 {
1183  *this = *this * matrix;
1184  return *this;
1185 }
1186 
1187 template <typename T>
1188 inline T SquareMatrixT4<T>::operator[](const unsigned int index) const
1189 {
1190  ocean_assert(index < 16u);
1191  return values[index];
1192 }
1193 
1194 template <typename T>
1195 inline T& SquareMatrixT4<T>::operator[](const unsigned int index)
1196 {
1197  ocean_assert(index < 16u);
1198  return values[index];
1199 }
1200 
1201 template <typename T>
1202 inline T SquareMatrixT4<T>::operator()(const unsigned int row, const unsigned int column) const
1203 {
1204  ocean_assert(row < 4u && column < 4u);
1205  return values[(column << 2) + row]; // values[(column * 4) + row];
1206 }
1207 
1208 template <typename T>
1209 inline T& SquareMatrixT4<T>::operator()(const unsigned int row, const unsigned int column)
1210 {
1211  ocean_assert(row < 4u && column < 4u);
1212  return values[(column << 2) + row]; // values[(column * 4) + row];
1213 }
1214 
1215 template <typename T>
1216 inline T SquareMatrixT4<T>::operator()(const unsigned int index) const
1217 {
1218  ocean_assert(index < 16u);
1219  return values[index];
1220 }
1221 
1222 template <typename T>
1223 inline T& SquareMatrixT4<T>::operator()(const unsigned int index)
1224 {
1225  ocean_assert(index < 16u);
1226  return values[index];
1227 }
1228 
1229 template <typename T>
1230 inline const T* SquareMatrixT4<T>::operator()() const
1231 {
1232  return values;
1233 }
1234 
1235 template <typename T>
1237 {
1238  return values;
1239 }
1240 
1241 template <typename T>
1242 inline size_t SquareMatrixT4<T>::operator()(const SquareMatrixT4<T>& matrix) const
1243 {
1244  size_t seed = std::hash<T>{}(matrix.values[0]);
1245 
1246  for (unsigned int n = 1u; n < 16u; ++n)
1247  {
1248  seed ^= std::hash<T>{}(matrix.values[n]) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
1249  }
1250 
1251  return seed;
1252 }
1253 
1254 template <typename T>
1256 {
1257  return 16;
1258 }
1259 
1260 template <typename T>
1262 {
1263  return isEqual(matrix);
1264 }
1265 
1266 template <typename T>
1268 {
1269  SquareMatrixT4<T> result(*this);
1270 
1271  result.values[0] += matrix.values[0];
1272  result.values[1] += matrix.values[1];
1273  result.values[2] += matrix.values[2];
1274  result.values[3] += matrix.values[3];
1275  result.values[4] += matrix.values[4];
1276  result.values[5] += matrix.values[5];
1277  result.values[6] += matrix.values[6];
1278  result.values[7] += matrix.values[7];
1279  result.values[8] += matrix.values[8];
1280  result.values[9] += matrix.values[9];
1281  result.values[10] += matrix.values[10];
1282  result.values[11] += matrix.values[11];
1283  result.values[12] += matrix.values[12];
1284  result.values[13] += matrix.values[13];
1285  result.values[14] += matrix.values[14];
1286  result.values[15] += matrix.values[15];
1287 
1288  return result;
1289 }
1290 
1291 template <typename T>
1293 {
1294  values[0] += matrix.values[0];
1295  values[1] += matrix.values[1];
1296  values[2] += matrix.values[2];
1297  values[3] += matrix.values[3];
1298  values[4] += matrix.values[4];
1299  values[5] += matrix.values[5];
1300  values[6] += matrix.values[6];
1301  values[7] += matrix.values[7];
1302  values[8] += matrix.values[8];
1303  values[9] += matrix.values[9];
1304  values[10] += matrix.values[10];
1305  values[11] += matrix.values[11];
1306  values[12] += matrix.values[12];
1307  values[13] += matrix.values[13];
1308  values[14] += matrix.values[14];
1309  values[15] += matrix.values[15];
1310 
1311  return *this;
1312 }
1313 
1314 template <typename T>
1316 {
1317  SquareMatrixT4<T> result(*this);
1318 
1319  result.values[0] -= matrix.values[0];
1320  result.values[1] -= matrix.values[1];
1321  result.values[2] -= matrix.values[2];
1322  result.values[3] -= matrix.values[3];
1323  result.values[4] -= matrix.values[4];
1324  result.values[5] -= matrix.values[5];
1325  result.values[6] -= matrix.values[6];
1326  result.values[7] -= matrix.values[7];
1327  result.values[8] -= matrix.values[8];
1328  result.values[9] -= matrix.values[9];
1329  result.values[10] -= matrix.values[10];
1330  result.values[11] -= matrix.values[11];
1331  result.values[12] -= matrix.values[12];
1332  result.values[13] -= matrix.values[13];
1333  result.values[14] -= matrix.values[14];
1334  result.values[15] -= matrix.values[15];
1335 
1336  return result;
1337 }
1338 
1339 template <typename T>
1341 {
1342  values[0] -= matrix.values[0];
1343  values[1] -= matrix.values[1];
1344  values[2] -= matrix.values[2];
1345  values[3] -= matrix.values[3];
1346  values[4] -= matrix.values[4];
1347  values[5] -= matrix.values[5];
1348  values[6] -= matrix.values[6];
1349  values[7] -= matrix.values[7];
1350  values[8] -= matrix.values[8];
1351  values[9] -= matrix.values[9];
1352  values[10] -= matrix.values[10];
1353  values[11] -= matrix.values[11];
1354  values[12] -= matrix.values[12];
1355  values[13] -= matrix.values[13];
1356  values[14] -= matrix.values[14];
1357  values[15] -= matrix.values[15];
1358 
1359  return *this;
1360 }
1361 
1362 template <typename T>
1364 {
1365  SquareMatrixT4<T> result;
1366 
1367  result.values[ 0] = -values[ 0];
1368  result.values[ 1] = -values[ 1];
1369  result.values[ 2] = -values[ 2];
1370  result.values[ 3] = -values[ 3];
1371  result.values[ 4] = -values[ 4];
1372  result.values[ 5] = -values[ 5];
1373  result.values[ 6] = -values[ 6];
1374  result.values[ 7] = -values[ 7];
1375  result.values[ 8] = -values[ 8];
1376  result.values[ 9] = -values[ 9];
1377  result.values[10] = -values[10];
1378  result.values[11] = -values[11];
1379  result.values[12] = -values[12];
1380  result.values[13] = -values[13];
1381  result.values[14] = -values[14];
1382 
1383  return result;
1384 }
1385 
1386 template <typename T>
1388 {
1389  SquareMatrixT4<T> result;
1390 
1391  result.values[0] = values[0] * matrix.values[0] + values[4] * matrix.values[1] + values[8] * matrix.values[2] + values[12] * matrix.values[3];
1392  result.values[1] = values[1] * matrix.values[0] + values[5] * matrix.values[1] + values[9] * matrix.values[2] + values[13] * matrix.values[3];
1393  result.values[2] = values[2] * matrix.values[0] + values[6] * matrix.values[1] + values[10] * matrix.values[2] + values[14] * matrix.values[3];
1394  result.values[3] = values[3] * matrix.values[0] + values[7] * matrix.values[1] + values[11] * matrix.values[2] + values[15] * matrix.values[3];
1395 
1396  result.values[4] = values[0] * matrix.values[4] + values[4] * matrix.values[5] + values[8] * matrix.values[6] + values[12] * matrix.values[7];
1397  result.values[5] = values[1] * matrix.values[4] + values[5] * matrix.values[5] + values[9] * matrix.values[6] + values[13] * matrix.values[7];
1398  result.values[6] = values[2] * matrix.values[4] + values[6] * matrix.values[5] + values[10] * matrix.values[6] + values[14] * matrix.values[7];
1399  result.values[7] = values[3] * matrix.values[4] + values[7] * matrix.values[5] + values[11] * matrix.values[6] + values[15] * matrix.values[7];
1400 
1401  result.values[8] = values[0] * matrix.values[8] + values[4] * matrix.values[9] + values[8] * matrix.values[10] + values[12] * matrix.values[11];
1402  result.values[9] = values[1] * matrix.values[8] + values[5] * matrix.values[9] + values[9] * matrix.values[10] + values[13] * matrix.values[11];
1403  result.values[10] = values[2] * matrix.values[8] + values[6] * matrix.values[9] + values[10] * matrix.values[10] + values[14] * matrix.values[11];
1404  result.values[11] = values[3] * matrix.values[8] + values[7] * matrix.values[9] + values[11] * matrix.values[10] + values[15] * matrix.values[11];
1405 
1406  result.values[12] = values[0] * matrix.values[12] + values[4] * matrix.values[13] + values[8] * matrix.values[14] + values[12] * matrix.values[15];
1407  result.values[13] = values[1] * matrix.values[12] + values[5] * matrix.values[13] + values[9] * matrix.values[14] + values[13] * matrix.values[15];
1408  result.values[14] = values[2] * matrix.values[12] + values[6] * matrix.values[13] + values[10] * matrix.values[14] + values[14] * matrix.values[15];
1409  result.values[15] = values[3] * matrix.values[12] + values[7] * matrix.values[13] + values[11] * matrix.values[14] + values[15] * matrix.values[15];
1410 
1411  return result;
1412 }
1413 
1414 #if defined(OCEAN_HARDWARE_AVX_VERSION) && OCEAN_HARDWARE_AVX_VERSION >= 10
1415 
1416 template <>
1418 {
1419  // the following code uses the following AVX instructions, and needs AVX1 or higher
1420 
1421  // AVX1:
1422  // _mm256_broadcast_sd
1423  // _mm256_loadu_pd
1424  // _mm256_mul_pd
1425  // _mm256_add_pd
1426  // _mm256_storeu_pd
1427 
1428  // we use the same strategy as we apply for matrix-vector multiplication
1429  // further, here we interpret the right matrix as 4 vectors
1430 
1431  // we load the columns of the left matrix
1432  __m256d c0 = _mm256_loadu_pd(values + 0);
1433  __m256d c1 = _mm256_loadu_pd(values + 4);
1434  __m256d c2 = _mm256_loadu_pd(values + 8);
1435  __m256d c3 = _mm256_loadu_pd(values + 12);
1436 
1437 
1438  // we determine the first vector of the resulting matrix
1439  __m256d v0 = _mm256_broadcast_sd(matrix.data() + 0);
1440  __m256d r0 = _mm256_mul_pd(c0, v0);
1441 
1442  __m256d v1 = _mm256_broadcast_sd(matrix.data() + 1);
1443  __m256d r1 = _mm256_mul_pd(c1, v1);
1444 
1445  r0 = _mm256_add_pd(r0, r1);
1446 
1447  __m256d v2 = _mm256_broadcast_sd(matrix.data() + 2);
1448  __m256d r2 = _mm256_mul_pd(c2, v2);
1449 
1450  r0 = _mm256_add_pd(r0, r2);
1451 
1452  __m256d v3 = _mm256_broadcast_sd(matrix.data() + 3);
1453  __m256d r3 = _mm256_mul_pd(c3, v3);
1454 
1455  r0 = _mm256_add_pd(r0, r3);
1456 
1457  SquareMatrixT4<double> result;
1458 
1459  _mm256_storeu_pd(result.data(), r0);
1460 
1461 
1462  // we determine the second vector of the resulting matrix
1463  __m256d v4 = _mm256_broadcast_sd(matrix.data() + 4);
1464  __m256d r4 = _mm256_mul_pd(c0, v4);
1465 
1466  __m256d v5 = _mm256_broadcast_sd(matrix.data() + 5);
1467  __m256d r5 = _mm256_mul_pd(c1, v5);
1468 
1469  r4 = _mm256_add_pd(r4, r5);
1470 
1471  __m256d v6 = _mm256_broadcast_sd(matrix.data() + 6);
1472  __m256d r6 = _mm256_mul_pd(c2, v6);
1473 
1474  r4 = _mm256_add_pd(r4, r6);
1475 
1476  __m256d v7 = _mm256_broadcast_sd(matrix.data() + 7);
1477  __m256d r7 = _mm256_mul_pd(c3, v7);
1478 
1479  r4 = _mm256_add_pd(r4, r7);
1480 
1481  _mm256_storeu_pd(result.data() + 4, r4);
1482 
1483 
1484  // we determine the third vector of the resulting matrix
1485  __m256d v8 = _mm256_broadcast_sd(matrix.data() + 8);
1486  __m256d r8 = _mm256_mul_pd(c0, v8);
1487 
1488  __m256d v9 = _mm256_broadcast_sd(matrix.data() + 9);
1489  __m256d r9 = _mm256_mul_pd(c1, v9);
1490 
1491  r8 = _mm256_add_pd(r8, r9);
1492 
1493  __m256d v10 = _mm256_broadcast_sd(matrix.data() + 10);
1494  __m256d r10 = _mm256_mul_pd(c2, v10);
1495 
1496  r8 = _mm256_add_pd(r8, r10);
1497 
1498  __m256d v11 = _mm256_broadcast_sd(matrix.data() + 11);
1499  __m256d r11 = _mm256_mul_pd(c3, v11);
1500 
1501  r8 = _mm256_add_pd(r8, r11);
1502 
1503  _mm256_storeu_pd(result.data() + 8, r8);
1504 
1505 
1506  // we determine the forth vector of the resulting matrix
1507  __m256d v12 = _mm256_broadcast_sd(matrix.data() + 12);
1508  __m256d r12 = _mm256_mul_pd(c0, v12);
1509 
1510  __m256d v13 = _mm256_broadcast_sd(matrix.data() + 13);
1511  __m256d r13 = _mm256_mul_pd(c1, v13);
1512 
1513  r12 = _mm256_add_pd(r12, r13);
1514 
1515  __m256d v14 = _mm256_broadcast_sd(matrix.data() + 14);
1516  __m256d r14 = _mm256_mul_pd(c2, v14);
1517 
1518  r12 = _mm256_add_pd(r12, r14);
1519 
1520  __m256d v15 = _mm256_broadcast_sd(matrix.data() + 15);
1521  __m256d r15 = _mm256_mul_pd(c3, v15);
1522 
1523  r12 = _mm256_add_pd(r12, r15);
1524 
1525  _mm256_storeu_pd(result.data() + 12, r12);
1526 
1527  return result;
1528 }
1529 
1530 template <>
1532 {
1533  // the following code uses the following AVX instructions, and needs AVX1 or higher
1534 
1535  // we use the same strategy as we apply for matrix-vector multiplication
1536  // further, here we interpret the right matrix as 4 vectors, and combine two vectors into one 256 bit register
1537 
1538  // we load the four columns of the left matrix
1539  __m256 c0 = _mm256_broadcast_ps((const __m128*)values + 0);
1540  __m256 c1 = _mm256_broadcast_ps((const __m128*)values + 1);
1541  __m256 c2 = _mm256_broadcast_ps((const __m128*)values + 2);
1542  __m256 c3 = _mm256_broadcast_ps((const __m128*)values + 3);
1543 
1544  __m256 m01 = _mm256_loadu_ps(matrix.data() + 0);
1545  __m256 m23 = _mm256_loadu_ps(matrix.data() + 8);
1546 
1547 
1548  // we determine the first two vectors of the resulting matrix
1549  __m256 v0_4 = _mm256_permute_ps(m01, 0x00);
1550  __m256 r0 = _mm256_mul_ps(c0, v0_4);
1551 
1552 #ifdef OCEAN_COMPILER_MSC
1553  ocean_assert(NumericF::isEqual(r0.m256_f32[0], values[0] * matrix[0]));
1554  ocean_assert(NumericF::isEqual(r0.m256_f32[1], values[1] * matrix[0]));
1555  ocean_assert(NumericF::isEqual(r0.m256_f32[2], values[2] * matrix[0]));
1556  ocean_assert(NumericF::isEqual(r0.m256_f32[3], values[3] * matrix[0]));
1557 
1558  ocean_assert(NumericF::isEqual(r0.m256_f32[4], values[0] * matrix[4]));
1559  ocean_assert(NumericF::isEqual(r0.m256_f32[5], values[1] * matrix[4]));
1560  ocean_assert(NumericF::isEqual(r0.m256_f32[6], values[2] * matrix[4]));
1561  ocean_assert(NumericF::isEqual(r0.m256_f32[7], values[3] * matrix[4]));
1562 #endif
1563 
1564  __m256 v1_5 = _mm256_permute_ps(m01, 0x55);
1565  __m256 r1 = _mm256_mul_ps(c1, v1_5);
1566 
1567 #ifdef OCEAN_COMPILER_MSC
1568  ocean_assert(NumericF::isEqual(r1.m256_f32[0], values[ 4] * matrix[1]));
1569  ocean_assert(NumericF::isEqual(r1.m256_f32[1], values[ 5] * matrix[1]));
1570  ocean_assert(NumericF::isEqual(r1.m256_f32[2], values[ 6] * matrix[1]));
1571  ocean_assert(NumericF::isEqual(r1.m256_f32[3], values[ 7] * matrix[1]));
1572 
1573  ocean_assert(NumericF::isEqual(r1.m256_f32[4], values[4] * matrix[5]));
1574  ocean_assert(NumericF::isEqual(r1.m256_f32[5], values[5] * matrix[5]));
1575  ocean_assert(NumericF::isEqual(r1.m256_f32[6], values[6] * matrix[5]));
1576  ocean_assert(NumericF::isEqual(r1.m256_f32[7], values[7] * matrix[5]));
1577 #endif
1578 
1579  r0 = _mm256_add_ps(r0, r1);
1580 
1581 
1582  __m256 v2_6 = _mm256_permute_ps(m01, 0xAA);
1583  __m256 r2 = _mm256_mul_ps(c2, v2_6);
1584 
1585  r0 = _mm256_add_ps(r0, r2);
1586 
1587 
1588  __m256 v3_7 = _mm256_permute_ps(m01, 0xFF);
1589  __m256 r3 = _mm256_mul_ps(c3, v3_7);
1590 
1591  r0 = _mm256_add_ps(r0, r3);
1592 
1593  SquareMatrixT4<float> result;
1594 
1595  _mm256_storeu_ps(result.data(), r0);
1596 
1597 
1598 
1599  // we determine the last two vectors of the resulting matrix
1600  __m256 v8_12 = _mm256_permute_ps(m23, 0x00);
1601  __m256 r8 = _mm256_mul_ps(c0, v8_12);
1602 
1603 
1604  __m256 v9_13 = _mm256_permute_ps(m23, 0x55);
1605  __m256 r9 = _mm256_mul_ps(c1, v9_13);
1606 
1607  r8 = _mm256_add_ps(r8, r9);
1608 
1609 
1610  __m256 v10_14 = _mm256_permute_ps(m23, 0xAA);
1611  __m256 r10 = _mm256_mul_ps(c2, v10_14);
1612 
1613  r8 = _mm256_add_ps(r8, r10);
1614 
1615 
1616  __m256 v11_15 = _mm256_permute_ps(m23, 0xFF);
1617  __m256 r11 = _mm256_mul_ps(c3, v11_15);
1618 
1619  r8 = _mm256_add_ps(r8, r11);
1620 
1621  _mm256_storeu_ps(result.data() + 8, r8);
1622 
1623  return result;
1624 }
1625 
1626 #else // OCEAN_HARDWARE_AVX_VERSION < 10
1627 
1628 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 10
1629 
1630 template <>
1632 {
1633  // the following code uses the following SSE instructions, and needs SSE1 or higher
1634 
1635  // SSE1:
1636  // _mm_load1_ps
1637  // _mm_loadu_ps
1638  // _mm_mul_ps
1639  // _mm_add_ps
1640  // _mm_storeu_ps
1641 
1642  // we use the same strategy as we apply for matrix-vector multiplication
1643  // further, here we interpret the right matrix as 4 vectors
1644 
1645  // we load the columns of the left matrix
1646  __m128 c0 = _mm_loadu_ps(values + 0);
1647  __m128 c1 = _mm_loadu_ps(values + 4);
1648  __m128 c2 = _mm_loadu_ps(values + 8);
1649  __m128 c3 = _mm_loadu_ps(values + 12);
1650 
1651 
1652  // we determine the first vector of the resulting matrix
1653  __m128 v0 = _mm_load1_ps(matrix.data() + 0);
1654  __m128 r0 = _mm_mul_ps(c0, v0);
1655 
1656  __m128 v1 = _mm_load1_ps(matrix.data() + 1);
1657  __m128 r1 = _mm_mul_ps(c1, v1);
1658 
1659  r0 = _mm_add_ps(r0, r1);
1660 
1661  __m128 v2 = _mm_load1_ps(matrix.data() + 2);
1662  __m128 r2 = _mm_mul_ps(c2, v2);
1663 
1664  r0 = _mm_add_ps(r0, r2);
1665 
1666  __m128 v3 = _mm_load1_ps(matrix.data() + 3);
1667  __m128 r3 = _mm_mul_ps(c3, v3);
1668 
1669  r0 = _mm_add_ps(r0, r3);
1670 
1671  SquareMatrixT4<float> result;
1672 
1673  _mm_storeu_ps(result.data(), r0);
1674 
1675 
1676  // we determine the second vector of the resulting matrix
1677  __m128 v4 = _mm_load1_ps(matrix.data() + 4);
1678  __m128 r4 = _mm_mul_ps(c0, v4);
1679 
1680  __m128 v5 = _mm_load1_ps(matrix.data() + 5);
1681  __m128 r5 = _mm_mul_ps(c1, v5);
1682 
1683  r4 = _mm_add_ps(r4, r5);
1684 
1685  __m128 v6 = _mm_load1_ps(matrix.data() + 6);
1686  __m128 r6 = _mm_mul_ps(c2, v6);
1687 
1688  r4 = _mm_add_ps(r4, r6);
1689 
1690  __m128 v7 = _mm_load1_ps(matrix.data() + 7);
1691  __m128 r7 = _mm_mul_ps(c3, v7);
1692 
1693  r4 = _mm_add_ps(r4, r7);
1694 
1695  _mm_storeu_ps(result.data() + 4, r4);
1696 
1697 
1698  // we determine the third vector of the resulting matrix
1699  __m128 v8 = _mm_load1_ps(matrix.data() + 8);
1700  __m128 r8 = _mm_mul_ps(c0, v8);
1701 
1702  __m128 v9 = _mm_load1_ps(matrix.data() + 9);
1703  __m128 r9 = _mm_mul_ps(c1, v9);
1704 
1705  r8 = _mm_add_ps(r8, r9);
1706 
1707  __m128 v10 = _mm_load1_ps(matrix.data() + 10);
1708  __m128 r10 = _mm_mul_ps(c2, v10);
1709 
1710  r8 = _mm_add_ps(r8, r10);
1711 
1712  __m128 v11 = _mm_load1_ps(matrix.data() + 11);
1713  __m128 r11 = _mm_mul_ps(c3, v11);
1714 
1715  r8 = _mm_add_ps(r8, r11);
1716 
1717  _mm_storeu_ps(result.data() + 8, r8);
1718 
1719 
1720  // we determine the forth vector of the resulting matrix
1721  __m128 v12 = _mm_load1_ps(matrix.data() + 12);
1722  __m128 r12 = _mm_mul_ps(c0, v12);
1723 
1724  __m128 v13 = _mm_load1_ps(matrix.data() + 13);
1725  __m128 r13 = _mm_mul_ps(c1, v13);
1726 
1727  r12 = _mm_add_ps(r12, r13);
1728 
1729  __m128 v14 = _mm_load1_ps(matrix.data() + 14);
1730  __m128 r14 = _mm_mul_ps(c2, v14);
1731 
1732  r12 = _mm_add_ps(r12, r14);
1733 
1734  __m128 v15 = _mm_load1_ps(matrix.data() + 15);
1735  __m128 r15 = _mm_mul_ps(c3, v15);
1736 
1737  r12 = _mm_add_ps(r12, r15);
1738 
1739  _mm_storeu_ps(result.data() + 12, r12);
1740 
1741  return result;
1742 }
1743 
1744 #endif // OCEAN_HARDWARE_SSE_VERSION >= 10
1745 
1746 #endif // OCEAN_HARDWARE_AVX_VERSION >= 10
1747 
1748 template <typename T>
1750 {
1751  SquareMatrixT4<T> result;
1752 
1753  result.values[0] = values[0] * matrix[0] + values[4] * matrix[1] + values[8] * matrix[2]; // + values[12] * matrix[3];
1754  result.values[1] = values[1] * matrix[0] + values[5] * matrix[1] + values[9] * matrix[2]; // + values[13] * matrix[3];
1755  result.values[2] = values[2] * matrix[0] + values[6] * matrix[1] + values[10] * matrix[2]; // + values[14] * matrix[3];
1756  result.values[3] = values[3] * matrix[0] + values[7] * matrix[1] + values[11] * matrix[2]; // + values[15] * matrix[3];
1757 
1758  result.values[4] = values[0] * matrix[4] + values[4] * matrix[5] + values[8] * matrix[6]; // + values[12] * matrix[7];
1759  result.values[5] = values[1] * matrix[4] + values[5] * matrix[5] + values[9] * matrix[6]; // + values[13] * matrix[7];
1760  result.values[6] = values[2] * matrix[4] + values[6] * matrix[5] + values[10] * matrix[6]; // + values[14] * matrix[7];
1761  result.values[7] = values[3] * matrix[4] + values[7] * matrix[5] + values[11] * matrix[6]; // + values[15] * matrix[7];
1762 
1763  result.values[8] = values[0] * matrix[8] + values[4] * matrix[9] + values[8] * matrix[10]; // + values[12] * matrix[11];
1764  result.values[9] = values[1] * matrix[8] + values[5] * matrix[9] + values[9] * matrix[10]; // + values[13] * matrix[11];
1765  result.values[10] = values[2] * matrix[8] + values[6] * matrix[9] + values[10] * matrix[10]; // + values[14] * matrix[11];
1766  result.values[11] = values[3] * matrix[8] + values[7] * matrix[9] + values[11] * matrix[10]; // + values[15] * matrix[11];
1767 
1768  result.values[12] = values[0] * matrix[12] + values[4] * matrix[13] + values[8] * matrix[14] + values[12]; // * matrix[15];
1769  result.values[13] = values[1] * matrix[12] + values[5] * matrix[13] + values[9] * matrix[14] + values[13]; // * matrix[15];
1770  result.values[14] = values[2] * matrix[12] + values[6] * matrix[13] + values[10] * matrix[14] + values[14]; // * matrix[15];
1771  result.values[15] = values[3] * matrix[12] + values[7] * matrix[13] + values[11] * matrix[14] + values[15]; // * matrix[15];
1772 
1773  return result;
1774 }
1775 
1776 template <typename T>
1777 OCEAN_FORCE_INLINE VectorT3<T> SquareMatrixT4<T>::operator*(const VectorT3<T>& vector) const
1778 {
1779  const T w = values[3] * vector[0] + values[7] * vector[1] + values[11] * vector[2] + values[15];
1780  ocean_assert(NumericT<T>::isNotEqualEps(w) && "Division by zero!");
1781 
1782  const T factor = 1 / w;
1783 
1784  return VectorT3<T>((values[0] * vector[0] + values[4] * vector[1] + values[8] * vector[2] + values[12]) * factor,
1785  (values[1] * vector[0] + values[5] * vector[1] + values[9] * vector[2] + values[13]) * factor,
1786  (values[2] * vector[0] + values[6] * vector[1] + values[10] * vector[2] + values[14]) * factor);
1787 }
1788 
1789 template <typename T>
1790 OCEAN_FORCE_INLINE VectorT4<T> SquareMatrixT4<T>::operator*(const VectorT4<T>& vector) const
1791 {
1792  return VectorT4<T>(values[0] * vector[0] + values[4] * vector[1] + values[8] * vector[2] + values[12] * vector[3],
1793  values[1] * vector[0] + values[5] * vector[1] + values[9] * vector[2] + values[13] * vector[3],
1794  values[2] * vector[0] + values[6] * vector[1] + values[10] * vector[2] + values[14] * vector[3],
1795  values[3] * vector[0] + values[7] * vector[1] + values[11] * vector[2] + values[15] * vector[3]);
1796 }
1797 
1798 #if defined(OCEAN_HARDWARE_AVX_VERSION) && OCEAN_HARDWARE_AVX_VERSION >= 10
1799 
1800 #ifdef OCEAN_USE_SLOWER_IMPLEMENTATION
1801 
1802 // we keep following implementation inside 'OCEAN_USE_SLOWER_IMPLEMENTATION' showing an alternative (which is slower)
1803 
1804 template <>
1806 {
1807  // the following code uses the following AVX instructions, and needs AVX1 or higher
1808 
1809  // AVX1:
1810  // _mm256_loadu_pd
1811  // _mm256_mul_pd
1812  // _mm256_shuffle_pd
1813  // _mm256_hadd_pd
1814  // _mm256_permute2f128_pd
1815  // _mm256_storeu_pd
1816 
1817  // first we load the first four rows of the matrix
1818  __m256d row0 = _mm256_loadu_pd(values + 0);
1819  __m256d row1 = _mm256_loadu_pd(values + 4);
1820  __m256d row2 = _mm256_loadu_pd(values + 8);
1821  __m256d row3 = _mm256_loadu_pd(values + 12);
1822 
1823  // we load the four values of the vector
1824  __m256d v = _mm256_loadu_pd(vector.data());
1825 
1826  // first, we transpose the 4x4 matrix
1827 
1828  // A E I M A B C D
1829  // B F J N E F G H
1830  // C G K O I J K L
1831  // D H L P M N O P
1832 
1833  // A B C D, E F G H -> A E C G
1834  __m256d temp0 = _mm256_shuffle_pd(row0, row1, 0x00); // 0x00 = 0000 0000
1835  // A B C D, E F G H -> B F D H
1836  __m256d temp2 = _mm256_shuffle_pd(row0, row1, 0x0F); // 0x0F = 0000 1111
1837  // I J K L, M N O P -> I M K O
1838  __m256d temp1 = _mm256_shuffle_pd(row2, row3, 0x00);
1839  // I J K L, M N O P -> J N L P
1840  __m256d temp3 = _mm256_shuffle_pd(row2, row3, 0x0F);
1841 
1842  // A E C G I M K O -> A E I M
1843  row0 = _mm256_permute2f128_pd(temp0, temp1, 0x20); // 0x20 = 0010 0000
1844  // B F D H J N L P -> B F J N
1845  row1 = _mm256_permute2f128_pd(temp2, temp3, 0x20);
1846  // A E C G I M K O -> C G K O
1847  row2 = _mm256_permute2f128_pd(temp0, temp1, 0x31); // 0x31 = 0011 0001
1848  // B F D H J N L P -> D H L P
1849  row3 = _mm256_permute2f128_pd(temp2, temp3, 0x31);
1850 
1851 #ifdef OCEAN_COMPILER_MSC
1852  ocean_assert(row0.m256d_f64[0] == values[0] && row0.m256d_f64[1] == values[4] && row0.m256d_f64[2] == values[ 8] && row0.m256d_f64[3] == values[12]);
1853  ocean_assert(row1.m256d_f64[0] == values[1] && row1.m256d_f64[1] == values[5] && row1.m256d_f64[2] == values[ 9] && row1.m256d_f64[3] == values[13]);
1854  ocean_assert(row2.m256d_f64[0] == values[2] && row2.m256d_f64[1] == values[6] && row2.m256d_f64[2] == values[10] && row2.m256d_f64[3] == values[14]);
1855  ocean_assert(row3.m256d_f64[0] == values[3] && row3.m256d_f64[1] == values[7] && row3.m256d_f64[2] == values[11] && row3.m256d_f64[3] == values[15]);
1856 #endif
1857 
1858  // unfortunately the AVX does not offer _mm256_dp_pd (the determination of the dot product) so we have to do it on our own
1859 
1860  __m256d r0v = _mm256_mul_pd(row0, v);
1861  __m256d r1v = _mm256_mul_pd(row1, v);
1862  __m256d r2v = _mm256_mul_pd(row2, v);
1863  __m256d r3v = _mm256_mul_pd(row3, v);
1864 
1865  // we sum both multiplication results horizontally (at least two neighboring products)
1866  __m256d sum_interleaved_r0_r1 = _mm256_hadd_pd(r0v, r1v);
1867  __m256d sum_interleaved_r2_r3 = _mm256_hadd_pd(r2v, r3v);
1868 
1869  // now we reorder the interleaved sums
1870  __m256d sum_first = _mm256_permute2f128_pd(sum_interleaved_r0_r1, sum_interleaved_r2_r3, 0x20); // 0x20 = 0010 0000
1871  __m256d sum_second = _mm256_permute2f128_pd(sum_interleaved_r0_r1, sum_interleaved_r2_r3, 0x31); // 0x31 = 0011 0001
1872 
1873  // we finally add both reordered sums
1874  __m256d sum = _mm256_add_pd(sum_first, sum_second);
1875 
1876  VectorT4<double> result;
1877  _mm256_storeu_pd(result.data(), sum);
1878 
1879  ocean_assert(NumericD::isEqual(result[0], values[0] * vector[0] + values[4] * vector[1] + values[ 8] * vector[2] + values[12] * vector[3], NumericD::eps() * 100));
1880  ocean_assert(NumericD::isEqual(result[1], values[1] * vector[0] + values[5] * vector[1] + values[ 9] * vector[2] + values[13] * vector[3], NumericD::eps() * 100));
1881  ocean_assert(NumericD::isEqual(result[2], values[2] * vector[0] + values[6] * vector[1] + values[10] * vector[2] + values[14] * vector[3], NumericD::eps() * 100));
1882  ocean_assert(NumericD::isEqual(result[3], values[3] * vector[0] + values[7] * vector[1] + values[11] * vector[2] + values[15] * vector[3], NumericD::eps() * 100));
1883 
1884  return result;
1885 }
1886 
1887 #else // OCEAN_USE_SLOWER_IMPLEMENTATION
1888 
1889 template <>
1891 {
1892  // the following code uses the following AVX instructions, and needs AVX1 or higher
1893 
1894  // AVX1:
1895  // _mm256_broadcast_sd
1896  // _mm256_loadu_pd
1897  // _mm256_mul_pd
1898  // _mm256_add_pd
1899  // _mm256_storeu_pd
1900 
1901  // we use the same strategy as for the 32 bit float values
1902 
1903  // first we load the first vector element in all 64bit elements of the 256 bit register, so that we receive [a, a, a, a]
1904  __m256d v0 = _mm256_broadcast_sd(vector.data() + 0);
1905 
1906  // now we load the first column to receive: [A, B, C, D]
1907  __m256d c0 = _mm256_loadu_pd(values + 0);
1908 
1909  // now we multiply the 256 bit register [A, B, C, D] * [a, a, a, a] = [Aa, Ba, Ca, Da]
1910  __m256d r0 = _mm256_mul_pd(c0, v0);
1911 
1912 
1913  // now we proceed with the second column
1914  __m256d v1 = _mm256_broadcast_sd(vector.data() + 1);
1915  __m256d c1 = _mm256_loadu_pd(values + 4);
1916  __m256d r1 = _mm256_mul_pd(c1, v1);
1917 
1918  // and we sum the result of the first column with the result of the second column
1919  r0 = _mm256_add_pd(r0, r1);
1920 
1921 
1922  // now we proceed with the third column
1923  __m256d v2 = _mm256_broadcast_sd(vector.data() + 2);
1924  __m256d c2 = _mm256_loadu_pd(values + 8);
1925  __m256d r2 = _mm256_mul_pd(c2, v2);
1926 
1927  // we sum the results
1928  r0 = _mm256_add_pd(r0, r2);
1929 
1930 
1931  // now we proceed with the fourth column
1932  __m256d v3 = _mm256_broadcast_sd(vector.data() + 3);
1933  __m256d c3 = _mm256_loadu_pd(values + 12);
1934  __m256d r3 = _mm256_mul_pd(c3, v3);
1935 
1936  // we sum the results
1937  r0 = _mm256_add_pd(r0, r3);
1938 
1939  // and finally we store the results back to the vector
1940  VectorT4<double> result;
1941 
1942  _mm256_storeu_pd(result.data(), r0);
1943 
1944  return result;
1945 }
1946 
1947 #endif // OCEAN_USE_SLOWER_IMPLEMENTATION
1948 
1949 #else // OCEAN_HARDWARE_AVX_VERSION
1950 
1951 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
1952 
1953 template <>
1955 {
1956  // the following code uses the following SSE instructions, and needs SSE2 or higher
1957 
1958  // SSE2:
1959  // _mm_load1_pd
1960  // _mm_loadu_pd
1961  // _mm_add_pd
1962  // _mm_storeu_pd
1963  // _mm_mul_pd
1964 
1965  // we use the following strategy:
1966  // the values of the matrix are column aligned so that we normally would need to transpose the matrix before we can apply simple SIMD instructions
1967  // however, we do not transpose the matrix (we avoid the shuffle instructions) and instead multiply the matrix column-wise:
1968  // finally we sum the four columns and have the result, compared to the transpose-based approach this approach is approx. two times faster
1969  //
1970  // A E I M a Aa + Eb + Ic + Md
1971  // B F J N b Ba + Fb + Jc + Nd
1972  // C G K O * c = Ca + Gb + Kc + Od
1973  // D H L P d Da + Hb + Lc + Pd
1974 
1975  // first we load the first vector element in both 64 elements of the 128 bit register, so that we receive [a, a]
1976  __m128d v0 = _mm_load1_pd(vector.data() + 0);
1977 
1978  // now we load the first column to receive: [A, B] and [C, D]
1979  __m128d c0a = _mm_loadu_pd(values + 0);
1980  __m128d c0b = _mm_loadu_pd(values + 2);
1981 
1982  // now we multiply both 128 bit registers by: [A, B] * [a, a] = [Aa, Ba] and [C, D] * [a, a] = [Ca, Da]
1983  __m128d r0a = _mm_mul_pd(c0a, v0);
1984  __m128d r0b = _mm_mul_pd(c0b, v0);
1985 
1986 
1987  // now we proceed with the second column
1988  __m128d v1 = _mm_load1_pd(vector.data() + 1);
1989 
1990  __m128d c1a = _mm_loadu_pd(values + 4);
1991  __m128d c1b = _mm_loadu_pd(values + 6);
1992 
1993  __m128d r1a = _mm_mul_pd(c1a, v1);
1994  __m128d r1b = _mm_mul_pd(c1b, v1);
1995 
1996  // and we sum the result of the first column with the result of the second column
1997  r0a = _mm_add_pd(r0a, r1a);
1998  r0b = _mm_add_pd(r0b, r1b);
1999 
2000 
2001  // now we proceed with the third column
2002  __m128d v2 = _mm_load1_pd(vector.data() + 2);
2003 
2004  __m128d c2a = _mm_loadu_pd(values + 8);
2005  __m128d c2b = _mm_loadu_pd(values + 10);
2006 
2007  __m128d r2a = _mm_mul_pd(c2a, v2);
2008  __m128d r2b = _mm_mul_pd(c2b, v2);
2009 
2010  // we sum the results
2011  r0a = _mm_add_pd(r0a, r2a);
2012  r0b = _mm_add_pd(r0b, r2b);
2013 
2014 
2015  // now we proceed with the fourth column
2016  __m128d v3 = _mm_load1_pd(vector.data() + 3);
2017 
2018  __m128d c3a = _mm_loadu_pd(values + 12);
2019  __m128d c3b = _mm_loadu_pd(values + 14);
2020 
2021  __m128d r3a = _mm_mul_pd(c3a, v3);
2022  __m128d r3b = _mm_mul_pd(c3b, v3);
2023 
2024  // we sum the results
2025  r0a = _mm_add_pd(r0a, r3a);
2026  r0b = _mm_add_pd(r0b, r3b);
2027 
2028  // and finally we store the results back to the vector
2029  VectorT4<double> result;
2030 
2031  _mm_storeu_pd(result.data() + 0, r0a);
2032  _mm_storeu_pd(result.data() + 2, r0b);
2033 
2034  return result;
2035 }
2036 
2037 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
2038 
2039 #endif // OCEAN_HARDWARE_AVX_VERSION >= 10
2040 
2041 #ifdef OCEAN_USE_SLOWER_IMPLEMENTATION
2042 
2043 // we keep following implementation inside 'OCEAN_USE_SLOWER_IMPLEMENTATION' showing an alternative (which is slower)
2044 
2045 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
2046 
2047 template <>
2049 {
2050  // the following code uses the following SSE instructions, and needs SSE 4.1 or higher
2051 
2052  // SSE:
2053  // _mm_loadu_ps
2054  // _mm_shuffle_ps
2055  // _mm_or_ps
2056  // _mm_storeu_ps
2057 
2058  // SSE4.1:
2059  // _mm_dp_ps
2060 
2061  // first we load the first four rows of the matrix
2062  __m128 row0 = _mm_loadu_ps(values + 0);
2063  __m128 row1 = _mm_loadu_ps(values + 4);
2064  __m128 row2 = _mm_loadu_ps(values + 8);
2065  __m128 row3 = _mm_loadu_ps(values + 12);
2066 
2067  // we load the four values of the vector
2068  __m128 v = _mm_loadu_ps(vector.data());
2069 
2070  // first, we transpose the 4x4 matrix
2071 
2072  // A E I M A B C D
2073  // B F J N E F G H
2074  // C G K O I J K L
2075  // D H L P M N O P
2076 
2077  // A B C D, E F G H -> A B E F
2078  __m128 temp0 = _mm_shuffle_ps(row0, row1, 0x44); // 0x44 = 0100 0100
2079  // A B C D, E F G H -> C D G H
2080  __m128 temp2 = _mm_shuffle_ps(row0, row1, 0xEE); // 0xEE = 1110 1110
2081  // I J K L, M N O P -> I J M N
2082  __m128 temp1 = _mm_shuffle_ps(row2, row3, 0x44);
2083  // I J K L, M N O P -> K L O P
2084  __m128 temp3 = _mm_shuffle_ps(row2, row3, 0xEE);
2085 
2086  // A B E F, I J M N -> A E I M
2087  row0 = _mm_shuffle_ps(temp0, temp1, 0x88); // 0x88 = 1000 1000
2088  // A B E F, I J M N -> B F J N
2089  row1 = _mm_shuffle_ps(temp0, temp1, 0xDD); // 0xDD = 1101 1101
2090  // C D G H, K L O P -> C G K O
2091  row2 = _mm_shuffle_ps(temp2, temp3, 0x88);
2092  // C D G H, K L O P -> D H L P
2093  row3 = _mm_shuffle_ps(temp2, temp3, 0xDD);
2094 
2095 #ifdef OCEAN_COMPILER_MSC
2096  ocean_assert(row0.m128_f32[0] == values[0] && row0.m128_f32[1] == values[4] && row0.m128_f32[2] == values[ 8] && row0.m128_f32[3] == values[12]);
2097  ocean_assert(row1.m128_f32[0] == values[1] && row1.m128_f32[1] == values[5] && row1.m128_f32[2] == values[ 9] && row1.m128_f32[3] == values[13]);
2098  ocean_assert(row2.m128_f32[0] == values[2] && row2.m128_f32[1] == values[6] && row2.m128_f32[2] == values[10] && row2.m128_f32[3] == values[14]);
2099  ocean_assert(row3.m128_f32[0] == values[3] && row3.m128_f32[1] == values[7] && row3.m128_f32[2] == values[11] && row3.m128_f32[3] == values[15]);
2100 #endif
2101 
2102  // we determine the dot product between the first row and the vector and store the result in the first float bin
2103  row0 = _mm_dp_ps(row0, v, 0xF1); // 0xF1 = 1111 0001
2104 
2105  // we determine the dot product between the second row and the vector and store the result in the second float bin
2106  row1 = _mm_dp_ps(row1, v, 0xF2); // 0xF2 = 1111 0010
2107  row2 = _mm_dp_ps(row2, v, 0xF4); // 0xF4 = 1111 0100
2108  row3 = _mm_dp_ps(row3, v, 0xF8); // 0xF8 = 1111 1000
2109 
2110  // now we blend the results by applying the bit-wise or operator
2111  __m128 result01 = _mm_or_ps(row0, row1);
2112  __m128 result23 = _mm_or_ps(row2, row3);
2113  __m128 result03 = _mm_or_ps(result01, result23);
2114 
2115  VectorT4<float> result;
2116  _mm_storeu_ps(result.data(), result03);
2117 
2118  return result;
2119 }
2120 
2121 #endif // OCEAN_HARDWARE_SSE_VERSION >= 41
2122 
2123 #else // OCEAN_USE_SLOWER_IMPLEMENTATION
2124 
2125 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 10
2126 
2127 template <>
2129 {
2130  // the following code uses the following SSE instructions, and needs SSE1 or higher
2131 
2132  // SSE1:
2133  // _mm_load1_ps
2134  // _mm_loadu_ps
2135  // _mm_mul_ps
2136  // _mm_add_ps
2137  // _mm_storeu_ps
2138 
2139 
2140  // we use the following strategy:
2141  // the values of the matrix are column aligned so that we normally would need to transpose the matrix before we can apply simple SIMD instructions
2142  // however, we do not transpose the matrix (we avoid the shuffle instructions) and instead multiply the matrix column-wise:
2143  // finally we sum the four columns and have the result, compared to the transpose-based approach this approach is approx. two times faster
2144  //
2145  // A E I M a Aa + Eb + Ic + Md
2146  // B F J N b Ba + Fb + Jc + Nd
2147  // C G K O * c = Ca + Gb + Kc + Od
2148  // D H L P d Da + Hb + Lc + Pd
2149 
2150  // first we load the first vector element in all 32bit elements of the 128 bit register, so that we receive [a, a, a, a]
2151  __m128 v0 = _mm_load1_ps(vector.data() + 0);
2152 
2153  // now we load the first column to receive: [A, B, C, D]
2154  __m128 c0 = _mm_loadu_ps(values + 0);
2155 
2156  // now we multiply the 128 bit register [A, B, C, D] * [a, a, a, a] = [Aa, Ba, Ca, Da]
2157  __m128 r0 = _mm_mul_ps(c0, v0);
2158 
2159 
2160  // now we proceed with the second column
2161  __m128 v1 = _mm_load1_ps(vector.data() + 1);
2162  __m128 c1 = _mm_loadu_ps(values + 4);
2163  __m128 r1 = _mm_mul_ps(c1, v1);
2164 
2165  // and we sum the result of the first column with the result of the second column
2166  r0 = _mm_add_ps(r0, r1);
2167 
2168 
2169  // now we proceed with the third column
2170  __m128 v2 = _mm_load1_ps(vector.data() + 2);
2171  __m128 c2 = _mm_loadu_ps(values + 8);
2172  __m128 r2 = _mm_mul_ps(c2, v2);
2173 
2174  // we sum the results
2175  r0 = _mm_add_ps(r0, r2);
2176 
2177 
2178  // now we proceed with the fourth column
2179  __m128 v3 = _mm_load1_ps(vector.data() + 3);
2180  __m128 c3 = _mm_loadu_ps(values + 12);
2181  __m128 r3 = _mm_mul_ps(c3, v3);
2182 
2183  // we sum the results
2184  r0 = _mm_add_ps(r0, r3);
2185 
2186  // and finally we store the results back to the vector
2187  VectorT4<float> result;
2188 
2189  _mm_storeu_ps(result.data(), r0);
2190 
2191  return result;
2192 }
2193 
2194 #endif // OCEAN_HARDWARE_SSE_VERSION >= 10
2195 
2196 #endif // OCEAN_USE_SLOWER_IMPLEMENTATION
2197 
2198 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2199 
2200 #ifdef __aarch64__
2201 
2202 template <>
2204 {
2205  // the following NEON code is almost identical to the SSE implementation
2206 
2207  // we use the following strategy:
2208  // the values of the matrix are column aligned so that we normally would need to transpose the matrix before we can apply simple SIMD instructions
2209  // however, we do not transpose the matrix (we avoid the shuffle instructions) and instead multiply the matrix column-wise:
2210  // finally we sum the four columns and have the result, compared to the transpose-based approach this approach is approx. two times faster
2211  //
2212  // A E I M a Aa + Eb + Ic + Md
2213  // B F J N b Ba + Fb + Jc + Nd
2214  // C G K O * c = Ca + Gb + Kc + Od
2215  // D H L P d Da + Hb + Lc + Pd
2216 
2217  // first we load the first vector element in both 64 elements of the 128 bit register, so that we receive [a, a]
2218  float64x2_t v0 = vld1q_dup_f64(vector.data() + 0);
2219 
2220  // now we load the first column to receive: [A, B] and [C, D]
2221  float64x2_t c0a = vld1q_f64(values + 0);
2222  float64x2_t c0b = vld1q_f64(values + 2);
2223 
2224  // now we multiply both 128 bit registers by: [A, B] * [a, a] = [Aa, Ba] and [C, D] * [a, a] = [Ca, Da]
2225  float64x2_t r0a = vmulq_f64(c0a, v0);
2226  float64x2_t r0b = vmulq_f64(c0b, v0);
2227 
2228 
2229  // now we proceed with the second column
2230  float64x2_t v1 = vld1q_dup_f64(vector.data() + 1);
2231 
2232  float64x2_t c1a = vld1q_f64(values + 4);
2233  float64x2_t c1b = vld1q_f64(values + 6);
2234 
2235  float64x2_t r1a = vmulq_f64(c1a, v1);
2236  float64x2_t r1b = vmulq_f64(c1b, v1);
2237 
2238  // and we sum the result of the first column with the result of the second column
2239  r0a = vaddq_f64(r0a, r1a);
2240  r0b = vaddq_f64(r0b, r1b);
2241 
2242 
2243  // now we proceed with the third column
2244  float64x2_t v2 = vld1q_dup_f64(vector.data() + 2);
2245 
2246  float64x2_t c2a = vld1q_f64(values + 8);
2247  float64x2_t c2b = vld1q_f64(values + 10);
2248 
2249  float64x2_t r2a = vmulq_f64(c2a, v2);
2250  float64x2_t r2b = vmulq_f64(c2b, v2);
2251 
2252  // we sum the results
2253  r0a = vaddq_f64(r0a, r2a);
2254  r0b = vaddq_f64(r0b, r2b);
2255 
2256 
2257  // now we proceed with the fourth column
2258  float64x2_t v3 = vld1q_dup_f64(vector.data() + 3);
2259 
2260  float64x2_t c3a = vld1q_f64(values + 12);
2261  float64x2_t c3b = vld1q_f64(values + 14);
2262 
2263  float64x2_t r3a = vmulq_f64(c3a, v3);
2264  float64x2_t r3b = vmulq_f64(c3b, v3);
2265 
2266  // we sum the results
2267  r0a = vaddq_f64(r0a, r3a);
2268  r0b = vaddq_f64(r0b, r3b);
2269 
2270  // and finally we store the results back to the vector
2271  VectorT4<double> result;
2272 
2273  vst1q_f64(result.data() + 0, r0a);
2274  vst1q_f64(result.data() + 2, r0b);
2275 
2276  return result;
2277 }
2278 
2279 #endif // __aarch64__
2280 
2281 template <>
2283 {
2284  // the following NEON code is almost identical to the SSE implementation
2285 
2286  // we use the following strategy:
2287  // the values of the matrix are column aligned so that we normally would need to transpose the matrix before we can apply simple SIMD instructions
2288  // however, we do not transpose the matrix (we avoid the shuffle instructions) and instead multiply the matrix column-wise:
2289  // finally we sum the four columns and have the result, compared to the transpose-based approach this approach is approx. two times faster
2290  //
2291  // A E I M a Aa + Eb + Ic + Md
2292  // B F J N b Ba + Fb + Jc + Nd
2293  // C G K O * c = Ca + Gb + Kc + Od
2294  // D H L P d Da + Hb + Lc + Pd
2295 
2296  // first we load the first vector element in all 32bit elements of the 128 bit register, so that we receive [a, a, a, a]
2297  float32x4_t v0 = vld1q_dup_f32(vector.data() + 0);
2298 
2299  // now we load the first column to receive: [A, B, C, D]
2300  float32x4_t c0 = vld1q_f32(values + 0);
2301 
2302  // now we multiply the 128 bit register [A, B, C, D] * [a, a, a, a] = [Aa, Ba, Ca, Da]
2303  float32x4_t r0 = vmulq_f32(c0, v0);
2304 
2305 
2306  // now we proceed with the second column
2307  float32x4_t v1 = vld1q_dup_f32(vector.data() + 1);
2308  float32x4_t c1 = vld1q_f32(values + 4);
2309  float32x4_t r1 = vmulq_f32(c1, v1);
2310 
2311  // and we sum the result of the first column with the result of the second column
2312  r0 = vaddq_f32(r0, r1);
2313 
2314 
2315  // now we proceed with the third column
2316  float32x4_t v2 = vld1q_dup_f32(vector.data() + 2);
2317  float32x4_t c2 = vld1q_f32(values + 8);
2318  float32x4_t r2 = vmulq_f32(c2, v2);
2319 
2320  // we sum the results
2321  r0 = vaddq_f32(r0, r2);
2322 
2323 
2324  // now we proceed with the fourth column
2325  float32x4_t v3 = vld1q_dup_f32(vector.data() + 3);
2326  float32x4_t c3 = vld1q_f32(values + 12);
2327  float32x4_t r3 = vmulq_f32(c3, v3);
2328 
2329  // we sum the results
2330  r0 = vaddq_f32(r0, r3);
2331 
2332  // and finally we store the results back to the vector
2333  VectorT4<float> result;
2334 
2335  vst1q_f32(result.data(), r0);
2336 
2337  return result;
2338 }
2339 
2340 #endif
2341 
2342 template <typename T>
2344 {
2345  SquareMatrixT4<T> result(*this);
2346 
2347  result.values[0] *= value;
2348  result.values[1] *= value;
2349  result.values[2] *= value;
2350  result.values[3] *= value;
2351  result.values[4] *= value;
2352  result.values[5] *= value;
2353  result.values[6] *= value;
2354  result.values[7] *= value;
2355  result.values[8] *= value;
2356  result.values[9] *= value;
2357  result.values[10] *= value;
2358  result.values[11] *= value;
2359  result.values[12] *= value;
2360  result.values[13] *= value;
2361  result.values[14] *= value;
2362  result.values[15] *= value;
2363 
2364  return result;
2365 }
2366 
2367 template <typename T>
2369 {
2370  values[0] *= value;
2371  values[1] *= value;
2372  values[2] *= value;
2373  values[3] *= value;
2374  values[4] *= value;
2375  values[5] *= value;
2376  values[6] *= value;
2377  values[7] *= value;
2378  values[8] *= value;
2379  values[9] *= value;
2380  values[10] *= value;
2381  values[11] *= value;
2382  values[12] *= value;
2383  values[13] *= value;
2384  values[14] *= value;
2385  values[15] *= value;
2386 
2387  return *this;
2388 }
2389 
2390 template <typename T>
2391 SquareMatrixT4<T> SquareMatrixT4<T>::projectionMatrix(const T fovX, const T aspectRatio, const T nearDistance, const T farDistance)
2392 {
2393  /*
2394  * <pre>
2395  * Creates the following frustum projection matrix.
2396  *
2397  * --------------------------------------------------
2398  * | t/a 0 0 0 |
2399  * | 0 t 0 0 |
2400  * | 0 0 (f+n)/(n-f) -2fn/(n-f) |
2401  * | 0 0 -1 0 |
2402  * --------------------------------------------------
2403  *
2404  * With: t = 1 / tan (fovY / 2), a = aspectRatio, n = nearDistance, f = farDistance
2405  * </pre>
2406  */
2407 
2408  ocean_assert(fovX > 0 && fovX < NumericT<T>::pi());
2409  ocean_assert(aspectRatio > 0);
2410  ocean_assert(nearDistance > 0);
2411  ocean_assert(nearDistance < farDistance);
2412 
2413  const T fovY = T(2.0) * NumericT<T>::atan(NumericT<T>::tan(T(0.5) * fovX) / aspectRatio);
2414 
2415  SquareMatrixT4<T> matrix(false);
2416  ocean_assert(matrix(1, 0) == 0 && matrix(2, 0) == 0 && matrix(3, 0) == 0);
2417  ocean_assert(matrix(0, 1) == 0 && matrix(2, 1) == 0 && matrix(3, 1) == 0);
2418  ocean_assert(matrix(0, 2) == 0 && matrix(1, 2) == 0);
2419  ocean_assert(matrix(0, 3) == 0 && matrix(1, 3) == 0 && matrix(3, 3) == 0);
2420 
2421  ocean_assert(NumericT<T>::isNotEqual(farDistance, nearDistance));
2422 
2423  const T f = T(1.0) / NumericT<T>::tan(fovY * T(0.5));
2424  const T factor = T(1.0) / (nearDistance - farDistance);
2425 
2426  matrix(0, 0) = f / aspectRatio;
2427  matrix(1, 1) = f;
2428  matrix(2, 2) = (farDistance + nearDistance) * factor;
2429  matrix(3, 2) = -T(1.0);
2430  matrix(2, 3) = (T(2.0) * farDistance * nearDistance) * factor;
2431 
2432  return matrix;
2433 }
2434 
2435 template <typename T>
2436 SquareMatrixT4<T> SquareMatrixT4<T>::projectionMatrix(const AnyCameraT<T>& anyCamera, const T nearDistance, const T farDistance)
2437 {
2438  /*
2439  * <pre>
2440  * Creates the following frustum projection matrix.
2441  *
2442  * --------------------------------------------------
2443  * | Fx 0 mx 0 |
2444  * | 0 Fy my 0 |
2445  * | 0 0 (f+n)/(n-f) -2fn/(n-f) |
2446  * | 0 0 -1 0 |
2447  * --------------------------------------------------
2448  *
2449  * n = nearDistance, f = farDistance
2450  * </pre>
2451  */
2452 
2453  ocean_assert(anyCamera.isValid());
2454 
2455  ocean_assert(nearDistance > 0);
2456  ocean_assert(nearDistance < farDistance);
2457 
2458  const T fxPixel = anyCamera.focalLengthX();
2459  const T fyPixel = anyCamera.focalLengthY();
2460  ocean_assert(fxPixel > T(1) && fyPixel > T(1));
2461 
2462  const T mxPixel = anyCamera.principalPointX();
2463  const T myPixel = anyCamera.principalPointY();
2464 
2465  const T width_2 = T(anyCamera.width()) / T(2);
2466  const T height_2 = T(anyCamera.height()) / T(2);
2467 
2468  ocean_assert(NumericT<T>::isNotEqualEps(width_2));
2469  ocean_assert(NumericT<T>::isNotEqualEps(height_2));
2470 
2471  const T fx = fxPixel / width_2;
2472  const T fy = fyPixel / height_2;
2473 
2474  const T mx = (mxPixel - width_2) / width_2; // principal point with range [-1, 1]
2475  const T my = (myPixel - height_2) / height_2;
2476 
2477  const T factor = T(1.0) / (nearDistance - farDistance);
2478 
2479  SquareMatrixT4<T> matrix(false);
2480 
2481  matrix(0, 0) = fx;
2482  matrix(1, 1) = fy;
2483  matrix(0, 2) = -mx;
2484  matrix(1, 2) = my;
2485  matrix(2, 2) = (farDistance + nearDistance) * factor;
2486  matrix(3, 2) = -T(1);
2487  matrix(2, 3) = (T(2) * farDistance * nearDistance) * factor;
2488 
2489  return matrix;
2490 }
2491 
2492 template <typename T>
2493 SquareMatrixT4<T> SquareMatrixT4<T>::frustumMatrix(const T left, const T right, const T top, const T bottom, const T nearDistance, const T farDistance)
2494 {
2495  /**
2496  * <pre>
2497  * Creates the following frustum projection matrix:
2498  *
2499  * --------------------------------------------------
2500  * | 2n/(r-l) 0 (r+l)/(r-l) 0 |
2501  * | 0 2n/(t-b) (t+b)/(t-b) 0 |
2502  * | 0 0 -(f+n)/(f-n) -2fn/(f-n) |
2503  * | 0 0 -1 0 |
2504  * --------------------------------------------------
2505  * </pre>
2506  */
2507 
2508  ocean_assert(NumericT<T>::isNotEqual(left, right));
2509  ocean_assert(NumericT<T>::isNotEqual(top, bottom));
2510  ocean_assert(NumericT<T>::isNotEqual(nearDistance, farDistance));
2511 
2512  SquareMatrixT4<T> matrix(false);
2513  ocean_assert(matrix(1, 0) == 0 && matrix(2, 0) == 0 && matrix(3, 0) == 0);
2514  ocean_assert(matrix(0, 1) == 0 && matrix(2, 1) == 0 && matrix(3, 1) == 0);
2515  ocean_assert(matrix(0, 3) == 0 && matrix(1, 3) == 0 && matrix(3, 3) == 0);
2516 
2517  const T rightLeft = T(1.0) / (right - left);
2518  const T near2 = nearDistance * T(2.0);
2519 
2520  matrix(0, 0) = near2 * rightLeft;
2521  matrix(0, 2) = (right + left) * rightLeft;
2522 
2523  const T topBottom = T(1.0) / (top - bottom);
2524 
2525  matrix(1, 1) = near2 * topBottom;
2526  matrix(1, 2) = (top + bottom) * topBottom;
2527 
2528  const T farNear = T(1.0) / (farDistance - nearDistance);
2529 
2530  matrix(2, 2) = -(farDistance + nearDistance) * farNear;
2531  matrix(2, 3) = -T(2.0) * farDistance * nearDistance * farNear;
2532 
2533  matrix(3, 2) = -T(1.0);
2534 
2535  return matrix;
2536 }
2537 
2538 template <typename T>
2539 SquareMatrixT4<T> SquareMatrixT4<T>::frustumMatrix(const T width, const T height, const HomogenousMatrixT4<T>& viewingMatrix, const T nearDistance, const T farDistance)
2540 {
2541  ocean_assert(width > NumericT<T>::eps() && height > NumericT<T>::eps());
2542  ocean_assert(nearDistance >= NumericT<T>::eps() && farDistance > nearDistance);
2543 
2544  const T planeDistance = NumericT<T>::abs(viewingMatrix.translation().z());
2545  ocean_assert(viewingMatrix.isValid() && NumericT<T>::isNotEqualEps(planeDistance));
2546 
2547  const HomogenousMatrixT4<T> inversedViewingMatrix(viewingMatrix.inverted());
2548 
2549  const VectorT3<T> leftTop(width * -T(0.5), height * T(0.5), 0);
2550  const VectorT3<T> rightBottom(width * T(0.5), height * -T(0.5), 0);
2551 
2552  const VectorT3<T> leftTopInCamera(inversedViewingMatrix * leftTop);
2553  const VectorT3<T> rightBottomInCamera(inversedViewingMatrix * rightBottom);
2554 
2555  const T factor = nearDistance / planeDistance;
2556 
2557  return frustumMatrix(factor * leftTopInCamera.x(), factor * rightBottomInCamera.x(), factor * leftTopInCamera.y(), factor * rightBottomInCamera.y(), nearDistance, farDistance);
2558 }
2559 
2560 template <typename T>
2561 void SquareMatrixT4<T>::multiply(const SquareMatrixT4<T>& matrix, const VectorT4<T>* vectors, VectorT4<T>* results, const size_t number)
2562 {
2563  ocean_assert((vectors && results) || number == 0);
2564 
2565  for (size_t n = 0; n < number; ++n)
2566  {
2567  results[n] = matrix * vectors[n];
2568  }
2569 }
2570 
2571 #if defined(OCEAN_HARDWARE_AVX_VERSION) && OCEAN_HARDWARE_AVX_VERSION >= 10
2572 
2573 template <>
2574 inline void SquareMatrixT4<double>::multiply(const SquareMatrixT4<double>& matrix, const VectorT4<double>* vectors, VectorT4<double>* results, const size_t number)
2575 {
2576  // the following code uses the following AVX instructions, and needs AVX1 or higher
2577 
2578  // AVX1:
2579  // _mm256_broadcast_sd
2580  // _mm256_loadu_pd
2581  // _mm256_mul_pd
2582  // _mm256_add_pd
2583  // _mm256_storeu_pd
2584 
2585  // we use the same strategy as for the 32 bit float values
2586 
2587  __m256d c0 = _mm256_loadu_pd(matrix.values + 0);
2588  __m256d c1 = _mm256_loadu_pd(matrix.values + 4);
2589  __m256d c2 = _mm256_loadu_pd(matrix.values + 8);
2590  __m256d c3 = _mm256_loadu_pd(matrix.values + 12);
2591 
2592  for (size_t n = 0; n < number; ++n)
2593  {
2594  __m256d v0 = _mm256_broadcast_sd(vectors[n].data() + 0);
2595  __m256d r0 = _mm256_mul_pd(c0, v0);
2596 
2597  __m256d v1 = _mm256_broadcast_sd(vectors[n].data() + 1);
2598  __m256d r1 = _mm256_mul_pd(c1, v1);
2599 
2600  r0 = _mm256_add_pd(r0, r1);
2601 
2602  __m256d v2 = _mm256_broadcast_sd(vectors[n].data() + 2);
2603  __m256d r2 = _mm256_mul_pd(c2, v2);
2604 
2605  r0 = _mm256_add_pd(r0, r2);
2606 
2607  __m256d v3 = _mm256_broadcast_sd(vectors[n].data() + 3);
2608  __m256d r3 = _mm256_mul_pd(c3, v3);
2609 
2610  r0 = _mm256_add_pd(r0, r3);
2611 
2612  _mm256_storeu_pd(results[n].data(), r0);
2613  }
2614 }
2615 
2616 #else // OCEAN_HARDWARE_AVX_VERSION
2617 
2618 #ifdef OCEAN_USE_SLOWER_IMPLEMENTATION
2619 
2620 // we keep following implementation inside 'OCEAN_USE_SLOWER_IMPLEMENTATION' showing an alternative (which is slower)
2621 
2622 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
2623 
2624 template <>
2625 inline void SquareMatrixT4<float>::multiply(const SquareMatrixT4<float>& matrix, const VectorT4<float>* vectors, VectorT4<float>* results, const size_t number)
2626 {
2627  // the following code uses the following SSE instructions, and needs SSE 4.1 or higher
2628 
2629  // SSE:
2630  // _mm_loadu_ps
2631  // _mm_shuffle_ps
2632  // _mm_or_ps
2633  // _mm_storeu_ps
2634 
2635  // SSE4.1:
2636  // _mm_dp_ps
2637 
2638  // first we load the four rows of the matrix
2639  __m128 row0 = _mm_loadu_ps(matrix.values + 0);
2640  __m128 row1 = _mm_loadu_ps(matrix.values + 4);
2641  __m128 row2 = _mm_loadu_ps(matrix.values + 8);
2642  __m128 row3 = _mm_loadu_ps(matrix.values + 12);
2643 
2644  // first, we transpose the 4x4 matrix
2645 
2646  // A E I M A B C D
2647  // B F J N E F G H
2648  // C G K O I J K L
2649  // D H L P M N O P
2650 
2651  // A B C D, E F G H -> A B E F
2652  __m128 temp0 = _mm_shuffle_ps(row0, row1, 0x44); // 0x44 = 0100 0100
2653  // A B C D, E F G H -> C D G H
2654  __m128 temp2 = _mm_shuffle_ps(row0, row1, 0xEE); // 0xEE = 1110 1110
2655  // I J K L, M N O P -> I J M N
2656  __m128 temp1 = _mm_shuffle_ps(row2, row3, 0x44);
2657  // I J K L, M N O P -> K L O P
2658  __m128 temp3 = _mm_shuffle_ps(row2, row3, 0xEE);
2659 
2660  // A B E F, I J M N -> A E I M
2661  row0 = _mm_shuffle_ps(temp0, temp1, 0x88); // 0x88 = 1000 1000
2662  // A B E F, I J M N -> B F J N
2663  row1 = _mm_shuffle_ps(temp0, temp1, 0xDD); // 0xDD = 1101 1101
2664  // C D G H, K L O P -> C G K O
2665  row2 = _mm_shuffle_ps(temp2, temp3, 0x88);
2666  // C D G H, K L O P -> D H L P
2667  row3 = _mm_shuffle_ps(temp2, temp3, 0xDD);
2668 
2669 #ifdef OCEAN_COMPILER_MSC
2670  ocean_assert(row0.m128_f32[0] == matrix.values[0] && row0.m128_f32[1] == matrix.values[4] && row0.m128_f32[2] == matrix.values[ 8] && row0.m128_f32[3] == matrix.values[12]);
2671  ocean_assert(row1.m128_f32[0] == matrix.values[1] && row1.m128_f32[1] == matrix.values[5] && row1.m128_f32[2] == matrix.values[ 9] && row1.m128_f32[3] == matrix.values[13]);
2672  ocean_assert(row2.m128_f32[0] == matrix.values[2] && row2.m128_f32[1] == matrix.values[6] && row2.m128_f32[2] == matrix.values[10] && row2.m128_f32[3] == matrix.values[14]);
2673  ocean_assert(row3.m128_f32[0] == matrix.values[3] && row3.m128_f32[1] == matrix.values[7] && row3.m128_f32[2] == matrix.values[11] && row3.m128_f32[3] == matrix.values[15]);
2674 #endif
2675 
2676  for (size_t n = 0u; n < number; ++n)
2677  {
2678  // we load the four values of the vector
2679  __m128 v = _mm_loadu_ps(vectors[n].data());
2680 
2681  // we determine the dot product between the first row and the vector and store the result in the first float bin
2682  __m128 dot0 = _mm_dp_ps(row0, v, 0xF1); // 0xF1 = 1111 0001
2683 
2684  // we determine the dot product between the second row and the vector and store the result in the second float bin
2685  __m128 dot1 = _mm_dp_ps(row1, v, 0xF2); // 0xF2 = 1111 0010
2686  __m128 dot2 = _mm_dp_ps(row2, v, 0xF4); // 0xF4 = 1111 0100
2687  __m128 dot3 = _mm_dp_ps(row3, v, 0xF8); // 0xF8 = 1111 1000
2688 
2689  // now we blend the results by applying the bit-wise or operator
2690  __m128 result01 = _mm_or_ps(dot0, dot1);
2691  __m128 result23 = _mm_or_ps(dot2, dot3);
2692  __m128 result03 = _mm_or_ps(result01, result23);
2693 
2694  _mm_storeu_ps(results[n].data(), result03);
2695  }
2696 }
2697 
2698 #endif // OCEAN_HARDWARE_SSE_VERSION >= 41
2699 
2700 #else // OCEAN_USE_SLOWER_IMPLEMENTATION
2701 
2702 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
2703 
2704 template <>
2705 inline void SquareMatrixT4<double>::multiply(const SquareMatrixT4<double>& matrix, const VectorT4<double>* vectors, VectorT4<double>* results, const size_t number)
2706 {
2707  // the following code uses the following SSE instructions, and needs SSE2 or higher
2708 
2709  // SSE2:
2710  // _mm_load1_pd
2711  // _mm_loadu_pd
2712  // _mm_add_pd
2713  // _mm_storeu_pd
2714  // _mm_mul_pd
2715 
2716  // now we load the four columns
2717  __m128d c0a = _mm_loadu_pd(matrix.values + 0);
2718  __m128d c0b = _mm_loadu_pd(matrix.values + 2);
2719  __m128d c1a = _mm_loadu_pd(matrix.values + 4);
2720  __m128d c1b = _mm_loadu_pd(matrix.values + 6);
2721  __m128d c2a = _mm_loadu_pd(matrix.values + 8);
2722  __m128d c2b = _mm_loadu_pd(matrix.values + 10);
2723  __m128d c3a = _mm_loadu_pd(matrix.values + 12);
2724  __m128d c3b = _mm_loadu_pd(matrix.values + 14);
2725 
2726  for (size_t n = 0u; n < number; ++n)
2727  {
2728  __m128d v0 = _mm_load1_pd(vectors[n].data() + 0);
2729  __m128d v1 = _mm_load1_pd(vectors[n].data() + 1);
2730  __m128d v2 = _mm_load1_pd(vectors[n].data() + 2);
2731  __m128d v3 = _mm_load1_pd(vectors[n].data() + 3);
2732 
2733  // first column
2734  __m128d r0a = _mm_mul_pd(c0a, v0);
2735  __m128d r0b = _mm_mul_pd(c0b, v0);
2736 
2737  // second column
2738  __m128d r1a = _mm_mul_pd(c1a, v1);
2739  __m128d r1b = _mm_mul_pd(c1b, v1);
2740 
2741  r0a = _mm_add_pd(r0a, r1a);
2742  r0b = _mm_add_pd(r0b, r1b);
2743 
2744  // third column
2745  __m128d r2a = _mm_mul_pd(c2a, v2);
2746  __m128d r2b = _mm_mul_pd(c2b, v2);
2747  r0a = _mm_add_pd(r0a, r2a);
2748  r0b = _mm_add_pd(r0b, r2b);
2749 
2750  // fourth column
2751  __m128d r3a = _mm_mul_pd(c3a, v3);
2752  __m128d r3b = _mm_mul_pd(c3b, v3);
2753  r0a = _mm_add_pd(r0a, r3a);
2754  r0b = _mm_add_pd(r0b, r3b);
2755 
2756  _mm_storeu_pd(results[n].data() + 0, r0a);
2757  _mm_storeu_pd(results[n].data() + 2, r0b);
2758  }
2759 }
2760 
2761 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
2762 
2763 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 10
2764 
2765 template <>
2766 inline void SquareMatrixT4<float>::multiply(const SquareMatrixT4<float>& matrix, const VectorT4<float>* vectors, VectorT4<float>* results, const size_t number)
2767 {
2768  // the following code uses the following SSE instructions, and needs SSE1 or higher
2769 
2770  // SSE1:
2771  // _mm_load1_ps
2772  // _mm_loadu_ps
2773  // _mm_mul_ps
2774  // _mm_add_ps
2775  // _mm_storeu_ps
2776 
2777  // now we load the four columns
2778  __m128 c0 = _mm_loadu_ps(matrix.values + 0);
2779  __m128 c1 = _mm_loadu_ps(matrix.values + 4);
2780  __m128 c2 = _mm_loadu_ps(matrix.values + 8);
2781  __m128 c3 = _mm_loadu_ps(matrix.values + 12);
2782 
2783  for (size_t n = 0u; n < number; ++n)
2784  {
2785  __m128 v0 = _mm_load1_ps(vectors[n].data() + 0);
2786  __m128 v1 = _mm_load1_ps(vectors[n].data() + 1);
2787  __m128 v2 = _mm_load1_ps(vectors[n].data() + 2);
2788  __m128 v3 = _mm_load1_ps(vectors[n].data() + 3);
2789 
2790  // first column
2791  __m128 r0 = _mm_mul_ps(c0, v0);
2792 
2793  // second column
2794  __m128 r1 = _mm_mul_ps(c1, v1);
2795 
2796  r0 = _mm_add_ps(r0, r1);
2797 
2798  // third column
2799  __m128 r2 = _mm_mul_ps(c2, v2);
2800  r0 = _mm_add_ps(r0, r2);
2801 
2802  // fourth column
2803  __m128 r3 = _mm_mul_ps(c3, v3);
2804  r0 = _mm_add_ps(r0, r3);
2805 
2806  _mm_storeu_ps(results[n].data(), r0);
2807  }
2808 }
2809 
2810 #endif // OCEAN_HARDWARE_SSE_VERSION >= 10
2811 
2812 #endif // OCEAN_USE_SLOWER_IMPLEMENTATION
2813 
2814 #endif // OCEAN_HARDWARE_AVX_VERSION
2815 
2816 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2817 
2818 #ifdef __aarch64__
2819 
2820 template <>
2821 inline void SquareMatrixT4<double>::multiply(const SquareMatrixT4<double>& matrix, const VectorT4<double>* vectors, VectorT4<double>* results, const size_t number)
2822 {
2823  // the following NEON code is almost identical to the SSE implementation
2824 
2825  // now we load the four columns
2826  float64x2_t c0a = vld1q_f64(matrix.values + 0);
2827  float64x2_t c0b = vld1q_f64(matrix.values + 2);
2828  float64x2_t c1a = vld1q_f64(matrix.values + 4);
2829  float64x2_t c1b = vld1q_f64(matrix.values + 6);
2830  float64x2_t c2a = vld1q_f64(matrix.values + 8);
2831  float64x2_t c2b = vld1q_f64(matrix.values + 10);
2832  float64x2_t c3a = vld1q_f64(matrix.values + 12);
2833  float64x2_t c3b = vld1q_f64(matrix.values + 14);
2834 
2835  for (size_t n = 0u; n < number; ++n)
2836  {
2837  float64x2_t v0 = vld1q_dup_f64(vectors[n].data() + 0);
2838  float64x2_t v1 = vld1q_dup_f64(vectors[n].data() + 1);
2839  float64x2_t v2 = vld1q_dup_f64(vectors[n].data() + 2);
2840  float64x2_t v3 = vld1q_dup_f64(vectors[n].data() + 3);
2841 
2842  float64x2_t r0a = vmulq_f64(c0a, v0);
2843  float64x2_t r0b = vmulq_f64(c0b, v0);
2844 
2845  float64x2_t r1a = vmulq_f64(c1a, v1);
2846  float64x2_t r1b = vmulq_f64(c1b, v1);
2847 
2848  r0a = vaddq_f64(r0a, r1a);
2849  r0b = vaddq_f64(r0b, r1b);
2850 
2851  float64x2_t r2a = vmulq_f64(c2a, v2);
2852  float64x2_t r2b = vmulq_f64(c2b, v2);
2853 
2854  r0a = vaddq_f64(r0a, r2a);
2855  r0b = vaddq_f64(r0b, r2b);
2856 
2857  float64x2_t r3a = vmulq_f64(c3a, v3);
2858  float64x2_t r3b = vmulq_f64(c3b, v3);
2859 
2860  r0a = vaddq_f64(r0a, r3a);
2861  r0b = vaddq_f64(r0b, r3b);
2862 
2863  vst1q_f64(results[n].data() + 0, r0a);
2864  vst1q_f64(results[n].data() + 2, r0b);
2865  }
2866 }
2867 
2868 #endif // __aarch64__
2869 
2870 template <>
2871 inline void SquareMatrixT4<float>::multiply(const SquareMatrixT4<float>& matrix, const VectorT4<float>* vectors, VectorT4<float>* results, const size_t number)
2872 {
2873  // the following NEON code is almost identical to the SSE implementation
2874 
2875  // now we load the four columns
2876  float32x4_t c0 = vld1q_f32(matrix.values + 0);
2877  float32x4_t c1 = vld1q_f32(matrix.values + 4);
2878  float32x4_t c2 = vld1q_f32(matrix.values + 8);
2879  float32x4_t c3 = vld1q_f32(matrix.values + 12);
2880 
2881  for (size_t n = 0u; n < number; ++n)
2882  {
2883  float32x4_t v0 = vld1q_dup_f32(vectors[n].data() + 0);
2884  float32x4_t v1 = vld1q_dup_f32(vectors[n].data() + 1);
2885  float32x4_t v2 = vld1q_dup_f32(vectors[n].data() + 2);
2886  float32x4_t v3 = vld1q_dup_f32(vectors[n].data() + 3);
2887 
2888  // first column
2889  float32x4_t r0 = vmulq_f32(c0, v0);
2890 
2891  // second column
2892  float32x4_t r1 = vmulq_f32(c1, v1);
2893 
2894  r0 = vaddq_f32(r0, r1);
2895 
2896  // third column
2897  float32x4_t r2 = vmulq_f32(c2, v2);
2898 
2899  r0 = vaddq_f32(r0, r2);
2900 
2901  // fourth column
2902  float32x4_t r3 = vmulq_f32(c3, v3);
2903 
2904  r0 = vaddq_f32(r0, r3);
2905 
2906  vst1q_f32(results[n].data(), r0);
2907  }
2908 }
2909 
2910 #endif
2911 
2912 template <typename T>
2913 template <typename U>
2914 inline std::vector< SquareMatrixT4<T> > SquareMatrixT4<T>::matrices2matrices(const std::vector< SquareMatrixT4<U> >& matrices)
2915 {
2916  std::vector< SquareMatrixT4<T> > result;
2917  result.reserve(matrices.size());
2918 
2919  for (typename std::vector< SquareMatrixT4<U> >::const_iterator i = matrices.begin(); i != matrices.end(); ++i)
2920  {
2921  result.push_back(SquareMatrixT4<T>(*i));
2922  }
2923 
2924  return result;
2925 }
2926 
2927 template <>
2928 template <>
2929 inline std::vector< SquareMatrixT4<float> > SquareMatrixT4<float>::matrices2matrices(const std::vector< SquareMatrixT4<float> >& matrices)
2930 {
2931  return matrices;
2932 }
2933 
2934 template <>
2935 template <>
2936 inline std::vector< SquareMatrixT4<double> > SquareMatrixT4<double>::matrices2matrices(const std::vector< SquareMatrixT4<double> >& matrices)
2937 {
2938  return matrices;
2939 }
2940 
2941 template <typename T>
2942 template <typename U>
2943 inline std::vector< SquareMatrixT4<T> > SquareMatrixT4<T>::matrices2matrices(const SquareMatrixT4<U>* matrices, const size_t size)
2944 {
2945  std::vector< SquareMatrixT4<T> > result;
2946  result.reserve(size);
2947 
2948  for (size_t n = 0; n < size; ++n)
2949  {
2950  result.push_back(SquareMatrixT4<T>(matrices[n]));
2951  }
2952 
2953  return result;
2954 }
2955 
2956 template <typename T>
2957 void SquareMatrixT4<T>::swapRows(const unsigned int row0, const unsigned int row1)
2958 {
2959  ocean_assert(row0 < 4u && row1 < 4u);
2960 
2961  if (row0 == row1)
2962  {
2963  return;
2964  }
2965 
2966  T* first = values + row0;
2967  T* second = values + row1;
2968 
2969  T tmp = *first;
2970  *first = *second;
2971  *second = tmp;
2972 
2973  first += 4;
2974  second += 4;
2975  tmp = *first;
2976  *first = *second;
2977  *second = tmp;
2978 
2979  first += 4;
2980  second += 4;
2981  tmp = *first;
2982  *first = *second;
2983  *second = tmp;
2984 
2985  first += 4;
2986  second += 4;
2987  tmp = *first;
2988  *first = *second;
2989  *second = tmp;
2990 }
2991 
2992 template <typename T>
2993 void SquareMatrixT4<T>::multiplyRow(const unsigned int row, const T scalar)
2994 {
2995  ocean_assert(row < 4u);
2996 
2997  T* element = values + row;
2998 
2999  *element *= scalar;
3000  element += 4;
3001  *element *= scalar;
3002  element += 4;
3003  *element *= scalar;
3004  element += 4;
3005  *element *= scalar;
3006 }
3007 
3008 template <typename T>
3009 void SquareMatrixT4<T>::addRows(const unsigned int targetRow, unsigned int const sourceRow, const T scalar)
3010 {
3011  ocean_assert(targetRow < 4u && sourceRow < 4u);
3012  ocean_assert(targetRow != sourceRow);
3013 
3014  T* target = values + targetRow;
3015  T* source = values + sourceRow;
3016 
3017  *target += *source * scalar;
3018 
3019  target += 4;
3020  source += 4;
3021  *target += *source * scalar;
3022 
3023  target += 4;
3024  source += 4;
3025  *target += *source * scalar;
3026 
3027  target += 4;
3028  source += 4;
3029  *target += *source * scalar;
3030 }
3031 
3032 template <typename T>
3033 std::ostream& operator<<(std::ostream& stream, const SquareMatrixT4<T>& matrix)
3034 {
3035  stream << "|" << matrix(0, 0) << ", " << matrix(0, 1) << ", " << matrix(0, 2) << ", " << matrix(0, 3) << "|" << std::endl;
3036  stream << "|" << matrix(1, 0) << ", " << matrix(1, 1) << ", " << matrix(1, 2) << ", " << matrix(1, 3) << "|" << std::endl;
3037  stream << "|" << matrix(2, 0) << ", " << matrix(2, 1) << ", " << matrix(2, 2) << ", " << matrix(2, 3) << "|" << std::endl;
3038  stream << "|" << matrix(3, 0) << ", " << matrix(3, 1) << ", " << matrix(3, 2) << ", " << matrix(3, 3) << "|";
3039 
3040  return stream;
3041 }
3042 
3043 template <bool tActive, typename T>
3044 MessageObject<tActive>& operator<<(MessageObject<tActive>& messageObject, const SquareMatrixT4<T>& matrix)
3045 {
3046  return messageObject << "|" << matrix(0, 0) << ", " << matrix(0, 1) << ", " << matrix(0, 2) << ", " << matrix(0, 3) << "|\n|"
3047  << matrix(1, 0) << ", " << matrix(1, 1) << ", " << matrix(1, 2) << ", " << matrix(1, 3) << "|\n|"
3048  << matrix(2, 0) << ", " << matrix(2, 1) << ", " << matrix(2, 2) << ", " << matrix(2, 3) << "|\n|"
3049  << matrix(3, 0) << ", " << matrix(3, 1) << ", " << matrix(3, 2) << ", " << matrix(3, 3) << "|";
3050 }
3051 
3052 template <bool tActive, typename T>
3053 MessageObject<tActive>& operator<<(MessageObject<tActive>&& messageObject, const SquareMatrixT4<T>& matrix)
3054 {
3055  return messageObject << "|" << matrix(0, 0) << ", " << matrix(0, 1) << ", " << matrix(0, 2) << ", " << matrix(0, 3) << "|\n|"
3056  << matrix(1, 0) << ", " << matrix(1, 1) << ", " << matrix(1, 2) << ", " << matrix(1, 3) << "|\n|"
3057  << matrix(2, 0) << ", " << matrix(2, 1) << ", " << matrix(2, 2) << ", " << matrix(2, 3) << "|\n|"
3058  << matrix(3, 0) << ", " << matrix(3, 1) << ", " << matrix(3, 2) << ", " << matrix(3, 3) << "|";
3059 }
3060 
3061 }
3062 
3063 #endif // META_OCEAN_MATH_SQUARE_MATRIX_4_H
This class implements the abstract base class for all AnyCamera objects.
Definition: AnyCamera.h:130
virtual unsigned int width() const =0
Returns the width of the camera image.
virtual T focalLengthX() const =0
Returns the horizontal focal length parameter.
virtual T focalLengthY() const =0
Returns the vertical focal length parameter.
virtual unsigned int height() const =0
Returns the height of the camera image.
virtual T principalPointY() const =0
Returns the y-value of the principal point of the camera image in the pixel domain.
virtual bool isValid() const =0
Returns whether this camera is valid.
virtual T principalPointX() const =0
Returns the x-value of the principal point of the camera image in the pixel domain.
static unsigned int solveQuartic(const T a, const T b, const T c, const T d, const T e, T *x)
Solves a quartic equation with the form: ax^4 + bx^3 + cx^2 + dx + e = 0.
Definition: Equation.h:273
This class implements a 4x4 homogeneous transformation matrix using floating point values with the pr...
Definition: HomogenousMatrix4.h:110
HomogenousMatrixT4< T > inverted() const noexcept
Returns the inverted of this matrix.
Definition: HomogenousMatrix4.h:1575
VectorT3< T > translation() const
Returns the translation of the transformation.
Definition: HomogenousMatrix4.h:1381
bool isValid() const
Returns whether this matrix is a valid homogeneous transformation.
Definition: HomogenousMatrix4.h:1806
This class provides basic numeric functionalities.
Definition: Numeric.h:57
static T atan(const T value)
Returns the arctangent of a given value.
Definition: Numeric.h:1616
static T abs(const T value)
Returns the absolute value of a given value.
Definition: Numeric.h:1220
static constexpr T eps()
Returns a small epsilon.
static bool isEqual(const T first, const T second)
Returns whether two values are equal up to a small epsilon.
Definition: Numeric.h:2386
static T tan(const T value)
Returns the tangent of a given value.
Definition: Numeric.h:1600
static constexpr bool isEqualEps(const T value)
Returns whether a value is smaller than or equal to a small epsilon.
Definition: Numeric.h:2087
This class implements a 3x3 square matrix.
Definition: SquareMatrix3.h:88
OCEAN_FORCE_INLINE SquareMatrixT4< T > & operator*=(const SquareMatrixT4< T > &matrix)
Multiplies and assigns two matrices.
Definition: SquareMatrix4.h:1174
void copyElements(U *arrayValues) const
Copies the elements of this matrix to an array with floating point values of type U.
Definition: SquareMatrix4.h:1137
SquareMatrixT4< T > & operator-=(const SquareMatrixT4< T > &matrix)
Subtracts and assigns two matrices.
Definition: SquareMatrix4.h:1340
const T * data() const
Returns a pointer to the internal values.
Definition: SquareMatrix4.h:790
T operator()(const unsigned int index) const
Element operator.
Definition: SquareMatrix4.h:1216
SquareMatrixT4(const U *arrayValues, const bool valuesRowAligned)
Creates a new SquareMatrixT4 object by an array of at least sixteen elements of float type U.
Definition: SquareMatrix4.h:686
static SquareMatrixT4< T > projectionMatrix(const T fovX, const T aspectRatio, const T nearDistance, const T farDistance)
Creates a projection matrix defined by the horizontal field of view, the aspect ratio and the near an...
Definition: SquareMatrix4.h:2391
OCEAN_FORCE_INLINE VectorT3< T > operator*(const VectorT3< T > &vector) const
Multiply operator for a 3D vector.
Definition: SquareMatrix4.h:1777
bool isIdentity() const
Returns whether this matrix is the identity matrix.
Definition: SquareMatrix4.h:1022
SquareMatrixT4(const U *arrayValues)
Creates a new SquareMatrixT4 object by an array of at least sixteen elements of float type U.
Definition: SquareMatrix4.h:667
SquareMatrixT4(const SquareMatrixT4< U > &matrix)
Copy constructor for a matrix with difference element data type than T.
Definition: SquareMatrix4.h:602
void transpose()
Transposes the matrix.
Definition: SquareMatrix4.h:828
static SquareMatrixT4< T > projectionMatrix(const AnyCameraT< T > &anyCamera, const T nearDistance, const T farDistance)
Creates a projection matrix defined by a camera profile of a pinhole camera and the near and far clip...
Definition: SquareMatrix4.h:2436
SquareMatrixT4< T > transposed() const
Returns the transposed of this matrix.
Definition: SquareMatrix4.h:802
void swapRows(const unsigned int row0, const unsigned int row1)
Swaps two rows of this matrix.
Definition: SquareMatrix4.h:2957
void copyElements(T *arrayValues) const
Copies the elements of this matrix to an array with floating point values.
Definition: SquareMatrix4.h:1160
SquareMatrixT4< T > inverted() const
Returns the inverted matrix of this matrix.
Definition: SquareMatrix4.h:852
bool invert(SquareMatrixT4< T > &invertedMatrix) const
Inverts the matrix and returns the result.
Definition: SquareMatrix4.h:881
SquareMatrixT4< T > operator+(const SquareMatrixT4< T > &matrix) const
Adds two matrices.
Definition: SquareMatrix4.h:1267
T operator[](const unsigned int index) const
Element operator.
Definition: SquareMatrix4.h:1188
bool isSingular() const
Returns whether this matrix is singular (and thus cannot be inverted).
Definition: SquareMatrix4.h:1040
SquareMatrixT4()
Creates a new SquareMatrixT4 object with undefined elements.
Definition: SquareMatrix4.h:595
SquareMatrixT4(const VectorT4< T > &diagonal)
Creates a new SquareMatrixT4 object by a given diagonal vector.
Definition: SquareMatrix4.h:769
OCEAN_FORCE_INLINE SquareMatrixT4< T > operator*(const HomogenousMatrixT4< T > &matrix) const
Multiplies two matrices.
Definition: SquareMatrix4.h:1749
friend class SquareMatrixT4
Definition: SquareMatrix4.h:86
bool isSymmetric(const T epsilon=NumericT< T >::eps()) const
Returns whether this matrix is symmetric.
Definition: SquareMatrix4.h:1046
T values[16]
The sixteen values of the matrix.
Definition: SquareMatrix4.h:591
T determinant() const
Returns the determinant of the matrix.
Definition: SquareMatrix4.h:946
T * data()
Returns a pointer to the internal values.
Definition: SquareMatrix4.h:796
T operator()(const unsigned int row, const unsigned int column) const
Element operator.
Definition: SquareMatrix4.h:1202
bool isEqual(const SquareMatrixT4< T > &matrix, const T eps=NumericT< T >::eps()) const
Returns whether two matrices are almost identical up to a specified epsilon.
Definition: SquareMatrix4.h:1055
void toNull()
Sets the matrix to a zero matrix.
Definition: SquareMatrix4.h:1001
bool eigenSystem(VectorT4< T > &eigenValues, SquareMatrixT4< T > &eigenVectors)
Performs an eigen value analysis.
Definition: SquareMatrix4.h:1068
bool operator==(const SquareMatrixT4< T > &matrix) const
Returns whether two matrices are identical up to a small epsilon.
Definition: SquareMatrix4.h:1261
static size_t elements()
Returns the number of elements this matrix has.
Definition: SquareMatrix4.h:1255
SquareMatrixT4< T > & operator+=(const SquareMatrixT4< T > &matrix)
Adds and assigns two matrices.
Definition: SquareMatrix4.h:1292
const T * operator()() const
Access operator.
Definition: SquareMatrix4.h:1230
SquareMatrixT4(const T *arrayValues, const bool valuesRowAligned)
Creates a new SquareMatrixT4 object by an array of at least sixteen elements.
Definition: SquareMatrix4.h:717
static void multiply(const SquareMatrixT4< T > &matrix, const VectorT4< T > *vectors, VectorT4< T > *results, const size_t number)
Multiplies several 4D vectors with a given matrix.
Definition: SquareMatrix4.h:2561
SquareMatrixT4< T > & operator=(const SquareMatrixT4< T > &)=default
Default assign operator.
SquareMatrixT4< T > operator-() const
Returns the negative matrix of this matrix (all matrix elements are multiplied by -1).
Definition: SquareMatrix4.h:1363
bool isNull() const
Returns whether this matrix is a null matrix.
Definition: SquareMatrix4.h:1031
OCEAN_FORCE_INLINE SquareMatrixT4< T > & operator*=(const HomogenousMatrixT4< T > &matrix)
Multiplies and assigns two matrices.
Definition: SquareMatrix4.h:1181
SquareMatrixT4(const SquareMatrixT3< T > &subMatrix)
Creates a new SquareMatrixT4 object by given 3x3 sub matrix.
Definition: SquareMatrix4.h:753
SquareMatrixT4< T > operator-(const SquareMatrixT4< T > &matrix) const
Subtracts two matrices.
Definition: SquareMatrix4.h:1315
void addRows(const unsigned int targetRow, const unsigned int sourceRow, const T scalar)
Multiplies elements from a specific row with a scalar and adds them to another row.
Definition: SquareMatrix4.h:3009
static std::vector< SquareMatrixT4< T > > matrices2matrices(const std::vector< SquareMatrixT4< U > > &matrices)
Converts matrices with specific data type to matrices with different data type.
Definition: SquareMatrix4.h:2914
static SquareMatrixT4< T > frustumMatrix(const T left, const T right, const T top, const T bottom, const T nearDistance, const T farDistance)
Creates a projection matrix defined by an asymmetric viewing frustum.
Definition: SquareMatrix4.h:2493
T & operator()(const unsigned int index)
Element operator.
Definition: SquareMatrix4.h:1223
SquareMatrixT4< T > operator*(const T value) const
Multiplies this matrix with a scalar value.
Definition: SquareMatrix4.h:2343
bool invert()
Inverts this matrix in place.
Definition: SquareMatrix4.h:866
bool operator!=(const SquareMatrixT4< T > &matrix) const
Returns whether two matrices are not identical up to a small epsilon.
Definition: SquareMatrix4.h:1168
size_t operator()(const SquareMatrixT4< T > &matrix) const
Hash function.
Definition: SquareMatrix4.h:1242
static std::vector< SquareMatrixT4< T > > matrices2matrices(const SquareMatrixT4< U > *matrices, const size_t size)
Converts matrices with specific data type to matrices with different data type.
Definition: SquareMatrix4.h:2943
OCEAN_FORCE_INLINE SquareMatrixT4< T > operator*(const SquareMatrixT4< T > &matrix) const
Multiplies two matrices.
Definition: SquareMatrix4.h:1387
T & operator()(const unsigned int row, const unsigned int column)
Element operator.
Definition: SquareMatrix4.h:1209
T & operator[](const unsigned int index)
Element operator.
Definition: SquareMatrix4.h:1195
void multiplyRow(const unsigned int row, const T scalar)
Multiplies a row with a scalar value.
Definition: SquareMatrix4.h:2993
T trace() const
Returns the trace of the matrix which is the sum of the diagonal elements.
Definition: SquareMatrix4.h:974
SquareMatrixT4(const bool setToIdentity)
Creates a new SquareMatrixT4 object.
Definition: SquareMatrix4.h:623
SquareMatrixT4(const HomogenousMatrixT4< T > &transformation)
Creates a new SquareMatrixT4 object by given transformation matrix.
Definition: SquareMatrix4.h:747
OCEAN_FORCE_INLINE VectorT4< T > operator*(const VectorT4< T > &vector) const
Multiply operator for a 4D vector.
Definition: SquareMatrix4.h:1790
T * operator()()
Access operator.
Definition: SquareMatrix4.h:1236
T Type
Definition of the used data type.
Definition: SquareMatrix4.h:93
static SquareMatrixT4< T > frustumMatrix(const T width, const T height, const HomogenousMatrixT4< T > &viewingMatrix, const T nearDistance, const T farDistance)
Creates a project matrix defined by an asymmetric viewing frustum.
Definition: SquareMatrix4.h:2539
SquareMatrixT4< T > & operator*=(const T value)
Multiplies and assigns this matrix with a scalar value.
Definition: SquareMatrix4.h:2368
void toIdentity()
Sets the matrix to the identity matrix.
Definition: SquareMatrix4.h:980
SquareMatrixT4(const SquareMatrixT4< T > &matrix)=default
Copy constructor.
SquareMatrixT4(const T *arrayValues)
Creates a new SquareMatrixT4 object by an array of at least sixteen elements.
Definition: SquareMatrix4.h:678
This class implements a vector with three elements.
Definition: Vector3.h:97
const T & y() const noexcept
Returns the y value.
Definition: Vector3.h:812
const T & x() const noexcept
Returns the x value.
Definition: Vector3.h:800
This class implements a vector with four elements.
Definition: Vector4.h:97
const T * data() const noexcept
Returns an pointer to the vector elements.
Definition: Vector4.h:720
std::vector< SquareMatrixT4< T > > SquareMatricesT4
Definition of a typename alias for vectors with SquareMatrixT4 objects.
Definition: SquareMatrix4.h:61
SquareMatrixT4< float > SquareMatrixF4
Instantiation of the SquareMatrixT4 template class using a double precision float data type.
Definition: SquareMatrix4.h:53
SquareMatrixT4< Scalar > SquareMatrix4
Definition of the SquareMatrix4 object, depending on the OCEAN_MATH_USE_SINGLE_PRECISION either with ...
Definition: SquareMatrix4.h:32
std::vector< SquareMatrix4 > SquareMatrices4
Definition of a vector holding SquareMatrix4 objects.
Definition: SquareMatrix4.h:68
SquareMatrixT4< double > SquareMatrixD4
Instantiation of the SquareMatrixT4 template class using a double precision float data type.
Definition: SquareMatrix4.h:46
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15
std::ostream & operator<<(std::ostream &stream, const HighPerformanceStatistic &highPerformanceStatistic)
Definition: HighPerformanceTimer.h:963