8#ifndef META_OCEAN_CV_NEON_H
9#define META_OCEAN_CV_NEON_H
17#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
19#if defined(__ARM_NEON__) || defined(__ARM_NEON)
/**
 * Creates a uint8x8_t vector from 8 individual uint8_t values.
 * The values are placed in the vector in the order v0, v1, ..., v7.
 * @return The resulting uint8x8_t vector
 */
50 static constexpr uint8x8_t
create_uint8x8(
const uint8_t v0,
const uint8_t v1,
const uint8_t v2,
const uint8_t v3,
const uint8_t v4,
const uint8_t v5,
const uint8_t v6,
const uint8_t v7);
/**
 * Creates a uint8x16_t vector from 16 individual uint8_t values.
 * The values are placed in the vector in the order v0, v1, ..., v15.
 * @return The resulting uint8x16_t vector
 */
73 static constexpr uint8x16_t
create_uint8x16(
const uint8_t v0,
const uint8_t v1,
const uint8_t v2,
const uint8_t v3,
const uint8_t v4,
const uint8_t v5,
const uint8_t v6,
const uint8_t v7,
const uint8_t v8,
const uint8_t v9,
const uint8_t v10,
const uint8_t v11,
const uint8_t v12,
const uint8_t v13,
const uint8_t v14,
const uint8_t v15);
/**
 * Creates an int8x16_t vector from 16 individual int8_t values.
 * The values are placed in the vector in the order v0, v1, ..., v15.
 * @return The resulting int8x16_t vector
 */
96 static constexpr int8x16_t
create_int8x16(
const int8_t v0,
const int8_t v1,
const int8_t v2,
const int8_t v3,
const int8_t v4,
const int8_t v5,
const int8_t v6,
const int8_t v7,
const int8_t v8,
const int8_t v9,
const int8_t v10,
const int8_t v11,
const int8_t v12,
const int8_t v13,
const int8_t v14,
const int8_t v15);
/**
 * Creates an int16x8_t vector from 8 individual int16_t values.
 * The values are placed in the vector in the order v0, v1, ..., v7.
 * @return The resulting int16x8_t vector
 */
111 static constexpr int16x8_t
create_int16x8(
const int16_t v0,
const int16_t v1,
const int16_t v2,
const int16_t v3,
const int16_t v4,
const int16_t v5,
const int16_t v6,
const int16_t v7);
/**
 * Creates a uint32x4_t vector from 4 individual uint32_t values.
 * The values are placed in the vector in the order v0, v1, v2, v3.
 * @return The resulting uint32x4_t vector
 */
122 static constexpr uint32x4_t
create_uint32x4(
const uint32_t v0,
const uint32_t v1,
const uint32_t v2,
const uint32_t v3);
/**
 * Creates a uint16x8_t vector from 8 individual uint16_t values.
 * The values are placed in the vector in the order v0, v1, ..., v7.
 * @return The resulting uint16x8_t vector
 */
137 static constexpr uint16x8_t
create_uint16x8(
const uint16_t v0,
const uint16_t v1,
const uint16_t v2,
const uint16_t v3,
const uint16_t v4,
const uint16_t v5,
const uint16_t v6,
const uint16_t v7);
/**
 * Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
 * NOTE(review): the implementation appears to weight each 3x3 block with a 1-2-1 kernel in both directions, round with +8, divide by 16 and store 8 results - confirm against the definition.
 * @param image0 First (top) row of the source elements, must be valid
 * @param image1 Second (center) row of the source elements, must be valid
 * @param image2 Third (bottom) row of the source elements, must be valid
 * @param result Receives the averaged elements, must be valid
 */
339 static inline void average24Elements1Channel8Bit3x3(
const uint8_t*
const image0,
const uint8_t*
const image1,
const uint8_t*
const image2, uint8_t*
const result);
/**
 * Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
 * @param value The four 16 bit elements to shift
 * @return The shifted elements
 */
452 static OCEAN_FORCE_INLINE uint16x4_t
moveHighBits16_8(
const uint16x4_t& value);
/**
 * Moves the high 8 bits of eight 16 bit elements to the low 8 bits and fills the high bits with 0.
 * @param value The eight 16 bit elements to shift
 * @return The shifted elements
 */
461 static OCEAN_FORCE_INLINE uint16x8_t
moveHighBits16_8(
const uint16x8_t& value);
/**
 * Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
 * The definition loads four elements at the offsets 0, 1 and 2 of each row, so six elements per row are read.
 * @param rowTop The top row of the 3x3 blocks, must be valid
 * @param rowCenter The center row of the 3x3 blocks, must be valid
 * @param rowBottom The bottom row of the 3x3 blocks, must be valid
 * @return The four block sums, each with 32 bit precision
 */
494 static OCEAN_FORCE_INLINE int32x4_t
sum16Bit4Blocks3x3(
const short*
const rowTop,
const short*
const rowCenter,
const short*
const rowBottom);
/**
 * Multiplies two 64 bit values with two 32 bit values, lane by lane.
 * Each result lane holds the lower 64 bits of the corresponding product.
 * @param value_u_64x2 The two 64 bit factors
 * @param value_u_32x2 The two 32 bit factors
 * @return The two 64 bit products
 */
503 static OCEAN_FORCE_INLINE uint64x2_t
multiply(
const uint64x2_t& value_u_64x2,
const uint32x2_t& value_u_32x2);
/**
 * Copies the sign of four signed 32 bit values to four unsigned 32 bit values.
 * A receiver lane is negated when the matching provider lane is negative; it is kept unchanged when the provider lane is zero or positive.
 * @param signReceiver The four magnitudes receiving the sign
 * @param signProvider The four values providing the sign
 * @return The four signed results
 */
511 static OCEAN_FORCE_INLINE int32x4_t
copySign(
const uint32x4_t& signReceiver,
const int32x4_t& signProvider);
/**
 * Casts 16 float elements to 16 uint8_t elements.
 * @param sourceA_f_32x4 The float elements 0-3
 * @param sourceB_f_32x4 The float elements 4-7
 * @param sourceC_f_32x4 The float elements 8-11
 * @param sourceD_f_32x4 The float elements 12-15
 * @return The 16 converted uint8_t elements
 */
521 static OCEAN_FORCE_INLINE uint8x16_t
cast16ElementsNEON(
const float32x4_t& sourceA_f_32x4,
const float32x4_t& sourceB_f_32x4,
const float32x4_t& sourceC_f_32x4,
const float32x4_t& sourceD_f_32x4);
/**
 * Casts 16 uint8_t elements to 16 float elements.
 * @param source_u_8x16 The 16 uint8_t elements to convert
 * @return The 16 float elements, four per vector, in source order
 */
535 static OCEAN_FORCE_INLINE float32x4x4_t
cast16ElementsNEON(
const uint8x16_t& source_u_8x16);
/**
 * Casts 16 uint8_t elements given by pointer to 16 float elements.
 * NOTE(review): presumably reads exactly 16 bytes from source - confirm against the definition.
 * @param source The elements to convert, must be valid
 * @return The 16 float elements
 */
542 static OCEAN_FORCE_INLINE float32x4x4_t
cast16ElementsNEON(
const uint8_t*
const source);
/**
 * Returns the interpolated pixel values for one 2 channel 16 bit pixel.
 * The definition combines the bytes at the offsets 0, 2, size and size + 2 with the four factors, which must sum up to 128 * 128.
 * @param pixel The first byte of the top left pixel, must be valid
 * @param size Offset in bytes between the two rows that are read (presumably the frame stride - confirm with callers)
 * @param fx_y_ Factor applied to pixel[0]
 * @param fxy_ Factor applied to pixel[2]
 * @param fx_y Factor applied to pixel[size]
 * @param fxy Factor applied to pixel[size + 2]
 * @return The rounded, normalized interpolation result
 */
556 static inline unsigned int interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy);
/**
 * Square difference for one 2 channel 16 bit pixel, with interpolation factors for the second pixel only.
 * NOTE(review): the definition leaves size0 unnamed, so the first pixel is presumably used without interpolation - confirm.
 * @param pixel0 The first pixel, must be valid
 * @param pixel1 The second pixel, must be valid
 * @param size0 Row offset of the first frame (unused by the definition)
 * @param size1 Row offset of the second frame
 * @param f1x_y_ First interpolation factor of the second pixel; the four f1 factors must sum up to 128 * 128
 * @param f1xy_ Second interpolation factor of the second pixel
 * @param f1x_y Third interpolation factor of the second pixel
 * @param f1xy Fourth interpolation factor of the second pixel
 * @return The resulting square difference
 */
570 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
/**
 * Square difference between two interpolated 2 channel 16 bit pixels.
 * Both pixels are interpolated with interpolation2Channel16Bit1x1() before the square distance is determined.
 * @param pixel0 The first pixel, must be valid
 * @param pixel1 The second pixel, must be valid
 * @param size0 Row offset of the first frame
 * @param size1 Row offset of the second frame
 * @param f0x_y_ First interpolation factor of the first pixel; the four f0 factors must sum up to 128 * 128
 * @param f0xy_ Second interpolation factor of the first pixel
 * @param f0x_y Third interpolation factor of the first pixel
 * @param f0xy Fourth interpolation factor of the first pixel
 * @param f1x_y_ First interpolation factor of the second pixel; the four f1 factors must sum up to 128 * 128
 * @param f1xy_ Second interpolation factor of the second pixel
 * @param f1x_y Third interpolation factor of the second pixel
 * @param f1xy Fourth interpolation factor of the second pixel
 * @return The resulting square difference
 */
588 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
591constexpr uint8x8_t
NEON::create_uint8x8(
const uint8_t v0,
const uint8_t v1,
const uint8_t v2,
const uint8_t v3,
const uint8_t v4,
const uint8_t v5,
const uint8_t v6,
const uint8_t v7)
593#ifdef OCEAN_COMPILER_MSC
594 return uint8x8_t{{uint64_t(v0) | (uint64_t(v1) << 8) | (uint64_t(v2) << 16) | (uint64_t(v3) << 24) | (uint64_t(v4) << 32) | (uint64_t(v5) << 40) | (uint64_t(v6) << 48) | (uint64_t(v7) << 56)}};
596 return uint8x8_t{v0, v1, v2, v3, v4, v5, v6, v7};
600constexpr uint8x16_t
NEON::create_uint8x16(
const uint8_t v0,
const uint8_t v1,
const uint8_t v2,
const uint8_t v3,
const uint8_t v4,
const uint8_t v5,
const uint8_t v6,
const uint8_t v7,
const uint8_t v8,
const uint8_t v9,
const uint8_t v10,
const uint8_t v11,
const uint8_t v12,
const uint8_t v13,
const uint8_t v14,
const uint8_t v15)
602#ifdef OCEAN_COMPILER_MSC
603 return uint8x16_t{{uint64_t(v0) | (uint64_t(v1) << 8) | (uint64_t(v2) << 16) | (uint64_t(v3) << 24) | (uint64_t(v4) << 32) | (uint64_t(v5) << 40) | (uint64_t(v6) << 48) | (uint64_t(v7) << 56), uint64_t(v8) | (uint64_t(v9) << 8) | (uint64_t(v10) << 16) | (uint64_t(v11) << 24) | (uint64_t(v12) << 32) | (uint64_t(v13) << 40) | (uint64_t(v14) << 48) | (uint64_t(v15) << 56)}};
605 return uint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
609constexpr int8x16_t
NEON::create_int8x16(
const int8_t v0,
const int8_t v1,
const int8_t v2,
const int8_t v3,
const int8_t v4,
const int8_t v5,
const int8_t v6,
const int8_t v7,
const int8_t v8,
const int8_t v9,
const int8_t v10,
const int8_t v11,
const int8_t v12,
const int8_t v13,
const int8_t v14,
const int8_t v15)
611#ifdef OCEAN_COMPILER_MSC
612 return int8x16_t{
create_uint8x16(uint8_t(v0), uint8_t(v1), uint8_t(v2), uint8_t(v3), uint8_t(v4), uint8_t(v5), uint8_t(v6), uint8_t(v7), uint8_t(v8), uint8_t(v9), uint8_t(v10), uint8_t(v11), uint8_t(v12), uint8_t(v13), uint8_t(v14), uint8_t(v15))};
614 return int8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
618constexpr int16x8_t
NEON::create_int16x8(
const int16_t v0,
const int16_t v1,
const int16_t v2,
const int16_t v3,
const int16_t v4,
const int16_t v5,
const int16_t v6,
const int16_t v7)
620#ifdef OCEAN_COMPILER_MSC
621 return int16x8_t{{uint64_t(uint16_t(v0)) | (uint64_t(uint16_t(v1)) << 16) | (uint64_t(uint16_t(v2)) << 32) | (uint64_t(uint16_t(v3)) << 48), uint64_t(uint16_t(v4)) | (uint64_t(uint16_t(v5)) << 16) | (uint64_t(uint16_t(v6)) << 32) | (uint64_t(uint16_t(v7)) << 48)}};
623 return int16x8_t{v0, v1, v2, v3, v4, v5, v6, v7};
627constexpr uint32x4_t
NEON::create_uint32x4(
const uint32_t v0,
const uint32_t v1,
const uint32_t v2,
const uint32_t v3)
629#ifdef OCEAN_COMPILER_MSC
630 return uint32x4_t{{uint64_t(v0) | (uint64_t(v1) << 32), uint64_t(v2) | (uint64_t(v3) << 32)}};
632 return uint32x4_t{v0, v1, v2, v3};
636constexpr uint16x8_t
NEON::create_uint16x8(
const uint16_t v0,
const uint16_t v1,
const uint16_t v2,
const uint16_t v3,
const uint16_t v4,
const uint16_t v5,
const uint16_t v6,
const uint16_t v7)
638#ifdef OCEAN_COMPILER_MSC
639 return uint16x8_t{{uint64_t(v0) | (uint64_t(v1) << 16) | (uint64_t(v2) << 32) | (uint64_t(v3) << 48), uint64_t(v4) | (uint64_t(v5) << 16) | (uint64_t(v6) << 32) | (uint64_t(v7) << 48)}};
641 return uint16x8_t{v0, v1, v2, v3, v4, v5, v6, v7};
647 ocean_assert(image0 && image1);
649 const uint8x16_t row0 = vld1q_u8(image0);
650 const uint8x16_t row1 = vld1q_u8(image1);
652 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFF00000000000000ull), vdup_n_u8(0xFFu));
658 ocean_assert(image0 && image1);
660 const uint8x16_t row0 = vld1q_u8(image0);
661 const uint8x16_t row1 = vld1q_u8(image1);
663 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFF000000000000ull), vdup_n_u8(0xFFu));
669 ocean_assert(image0 && image1);
671 const uint8x16_t row0 = vld1q_u8(image0);
672 const uint8x16_t row1 = vld1q_u8(image1);
674 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFF0000000000ull), vdup_n_u8(0xFFu));
680 ocean_assert(image0 && image1);
682 const uint8x16_t row0 = vld1q_u8(image0);
683 const uint8x16_t row1 = vld1q_u8(image1);
685 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFF00000000ull), vdup_n_u8(0xFFu));
691 ocean_assert(image0 && image1);
693 const uint8x16_t row0 = vld1q_u8(image0);
694 const uint8x16_t row1 = vld1q_u8(image1);
696 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFF000000ull), vdup_n_u8(0xFFu));
702 ocean_assert(image0 && image1);
704 const uint8x16_t row0 = vld1q_u8(image0);
705 const uint8x16_t row1 = vld1q_u8(image1);
707 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFF0000ull), vdup_n_u8(0xFFu));
713 ocean_assert(image0 && image1);
715 const uint8x16_t row0 = vld1q_u8(image0);
716 const uint8x16_t row1 = vld1q_u8(image1);
718 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFFFF00ull), vdup_n_u8(0xFFu));
724 ocean_assert(image0 && image1);
726 const uint8x16_t row0 = vld1q_u8(image0);
727 const uint8x16_t row1 = vld1q_u8(image1);
729 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000000000FFull));
735 ocean_assert(image0 && image1);
737 const uint8x16_t row0 = vld1q_u8(image0);
738 const uint8x16_t row1 = vld1q_u8(image1);
740 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000000000FFFFull));
746 ocean_assert(image0 && image1);
748 const uint8x16_t row0 = vld1q_u8(image0);
749 const uint8x16_t row1 = vld1q_u8(image1);
751 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000000000FFFFFFull));
757 ocean_assert(image0 && image1);
759 const uint8x16_t row0 = vld1q_u8(image0);
760 const uint8x16_t row1 = vld1q_u8(image1);
762 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000FFFFFFFFull));
768 ocean_assert(image0 && image1);
770 const uint8x16_t row0 = vld1q_u8(image0);
771 const uint8x16_t row1 = vld1q_u8(image1);
773 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000FFFFFFFFFFull));
779 ocean_assert(image0 && image1);
781 const uint8x16_t row0 = vld1q_u8(image0);
782 const uint8x16_t row1 = vld1q_u8(image1);
784 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000FFFFFFFFFFFFull));
790 ocean_assert(image0 && image1);
792 const uint8x16_t row0 = vld1q_u8(image0);
793 const uint8x16_t row1 = vld1q_u8(image1);
795 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00FFFFFFFFFFFFFFull));
801 ocean_assert(image0 && image1);
803 uint8x16_t row0 = vld1q_u8(image0);
804 uint8x16_t row1 = vld1q_u8(image1);
812 uint8x16_t subtract = vabdq_u8(row0, row1);
814 uint8x8_t subtractLow = vget_low_u8(subtract);
815 uint8x8_t subtractHigh = vget_high_u8(subtract);
817 uint16x8_t squareLow = vmull_u8(subtractLow, subtractLow);
818 uint16x8_t squareHigh = vmull_u8(subtractHigh, subtractHigh);
820 return vaddq_u32(vaddl_u16(vget_low_u16(squareLow), vget_low_u16(squareHigh)), vaddl_u16(vget_high_u16(squareLow), vget_high_u16(squareHigh)));
825 ocean_assert(image0 && image1);
827 const uint8x8_t row0 = vld1_u8(image0);
828 const uint8x8_t row1 = vld1_u8(image1);
836 const uint8x8_t subtract = vorr_u8(vqsub_u8(row0, row1), vqsub_u8(row1, row0));
840 const uint16x4_t subtractHigh =
moveHighBits16_8(vreinterpret_u16_u8(subtract));
842 const uint16x8_t subtractCombined = vcombine_u16(subtractLow, subtractHigh);
845 const uint16x8_t square = vmulq_u16(subtractCombined, subtractCombined);
853 ocean_assert(image0 && image1);
855 uint8x16_t row0 = vld1q_u8(image0);
856 uint8x16_t row1 = vld1q_u8(image1);
864 uint8x16_t subtract = vabdq_u8(row0, row1);
866 uint16x8_t add16 = vaddl_u8(vget_low_u8(subtract), vget_high_u8(subtract));
868 return vaddl_u16(vget_low_u16(add16), vget_high_u16(add16));
873 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
877 const uint8x16_t m128_row0 = vld1q_u8(row0);
878 const uint8x16_t m128_row1 = vld1q_u8(row1);
889 const uint8x8_t average = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m128_row0, m128_row1)), 1);
893 vst1_u8(result, average);
900 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
902 const uint8x16_t row0A_u_8x16 = vld1q_u8(row0 + 0);
903 const uint8x16_t row0B_u_8x16 = vld1q_u8(row0 + 16);
905 const uint8x16_t row1A_u_8x16 = vld1q_u8(row1 + 0);
906 const uint8x16_t row1B_u_8x16 = vld1q_u8(row1 + 16);
908 const uint8x8_t averageA_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16, row1A_u_8x16)), 1);
909 const uint8x8_t averageB_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16, row1B_u_8x16)), 1);
911 const uint8x16_t average_u_8x16 = vcombine_u8(averageA_u_8x8, averageB_u_8x8);
913 vst1q_u8(result, average_u_8x16);
918 ocean_assert(image0 && image1 && result);
920 const uint8x16_t row0 = vld1q_u8(image0);
921 const uint8x16_t row1 = vld1q_u8(image1);
924 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vpaddlq_u8(vhaddq_u8(row0, row1)), 1));
927 const uint8x8_t thresholded = vcge_u8(average, vmov_n_u8(threshold));
929 vst1_u8(result, thresholded);
934 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
941 const uint8x16x2_t m2_128_row0 = vld2q_u8(row0);
942 const uint8x16x2_t m2_128_row1 = vld2q_u8(row1);
955 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[0], m2_128_row1.val[0])), 1);
956 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[1], m2_128_row1.val[1])), 1);
960 vst2_u8(result, average);
967 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
969 const uint8x16x2_t row0A_u_8x16x2 = vld2q_u8(row0 + 0);
970 const uint8x16x2_t row0B_u_8x16x2 = vld2q_u8(row0 + 32);
972 const uint8x16x2_t row1A_u_8x16x2 = vld2q_u8(row1 + 0);
973 const uint8x16x2_t row1B_u_8x16x2 = vld2q_u8(row1 + 32);
975 const uint8x8_t averageChannel0A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[0], row1A_u_8x16x2.val[0])), 1);
976 const uint8x8_t averageChannel1A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[1], row1A_u_8x16x2.val[1])), 1);
977 const uint8x8_t averageChannel0B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[0], row1B_u_8x16x2.val[0])), 1);
978 const uint8x8_t averageChannel1B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[1], row1B_u_8x16x2.val[1])), 1);
980 uint8x16x2_t average_u_8x16x2;
982 average_u_8x16x2.val[0] = vcombine_u8(averageChannel0A_u_8x8, averageChannel0B_u_8x8);
983 average_u_8x16x2.val[1] = vcombine_u8(averageChannel1A_u_8x8, averageChannel1B_u_8x8);
985 vst2q_u8(result, average_u_8x16x2);
990 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
998 const uint8x16x3_t m3_128_row0 = vld3q_u8(row0);
999 const uint8x16x3_t m3_128_row1 = vld3q_u8(row1);
1010 uint8x8x3_t average;
1012 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[0], m3_128_row1.val[0])), 1);
1013 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[1], m3_128_row1.val[1])), 1);
1014 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[2], m3_128_row1.val[2])), 1);
1018 vst3_u8(result, average);
1035 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
1044 const uint8x16x4_t m4_128_row0 = vld4q_u8(row0);
1045 const uint8x16x4_t m4_128_row1 = vld4q_u8(row1);
1056 uint8x8x4_t average;
1058 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[0], m4_128_row1.val[0])), 1);
1059 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[1], m4_128_row1.val[1])), 1);
1060 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[2], m4_128_row1.val[2])), 1);
1061 average.val[3] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[3], m4_128_row1.val[3])), 1);
1065 vst4_u8(result, average);
1070 ocean_assert(image0 && image1 && image2 && result);
1079 uint8x8x3_t row0 = vld3_u8(image0);
1080 uint8x8x3_t row1 = vld3_u8(image1);
1081 uint8x8x3_t row2 = vld3_u8(image2);
1083 uint16x8x3_t sumPerRow;
1086 sumPerRow.val[0] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[0]), vmovl_u8(row2.val[0])), vshlq_n_u16(vmovl_u8(row1.val[0]), 1));
1087 sumPerRow.val[1] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[1]), vmovl_u8(row2.val[1])), vshlq_n_u16(vmovl_u8(row1.val[1]), 1));
1088 sumPerRow.val[2] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[2]), vmovl_u8(row2.val[2])), vshlq_n_u16(vmovl_u8(row1.val[2]), 1));
1091 const uint16x8_t sum = vaddq_u16(vaddq_u16(sumPerRow.val[0], sumPerRow.val[2]), vshlq_n_u16(sumPerRow.val[1], 1));
1094 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vaddq_u16(sum, vmovq_n_u16(8u)), 4));
1096 vst1_u8(result, average);
1101 ocean_assert(image0 && image1 && image2 && result);
1115 uint8x16x3_t row0 = vld3q_u8(image0);
1116 uint8x16x3_t row1 = vld3q_u8(image1);
1117 uint8x16x3_t row2 = vld3q_u8(image2);
1128 uint8x16x3_t averagePerRow;
1129 averagePerRow.val[0] = vhaddq_u8(vhaddq_u8(row0.val[0], row2.val[0]), row1.val[0]);
1130 averagePerRow.val[1] = vhaddq_u8(vhaddq_u8(row0.val[1], row2.val[1]), row1.val[1]);
1131 averagePerRow.val[2] = vhaddq_u8(vhaddq_u8(row0.val[2], row2.val[2]), row1.val[2]);
1134 const uint8x16_t average = vhaddq_u8(vhaddq_u8(averagePerRow.val[0], averagePerRow.val[2]), averagePerRow.val[1]);
1136 vst1q_u8(result, average);
1141 ocean_assert(source && response && width >= 10u);
1144 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1146 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1149 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1151 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1156 result.val[0] = vmovn_s16(vhsubq_s16(horizontalPlus, horizontalMinus));
1158 result.val[1] = vmovn_s16(vhsubq_s16(verticalPlus, verticalMinus));
1161 vst2_s8((int8_t*)response, result);
1166 ocean_assert(source && response && width >= 10u);
1169 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1171 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1174 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1176 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1179 int16x8_t horizontal = vhsubq_s16(horizontalPlus, horizontalMinus);
1181 int16x8_t vertical = vhsubq_s16(verticalPlus, verticalMinus);
1186 result.val[0] = vmulq_s16(horizontal, horizontal);
1188 result.val[1] = vmulq_s16(vertical, vertical);
1190 result.val[2] = vmulq_s16(horizontal, vertical);
1193 vst3q_s16(response, result);
1198#if defined(__aarch64__)
1200 return vaddvq_u32(value_u_32x4);
1204 const uint32x2_t sum_u_32x2 = vpadd_u32(vget_low_u32(value_u_32x4), vget_high_u32(value_u_32x4));
1205 return vget_lane_u32(vpadd_u32(sum_u_32x2, sum_u_32x2), 0);
1212 return vandq_u32(value, vmovq_n_u32(0x0000FFFFu));
1217 return vand_u16(value, vreinterpret_u16_u32(vmov_n_u32(0x00FF00FFu)));
1222 return vandq_u16(value, vreinterpretq_u16_u32(vmovq_n_u32(0x00FF00FFu)));
1227 return vshrq_n_u32(value, 16);
1232 return vshr_n_u16(value, 8);
1237 return vshrq_n_u16(value, 8);
1242 return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high));
1247 return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
1250OCEAN_FORCE_INLINE int32x4_t
NEON::sum16Bit4Blocks3x3(
const short*
const rowTop,
const short*
const rowCenter,
const short*
const rowBottom)
1252 ocean_assert(rowTop !=
nullptr);
1253 ocean_assert(rowCenter !=
nullptr);
1254 ocean_assert(rowBottom !=
nullptr);
1271 const int16x4_t top_0_s_16x4 = vld1_s16(rowTop + 0);
1272 const int16x4_t top_1_s_16x4 = vld1_s16(rowTop + 1);
1273 const int16x4_t top_2_s_16x4 = vld1_s16(rowTop + 2);
1276 const int16x4_t center_0_s_16x4 = vld1_s16(rowCenter + 0);
1277 const int16x4_t center_1_s_16x4 = vld1_s16(rowCenter + 1);
1278 const int16x4_t center_2_s_16x4 = vld1_s16(rowCenter + 2);
1281 const int16x4_t bottom_0_s_16x4 = vld1_s16(rowBottom + 0);
1282 const int16x4_t bottom_1_s_16x4 = vld1_s16(rowBottom + 1);
1283 const int16x4_t bottom_2_s_16x4 = vld1_s16(rowBottom + 2);
1286 const int32x4_t result_A_s_32x4 = vaddl_s16(top_0_s_16x4, top_2_s_16x4);
1287 const int32x4_t result_B_s_32x4 = vaddl_s16(center_0_s_16x4, center_2_s_16x4);
1288 const int32x4_t result_C_s_32x4 = vaddl_s16(bottom_0_s_16x4, bottom_2_s_16x4);
1289 const int32x4_t result_D_s_32x4 = vaddl_s16(top_1_s_16x4, center_1_s_16x4);
1292 const int32x4_t result_E_s_32x4 = vaddq_s32(result_A_s_32x4, result_B_s_32x4);
1293 const int32x4_t result_F_s_32x4 = vaddq_s32(result_C_s_32x4, result_D_s_32x4);
1295 const int32x4_t result_G_s_32x4 = vaddq_s32(result_E_s_32x4, result_F_s_32x4);
1298 return vaddw_s16(result_G_s_32x4, bottom_1_s_16x4);
1301OCEAN_FORCE_INLINE uint64x2_t
NEON::multiply(
const uint64x2_t& value_u_64x2,
const uint32x2_t& value_u_32x2)
1308 const uint32x2x2_t value64_lowHigh_32x2x2 = vtrn_u32(vget_low_u32(vreinterpretq_u32_u64(value_u_64x2)), vget_high_u32(vreinterpretq_u32_u64(value_u_64x2)));
1310 const uint64x2_t multiplication_low_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[0], value_u_32x2);
1311 const uint64x2_t multiplication_high_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[1], value_u_32x2);
1313 const uint64x2_t shiftedMultiplication_high_64x2 = vshlq_n_u64(multiplication_high_64x2, 32);
1315 return vaddq_u64(shiftedMultiplication_high_64x2, multiplication_low_64x2);
1318OCEAN_FORCE_INLINE int32x4_t
NEON::copySign(
const uint32x4_t& signReceiver_u_32x4,
const int32x4_t& signProvider_s_32x4)
1320 const int32x4_t negativeSignReceiver_u_32x4 = vnegq_s32(vreinterpretq_s32_u32(signReceiver_u_32x4));
1322 const uint32x4_t isNegativeMask_u_32x4 = vcltq_s32(signProvider_s_32x4, vdupq_n_s32(0));
1323 const uint32x4_t isPositiveMask_u_32x4 = vcgeq_s32(signProvider_s_32x4, vdupq_n_s32(0));
1325 return vreinterpretq_s32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_s32(negativeSignReceiver_u_32x4), isNegativeMask_u_32x4), vandq_u32(signReceiver_u_32x4, isPositiveMask_u_32x4)));
1328OCEAN_FORCE_INLINE uint8x16_t
NEON::cast16ElementsNEON(
const float32x4_t& sourceA_f_32x4,
const float32x4_t& sourceB_f_32x4,
const float32x4_t& sourceC_f_32x4,
const float32x4_t& sourceD_f_32x4)
1330 const uint32x4_t targetA_u_32x4 = vcvtq_u32_f32(sourceA_f_32x4);
1331 const uint32x4_t targetB_u_32x4 = vcvtq_u32_f32(sourceB_f_32x4);
1332 const uint32x4_t targetC_u_32x4 = vcvtq_u32_f32(sourceC_f_32x4);
1333 const uint32x4_t targetD_u_32x4 = vcvtq_u32_f32(sourceD_f_32x4);
1335 const uint16x8_t targetA_u_16x8 = vcombine_u16(vmovn_u32(targetA_u_32x4), vmovn_u32(targetB_u_32x4));
1336 const uint16x8_t targetB_u_16x8 = vcombine_u16(vmovn_u32(targetC_u_32x4), vmovn_u32(targetD_u_32x4));
1338 return vcombine_u8(vmovn_u16(targetA_u_16x8), vmovn_u16(targetB_u_16x8));
1343 ocean_assert(source !=
nullptr);
1346 for (
unsigned int n = 0u; n < 16u; ++n)
1348 ocean_assert(source[n] >= 0.0f && source[n] < 256.0f);
1352 return cast16ElementsNEON(vld1q_f32(source + 0), vld1q_f32(source + 4), vld1q_f32(source + 8), vld1q_f32(source + 12));
1357 const uint16x8_t sourceA_u_16x8 = vmovl_u8(vget_low_u8(source_u_8x16));
1358 const uint16x8_t sourceB_u_16x8 = vmovl_u8(vget_high_u8(source_u_8x16));
1360 const uint32x4_t sourceA_u_32x4 = vmovl_u16(vget_low_u16(sourceA_u_16x8));
1361 const uint32x4_t sourceB_u_32x4 = vmovl_u16(vget_high_u16(sourceA_u_16x8));
1362 const uint32x4_t sourceC_u_32x4 = vmovl_u16(vget_low_u16(sourceB_u_16x8));
1363 const uint32x4_t sourceD_u_32x4 = vmovl_u16(vget_high_u16(sourceB_u_16x8));
1365 float32x4x4_t result_u_32x4x4;
1366 result_u_32x4x4.val[0] = vcvtq_f32_u32(sourceA_u_32x4);
1367 result_u_32x4x4.val[1] = vcvtq_f32_u32(sourceB_u_32x4);
1368 result_u_32x4x4.val[2] = vcvtq_f32_u32(sourceC_u_32x4);
1369 result_u_32x4x4.val[3] = vcvtq_f32_u32(sourceD_u_32x4);
1371 return result_u_32x4x4;
1376 ocean_assert(source !=
nullptr);
1381inline unsigned int NEON::interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy)
1383 ocean_assert(pixel);
1384 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
1386 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
/**
 * Square difference for one 2 channel 16 bit pixel, taking interpolation factors for the second pixel only.
 * The size parameter of the first frame is left unnamed and therefore is not used.
 * NOTE(review): the return statement is not visible in this excerpt; presumably pixel0 is compared against the interpolated pixel1 - confirm.
 */
1389inline unsigned int NEON::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int ,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
// both pixel pointers must be valid
1391 ocean_assert(pixel0 && pixel1);
// the four interpolation factors must sum up to 128 * 128 = 16384
1393 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1398inline unsigned int NEON::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
1400 ocean_assert(pixel0 && pixel1);
1402 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
1403 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1405 return sqrDistance(
interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy),
interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
This class implements computer vision functions using NEON extensions.
Definition NEON.h:34
static uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:678
static constexpr int8x16_t create_int8x16(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3, const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7, const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11, const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
Creates an int8x16_t vector from 16 individual int8_t values.
Definition NEON.h:609
static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:932
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1196
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint8_t threshold=192u)
Averages 16 elements of 2x2 blocks for 1 channel binary (0x00 or 0xFF) frames.
Definition NEON.h:916
static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:871
static uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:788
static uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:733
static void average48Elements1Channel8Bit3x3Approximation(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:1099
static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:963
static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t &value)
Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
Definition NEON.h:1230
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 8 foll...
Definition NEON.h:1164
static uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute difference determination for 16 elements with 8 bit precision.
Definition NEON.h:851
static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t &low, const uint32x4_t &high)
Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
Definition NEON.h:1240
static void average24Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:1068
static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:896
static uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:689
static constexpr int16x8_t create_int16x8(const int16_t v0, const int16_t v1, const int16_t v2, const int16_t v3, const int16_t v4, const int16_t v5, const int16_t v6, const int16_t v7)
Creates an int16x8_t vector from 8 individual int16_t values.
Definition NEON.h:618
static uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:667
static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t &value)
Removes (sets to zero) the high 8 bits of four 16 bit elements.
Definition NEON.h:1215
static uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:777
static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t &value)
Moves the high 16 bits of four 32 bit elements to the low 16 bits and fills the high bits with 0.
Definition NEON.h:1225
static uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:645
static constexpr uint16x8_t create_uint16x8(const uint16_t v0, const uint16_t v1, const uint16_t v2, const uint16_t v3, const uint16_t v4, const uint16_t v5, const uint16_t v6, const uint16_t v7)
Creates a uint16x8_t vector from 8 individual uint16_t values.
Definition NEON.h:636
static constexpr uint8x8_t create_uint8x8(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7)
Creates a uint8x8_t vector from 8 individual uint8_t values.
Definition NEON.h:591
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition NEON.h:1328
static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t &low, const uint16x8_t &high)
Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
Definition NEON.h:1245
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition NEON.h:1381
static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short *const rowTop, const short *const rowCenter, const short *const rowBottom)
Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
Definition NEON.h:1250
static uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 11 elements of a 16-element buffer with 8 bit precision.
Definition NEON.h:744
static uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 14 elements of a 16-element buffer with 8 bit precision.
Definition NEON.h:700
static uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 9 elements of a 16-element buffer with 8 bit precision.
Definition NEON.h:722
static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition NEON.h:1033
static uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 15 elements of a 16-element buffer with 8 bit precision.
Definition NEON.h:711
static uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 10 elements of a 16-element buffer with 8 bit precision.
Definition NEON.h:656
static uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of a 16-element buffer with 8 bit precision.
Definition NEON.h:755
static uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition NEON.h:799
static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition NEON.h:988
static constexpr uint32x4_t create_uint32x4(const uint32_t v0, const uint32_t v1, const uint32_t v2, const uint32_t v3)
Creates a uint32x4_t vector from 4 individual uint32_t values.
Definition NEON.h:627
static uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 8 elements with 8 bit precision.
Definition NEON.h:823
static uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of a 16-element buffer with 8 bit precision.
Definition NEON.h:766
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition NEON.h:1389
static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t &signReceiver, const int32x4_t &signProvider)
Copies the sign of a given value to another one.
Definition NEON.h:1318
static constexpr uint8x16_t create_uint8x16(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7, const uint8_t v8, const uint8_t v9, const uint8_t v10, const uint8_t v11, const uint8_t v12, const uint8_t v13, const uint8_t v14, const uint8_t v15)
Creates a uint8x16_t vector from 16 individual uint8_t values.
Definition NEON.h:600
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 b...
Definition NEON.h:1139
static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t &value_u_64x2, const uint32x2_t &value_u_32x2)
Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t values...
Definition NEON.h:1301
static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t &value)
Removes (sets to zero) the high 16 bits of four 32 bit elements.
Definition NEON.h:1210
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15