8 #ifndef META_OCEAN_CV_NEON_H
9 #define META_OCEAN_CV_NEON_H
17 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
19 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
/**
 * Prefetches a block of temporal memory into all cache levels.
 * @param data Address of the memory to prefetch
 */
41 static inline void prefetchT0(
const void*
const data);
/**
 * Prefetches a block of temporal memory in all cache levels except the 0th cache level.
 * @param data Address of the memory to prefetch
 */
47 static inline void prefetchT1(
const void*
const data);
/**
 * Prefetches a block of temporal memory in all cache levels, except the 0th and 1st cache levels.
 * @param data Address of the memory to prefetch
 */
53 static inline void prefetchT2(
const void*
const data);
/**
 * Prefetches a block of non-temporal memory into a non-temporal cache structure.
 * @param data Address of the memory to prefetch
 */
59 static inline void prefetchNTA(
const void*
const data);
/**
 * Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
 * The definition reads 24 bytes from each input and writes 8 bytes to the result.
 * @param image0 First input buffer, must be valid (presumably the top row - confirm against callers)
 * @param image1 Second input buffer, must be valid (receives the doubled weight in the definition)
 * @param image2 Third input buffer, must be valid
 * @param result Receives the 8 filtered values, must be valid
 */
261 static inline void average24Elements1Channel8Bit3x3(
const uint8_t*
const image0,
const uint8_t*
const image1,
const uint8_t*
const image2, uint8_t*
const result);
/**
 * Sums the four 32 bit values and returns the result.
 * @param value The vector holding the four 32 bit lanes to add up
 * @return Sum of all four lanes (plain unsigned addition, so it wraps on overflow)
 */
330 static OCEAN_FORCE_INLINE
unsigned int sum32x4ByLanes(
const uint32x4_t& value);
/**
 * Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
 * @param value The four 16 bit elements
 * @return The elements shifted right by 8 bits
 */
375 static OCEAN_FORCE_INLINE uint16x4_t
moveHighBits16_8(
const uint16x4_t& value);
/**
 * Overload of moveHighBits16_8() for eight 16 bit elements.
 * @param value The eight 16 bit elements
 * @return The elements shifted right by 8 bits
 */
384 static OCEAN_FORCE_INLINE uint16x8_t
moveHighBits16_8(
const uint16x8_t& value);
/**
 * Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
 * The definition reads six consecutive values from each row (offsets 0..5).
 * @param rowTop Top row, must be valid
 * @param rowCenter Center row, must be valid
 * @param rowBottom Bottom row, must be valid
 * @return The four 3x3 block sums as 32 bit integers
 */
417 static OCEAN_FORCE_INLINE int32x4_t
sum16Bit4Blocks3x3(
const short*
const rowTop,
const short*
const rowCenter,
const short*
const rowBottom);
/**
 * Multiplies two uint64_t values with two uint32_t values, lane by lane.
 * @param value_u_64x2 The two 64 bit factors
 * @param value_u_32x2 The two 32 bit factors
 * @return The two products, each reduced to 64 bits (higher bits are dropped)
 */
426 static OCEAN_FORCE_INLINE uint64x2_t
multiply(
const uint64x2_t& value_u_64x2,
const uint32x2_t& value_u_32x2);
/**
 * Copies the sign of a given value to another one.
 * @param signReceiver The four unsigned magnitudes receiving the sign
 * @param signProvider The four signed values providing the sign; zero is treated as non-negative by the definition
 * @return The magnitudes, negated in every lane whose provider is negative
 */
434 static OCEAN_FORCE_INLINE int32x4_t
copySign(
const uint32x4_t& signReceiver,
const int32x4_t& signProvider);
/**
 * Casts 16 float elements to 16 uint8_t elements.
 * @param sourceA_f_32x4 Float elements 0..3
 * @param sourceB_f_32x4 Float elements 4..7
 * @param sourceC_f_32x4 Float elements 8..11
 * @param sourceD_f_32x4 Float elements 12..15
 * @return The 16 converted 8 bit elements, in the same order
 */
444 static OCEAN_FORCE_INLINE uint8x16_t
cast16ElementsNEON(
const float32x4_t& sourceA_f_32x4,
const float32x4_t& sourceB_f_32x4,
const float32x4_t& sourceC_f_32x4,
const float32x4_t& sourceD_f_32x4);
/**
 * Casts 16 uint8_t elements held in one vector to 16 float elements.
 * @param source_u_8x16 The 16 elements to convert
 * @return Four vectors with four floats each, in the same order as the source
 */
458 static OCEAN_FORCE_INLINE float32x4x4_t
cast16ElementsNEON(
const uint8x16_t& source_u_8x16);
/**
 * Casts 16 uint8_t elements read from memory to 16 float elements.
 * @param source The 16 elements to convert, must be valid
 * @return Four vectors with four floats each (presumably in source order - the body is not visible in this listing)
 */
465 static OCEAN_FORCE_INLINE float32x4x4_t
cast16ElementsNEON(
const uint8_t*
const source);
/**
 * Returns the interpolated pixel value for one 2 channel 16 bit pixel.
 * The definition blends pixel[0], pixel[2], pixel[size] and pixel[size + 2] with the four given factors.
 * @param pixel First byte of the value to interpolate, must be valid
 * @param size Offset (in bytes) from pixel to the corresponding value of the second row used
 * @param fx_y_ Factor applied to pixel[0]
 * @param fxy_ Factor applied to pixel[2]
 * @param fx_y Factor applied to pixel[size]
 * @param fxy Factor applied to pixel[size + 2]
 * @return The rounded interpolation result; the four factors must sum to 128 * 128
 */
479 static inline unsigned int interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy);
/**
 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
 * Only interpolation factors for the second pixel are passed; the definition leaves size0 unnamed (unused).
 * @param pixel0 First pixel, must be valid (presumably used without interpolation - the return statement is not visible in this listing)
 * @param pixel1 Second pixel, must be valid
 * @param size0 Unused by the definition
 * @param size1 Row offset for the second pixel
 * @param f1x_y_ First interpolation factor for the second pixel
 * @param f1xy_ Second interpolation factor for the second pixel
 * @param f1x_y Third interpolation factor for the second pixel
 * @param f1xy Fourth interpolation factor for the second pixel; all four must sum to 128 * 128
 * @return The resulting square difference
 */
493 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
/**
 * Returns the square difference between two interpolated 2 channel 16 bit pixel values.
 * Both pixels are interpolated with interpolation2Channel16Bit1x1() before sqrDistance() is applied.
 * @param pixel0 First pixel, must be valid
 * @param pixel1 Second pixel, must be valid
 * @param size0 Row offset for the first pixel
 * @param size1 Row offset for the second pixel
 * @param f0x_y_ First interpolation factor for the first pixel
 * @param f0xy_ Second interpolation factor for the first pixel
 * @param f0x_y Third interpolation factor for the first pixel
 * @param f0xy Fourth interpolation factor for the first pixel; all four must sum to 128 * 128
 * @param f1x_y_ First interpolation factor for the second pixel
 * @param f1xy_ Second interpolation factor for the second pixel
 * @param f1x_y Third interpolation factor for the second pixel
 * @param f1xy Fourth interpolation factor for the second pixel; all four must sum to 128 * 128
 * @return The resulting square difference
 */
511 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
516 __builtin_prefetch(data, 0, 3); // prefetchT0: read prefetch, locality 3 = high temporal locality, keep in all cache levels
521 __builtin_prefetch(data, 0, 2); // prefetchT1: read prefetch, locality 2 = moderate temporal locality
526 __builtin_prefetch(data, 0, 1); // prefetchT2: read prefetch, locality 1 = low temporal locality
531 __builtin_prefetch(data, 0, 0); // prefetchNTA: read prefetch, locality 0 = no temporal locality, need not stay cached
// Body fragments of the sumSquareDifferences8BitBack{9..15}Elements() family.
// Each variant loads 16 bytes from both buffers and builds a byte mask that is 0xFF for the
// last N lanes only: the upper bytes of the low 64 bit half plus the entire high half.
// NOTE(review): the statements applying the mask and returning the sums are not visible in this listing.

// Back 9 elements: lanes 7..15 selected
536 ocean_assert(image0 && image1);
538 const uint8x16_t row0 = vld1q_u8(image0);
539 const uint8x16_t row1 = vld1q_u8(image1);
541 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFF00000000000000ull), vdup_n_u8(0xFFu));
// Back 10 elements: lanes 6..15 selected
547 ocean_assert(image0 && image1);
549 const uint8x16_t row0 = vld1q_u8(image0);
550 const uint8x16_t row1 = vld1q_u8(image1);
552 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFF000000000000ull), vdup_n_u8(0xFFu));
// Back 11 elements: lanes 5..15 selected
558 ocean_assert(image0 && image1);
560 const uint8x16_t row0 = vld1q_u8(image0);
561 const uint8x16_t row1 = vld1q_u8(image1);
563 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFF0000000000ull), vdup_n_u8(0xFFu));
// Back 12 elements: lanes 4..15 selected
569 ocean_assert(image0 && image1);
571 const uint8x16_t row0 = vld1q_u8(image0);
572 const uint8x16_t row1 = vld1q_u8(image1);
574 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFF00000000ull), vdup_n_u8(0xFFu));
// Back 13 elements: lanes 3..15 selected
580 ocean_assert(image0 && image1);
582 const uint8x16_t row0 = vld1q_u8(image0);
583 const uint8x16_t row1 = vld1q_u8(image1);
585 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFF000000ull), vdup_n_u8(0xFFu));
// Back 14 elements: lanes 2..15 selected
591 ocean_assert(image0 && image1);
593 const uint8x16_t row0 = vld1q_u8(image0);
594 const uint8x16_t row1 = vld1q_u8(image1);
596 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFF0000ull), vdup_n_u8(0xFFu));
// Back 15 elements: lanes 1..15 selected
602 ocean_assert(image0 && image1);
604 const uint8x16_t row0 = vld1q_u8(image0);
605 const uint8x16_t row1 = vld1q_u8(image1);
607 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFFFF00ull), vdup_n_u8(0xFFu));
// Body fragments of the sumSquareDifference8BitFront{9..15}Elements() family.
// Each variant loads 16 bytes from both buffers and builds a byte mask that is 0xFF for the
// first N lanes only: the entire low 64 bit half plus the lower bytes of the high half.
// NOTE(review): the statements applying the mask and returning the sums are not visible in this listing.

// Front 9 elements: lanes 0..8 selected
613 ocean_assert(image0 && image1);
615 const uint8x16_t row0 = vld1q_u8(image0);
616 const uint8x16_t row1 = vld1q_u8(image1);
618 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000000000FFull));
// Front 10 elements: lanes 0..9 selected
624 ocean_assert(image0 && image1);
626 const uint8x16_t row0 = vld1q_u8(image0);
627 const uint8x16_t row1 = vld1q_u8(image1);
629 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000000000FFFFull));
// Front 11 elements: lanes 0..10 selected
635 ocean_assert(image0 && image1);
637 const uint8x16_t row0 = vld1q_u8(image0);
638 const uint8x16_t row1 = vld1q_u8(image1);
640 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000000000FFFFFFull));
// Front 12 elements: lanes 0..11 selected
646 ocean_assert(image0 && image1);
648 const uint8x16_t row0 = vld1q_u8(image0);
649 const uint8x16_t row1 = vld1q_u8(image1);
651 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000FFFFFFFFull));
// Front 13 elements: lanes 0..12 selected
657 ocean_assert(image0 && image1);
659 const uint8x16_t row0 = vld1q_u8(image0);
660 const uint8x16_t row1 = vld1q_u8(image1);
662 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000FFFFFFFFFFull));
// Front 14 elements: lanes 0..13 selected
668 ocean_assert(image0 && image1);
670 const uint8x16_t row0 = vld1q_u8(image0);
671 const uint8x16_t row1 = vld1q_u8(image1);
673 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000FFFFFFFFFFFFull));
// Front 15 elements: lanes 0..14 selected
679 ocean_assert(image0 && image1);
681 const uint8x16_t row0 = vld1q_u8(image0);
682 const uint8x16_t row1 = vld1q_u8(image1);
684 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00FFFFFFFFFFFFFFull));
// Body of sumSquareDifference8Bit16Elements(): sum of square differences over 16 bytes.
690 ocean_assert(image0 && image1);
// load 16 bytes from each buffer
692 uint8x16_t row0 = vld1q_u8(image0);
693 uint8x16_t row1 = vld1q_u8(image1);
// per-lane absolute difference |row0 - row1|
701 uint8x16_t subtract = vabdq_u8(row0, row1);
703 uint8x8_t subtractLow = vget_low_u8(subtract);
704 uint8x8_t subtractHigh = vget_high_u8(subtract);
// widening multiply: 8 bit differences squared into 16 bit lanes
706 uint16x8_t squareLow = vmull_u8(subtractLow, subtractLow);
707 uint16x8_t squareHigh = vmull_u8(subtractHigh, subtractHigh);
// widening adds fold the 16 squares into four 32 bit partial sums; the caller still has to add the four lanes
709 return vaddq_u32(vaddl_u16(vget_low_u16(squareLow), vget_low_u16(squareHigh)), vaddl_u16(vget_high_u16(squareLow), vget_high_u16(squareHigh)));
// Body fragment of sumSquareDifference8Bit8Elements(): square differences over 8 bytes.
// NOTE(review): the definition of subtractLow and the return statement are not visible in this listing.
714 ocean_assert(image0 && image1);
// load 8 bytes from each buffer
716 const uint8x8_t row0 = vld1_u8(image0);
717 const uint8x8_t row1 = vld1_u8(image1);
// absolute difference: one of the two saturating subtractions is zero in every lane, so OR yields |row0 - row1|
725 const uint8x8_t subtract = vorr_u8(vqsub_u8(row0, row1), vqsub_u8(row1, row0));
// view the 8 bytes as four 16 bit lanes and keep the high byte of each lane
729 const uint16x4_t subtractHigh =
moveHighBits16_8(vreinterpret_u16_u8(subtract));
731 const uint16x8_t subtractCombined = vcombine_u16(subtractLow, subtractHigh);
// square the eight differences in 16 bit precision
734 const uint16x8_t square = vmulq_u16(subtractCombined, subtractCombined);
// Body of sumAbsoluteDifference8Bit16Elements(): sum of absolute differences over 16 bytes.
742 ocean_assert(image0 && image1);
// load 16 bytes from each buffer
744 uint8x16_t row0 = vld1q_u8(image0);
745 uint8x16_t row1 = vld1q_u8(image1);
// per-lane absolute difference |row0 - row1|
753 uint8x16_t subtract = vabdq_u8(row0, row1);
// widening add of the lower and upper eight differences into 16 bit lanes
755 uint16x8_t add16 = vaddl_u8(vget_low_u8(subtract), vget_high_u8(subtract));
// widen once more: four 32 bit partial sums; the caller still has to add the four lanes
757 return vaddl_u16(vget_low_u16(add16), vget_high_u16(add16));
// Body of average16Elements1Channel8Bit2x2(): reduces 2 rows x 16 bytes to 8 averaged bytes.
762 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
// load 16 bytes from each row
766 const uint8x16_t m128_row0 = vld1q_u8(row0);
767 const uint8x16_t m128_row1 = vld1q_u8(row1);
// rounding average of the two rows, pairwise add of horizontal neighbors (16 bit), rounding shift by 1 and narrow to 8 bit
778 const uint8x8_t average = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m128_row0, m128_row1)), 1);
// store the 8 results
782 vst1_u8(result, average);
// Body of average32Elements1Channel8Bit2x2(): reduces 2 rows x 32 bytes to 16 averaged bytes.
789 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
// load 2 x 16 bytes from the first row
791 const uint8x16_t row0A_u_8x16 = vld1q_u8(row0 + 0);
792 const uint8x16_t row0B_u_8x16 = vld1q_u8(row0 + 16);
// load 2 x 16 bytes from the second row
794 const uint8x16_t row1A_u_8x16 = vld1q_u8(row1 + 0);
795 const uint8x16_t row1B_u_8x16 = vld1q_u8(row1 + 16);
// per half: rounding average of both rows, pairwise add of horizontal neighbors, rounding shift by 1 and narrow
797 const uint8x8_t averageA_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16, row1A_u_8x16)), 1);
798 const uint8x8_t averageB_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16, row1B_u_8x16)), 1);
800 const uint8x16_t average_u_8x16 = vcombine_u8(averageA_u_8x8, averageB_u_8x8);
// store the 16 results
802 vst1q_u8(result, average_u_8x16);
// Body of average16ElementsBinary1Channel8Bit2x2(): averages 2 rows x 16 bytes to 8 values and binarizes them.
807 ocean_assert(image0 && image1 && result);
// load 16 bytes from each row
809 const uint8x16_t row0 = vld1q_u8(image0);
810 const uint8x16_t row1 = vld1q_u8(image1);
// truncating (non-rounding) average of the rows, pairwise add of neighbors, shift right by 1 and narrow to 8 bit
813 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vpaddlq_u8(vhaddq_u8(row0, row1)), 1));
// lanes with average >= threshold become 0xFF, all others 0x00
816 const uint8x8_t thresholded = vcge_u8(average, vmov_n_u8(threshold));
// store the 8 results
818 vst1_u8(result, thresholded);
// Body fragment of average32Elements2Channel16Bit2x2(): reduces 2 rows x 32 bytes (2 interleaved channels) to 16 bytes.
// NOTE(review): the declaration of 'average' is not visible in this listing.
823 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
// de-interleaving load: val[0] holds 16 bytes of channel 0, val[1] holds 16 bytes of channel 1
830 const uint8x16x2_t m2_128_row0 = vld2q_u8(row0);
831 const uint8x16x2_t m2_128_row1 = vld2q_u8(row1);
// per channel: rounding average of both rows, pairwise add of horizontal neighbors, rounding shift by 1 and narrow
844 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[0], m2_128_row1.val[0])), 1);
845 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[1], m2_128_row1.val[1])), 1);
// interleaving store of 2 x 8 results
849 vst2_u8(result, average);
// Body of average64Elements2Channel16Bit2x2(): reduces 2 rows x 64 bytes (2 interleaved channels) to 32 bytes.
856 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
// de-interleaving loads of 2 x 32 bytes from the first row
858 const uint8x16x2_t row0A_u_8x16x2 = vld2q_u8(row0 + 0);
859 const uint8x16x2_t row0B_u_8x16x2 = vld2q_u8(row0 + 32);
// de-interleaving loads of 2 x 32 bytes from the second row
861 const uint8x16x2_t row1A_u_8x16x2 = vld2q_u8(row1 + 0);
862 const uint8x16x2_t row1B_u_8x16x2 = vld2q_u8(row1 + 32);
// per channel and half: rounding average of both rows, pairwise add of horizontal neighbors, rounding shift by 1 and narrow
864 const uint8x8_t averageChannel0A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[0], row1A_u_8x16x2.val[0])), 1);
865 const uint8x8_t averageChannel1A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[1], row1A_u_8x16x2.val[1])), 1);
866 const uint8x8_t averageChannel0B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[0], row1B_u_8x16x2.val[0])), 1);
867 const uint8x8_t averageChannel1B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[1], row1B_u_8x16x2.val[1])), 1);
869 uint8x16x2_t average_u_8x16x2;
// join both halves per channel
871 average_u_8x16x2.val[0] = vcombine_u8(averageChannel0A_u_8x8, averageChannel0B_u_8x8);
872 average_u_8x16x2.val[1] = vcombine_u8(averageChannel1A_u_8x8, averageChannel1B_u_8x8);
// interleaving store of 2 x 16 results
874 vst2q_u8(result, average_u_8x16x2);
// Body fragment of average48Elements3Channel24Bit2x2(): reduces 2 rows x 48 bytes (3 interleaved channels) to 24 bytes.
// NOTE(review): the declaration of 'average' is not visible in this listing.
879 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
// de-interleaving load: val[c] holds 16 bytes of channel c
887 const uint8x16x3_t m3_128_row0 = vld3q_u8(row0);
888 const uint8x16x3_t m3_128_row1 = vld3q_u8(row1);
// per channel: rounding average of both rows, pairwise add of horizontal neighbors, rounding shift by 1 and narrow
901 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[0], m3_128_row1.val[0])), 1);
902 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[1], m3_128_row1.val[1])), 1);
903 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[2], m3_128_row1.val[2])), 1);
// interleaving store of 3 x 8 results
907 vst3_u8(result, average);
// Body fragment of average64Elements4Channel32Bit2x2(): reduces 2 rows x 64 bytes (4 interleaved channels) to 32 bytes.
// NOTE(review): the declaration of 'average' is not visible in this listing.
924 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
// de-interleaving load: val[c] holds 16 bytes of channel c
933 const uint8x16x4_t m4_128_row0 = vld4q_u8(row0);
934 const uint8x16x4_t m4_128_row1 = vld4q_u8(row1);
// per channel: rounding average of both rows, pairwise add of horizontal neighbors, rounding shift by 1 and narrow
947 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[0], m4_128_row1.val[0])), 1);
948 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[1], m4_128_row1.val[1])), 1);
949 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[2], m4_128_row1.val[2])), 1);
950 average.val[3] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[3], m4_128_row1.val[3])), 1);
// interleaving store of 4 x 8 results
954 vst4_u8(result, average);
// Body of average24Elements1Channel8Bit3x3(): reduces 3 rows x 24 bytes to 8 bytes with 1-2-1 weights in both directions.
959 ocean_assert(image0 && image1 && image2 && result);
// de-interleaving load with stride 3: val[0], val[1], val[2] hold the left, middle and right value of each of the 8 triples
968 uint8x8x3_t row0 = vld3_u8(image0);
969 uint8x8x3_t row1 = vld3_u8(image1);
970 uint8x8x3_t row2 = vld3_u8(image2);
972 uint16x8x3_t sumPerRow;
// vertical pass in 16 bit precision: row0 + 2 * row1 + row2
975 sumPerRow.val[0] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[0]), vmovl_u8(row2.val[0])), vshlq_n_u16(vmovl_u8(row1.val[0]), 1));
976 sumPerRow.val[1] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[1]), vmovl_u8(row2.val[1])), vshlq_n_u16(vmovl_u8(row1.val[1]), 1));
977 sumPerRow.val[2] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[2]), vmovl_u8(row2.val[2])), vshlq_n_u16(vmovl_u8(row1.val[2]), 1));
// horizontal pass: left + 2 * middle + right; the weights now total 16
980 const uint16x8_t sum = vaddq_u16(vaddq_u16(sumPerRow.val[0], sumPerRow.val[2]), vshlq_n_u16(sumPerRow.val[1], 1));
// rounded division by 16: (sum + 8) >> 4, then narrow to 8 bit
983 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vaddq_u16(sum, vmovq_n_u16(8u)), 4));
// store the 8 results
985 vst1_u8(result, average);
// Body of average48Elements1Channel8Bit3x3Approximation(): reduces 3 rows x 48 bytes to 16 bytes.
// Works entirely in 8 bit precision with truncating halving adds, hence only an approximation.
990 ocean_assert(image0 && image1 && image2 && result);
// de-interleaving load with stride 3: val[0], val[1], val[2] hold the left, middle and right value of each of the 16 triples
1004 uint8x16x3_t row0 = vld3q_u8(image0);
1005 uint8x16x3_t row1 = vld3q_u8(image1);
1006 uint8x16x3_t row2 = vld3q_u8(image2);
// vertical pass: ((row0 + row2) / 2 + row1) / 2, each halving add truncates
1017 uint8x16x3_t averagePerRow;
1018 averagePerRow.val[0] = vhaddq_u8(vhaddq_u8(row0.val[0], row2.val[0]), row1.val[0]);
1019 averagePerRow.val[1] = vhaddq_u8(vhaddq_u8(row0.val[1], row2.val[1]), row1.val[1]);
1020 averagePerRow.val[2] = vhaddq_u8(vhaddq_u8(row0.val[2], row2.val[2]), row1.val[2]);
// horizontal pass with the same scheme: ((left + right) / 2 + middle) / 2
1023 const uint8x16_t average = vhaddq_u8(vhaddq_u8(averagePerRow.val[0], averagePerRow.val[2]), averagePerRow.val[1]);
// store the 16 results
1025 vst1q_u8(result, average);
// Body fragment of gradientHorizontalVertical8Elements1Channel8Bit(): central-difference gradients for 8 pixels.
// NOTE(review): the declaration of 'result' is not visible in this listing.
1030 ocean_assert(source && response && width >= 10u);
// left and right neighbors of the 8 pixels, widened to signed 16 bit
1033 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1035 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
// neighbors one row above and one row below ('width' is used as the row offset), widened to signed 16 bit
1038 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1040 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
// halving subtract (plus - minus) / 2, narrowed to signed 8 bit
1045 result.val[0] = vmovn_s16(vhsubq_s16(horizontalPlus, horizontalMinus));
1047 result.val[1] = vmovn_s16(vhsubq_s16(verticalPlus, verticalMinus));
// interleaving store: horizontal and vertical response alternate, 16 bytes in total
1050 vst2_s8((int8_t*)response, result);
// Body fragment of gradientHorizontalVertical8Elements3Products1Channel8Bit(): gradient products for 8 pixels.
// NOTE(review): the declaration of 'result' is not visible in this listing.
1055 ocean_assert(source && response && width >= 10u);
// left and right neighbors of the 8 pixels, widened to signed 16 bit
1058 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1060 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
// neighbors one row above and one row below ('width' is used as the row offset), widened to signed 16 bit
1063 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1065 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
// halving subtract (plus - minus) / 2 in 16 bit precision
1068 int16x8_t horizontal = vhsubq_s16(horizontalPlus, horizontalMinus);
1070 int16x8_t vertical = vhsubq_s16(verticalPlus, verticalMinus);
// the three products per pixel: h * h, v * v and h * v
1075 result.val[0] = vmulq_s16(horizontal, horizontal);
1077 result.val[1] = vmulq_s16(vertical, vertical);
1079 result.val[2] = vmulq_s16(horizontal, vertical);
// interleaving store of 3 x 8 values (24 int16_t elements)
1082 vst3q_s16(response, result);
// One-line bodies of the small helper functions; the function headers are not visible in this listing.
// sum32x4ByLanes(): adds the four 32 bit lanes
1087 return vgetq_lane_u32(value, 0) + vgetq_lane_u32(value, 1) + vgetq_lane_u32(value, 2) + vgetq_lane_u32(value, 3);
// removeHighBits32_16(): keeps the low 16 bits of each 32 bit lane
1092 return vandq_u32(value, vmovq_n_u32(0x0000FFFFu));
// removeHighBits16_8(), four lanes: keeps the low 8 bits of each 16 bit lane
1097 return vand_u16(value, vreinterpret_u16_u32(vmov_n_u32(0x00FF00FFu)));
// presumably the eight-lane overload of removeHighBits16_8(): keeps the low 8 bits of each 16 bit lane
1102 return vandq_u16(value, vreinterpretq_u16_u32(vmovq_n_u32(0x00FF00FFu)));
// moveHighBits32_16(): shifts each 32 bit lane right by 16 bits
1107 return vshrq_n_u32(value, 16);
// moveHighBits16_8(), four lanes: shifts each 16 bit lane right by 8 bits
1112 return vshr_n_u16(value, 8);
// presumably the eight-lane overload of moveHighBits16_8(): shifts each 16 bit lane right by 8 bits
1117 return vshrq_n_u16(value, 8);
// combineLowBits32x4to16x8(): saturating narrow of both inputs to 16 bit, 'low' fills lanes 0..3 and 'high' lanes 4..7
1122 return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high));
// combineLowBits16x8to8x16(): saturating narrow of both inputs to 8 bit, 'low' fills lanes 0..7 and 'high' lanes 8..15
1127 return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
// Determines the sums of four successive, overlapping 3x3 blocks of signed 16 bit values.
// Lane i of the result is the sum of row[i], row[i + 1] and row[i + 2] over all three rows,
// so six consecutive values are read from each row.
1130 OCEAN_FORCE_INLINE int32x4_t
NEON::sum16Bit4Blocks3x3(
const short*
const rowTop,
const short*
const rowCenter,
const short*
const rowBottom)
1132 ocean_assert(rowTop !=
nullptr);
1133 ocean_assert(rowCenter !=
nullptr);
1134 ocean_assert(rowBottom !=
nullptr);
// three overlapping loads per row, each shifted by one element
1151 const int16x4_t top_0_s_16x4 = vld1_s16(rowTop + 0);
1152 const int16x4_t top_1_s_16x4 = vld1_s16(rowTop + 1);
1153 const int16x4_t top_2_s_16x4 = vld1_s16(rowTop + 2);
1156 const int16x4_t center_0_s_16x4 = vld1_s16(rowCenter + 0);
1157 const int16x4_t center_1_s_16x4 = vld1_s16(rowCenter + 1);
1158 const int16x4_t center_2_s_16x4 = vld1_s16(rowCenter + 2);
1161 const int16x4_t bottom_0_s_16x4 = vld1_s16(rowBottom + 0);
1162 const int16x4_t bottom_1_s_16x4 = vld1_s16(rowBottom + 1);
1163 const int16x4_t bottom_2_s_16x4 = vld1_s16(rowBottom + 2);
// widening adds combine eight of the nine vectors pairwise into 32 bit lanes
1166 const int32x4_t result_A_s_32x4 = vaddl_s16(top_0_s_16x4, top_2_s_16x4);
1167 const int32x4_t result_B_s_32x4 = vaddl_s16(center_0_s_16x4, center_2_s_16x4);
1168 const int32x4_t result_C_s_32x4 = vaddl_s16(bottom_0_s_16x4, bottom_2_s_16x4);
1169 const int32x4_t result_D_s_32x4 = vaddl_s16(top_1_s_16x4, center_1_s_16x4);
// tree reduction of the four partial sums
1172 const int32x4_t result_E_s_32x4 = vaddq_s32(result_A_s_32x4, result_B_s_32x4);
1173 const int32x4_t result_F_s_32x4 = vaddq_s32(result_C_s_32x4, result_D_s_32x4);
1175 const int32x4_t result_G_s_32x4 = vaddq_s32(result_E_s_32x4, result_F_s_32x4);
// the ninth vector (bottom, offset 1) is widened and added last
1178 return vaddw_s16(result_G_s_32x4, bottom_1_s_16x4);
// Multiplies two 64 bit values with two 32 bit values, lane by lane, keeping the low 64 bits of each product.
// Each 64 bit value is split into 32 bit halves: result = ((high * factor) << 32) + low * factor.
1181 OCEAN_FORCE_INLINE uint64x2_t
NEON::multiply(
const uint64x2_t& value_u_64x2,
const uint32x2_t& value_u_32x2)
// transpose the 32 bit halves: val[0] = low halves of both values, val[1] = high halves of both values
1188 const uint32x2x2_t value64_lowHigh_32x2x2 = vtrn_u32(vget_low_u32(vreinterpretq_u32_u64(value_u_64x2)), vget_high_u32(vreinterpretq_u32_u64(value_u_64x2)));
// widening 32 x 32 -> 64 bit multiplications
1190 const uint64x2_t multiplication_low_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[0], value_u_32x2);
1191 const uint64x2_t multiplication_high_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[1], value_u_32x2);
// move the high partial product into place; bits shifted beyond 64 are dropped
1193 const uint64x2_t shiftedMultiplication_high_64x2 = vshlq_n_u64(multiplication_high_64x2, 32);
1195 return vaddq_u64(shiftedMultiplication_high_64x2, multiplication_low_64x2);
// Applies the sign of signProvider to the unsigned magnitude in signReceiver, lane by lane.
// A provider value of zero counts as non-negative, so the receiver is returned unchanged in that lane.
1198 OCEAN_FORCE_INLINE int32x4_t
NEON::copySign(
const uint32x4_t& signReceiver_u_32x4,
const int32x4_t& signProvider_s_32x4)
// negated receiver; despite its '_u_' suffix the variable holds signed values
1200 const int32x4_t negativeSignReceiver_u_32x4 = vnegq_s32(vreinterpretq_s32_u32(signReceiver_u_32x4));
// complementary all-ones/all-zeros masks for provider < 0 and provider >= 0
1202 const uint32x4_t isNegativeMask_u_32x4 = vcltq_s32(signProvider_s_32x4, vdupq_n_s32(0));
1203 const uint32x4_t isPositiveMask_u_32x4 = vcgeq_s32(signProvider_s_32x4, vdupq_n_s32(0));
// select per lane: negated receiver where the provider is negative, the original receiver otherwise
1205 return vreinterpretq_s32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_s32(negativeSignReceiver_u_32x4), isNegativeMask_u_32x4), vandq_u32(signReceiver_u_32x4, isPositiveMask_u_32x4)));
// Casts 16 float elements (four vectors of four) to 16 uint8_t elements, preserving their order.
// The narrowing steps use vmovn, which keeps only the low bits without saturation;
// presumably all inputs are expected to lie in [0, 256) - confirm against callers.
1208 OCEAN_FORCE_INLINE uint8x16_t
NEON::cast16ElementsNEON(
const float32x4_t& sourceA_f_32x4,
const float32x4_t& sourceB_f_32x4,
const float32x4_t& sourceC_f_32x4,
const float32x4_t& sourceD_f_32x4)
// float -> unsigned 32 bit integer
1210 const uint32x4_t targetA_u_32x4 = vcvtq_u32_f32(sourceA_f_32x4);
1211 const uint32x4_t targetB_u_32x4 = vcvtq_u32_f32(sourceB_f_32x4);
1212 const uint32x4_t targetC_u_32x4 = vcvtq_u32_f32(sourceC_f_32x4);
1213 const uint32x4_t targetD_u_32x4 = vcvtq_u32_f32(sourceD_f_32x4);
// 32 bit -> 16 bit, joining A|B and C|D
1215 const uint16x8_t targetA_u_16x8 = vcombine_u16(vmovn_u32(targetA_u_32x4), vmovn_u32(targetB_u_32x4));
1216 const uint16x8_t targetB_u_16x8 = vcombine_u16(vmovn_u32(targetC_u_32x4), vmovn_u32(targetD_u_32x4));
// 16 bit -> 8 bit, joining both halves into the final 16 bytes
1218 return vcombine_u8(vmovn_u16(targetA_u_16x8), vmovn_u16(targetB_u_16x8));
// Body fragments of three further cast16ElementsNEON() overloads; their headers are not visible in this listing.

// Overload reading 16 floats from memory: validates the range of every input in an assert loop,
// then forwards four loaded vectors to the vector overload.
// NOTE(review): the loop presumably sits inside a debug-only guard that is not visible here - confirm.
1223 ocean_assert(source !=
nullptr);
1226 for (
unsigned int n = 0u; n < 16u; ++n)
1228 ocean_assert(source[n] >= 0.0f && source[n] < 256.0f);
1232 return cast16ElementsNEON(vld1q_f32(source + 0), vld1q_f32(source + 4), vld1q_f32(source + 8), vld1q_f32(source + 12));
// Overload converting one vector of 16 bytes to 16 floats.
// widen 8 bit -> 16 bit (lower and upper eight elements)
1237 const uint16x8_t sourceA_u_16x8 = vmovl_u8(vget_low_u8(source_u_8x16));
1238 const uint16x8_t sourceB_u_16x8 = vmovl_u8(vget_high_u8(source_u_8x16));
// widen 16 bit -> 32 bit, four elements per vector, in source order
1240 const uint32x4_t sourceA_u_32x4 = vmovl_u16(vget_low_u16(sourceA_u_16x8));
1241 const uint32x4_t sourceB_u_32x4 = vmovl_u16(vget_high_u16(sourceA_u_16x8));
1242 const uint32x4_t sourceC_u_32x4 = vmovl_u16(vget_low_u16(sourceB_u_16x8));
1243 const uint32x4_t sourceD_u_32x4 = vmovl_u16(vget_high_u16(sourceB_u_16x8));
// unsigned 32 bit integer -> float; despite its '_u_' name the result holds floats
1245 float32x4x4_t result_u_32x4x4;
1246 result_u_32x4x4.val[0] = vcvtq_f32_u32(sourceA_u_32x4);
1247 result_u_32x4x4.val[1] = vcvtq_f32_u32(sourceB_u_32x4);
1248 result_u_32x4x4.val[2] = vcvtq_f32_u32(sourceC_u_32x4);
1249 result_u_32x4x4.val[3] = vcvtq_f32_u32(sourceD_u_32x4);
1251 return result_u_32x4x4;
// Overload reading 16 bytes from memory; only its pointer check is visible in this listing.
1256 ocean_assert(source !=
nullptr);
// Interpolates one value from four neighbors: pixel[0], pixel[2], pixel[size] and pixel[size + 2].
// The element stride of 2 matches the interleaved 2 channel layout named in the function.
// The four factors must sum to 128 * 128 (= 16384), which is the divisor applied at the end.
1261 inline unsigned int NEON::interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy)
1263 ocean_assert(pixel);
1264 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
// weighted sum plus 8192 (half of the divisor) so that the division rounds to nearest
1266 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
// Square difference overload taking interpolation factors for the second pixel only.
// The first size parameter is deliberately unnamed and therefore unused.
// NOTE(review): the return statement is not visible in this listing.
1269 inline unsigned int NEON::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int ,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
1271 ocean_assert(pixel0 && pixel1);
// the four factors must sum to 128 * 128
1273 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
// Square difference overload that interpolates both pixels with their own factor sets
// and returns sqrDistance() of the two interpolated values.
1278 inline unsigned int NEON::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
1280 ocean_assert(pixel0 && pixel1);
// each factor set must sum to 128 * 128
1282 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
1283 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1285 return sqrDistance(
interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy),
interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
This class implements computer vision functions using NEON extensions.
Definition: NEON.h:34
static uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 12 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:567
static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: NEON.h:821
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint8_t threshold=192u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition: NEON.h:805
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition: NEON.h:519
static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: NEON.h:760
static uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:677
static uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 10 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:622
static void average48Elements1Channel8Bit3x3Approximation(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition: NEON.h:988
static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition: NEON.h:852
static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t &value)
Moves the high 8 bits of four 16 bit elements to the low 8 bits and fill the high bits with 0.
Definition: NEON.h:1110
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 8 foll...
Definition: NEON.h:1053
static uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute difference determination for 16 elements with 8 bit precision.
Definition: NEON.h:740
static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t &low, const uint32x4_t &high)
Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
Definition: NEON.h:1120
static void average24Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition: NEON.h:957
static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition: NEON.h:785
static uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 13 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:578
static uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:556
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition: NEON.h:529
static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t &value)
Removes (sets to zero) the high 8 bits of four 16 bit elements.
Definition: NEON.h:1095
static uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 14 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:666
static OCEAN_FORCE_INLINE unsigned int sum32x4ByLanes(const uint32x4_t &value)
Sums the four 32 bit values and returns the result.
Definition: NEON.h:1085
static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t &value)
Moves the high 16 bits of four 32 bit elements to the low 16 bits and fill the high bits with 0.
Definition: NEON.h:1105
static uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 9 elements of an 16 elements buffer with 8 bit prec...
Definition: NEON.h:534
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition: NEON.h:1208
static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t &low, const uint16x8_t &high)
Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
Definition: NEON.h:1125
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition: NEON.h:1261
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition: NEON.h:524
static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short *const rowTop, const short *const rowCenter, const short *const rowBottom)
Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
Definition: NEON.h:1130
static uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 11 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:633
static uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 14 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:589
static uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 9 elements of an 16 elements buffer with 8 bit prec...
Definition: NEON.h:611
static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition: NEON.h:922
static uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 15 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:600
static uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 10 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:545
static uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:644
static uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition: NEON.h:688
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: NEON.h:514
static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition: NEON.h:877
static uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 8 elements with 8 bit precision.
Definition: NEON.h:712
static uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of an 16 elements buffer with 8 bit pre...
Definition: NEON.h:655
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition: NEON.h:1269
static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t &signReceiver, const int32x4_t &signProvider)
Copies the sign of a given value to another one.
Definition: NEON.h:1198
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 b...
Definition: NEON.h:1028
static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t &value_u_64x2, const uint32x2_t &value_u_32x2)
Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t value...
Definition: NEON.h:1181
static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t &value)
Removes (sets to zero) the high 16 bits of four 32 bit elements.
Definition: NEON.h:1090
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15