8#ifndef META_OCEAN_CV_NEON_H
9#define META_OCEAN_CV_NEON_H
17#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
19#if defined(__ARM_NEON__) || defined(__ARM_NEON)
/**
 * Prefetches a block of temporal memory into all cache levels.
 * NOTE(review): listing line 515, presumably this function's body, calls __builtin_prefetch(data, 0, 0);
 * GCC documents locality 0 as "no temporal locality" and 3 as "keep in all cache levels",
 * so the hint looks inverted relative to this description -- confirm.
 * @param data The memory address handed to the prefetch builtin
 */
41 static inline void prefetchT0(
const void*
const data);
/**
 * Prefetches a block of temporal memory in all cache levels except the 0th cache level.
 * NOTE(review): listing line 520, presumably this function's body, calls __builtin_prefetch(data, 0, 1);
 * in GCC a higher locality value means the data is kept in more cache levels, so 1 may be
 * weaker than intended here -- confirm.
 * @param data The memory address handed to the prefetch builtin
 */
47 static inline void prefetchT1(
const void*
const data);
/**
 * Prefetches a block of temporal memory in all cache levels, except the 0th and 1st cache levels.
 * NOTE(review): listing line 525, presumably this function's body, calls __builtin_prefetch(data, 0, 2);
 * in GCC a higher locality value means the data is kept in more cache levels, so 2 may be
 * stronger than intended here -- confirm.
 * @param data The memory address handed to the prefetch builtin
 */
53 static inline void prefetchT2(
const void*
const data);
/**
 * Prefetches a block of non-temporal memory into a non-temporal cache structure.
 * NOTE(review): listing line 530, presumably this function's body, calls __builtin_prefetch(data, 0, 3);
 * GCC documents locality 3 as "high temporal locality" and 0 as "no temporal locality",
 * so the hint looks inverted relative to this description -- confirm.
 * @param data The memory address handed to the prefetch builtin
 */
59 static inline void prefetchNTA(
const void*
const data);
/**
 * Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
 * @param image0 First input row, must be valid
 * @param image1 Second input row, must be valid
 * @param image2 Third input row, must be valid
 * @param result Receives the averaged elements, must be valid
 */
261 static inline void average24Elements1Channel8Bit3x3(
const uint8_t*
const image0,
const uint8_t*
const image1,
const uint8_t*
const image2, uint8_t*
const result);
/**
 * Moves the high 8 bits of four 16 bit elements to the low 8 bits and fills the high bits with 0.
 * @param value The four 16 bit elements to process
 * @return The four elements with their former high 8 bits in the low 8 bits
 */
374 static OCEAN_FORCE_INLINE uint16x4_t
moveHighBits16_8(
const uint16x4_t& value);
/**
 * Overload of moveHighBits16_8() operating on a uint16x8_t, i.e., eight 16 bit elements.
 * Presumably has the same semantics as the four-element variant (high 8 bits moved to the low 8 bits) -- confirm.
 * @param value The eight 16 bit elements to process
 * @return The processed eight 16 bit elements
 */
383 static OCEAN_FORCE_INLINE uint16x8_t
moveHighBits16_8(
const uint16x8_t& value);
/**
 * Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
 * The implementation reads four elements at the offsets 0, 1, and 2 of every row, so each row must hold at least six readable elements.
 * @param rowTop The top row of the 3x3 blocks, must be valid
 * @param rowCenter The center row of the 3x3 blocks, must be valid
 * @param rowBottom The bottom row of the 3x3 blocks, must be valid
 * @return The four sums as 32 bit signed integers
 */
416 static OCEAN_FORCE_INLINE int32x4_t
sum16Bit4Blocks3x3(
const short*
const rowTop,
const short*
const rowCenter,
const short*
const rowBottom);
/**
 * Multiplies two uint64_t values with two uint32_t values and returns the results as two uint64_t values.
 * @param value_u_64x2 The two 64 bit factors
 * @param value_u_32x2 The two 32 bit factors
 * @return The two 64 bit products
 */
425 static OCEAN_FORCE_INLINE uint64x2_t
multiply(
const uint64x2_t& value_u_64x2,
const uint32x2_t& value_u_32x2);
/**
 * Copies the sign of a given value to another one.
 * The implementation negates a receiver element wherever the matching provider element is negative and keeps it unchanged otherwise.
 * @param signReceiver The four unsigned values receiving the sign
 * @param signProvider The four signed values providing the sign
 * @return The four signed results
 */
433 static OCEAN_FORCE_INLINE int32x4_t
copySign(
const uint32x4_t& signReceiver,
const int32x4_t& signProvider);
/**
 * Casts 16 float elements to 16 uint8_t elements.
 * @param sourceA_f_32x4 The float elements 0-3
 * @param sourceB_f_32x4 The float elements 4-7
 * @param sourceC_f_32x4 The float elements 8-11
 * @param sourceD_f_32x4 The float elements 12-15
 * @return The 16 resulting uint8_t elements
 */
443 static OCEAN_FORCE_INLINE uint8x16_t
cast16ElementsNEON(
const float32x4_t& sourceA_f_32x4,
const float32x4_t& sourceB_f_32x4,
const float32x4_t& sourceC_f_32x4,
const float32x4_t& sourceD_f_32x4);
/**
 * Casts 16 uint8_t elements, given as one uint8x16_t, to 16 float elements.
 * @param source_u_8x16 The 16 uint8_t elements to cast
 * @return The 16 float elements, returned as four float32x4_t vectors
 */
457 static OCEAN_FORCE_INLINE float32x4x4_t
cast16ElementsNEON(
const uint8x16_t& source_u_8x16);
/**
 * Casts uint8_t elements located in memory to float elements, returned as four float32x4_t vectors.
 * Presumably reads exactly 16 elements from the given address, as the name suggests -- confirm.
 * @param source The address of the uint8_t elements to cast, must be valid
 * @return The resulting float elements
 */
464 static OCEAN_FORCE_INLINE float32x4x4_t
cast16ElementsNEON(
const uint8_t*
const source);
/**
 * Returns the interpolated pixel value for one 2 channel 16 bit pixel.
 * The implementation weights the bytes at the offsets 0, 2, size, and size + 2 and expects the four factors to sum to 128 * 128.
 * @param pixel The first byte to interpolate, must be valid
 * @param size The byte offset added to reach the second pair of values (presumably the row stride -- confirm)
 * @param fx_y_ The factor applied to pixel[0]
 * @param fxy_ The factor applied to pixel[2]
 * @param fx_y The factor applied to pixel[size]
 * @param fxy The factor applied to pixel[size + 2]
 * @return The interpolated value
 */
478 static inline unsigned int interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy);
/**
 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
 * Only one set of interpolation factors (for the second pixel) is given in this overload.
 * @param pixel0 The first pixel, must be valid
 * @param pixel1 The second pixel, must be valid
 * @param size0 The size value belonging to the first pixel (unnamed and therefore unused in the definition)
 * @param size1 The size value belonging to the second pixel
 * @param f1x_y_ The first interpolation factor of the second pixel
 * @param f1xy_ The second interpolation factor of the second pixel
 * @param f1x_y The third interpolation factor of the second pixel
 * @param f1xy The fourth interpolation factor of the second pixel; all four factors must sum to 128 * 128
 * @return The resulting square difference
 */
492 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
/**
 * Returns the square difference between two interpolated 2 channel 16 bit pixels.
 * Both pixels are interpolated with their own set of four factors before the square distance is determined.
 * @param pixel0 The first pixel, must be valid
 * @param pixel1 The second pixel, must be valid
 * @param size0 The size value used when interpolating the first pixel
 * @param size1 The size value used when interpolating the second pixel
 * @param f0x_y_ The first interpolation factor of the first pixel
 * @param f0xy_ The second interpolation factor of the first pixel
 * @param f0x_y The third interpolation factor of the first pixel
 * @param f0xy The fourth interpolation factor of the first pixel; the four f0 factors must sum to 128 * 128
 * @param f1x_y_ The first interpolation factor of the second pixel
 * @param f1xy_ The second interpolation factor of the second pixel
 * @param f1x_y The third interpolation factor of the second pixel
 * @param f1xy The fourth interpolation factor of the second pixel; the four f1 factors must sum to 128 * 128
 * @return The resulting square difference
 */
510 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
515 __builtin_prefetch(data, 0, 0);
520 __builtin_prefetch(data, 0, 1);
525 __builtin_prefetch(data, 0, 2);
530 __builtin_prefetch(data, 0, 3);
535 ocean_assert(image0 && image1);
537 const uint8x16_t row0 = vld1q_u8(image0);
538 const uint8x16_t row1 = vld1q_u8(image1);
540 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFF00000000000000ull), vdup_n_u8(0xFFu));
546 ocean_assert(image0 && image1);
548 const uint8x16_t row0 = vld1q_u8(image0);
549 const uint8x16_t row1 = vld1q_u8(image1);
551 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFF000000000000ull), vdup_n_u8(0xFFu));
557 ocean_assert(image0 && image1);
559 const uint8x16_t row0 = vld1q_u8(image0);
560 const uint8x16_t row1 = vld1q_u8(image1);
562 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFF0000000000ull), vdup_n_u8(0xFFu));
568 ocean_assert(image0 && image1);
570 const uint8x16_t row0 = vld1q_u8(image0);
571 const uint8x16_t row1 = vld1q_u8(image1);
573 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFF00000000ull), vdup_n_u8(0xFFu));
579 ocean_assert(image0 && image1);
581 const uint8x16_t row0 = vld1q_u8(image0);
582 const uint8x16_t row1 = vld1q_u8(image1);
584 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFF000000ull), vdup_n_u8(0xFFu));
590 ocean_assert(image0 && image1);
592 const uint8x16_t row0 = vld1q_u8(image0);
593 const uint8x16_t row1 = vld1q_u8(image1);
595 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFF0000ull), vdup_n_u8(0xFFu));
601 ocean_assert(image0 && image1);
603 const uint8x16_t row0 = vld1q_u8(image0);
604 const uint8x16_t row1 = vld1q_u8(image1);
606 const uint8x16_t mask = vcombine_u8(vcreate_u8(0xFFFFFFFFFFFFFF00ull), vdup_n_u8(0xFFu));
612 ocean_assert(image0 && image1);
614 const uint8x16_t row0 = vld1q_u8(image0);
615 const uint8x16_t row1 = vld1q_u8(image1);
617 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000000000FFull));
623 ocean_assert(image0 && image1);
625 const uint8x16_t row0 = vld1q_u8(image0);
626 const uint8x16_t row1 = vld1q_u8(image1);
628 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000000000FFFFull));
634 ocean_assert(image0 && image1);
636 const uint8x16_t row0 = vld1q_u8(image0);
637 const uint8x16_t row1 = vld1q_u8(image1);
639 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000000000FFFFFFull));
645 ocean_assert(image0 && image1);
647 const uint8x16_t row0 = vld1q_u8(image0);
648 const uint8x16_t row1 = vld1q_u8(image1);
650 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00000000FFFFFFFFull));
656 ocean_assert(image0 && image1);
658 const uint8x16_t row0 = vld1q_u8(image0);
659 const uint8x16_t row1 = vld1q_u8(image1);
661 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x000000FFFFFFFFFFull));
667 ocean_assert(image0 && image1);
669 const uint8x16_t row0 = vld1q_u8(image0);
670 const uint8x16_t row1 = vld1q_u8(image1);
672 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x0000FFFFFFFFFFFFull));
678 ocean_assert(image0 && image1);
680 const uint8x16_t row0 = vld1q_u8(image0);
681 const uint8x16_t row1 = vld1q_u8(image1);
683 const uint8x16_t mask = vcombine_u8(vdup_n_u8(0xFFu), vcreate_u8(0x00FFFFFFFFFFFFFFull));
689 ocean_assert(image0 && image1);
691 uint8x16_t row0 = vld1q_u8(image0);
692 uint8x16_t row1 = vld1q_u8(image1);
700 uint8x16_t subtract = vabdq_u8(row0, row1);
702 uint8x8_t subtractLow = vget_low_u8(subtract);
703 uint8x8_t subtractHigh = vget_high_u8(subtract);
705 uint16x8_t squareLow = vmull_u8(subtractLow, subtractLow);
706 uint16x8_t squareHigh = vmull_u8(subtractHigh, subtractHigh);
708 return vaddq_u32(vaddl_u16(vget_low_u16(squareLow), vget_low_u16(squareHigh)), vaddl_u16(vget_high_u16(squareLow), vget_high_u16(squareHigh)));
713 ocean_assert(image0 && image1);
715 const uint8x8_t row0 = vld1_u8(image0);
716 const uint8x8_t row1 = vld1_u8(image1);
724 const uint8x8_t subtract = vorr_u8(vqsub_u8(row0, row1), vqsub_u8(row1, row0));
728 const uint16x4_t subtractHigh =
moveHighBits16_8(vreinterpret_u16_u8(subtract));
730 const uint16x8_t subtractCombined = vcombine_u16(subtractLow, subtractHigh);
733 const uint16x8_t square = vmulq_u16(subtractCombined, subtractCombined);
741 ocean_assert(image0 && image1);
743 uint8x16_t row0 = vld1q_u8(image0);
744 uint8x16_t row1 = vld1q_u8(image1);
752 uint8x16_t subtract = vabdq_u8(row0, row1);
754 uint16x8_t add16 = vaddl_u8(vget_low_u8(subtract), vget_high_u8(subtract));
756 return vaddl_u16(vget_low_u16(add16), vget_high_u16(add16));
761 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
765 const uint8x16_t m128_row0 = vld1q_u8(row0);
766 const uint8x16_t m128_row1 = vld1q_u8(row1);
777 const uint8x8_t average = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m128_row0, m128_row1)), 1);
781 vst1_u8(result, average);
788 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
790 const uint8x16_t row0A_u_8x16 = vld1q_u8(row0 + 0);
791 const uint8x16_t row0B_u_8x16 = vld1q_u8(row0 + 16);
793 const uint8x16_t row1A_u_8x16 = vld1q_u8(row1 + 0);
794 const uint8x16_t row1B_u_8x16 = vld1q_u8(row1 + 16);
796 const uint8x8_t averageA_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16, row1A_u_8x16)), 1);
797 const uint8x8_t averageB_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16, row1B_u_8x16)), 1);
799 const uint8x16_t average_u_8x16 = vcombine_u8(averageA_u_8x8, averageB_u_8x8);
801 vst1q_u8(result, average_u_8x16);
806 ocean_assert(image0 && image1 && result);
808 const uint8x16_t row0 = vld1q_u8(image0);
809 const uint8x16_t row1 = vld1q_u8(image1);
812 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vpaddlq_u8(vhaddq_u8(row0, row1)), 1));
815 const uint8x8_t thresholded = vcge_u8(average, vmov_n_u8(threshold));
817 vst1_u8(result, thresholded);
822 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
829 const uint8x16x2_t m2_128_row0 = vld2q_u8(row0);
830 const uint8x16x2_t m2_128_row1 = vld2q_u8(row1);
843 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[0], m2_128_row1.val[0])), 1);
844 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m2_128_row0.val[1], m2_128_row1.val[1])), 1);
848 vst2_u8(result, average);
855 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
857 const uint8x16x2_t row0A_u_8x16x2 = vld2q_u8(row0 + 0);
858 const uint8x16x2_t row0B_u_8x16x2 = vld2q_u8(row0 + 32);
860 const uint8x16x2_t row1A_u_8x16x2 = vld2q_u8(row1 + 0);
861 const uint8x16x2_t row1B_u_8x16x2 = vld2q_u8(row1 + 32);
863 const uint8x8_t averageChannel0A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[0], row1A_u_8x16x2.val[0])), 1);
864 const uint8x8_t averageChannel1A_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0A_u_8x16x2.val[1], row1A_u_8x16x2.val[1])), 1);
865 const uint8x8_t averageChannel0B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[0], row1B_u_8x16x2.val[0])), 1);
866 const uint8x8_t averageChannel1B_u_8x8 = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(row0B_u_8x16x2.val[1], row1B_u_8x16x2.val[1])), 1);
868 uint8x16x2_t average_u_8x16x2;
870 average_u_8x16x2.val[0] = vcombine_u8(averageChannel0A_u_8x8, averageChannel0B_u_8x8);
871 average_u_8x16x2.val[1] = vcombine_u8(averageChannel1A_u_8x8, averageChannel1B_u_8x8);
873 vst2q_u8(result, average_u_8x16x2);
878 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
886 const uint8x16x3_t m3_128_row0 = vld3q_u8(row0);
887 const uint8x16x3_t m3_128_row1 = vld3q_u8(row1);
900 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[0], m3_128_row1.val[0])), 1);
901 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[1], m3_128_row1.val[1])), 1);
902 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m3_128_row0.val[2], m3_128_row1.val[2])), 1);
906 vst3_u8(result, average);
923 ocean_assert(row0 !=
nullptr && row1 !=
nullptr && result !=
nullptr);
932 const uint8x16x4_t m4_128_row0 = vld4q_u8(row0);
933 const uint8x16x4_t m4_128_row1 = vld4q_u8(row1);
946 average.val[0] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[0], m4_128_row1.val[0])), 1);
947 average.val[1] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[1], m4_128_row1.val[1])), 1);
948 average.val[2] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[2], m4_128_row1.val[2])), 1);
949 average.val[3] = vrshrn_n_u16(vpaddlq_u8(vrhaddq_u8(m4_128_row0.val[3], m4_128_row1.val[3])), 1);
953 vst4_u8(result, average);
958 ocean_assert(image0 && image1 && image2 && result);
967 uint8x8x3_t row0 = vld3_u8(image0);
968 uint8x8x3_t row1 = vld3_u8(image1);
969 uint8x8x3_t row2 = vld3_u8(image2);
971 uint16x8x3_t sumPerRow;
974 sumPerRow.val[0] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[0]), vmovl_u8(row2.val[0])), vshlq_n_u16(vmovl_u8(row1.val[0]), 1));
975 sumPerRow.val[1] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[1]), vmovl_u8(row2.val[1])), vshlq_n_u16(vmovl_u8(row1.val[1]), 1));
976 sumPerRow.val[2] = vaddq_u16(vaddq_u16(vmovl_u8(row0.val[2]), vmovl_u8(row2.val[2])), vshlq_n_u16(vmovl_u8(row1.val[2]), 1));
979 const uint16x8_t sum = vaddq_u16(vaddq_u16(sumPerRow.val[0], sumPerRow.val[2]), vshlq_n_u16(sumPerRow.val[1], 1));
982 const uint8x8_t average = vmovn_u16(vshrq_n_u16(vaddq_u16(sum, vmovq_n_u16(8u)), 4));
984 vst1_u8(result, average);
989 ocean_assert(image0 && image1 && image2 && result);
1003 uint8x16x3_t row0 = vld3q_u8(image0);
1004 uint8x16x3_t row1 = vld3q_u8(image1);
1005 uint8x16x3_t row2 = vld3q_u8(image2);
1016 uint8x16x3_t averagePerRow;
1017 averagePerRow.val[0] = vhaddq_u8(vhaddq_u8(row0.val[0], row2.val[0]), row1.val[0]);
1018 averagePerRow.val[1] = vhaddq_u8(vhaddq_u8(row0.val[1], row2.val[1]), row1.val[1]);
1019 averagePerRow.val[2] = vhaddq_u8(vhaddq_u8(row0.val[2], row2.val[2]), row1.val[2]);
1022 const uint8x16_t average = vhaddq_u8(vhaddq_u8(averagePerRow.val[0], averagePerRow.val[2]), averagePerRow.val[1]);
1024 vst1q_u8(result, average);
1029 ocean_assert(source && response && width >= 10u);
1032 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1034 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1037 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1039 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1044 result.val[0] = vmovn_s16(vhsubq_s16(horizontalPlus, horizontalMinus));
1046 result.val[1] = vmovn_s16(vhsubq_s16(verticalPlus, verticalMinus));
1049 vst2_s8((int8_t*)response, result);
1054 ocean_assert(source && response && width >= 10u);
1057 int16x8_t horizontalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - 1)));
1059 int16x8_t horizontalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + 1)));
1062 int16x8_t verticalMinus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source - width)));
1064 int16x8_t verticalPlus = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(source + width)));
1067 int16x8_t horizontal = vhsubq_s16(horizontalPlus, horizontalMinus);
1069 int16x8_t vertical = vhsubq_s16(verticalPlus, verticalMinus);
1074 result.val[0] = vmulq_s16(horizontal, horizontal);
1076 result.val[1] = vmulq_s16(vertical, vertical);
1078 result.val[2] = vmulq_s16(horizontal, vertical);
1081 vst3q_s16(response, result);
1086#if defined(__aarch64__)
1088 return vaddvq_u32(value_u_32x4);
1092 const uint32x2_t sum_u_32x2 = vpadd_u32(vget_low_u32(value_u_32x4), vget_high_u32(value_u_32x4));
1093 return vget_lane_u32(vpadd_u32(sum_u_32x2, sum_u_32x2), 0);
1100 return vandq_u32(value, vmovq_n_u32(0x0000FFFFu));
1105 return vand_u16(value, vreinterpret_u16_u32(vmov_n_u32(0x00FF00FFu)));
1110 return vandq_u16(value, vreinterpretq_u16_u32(vmovq_n_u32(0x00FF00FFu)));
1115 return vshrq_n_u32(value, 16);
1120 return vshr_n_u16(value, 8);
1125 return vshrq_n_u16(value, 8);
1130 return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high));
1135 return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
1138OCEAN_FORCE_INLINE int32x4_t
NEON::sum16Bit4Blocks3x3(
const short*
const rowTop,
const short*
const rowCenter,
const short*
const rowBottom)
1140 ocean_assert(rowTop !=
nullptr);
1141 ocean_assert(rowCenter !=
nullptr);
1142 ocean_assert(rowBottom !=
nullptr);
1159 const int16x4_t top_0_s_16x4 = vld1_s16(rowTop + 0);
1160 const int16x4_t top_1_s_16x4 = vld1_s16(rowTop + 1);
1161 const int16x4_t top_2_s_16x4 = vld1_s16(rowTop + 2);
1164 const int16x4_t center_0_s_16x4 = vld1_s16(rowCenter + 0);
1165 const int16x4_t center_1_s_16x4 = vld1_s16(rowCenter + 1);
1166 const int16x4_t center_2_s_16x4 = vld1_s16(rowCenter + 2);
1169 const int16x4_t bottom_0_s_16x4 = vld1_s16(rowBottom + 0);
1170 const int16x4_t bottom_1_s_16x4 = vld1_s16(rowBottom + 1);
1171 const int16x4_t bottom_2_s_16x4 = vld1_s16(rowBottom + 2);
1174 const int32x4_t result_A_s_32x4 = vaddl_s16(top_0_s_16x4, top_2_s_16x4);
1175 const int32x4_t result_B_s_32x4 = vaddl_s16(center_0_s_16x4, center_2_s_16x4);
1176 const int32x4_t result_C_s_32x4 = vaddl_s16(bottom_0_s_16x4, bottom_2_s_16x4);
1177 const int32x4_t result_D_s_32x4 = vaddl_s16(top_1_s_16x4, center_1_s_16x4);
1180 const int32x4_t result_E_s_32x4 = vaddq_s32(result_A_s_32x4, result_B_s_32x4);
1181 const int32x4_t result_F_s_32x4 = vaddq_s32(result_C_s_32x4, result_D_s_32x4);
1183 const int32x4_t result_G_s_32x4 = vaddq_s32(result_E_s_32x4, result_F_s_32x4);
1186 return vaddw_s16(result_G_s_32x4, bottom_1_s_16x4);
1189OCEAN_FORCE_INLINE uint64x2_t
NEON::multiply(
const uint64x2_t& value_u_64x2,
const uint32x2_t& value_u_32x2)
1196 const uint32x2x2_t value64_lowHigh_32x2x2 = vtrn_u32(vget_low_u32(vreinterpretq_u32_u64(value_u_64x2)), vget_high_u32(vreinterpretq_u32_u64(value_u_64x2)));
1198 const uint64x2_t multiplication_low_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[0], value_u_32x2);
1199 const uint64x2_t multiplication_high_64x2 = vmull_u32(value64_lowHigh_32x2x2.val[1], value_u_32x2);
1201 const uint64x2_t shiftedMultiplication_high_64x2 = vshlq_n_u64(multiplication_high_64x2, 32);
1203 return vaddq_u64(shiftedMultiplication_high_64x2, multiplication_low_64x2);
1206OCEAN_FORCE_INLINE int32x4_t
NEON::copySign(
const uint32x4_t& signReceiver_u_32x4,
const int32x4_t& signProvider_s_32x4)
1208 const int32x4_t negativeSignReceiver_u_32x4 = vnegq_s32(vreinterpretq_s32_u32(signReceiver_u_32x4));
1210 const uint32x4_t isNegativeMask_u_32x4 = vcltq_s32(signProvider_s_32x4, vdupq_n_s32(0));
1211 const uint32x4_t isPositiveMask_u_32x4 = vcgeq_s32(signProvider_s_32x4, vdupq_n_s32(0));
1213 return vreinterpretq_s32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_s32(negativeSignReceiver_u_32x4), isNegativeMask_u_32x4), vandq_u32(signReceiver_u_32x4, isPositiveMask_u_32x4)));
1216OCEAN_FORCE_INLINE uint8x16_t
NEON::cast16ElementsNEON(
const float32x4_t& sourceA_f_32x4,
const float32x4_t& sourceB_f_32x4,
const float32x4_t& sourceC_f_32x4,
const float32x4_t& sourceD_f_32x4)
1218 const uint32x4_t targetA_u_32x4 = vcvtq_u32_f32(sourceA_f_32x4);
1219 const uint32x4_t targetB_u_32x4 = vcvtq_u32_f32(sourceB_f_32x4);
1220 const uint32x4_t targetC_u_32x4 = vcvtq_u32_f32(sourceC_f_32x4);
1221 const uint32x4_t targetD_u_32x4 = vcvtq_u32_f32(sourceD_f_32x4);
1223 const uint16x8_t targetA_u_16x8 = vcombine_u16(vmovn_u32(targetA_u_32x4), vmovn_u32(targetB_u_32x4));
1224 const uint16x8_t targetB_u_16x8 = vcombine_u16(vmovn_u32(targetC_u_32x4), vmovn_u32(targetD_u_32x4));
1226 return vcombine_u8(vmovn_u16(targetA_u_16x8), vmovn_u16(targetB_u_16x8));
1231 ocean_assert(source !=
nullptr);
1234 for (
unsigned int n = 0u; n < 16u; ++n)
1236 ocean_assert(source[n] >= 0.0f && source[n] < 256.0f);
1240 return cast16ElementsNEON(vld1q_f32(source + 0), vld1q_f32(source + 4), vld1q_f32(source + 8), vld1q_f32(source + 12));
1245 const uint16x8_t sourceA_u_16x8 = vmovl_u8(vget_low_u8(source_u_8x16));
1246 const uint16x8_t sourceB_u_16x8 = vmovl_u8(vget_high_u8(source_u_8x16));
1248 const uint32x4_t sourceA_u_32x4 = vmovl_u16(vget_low_u16(sourceA_u_16x8));
1249 const uint32x4_t sourceB_u_32x4 = vmovl_u16(vget_high_u16(sourceA_u_16x8));
1250 const uint32x4_t sourceC_u_32x4 = vmovl_u16(vget_low_u16(sourceB_u_16x8));
1251 const uint32x4_t sourceD_u_32x4 = vmovl_u16(vget_high_u16(sourceB_u_16x8));
1253 float32x4x4_t result_u_32x4x4;
1254 result_u_32x4x4.val[0] = vcvtq_f32_u32(sourceA_u_32x4);
1255 result_u_32x4x4.val[1] = vcvtq_f32_u32(sourceB_u_32x4);
1256 result_u_32x4x4.val[2] = vcvtq_f32_u32(sourceC_u_32x4);
1257 result_u_32x4x4.val[3] = vcvtq_f32_u32(sourceD_u_32x4);
1259 return result_u_32x4x4;
1264 ocean_assert(source !=
nullptr);
1269inline unsigned int NEON::interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy)
1271 ocean_assert(pixel);
1272 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
1274 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
/**
 * Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
 * The third parameter (size0 in the declaration) is unnamed here and therefore unused.
 * NOTE(review): no return statement is visible between the assertions and the next definition in this listing,
 * the remaining part of the body is missing -- restore it from the original header before relying on this text.
 */
1277inline unsigned int NEON::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int ,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
// both pixel pointers must be valid
1279 ocean_assert(pixel0 && pixel1);
// the four interpolation factors must sum to 128 * 128 = 16384
1281 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1286inline unsigned int NEON::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
1288 ocean_assert(pixel0 && pixel1);
1290 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
1291 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
1293 return sqrDistance(
interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy),
interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
This class implements computer vision functions using NEON extensions.
Definition NEON.h:34
static uint32x4_t sumSquareDifferences8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:566
static OCEAN_FORCE_INLINE void average32Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:820
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1084
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint8_t threshold=192u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition NEON.h:804
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition NEON.h:518
static OCEAN_FORCE_INLINE void average16Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:759
static uint32x4_t sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:676
static uint32x4_t sumSquareDifference8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:621
static void average48Elements1Channel8Bit3x3Approximation(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 48 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:987
static OCEAN_FORCE_INLINE void average64Elements2Channel16Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition NEON.h:851
static OCEAN_FORCE_INLINE uint16x4_t moveHighBits16_8(const uint16x4_t &value)
Moves the high 8 bits of four 16 bit elements to the low 8 bits and fill the high bits with 0.
Definition NEON.h:1118
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 8 foll...
Definition NEON.h:1052
static uint32x4_t sumAbsoluteDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute difference determination for 16 elements with 8 bit precision.
Definition NEON.h:739
static OCEAN_FORCE_INLINE uint16x8_t combineLowBits32x4to16x8(const uint32x4_t &low, const uint32x4_t &high)
Combines eight 32 bit values (holding 16 bit information) to eight 16 bit values.
Definition NEON.h:1128
static void average24Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 24 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition NEON.h:956
static OCEAN_FORCE_INLINE void average32Elements1Channel8Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition NEON.h:784
static uint32x4_t sumSquareDifferences8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:577
static uint32x4_t sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:555
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition NEON.h:528
static OCEAN_FORCE_INLINE uint16x4_t removeHighBits16_8(const uint16x4_t &value)
Removes (sets to zero) the high 8 bits of four 16 bit elements.
Definition NEON.h:1103
static uint32x4_t sumSquareDifference8BitFront14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:665
static OCEAN_FORCE_INLINE uint32x4_t moveHighBits32_16(const uint32x4_t &value)
Moves the high 16 bits of four 32 bit elements to the low 16 bits and fill the high bits with 0.
Definition NEON.h:1113
static uint32x4_t sumSquareDifferences8BitBack9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:533
static OCEAN_FORCE_INLINE uint8x16_t cast16ElementsNEON(const float32x4_t &sourceA_f_32x4, const float32x4_t &sourceB_f_32x4, const float32x4_t &sourceC_f_32x4, const float32x4_t &sourceD_f_32x4)
Casts 16 float elements to 16 uint8_t elements.
Definition NEON.h:1216
static OCEAN_FORCE_INLINE uint8x16_t combineLowBits16x8to8x16(const uint16x8_t &low, const uint16x8_t &high)
Combines sixteen 16 bit values (holding 8 bit information) to sixteen 8 bit values.
Definition NEON.h:1133
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition NEON.h:1269
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition NEON.h:523
static OCEAN_FORCE_INLINE int32x4_t sum16Bit4Blocks3x3(const short *const rowTop, const short *const rowCenter, const short *const rowBottom)
Determines the four sums of four successive (overlapping) 3x3 blocks of signed 16 bit integer values.
Definition NEON.h:1138
static uint32x4_t sumSquareDifference8BitFront11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 11 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:632
static uint32x4_t sumSquareDifferences8BitBack14Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 14 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:588
static uint32x4_t sumSquareDifference8BitFront9Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 9 elements of an 16 elements buffer with 8 bit prec...
Definition NEON.h:610
static OCEAN_FORCE_INLINE void average64Elements4Channel32Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 64 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition NEON.h:921
static uint32x4_t sumSquareDifferences8BitBack15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 15 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:599
static uint32x4_t sumSquareDifferences8BitBack10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 10 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:544
static uint32x4_t sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:643
static uint32x4_t sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition NEON.h:687
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition NEON.h:513
static OCEAN_FORCE_INLINE void average48Elements3Channel24Bit2x2(const uint8_t *const row0, const uint8_t *const row1, uint8_t *const result)
Averages 48 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition NEON.h:876
static uint32x4_t sumSquareDifference8Bit8Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 8 elements with 8 bit precision.
Definition NEON.h:711
static uint32x4_t sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of an 16 elements buffer with 8 bit pre...
Definition NEON.h:654
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition NEON.h:1277
static OCEAN_FORCE_INLINE int32x4_t copySign(const uint32x4_t &signReceiver, const int32x4_t &signProvider)
Copies the sign of a given value to another one.
Definition NEON.h:1206
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 8 following pixels for a given 1 channel 8 b...
Definition NEON.h:1027
static OCEAN_FORCE_INLINE uint64x2_t multiply(const uint64x2_t &value_u_64x2, const uint32x2_t &value_u_32x2)
Multiplies two uint64_t values with two uint32_t values and stores the results in two uint64_t value...
Definition NEON.h:1189
static OCEAN_FORCE_INLINE uint32x4_t removeHighBits32_16(const uint32x4_t &value)
Removes (sets to zero) the high 16 bits of four 32 bit elements.
Definition NEON.h:1098
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15