8 #ifndef META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
9 #define META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
/**
 * Declaration of a filter symmetry test (going by the name; the comparison itself is not visible in this excerpt).
 * @param filterValues Pointer to the filter elements
 * @param size Number of filter elements
 * @return Boolean result; presumably true when the filter is symmetric around its center -- TODO confirm against the definition
 */
86 static bool isFilterSymmetric(
const T* filterValues,
const size_t size);
/**
 * Declaration of a helper returning a value of the filter's element type T.
 * Going by the name, this is the sum of all filter elements -- TODO confirm against the definition.
 * @param filterValues Pointer to the filter elements
 * @param size Number of filter elements
 * @return Value of type T
 */
96 static T sumFilterValues(
const T* filterValues,
const size_t size);
/**
 * Declaration of the separable-filter entry point that receives the instruction set as a runtime value.
 * The implementation is not visible in this excerpt; parameter meanings below follow the parameter names.
 * @param source Input elements of type T
 * @param target Output elements of type T
 * @param width Frame width -- presumably in pixels, TODO confirm
 * @param height Frame height -- presumably in pixels, TODO confirm
 * @param channels Number of channels -- presumably interleaved per pixel, TODO confirm
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @param targetPaddingElements Padding of the target rows, in elements
 * @param horizontalFilter Horizontal filter elements of type TFilter
 * @param horizontalFilterSize Number of horizontal filter elements
 * @param verticalFilter Vertical filter elements of type TFilter
 * @param verticalFilterSize Number of vertical filter elements
 * @param worker Optional worker object, defaults to nullptr
 * @param reusableMemory Optional reusable memory object, defaults to nullptr
 * @param processorInstructions Instruction set to use, defaults to Processor::get().instructions()
 * @return Boolean result -- presumably true on success, TODO confirm
 * @tparam T Element type of source and target
 * @tparam TFilter Element type of both filters
 */
166 template <
typename T,
typename TFilter>
167 static bool filter(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const TFilter* horizontalFilter,
const unsigned int horizontalFilterSize,
const TFilter* verticalFilter,
const unsigned int verticalFilterSize,
Worker* worker =
nullptr,
ReusableMemory* reusableMemory =
nullptr,
const ProcessorInstructions processorInstructions =
Processor::get().instructions());
/**
 * Declaration of a separable-filter entry point whose filters are always given as float arrays.
 * Unlike filter(), it takes neither a ReusableMemory object nor an instruction-set argument.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type T
 * @param target Output elements of type T
 * @param width Frame width -- presumably in pixels, TODO confirm
 * @param height Frame height -- presumably in pixels, TODO confirm
 * @param channels Number of channels
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @param targetPaddingElements Padding of the target rows, in elements
 * @param horizontalFilter Horizontal filter elements
 * @param horizontalFilterSize Number of horizontal filter elements
 * @param verticalFilter Vertical filter elements
 * @param verticalFilterSize Number of vertical filter elements
 * @param worker Optional worker object, defaults to nullptr
 * @return Boolean result -- presumably true on success, TODO confirm
 * @tparam T Element type of source and target
 */
223 template <
typename T>
224 static bool filterUniversal(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const float* horizontalFilter,
const unsigned int horizontalFilterSize,
const float* verticalFilter,
const unsigned int verticalFilterSize,
Worker* worker =
nullptr);
/**
 * Declaration of the filter() overload whose instruction set is a compile-time template argument.
 * It returns void, and the two optional arguments come in the order (reusableMemory, worker),
 * i.e. swapped compared to the runtime-dispatch overload declared earlier in this file.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type T
 * @param target Output elements of type T
 * @param width Frame width -- presumably in pixels, TODO confirm
 * @param height Frame height -- presumably in pixels, TODO confirm
 * @param channels Number of channels
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @param targetPaddingElements Padding of the target rows, in elements
 * @param horizontalFilter Horizontal filter elements of type TFilter
 * @param horizontalFilterSize Number of horizontal filter elements
 * @param verticalFilter Vertical filter elements of type TFilter
 * @param verticalFilterSize Number of vertical filter elements
 * @param reusableMemory Optional reusable memory object, defaults to nullptr
 * @param worker Optional worker object, defaults to nullptr
 * @tparam T Element type of source and target
 * @tparam TFilter Element type of both filters
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
249 template <
typename T,
typename TFilter, ProcessorInstructions tProcessorInstructions>
250 static void filter(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const TFilter* horizontalFilter,
const unsigned int horizontalFilterSize,
const TFilter* verticalFilter,
const unsigned int verticalFilterSize,
ReusableMemory* reusableMemory =
nullptr,
Worker* worker =
nullptr);
258 template <
typename T, ProcessorInstructions tProcessorInstructions>
268 template <
typename T, ProcessorInstructions tProcessorInstructions>
/**
 * Declaration of the helper writing an extra border for the left side of a row.
 * @param source Row data the border pixels are taken from
 * @param channels Number of elements per pixel
 * @param pixels Number of border pixels to write
 * @param extendedRowLeft Receives pixels * channels elements
 * @tparam T Element type
 */
280 template <
typename T>
281 static void fillLeftExtraBorder(
const T* source,
const unsigned int channels,
const unsigned int pixels, T* extendedRowLeft);
/**
 * Declaration of the helper writing an extra border for the right side of a row.
 * @param sourceEnd Pointer the border pixels are addressed backwards from -- presumably one element past the row's last pixel, TODO confirm
 * @param channels Number of elements per pixel
 * @param pixels Number of border pixels to write
 * @param extendedRowRight Receives pixels * channels elements
 * @tparam T Element type
 */
292 template <
typename T>
293 static void fillRightExtraBorder(
const T* sourceEnd,
const unsigned int channels,
const unsigned int pixels, T* extendedRowRight);
311 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
332 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
355 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
380 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
/**
 * Declaration of a force-inlined horizontal filter step; going by the name it handles one block of 4 elements.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type TSource
 * @param target Output elements, stored with the filter's element type TFilter
 * @param channels Number of channels
 * @param filter Filter elements
 * @param filterSize Number of filter elements
 * @param isSymmetric Symmetry flag for the filter -- presumably the result of isFilterSymmetric(), TODO confirm
 * @tparam TSource Element type of the source
 * @tparam TFilter Element type of the filter and of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
396 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
397 static OCEAN_FORCE_INLINE
void filterHorizontalRowOneBlockWith4Elements(
const TSource*
const source, TFilter*
const target,
const unsigned int channels,
const TFilter*
const filter,
const unsigned int filterSize,
const bool isSymmetric);
/**
 * Declaration of a force-inlined horizontal filter step; going by the name it handles one block of 8 elements.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type TSource
 * @param target Output elements, stored with the filter's element type TFilter
 * @param channels Number of channels
 * @param filter Filter elements
 * @param filterSize Number of filter elements
 * @param isSymmetric Symmetry flag for the filter -- presumably the result of isFilterSymmetric(), TODO confirm
 * @tparam TSource Element type of the source
 * @tparam TFilter Element type of the filter and of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
412 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
413 static OCEAN_FORCE_INLINE
void filterHorizontalRowOneBlockWith8Elements(
const TSource*
const source, TFilter*
const target,
const unsigned int channels,
const TFilter*
const filter,
const unsigned int filterSize,
const bool isSymmetric);
428 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
444 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
460 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
/**
 * Declaration of the vertical filter for one complete row using a float filter.
 * The definition later in this file splits the row into blocks of 16, 8 and (SSE only) 4 elements.
 * @param source Input row; the definition also reads rows above and below via the source stride
 * @param target Output row
 * @param width Row width -- presumably in pixels, TODO confirm
 * @param channels Number of channels, asserted >= 1 in the definition
 * @param filter Vertical filter elements
 * @param filterSize Number of filter elements, asserted odd in the definition
 * @param isSymmetric Symmetry flag forwarded to the per-block functions
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @tparam TSource Element type of the source
 * @tparam TTarget Element type of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
478 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
479 static OCEAN_FORCE_INLINE
void filterVerticalCoreRow32BitPerChannelFloat(
const TSource* source, TTarget* target,
const unsigned int width,
const unsigned int channels,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric,
const unsigned int sourcePaddingElements);
/**
 * Declaration of a force-inlined vertical filter step; going by the name it handles 8 elements of a border row.
 * In contrast to the core-row functions it also receives the frame height and the row index.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type TSource
 * @param target Output elements of type TTarget
 * @param sourceStrideElements Distance between consecutive source rows, in elements
 * @param height Frame height -- presumably in rows, TODO confirm
 * @param row Index of the row to filter
 * @param filter Vertical filter elements
 * @param filterSize Number of filter elements
 * @param isSymmetric Symmetry flag for the filter
 * @tparam TSource Element type of the source
 * @tparam TTarget Element type of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
496 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
497 static OCEAN_FORCE_INLINE
void filterVerticalBorderRow8Elements32BitPerChannelFloat(
const TSource* source, TTarget* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric);
/**
 * Declaration of a force-inlined vertical filter step; going by the name it handles 16 elements of a border row.
 * In contrast to the core-row functions it also receives the frame height and the row index.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type TSource
 * @param target Output elements of type TTarget
 * @param sourceStrideElements Distance between consecutive source rows, in elements
 * @param height Frame height -- presumably in rows, TODO confirm
 * @param row Index of the row to filter
 * @param filter Vertical filter elements
 * @param filterSize Number of filter elements
 * @param isSymmetric Symmetry flag for the filter
 * @tparam TSource Element type of the source
 * @tparam TTarget Element type of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
514 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
515 static OCEAN_FORCE_INLINE
void filterVerticalBorderRow16Elements32BitPerChannelFloat(
const TSource* source, TTarget* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric);
/**
 * Declaration of the vertical filter for one complete border row using a float filter.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type TSource
 * @param target Output elements of type TTarget
 * @param width Row width -- presumably in pixels, TODO confirm
 * @param height Frame height -- presumably in rows, TODO confirm
 * @param channels Number of channels
 * @param row Index of the row to filter
 * @param filter Vertical filter elements
 * @param filterSize Number of filter elements
 * @param isSymmetric Symmetry flag for the filter
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @tparam TSource Element type of the source
 * @tparam TTarget Element type of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
534 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
535 static OCEAN_FORCE_INLINE
void filterVerticalBorderRow32BitPerChannelFloat(
const TSource* source, TTarget* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric,
const unsigned int sourcePaddingElements);
/**
 * Declaration of the horizontal filter pass restricted to a range of rows.
 * The target uses the filter's element type TFilter. The implementation is not visible in this excerpt.
 * @param source Input elements of type TSource
 * @param target Output elements of type TFilter
 * @param width Frame width -- presumably in pixels, TODO confirm
 * @param height Frame height -- presumably in rows, TODO confirm
 * @param channels Number of channels
 * @param filter Horizontal filter elements
 * @param filterSize Number of filter elements
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @param targetPaddingElements Padding of the target rows, in elements
 * @param firstRow First row of the range to process
 * @param numberRows Number of rows to process
 * @tparam TSource Element type of the source
 * @tparam TFilter Element type of the filter and of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
555 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
556 static void filterHorizontalSubset(
const TSource* source, TFilter* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const TFilter* filter,
const unsigned int filterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int firstRow,
const unsigned int numberRows);
/**
 * Declaration of the vertical filter pass restricted to a range of rows, using a float filter.
 * Note that firstRow is the only non-const value parameter here, unlike in filterHorizontalSubset().
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type TSource
 * @param target Output elements of type TTarget
 * @param width Frame width -- presumably in pixels, TODO confirm
 * @param height Frame height -- presumably in rows, TODO confirm
 * @param channels Number of channels
 * @param filter Vertical filter elements
 * @param filterSize Number of filter elements
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @param targetPaddingElements Padding of the target rows, in elements
 * @param firstRow First row of the range to process
 * @param numberRows Number of rows to process
 * @tparam TSource Element type of the source
 * @tparam TTarget Element type of the target
 * @tparam tProcessorInstructions Instruction set the instantiation is specialized for
 */
577 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
578 static void filterVerticalSubset(
const TSource* source, TTarget* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const float* filter,
const unsigned int filterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
unsigned int firstRow,
const unsigned int numberRows);
/**
 * Declaration of the horizontal pass of the universal filter, restricted to a range of rows.
 * It reads elements of type T and writes elements of type TIntermediate; no frame height is passed.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type T
 * @param target Output elements of type TIntermediate
 * @param width Frame width -- presumably in pixels, TODO confirm
 * @param channels Number of channels
 * @param horizontalFilter Horizontal filter elements
 * @param filterSize Number of filter elements
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @param targetPaddingElements Padding of the target rows, in elements
 * @param firstRow First row of the range to process
 * @param numberRows Number of rows to process
 * @tparam T Element type of the source
 * @tparam TIntermediate Element type of the target
 */
595 template <
typename T,
typename TIntermediate>
596 static void filterUniversalHorizontalSubset(
const T* source, TIntermediate* target,
const unsigned int width,
const unsigned int channels,
const float* horizontalFilter,
const unsigned int filterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int firstRow,
const unsigned int numberRows);
/**
 * Declaration of the vertical pass of the universal filter, restricted to a range of rows.
 * It reads elements of type TIntermediate and writes elements of type T.
 * The implementation is not visible in this excerpt.
 * @param source Input elements of type TIntermediate
 * @param target Output elements of type T
 * @param width Frame width -- presumably in pixels, TODO confirm
 * @param height Frame height -- presumably in rows, TODO confirm
 * @param channels Number of channels
 * @param verticalFilter Vertical filter elements
 * @param filterSize Number of filter elements
 * @param sourcePaddingElements Padding of the source rows, in elements
 * @param targetPaddingElements Padding of the target rows, in elements
 * @param firstRow First row of the range to process
 * @param numberRows Number of rows to process
 * @tparam T Element type of the target
 * @tparam TIntermediate Element type of the source
 */
614 template <
typename T,
typename TIntermediate>
615 static void filterUniversalVerticalSubset(
const TIntermediate* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const float* verticalFilter,
const unsigned int filterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int firstRow,
const unsigned int numberRows);
/**
 * Declaration of a helper mapping a signed location to an unsigned one.
 * Going by the name, it mirrors locations left of the frame back into the frame -- TODO confirm against the definition.
 * @param value Signed location
 * @return Unsigned location
 */
628 static inline unsigned int mirroredBorderLocationLeft(
const int value);
/**
 * Declaration of a helper mapping an unsigned location to an unsigned location, given a size.
 * Going by the name, it mirrors locations at or beyond `size` back into the frame -- TODO confirm against the definition.
 * @param value Location to map
 * @param size Size the location is related to -- presumably the frame width or height, TODO confirm
 * @return Unsigned location
 */
643 static inline unsigned int mirroredBorderLocationRight(
const unsigned int value,
const unsigned int size);
646 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 10
666 #elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// Fragment of a template definition; its signature and loop body are not visible in this excerpt.
// The parameter names match the isFilterSymmetric() declaration -- TODO confirm.
688 template <
typename T>
// Preconditions checked via ocean_assert: a valid pointer and an odd, non-zero number of elements.
691 ocean_assert(filterValues !=
nullptr);
692 ocean_assert(size >= 1 && size % 2 == 1);
// Iterates over the first half of the filter only (indices [0, size / 2)), the center element is never visited.
694 for (
size_t n = 0; n < size / 2; ++n)
// Fragment of a template definition; its signature and return statement are not visible in this excerpt.
// The parameter names match the sumFilterValues() declaration -- TODO confirm.
705 template <
typename T>
// Preconditions checked via ocean_assert: a valid pointer and at least one element.
708 ocean_assert(filterValues !=
nullptr);
709 ocean_assert(size >= 1);
// The accumulator starts with the first element (no separate zero value of type T is needed) ...
711 T sum = filterValues[0];
// ... and the remaining elements are added in index order, using T's own arithmetic.
713 for (
size_t n = 1; n < size; ++n)
715 sum += filterValues[n];
721 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
// Three single statements from SSE helper specializations; their signatures are not visible in this excerpt.
// Sets all 128 bits of an integer register to zero.
728 value = _mm_setzero_si128();
// Sets all four float lanes to 0.0f.
736 value = _mm_set_ps1(0.0f);
// Stores a 128-bit integer register to `target`; the unaligned store places no alignment requirement on `target`.
742 _mm_storeu_si128((__m128i*)target, value);
// SSE2 specialization of writeSIMD for float: stores the four float lanes of `value` to target[0..3].
746 OCEAN_FORCE_INLINE
void FrameFilterSeparable::writeSIMD<float, PI_SSE_2>(
const SIMD32x4<float>::Type& value,
float* target)
// The register is reinterpreted as integer data (a bit-cast, no conversion) and written with an unaligned store,
// so `target` does not need to be 16-byte aligned.
748 _mm_storeu_si128((__m128i*)target, _mm_castps_si128(value));
753 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// Three single statements from NEON helper specializations; their signatures are not visible in this excerpt.
// Sets all four unsigned 32-bit lanes to zero.
758 value = vdupq_n_u32(0u);
// Sets all four float lanes to 0.0f.
764 value = vdupq_n_f32(0.0f);
// Stores four unsigned 32-bit lanes to target[0..3].
770 vst1q_u32(target, value);
// NEON specialization of writeSIMD for float: stores the four float lanes of `value` to target[0..3].
774 OCEAN_FORCE_INLINE
void FrameFilterSeparable::writeSIMD<float, PI_NEON>(
const SIMD32x4<float>::Type& value,
float* target)
776 vst1q_f32(target, value);
// Fragment of a template definition; its signature is not visible in this excerpt.
// The use of `source` and `pixels` matches the fillLeftExtraBorder() declaration -- TODO confirm.
781 template <
typename T>
784 ocean_assert(source !=
nullptr && extendedRow !=
nullptr);
// Copies the first `pixels` source pixels into extendedRow in reversed pixel order:
// output pixel n is source pixel (pixels - n - 1). The channel order inside each pixel is kept,
// because every pixel is copied as one memcpy of `channels` elements.
786 for (
unsigned int n = 0u; n < pixels; ++n)
788 memcpy(extendedRow + n * channels, source + (pixels - n - 1u) * channels,
sizeof(T) * channels);
// Fragment of a template definition; its signature is not visible in this excerpt.
// The use of `sourceEnd` and `pixels` matches the fillRightExtraBorder() declaration -- TODO confirm.
792 template <
typename T>
795 ocean_assert(sourceEnd !=
nullptr && extendedRow !=
nullptr);
// Copies the last `pixels` pixels located before sourceEnd into extendedRow in reversed pixel order:
// output pixel n is the pixel starting (n + 1) * channels elements before sourceEnd.
// The channel order inside each pixel is kept, because every pixel is copied as one memcpy.
// NOTE(review): (n + 1u) * int(channels) is still evaluated as unsigned int, the int() cast has no effect on the type.
797 for (
unsigned int n = 0u; n < pixels; ++n)
799 memcpy(extendedRow + n * channels, sourceEnd - (n + 1u) *
int(channels),
sizeof(T) * channels);
// Fragment of a template definition; its signature, braces, the #else/#endif lines and the statements
// advancing `source`/`target` between blocks are not visible in this excerpt.
// The parameter names match the filterVerticalCoreRow32BitPerChannelFloat() declaration -- TODO confirm.
// Visible behavior: one row is filtered vertically by handing it to fixed-size block functions,
// largest block size first.
803 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
806 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
807 ocean_assert(channels >= 1u);
808 ocean_assert(filterSize % 2u == 1u);
// Distance between vertically adjacent elements in the source, in elements.
810 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
// Number of elements of this row which still need to be filtered.
812 unsigned int remainingElements = width * channels;
// Blocks of 16 elements.
814 while (remainingElements >= 16u)
816 filterVerticalCoreRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements,
filter, filterSize, isSymmetric);
821 remainingElements -= 16u;
// Blocks of 8 elements.
824 while (remainingElements >= 8u)
826 filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements,
filter, filterSize, isSymmetric);
831 remainingElements -= 8u;
// SSE builds additionally use blocks of 4 elements.
834 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
836 while (remainingElements >= 4u)
838 filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements,
filter, filterSize, isSymmetric);
843 remainingElements -= 4u;
// Tail handling with 4-element blocks: the row must hold at least one full block so that stepping back stays inside the row.
846 ocean_assert(width * channels >= 4u);
847 ocean_assert(remainingElements < 4u);
849 if (remainingElements != 0u)
// The final block is moved back by `shift` elements so that it ends exactly at the end of the row;
// the first `shift` elements of that block are computed and written a second time.
851 const unsigned int shift = 4u - remainingElements;
853 filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements,
filter, filterSize, isSymmetric);
// Tail handling with 8-element blocks (presumably the branch for builds without the SSE path above -- the #else is not visible here).
858 ocean_assert(width * channels >= 8u);
859 ocean_assert(remainingElements < 8u);
861 if (remainingElements != 0u)
// Same backward shift as above, with an 8-element block.
863 const unsigned int shift = 8u - remainingElements;
865 filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements,
filter, filterSize, isSymmetric);
871 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
// SSE2 specialization: vertical filter for 4 neighboring elements, 32-bit integer source, 8-bit target.
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
874 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
876 ocean_assert(source !=
nullptr && target !=
nullptr);
877 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Index of the center filter element, also the number of rows used on each side.
897 const unsigned int filterSize_2 = filterSize / 2u;
899 const __m128i* sourceBlock = (
const __m128i*)source;
// Center filter factor, broadcast to all four lanes.
902 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row: unaligned load of four 32-bit integers, converted to float and weighted.
// NOTE(review): _mm_cvtepi32_ps treats the lanes as signed, so source values must stay below 2^31 -- confirm with the producer of `source`.
905 __m128 source128 = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock));
906 __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
909 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
911 const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
912 const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added as integers first and multiplied only once.
917 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
921 __m128i source128i = _mm_add_epi32(_mm_loadu_si128(sourceMinus), _mm_loadu_si128(sourcePlus));
923 result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128i), filterFactor_32x4));
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
928 __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
929 __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
931 __m128i source128iMinus = _mm_loadu_si128(sourceMinus);
932 __m128i source128iPlus = _mm_loadu_si128(sourcePlus);
934 result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iMinus), filterFactor128Minus));
935 result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iPlus), filterFactor128Plus));
// Float -> int32 using the current SSE rounding mode (round-to-nearest-even unless MXCSR was changed),
// then two saturating packs: int32 -> int16 (signed) and int16 -> uint8 (unsigned), clamping to [0, 255].
941 __m128i source128i = _mm_cvtps_epi32(result128);
942 source128i = _mm_packs_epi32(source128i, source128i);
943 source128i = _mm_packus_epi16(source128i, source128i);
// Writes the four result bytes with one 32-bit store; SSE::value_u32<0u>() presumably extracts the lowest 32-bit lane -- TODO confirm.
// NOTE(review): the store goes through an unsigned int* into a uint8_t buffer (type punning, alignment not guaranteed); a memcpy would avoid both concerns.
945 *((
unsigned int*)target) = SSE::value_u32<0u>(source128i);
// SSE2 specialization: vertical filter for 4 neighboring elements, float source and float target.
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
949 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
951 ocean_assert(source !=
nullptr && target !=
nullptr);
952 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Index of the center filter element, also the number of rows used on each side.
967 const unsigned int filterSize_2 = filterSize / 2u;
969 const __m128i* sourceBlock = (
const __m128i*)source;
// Center filter factor, broadcast to all four lanes.
972 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row: unaligned 128-bit load whose bits are reinterpreted as four floats (a bit-cast, no conversion).
975 __m128 source128 = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
976 __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
979 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
981 const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
982 const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added first and multiplied only once.
987 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
991 source128 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus)));
993 result128 = _mm_add_ps(result128, _mm_mul_ps(source128, filterFactor_32x4));
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
998 __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
999 __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
1002 __m128 source128Minus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus));
1003 __m128 source128Plus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus));
1005 result128 = _mm_add_ps(result128, _mm_mul_ps(source128Minus, filterFactor_32x4Minus));
1006 result128 = _mm_add_ps(result128, _mm_mul_ps(source128Plus, filterFactor_32x4Plus));
// The four float results are stored without rounding or clamping.
1010 writeSIMD<float, PI_SSE_2>(result128, target);
// SSE2 specialization: vertical filter for 8 neighboring elements, 32-bit integer source, 8-bit target.
// The elements are handled as two registers of four lanes each (suffixes a and b).
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
1014 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1016 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
1017 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Index of the center filter element, also the number of rows used on each side.
1038 const unsigned int filterSize_2 = filterSize / 2u;
1040 const __m128i* sourceBlock = (
const __m128i*)source;
// Center filter factor, broadcast to all four lanes.
1043 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row, elements 0-3: unaligned load, conversion to float, weighting.
// NOTE(review): _mm_cvtepi32_ps treats the lanes as signed, so source values must stay below 2^31 -- confirm with the producer of `source`.
1046 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1047 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
// Center row, elements 4-7.
1050 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1051 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
1054 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1056 const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
1057 const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added as integers first and multiplied only once.
1062 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
1065 __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
1066 __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
1068 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1069 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
1074 __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
1075 __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
1078 __m128i source128aiMinus =_mm_loadu_si128(sourceMinus + 0);
1079 __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
1080 __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
1081 __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
1083 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
1084 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
1086 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
1087 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
// Float -> int32 using the current SSE rounding mode, then two saturating packs:
// 2 x 4 int32 -> 8 int16 (signed), 8 int16 -> uint8 (unsigned), clamping to [0, 255].
1092 __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1093 result128 = _mm_packus_epi16(result128, result128);
// Stores the lower 64 bits, i.e. the eight result bytes.
1095 _mm_storel_epi64((__m128i*)target, result128);
// SSE2 specialization: vertical filter for 8 neighboring elements, float source and float target.
// The elements are handled as two registers of four lanes each (suffixes a and b).
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
1099 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1101 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
1102 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Index of the center filter element, also the number of rows used on each side.
1117 const unsigned int filterSize_2 = filterSize / 2u;
1119 const __m128i* sourceBlock = (
const __m128i*)source;
// Center filter factor, broadcast to all four lanes.
1122 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row, elements 0-3: unaligned 128-bit load whose bits are reinterpreted as floats (a bit-cast, no conversion).
1125 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1126 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
// Center row, elements 4-7.
1129 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1130 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
1133 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1135 const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
1136 const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added first and multiplied only once.
1141 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
1145 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
1146 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
1148 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1149 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
1154 __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
1155 __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
1158 __m128 source128aMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
1159 __m128 source128aPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
1160 __m128 source128bMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
1161 __m128 source128bPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
1163 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aMinus, filterFactor_32x4Minus));
1164 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aPlus, filterFactor_32x4Plus));
1166 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bMinus, filterFactor_32x4Minus));
1167 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bPlus, filterFactor_32x4Plus));
// The eight float results are stored without rounding or clamping.
1171 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1172 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1177 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// NEON specialization: vertical filter for 8 neighboring elements, unsigned 32-bit source, 8-bit target.
// The elements are handled as two registers of four lanes each (suffixes a and b).
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt;
// no parameter assertions are visible in this specialization either.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
1180 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
// Index of the center filter element, also the number of rows used on each side.
1182 const unsigned int filterSize_2 = filterSize / 2u;
// Center filter factor, broadcast to all four lanes.
1185 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
// Center row, elements 0-3: load, unsigned -> float conversion, weighting.
1188 float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
1189 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
// Center row, elements 4-7.
1191 float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
1192 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
1195 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1197 const unsigned int* sourceMinus = source - sourceStrideElements * i;
1198 const unsigned int* sourcePlus = source + sourceStrideElements * i;
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added as integers first; vmlaq_f32(acc, x, f) then computes acc + x * f.
1203 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
1206 uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
1207 uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
1209 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1210 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
1216 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
1217 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
1219 uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
1220 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
1222 uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
1223 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
1225 uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
1226 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
1228 uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
1229 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
// Float -> uint32, then saturating narrowing uint32 -> uint16 and uint16 -> uint8.
// NOTE(review): vcvtq_u32_f32 truncates toward zero, while the SSE2 specializations in this file convert with
// _mm_cvtps_epi32 (rounding per MXCSR, nearest by default); unless a rounding offset is added in lines not
// visible here, results can differ by one between the platforms -- confirm whether this is intended.
1234 uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
1236 uint8x8_t result64 = vqmovn_u16(result128ab);
// Stores the eight result bytes.
1238 vst1_u8(target, result64);
// NEON specialization: vertical filter for 8 neighboring elements, float source and float target.
// The elements are handled as two registers of four lanes each (suffixes a and b).
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt;
// no parameter assertions are visible in this specialization either.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
1242 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
// Index of the center filter element, also the number of rows used on each side.
1244 const unsigned int filterSize_2 = filterSize / 2u;
// Center filter factor, broadcast to all four lanes.
1247 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
// Center row, elements 0-3.
1250 float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
1251 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
// Center row, elements 4-7.
1253 float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
1254 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
1257 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1259 const float* sourceMinus = source - sourceStrideElements * i;
1260 const float* sourcePlus = source + sourceStrideElements * i;
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added first; vmlaq_f32(acc, x, f) then computes acc + x * f.
1265 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
1268 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
1269 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
1271 result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
1272 result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
1278 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
1279 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
1281 float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
1282 float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
1284 float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
1285 float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
1287 result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
1288 result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
1290 result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
1291 result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
// The eight float results are stored without rounding or clamping.
1295 vst1q_f32(target + 0, result_32x4a);
1296 vst1q_f32(target + 4, result_32x4b);
1301 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
// SSE2 specialization: vertical filter for 16 neighboring elements, 32-bit integer source, 8-bit target.
// The elements are handled as four registers of four lanes each (suffixes a, b, c and d).
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
1304 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1306 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
1307 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Index of the center filter element, also the number of rows used on each side.
1348 const unsigned int filterSize_2 = filterSize / 2u;
1350 const __m128i* sourceBlock = (
const __m128i*)source;
// Center filter factor, broadcast to all four lanes.
1353 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row: four unaligned loads, each converted to float and weighted.
// NOTE(review): _mm_cvtepi32_ps treats the lanes as signed, so source values must stay below 2^31 -- confirm with the producer of `source`.
1356 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1357 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1360 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1361 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1363 __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
1364 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
1366 __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
1367 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
1370 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1372 const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
1373 const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added as integers first and multiplied only once.
1378 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
1382 __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
1383 __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
1385 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1386 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
// The two temporaries are reused for elements 8-11 and 12-15 (accumulated into results c and d).
1388 source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 2), _mm_loadu_si128(sourcePlus + 2));
1389 source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 3), _mm_loadu_si128(sourcePlus + 3));
1391 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1392 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
1397 __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
1398 __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
1400 __m128i source128aiMinus = _mm_loadu_si128(sourceMinus + 0);
1401 __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
1403 __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
1404 __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
1406 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
1407 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
1409 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
1410 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
1412 __m128i source128ciMinus = _mm_loadu_si128(sourceMinus + 2);
1413 __m128i source128ciPlus = _mm_loadu_si128(sourcePlus + 2);
1415 __m128i source128diMinus = _mm_loadu_si128(sourceMinus + 3);
1416 __m128i source128diPlus = _mm_loadu_si128(sourcePlus + 3);
1418 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
1419 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
1421 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
1422 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
// Float -> int32 using the current SSE rounding mode, then saturating packs:
// 4 x 4 int32 -> 2 x 8 int16 (signed), 2 x 8 int16 -> 16 uint8 (unsigned), clamping to [0, 255].
1427 __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1428 __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
1429 __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
// Unaligned store of the sixteen result bytes.
1431 _mm_storeu_si128((__m128i*)target, result128);
// SSE2 specialization: vertical filter for 16 neighboring elements, float source and float target.
// The elements are handled as four registers of four lanes each (suffixes a, b, c and d).
// Braces and the branch selecting between the two loop-body variants are not visible in this excerpt.
// The function reads filterSize / 2 rows above and below `source`, so these rows must exist.
1435 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1437 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
1438 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Index of the center filter element, also the number of rows used on each side.
1473 const unsigned int filterSize_2 = filterSize / 2u;
1475 const __m128i* sourceBlock = (
const __m128i*)source;
// Center filter factor, broadcast to all four lanes.
1478 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row: four unaligned 128-bit loads whose bits are reinterpreted as floats (a bit-cast, no conversion).
1481 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1482 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1485 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1486 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1488 __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
1489 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
1491 __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
1492 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
// Accumulates the rows i above and i below the center row.
1495 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1497 const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
1498 const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
// Variant 1 (presumably used when isSymmetric is true): both rows share one factor,
// so they are added first and multiplied only once.
1503 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
1507 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
1508 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
1510 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1511 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
1513 source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2)));
1514 source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3)));
1516 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
1517 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
// Variant 2 (presumably the non-symmetric case): each row is weighted with its own factor.
// The source_32x4* registers are reused as load temporaries here.
1522 __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
1523 __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
// Upper row (i rows above the center row), all four registers.
1525 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
1526 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
1528 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
1529 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
1531 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2));
1532 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
1534 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3));
1535 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
// Lower row (i rows below the center row), all four registers.
1537 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
1538 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
1540 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
1541 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
1543 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2));
1544 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
1546 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3));
1547 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
// The sixteen float results are stored without rounding or clamping.
1551 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1552 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1553 writeSIMD<float, PI_SSE_2>(result_32x4c, target + 8);
1554 writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
1559 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1562 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1564 const unsigned int filterSize_2 = filterSize / 2u;
1567 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
1570 float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
1571 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1573 float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
1574 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1576 float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
1577 float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
1579 float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
1580 float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
1583 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1585 const unsigned int* sourceMinus = source - sourceStrideElements * i;
1586 const unsigned int* sourcePlus = source + sourceStrideElements * i;
1591 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
1594 uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
1595 uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
1597 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1598 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
1600 source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
1601 source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));
1603 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1604 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
1610 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
1611 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
1613 uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
1614 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
1616 uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
1617 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
1619 uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
1620 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
1622 uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
1623 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
1625 uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
1626 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);
1628 uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
1629 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);
1631 uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
1632 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);
1634 uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
1635 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
1640 uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
1641 uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));
1643 uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));
1645 vst1q_u8(target, result128);
1649 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1651 const unsigned int filterSize_2 = filterSize / 2u;
1654 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
1657 float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
1658 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1660 float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
1661 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1663 float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
1664 float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
1666 float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
1667 float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
1670 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
1672 const float* sourceMinus = source - sourceStrideElements * i;
1673 const float* sourcePlus = source + sourceStrideElements * i;
1678 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
1681 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
1682 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
1684 result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
1685 result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
1687 source_32x4c = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
1688 source_32x4d = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));
1690 result_32x4c = vmlaq_f32(result_32x4c, source_32x4c, filterFactor_32x4);
1691 result_32x4d = vmlaq_f32(result_32x4d, source_32x4d, filterFactor_32x4);
1697 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
1698 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
1700 float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
1701 float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
1703 float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
1704 float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
1706 result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
1707 result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
1709 result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
1710 result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
1712 source128aMinus = vld1q_f32(sourceMinus + 4 * 2);
1713 source128aPlus = vld1q_f32(sourcePlus + 4 * 2);
1715 source128bMinus = vld1q_f32(sourceMinus + 4 * 3);
1716 source128bPlus = vld1q_f32(sourcePlus + 4 * 3);
1718 result_32x4c = vmlaq_f32(result_32x4c, source128aMinus, filterFactor128Minus);
1719 result_32x4d = vmlaq_f32(result_32x4d, source128bMinus, filterFactor128Minus);
1721 result_32x4c = vmlaq_f32(result_32x4c, source128aPlus, filterFactor128Plus);
1722 result_32x4d = vmlaq_f32(result_32x4d, source128bPlus, filterFactor128Plus);
1726 vst1q_f32(target + 0, result_32x4a);
1727 vst1q_f32(target + 4, result_32x4b);
1728 vst1q_f32(target + 8, result_32x4c);
1729 vst1q_f32(target + 12, result_32x4d);
1734 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
// SSE2 specialization (32 bit unsigned integer source, 8 bit target): vertically filters 8 consecutive elements of a row for which the filter reaches beyond the top or bottom of the frame (see the assert on `row` below).
// source: the elements of the current row; neighbor rows are addressed relative to it in multiples of sourceStrideElements.
// target: receives the 8 filtered elements, narrowed with saturation to 8 bit.
// height, row: frame height and index of the current row, used in the visible lines only by the border assert.
// filter, filterSize: floating point filter factors, odd count; the center factor is filter[filterSize / 2].
// isSymmetric: not referenced in the visible lines. NOTE(review): presumably it selects between the shared-factor path and the individual-factor path inside the loop - confirm against the full file.
1737 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1739 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
1740 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// index of the center filter factor, also the number of neighbor rows on each side
1761 const unsigned int filterSize_2 = filterSize / 2u;
// this function is meant for border rows only: the filter must leave the frame at the top or at the bottom
1764 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1766 const __m128i* sourceBlock = (
const __m128i*)source;
// the center factor is broadcast to all four float lanes
1769 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// first four elements of the current row: 32 bit integer -> float, weighted with the center factor
1772 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1773 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
// second four elements of the current row
1776 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1777 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1779 __m128i source128ai, source128bi;
// one iteration per pair of neighbor rows
1782 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
// NOTE(review): offsetMinus and offsetPlus are not defined in the visible lines; presumably they are the signed row offsets of the (border-adjusted) neighbor rows for this i - confirm against the full file.
1789 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1790 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// shared-factor path: both neighbor rows are added as integers and their sum is weighted with filter[filterSize_2 + i]
1795 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
1799 source128ai = _mm_add_epi32(_mm_loadu_si128((
const __m128i*)sourceMinus + 0), _mm_loadu_si128((
const __m128i*)sourcePlus + 0));
1800 source128bi = _mm_add_epi32(_mm_loadu_si128((
const __m128i*)sourceMinus + 1), _mm_loadu_si128((
const __m128i*)sourcePlus + 1));
1802 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1803 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
// individual-factor path: the minus row is weighted with filter[filterSize_2 - i], the plus row with filter[filterSize_2 + i]
1809 __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
1810 __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
1814 source128ai = _mm_loadu_si128((
const __m128i*)sourceMinus + 0);
1815 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Minus));
1817 source128bi = _mm_loadu_si128((
const __m128i*)sourceMinus + 1);
1818 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Minus));
1820 source128ai = _mm_loadu_si128((
const __m128i*)sourcePlus + 0);
1821 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Plus));
1823 source128bi = _mm_loadu_si128((
const __m128i*)sourcePlus + 1);
1824 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Plus));
// float -> 32 bit integer, then packed with signed saturation to 16 bit
1829 __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
// 16 bit -> 8 bit with unsigned saturation; the lower 8 bytes hold the result
1830 result128 = _mm_packus_epi16(result128, result128);
// only the lower 64 bit (8 elements) are written
1832 _mm_storel_epi64((__m128i*)target, result128);
// SSE2 specialization (float source, float target): vertically filters 8 consecutive elements of a row for which the filter reaches beyond the top or bottom of the frame (see the assert on `row` below).
// source: the elements of the current row; neighbor rows are addressed relative to it in multiples of sourceStrideElements.
// target: receives the 8 filtered float elements.
// height, row: frame height and index of the current row, used in the visible lines only by the border assert.
// filter, filterSize: floating point filter factors, odd count; the center factor is filter[filterSize / 2].
// isSymmetric: not referenced in the visible lines. NOTE(review): presumably it selects between the shared-factor path and the individual-factor path inside the loop - confirm against the full file.
1836 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1838 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
1839 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// index of the center filter factor, also the number of neighbor rows on each side
1854 const unsigned int filterSize_2 = filterSize / 2u;
// this function is meant for border rows only: the filter must leave the frame at the top or at the bottom
1857 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1859 const __m128i* sourceBlock = (
const __m128i*)source;
// the center factor is broadcast to all four float lanes
1862 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// first four elements of the current row (unaligned integer load reinterpreted as floats), weighted with the center factor
1865 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1866 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
// second four elements of the current row
1869 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1870 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
// one iteration per pair of neighbor rows
1873 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
// NOTE(review): offsetMinus and offsetPlus are not defined in the visible lines; presumably they are the signed row offsets of the (border-adjusted) neighbor rows for this i - confirm against the full file.
1880 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1881 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// shared-factor path: both neighbor rows are added and their sum is weighted with filter[filterSize_2 + i]
1886 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
1890 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0)));
1891 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1)));
1893 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1894 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
// individual-factor path: the minus row is weighted with filter[filterSize_2 - i], the plus row with filter[filterSize_2 + i]
1899 __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
1900 __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
1902 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0));
1903 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
1905 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1));
1906 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
1908 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0));
1909 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
1911 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1));
1912 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
// the two accumulators are written as 4 + 4 float elements
1916 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1917 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1922 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// NEON specialization (32 bit unsigned integer source, 8 bit target): vertically filters 8 consecutive elements of a row for which the filter reaches beyond the top or bottom of the frame (see the assert on `row` below).
// source: the elements of the current row; neighbor rows are addressed relative to it in multiples of sourceStrideElements.
// target: receives the 8 filtered elements, narrowed with saturation to 8 bit.
// height, row: frame height and index of the current row, used in the visible lines only by the border assert.
// filter, filterSize: floating point filter factors, odd count; the center factor is filter[filterSize / 2].
// isSymmetric: not referenced in the visible lines. NOTE(review): presumably it selects between the shared-factor path and the individual-factor path inside the loop - confirm against the full file.
1925 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
1927 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
1928 ocean_assert(filterSize % 2u == 1u);
// index of the center filter factor, also the number of neighbor rows on each side
1930 const unsigned int filterSize_2 = filterSize / 2u;
// this function is meant for border rows only: the filter must leave the frame at the top or at the bottom
1933 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
// the center factor is broadcast to all four float lanes
1936 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
// first four elements of the current row: unsigned 32 bit -> float, weighted with the center factor
1939 float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
1940 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
// second four elements of the current row
1942 float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
1943 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
// one iteration per pair of neighbor rows
1946 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
// NOTE(review): offsetMinus and offsetPlus are not defined in the visible lines; presumably they are the signed row offsets of the (border-adjusted) neighbor rows for this i - confirm against the full file.
1953 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1954 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// shared-factor path: both neighbor rows are added as integers and their sum is weighted with filter[filterSize_2 + i]
1959 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
1963 uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
1964 uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
1966 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1967 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
// individual-factor path: the minus row is weighted with filter[filterSize_2 - i], the plus row with filter[filterSize_2 + i]
1973 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
1974 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
1976 uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
1977 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
1979 uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
1980 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
1982 uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
1983 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
1985 uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
1986 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
// float -> unsigned 32 bit, then saturating narrowing to 16 bit
// NOTE(review): vcvtq_u32_f32 rounds toward zero while the SSE counterpart converts with _mm_cvtps_epi32 (current rounding mode) - confirm the difference is intended.
1991 uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
// saturating narrowing to 8 bit
1993 uint8x8_t result64 = vqmovn_u16(result128ab);
// writes the 8 result elements
1995 vst1_u8(target, result64);
// NEON specialization (float source, float target): vertically filters 8 consecutive elements of a row for which the filter reaches beyond the top or bottom of the frame (see the assert on `row` below).
// source: the elements of the current row; neighbor rows are addressed relative to it in multiples of sourceStrideElements.
// target: receives the 8 filtered float elements.
// height, row: frame height and index of the current row, used in the visible lines only by the border assert.
// filter, filterSize: floating point filter factors, odd count; the center factor is filter[filterSize / 2].
// isSymmetric: not referenced in the visible lines. NOTE(review): presumably it selects between the shared-factor path and the individual-factor path inside the loop - confirm against the full file.
1999 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
2001 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
2002 ocean_assert(filterSize % 2u == 1u);
// index of the center filter factor, also the number of neighbor rows on each side
2004 const unsigned int filterSize_2 = filterSize / 2u;
// this function is meant for border rows only: the filter must leave the frame at the top or at the bottom
2007 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
// the center factor is broadcast to all four float lanes
2010 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
// first four elements of the current row, weighted with the center factor
2013 float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
2014 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
// second four elements of the current row
2016 float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
2017 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
// one iteration per pair of neighbor rows
2020 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
// NOTE(review): offsetMinus and offsetPlus are not defined in the visible lines; presumably they are the signed row offsets of the (border-adjusted) neighbor rows for this i - confirm against the full file.
2027 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2028 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// shared-factor path: both neighbor rows are added and their sum is weighted with filter[filterSize_2 + i]
2033 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
2037 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
2038 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
2040 result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
2041 result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
// individual-factor path: the minus row is weighted with filter[filterSize_2 - i], the plus row with filter[filterSize_2 + i]
2047 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
2048 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
2050 float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
2051 float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
2053 float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
2054 float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
// the minus row is accumulated before the plus row
2056 result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
2057 result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
2059 result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
2060 result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
// the two accumulators are written as 4 + 4 float elements
2064 vst1q_f32(target + 0, result_32x4a);
2065 vst1q_f32(target + 4, result_32x4b);
2070 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
2073 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* source,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
2088 __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
2091 __m128i source128 = _mm_set1_epi32(*((
const int*)source));
2094 source128 = _mm_unpacklo_epi8(source128, _mm_setzero_si128());
2097 source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
2100 source128 = _mm_madd_epi16(source128, filterFactor_32x4);
2103 target = _mm_add_epi32(target, source128);
2107 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* source,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2123 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2126 __m128 source_32x4 = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)source));
2129 source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
2132 target_32x4 = _mm_add_ps(target_32x4, source_32x4);
2136 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* sourceLeft,
const uint8_t* sourceRight,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
2151 __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
2154 __m128i source128 = _mm_add_epi16(_mm_unpacklo_epi8(_mm_set1_epi32(*((
const int*)sourceLeft)), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_set1_epi32(*((
const int*)sourceRight)), _mm_setzero_si128()));
2157 source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
2160 source128 = _mm_madd_epi16(source128, filterFactor_32x4);
2163 target = _mm_add_epi32(target, source128);
2167 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* sourceLeft,
const float* sourceRight,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2183 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2186 __m128 source_32x4 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceLeft)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceRight)));
2189 source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
2192 target_32x4 = _mm_add_ps(target_32x4, source_32x4);
2196 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* source,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2213 __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
2216 __m128i source_32x4a = _mm_loadl_epi64((
const __m128i*)source);
2219 source_32x4a = _mm_unpacklo_epi8(source_32x4a, _mm_setzero_si128());
2222 __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
2223 source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
2226 source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
2227 source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
2230 target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
2231 target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
2235 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* source,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2251 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2254 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)source + 0));
2255 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)source + 1));
2258 source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2259 source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2262 target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
2263 target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
2267 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* sourceLeft,
const uint8_t* sourceRight,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2284 __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
2287 __m128i source_32x4a = _mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((
const __m128i*)sourceLeft), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_loadl_epi64((
const __m128i*)sourceRight), _mm_setzero_si128()));
2290 __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
2291 source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
2294 source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
2295 source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
2298 target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
2299 target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
2303 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* sourceLeft,
const float* sourceRight,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2319 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2322 __m128 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceLeft + 0)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceRight + 0)));
2323 __m128 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceLeft + 1)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceRight + 1)));
2326 source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2327 source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2330 target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
2331 target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
2336 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2339 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* source,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
2341 ocean_assert(filterFactor <= 0xFFFFu);
2344 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2346 #if defined(__aarch64__)
2349 const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(*((
const uint32_t*)source))));
2353 uint32_t sourceValue;
2354 ((uint8_t*)&sourceValue)[0] = source[0];
2355 ((uint8_t*)&sourceValue)[1] = source[1];
2356 ((uint8_t*)&sourceValue)[2] = source[2];
2357 ((uint8_t*)&sourceValue)[3] = source[3];
2359 const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValue)));
2364 target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
2368 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* source,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2371 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2374 const float32x4_t source128 = vld1q_f32(source);
2377 target_32x4 = vmlaq_f32(target_32x4, source128, filterFactor_32x4);
2381 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* sourceLeft,
const uint8_t* sourceRight,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
2383 ocean_assert(filterFactor <= 0xFFFFu);
2386 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2388 #if defined(__aarch64__)
2391 const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(*((
const uint32_t*)sourceLeft))), vreinterpret_u8_u32(vdup_n_u32(*((
const uint32_t*)sourceRight))));
2395 uint32_t sourceValueLeft;
2396 ((uint8_t*)&sourceValueLeft)[0] = sourceLeft[0];
2397 ((uint8_t*)&sourceValueLeft)[1] = sourceLeft[1];
2398 ((uint8_t*)&sourceValueLeft)[2] = sourceLeft[2];
2399 ((uint8_t*)&sourceValueLeft)[3] = sourceLeft[3];
2401 uint32_t sourceValueRight;
2402 ((uint8_t*)&sourceValueRight)[0] = sourceRight[0];
2403 ((uint8_t*)&sourceValueRight)[1] = sourceRight[1];
2404 ((uint8_t*)&sourceValueRight)[2] = sourceRight[2];
2405 ((uint8_t*)&sourceValueRight)[3] = sourceRight[3];
2408 const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValueLeft)), vreinterpret_u8_u32(vdup_n_u32(sourceValueRight)));
2413 target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
2417 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* sourceLeft,
const float* sourceRight,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2420 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2423 const float32x4_t source_32x4 = vaddq_f32(vld1q_f32(sourceLeft), vld1q_f32(sourceRight));
2426 target_32x4 = vmlaq_f32(target_32x4, source_32x4, filterFactor_32x4);
2430 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* source,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2432 ocean_assert(filterFactor <= 0xFFFFu);
2435 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2438 const uint16x8_t source16_8 = vmovl_u8(vld1_u8(source));
2441 target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
2442 target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
2446 OCEAN_FORCE_INLINE
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* source,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2449 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2452 const float32x4_t source_32x4a = vld1q_f32(source + 0);
2453 const float32x4_t source_32x4b = vld1q_f32(source + 4);
2456 target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
2457 target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
2461 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* sourceLeft,
const uint8_t* sourceRight,
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2463 ocean_assert(filterFactor <= 0xFFFFu);
2466 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2469 const uint16x8_t source16_8 = vaddl_u8(vld1_u8(sourceLeft), vld1_u8(sourceRight));
2472 target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
2473 target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
2477 OCEAN_FORCE_INLINE
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* sourceLeft,
const float* sourceRight,
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2480 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2483 const float32x4_t source_32x4a = vaddq_f32(vld1q_f32(sourceLeft + 0), vld1q_f32(sourceRight + 0));
2484 const float32x4_t source_32x4b = vaddq_f32(vld1q_f32(sourceLeft + 4), vld1q_f32(sourceRight + 4));
2487 target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
2488 target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
// Generic (TSource, TFilter, tProcessorInstructions) horizontal filter step producing one block of 4 filtered elements in the accumulator target_32x4, which is finally written to target.
// The function signature and the declaration of target_32x4 are not part of the visible lines; the visible code uses source, target, channels, filter and filterSize.
2493 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
2522 ocean_assert(source !=
nullptr &&
filter !=
nullptr);
2523 ocean_assert(channels >= 1u);
2524 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// the accumulator starts with zero
2528 setSIMDZero<TFilter, tProcessorInstructions>(target_32x4);
// number of filter elements on each side of the center element
2532 const unsigned int filterSize_2 = filterSize / 2u;
// NOTE(review): the loop and the single call below look like the symmetric path, the selecting branch is not visible - confirm against the full file.
// filter element n is paired with its mirrored element (filterSize - n - 1); both source positions share filter[n], consecutive filter elements are `channels` source elements apart
2535 for (
unsigned int n = 0u; n < filterSize_2; ++n)
2537 symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels,
filter[n], target_32x4);
// the center filter element has no mirrored partner and is applied on its own
2541 asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels,
filter[filterSize_2], target_32x4);
// NOTE(review): the loop below looks like the alternative for filters which are not symmetric - every filter element is applied individually.
2546 for (
unsigned int n = 0u; n < filterSize; ++n)
2548 asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels,
filter[n], target_32x4);
// the accumulated values are written to the target
2552 writeSIMD<TFilter, tProcessorInstructions>(target_32x4, target);
// Generic (TSource, TFilter, tProcessorInstructions) horizontal filter step producing one block of 8 filtered elements in the accumulators target_32x4a and target_32x4b, which are finally written to target.
// The function signature and the declarations of the accumulators are not part of the visible lines; the visible code uses source, target, channels, filter and filterSize.
2555 template <
typename TSource,
typename TFilter, ProcessorInstructions tProcessorInstructions>
2589 ocean_assert(source !=
nullptr &&
filter !=
nullptr);
2590 ocean_assert(channels >= 1u);
2591 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// both accumulators start with zero
2595 setSIMDZero<TFilter, tProcessorInstructions>(target_32x4a);
2596 setSIMDZero<TFilter, tProcessorInstructions>(target_32x4b);
// number of filter elements on each side of the center element
2600 const unsigned int filterSize_2 = filterSize / 2u;
// NOTE(review): the loop and the single call below look like the symmetric path, the selecting branch is not visible - confirm against the full file.
// filter element n is paired with its mirrored element (filterSize - n - 1); both source positions share filter[n], consecutive filter elements are `channels` source elements apart
2603 for (
unsigned int n = 0u; n < filterSize_2; ++n)
2605 symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels,
filter[n], target_32x4a, target_32x4b);
// the center filter element has no mirrored partner and is applied on its own
2609 asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels,
filter[filterSize_2], target_32x4a, target_32x4b);
// NOTE(review): the loop below looks like the alternative for filters which are not symmetric - every filter element is applied individually.
2614 for (
unsigned int n = 0u; n < filterSize; ++n)
2616 asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels,
filter[n], target_32x4a, target_32x4b);
// the accumulated values are written as 4 + 4 elements
2620 writeSIMD<TFilter, tProcessorInstructions>(target_32x4a, target + 0);
2621 writeSIMD<TFilter, tProcessorInstructions>(target_32x4b, target + 4);
2624 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
// SSE specialization (source: unsigned int, target: uint8_t).
// Computes the vertical filter response for 16 consecutive elements of one row: the 16 source values are
// read as four 128-bit blocks (a, b, c, d) of four 32-bit integers each, the weighted sum is accumulated
// in float, and the result is stored as 16 uint8_t values.
2627 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
// Preconditions: valid pointers and an odd, non-zero filter size.
2629 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
2630 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Number of filter taps above/below the center tap (also the index of the center tap).
2668 const unsigned int filterSize_2 = filterSize / 2u;
// This function expects a row whose filter window exceeds the frame at the top or at the bottom.
2671 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2673 const __m128i* sourceBlock = (
const __m128i*)source;
// Broadcast the center filter factor to all four lanes.
2676 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row: unaligned load of each block, integer-to-float conversion, multiplication with the center factor.
2679 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
2680 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2683 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
2684 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2686 __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
2687 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
2689 __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
2690 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
2692 __m128i source128ai, source128bi;
// Accumulate the remaining taps in pairs, i rows away from the center in each direction.
// NOTE(review): offsetMinus/offsetPlus are defined in lines that are not part of this excerpt;
// presumably they are the signed row offsets after border handling - confirm.
2695 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
2702 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2703 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// Variant with one shared factor for both rows: the two rows are added as 32-bit integers first,
// then converted to float and scaled once.
// NOTE(review): presumably the isSymmetric branch; the branching statement is not visible in this excerpt.
2708 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
2712 source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 0), _mm_loadu_si128((__m128i*)sourcePlus + 0));
2713 source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 1), _mm_loadu_si128((__m128i*)sourcePlus + 1));
2715 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
2716 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
2718 source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 2), _mm_loadu_si128((__m128i*)sourcePlus + 2));
2719 source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 3), _mm_loadu_si128((__m128i*)sourcePlus + 3));
2721 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
2722 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
// Variant with individual factors: the 'minus' row is weighted with filter[filterSize_2 - i],
// the 'plus' row with filter[filterSize_2 + i]; each row is converted and scaled separately.
2727 __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
2728 __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
2730 __m128i source128aiMinus = _mm_loadu_si128((__m128i*)sourceMinus + 0);
2731 __m128i source128aiPlus = _mm_loadu_si128((__m128i*)sourcePlus + 0);
2733 __m128i source128biMinus = _mm_loadu_si128((__m128i*)sourceMinus + 1);
2734 __m128i source128biPlus = _mm_loadu_si128((__m128i*)sourcePlus + 1);
2736 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
2737 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
2739 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
2740 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
2742 __m128i source128ciMinus = _mm_loadu_si128((__m128i*)sourceMinus + 2);
2743 __m128i source128ciPlus = _mm_loadu_si128((__m128i*)sourcePlus + 2);
2745 __m128i source128diMinus = _mm_loadu_si128((__m128i*)sourceMinus + 3);
2746 __m128i source128diPlus = _mm_loadu_si128((__m128i*)sourcePlus + 3);
2748 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
2749 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
2751 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
2752 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
// Narrow the four float accumulators to 16 bytes: float -> int32 (_mm_cvtps_epi32, uses the current
// rounding mode), int32 -> int16 with signed saturation, int16 -> uint8 with unsigned saturation.
2757 __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
2758 __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
2759 __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
// Unaligned store of the 16 result bytes.
2761 _mm_storeu_si128((__m128i*)target, result128);
// SSE specialization (source: float, target: float).
// Computes the vertical filter response for 16 consecutive float elements of one row, handled as four
// 128-bit blocks (a, b, c, d) of four floats each, and writes 16 float results.
2765 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
// Preconditions: valid pointers and an odd, non-zero filter size.
2767 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
2768 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Number of filter taps above/below the center tap (also the index of the center tap).
2800 const unsigned int filterSize_2 = filterSize / 2u;
// This function expects a row whose filter window exceeds the frame at the top or at the bottom.
2803 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2805 const __m128i* sourceBlock = (
const __m128i*)source;
// Broadcast the center filter factor to all four lanes.
2808 __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
// Center row: the data is loaded through the integer load intrinsic and re-interpreted (not converted)
// as float via _mm_castsi128_ps, then multiplied with the center factor.
2811 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
2812 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2815 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
2816 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2818 __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
2819 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
2821 __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
2822 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
// Accumulate the remaining taps in pairs, i rows away from the center in each direction.
// NOTE(review): offsetMinus/offsetPlus are defined in lines that are not part of this excerpt;
// presumably they are the signed row offsets after border handling - confirm.
2825 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
2832 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2833 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// Variant with one shared factor for both rows: the two rows are added first, then scaled once.
// NOTE(review): presumably the isSymmetric branch; the branching statement is not visible in this excerpt.
2838 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
2842 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0)));
2843 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1)));
2845 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
2846 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
2848 source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 2)));
2849 source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 3)));
2851 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
2852 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
// Variant with individual factors: the 'minus' row is weighted with filter[filterSize_2 - i],
// the 'plus' row with filter[filterSize_2 + i].
2857 __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
2858 __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
// Contribution of the 'minus' row for all four blocks.
2860 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0));
2861 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
2863 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1));
2864 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
2866 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 2));
2867 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
2869 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 3));
2870 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
// Contribution of the 'plus' row for all four blocks.
2872 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0));
2873 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
2875 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1));
2876 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
2878 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 2));
2879 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
2881 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 3));
2882 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
// Write the four accumulators to target[0..15], four floats each.
2886 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
2887 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
2888 writeSIMD<float, PI_SSE_2>(result_32x4c, target + 8);
2889 writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
2894 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// NEON specialization (source: unsigned int, target: uint8_t).
// Computes the vertical filter response for 16 consecutive elements of one row: the 16 source values are
// read as four vectors (a, b, c, d) of four 32-bit unsigned integers each, the weighted sum is accumulated
// in float, and the result is stored as 16 uint8_t values.
2897 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
// Preconditions: valid pointers and an odd, non-zero filter size.
2899 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
2900 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Number of filter taps above/below the center tap (also the index of the center tap).
2919 const unsigned int filterSize_2 = filterSize / 2u;
// This function expects a row whose filter window exceeds the frame at the top or at the bottom.
2922 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
// Broadcast the center filter factor to all four lanes.
2925 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
// Center row: load each vector, convert unsigned integer to float, multiply with the center factor.
2928 float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
2929 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
2931 float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
2932 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
2934 float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
2935 float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
2937 float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
2938 float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
// Accumulate the remaining taps in pairs, i rows away from the center in each direction.
// NOTE(review): offsetMinus/offsetPlus are defined in lines that are not part of this excerpt;
// presumably they are the signed row offsets after border handling - confirm.
2941 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
2948 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2949 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// Variant with one shared factor for both rows: the two rows are added as unsigned integers first,
// then converted to float and multiply-accumulated once.
// NOTE(review): presumably the isSymmetric branch; the branching statement is not visible in this excerpt.
2954 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
2958 uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
2959 uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
2961 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
2962 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
2964 source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
2965 source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));
2967 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
2968 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
// Variant with individual factors: the 'minus' row is weighted with filter[filterSize_2 - i],
// the 'plus' row with filter[filterSize_2 + i]; each row is converted and accumulated separately.
2974 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
2975 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
2977 uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
2978 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
2980 uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
2981 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
2983 uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
2984 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
2986 uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
2987 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
2989 uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
2990 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);
2992 uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
2993 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);
2995 uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
2996 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);
2998 uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
2999 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
// Narrow the four float accumulators to 16 bytes: float -> uint32 (vcvtq_u32_f32),
// uint32 -> uint16 and uint16 -> uint8 with saturating narrowing (vqmovn).
// NOTE(review): vcvtq_u32_f32 rounds toward zero while the SSE counterpart converts with
// _mm_cvtps_epi32 - results of both paths may differ by one; confirm this is intended.
3004 uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
3005 uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));
3007 uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));
// Store the 16 result bytes.
3009 vst1q_u8(target, result128);
// NEON specialization (source: float, target: float).
// Computes the vertical filter response for 16 consecutive float elements of one row, handled as four
// vectors (a, b, c, d) of four floats each, and writes 16 float results.
3013 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source,
float* target,
const unsigned int sourceStrideElements,
const unsigned int height,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric)
// Preconditions: valid pointers and an odd, non-zero filter size.
3015 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
3016 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
// Number of filter taps above/below the center tap (also the index of the center tap).
3035 const unsigned int filterSize_2 = filterSize / 2u;
// This function expects a row whose filter window exceeds the frame at the top or at the bottom.
3038 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
// Broadcast the center filter factor to all four lanes.
3041 float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
// Center row: load each vector and multiply it with the center factor.
3044 float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
3045 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
3047 float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
3048 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
3050 float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
3051 float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
3053 float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
3054 float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
// Accumulate the remaining taps in pairs, i rows away from the center in each direction.
// NOTE(review): offsetMinus/offsetPlus are defined in lines that are not part of this excerpt;
// presumably they are the signed row offsets after border handling - confirm.
3057 for (
unsigned int i = 1u; i <= filterSize_2; ++i)
3064 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
3065 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
// Variant with one shared factor for both rows: the two rows are added first, then multiply-accumulated once.
// The temporaries source_32x4a/source_32x4b are re-used for the third and fourth vector as well.
// NOTE(review): presumably the isSymmetric branch; the branching statement is not visible in this excerpt.
3070 filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
3074 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
3075 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
3077 result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
3078 result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
3080 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
3081 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));
3083 result_32x4c = vmlaq_f32(result_32x4c, source_32x4a, filterFactor_32x4);
3084 result_32x4d = vmlaq_f32(result_32x4d, source_32x4b, filterFactor_32x4);
// Variant with individual factors: the 'minus' row is weighted with filter[filterSize_2 - i],
// the 'plus' row with filter[filterSize_2 + i].
3090 float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
3091 float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
// First and second vector (elements 0-7).
3093 float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
3094 float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
3096 float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
3097 float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
3099 result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
3100 result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
3102 result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
3103 result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
// Third and fourth vector (elements 8-15), re-using the same temporaries.
3105 source128aMinus = vld1q_f32(sourceMinus + 4 * 2);
3106 source128aPlus = vld1q_f32(sourcePlus + 4 * 2);
3108 source128bMinus = vld1q_f32(sourceMinus + 4 * 3);
3109 source128bPlus = vld1q_f32(sourcePlus + 4 * 3);
3111 result_32x4c = vmlaq_f32(result_32x4c, source128aMinus, filterFactor128Minus);
3112 result_32x4d = vmlaq_f32(result_32x4d, source128bMinus, filterFactor128Minus);
3114 result_32x4c = vmlaq_f32(result_32x4c, source128aPlus, filterFactor128Plus);
3115 result_32x4d = vmlaq_f32(result_32x4d, source128bPlus, filterFactor128Plus);
// Write the four accumulators to target[0..15], four floats each.
3119 vst1q_f32(target + 0, result_32x4a);
3120 vst1q_f32(target + 4, result_32x4b);
3121 vst1q_f32(target + 8, result_32x4c);
3122 vst1q_f32(target + 12, result_32x4d);
// Applies the vertical filter to one complete border row by splitting the row's width * channels
// elements into blocks of 16 elements, then blocks of 8 elements, and finally one backward-shifted
// block of 8 elements for any remainder.
3127 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
3128 OCEAN_FORCE_INLINE
void FrameFilterSeparable::filterVerticalBorderRow32BitPerChannelFloat(
const TSource* source, TTarget* target,
const unsigned int width,
const unsigned height,
const unsigned int channels,
const unsigned int row,
const float* filter,
const unsigned int filterSize,
const bool isSymmetric,
const unsigned int sourcePaddingElements)
// Preconditions: valid pointers, at least one channel, an odd filter size not larger than the frame height.
3130 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
3131 ocean_assert(channels >= 1u);
3132 ocean_assert(filterSize <= height);
3133 ocean_assert(filterSize % 2u == 1u);
// Distance between the start of two consecutive source rows, in elements.
3135 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
// Number of row elements which still need to be filtered.
3137 unsigned int remainingElements = width * channels;
// Blocks of 16 elements.
// NOTE(review): the statements advancing source/target between the calls are not part of this excerpt.
3139 while (remainingElements >= 16u)
3141 filterVerticalBorderRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row,
filter, filterSize, isSymmetric);
3146 remainingElements -= 16u;
// Blocks of 8 elements.
3149 while (remainingElements >= 8u)
3151 filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row,
filter, filterSize, isSymmetric);
3156 remainingElements -= 8u;
// The row must hold at least 8 elements, which the backward shift below relies on.
3159 ocean_assert(width * channels >= 8u);
3160 ocean_assert(remainingElements < 8u);
// Fewer than 8 elements are left: both pointers are moved back by 'shift' elements so that one more
// full 8-element block can be applied (presumably re-computing elements which have been handled already).
3162 if (remainingElements != 0u)
3164 const unsigned int shift = 8u - remainingElements;
3166 filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, height, row,
filter, filterSize, isSymmetric);
// Applies the horizontal filter to the rows [firstRow, firstRow + numberRows) of a frame.
// Each source row is first copied into an intermediate row that is extended by filterSize_2 pixels on
// both sides, so that the block filters below can run over the entire row.
// NOTE(review): 'isSymmetric' and 'instructions' are defined in lines that are not part of this excerpt.
3170 template <
typename TSource,
typename TFilter, const ProcessorInstructions tProcessorInstructions>
3171 void FrameFilterSeparable::filterHorizontalSubset(
const TSource* source, TFilter* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const TFilter* filter,
const unsigned int filterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int firstRow,
const unsigned int numberRows)
// Preconditions: valid pointers, a row wider than the filter, 1 to 8 channels, an odd filter size,
// and a row range inside the frame.
3173 ocean_assert(source !=
nullptr && target !=
nullptr &&
filter !=
nullptr);
3174 ocean_assert(width >= filterSize + 1u);
3176 ocean_assert(channels >= 1u && channels <= 8u);
3177 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3179 ocean_assert_and_suppress_unused(firstRow + numberRows <= height, height);
// Distances between the start of two consecutive rows, in elements.
3181 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3182 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
// The extended row holds filterSize_2 additional pixels at the left and at the right border.
3186 const unsigned int filterSize_2 = filterSize / 2u;
3187 const unsigned int extraPixels = filterSize_2 * 2u;
3189 const unsigned int extendedElements = (width + extraPixels) * channels;
// The intermediate row buffer is created once and re-used for every row.
3191 Memory extendedRowMemory = Memory::create<TSource>(extendedElements);
3192 TSource*
const extendedRow = extendedRowMemory.
data<TSource>();
3193 ocean_assert(extendedRow !=
nullptr);
// Move both pointers to the first row of the subset.
3195 source += firstRow * sourceStrideElements;
3196 target += firstRow * targetStrideElements;
3198 for (
unsigned int rowsProcessed = 0u; rowsProcessed < numberRows; ++rowsProcessed)
// Build the extended row: left border, copy of the source row, right border.
3201 fillLeftExtraBorder<TSource>(source, channels, filterSize_2, extendedRow);
3202 memcpy(extendedRow + filterSize_2 * channels, source, width * channels *
sizeof(TSource));
3203 fillRightExtraBorder<TSource>(source + width * channels, channels, filterSize_2, extendedRow + (width + filterSize_2) * channels);
3205 const TSource* extendedSource = extendedRow;
3207 unsigned int remainingElements = width * channels;
// The SIMD block filters are compiled only if NEON or SSE is available.
3209 #if (defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10) || (defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20)
3211 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3213 #elif defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
// Blocks of 8 elements.
// NOTE(review): the statements advancing 'target' inside the block loops are not part of this excerpt.
3219 while (remainingElements >= 8u)
3221 filterHorizontalRowOneBlockWith8Elements<TSource, TFilter, instructions>(extendedSource, target, channels,
filter, filterSize, isSymmetric);
3223 extendedSource += 8;
3226 remainingElements -= 8u;
// Blocks of 4 elements.
3231 while (remainingElements >= 4u)
3233 filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels,
filter, filterSize, isSymmetric);
3235 extendedSource += 4;
3238 remainingElements -= 4u;
// Fewer than 4 elements are left: the source pointer is moved back by 'shift' elements so that one more
// full 4-element block can be applied.
3243 if (remainingElements != 0u)
3245 const unsigned int shift = 4u - remainingElements;
3247 extendedSource -= shift;
3250 filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels,
filter, filterSize, isSymmetric);
// Avoids unused-variable warnings (NOTE(review): presumably for builds without SIMD support).
3258 OCEAN_SUPPRESS_UNUSED_WARNING(extendedSource);
3259 OCEAN_SUPPRESS_UNUSED_WARNING(remainingElements);
3260 OCEAN_SUPPRESS_UNUSED_WARNING(isSymmetric);
// Intensive debugging only: re-compute every response with a plain loop and compare it with the result.
3264 #ifdef OCEAN_INTENSIVE_DEBUG
// At this point 'target' is expected to point behind the row's last element, so the row starts width * channels elements earlier.
3266 const TFilter*
const debugTarget = target - width * channels;
3268 for (
unsigned int x = 0u; x < width; ++x)
3270 for (
unsigned int n = 0u; n < channels; ++n)
3272 float result = 0.0f;
// NOTE(review): 'mirroredXX' is defined in a line that is not part of this excerpt.
3274 for (
int xx = -
int(filterSize_2); xx <= int(filterSize_2); ++xx)
3277 result += float(*(source + mirroredXX * channels +
int(n))) *
filter[xx + int(filterSize_2)];
3280 const TFilter targetValue = debugTarget[x * channels + n];
3282 if (std::is_same<float, TFilter>::value)
// Both the truncated and the rounded (+0.51) reference value are accepted.
3288 const TFilter result8_converted = (TFilter)(result);
3289 const TFilter result8_rounded = (TFilter)(result + 0.51f);
3290 ocean_assert(result8_converted == targetValue || result8_rounded == targetValue);
// Next row: the source advances by a full stride, the target only by its padding elements
// (see debugTarget above: target is expected to be located at the end of the row's payload already).
3297 source += sourceStrideElements;
3298 target += targetPaddingElements;
// Applies the vertical filter to the rows [firstRow, firstRow + numberRows) of a frame.
// The rows are handled in three consecutive phases: rows at the top border (row < filterSize_2),
// core rows (row < height - filterSize_2), and rows at the bottom border.
// NOTE(review): 'isSymmetric', 'mirroredY' and the statements incrementing 'row' are defined in lines
// that are not part of this excerpt.
3302 template <
typename TSource,
typename TTarget, ProcessorInstructions tProcessorInstructions>
3303 void FrameFilterSeparable::filterVerticalSubset(
const TSource* source, TTarget* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const float* filter,
const unsigned int filterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int firstRow,
const unsigned int numberRows)
// Preconditions: valid pointers, a frame with more than filterSize / 2 rows, 1 to 8 channels, an odd
// filter size, a row range inside the frame, and at least 16 elements per row.
3305 ocean_assert(source !=
nullptr && target !=
nullptr);
3306 ocean_assert(
filter !=
nullptr);
3307 ocean_assert(height >= filterSize / 2u + 1u);
3308 ocean_assert(channels >= 1u && channels <= 8u);
3310 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3312 ocean_assert(firstRow + numberRows <= height);
3313 ocean_assert(width * channels >= 8u * 2u);
// Distances between the start of two consecutive rows, in elements.
3315 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3316 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3320 const unsigned int filterSize_2 = filterSize / 2u;
// Intensive debugging only: keep the pointer to the first row of the frame for the reference computation.
3322 #ifdef OCEAN_INTENSIVE_DEBUG
3323 const TSource*
const debugSource = source;
// Move both pointers to the first row of the subset.
3326 source += firstRow * sourceStrideElements;
3327 target += firstRow * targetStrideElements;
3329 unsigned int row = firstRow;
// Phase 1: rows of the subset which are closer than filterSize_2 rows to the top of the frame.
3333 while (row < min(firstRow + numberRows, filterSize_2))
3335 filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row,
filter, filterSize, isSymmetric, sourcePaddingElements);
// Intensive debugging only: reference computation with a plain loop using the row index 'mirroredY'.
3337 #ifdef OCEAN_INTENSIVE_DEBUG
3339 for (
unsigned int x = 0u; x < width * channels; ++x)
3341 float result = 0.0f;
3343 for (
int y = -
int(filterSize_2); y <= int(filterSize_2); ++y)
3346 result += float(*(debugSource + mirroredY *
int(sourceStrideElements) +
int(x))) *
filter[y + int(filterSize_2)];
3349 const TTarget targetValue = target[x];
3351 if (std::is_same<float, TTarget>::value)
3363 source += sourceStrideElements;
3364 target += targetStrideElements;
// Phase 2: core rows for which the entire filter window lies inside the frame.
3371 while (row < min(firstRow + numberRows, height - filterSize_2))
3373 filterVerticalCoreRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, channels,
filter, filterSize, isSymmetric, sourcePaddingElements);
// Intensive debugging only: reference computation reading the rows (row + y) directly.
3375 #ifdef OCEAN_INTENSIVE_DEBUG
3377 for (
unsigned int x = 0u; x < width * channels; ++x)
3379 float result = 0.0f;
3381 for (
int y = -
int(filterSize_2); y <= int(filterSize_2); ++y)
3382 result +=
float(*(debugSource + (
int(row) + y) *
int(sourceStrideElements) +
int(x))) *
filter[y +
int(filterSize_2)];
3384 const TTarget targetValue = target[x];
// The reference response is expected to stay within the value range [0, 256).
3386 ocean_assert(result >= 0.0f && result < 256.0f);
3388 if (std::is_same<float, TTarget>::value)
3400 source += sourceStrideElements;
3401 target += targetStrideElements;
// Phase 3: remaining rows of the subset, which are closer than filterSize_2 rows to the bottom of the frame.
3408 while (row < firstRow + numberRows)
3410 ocean_assert(row + filterSize_2 >= height);
3412 filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row,
filter, filterSize, isSymmetric, sourcePaddingElements);
// Intensive debugging only: reference computation with a plain loop using the row index 'mirroredY'.
3414 #ifdef OCEAN_INTENSIVE_DEBUG
3417 for (
unsigned int x = 0u; x < width * channels; ++x)
3419 float result = 0.0f;
3421 for (
int y = -
int(filterSize_2); y <= int(filterSize_2); ++y)
3424 result += float(*(debugSource + mirroredY *
int(sourceStrideElements) +
int(x))) *
filter[y + int(filterSize_2)];
3427 const TTarget targetValue = target[x];
// The reference response is expected to stay within the value range [0, 256).
3429 ocean_assert(result >= 0.0f && result < 256.0f);
3431 if (std::is_same<float, TTarget>::value)
3443 source += sourceStrideElements;
3444 target += targetStrideElements;
// Applies a separable filter with a fixed instruction set in two passes:
// 1) horizontal pass: source -> intermediate frame (element type TFilter), using horizontalFilter as given,
// 2) vertical pass: intermediate frame -> target, always using float filter factors.
// NOTE(review): the conditions selecting between the worker-based and the direct calls, and the body of
// the reusableMemory branch, are not part of this excerpt.
3450 template <
typename T,
typename TFilter, ProcessorInstructions tProcessorInstructions>
3451 inline void FrameFilterSeparable::filter(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const TFilter* horizontalFilter,
const unsigned int horizontalFilterSize,
const TFilter* verticalFilter,
const unsigned int verticalFilterSize,
ReusableMemory* reusableMemory,
Worker* worker)
// Intermediate frame receiving the result of the horizontal pass; by default a local frame.
3453 Frame localIntermediateFrame;
3454 Frame* intermediateFrame = &localIntermediateFrame;
// NOTE(review): presumably the intermediate frame is taken from reusableMemory if provided - confirm.
3456 if (reusableMemory !=
nullptr)
// Horizontal pass via the worker: the trailing '0u, 0u' arguments and the '0u, height' range are
// presumably the placeholders for firstRow/numberRows which the worker fills per thread - confirm.
3467 worker->
executeFunction(
Worker::Function::createStatic(&filterHorizontalSubset<T, TFilter, tProcessorInstructions>, source, intermediateFrame->
data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->
paddingElements(), 0u, 0u), 0u, height);
// Horizontal pass with a direct call covering all rows [0, height).
3471 filterHorizontalSubset<T, TFilter, tProcessorInstructions>(source, intermediateFrame->
data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->
paddingElements(), 0u, height);
// The vertical pass needs float factors; localFloatFilters is the fallback storage for converted factors.
3477 std::vector<float> localFloatFilters;
3478 const float* verticalFloatFilter =
nullptr;
// Float filters are used as they are.
3480 if (std::is_same<TFilter, float>::value)
3482 verticalFloatFilter = (
const float*)(verticalFilter);
// Otherwise the filter is expected to hold unsigned int factors, which are converted to float and
// divided by (sum of horizontal factors) * (sum of vertical factors); the horizontal pass above used
// the unmodified horizontal factors, so the normalization of both passes is applied here.
3486 ocean_assert((std::is_same<TFilter, unsigned int>::value));
3488 const TFilter sumHorizontalFilterValues =
sumFilterValues(horizontalFilter, horizontalFilterSize);
3489 const TFilter sumVerticalFilterValues =
sumFilterValues(verticalFilter, verticalFilterSize);
3491 const unsigned int normalizationFactor = (
unsigned int)(sumHorizontalFilterValues) * (
unsigned int)(sumVerticalFilterValues);
3492 ocean_assert(normalizationFactor != 0u);
3494 const float invNormalizationFactor = 1.0f / float(normalizationFactor);
// The converted factors are stored in the reusable memory object if available, otherwise in the local vector.
3496 std::vector<float>& floatFilterBufferToUse = reusableMemory !=
nullptr ? reusableMemory->
filterFactors_ : localFloatFilters;
3498 floatFilterBufferToUse.resize(verticalFilterSize);
3500 for (
unsigned int n = 0u; n < verticalFilterSize; ++n)
3502 floatFilterBufferToUse[n] = float(verticalFilter[n]) * invNormalizationFactor;
3505 verticalFloatFilter = floatFilterBufferToUse.data();
// Vertical pass via the worker (same placeholder/range pattern as in the horizontal pass).
3510 worker->
executeFunction(
Worker::Function::createStatic(&filterVerticalSubset<TFilter, T, tProcessorInstructions>, intermediateFrame->
constdata<TFilter>(), target, width, height, channels, (
const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->
paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
// Vertical pass with a direct call covering all rows [0, height).
3514 filterVerticalSubset<TFilter, T, tProcessorInstructions>(intermediateFrame->
constdata<TFilter>(), target, width, height, channels, (
const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->
paddingElements(), targetPaddingElements, 0u, height);
// Public entry point: validates the input, dispatches to the SSE/NEON implementation if the frame is
// large enough, and otherwise falls back to filterUniversal().
// NOTE(review): the case labels of the switch and the return statements are not part of this excerpt.
3518 template <
typename T,
typename TFilter>
3519 bool FrameFilterSeparable::filter(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const TFilter* horizontalFilter,
const unsigned int horizontalFilterSize,
const TFilter* verticalFilter,
const unsigned int verticalFilterSize,
Worker* worker,
ReusableMemory* reusableMemory,
const ProcessorInstructions processorInstructions)
// Debug-time checks of the input.
3521 ocean_assert(source !=
nullptr && target !=
nullptr);
3522 ocean_assert(width >= horizontalFilterSize && height >= verticalFilterSize);
3523 ocean_assert(channels >= 1u);
// Runtime check of the same conditions (the statement executed on failure is not visible in this excerpt).
3525 if (source ==
nullptr || target ==
nullptr || width < horizontalFilterSize || height < verticalFilterSize || channels == 0u)
3530 OCEAN_SUPPRESS_UNUSED_WARNING(reusableMemory);
// The SIMD implementations are used only for rows with at least 16 elements which are wider than the horizontal filter.
3532 if (width * channels >= 16u && width >= horizontalFilterSize + 1u)
// Select the implementation based on the best available instruction group.
3534 switch (Processor::bestInstructionGroup<false>(processorInstructions))
3541 OCEAN_APPLY_IF_SSE((filter<T, TFilter, PI_SSE_2>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
3545 OCEAN_APPLY_IF_NEON((filter<T, TFilter, PI_GROUP_NEON>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
3552 ocean_assert(
false &&
"Invalid instructions!");
// Fallback for float filters: the factors are forwarded unchanged to the universal implementation.
3556 if constexpr (std::is_same<float, TFilter>::value)
3558 filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, (
const float*)(horizontalFilter), horizontalFilterSize, (
const float*)(verticalFilter), verticalFilterSize, worker);
// Fallback for unsigned int filters: each filter is converted to float and divided by its own sum
// before it is forwarded to the universal implementation.
3563 if constexpr (std::is_same<unsigned int, TFilter>::value)
3565 const TFilter horizontalNormalization =
sumFilterValues(horizontalFilter, horizontalFilterSize);
3566 ocean_assert(horizontalNormalization != TFilter(0));
3568 std::vector<float> horizontalFloatFilter(horizontalFilterSize);
3569 for (
size_t n = 0; n < horizontalFloatFilter.size(); ++n)
3571 horizontalFloatFilter[n] = float(horizontalFilter[n]) / float(horizontalNormalization);
3574 const TFilter verticalNormalization =
sumFilterValues(verticalFilter, verticalFilterSize);
3575 ocean_assert(verticalNormalization != TFilter(0));
3577 std::vector<float> verticalFloatFilter(verticalFilterSize);
3578 for (
size_t n = 0; n < verticalFloatFilter.size(); ++n)
3580 verticalFloatFilter[n] = float(verticalFilter[n]) / float(verticalNormalization);
3583 return filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFloatFilter.data(), (
unsigned int)horizontalFloatFilter.size(), verticalFloatFilter.data(), (
unsigned int)verticalFloatFilter.size(), worker);
// Reached only for filter types other than float and unsigned int.
3587 ocean_assert(
false &&
"Invalid combination of parameters!");
/**
 * Applies a separable filter with float kernels in two passes using the filterUniversal*Subset() functions.
 * The horizontal pass writes its responses into an intermediate frame, the vertical pass reads that frame and writes the target.
 * @param source The source frame data, must not be nullptr
 * @param target The target frame data, must not be nullptr
 * @param width The frame width in pixels, must be >= horizontalFilterSize
 * @param height The frame height in pixels, must be >= verticalFilterSize
 * @param channels The number of interleaved channels, must not be 0
 * @param sourcePaddingElements The number of padding elements at the end of each source row
 * @param targetPaddingElements The number of padding elements at the end of each target row
 * @param horizontalFilter The coefficients of the horizontal kernel, must not be nullptr
 * @param horizontalFilterSize The number of horizontal coefficients, must be odd
 * @param verticalFilter The coefficients of the vertical kernel, must not be nullptr
 * @param verticalFilterSize The number of vertical coefficients, must be odd
 * @param worker Optional worker used to distribute the rows of both passes
 * @tparam T The element type of the source and target frame
 */
3591 template <
typename T>
3592 bool FrameFilterSeparable::filterUniversal(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const float* horizontalFilter,
const unsigned int horizontalFilterSize,
const float* verticalFilter,
const unsigned int verticalFilterSize,
Worker* worker)
3594 ocean_assert(source !=
nullptr && target !=
nullptr);
3595 ocean_assert(width >= 1u && height >= 1u);
3596 ocean_assert(channels != 0u);
3598 ocean_assert(horizontalFilter !=
nullptr && verticalFilter !=
nullptr);
// both kernels need a center element, so their sizes must be odd
3599 ocean_assert(horizontalFilterSize % 2u == 1u);
3600 ocean_assert(verticalFilterSize % 2u == 1u);
// runtime guard: null pointers, kernels larger than the frame, and even kernel sizes are rejected
3602 if (source ==
nullptr || target ==
nullptr
3603 || verticalFilter ==
nullptr || horizontalFilter ==
nullptr
3604 || horizontalFilterSize > width || verticalFilterSize > height
3605 || horizontalFilterSize % 2u != 1u || verticalFilterSize % 2u != 1u)
// worker path: the rows [0, height) of each pass are distributed by the worker;
// the two trailing 0u arguments are placeholders for firstRow/numberRows
// NOTE(review): presumably the worker fills them in for each thread - confirm against Worker::executeFunction()
3616 worker->
executeFunction(
Worker::Function::createStatic(&filterUniversalHorizontalSubset<T, TIntermediate>, source, intermediateFrame.
data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.
paddingElements(), 0u, 0u), 0u, height);
// the vertical pass is started only after the horizontal call above has returned
3617 worker->
executeFunction(
Worker::Function::createStatic(&filterUniversalVerticalSubset<T, TIntermediate>, intermediateFrame.
constdata<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.
paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
// direct path: both passes handle all rows [0, height) in the calling thread
3621 filterUniversalHorizontalSubset<T, TIntermediate>(source, intermediateFrame.
data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.
paddingElements(), 0u, height);
3622 filterUniversalVerticalSubset<T, TIntermediate>(intermediateFrame.
data<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.
paddingElements(), targetPaddingElements, 0u, height);
/**
 * Applies the horizontal 1D kernel to the rows [firstRow, firstRow + numberRows) of a frame.
 * Each row is handled in three sections: the first filterSize / 2 pixels, the center pixels, and the last filterSize / 2 pixels.
 * @param source The source frame data (start of the frame, not of the subset), must not be nullptr
 * @param target The target frame receiving the responses (start of the frame), must not be nullptr
 * @param width The frame width in pixels, must be >= horizontalFilterSize
 * @param channels The number of interleaved channels, must not be 0
 * @param horizontalFilter The float coefficients of the horizontal kernel
 * @param horizontalFilterSize The number of coefficients, must be odd
 * @param sourcePaddingElements The number of padding elements at the end of each source row
 * @param targetPaddingElements The number of padding elements at the end of each target row
 * @param firstRow The first row to handle
 * @param numberRows The number of rows to handle
 * @tparam T The element type of the source frame
 * @tparam TIntermediate The element type of the responses and of the arithmetic
 */
3628 template <
typename T,
typename TIntermediate>
3629 void FrameFilterSeparable::filterUniversalHorizontalSubset(
const T* source, TIntermediate* target,
const unsigned int width,
unsigned int channels,
const float* horizontalFilter,
unsigned int horizontalFilterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int firstRow,
const unsigned int numberRows)
3631 ocean_assert(source !=
nullptr && target !=
nullptr);
3632 ocean_assert(width >= 1u);
3633 ocean_assert(channels != 0u);
3635 ocean_assert(horizontalFilterSize <=
size_t(width));
3636 ocean_assert(horizontalFilterSize % 2u == 1u);
// a stride is the number of elements between the starts of two consecutive rows
3638 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3639 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3641 const unsigned int filterSize = horizontalFilterSize;
// number of kernel taps on each side of the center tap
3642 const unsigned int filterSize_2 = filterSize / 2u;
3643 ocean_assert(filterSize_2 * 2u <= width);
// the coefficients are converted to TIntermediate once up front;
// for TIntermediate == float the copy stays empty and the caller's coefficients are used directly
3645 std::vector<TIntermediate> filterCopy;
3647 if (!std::is_same<TIntermediate, float>::value)
3649 filterCopy.resize(horizontalFilterSize);
3650 for (
size_t n = 0; n < filterCopy.size(); ++n)
3652 filterCopy[n] = TIntermediate(horizontalFilter[n]);
3656 const TIntermediate*
const filter = filterCopy.empty() ? (
const TIntermediate*)horizontalFilter : filterCopy.data();
// move both pointers to the first row of the subset
3658 source += firstRow * sourceStrideElements;
3659 target += firstRow * targetStrideElements;
3661 TIntermediate*
const targetEnd = target + numberRows * targetStrideElements;
// one iteration per row
3663 while (target != targetEnd)
3665 ocean_assert(target < targetEnd);
// left border: the first filterSize_2 pixels, whose kernel window extends beyond the left frame edge
// NOTE(review): presumably the missing pixels are mirrored (see mirroredBorderLocationLeft()) - confirm
3669 for (
unsigned int x = 0u; x < filterSize_2; ++x)
3671 for (
unsigned int n = 0u; n < channels; ++n)
3675 for (
unsigned int s = 1u; s < filterSize; ++s)
3678 target[n] = response;
// center: the kernel window lies entirely inside the row;
// source addresses the left-most pixel of the window, so tap s reads source[channels * s + n]
3687 for (
unsigned int x = filterSize_2; x < width - filterSize_2; ++x)
3689 for (
unsigned int n = 0u; n < channels; ++n)
3691 TIntermediate response = TIntermediate(source[channels * 0u + n]) *
filter[0];
3693 for (
unsigned int s = 1u; s < filterSize; ++s)
3694 response += TIntermediate(source[channels * s + n]) *
filter[s];
3696 target[n] = response;
// right border: the last filterSize_2 pixels, whose kernel window extends beyond the right frame edge
// NOTE(review): presumably the missing pixels are mirrored (see mirroredBorderLocationRight()) - confirm
3705 for (
unsigned int x = 0u; x < filterSize_2; ++x)
3707 for (
unsigned int n = 0u; n < channels; ++n)
3711 for (
unsigned int s = 1u; s < filterSize; ++s)
3714 target[n] = response;
// advance to the next row
// NOTE(review): presumably source has advanced only across the center pixels, so the 2 * filterSize_2 border pixels are skipped in addition to the padding - confirm
3721 source += filterSize_2 * 2u * channels + sourcePaddingElements;
3722 target += targetPaddingElements;
/**
 * Applies the vertical 1D kernel to the rows [firstRow, firstRow + numberRows) of a frame.
 * The rows are handled in three sections: rows closer than filterSize / 2 to the top edge, center rows, and rows closer than filterSize / 2 to the bottom edge.
 * @param source The source frame holding the horizontal responses (start of the frame, not of the subset), must not be nullptr
 * @param target The target frame data (start of the frame), must not be nullptr
 * @param width The frame width in pixels, must be >= 1
 * @param height The frame height in pixels, must be >= verticalFilterSize
 * @param channels The number of interleaved channels, must not be 0
 * @param verticalFilter The float coefficients of the vertical kernel
 * @param verticalFilterSize The number of coefficients, must be odd
 * @param sourcePaddingElements The number of padding elements at the end of each source row
 * @param targetPaddingElements The number of padding elements at the end of each target row
 * @param firstRow The first row to handle
 * @param numberRows The number of rows to handle
 * @tparam T The element type of the target frame
 * @tparam TIntermediate The element type of the source frame and of the arithmetic
 */
3726 template <
typename T,
typename TIntermediate>
3727 void FrameFilterSeparable::filterUniversalVerticalSubset(
const TIntermediate* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const float* verticalFilter,
const unsigned int verticalFilterSize,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int firstRow,
const unsigned int numberRows)
3729 ocean_assert(source !=
nullptr && target !=
nullptr);
3730 ocean_assert(width >= 1u && height >= 1u);
3731 ocean_assert(channels != 0u);
3733 ocean_assert(verticalFilterSize <= height);
3734 ocean_assert(verticalFilterSize % 2u == 1u);
// a stride is the number of elements between the starts of two consecutive rows
3736 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3737 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
// start of the source frame, needed again to re-anchor source for the bottom border rows
3739 const TIntermediate*
const sourceStart = source;
3741 const unsigned int filterSize = verticalFilterSize;
// number of kernel taps above and below the center tap
3742 const unsigned int filterSize_2 = filterSize / 2u;
3743 ocean_assert(filterSize_2 * 2u <= height);
// the coefficients are converted to TIntermediate once up front;
// for TIntermediate == float the copy stays empty and the caller's coefficients are used directly
3745 std::vector<TIntermediate> filterCopy;
3747 if (!std::is_same<TIntermediate, float>::value)
3749 filterCopy.resize(verticalFilterSize);
3751 for (
size_t n = 0; n < filterCopy.size(); ++n)
3753 filterCopy[n] = TIntermediate(verticalFilter[n]);
3757 const TIntermediate*
const filter = filterCopy.empty() ? (
const TIntermediate*)verticalFilter : filterCopy.data();
// source is moved to the top row of the kernel window of firstRow, clamped to the first frame row
3759 source += max(0,
int(firstRow) -
int(filterSize_2)) * sourceStrideElements;
3760 target += firstRow * targetStrideElements;
3762 unsigned int y = firstRow;
// top border: rows of the subset with y < filterSize_2, whose kernel window extends above the frame
// NOTE(review): presumably the missing rows are mirrored (see mirroredBorderLocationLeft()) - confirm
3766 while (y < min(filterSize_2, firstRow + numberRows))
3768 ocean_assert(source == sourceStart);
// remembered so that source can be reset to the row start after this row (see 'source = sourceCopy' below)
3769 const TIntermediate* sourceCopy = source;
3771 for (
unsigned int x = 0u; x < width; ++x)
3773 for (
unsigned int n = 0u; n < channels; ++n)
3777 for (
unsigned int s = 1u; s < filterSize; ++s)
3780 target[n] = T(response);
3787 target += targetPaddingElements;
3790 source = sourceCopy;
// center rows: the rows of the subset from y up to (excluding) min(firstRow + numberRows, height - filterSize_2), clamped to zero rows
3796 const unsigned int centerRows = (
unsigned int)max(0,
int(min(firstRow + numberRows, height - filterSize_2)) - int(y));
3798 for (
unsigned int yCenter = 0u; yCenter < centerRows; ++yCenter)
3800 for (
unsigned int x = 0u; x < width; ++x)
3802 for (
unsigned int c = 0u; c < channels; ++c)
// source addresses the top row of the kernel window, so tap s reads the element s rows below
3804 TIntermediate response = TIntermediate(source[channels * 0u + c]) *
filter[0];
3806 for (
unsigned int s = 1u; s < filterSize; ++s)
3807 response += TIntermediate(source[sourceStrideElements * s + c]) *
filter[s];
3809 target[c] = T(response);
3816 source += sourcePaddingElements;
3817 target += targetPaddingElements;
// bottom border: remaining rows of the subset, whose kernel window extends below the frame
// NOTE(review): presumably the missing rows are mirrored (see mirroredBorderLocationRight()) - confirm
3824 while (y < firstRow + numberRows)
3826 ocean_assert(y >= height - filterSize_2 && y < height);
// re-anchor source at the row that lies 2 * filterSize_2 rows above the bottom of the frame
3827 source = sourceStart + (height - filterSize_2 * 2u) * sourceStrideElements;
// yy is the index of row y within the bottom border, range [0, filterSize_2)
3829 const unsigned int yy = y - (height - filterSize_2);
3830 ocean_assert(yy < filterSize_2);
3832 for (
unsigned int x = 0u; x < width; ++x)
3834 for (
unsigned int n = 0u; n < channels; ++n)
3838 for (
unsigned int s = 1u; s < filterSize; ++s)
3843 target[n] = T(response);
3850 target += targetPaddingElements;
3873 ocean_assert(value < 2u * size);
3884 ocean_assert(size * 2u - value - 1u < size);
3885 return size * 2u - value - 1u;
This class holds re-usable memory for the filtering process.
Definition: FrameFilterSeparable.h:40
ReusableMemory()=default
Default constructor.
std::vector< float > filterFactors_
Float-based filter factors which can be re-used during filtering.
Definition: FrameFilterSeparable.h:56
std::vector< float > normalizedVerticalFilter_
Normalized vertical filter factors which can be re-used during filtering.
Definition: FrameFilterSeparable.h:62
Frame intermediateFrame_
An intermediate frame which can be re-used during filtering.
Definition: FrameFilterSeparable.h:53
std::vector< float > normalizedHorizontalFilter_
Normalized horizontal filter factors which can be re-used during filtering.
Definition: FrameFilterSeparable.h:59
This class implements separable filters.
Definition: FrameFilterSeparable.h:33
static void filterVerticalSubset(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, unsigned int firstRow, const unsigned int numberRows)
Applies the vertical filtering for a subset of the frame with a specified 1D filter kernel for frames...
Definition: FrameFilterSeparable.h:3303
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of a symmetric filter for 4 successive frame el...
static bool filterUniversal(const T *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float *horizontalFilter, const unsigned int horizontalFilterSize, const float *verticalFilter, const unsigned int verticalFilterSize, Worker *worker=nullptr)
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
Definition: FrameFilterSeparable.h:3592
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of a symmetric filter for 8 successive frame el...
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of an asymmetric filter for 8 successive frame ...
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith8Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 8 successive frame elements (8 elements...
Definition: FrameFilterSeparable.h:2556
static OCEAN_FORCE_INLINE void writeSIMD(const typename SIMD32x4< T >::Type &value, T *target)
Writes a SIMD with four 32 bit values to (not aligned) memory.
static void filterUniversalHorizontalSubset(const T *source, TIntermediate *target, const unsigned int width, const unsigned int channels, const float *horizontalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a horizontal filter to a subset of an image with almost arbitrary data type.
Definition: FrameFilterSeparable.h:3629
static void filterUniversalVerticalSubset(const TIntermediate *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *verticalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a vertical filter to a subset of an image with almost arbitrary data type.
Definition: FrameFilterSeparable.h:3727
static void filterHorizontalSubset(const TSource *source, TFilter *target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies the horizontal filtering in a subset of a frame with a specified 1D filter kernel for frames ...
Definition: FrameFilterSeparable.h:3171
static OCEAN_FORCE_INLINE void filterVerticalCoreRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int channels, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses for the inner core of a frame for one row.
Definition: FrameFilterSeparable.h:804
static T sumFilterValues(const T *filterValues, const size_t size)
Determines the sum of all elements of a given 1D filter.
Definition: FrameFilterSeparable.h:706
static bool isFilterSymmetric(const T *filterValues, const size_t size)
Returns whether a given 1D filter is symmetric.
Definition: FrameFilterSeparable.h:689
static OCEAN_FORCE_INLINE void filterVerticalBorderRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow4Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static void fillLeftExtraBorder(const T *source, const unsigned int channels, const unsigned int pixels, T *extendedRowLeft)
Fills the left border area of an extended row with mirrored pixel information (from the left image re...
Definition: FrameFilterSeparable.h:782
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of an asymmetric filter for 4 successive frame ...
static OCEAN_FORCE_INLINE void filterVerticalBorderRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses near the (vertical) border of a frame for one row.
static void fillRightExtraBorder(const T *sourceEnd, const unsigned int channels, const unsigned int pixels, T *extendedRowRight)
Fills the right border area of an extended row with mirrored pixel information (from the right image ...
Definition: FrameFilterSeparable.h:793
static OCEAN_FORCE_INLINE void filterVerticalBorderRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith4Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 4 successive frame elements (4 elements...
Definition: FrameFilterSeparable.h:2494
static bool filter(const Frame &source, Frame &target, const std::vector< unsigned int > &horizontalFilter, const std::vector< unsigned int > &verticalFilter, Worker *worker=nullptr, ReusableMemory *reusableMemory=nullptr, const ProcessorInstructions processorInstructions=Processor::get().instructions())
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
static OCEAN_FORCE_INLINE void setSIMDZero(typename SIMD32x4< T >::Type &value)
Sets a given SIMD value to zero.
static Caller< void > createStatic(typename StaticFunctionPointerMaker< void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a static function with no function parameter.
Definition: Caller.h:2876
This class implements Ocean's image class.
Definition: Frame.h:1792
const T * constdata(const unsigned int planeIndex=0u) const
Returns a pointer to the read-only pixel data of a specific plane.
Definition: Frame.h:4168
T * data(const unsigned int planeIndex=0u)
Returns a pointer to the pixel data of a specific plane.
Definition: Frame.h:4159
bool set(const FrameType &frameType, const bool forceOwner, const bool forceWritable=false, const Indices32 &planePaddingElements=Indices32(), const Timestamp &timestamp=Timestamp(false), bool *reallocated=nullptr)
Sets a new frame type for this frame.
unsigned int paddingElements(const unsigned int planeIndex=0u) const
Returns the optional number of padding elements at the end of each row for a specific plane.
Definition: Frame.h:4042
Definition of a frame type composed by the frame dimension, pixel format and pixel origin.
Definition: Frame.h:30
@ ORIGIN_UPPER_LEFT
The first pixel lies in the upper left corner, the last pixel in the lower right corner.
Definition: Frame.h:1050
This class implements an object able to allocate memory.
Definition: base/Memory.h:22
void * data()
Returns the pointer to the writable memory which is allocated by this object.
Definition: base/Memory.h:303
This class provides basic numeric functionalities.
Definition: Numeric.h:57
static Processor & get()
Returns a reference to the unique object.
Definition: Singleton.h:115
This class implements a worker able to distribute function calls over different threads.
Definition: Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
ProcessorInstructions
Definition of individual processor instruction types.
Definition: base/Processor.h:22
static unsigned int mirroredBorderLocationRight(const unsigned int value, const unsigned int size)
Mirrors a given value at the right border if necessary.
Definition: FrameFilterSeparable.h:3871
static unsigned int mirroredBorderLocationLeft(const int value)
Mirrors a given value at the left border if necessary.
Definition: FrameFilterSeparable.h:3856
@ PI_NONE
Unknown processor instruction set.
Definition: base/Processor.h:24
@ PI_GROUP_AVX_2_SSE_2
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition: base/Processor.h:64
@ PI_GROUP_SSE_4_1
All SSE instructions between (including) SSE and SSE4.1.
Definition: base/Processor.h:60
@ PI_SSE_2
SSE2 instructions.
Definition: base/Processor.h:28
@ PI_NEON
NEON instructions.
Definition: base/Processor.h:50
@ PI_GROUP_AVX_2_SSE_4_1
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition: base/Processor.h:68
@ PI_GROUP_SSE_2
All SSE instructions between (including) SSE and SSE2.
Definition: base/Processor.h:58
@ PI_GROUP_NEON
All NEON instructions (which is currently NEON only).
Definition: base/Processor.h:66
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15
float32x4_t Type
Definition: FrameFilterSeparable.h:683
__m128 Type
Definition: FrameFilterSeparable.h:663
uint32x4_t Type
Definition: FrameFilterSeparable.h:674
__m128i Type
Definition: FrameFilterSeparable.h:654
Definition of a 128 bit SIMD data type holding four 32 bit values.
Definition: FrameFilterSeparable.h:72
DataType< uint32_t, 4u >::Type Type
Definition: FrameFilterSeparable.h:73
Default definition of a type with tBytes bytes.
Definition: DataType.h:32
float Type
The 32 bit floating point data type for any data type T but 'double'.
Definition: DataType.h:373